{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 3000, "global_step": 34068, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005870611717740989, "grad_norm": 2.2037928104400635, "learning_rate": 1e-05, "loss": 0.0641, "step": 10 }, { "epoch": 0.0011741223435481978, "grad_norm": 2.5393388271331787, "learning_rate": 2e-05, "loss": 0.0055, "step": 20 }, { "epoch": 0.0017611835153222965, "grad_norm": 0.06004801765084267, "learning_rate": 3e-05, "loss": 0.0203, "step": 30 }, { "epoch": 0.0023482446870963956, "grad_norm": 6.423142910003662, "learning_rate": 4e-05, "loss": 0.0313, "step": 40 }, { "epoch": 0.0029353058588704943, "grad_norm": 1.324741005897522, "learning_rate": 5e-05, "loss": 0.0162, "step": 50 }, { "epoch": 0.003522367030644593, "grad_norm": 3.5551211833953857, "learning_rate": 6e-05, "loss": 0.0106, "step": 60 }, { "epoch": 0.004109428202418692, "grad_norm": 0.13222283124923706, "learning_rate": 7e-05, "loss": 0.0108, "step": 70 }, { "epoch": 0.004696489374192791, "grad_norm": 1.9146218299865723, "learning_rate": 8e-05, "loss": 0.0172, "step": 80 }, { "epoch": 0.0052835505459668895, "grad_norm": 1.515202522277832, "learning_rate": 9e-05, "loss": 0.024, "step": 90 }, { "epoch": 0.005870611717740989, "grad_norm": 1.8395302295684814, "learning_rate": 0.0001, "loss": 0.0046, "step": 100 }, { "epoch": 0.006457672889515088, "grad_norm": 0.4752308428287506, "learning_rate": 9.999997861546707e-05, "loss": 0.0133, "step": 110 }, { "epoch": 0.007044734061289186, "grad_norm": 3.3688418865203857, "learning_rate": 9.99999144618865e-05, "loss": 0.0128, "step": 120 }, { "epoch": 0.007631795233063285, "grad_norm": 1.3503026962280273, "learning_rate": 9.999980753931321e-05, "loss": 0.0141, "step": 130 }, { "epoch": 0.008218856404837384, "grad_norm": 0.5728911757469177, "learning_rate": 9.999965784783865e-05, "loss": 0.0168, "step": 140 }, { "epoch": 0.008805917576611482, "grad_norm": 0.20352545380592346, "learning_rate": 9.999946538759087e-05, "loss": 0.0085, "step": 150 }, { "epoch": 0.009392978748385582, "grad_norm": 0.5463225841522217, "learning_rate": 9.999923015873447e-05, "loss": 0.0273, "step": 160 }, { "epoch": 0.00998003992015968, "grad_norm": 2.2422802448272705, "learning_rate": 9.999895216147068e-05, "loss": 0.0206, "step": 170 }, { "epoch": 0.010567101091933779, "grad_norm": 1.3659040927886963, "learning_rate": 9.99986313960373e-05, "loss": 0.024, "step": 180 }, { "epoch": 0.011154162263707879, "grad_norm": 0.4480757713317871, "learning_rate": 9.99982678627087e-05, "loss": 0.0115, "step": 190 }, { "epoch": 0.011741223435481977, "grad_norm": 0.2278975248336792, "learning_rate": 9.999786156179584e-05, "loss": 0.0277, "step": 200 }, { "epoch": 0.012328284607256075, "grad_norm": 1.7811647653579712, "learning_rate": 9.999741249364625e-05, "loss": 0.0169, "step": 210 }, { "epoch": 0.012915345779030175, "grad_norm": 0.5214270949363708, "learning_rate": 9.999692065864407e-05, "loss": 0.0129, "step": 220 }, { "epoch": 0.013502406950804274, "grad_norm": 0.010121507570147514, "learning_rate": 9.999638605721e-05, "loss": 0.0047, "step": 230 }, { "epoch": 0.014089468122578372, "grad_norm": 0.07541653513908386, "learning_rate": 9.999580868980134e-05, "loss": 0.0068, "step": 240 }, { "epoch": 0.014676529294352472, "grad_norm": 2.229921817779541, "learning_rate": 9.999518855691194e-05, "loss": 0.0432, "step": 250 }, { "epoch": 0.01526359046612657, "grad_norm": 1.3554790019989014, "learning_rate": 9.999452565907225e-05, "loss": 0.0287, "step": 260 }, { "epoch": 0.01585065163790067, "grad_norm": 0.2221003770828247, "learning_rate": 9.999381999684934e-05, "loss": 0.0041, "step": 270 }, { "epoch": 0.01643771280967477, "grad_norm": 0.7399819493293762, "learning_rate": 9.999307157084676e-05, "loss": 0.0187, "step": 280 }, { "epoch": 0.017024773981448867, "grad_norm": 3.3960468769073486, "learning_rate": 9.999228038170475e-05, "loss": 0.0203, "step": 290 }, { "epoch": 0.017611835153222965, "grad_norm": 1.684557318687439, "learning_rate": 9.999144643010004e-05, "loss": 0.0342, "step": 300 }, { "epoch": 0.018198896324997063, "grad_norm": 0.7973483204841614, "learning_rate": 9.999056971674601e-05, "loss": 0.0176, "step": 310 }, { "epoch": 0.018785957496771165, "grad_norm": 1.082037091255188, "learning_rate": 9.998965024239256e-05, "loss": 0.014, "step": 320 }, { "epoch": 0.019373018668545263, "grad_norm": 0.7168899178504944, "learning_rate": 9.99886880078262e-05, "loss": 0.0193, "step": 330 }, { "epoch": 0.01996007984031936, "grad_norm": 0.8778390884399414, "learning_rate": 9.998768301387001e-05, "loss": 0.0147, "step": 340 }, { "epoch": 0.02054714101209346, "grad_norm": 0.03346811980009079, "learning_rate": 9.998663526138365e-05, "loss": 0.0255, "step": 350 }, { "epoch": 0.021134202183867558, "grad_norm": 0.2908102571964264, "learning_rate": 9.998554475126332e-05, "loss": 0.0155, "step": 360 }, { "epoch": 0.02172126335564166, "grad_norm": 0.8321800231933594, "learning_rate": 9.998441148444184e-05, "loss": 0.0202, "step": 370 }, { "epoch": 0.022308324527415758, "grad_norm": 1.5566340684890747, "learning_rate": 9.99832354618886e-05, "loss": 0.0183, "step": 380 }, { "epoch": 0.022895385699189856, "grad_norm": 0.016553422436118126, "learning_rate": 9.998201668460952e-05, "loss": 0.0262, "step": 390 }, { "epoch": 0.023482446870963954, "grad_norm": 2.6109025478363037, "learning_rate": 9.998075515364715e-05, "loss": 0.0388, "step": 400 }, { "epoch": 0.024069508042738053, "grad_norm": 3.345412254333496, "learning_rate": 9.997945087008055e-05, "loss": 0.0183, "step": 410 }, { "epoch": 0.02465656921451215, "grad_norm": 0.02132261171936989, "learning_rate": 9.99781038350254e-05, "loss": 0.0198, "step": 420 }, { "epoch": 0.025243630386286253, "grad_norm": 2.097240447998047, "learning_rate": 9.997671404963391e-05, "loss": 0.0133, "step": 430 }, { "epoch": 0.02583069155806035, "grad_norm": 1.1681588888168335, "learning_rate": 9.99752815150949e-05, "loss": 0.0208, "step": 440 }, { "epoch": 0.02641775272983445, "grad_norm": 0.15087229013442993, "learning_rate": 9.99738062326337e-05, "loss": 0.0122, "step": 450 }, { "epoch": 0.027004813901608547, "grad_norm": 1.4533928632736206, "learning_rate": 9.997228820351228e-05, "loss": 0.0259, "step": 460 }, { "epoch": 0.027591875073382646, "grad_norm": 3.868213176727295, "learning_rate": 9.997072742902912e-05, "loss": 0.0163, "step": 470 }, { "epoch": 0.028178936245156744, "grad_norm": 1.1523321866989136, "learning_rate": 9.996912391051925e-05, "loss": 0.0178, "step": 480 }, { "epoch": 0.028765997416930845, "grad_norm": 1.964536190032959, "learning_rate": 9.996747764935431e-05, "loss": 0.0203, "step": 490 }, { "epoch": 0.029353058588704944, "grad_norm": 1.6125319004058838, "learning_rate": 9.996578864694249e-05, "loss": 0.0199, "step": 500 }, { "epoch": 0.029940119760479042, "grad_norm": 1.3507105112075806, "learning_rate": 9.996405690472852e-05, "loss": 0.0239, "step": 510 }, { "epoch": 0.03052718093225314, "grad_norm": 3.7466275691986084, "learning_rate": 9.996228242419372e-05, "loss": 0.0211, "step": 520 }, { "epoch": 0.03111424210402724, "grad_norm": 1.5872238874435425, "learning_rate": 9.996046520685592e-05, "loss": 0.0338, "step": 530 }, { "epoch": 0.03170130327580134, "grad_norm": 1.883832335472107, "learning_rate": 9.995860525426954e-05, "loss": 0.0154, "step": 540 }, { "epoch": 0.032288364447575435, "grad_norm": 1.1487303972244263, "learning_rate": 9.995670256802554e-05, "loss": 0.0248, "step": 550 }, { "epoch": 0.03287542561934954, "grad_norm": 0.1820375919342041, "learning_rate": 9.995475714975146e-05, "loss": 0.0173, "step": 560 }, { "epoch": 0.03346248679112364, "grad_norm": 0.43540775775909424, "learning_rate": 9.995276900111139e-05, "loss": 0.0198, "step": 570 }, { "epoch": 0.03404954796289773, "grad_norm": 1.9775478839874268, "learning_rate": 9.995073812380594e-05, "loss": 0.0257, "step": 580 }, { "epoch": 0.034636609134671835, "grad_norm": 2.404892683029175, "learning_rate": 9.994866451957225e-05, "loss": 0.0212, "step": 590 }, { "epoch": 0.03522367030644593, "grad_norm": 1.327643871307373, "learning_rate": 9.994654819018408e-05, "loss": 0.019, "step": 600 }, { "epoch": 0.03581073147822003, "grad_norm": 0.5496881604194641, "learning_rate": 9.99443891374517e-05, "loss": 0.0202, "step": 610 }, { "epoch": 0.036397792649994126, "grad_norm": 1.8398231267929077, "learning_rate": 9.99421873632219e-05, "loss": 0.0307, "step": 620 }, { "epoch": 0.03698485382176823, "grad_norm": 2.2247824668884277, "learning_rate": 9.993994286937805e-05, "loss": 0.0275, "step": 630 }, { "epoch": 0.03757191499354233, "grad_norm": 2.914898157119751, "learning_rate": 9.993765565784006e-05, "loss": 0.0211, "step": 640 }, { "epoch": 0.038158976165316424, "grad_norm": 2.512962579727173, "learning_rate": 9.993532573056436e-05, "loss": 0.016, "step": 650 }, { "epoch": 0.038746037337090526, "grad_norm": 3.5847184658050537, "learning_rate": 9.993295308954391e-05, "loss": 0.0274, "step": 660 }, { "epoch": 0.03933309850886462, "grad_norm": 1.5583494901657104, "learning_rate": 9.993053773680823e-05, "loss": 0.0237, "step": 670 }, { "epoch": 0.03992015968063872, "grad_norm": 1.3348017930984497, "learning_rate": 9.992807967442339e-05, "loss": 0.0273, "step": 680 }, { "epoch": 0.040507220852412824, "grad_norm": 0.9014214277267456, "learning_rate": 9.992557890449195e-05, "loss": 0.0182, "step": 690 }, { "epoch": 0.04109428202418692, "grad_norm": 1.2551409006118774, "learning_rate": 9.992303542915302e-05, "loss": 0.0463, "step": 700 }, { "epoch": 0.04168134319596102, "grad_norm": 1.183083415031433, "learning_rate": 9.992044925058224e-05, "loss": 0.0356, "step": 710 }, { "epoch": 0.042268404367735116, "grad_norm": 1.657358169555664, "learning_rate": 9.99178203709918e-05, "loss": 0.0368, "step": 720 }, { "epoch": 0.04285546553950922, "grad_norm": 2.1028082370758057, "learning_rate": 9.991514879263038e-05, "loss": 0.0266, "step": 730 }, { "epoch": 0.04344252671128332, "grad_norm": 3.882103443145752, "learning_rate": 9.991243451778318e-05, "loss": 0.0392, "step": 740 }, { "epoch": 0.044029587883057414, "grad_norm": 11.489930152893066, "learning_rate": 9.990967754877197e-05, "loss": 0.0376, "step": 750 }, { "epoch": 0.044616649054831516, "grad_norm": 1.9137790203094482, "learning_rate": 9.9906877887955e-05, "loss": 0.0156, "step": 760 }, { "epoch": 0.04520371022660561, "grad_norm": 2.4942731857299805, "learning_rate": 9.990403553772704e-05, "loss": 0.034, "step": 770 }, { "epoch": 0.04579077139837971, "grad_norm": 2.418353796005249, "learning_rate": 9.990115050051939e-05, "loss": 0.0255, "step": 780 }, { "epoch": 0.04637783257015381, "grad_norm": 2.717972993850708, "learning_rate": 9.989822277879985e-05, "loss": 0.0201, "step": 790 }, { "epoch": 0.04696489374192791, "grad_norm": 0.6746355891227722, "learning_rate": 9.989525237507276e-05, "loss": 0.0336, "step": 800 }, { "epoch": 0.04755195491370201, "grad_norm": 0.4549112021923065, "learning_rate": 9.989223929187893e-05, "loss": 0.0493, "step": 810 }, { "epoch": 0.048139016085476105, "grad_norm": 4.386491298675537, "learning_rate": 9.988918353179568e-05, "loss": 0.0359, "step": 820 }, { "epoch": 0.04872607725725021, "grad_norm": 2.7425849437713623, "learning_rate": 9.988608509743688e-05, "loss": 0.0363, "step": 830 }, { "epoch": 0.0493131384290243, "grad_norm": 1.0640265941619873, "learning_rate": 9.988294399145285e-05, "loss": 0.0326, "step": 840 }, { "epoch": 0.0499001996007984, "grad_norm": 1.4166111946105957, "learning_rate": 9.987976021653046e-05, "loss": 0.0211, "step": 850 }, { "epoch": 0.050487260772572505, "grad_norm": 2.1535682678222656, "learning_rate": 9.987653377539303e-05, "loss": 0.0284, "step": 860 }, { "epoch": 0.0510743219443466, "grad_norm": 2.582031726837158, "learning_rate": 9.987326467080041e-05, "loss": 0.0428, "step": 870 }, { "epoch": 0.0516613831161207, "grad_norm": 4.288167953491211, "learning_rate": 9.98699529055489e-05, "loss": 0.0263, "step": 880 }, { "epoch": 0.052248444287894796, "grad_norm": 0.8033009171485901, "learning_rate": 9.986659848247135e-05, "loss": 0.0371, "step": 890 }, { "epoch": 0.0528355054596689, "grad_norm": 1.2396597862243652, "learning_rate": 9.986320140443708e-05, "loss": 0.023, "step": 900 }, { "epoch": 0.053422566631443, "grad_norm": 2.016923189163208, "learning_rate": 9.985976167435187e-05, "loss": 0.0465, "step": 910 }, { "epoch": 0.054009627803217095, "grad_norm": 2.335644006729126, "learning_rate": 9.9856279295158e-05, "loss": 0.0285, "step": 920 }, { "epoch": 0.054596688974991196, "grad_norm": 2.3769099712371826, "learning_rate": 9.985275426983425e-05, "loss": 0.0367, "step": 930 }, { "epoch": 0.05518375014676529, "grad_norm": 0.9392242431640625, "learning_rate": 9.984918660139583e-05, "loss": 0.0346, "step": 940 }, { "epoch": 0.05577081131853939, "grad_norm": 0.7598341107368469, "learning_rate": 9.984557629289449e-05, "loss": 0.0449, "step": 950 }, { "epoch": 0.05635787249031349, "grad_norm": 0.39787986874580383, "learning_rate": 9.984192334741839e-05, "loss": 0.0218, "step": 960 }, { "epoch": 0.05694493366208759, "grad_norm": 0.6477892994880676, "learning_rate": 9.98382277680922e-05, "loss": 0.0338, "step": 970 }, { "epoch": 0.05753199483386169, "grad_norm": 2.9608235359191895, "learning_rate": 9.983448955807708e-05, "loss": 0.0146, "step": 980 }, { "epoch": 0.058119056005635786, "grad_norm": 0.14413294196128845, "learning_rate": 9.983070872057059e-05, "loss": 0.0476, "step": 990 }, { "epoch": 0.05870611717740989, "grad_norm": 3.8315868377685547, "learning_rate": 9.982688525880679e-05, "loss": 0.026, "step": 1000 }, { "epoch": 0.05929317834918398, "grad_norm": 1.5634944438934326, "learning_rate": 9.98230191760562e-05, "loss": 0.0377, "step": 1010 }, { "epoch": 0.059880239520958084, "grad_norm": 2.4407360553741455, "learning_rate": 9.981911047562583e-05, "loss": 0.0376, "step": 1020 }, { "epoch": 0.060467300692732186, "grad_norm": 3.7755749225616455, "learning_rate": 9.981515916085906e-05, "loss": 0.0443, "step": 1030 }, { "epoch": 0.06105436186450628, "grad_norm": 3.4814929962158203, "learning_rate": 9.981116523513579e-05, "loss": 0.032, "step": 1040 }, { "epoch": 0.06164142303628038, "grad_norm": 2.6388165950775146, "learning_rate": 9.980712870187236e-05, "loss": 0.0653, "step": 1050 }, { "epoch": 0.06222848420805448, "grad_norm": 0.9947832226753235, "learning_rate": 9.980304956452153e-05, "loss": 0.0223, "step": 1060 }, { "epoch": 0.06281554537982857, "grad_norm": 2.903304100036621, "learning_rate": 9.979892782657253e-05, "loss": 0.0509, "step": 1070 }, { "epoch": 0.06340260655160268, "grad_norm": 2.6675798892974854, "learning_rate": 9.9794763491551e-05, "loss": 0.0424, "step": 1080 }, { "epoch": 0.06398966772337678, "grad_norm": 2.005167007446289, "learning_rate": 9.979055656301905e-05, "loss": 0.0258, "step": 1090 }, { "epoch": 0.06457672889515087, "grad_norm": 1.8828363418579102, "learning_rate": 9.978630704457521e-05, "loss": 0.0478, "step": 1100 }, { "epoch": 0.06516379006692498, "grad_norm": 0.378522664308548, "learning_rate": 9.978201493985444e-05, "loss": 0.0209, "step": 1110 }, { "epoch": 0.06575085123869907, "grad_norm": 0.7414391040802002, "learning_rate": 9.97776802525281e-05, "loss": 0.0409, "step": 1120 }, { "epoch": 0.06633791241047317, "grad_norm": 1.4679638147354126, "learning_rate": 9.977330298630402e-05, "loss": 0.0295, "step": 1130 }, { "epoch": 0.06692497358224728, "grad_norm": 0.9425148963928223, "learning_rate": 9.976888314492644e-05, "loss": 0.0472, "step": 1140 }, { "epoch": 0.06751203475402137, "grad_norm": 0.8603097200393677, "learning_rate": 9.9764420732176e-05, "loss": 0.0381, "step": 1150 }, { "epoch": 0.06809909592579547, "grad_norm": 1.1279311180114746, "learning_rate": 9.975991575186977e-05, "loss": 0.0532, "step": 1160 }, { "epoch": 0.06868615709756956, "grad_norm": 1.5424326658248901, "learning_rate": 9.97553682078612e-05, "loss": 0.0275, "step": 1170 }, { "epoch": 0.06927321826934367, "grad_norm": 1.0373824834823608, "learning_rate": 9.975077810404021e-05, "loss": 0.0511, "step": 1180 }, { "epoch": 0.06986027944111776, "grad_norm": 0.08669783174991608, "learning_rate": 9.974614544433307e-05, "loss": 0.0148, "step": 1190 }, { "epoch": 0.07044734061289186, "grad_norm": 2.591121196746826, "learning_rate": 9.974147023270249e-05, "loss": 0.0343, "step": 1200 }, { "epoch": 0.07103440178466597, "grad_norm": 2.7418529987335205, "learning_rate": 9.973675247314753e-05, "loss": 0.0391, "step": 1210 }, { "epoch": 0.07162146295644006, "grad_norm": 1.4994189739227295, "learning_rate": 9.973199216970368e-05, "loss": 0.0432, "step": 1220 }, { "epoch": 0.07220852412821416, "grad_norm": 3.171295166015625, "learning_rate": 9.972718932644283e-05, "loss": 0.0228, "step": 1230 }, { "epoch": 0.07279558529998825, "grad_norm": 1.6535285711288452, "learning_rate": 9.972234394747324e-05, "loss": 0.0222, "step": 1240 }, { "epoch": 0.07338264647176236, "grad_norm": 1.2440599203109741, "learning_rate": 9.971745603693956e-05, "loss": 0.0239, "step": 1250 }, { "epoch": 0.07396970764353646, "grad_norm": 0.9509127140045166, "learning_rate": 9.971252559902277e-05, "loss": 0.0236, "step": 1260 }, { "epoch": 0.07455676881531055, "grad_norm": 2.0213236808776855, "learning_rate": 9.970755263794035e-05, "loss": 0.0441, "step": 1270 }, { "epoch": 0.07514382998708466, "grad_norm": 2.064988851547241, "learning_rate": 9.970253715794603e-05, "loss": 0.0308, "step": 1280 }, { "epoch": 0.07573089115885875, "grad_norm": 3.5013694763183594, "learning_rate": 9.969747916332996e-05, "loss": 0.0298, "step": 1290 }, { "epoch": 0.07631795233063285, "grad_norm": 1.9559962749481201, "learning_rate": 9.969237865841867e-05, "loss": 0.0224, "step": 1300 }, { "epoch": 0.07690501350240696, "grad_norm": 0.7560620307922363, "learning_rate": 9.968723564757503e-05, "loss": 0.0356, "step": 1310 }, { "epoch": 0.07749207467418105, "grad_norm": 5.014816761016846, "learning_rate": 9.968205013519826e-05, "loss": 0.0483, "step": 1320 }, { "epoch": 0.07807913584595515, "grad_norm": 3.6365370750427246, "learning_rate": 9.967682212572398e-05, "loss": 0.0227, "step": 1330 }, { "epoch": 0.07866619701772924, "grad_norm": 1.2204110622406006, "learning_rate": 9.967155162362413e-05, "loss": 0.0604, "step": 1340 }, { "epoch": 0.07925325818950335, "grad_norm": 4.171288967132568, "learning_rate": 9.966623863340696e-05, "loss": 0.0326, "step": 1350 }, { "epoch": 0.07984031936127745, "grad_norm": 1.4878485202789307, "learning_rate": 9.966088315961715e-05, "loss": 0.0336, "step": 1360 }, { "epoch": 0.08042738053305154, "grad_norm": 2.7584917545318604, "learning_rate": 9.965548520683563e-05, "loss": 0.0267, "step": 1370 }, { "epoch": 0.08101444170482565, "grad_norm": 0.9514058828353882, "learning_rate": 9.965004477967974e-05, "loss": 0.0412, "step": 1380 }, { "epoch": 0.08160150287659974, "grad_norm": 3.680694103240967, "learning_rate": 9.964456188280311e-05, "loss": 0.0243, "step": 1390 }, { "epoch": 0.08218856404837384, "grad_norm": 0.7756885886192322, "learning_rate": 9.96390365208957e-05, "loss": 0.0221, "step": 1400 }, { "epoch": 0.08277562522014793, "grad_norm": 1.2462010383605957, "learning_rate": 9.96334686986838e-05, "loss": 0.0475, "step": 1410 }, { "epoch": 0.08336268639192204, "grad_norm": 1.963977336883545, "learning_rate": 9.962785842093003e-05, "loss": 0.0259, "step": 1420 }, { "epoch": 0.08394974756369614, "grad_norm": 5.26522159576416, "learning_rate": 9.962220569243332e-05, "loss": 0.0441, "step": 1430 }, { "epoch": 0.08453680873547023, "grad_norm": 2.0117955207824707, "learning_rate": 9.961651051802891e-05, "loss": 0.0552, "step": 1440 }, { "epoch": 0.08512386990724434, "grad_norm": 4.85704231262207, "learning_rate": 9.961077290258833e-05, "loss": 0.0495, "step": 1450 }, { "epoch": 0.08571093107901843, "grad_norm": 2.6156978607177734, "learning_rate": 9.960499285101945e-05, "loss": 0.0156, "step": 1460 }, { "epoch": 0.08629799225079253, "grad_norm": 0.8362844586372375, "learning_rate": 9.95991703682664e-05, "loss": 0.0355, "step": 1470 }, { "epoch": 0.08688505342256664, "grad_norm": 1.7795151472091675, "learning_rate": 9.959330545930963e-05, "loss": 0.0218, "step": 1480 }, { "epoch": 0.08747211459434073, "grad_norm": 1.2182948589324951, "learning_rate": 9.958739812916586e-05, "loss": 0.0325, "step": 1490 }, { "epoch": 0.08805917576611483, "grad_norm": 2.649080991744995, "learning_rate": 9.958144838288814e-05, "loss": 0.0214, "step": 1500 }, { "epoch": 0.08864623693788892, "grad_norm": 2.225630521774292, "learning_rate": 9.957545622556574e-05, "loss": 0.0234, "step": 1510 }, { "epoch": 0.08923329810966303, "grad_norm": 1.3121471405029297, "learning_rate": 9.956942166232427e-05, "loss": 0.0368, "step": 1520 }, { "epoch": 0.08982035928143713, "grad_norm": 6.808068752288818, "learning_rate": 9.956334469832556e-05, "loss": 0.0569, "step": 1530 }, { "epoch": 0.09040742045321122, "grad_norm": 2.119955062866211, "learning_rate": 9.955722533876773e-05, "loss": 0.0414, "step": 1540 }, { "epoch": 0.09099448162498533, "grad_norm": 1.2916687726974487, "learning_rate": 9.955106358888517e-05, "loss": 0.031, "step": 1550 }, { "epoch": 0.09158154279675942, "grad_norm": 2.2134711742401123, "learning_rate": 9.954485945394856e-05, "loss": 0.0469, "step": 1560 }, { "epoch": 0.09216860396853352, "grad_norm": 1.7012152671813965, "learning_rate": 9.953861293926474e-05, "loss": 0.0271, "step": 1570 }, { "epoch": 0.09275566514030761, "grad_norm": 2.5345096588134766, "learning_rate": 9.95323240501769e-05, "loss": 0.0338, "step": 1580 }, { "epoch": 0.09334272631208172, "grad_norm": 1.7120459079742432, "learning_rate": 9.952599279206444e-05, "loss": 0.0317, "step": 1590 }, { "epoch": 0.09392978748385582, "grad_norm": 0.7913612127304077, "learning_rate": 9.951961917034299e-05, "loss": 0.0419, "step": 1600 }, { "epoch": 0.09451684865562991, "grad_norm": 1.9415456056594849, "learning_rate": 9.951320319046442e-05, "loss": 0.036, "step": 1610 }, { "epoch": 0.09510390982740402, "grad_norm": 2.7447025775909424, "learning_rate": 9.950674485791685e-05, "loss": 0.0334, "step": 1620 }, { "epoch": 0.09569097099917812, "grad_norm": 1.6320359706878662, "learning_rate": 9.950024417822462e-05, "loss": 0.0343, "step": 1630 }, { "epoch": 0.09627803217095221, "grad_norm": 2.6058413982391357, "learning_rate": 9.949370115694827e-05, "loss": 0.052, "step": 1640 }, { "epoch": 0.09686509334272632, "grad_norm": 1.936697244644165, "learning_rate": 9.94871157996846e-05, "loss": 0.0201, "step": 1650 }, { "epoch": 0.09745215451450041, "grad_norm": 0.7421324253082275, "learning_rate": 9.948048811206658e-05, "loss": 0.0302, "step": 1660 }, { "epoch": 0.09803921568627451, "grad_norm": 1.3068259954452515, "learning_rate": 9.947381809976344e-05, "loss": 0.0323, "step": 1670 }, { "epoch": 0.0986262768580486, "grad_norm": 1.5505841970443726, "learning_rate": 9.946710576848058e-05, "loss": 0.027, "step": 1680 }, { "epoch": 0.09921333802982271, "grad_norm": 2.2337491512298584, "learning_rate": 9.946035112395958e-05, "loss": 0.0259, "step": 1690 }, { "epoch": 0.0998003992015968, "grad_norm": 1.8403325080871582, "learning_rate": 9.945355417197824e-05, "loss": 0.0245, "step": 1700 }, { "epoch": 0.1003874603733709, "grad_norm": 3.5541958808898926, "learning_rate": 9.944671491835056e-05, "loss": 0.0268, "step": 1710 }, { "epoch": 0.10097452154514501, "grad_norm": 2.1355576515197754, "learning_rate": 9.943983336892669e-05, "loss": 0.0351, "step": 1720 }, { "epoch": 0.1015615827169191, "grad_norm": 1.5419174432754517, "learning_rate": 9.9432909529593e-05, "loss": 0.0297, "step": 1730 }, { "epoch": 0.1021486438886932, "grad_norm": 1.242248773574829, "learning_rate": 9.9425943406272e-05, "loss": 0.0346, "step": 1740 }, { "epoch": 0.1027357050604673, "grad_norm": 1.4329538345336914, "learning_rate": 9.941893500492241e-05, "loss": 0.0516, "step": 1750 }, { "epoch": 0.1033227662322414, "grad_norm": 1.0378223657608032, "learning_rate": 9.941188433153904e-05, "loss": 0.0181, "step": 1760 }, { "epoch": 0.1039098274040155, "grad_norm": 2.057999849319458, "learning_rate": 9.940479139215293e-05, "loss": 0.0407, "step": 1770 }, { "epoch": 0.10449688857578959, "grad_norm": 1.731401801109314, "learning_rate": 9.939765619283124e-05, "loss": 0.0575, "step": 1780 }, { "epoch": 0.1050839497475637, "grad_norm": 2.289888858795166, "learning_rate": 9.93904787396773e-05, "loss": 0.0222, "step": 1790 }, { "epoch": 0.1056710109193378, "grad_norm": 3.2877893447875977, "learning_rate": 9.938325903883055e-05, "loss": 0.1163, "step": 1800 }, { "epoch": 0.10625807209111189, "grad_norm": 0.9616751074790955, "learning_rate": 9.937599709646661e-05, "loss": 0.0686, "step": 1810 }, { "epoch": 0.106845133262886, "grad_norm": 0.8145966529846191, "learning_rate": 9.936869291879718e-05, "loss": 0.0317, "step": 1820 }, { "epoch": 0.1074321944346601, "grad_norm": 1.424609661102295, "learning_rate": 9.936134651207015e-05, "loss": 0.0287, "step": 1830 }, { "epoch": 0.10801925560643419, "grad_norm": 0.43495574593544006, "learning_rate": 9.935395788256947e-05, "loss": 0.0215, "step": 1840 }, { "epoch": 0.10860631677820828, "grad_norm": 2.1051337718963623, "learning_rate": 9.934652703661527e-05, "loss": 0.0321, "step": 1850 }, { "epoch": 0.10919337794998239, "grad_norm": 1.501328706741333, "learning_rate": 9.933905398056372e-05, "loss": 0.0225, "step": 1860 }, { "epoch": 0.10978043912175649, "grad_norm": 2.1189637184143066, "learning_rate": 9.933153872080714e-05, "loss": 0.0581, "step": 1870 }, { "epoch": 0.11036750029353058, "grad_norm": 2.349695920944214, "learning_rate": 9.932398126377396e-05, "loss": 0.0331, "step": 1880 }, { "epoch": 0.11095456146530469, "grad_norm": 2.785264015197754, "learning_rate": 9.931638161592867e-05, "loss": 0.0227, "step": 1890 }, { "epoch": 0.11154162263707879, "grad_norm": 1.9872111082077026, "learning_rate": 9.930873978377187e-05, "loss": 0.0247, "step": 1900 }, { "epoch": 0.11212868380885288, "grad_norm": 2.1452584266662598, "learning_rate": 9.930105577384026e-05, "loss": 0.0475, "step": 1910 }, { "epoch": 0.11271574498062698, "grad_norm": 3.118792772293091, "learning_rate": 9.929332959270659e-05, "loss": 0.0546, "step": 1920 }, { "epoch": 0.11330280615240108, "grad_norm": 1.63677978515625, "learning_rate": 9.928556124697967e-05, "loss": 0.042, "step": 1930 }, { "epoch": 0.11388986732417518, "grad_norm": 4.233394622802734, "learning_rate": 9.927775074330441e-05, "loss": 0.0424, "step": 1940 }, { "epoch": 0.11447692849594927, "grad_norm": 1.0141804218292236, "learning_rate": 9.926989808836178e-05, "loss": 0.0529, "step": 1950 }, { "epoch": 0.11506398966772338, "grad_norm": 1.6904869079589844, "learning_rate": 9.926200328886878e-05, "loss": 0.0296, "step": 1960 }, { "epoch": 0.11565105083949748, "grad_norm": 3.563488721847534, "learning_rate": 9.92540663515785e-05, "loss": 0.0413, "step": 1970 }, { "epoch": 0.11623811201127157, "grad_norm": 2.9499549865722656, "learning_rate": 9.924608728328001e-05, "loss": 0.0411, "step": 1980 }, { "epoch": 0.11682517318304568, "grad_norm": 0.8236364722251892, "learning_rate": 9.923806609079847e-05, "loss": 0.039, "step": 1990 }, { "epoch": 0.11741223435481977, "grad_norm": 4.066677570343018, "learning_rate": 9.923000278099508e-05, "loss": 0.0514, "step": 2000 }, { "epoch": 0.11799929552659387, "grad_norm": 5.0400390625, "learning_rate": 9.922189736076701e-05, "loss": 0.057, "step": 2010 }, { "epoch": 0.11858635669836796, "grad_norm": 2.4912304878234863, "learning_rate": 9.921374983704752e-05, "loss": 0.0258, "step": 2020 }, { "epoch": 0.11917341787014207, "grad_norm": 2.3315348625183105, "learning_rate": 9.92055602168058e-05, "loss": 0.0773, "step": 2030 }, { "epoch": 0.11976047904191617, "grad_norm": 2.5812909603118896, "learning_rate": 9.919732850704716e-05, "loss": 0.0425, "step": 2040 }, { "epoch": 0.12034754021369026, "grad_norm": 1.1976637840270996, "learning_rate": 9.918905471481281e-05, "loss": 0.0487, "step": 2050 }, { "epoch": 0.12093460138546437, "grad_norm": 1.7439115047454834, "learning_rate": 9.918073884718e-05, "loss": 0.0425, "step": 2060 }, { "epoch": 0.12152166255723847, "grad_norm": 2.745136022567749, "learning_rate": 9.917238091126198e-05, "loss": 0.0374, "step": 2070 }, { "epoch": 0.12210872372901256, "grad_norm": 4.099924564361572, "learning_rate": 9.916398091420797e-05, "loss": 0.0504, "step": 2080 }, { "epoch": 0.12269578490078666, "grad_norm": 2.8515045642852783, "learning_rate": 9.915553886320317e-05, "loss": 0.0246, "step": 2090 }, { "epoch": 0.12328284607256076, "grad_norm": 1.8732186555862427, "learning_rate": 9.914705476546875e-05, "loss": 0.0653, "step": 2100 }, { "epoch": 0.12386990724433486, "grad_norm": 2.660759925842285, "learning_rate": 9.913852862826185e-05, "loss": 0.0244, "step": 2110 }, { "epoch": 0.12445696841610895, "grad_norm": 2.177604913711548, "learning_rate": 9.912996045887556e-05, "loss": 0.0488, "step": 2120 }, { "epoch": 0.12504402958788305, "grad_norm": 1.5271365642547607, "learning_rate": 9.912135026463895e-05, "loss": 0.0495, "step": 2130 }, { "epoch": 0.12563109075965714, "grad_norm": 0.54610276222229, "learning_rate": 9.911269805291699e-05, "loss": 0.0471, "step": 2140 }, { "epoch": 0.12621815193143127, "grad_norm": 0.24762262403964996, "learning_rate": 9.910400383111067e-05, "loss": 0.029, "step": 2150 }, { "epoch": 0.12680521310320536, "grad_norm": 1.192949891090393, "learning_rate": 9.909526760665682e-05, "loss": 0.0397, "step": 2160 }, { "epoch": 0.12739227427497946, "grad_norm": 1.758433222770691, "learning_rate": 9.908648938702825e-05, "loss": 0.0719, "step": 2170 }, { "epoch": 0.12797933544675355, "grad_norm": 1.0003618001937866, "learning_rate": 9.90776691797337e-05, "loss": 0.0394, "step": 2180 }, { "epoch": 0.12856639661852765, "grad_norm": 0.9429585337638855, "learning_rate": 9.90688069923178e-05, "loss": 0.0471, "step": 2190 }, { "epoch": 0.12915345779030174, "grad_norm": 1.6162910461425781, "learning_rate": 9.90599028323611e-05, "loss": 0.0622, "step": 2200 }, { "epoch": 0.12974051896207583, "grad_norm": 2.433619499206543, "learning_rate": 9.905095670748005e-05, "loss": 0.047, "step": 2210 }, { "epoch": 0.13032758013384996, "grad_norm": 2.3690948486328125, "learning_rate": 9.904196862532702e-05, "loss": 0.0322, "step": 2220 }, { "epoch": 0.13091464130562405, "grad_norm": 3.2311367988586426, "learning_rate": 9.903293859359023e-05, "loss": 0.051, "step": 2230 }, { "epoch": 0.13150170247739815, "grad_norm": 2.1195015907287598, "learning_rate": 9.902386661999379e-05, "loss": 0.034, "step": 2240 }, { "epoch": 0.13208876364917224, "grad_norm": 1.6307891607284546, "learning_rate": 9.901475271229772e-05, "loss": 0.0544, "step": 2250 }, { "epoch": 0.13267582482094634, "grad_norm": 1.770676612854004, "learning_rate": 9.900559687829786e-05, "loss": 0.0434, "step": 2260 }, { "epoch": 0.13326288599272043, "grad_norm": 2.920513391494751, "learning_rate": 9.899639912582596e-05, "loss": 0.0292, "step": 2270 }, { "epoch": 0.13384994716449455, "grad_norm": 1.8063915967941284, "learning_rate": 9.89871594627496e-05, "loss": 0.0362, "step": 2280 }, { "epoch": 0.13443700833626865, "grad_norm": 3.2409045696258545, "learning_rate": 9.897787789697221e-05, "loss": 0.0651, "step": 2290 }, { "epoch": 0.13502406950804274, "grad_norm": 0.9920338988304138, "learning_rate": 9.896855443643308e-05, "loss": 0.0244, "step": 2300 }, { "epoch": 0.13561113067981684, "grad_norm": 2.6896822452545166, "learning_rate": 9.895918908910731e-05, "loss": 0.0348, "step": 2310 }, { "epoch": 0.13619819185159093, "grad_norm": 3.40159273147583, "learning_rate": 9.894978186300585e-05, "loss": 0.0359, "step": 2320 }, { "epoch": 0.13678525302336503, "grad_norm": 2.5290367603302, "learning_rate": 9.894033276617547e-05, "loss": 0.0472, "step": 2330 }, { "epoch": 0.13737231419513912, "grad_norm": 1.901503562927246, "learning_rate": 9.893084180669873e-05, "loss": 0.0437, "step": 2340 }, { "epoch": 0.13795937536691324, "grad_norm": 1.9427504539489746, "learning_rate": 9.892130899269405e-05, "loss": 0.0403, "step": 2350 }, { "epoch": 0.13854643653868734, "grad_norm": 0.3844375014305115, "learning_rate": 9.891173433231559e-05, "loss": 0.0285, "step": 2360 }, { "epoch": 0.13913349771046143, "grad_norm": 2.4379961490631104, "learning_rate": 9.890211783375338e-05, "loss": 0.0438, "step": 2370 }, { "epoch": 0.13972055888223553, "grad_norm": 1.542114019393921, "learning_rate": 9.889245950523315e-05, "loss": 0.0357, "step": 2380 }, { "epoch": 0.14030762005400962, "grad_norm": 3.4859018325805664, "learning_rate": 9.888275935501647e-05, "loss": 0.0497, "step": 2390 }, { "epoch": 0.14089468122578372, "grad_norm": 0.8069745898246765, "learning_rate": 9.887301739140066e-05, "loss": 0.0365, "step": 2400 }, { "epoch": 0.1414817423975578, "grad_norm": 6.08782434463501, "learning_rate": 9.886323362271882e-05, "loss": 0.0764, "step": 2410 }, { "epoch": 0.14206880356933194, "grad_norm": 2.0983567237854004, "learning_rate": 9.88534080573398e-05, "loss": 0.0562, "step": 2420 }, { "epoch": 0.14265586474110603, "grad_norm": 0.8409938812255859, "learning_rate": 9.884354070366822e-05, "loss": 0.0368, "step": 2430 }, { "epoch": 0.14324292591288013, "grad_norm": 2.17271089553833, "learning_rate": 9.883363157014442e-05, "loss": 0.024, "step": 2440 }, { "epoch": 0.14382998708465422, "grad_norm": 1.280503273010254, "learning_rate": 9.882368066524448e-05, "loss": 0.0247, "step": 2450 }, { "epoch": 0.14441704825642832, "grad_norm": 2.7436742782592773, "learning_rate": 9.881368799748021e-05, "loss": 0.0458, "step": 2460 }, { "epoch": 0.1450041094282024, "grad_norm": 2.1065752506256104, "learning_rate": 9.880365357539917e-05, "loss": 0.0557, "step": 2470 }, { "epoch": 0.1455911705999765, "grad_norm": 2.5970640182495117, "learning_rate": 9.879357740758462e-05, "loss": 0.0661, "step": 2480 }, { "epoch": 0.14617823177175063, "grad_norm": 2.9739792346954346, "learning_rate": 9.878345950265552e-05, "loss": 0.0725, "step": 2490 }, { "epoch": 0.14676529294352472, "grad_norm": 3.2631781101226807, "learning_rate": 9.877329986926653e-05, "loss": 0.043, "step": 2500 }, { "epoch": 0.14735235411529882, "grad_norm": 2.616386651992798, "learning_rate": 9.876309851610801e-05, "loss": 0.0345, "step": 2510 }, { "epoch": 0.1479394152870729, "grad_norm": 2.065317153930664, "learning_rate": 9.875285545190603e-05, "loss": 0.0458, "step": 2520 }, { "epoch": 0.148526476458847, "grad_norm": 0.7852377891540527, "learning_rate": 9.874257068542227e-05, "loss": 0.0303, "step": 2530 }, { "epoch": 0.1491135376306211, "grad_norm": 1.7618194818496704, "learning_rate": 9.873224422545417e-05, "loss": 0.0558, "step": 2540 }, { "epoch": 0.1497005988023952, "grad_norm": 1.0067861080169678, "learning_rate": 9.872187608083478e-05, "loss": 0.0234, "step": 2550 }, { "epoch": 0.15028765997416932, "grad_norm": 5.33658504486084, "learning_rate": 9.871146626043282e-05, "loss": 0.0451, "step": 2560 }, { "epoch": 0.1508747211459434, "grad_norm": 1.3059951066970825, "learning_rate": 9.870101477315263e-05, "loss": 0.0463, "step": 2570 }, { "epoch": 0.1514617823177175, "grad_norm": 1.686173677444458, "learning_rate": 9.869052162793424e-05, "loss": 0.0425, "step": 2580 }, { "epoch": 0.1520488434894916, "grad_norm": 2.3160104751586914, "learning_rate": 9.867998683375329e-05, "loss": 0.0555, "step": 2590 }, { "epoch": 0.1526359046612657, "grad_norm": 0.8404874205589294, "learning_rate": 9.866941039962104e-05, "loss": 0.0361, "step": 2600 }, { "epoch": 0.1532229658330398, "grad_norm": 2.111457109451294, "learning_rate": 9.865879233458438e-05, "loss": 0.0263, "step": 2610 }, { "epoch": 0.15381002700481392, "grad_norm": 3.2856898307800293, "learning_rate": 9.86481326477258e-05, "loss": 0.0476, "step": 2620 }, { "epoch": 0.154397088176588, "grad_norm": 0.6448665857315063, "learning_rate": 9.863743134816342e-05, "loss": 0.0428, "step": 2630 }, { "epoch": 0.1549841493483621, "grad_norm": 2.341824531555176, "learning_rate": 9.862668844505087e-05, "loss": 0.0518, "step": 2640 }, { "epoch": 0.1555712105201362, "grad_norm": 3.485743761062622, "learning_rate": 9.86159039475775e-05, "loss": 0.0454, "step": 2650 }, { "epoch": 0.1561582716919103, "grad_norm": 1.6375038623809814, "learning_rate": 9.86050778649681e-05, "loss": 0.0438, "step": 2660 }, { "epoch": 0.1567453328636844, "grad_norm": 4.4719672203063965, "learning_rate": 9.859421020648317e-05, "loss": 0.0361, "step": 2670 }, { "epoch": 0.15733239403545848, "grad_norm": 1.005717396736145, "learning_rate": 9.858330098141866e-05, "loss": 0.0367, "step": 2680 }, { "epoch": 0.1579194552072326, "grad_norm": 1.5214030742645264, "learning_rate": 9.857235019910611e-05, "loss": 0.0365, "step": 2690 }, { "epoch": 0.1585065163790067, "grad_norm": 2.988581895828247, "learning_rate": 9.856135786891265e-05, "loss": 0.0521, "step": 2700 }, { "epoch": 0.1590935775507808, "grad_norm": 2.4597716331481934, "learning_rate": 9.855032400024089e-05, "loss": 0.0311, "step": 2710 }, { "epoch": 0.1596806387225549, "grad_norm": 1.8115123510360718, "learning_rate": 9.853924860252898e-05, "loss": 0.0458, "step": 2720 }, { "epoch": 0.16026769989432899, "grad_norm": 2.618330955505371, "learning_rate": 9.852813168525064e-05, "loss": 0.0295, "step": 2730 }, { "epoch": 0.16085476106610308, "grad_norm": 1.6817182302474976, "learning_rate": 9.851697325791505e-05, "loss": 0.03, "step": 2740 }, { "epoch": 0.16144182223787718, "grad_norm": 1.044686198234558, "learning_rate": 9.850577333006693e-05, "loss": 0.0393, "step": 2750 }, { "epoch": 0.1620288834096513, "grad_norm": 4.514500141143799, "learning_rate": 9.84945319112865e-05, "loss": 0.0397, "step": 2760 }, { "epoch": 0.1626159445814254, "grad_norm": 1.7709953784942627, "learning_rate": 9.848324901118943e-05, "loss": 0.0324, "step": 2770 }, { "epoch": 0.1632030057531995, "grad_norm": 3.2024428844451904, "learning_rate": 9.847192463942694e-05, "loss": 0.0369, "step": 2780 }, { "epoch": 0.16379006692497358, "grad_norm": 2.4505691528320312, "learning_rate": 9.846055880568566e-05, "loss": 0.0494, "step": 2790 }, { "epoch": 0.16437712809674768, "grad_norm": 0.6586551666259766, "learning_rate": 9.844915151968773e-05, "loss": 0.0284, "step": 2800 }, { "epoch": 0.16496418926852177, "grad_norm": 1.8783758878707886, "learning_rate": 9.843770279119069e-05, "loss": 0.0451, "step": 2810 }, { "epoch": 0.16555125044029587, "grad_norm": 1.4282265901565552, "learning_rate": 9.842621262998761e-05, "loss": 0.0307, "step": 2820 }, { "epoch": 0.16613831161207, "grad_norm": 1.9189605712890625, "learning_rate": 9.841468104590695e-05, "loss": 0.0483, "step": 2830 }, { "epoch": 0.16672537278384408, "grad_norm": 1.8889552354812622, "learning_rate": 9.840310804881261e-05, "loss": 0.066, "step": 2840 }, { "epoch": 0.16731243395561818, "grad_norm": 3.8601179122924805, "learning_rate": 9.839149364860389e-05, "loss": 0.0332, "step": 2850 }, { "epoch": 0.16789949512739227, "grad_norm": 1.879103660583496, "learning_rate": 9.837983785521559e-05, "loss": 0.056, "step": 2860 }, { "epoch": 0.16848655629916637, "grad_norm": 2.205148696899414, "learning_rate": 9.83681406786178e-05, "loss": 0.0375, "step": 2870 }, { "epoch": 0.16907361747094046, "grad_norm": 1.0379294157028198, "learning_rate": 9.835640212881608e-05, "loss": 0.0265, "step": 2880 }, { "epoch": 0.16966067864271456, "grad_norm": 3.219978094100952, "learning_rate": 9.834462221585139e-05, "loss": 0.0234, "step": 2890 }, { "epoch": 0.17024773981448868, "grad_norm": 3.693683624267578, "learning_rate": 9.833280094980002e-05, "loss": 0.0869, "step": 2900 }, { "epoch": 0.17083480098626277, "grad_norm": 2.207987070083618, "learning_rate": 9.832093834077367e-05, "loss": 0.045, "step": 2910 }, { "epoch": 0.17142186215803687, "grad_norm": 0.669643759727478, "learning_rate": 9.83090343989194e-05, "loss": 0.0302, "step": 2920 }, { "epoch": 0.17200892332981096, "grad_norm": 2.841738700866699, "learning_rate": 9.829708913441962e-05, "loss": 0.0455, "step": 2930 }, { "epoch": 0.17259598450158506, "grad_norm": 2.0086352825164795, "learning_rate": 9.828510255749208e-05, "loss": 0.0255, "step": 2940 }, { "epoch": 0.17318304567335915, "grad_norm": 1.1377476453781128, "learning_rate": 9.827307467838987e-05, "loss": 0.0432, "step": 2950 }, { "epoch": 0.17377010684513328, "grad_norm": 2.0405306816101074, "learning_rate": 9.826100550740143e-05, "loss": 0.0498, "step": 2960 }, { "epoch": 0.17435716801690737, "grad_norm": 1.9128247499465942, "learning_rate": 9.824889505485048e-05, "loss": 0.0223, "step": 2970 }, { "epoch": 0.17494422918868147, "grad_norm": 2.2797882556915283, "learning_rate": 9.823674333109608e-05, "loss": 0.0663, "step": 2980 }, { "epoch": 0.17553129036045556, "grad_norm": 1.8146817684173584, "learning_rate": 9.82245503465326e-05, "loss": 0.0255, "step": 2990 }, { "epoch": 0.17611835153222966, "grad_norm": 0.6516047716140747, "learning_rate": 9.821231611158969e-05, "loss": 0.0304, "step": 3000 }, { "epoch": 0.17611835153222966, "eval_loss": 0.4754152297973633, "eval_runtime": 269.6361, "eval_samples_per_second": 3.505, "eval_steps_per_second": 3.505, "step": 3000 }, { "epoch": 0.17670541270400375, "grad_norm": 0.47869399189949036, "learning_rate": 9.820004063673228e-05, "loss": 0.033, "step": 3010 }, { "epoch": 0.17729247387577785, "grad_norm": 2.7709405422210693, "learning_rate": 9.818772393246058e-05, "loss": 0.0433, "step": 3020 }, { "epoch": 0.17787953504755197, "grad_norm": 0.6569466590881348, "learning_rate": 9.817536600931007e-05, "loss": 0.0356, "step": 3030 }, { "epoch": 0.17846659621932606, "grad_norm": 2.474756956100464, "learning_rate": 9.81629668778515e-05, "loss": 0.0626, "step": 3040 }, { "epoch": 0.17905365739110016, "grad_norm": 3.932004451751709, "learning_rate": 9.815052654869084e-05, "loss": 0.0589, "step": 3050 }, { "epoch": 0.17964071856287425, "grad_norm": 1.8189294338226318, "learning_rate": 9.813804503246932e-05, "loss": 0.0407, "step": 3060 }, { "epoch": 0.18022777973464835, "grad_norm": 1.9111417531967163, "learning_rate": 9.812552233986338e-05, "loss": 0.0389, "step": 3070 }, { "epoch": 0.18081484090642244, "grad_norm": 3.685927629470825, "learning_rate": 9.811295848158472e-05, "loss": 0.0425, "step": 3080 }, { "epoch": 0.18140190207819654, "grad_norm": 2.441706895828247, "learning_rate": 9.810035346838023e-05, "loss": 0.0436, "step": 3090 }, { "epoch": 0.18198896324997066, "grad_norm": 1.8486565351486206, "learning_rate": 9.8087707311032e-05, "loss": 0.05, "step": 3100 }, { "epoch": 0.18257602442174475, "grad_norm": 0.3728896379470825, "learning_rate": 9.807502002035729e-05, "loss": 0.0323, "step": 3110 }, { "epoch": 0.18316308559351885, "grad_norm": 1.8062225580215454, "learning_rate": 9.80622916072086e-05, "loss": 0.0199, "step": 3120 }, { "epoch": 0.18375014676529294, "grad_norm": 0.7379009127616882, "learning_rate": 9.804952208247358e-05, "loss": 0.0365, "step": 3130 }, { "epoch": 0.18433720793706704, "grad_norm": 2.159383773803711, "learning_rate": 9.803671145707502e-05, "loss": 0.0508, "step": 3140 }, { "epoch": 0.18492426910884113, "grad_norm": 1.8557137250900269, "learning_rate": 9.80238597419709e-05, "loss": 0.0744, "step": 3150 }, { "epoch": 0.18551133028061523, "grad_norm": 0.7379667162895203, "learning_rate": 9.801096694815435e-05, "loss": 0.0436, "step": 3160 }, { "epoch": 0.18609839145238935, "grad_norm": 2.2723872661590576, "learning_rate": 9.799803308665362e-05, "loss": 0.0698, "step": 3170 }, { "epoch": 0.18668545262416344, "grad_norm": 2.0510456562042236, "learning_rate": 9.798505816853208e-05, "loss": 0.0361, "step": 3180 }, { "epoch": 0.18727251379593754, "grad_norm": 1.9840871095657349, "learning_rate": 9.797204220488823e-05, "loss": 0.0456, "step": 3190 }, { "epoch": 0.18785957496771163, "grad_norm": 0.5295336246490479, "learning_rate": 9.795898520685569e-05, "loss": 0.069, "step": 3200 }, { "epoch": 0.18844663613948573, "grad_norm": 1.9793596267700195, "learning_rate": 9.794588718560319e-05, "loss": 0.076, "step": 3210 }, { "epoch": 0.18903369731125982, "grad_norm": 1.951541781425476, "learning_rate": 9.793274815233451e-05, "loss": 0.0556, "step": 3220 }, { "epoch": 0.18962075848303392, "grad_norm": 2.235807418823242, "learning_rate": 9.791956811828855e-05, "loss": 0.0489, "step": 3230 }, { "epoch": 0.19020781965480804, "grad_norm": 2.71808123588562, "learning_rate": 9.790634709473924e-05, "loss": 0.0408, "step": 3240 }, { "epoch": 0.19079488082658214, "grad_norm": 1.706302285194397, "learning_rate": 9.789308509299562e-05, "loss": 0.0242, "step": 3250 }, { "epoch": 0.19138194199835623, "grad_norm": 2.558588981628418, "learning_rate": 9.787978212440176e-05, "loss": 0.0296, "step": 3260 }, { "epoch": 0.19196900317013033, "grad_norm": 1.5892531871795654, "learning_rate": 9.786643820033674e-05, "loss": 0.0527, "step": 3270 }, { "epoch": 0.19255606434190442, "grad_norm": 2.68218731880188, "learning_rate": 9.785305333221474e-05, "loss": 0.0604, "step": 3280 }, { "epoch": 0.19314312551367852, "grad_norm": 2.679905652999878, "learning_rate": 9.78396275314849e-05, "loss": 0.0211, "step": 3290 }, { "epoch": 0.19373018668545264, "grad_norm": 1.5836975574493408, "learning_rate": 9.782616080963143e-05, "loss": 0.0665, "step": 3300 }, { "epoch": 0.19431724785722673, "grad_norm": 1.1025676727294922, "learning_rate": 9.781265317817347e-05, "loss": 0.039, "step": 3310 }, { "epoch": 0.19490430902900083, "grad_norm": 2.7383909225463867, "learning_rate": 9.779910464866523e-05, "loss": 0.0612, "step": 3320 }, { "epoch": 0.19549137020077492, "grad_norm": 1.8712137937545776, "learning_rate": 9.778551523269586e-05, "loss": 0.0458, "step": 3330 }, { "epoch": 0.19607843137254902, "grad_norm": 0.3346378803253174, "learning_rate": 9.777188494188948e-05, "loss": 0.0316, "step": 3340 }, { "epoch": 0.1966654925443231, "grad_norm": 3.2127461433410645, "learning_rate": 9.775821378790519e-05, "loss": 0.0396, "step": 3350 }, { "epoch": 0.1972525537160972, "grad_norm": 2.9048569202423096, "learning_rate": 9.774450178243706e-05, "loss": 0.0397, "step": 3360 }, { "epoch": 0.19783961488787133, "grad_norm": 2.8324990272521973, "learning_rate": 9.773074893721407e-05, "loss": 0.03, "step": 3370 }, { "epoch": 0.19842667605964542, "grad_norm": 3.4215381145477295, "learning_rate": 9.771695526400013e-05, "loss": 0.0548, "step": 3380 }, { "epoch": 0.19901373723141952, "grad_norm": 2.7100284099578857, "learning_rate": 9.770312077459411e-05, "loss": 0.0399, "step": 3390 }, { "epoch": 0.1996007984031936, "grad_norm": 1.099760890007019, "learning_rate": 9.768924548082979e-05, "loss": 0.0456, "step": 3400 }, { "epoch": 0.2001878595749677, "grad_norm": 5.312976360321045, "learning_rate": 9.76753293945758e-05, "loss": 0.0423, "step": 3410 }, { "epoch": 0.2007749207467418, "grad_norm": 2.148909091949463, "learning_rate": 9.766137252773572e-05, "loss": 0.0388, "step": 3420 }, { "epoch": 0.2013619819185159, "grad_norm": 2.3653364181518555, "learning_rate": 9.764737489224799e-05, "loss": 0.0378, "step": 3430 }, { "epoch": 0.20194904309029002, "grad_norm": 3.389256000518799, "learning_rate": 9.763333650008593e-05, "loss": 0.0486, "step": 3440 }, { "epoch": 0.20253610426206411, "grad_norm": 1.1765735149383545, "learning_rate": 9.76192573632577e-05, "loss": 0.0774, "step": 3450 }, { "epoch": 0.2031231654338382, "grad_norm": 3.8393447399139404, "learning_rate": 9.760513749380635e-05, "loss": 0.0538, "step": 3460 }, { "epoch": 0.2037102266056123, "grad_norm": 2.3688712120056152, "learning_rate": 9.759097690380976e-05, "loss": 0.0289, "step": 3470 }, { "epoch": 0.2042972877773864, "grad_norm": 0.3408827781677246, "learning_rate": 9.757677560538061e-05, "loss": 0.0424, "step": 3480 }, { "epoch": 0.2048843489491605, "grad_norm": 1.3343703746795654, "learning_rate": 9.756253361066643e-05, "loss": 0.0301, "step": 3490 }, { "epoch": 0.2054714101209346, "grad_norm": 1.2232662439346313, "learning_rate": 9.754825093184958e-05, "loss": 0.0203, "step": 3500 }, { "epoch": 0.2060584712927087, "grad_norm": 2.336533308029175, "learning_rate": 9.753392758114718e-05, "loss": 0.0402, "step": 3510 }, { "epoch": 0.2066455324644828, "grad_norm": 2.738333225250244, "learning_rate": 9.751956357081115e-05, "loss": 0.0438, "step": 3520 }, { "epoch": 0.2072325936362569, "grad_norm": 1.7353787422180176, "learning_rate": 9.750515891312819e-05, "loss": 0.032, "step": 3530 }, { "epoch": 0.207819654808031, "grad_norm": 1.526991367340088, "learning_rate": 9.749071362041981e-05, "loss": 0.0356, "step": 3540 }, { "epoch": 0.2084067159798051, "grad_norm": 3.5745511054992676, "learning_rate": 9.747622770504221e-05, "loss": 0.0425, "step": 3550 }, { "epoch": 0.20899377715157919, "grad_norm": 1.9567959308624268, "learning_rate": 9.746170117938638e-05, "loss": 0.0604, "step": 3560 }, { "epoch": 0.20958083832335328, "grad_norm": 2.5341362953186035, "learning_rate": 9.744713405587804e-05, "loss": 0.039, "step": 3570 }, { "epoch": 0.2101678994951274, "grad_norm": 4.469109058380127, "learning_rate": 9.743252634697767e-05, "loss": 0.044, "step": 3580 }, { "epoch": 0.2107549606669015, "grad_norm": 3.514477014541626, "learning_rate": 9.741787806518035e-05, "loss": 0.0514, "step": 3590 }, { "epoch": 0.2113420218386756, "grad_norm": 4.614218235015869, "learning_rate": 9.740318922301602e-05, "loss": 0.0357, "step": 3600 }, { "epoch": 0.2119290830104497, "grad_norm": 4.051399230957031, "learning_rate": 9.738845983304921e-05, "loss": 0.0431, "step": 3610 }, { "epoch": 0.21251614418222378, "grad_norm": 0.22102586925029755, "learning_rate": 9.737368990787916e-05, "loss": 0.058, "step": 3620 }, { "epoch": 0.21310320535399788, "grad_norm": 2.6751372814178467, "learning_rate": 9.735887946013982e-05, "loss": 0.0424, "step": 3630 }, { "epoch": 0.213690266525772, "grad_norm": 2.331186532974243, "learning_rate": 9.734402850249973e-05, "loss": 0.0388, "step": 3640 }, { "epoch": 0.2142773276975461, "grad_norm": 0.5822759866714478, "learning_rate": 9.732913704766216e-05, "loss": 0.0405, "step": 3650 }, { "epoch": 0.2148643888693202, "grad_norm": 1.9439653158187866, "learning_rate": 9.731420510836494e-05, "loss": 0.0445, "step": 3660 }, { "epoch": 0.21545145004109428, "grad_norm": 4.179188251495361, "learning_rate": 9.729923269738062e-05, "loss": 0.0634, "step": 3670 }, { "epoch": 0.21603851121286838, "grad_norm": 4.009206771850586, "learning_rate": 9.728421982751628e-05, "loss": 0.0419, "step": 3680 }, { "epoch": 0.21662557238464247, "grad_norm": 2.237543821334839, "learning_rate": 9.726916651161367e-05, "loss": 0.0254, "step": 3690 }, { "epoch": 0.21721263355641657, "grad_norm": 2.975160837173462, "learning_rate": 9.725407276254909e-05, "loss": 0.0286, "step": 3700 }, { "epoch": 0.2177996947281907, "grad_norm": 1.5479967594146729, "learning_rate": 9.723893859323348e-05, "loss": 0.0326, "step": 3710 }, { "epoch": 0.21838675589996479, "grad_norm": 1.7621835470199585, "learning_rate": 9.722376401661233e-05, "loss": 0.0437, "step": 3720 }, { "epoch": 0.21897381707173888, "grad_norm": 2.332693099975586, "learning_rate": 9.720854904566566e-05, "loss": 0.0613, "step": 3730 }, { "epoch": 0.21956087824351297, "grad_norm": 2.5728635787963867, "learning_rate": 9.71932936934081e-05, "loss": 0.0354, "step": 3740 }, { "epoch": 0.22014793941528707, "grad_norm": 0.8831126093864441, "learning_rate": 9.717799797288877e-05, "loss": 0.0313, "step": 3750 }, { "epoch": 0.22073500058706116, "grad_norm": 1.9385029077529907, "learning_rate": 9.716266189719136e-05, "loss": 0.0508, "step": 3760 }, { "epoch": 0.22132206175883526, "grad_norm": 1.5319123268127441, "learning_rate": 9.714728547943405e-05, "loss": 0.0689, "step": 3770 }, { "epoch": 0.22190912293060938, "grad_norm": 1.2483266592025757, "learning_rate": 9.713186873276955e-05, "loss": 0.0296, "step": 3780 }, { "epoch": 0.22249618410238348, "grad_norm": 2.23776912689209, "learning_rate": 9.711641167038506e-05, "loss": 0.064, "step": 3790 }, { "epoch": 0.22308324527415757, "grad_norm": 5.403620719909668, "learning_rate": 9.710091430550224e-05, "loss": 0.0404, "step": 3800 }, { "epoch": 0.22367030644593167, "grad_norm": 2.370596170425415, "learning_rate": 9.708537665137727e-05, "loss": 0.0396, "step": 3810 }, { "epoch": 0.22425736761770576, "grad_norm": 1.913191318511963, "learning_rate": 9.706979872130077e-05, "loss": 0.0435, "step": 3820 }, { "epoch": 0.22484442878947986, "grad_norm": 1.9024417400360107, "learning_rate": 9.70541805285978e-05, "loss": 0.0395, "step": 3830 }, { "epoch": 0.22543148996125395, "grad_norm": 2.4339756965637207, "learning_rate": 9.703852208662786e-05, "loss": 0.0526, "step": 3840 }, { "epoch": 0.22601855113302807, "grad_norm": 1.6247811317443848, "learning_rate": 9.702282340878493e-05, "loss": 0.07, "step": 3850 }, { "epoch": 0.22660561230480217, "grad_norm": 2.0454747676849365, "learning_rate": 9.700708450849732e-05, "loss": 0.0329, "step": 3860 }, { "epoch": 0.22719267347657626, "grad_norm": 2.070821523666382, "learning_rate": 9.69913053992278e-05, "loss": 0.047, "step": 3870 }, { "epoch": 0.22777973464835036, "grad_norm": 1.7548588514328003, "learning_rate": 9.697548609447355e-05, "loss": 0.046, "step": 3880 }, { "epoch": 0.22836679582012445, "grad_norm": 1.6130026578903198, "learning_rate": 9.695962660776607e-05, "loss": 0.0317, "step": 3890 }, { "epoch": 0.22895385699189855, "grad_norm": 0.5520845651626587, "learning_rate": 9.694372695267131e-05, "loss": 0.0446, "step": 3900 }, { "epoch": 0.22954091816367264, "grad_norm": 2.591543436050415, "learning_rate": 9.692778714278952e-05, "loss": 0.0391, "step": 3910 }, { "epoch": 0.23012797933544676, "grad_norm": 5.135702133178711, "learning_rate": 9.69118071917553e-05, "loss": 0.0307, "step": 3920 }, { "epoch": 0.23071504050722086, "grad_norm": 1.5746864080429077, "learning_rate": 9.689578711323761e-05, "loss": 0.0368, "step": 3930 }, { "epoch": 0.23130210167899495, "grad_norm": 1.950485110282898, "learning_rate": 9.687972692093973e-05, "loss": 0.036, "step": 3940 }, { "epoch": 0.23188916285076905, "grad_norm": 0.6898679733276367, "learning_rate": 9.686362662859927e-05, "loss": 0.0322, "step": 3950 }, { "epoch": 0.23247622402254314, "grad_norm": 0.18586641550064087, "learning_rate": 9.68474862499881e-05, "loss": 0.0426, "step": 3960 }, { "epoch": 0.23306328519431724, "grad_norm": 0.7819411754608154, "learning_rate": 9.683130579891238e-05, "loss": 0.0424, "step": 3970 }, { "epoch": 0.23365034636609136, "grad_norm": 0.5046377182006836, "learning_rate": 9.68150852892126e-05, "loss": 0.0499, "step": 3980 }, { "epoch": 0.23423740753786546, "grad_norm": 3.63139009475708, "learning_rate": 9.679882473476344e-05, "loss": 0.0306, "step": 3990 }, { "epoch": 0.23482446870963955, "grad_norm": 2.4393115043640137, "learning_rate": 9.67825241494739e-05, "loss": 0.0271, "step": 4000 }, { "epoch": 0.23541152988141364, "grad_norm": 2.4522364139556885, "learning_rate": 9.676618354728722e-05, "loss": 0.0361, "step": 4010 }, { "epoch": 0.23599859105318774, "grad_norm": 1.2558209896087646, "learning_rate": 9.67498029421808e-05, "loss": 0.0701, "step": 4020 }, { "epoch": 0.23658565222496183, "grad_norm": 3.0067551136016846, "learning_rate": 9.673338234816632e-05, "loss": 0.0604, "step": 4030 }, { "epoch": 0.23717271339673593, "grad_norm": 4.278449535369873, "learning_rate": 9.671692177928966e-05, "loss": 0.0558, "step": 4040 }, { "epoch": 0.23775977456851005, "grad_norm": 1.1294217109680176, "learning_rate": 9.670042124963087e-05, "loss": 0.0417, "step": 4050 }, { "epoch": 0.23834683574028415, "grad_norm": 1.9507063627243042, "learning_rate": 9.668388077330421e-05, "loss": 0.0321, "step": 4060 }, { "epoch": 0.23893389691205824, "grad_norm": 0.8584306240081787, "learning_rate": 9.666730036445809e-05, "loss": 0.045, "step": 4070 }, { "epoch": 0.23952095808383234, "grad_norm": 1.6064647436141968, "learning_rate": 9.665068003727507e-05, "loss": 0.0373, "step": 4080 }, { "epoch": 0.24010801925560643, "grad_norm": 1.9879229068756104, "learning_rate": 9.663401980597188e-05, "loss": 0.0437, "step": 4090 }, { "epoch": 0.24069508042738053, "grad_norm": 0.5784907341003418, "learning_rate": 9.661731968479936e-05, "loss": 0.041, "step": 4100 }, { "epoch": 0.24128214159915462, "grad_norm": 0.7829803824424744, "learning_rate": 9.660057968804249e-05, "loss": 0.028, "step": 4110 }, { "epoch": 0.24186920277092874, "grad_norm": 1.7535614967346191, "learning_rate": 9.658379983002035e-05, "loss": 0.0278, "step": 4120 }, { "epoch": 0.24245626394270284, "grad_norm": 2.482191562652588, "learning_rate": 9.65669801250861e-05, "loss": 0.0459, "step": 4130 }, { "epoch": 0.24304332511447693, "grad_norm": 3.86083984375, "learning_rate": 9.655012058762703e-05, "loss": 0.0457, "step": 4140 }, { "epoch": 0.24363038628625103, "grad_norm": 1.0339630842208862, "learning_rate": 9.653322123206445e-05, "loss": 0.0268, "step": 4150 }, { "epoch": 0.24421744745802512, "grad_norm": 1.7193056344985962, "learning_rate": 9.651628207285377e-05, "loss": 0.0328, "step": 4160 }, { "epoch": 0.24480450862979922, "grad_norm": 2.191051721572876, "learning_rate": 9.649930312448441e-05, "loss": 0.0313, "step": 4170 }, { "epoch": 0.2453915698015733, "grad_norm": 3.469489097595215, "learning_rate": 9.648228440147987e-05, "loss": 0.0594, "step": 4180 }, { "epoch": 0.24597863097334743, "grad_norm": 0.9163156151771545, "learning_rate": 9.646522591839764e-05, "loss": 0.024, "step": 4190 }, { "epoch": 0.24656569214512153, "grad_norm": 2.414659261703491, "learning_rate": 9.64481276898292e-05, "loss": 0.0557, "step": 4200 }, { "epoch": 0.24715275331689562, "grad_norm": 2.6754441261291504, "learning_rate": 9.64309897304001e-05, "loss": 0.0529, "step": 4210 }, { "epoch": 0.24773981448866972, "grad_norm": 2.449244499206543, "learning_rate": 9.641381205476981e-05, "loss": 0.0515, "step": 4220 }, { "epoch": 0.2483268756604438, "grad_norm": 1.6271411180496216, "learning_rate": 9.639659467763178e-05, "loss": 0.0465, "step": 4230 }, { "epoch": 0.2489139368322179, "grad_norm": 4.679442882537842, "learning_rate": 9.637933761371345e-05, "loss": 0.0404, "step": 4240 }, { "epoch": 0.249500998003992, "grad_norm": 2.5515072345733643, "learning_rate": 9.636204087777618e-05, "loss": 0.0464, "step": 4250 }, { "epoch": 0.2500880591757661, "grad_norm": 2.178102731704712, "learning_rate": 9.63447044846153e-05, "loss": 0.0574, "step": 4260 }, { "epoch": 0.2506751203475402, "grad_norm": 0.9000117778778076, "learning_rate": 9.632732844906e-05, "loss": 0.0484, "step": 4270 }, { "epoch": 0.2512621815193143, "grad_norm": 3.017303705215454, "learning_rate": 9.630991278597344e-05, "loss": 0.0692, "step": 4280 }, { "epoch": 0.25184924269108844, "grad_norm": 0.9426410794258118, "learning_rate": 9.629245751025262e-05, "loss": 0.0574, "step": 4290 }, { "epoch": 0.25243630386286253, "grad_norm": 2.272217273712158, "learning_rate": 9.62749626368285e-05, "loss": 0.0472, "step": 4300 }, { "epoch": 0.2530233650346366, "grad_norm": 4.943531513214111, "learning_rate": 9.625742818066586e-05, "loss": 0.0659, "step": 4310 }, { "epoch": 0.2536104262064107, "grad_norm": 2.196079730987549, "learning_rate": 9.623985415676332e-05, "loss": 0.0466, "step": 4320 }, { "epoch": 0.2541974873781848, "grad_norm": 2.0745205879211426, "learning_rate": 9.622224058015339e-05, "loss": 0.0356, "step": 4330 }, { "epoch": 0.2547845485499589, "grad_norm": 2.846806049346924, "learning_rate": 9.62045874659024e-05, "loss": 0.0741, "step": 4340 }, { "epoch": 0.255371609721733, "grad_norm": 2.5163679122924805, "learning_rate": 9.618689482911047e-05, "loss": 0.0425, "step": 4350 }, { "epoch": 0.2559586708935071, "grad_norm": 1.4826740026474, "learning_rate": 9.616916268491158e-05, "loss": 0.0234, "step": 4360 }, { "epoch": 0.2565457320652812, "grad_norm": 0.04228798300027847, "learning_rate": 9.615139104847348e-05, "loss": 0.0416, "step": 4370 }, { "epoch": 0.2571327932370553, "grad_norm": 0.12215807288885117, "learning_rate": 9.613357993499766e-05, "loss": 0.0608, "step": 4380 }, { "epoch": 0.2577198544088294, "grad_norm": 1.0769306421279907, "learning_rate": 9.611572935971941e-05, "loss": 0.0694, "step": 4390 }, { "epoch": 0.2583069155806035, "grad_norm": 1.0067722797393799, "learning_rate": 9.609783933790784e-05, "loss": 0.0328, "step": 4400 }, { "epoch": 0.2588939767523776, "grad_norm": 1.8800987005233765, "learning_rate": 9.607990988486568e-05, "loss": 0.0487, "step": 4410 }, { "epoch": 0.25948103792415167, "grad_norm": 2.109830617904663, "learning_rate": 9.606194101592947e-05, "loss": 0.0328, "step": 4420 }, { "epoch": 0.2600680990959258, "grad_norm": 1.2077661752700806, "learning_rate": 9.604393274646945e-05, "loss": 0.031, "step": 4430 }, { "epoch": 0.2606551602676999, "grad_norm": 2.6591012477874756, "learning_rate": 9.602588509188954e-05, "loss": 0.0343, "step": 4440 }, { "epoch": 0.261242221439474, "grad_norm": 2.237421989440918, "learning_rate": 9.600779806762738e-05, "loss": 0.0452, "step": 4450 }, { "epoch": 0.2618292826112481, "grad_norm": 3.302521228790283, "learning_rate": 9.59896716891543e-05, "loss": 0.0444, "step": 4460 }, { "epoch": 0.2624163437830222, "grad_norm": 2.2960593700408936, "learning_rate": 9.59715059719752e-05, "loss": 0.0466, "step": 4470 }, { "epoch": 0.2630034049547963, "grad_norm": 2.3601245880126953, "learning_rate": 9.595330093162876e-05, "loss": 0.0435, "step": 4480 }, { "epoch": 0.2635904661265704, "grad_norm": 0.5983108282089233, "learning_rate": 9.593505658368718e-05, "loss": 0.0348, "step": 4490 }, { "epoch": 0.2641775272983445, "grad_norm": 1.6568204164505005, "learning_rate": 9.591677294375636e-05, "loss": 0.0423, "step": 4500 }, { "epoch": 0.2647645884701186, "grad_norm": 1.6575958728790283, "learning_rate": 9.58984500274758e-05, "loss": 0.0346, "step": 4510 }, { "epoch": 0.2653516496418927, "grad_norm": 1.7203445434570312, "learning_rate": 9.588008785051854e-05, "loss": 0.0706, "step": 4520 }, { "epoch": 0.26593871081366677, "grad_norm": 2.2138168811798096, "learning_rate": 9.586168642859128e-05, "loss": 0.0424, "step": 4530 }, { "epoch": 0.26652577198544086, "grad_norm": 3.437802314758301, "learning_rate": 9.584324577743425e-05, "loss": 0.0374, "step": 4540 }, { "epoch": 0.26711283315721496, "grad_norm": 0.9754136800765991, "learning_rate": 9.582476591282119e-05, "loss": 0.0486, "step": 4550 }, { "epoch": 0.2676998943289891, "grad_norm": 5.477251052856445, "learning_rate": 9.58062468505595e-05, "loss": 0.0516, "step": 4560 }, { "epoch": 0.2682869555007632, "grad_norm": 3.3450748920440674, "learning_rate": 9.578768860649e-05, "loss": 0.0409, "step": 4570 }, { "epoch": 0.2688740166725373, "grad_norm": 2.78643536567688, "learning_rate": 9.576909119648705e-05, "loss": 0.0621, "step": 4580 }, { "epoch": 0.2694610778443114, "grad_norm": 3.63116192817688, "learning_rate": 9.575045463645858e-05, "loss": 0.0757, "step": 4590 }, { "epoch": 0.2700481390160855, "grad_norm": 2.1244685649871826, "learning_rate": 9.573177894234591e-05, "loss": 0.0418, "step": 4600 }, { "epoch": 0.2706352001878596, "grad_norm": 4.1109724044799805, "learning_rate": 9.571306413012388e-05, "loss": 0.059, "step": 4610 }, { "epoch": 0.2712222613596337, "grad_norm": 2.585906982421875, "learning_rate": 9.569431021580082e-05, "loss": 0.0388, "step": 4620 }, { "epoch": 0.27180932253140777, "grad_norm": 3.3618996143341064, "learning_rate": 9.567551721541846e-05, "loss": 0.0511, "step": 4630 }, { "epoch": 0.27239638370318187, "grad_norm": 0.7405043840408325, "learning_rate": 9.565668514505199e-05, "loss": 0.0513, "step": 4640 }, { "epoch": 0.27298344487495596, "grad_norm": 2.23223614692688, "learning_rate": 9.563781402081e-05, "loss": 0.0417, "step": 4650 }, { "epoch": 0.27357050604673006, "grad_norm": 1.4202497005462646, "learning_rate": 9.56189038588345e-05, "loss": 0.0411, "step": 4660 }, { "epoch": 0.27415756721850415, "grad_norm": 3.3020284175872803, "learning_rate": 9.559995467530091e-05, "loss": 0.0371, "step": 4670 }, { "epoch": 0.27474462839027824, "grad_norm": 1.8571077585220337, "learning_rate": 9.558096648641797e-05, "loss": 0.0443, "step": 4680 }, { "epoch": 0.27533168956205234, "grad_norm": 1.0412755012512207, "learning_rate": 9.556193930842785e-05, "loss": 0.0393, "step": 4690 }, { "epoch": 0.2759187507338265, "grad_norm": 0.8517029881477356, "learning_rate": 9.554287315760603e-05, "loss": 0.0631, "step": 4700 }, { "epoch": 0.2765058119056006, "grad_norm": 2.1072020530700684, "learning_rate": 9.552376805026136e-05, "loss": 0.051, "step": 4710 }, { "epoch": 0.2770928730773747, "grad_norm": 1.2897216081619263, "learning_rate": 9.550462400273596e-05, "loss": 0.0242, "step": 4720 }, { "epoch": 0.2776799342491488, "grad_norm": 0.9732093811035156, "learning_rate": 9.54854410314053e-05, "loss": 0.0276, "step": 4730 }, { "epoch": 0.27826699542092287, "grad_norm": 3.7374629974365234, "learning_rate": 9.546621915267815e-05, "loss": 0.0536, "step": 4740 }, { "epoch": 0.27885405659269696, "grad_norm": 2.724569320678711, "learning_rate": 9.544695838299653e-05, "loss": 0.0472, "step": 4750 }, { "epoch": 0.27944111776447106, "grad_norm": 3.4818320274353027, "learning_rate": 9.542765873883577e-05, "loss": 0.0608, "step": 4760 }, { "epoch": 0.28002817893624515, "grad_norm": 2.309112787246704, "learning_rate": 9.540832023670439e-05, "loss": 0.0567, "step": 4770 }, { "epoch": 0.28061524010801925, "grad_norm": 1.942906379699707, "learning_rate": 9.53889428931442e-05, "loss": 0.0498, "step": 4780 }, { "epoch": 0.28120230127979334, "grad_norm": 2.6248302459716797, "learning_rate": 9.536952672473021e-05, "loss": 0.0549, "step": 4790 }, { "epoch": 0.28178936245156744, "grad_norm": 3.351979970932007, "learning_rate": 9.535007174807066e-05, "loss": 0.0536, "step": 4800 }, { "epoch": 0.28237642362334153, "grad_norm": 1.2752174139022827, "learning_rate": 9.533057797980696e-05, "loss": 0.0547, "step": 4810 }, { "epoch": 0.2829634847951156, "grad_norm": 1.991886854171753, "learning_rate": 9.531104543661374e-05, "loss": 0.0186, "step": 4820 }, { "epoch": 0.2835505459668898, "grad_norm": 1.6250778436660767, "learning_rate": 9.529147413519873e-05, "loss": 0.0253, "step": 4830 }, { "epoch": 0.2841376071386639, "grad_norm": 1.0819950103759766, "learning_rate": 9.52718640923029e-05, "loss": 0.0408, "step": 4840 }, { "epoch": 0.28472466831043797, "grad_norm": 1.681856632232666, "learning_rate": 9.525221532470029e-05, "loss": 0.0326, "step": 4850 }, { "epoch": 0.28531172948221206, "grad_norm": 1.9015501737594604, "learning_rate": 9.523252784919809e-05, "loss": 0.0996, "step": 4860 }, { "epoch": 0.28589879065398616, "grad_norm": 1.970300555229187, "learning_rate": 9.52128016826366e-05, "loss": 0.0497, "step": 4870 }, { "epoch": 0.28648585182576025, "grad_norm": 1.5843287706375122, "learning_rate": 9.519303684188922e-05, "loss": 0.0567, "step": 4880 }, { "epoch": 0.28707291299753435, "grad_norm": 1.746987223625183, "learning_rate": 9.517323334386244e-05, "loss": 0.0338, "step": 4890 }, { "epoch": 0.28765997416930844, "grad_norm": 1.4393055438995361, "learning_rate": 9.515339120549576e-05, "loss": 0.0309, "step": 4900 }, { "epoch": 0.28824703534108254, "grad_norm": 2.5302774906158447, "learning_rate": 9.513351044376182e-05, "loss": 0.0284, "step": 4910 }, { "epoch": 0.28883409651285663, "grad_norm": 1.0148005485534668, "learning_rate": 9.51135910756662e-05, "loss": 0.0376, "step": 4920 }, { "epoch": 0.2894211576846307, "grad_norm": 3.0846927165985107, "learning_rate": 9.509363311824761e-05, "loss": 0.0443, "step": 4930 }, { "epoch": 0.2900082188564048, "grad_norm": 2.7418041229248047, "learning_rate": 9.507363658857768e-05, "loss": 0.0341, "step": 4940 }, { "epoch": 0.2905952800281789, "grad_norm": 3.865344762802124, "learning_rate": 9.505360150376109e-05, "loss": 0.0427, "step": 4950 }, { "epoch": 0.291182341199953, "grad_norm": 2.493309736251831, "learning_rate": 9.503352788093547e-05, "loss": 0.04, "step": 4960 }, { "epoch": 0.29176940237172716, "grad_norm": 1.4252429008483887, "learning_rate": 9.501341573727141e-05, "loss": 0.0347, "step": 4970 }, { "epoch": 0.29235646354350125, "grad_norm": 2.8794472217559814, "learning_rate": 9.499326508997246e-05, "loss": 0.0255, "step": 4980 }, { "epoch": 0.29294352471527535, "grad_norm": 2.3145668506622314, "learning_rate": 9.497307595627511e-05, "loss": 0.0423, "step": 4990 }, { "epoch": 0.29353058588704944, "grad_norm": 2.2408437728881836, "learning_rate": 9.495284835344879e-05, "loss": 0.0254, "step": 5000 }, { "epoch": 0.29411764705882354, "grad_norm": 3.4195361137390137, "learning_rate": 9.49325822987958e-05, "loss": 0.0522, "step": 5010 }, { "epoch": 0.29470470823059763, "grad_norm": 2.4050040245056152, "learning_rate": 9.491227780965136e-05, "loss": 0.0453, "step": 5020 }, { "epoch": 0.29529176940237173, "grad_norm": 4.142215251922607, "learning_rate": 9.48919349033835e-05, "loss": 0.0296, "step": 5030 }, { "epoch": 0.2958788305741458, "grad_norm": 8.010381698608398, "learning_rate": 9.487155359739321e-05, "loss": 0.0479, "step": 5040 }, { "epoch": 0.2964658917459199, "grad_norm": 1.5859040021896362, "learning_rate": 9.485113390911427e-05, "loss": 0.0325, "step": 5050 }, { "epoch": 0.297052952917694, "grad_norm": 2.35192608833313, "learning_rate": 9.483067585601327e-05, "loss": 0.0547, "step": 5060 }, { "epoch": 0.2976400140894681, "grad_norm": 2.094485282897949, "learning_rate": 9.481017945558969e-05, "loss": 0.0534, "step": 5070 }, { "epoch": 0.2982270752612422, "grad_norm": 0.1226126030087471, "learning_rate": 9.478964472537575e-05, "loss": 0.0374, "step": 5080 }, { "epoch": 0.2988141364330163, "grad_norm": 1.314904808998108, "learning_rate": 9.476907168293646e-05, "loss": 0.0504, "step": 5090 }, { "epoch": 0.2994011976047904, "grad_norm": 1.3122048377990723, "learning_rate": 9.474846034586964e-05, "loss": 0.0447, "step": 5100 }, { "epoch": 0.29998825877656454, "grad_norm": 2.8994345664978027, "learning_rate": 9.472781073180582e-05, "loss": 0.0457, "step": 5110 }, { "epoch": 0.30057531994833864, "grad_norm": 1.1444993019104004, "learning_rate": 9.47071228584083e-05, "loss": 0.0401, "step": 5120 }, { "epoch": 0.30116238112011273, "grad_norm": 2.2754225730895996, "learning_rate": 9.468639674337312e-05, "loss": 0.0383, "step": 5130 }, { "epoch": 0.3017494422918868, "grad_norm": 2.3510823249816895, "learning_rate": 9.466563240442901e-05, "loss": 0.0406, "step": 5140 }, { "epoch": 0.3023365034636609, "grad_norm": 2.2636609077453613, "learning_rate": 9.464482985933736e-05, "loss": 0.067, "step": 5150 }, { "epoch": 0.302923564635435, "grad_norm": 2.818981647491455, "learning_rate": 9.462398912589232e-05, "loss": 0.0427, "step": 5160 }, { "epoch": 0.3035106258072091, "grad_norm": 4.39980411529541, "learning_rate": 9.460311022192064e-05, "loss": 0.0729, "step": 5170 }, { "epoch": 0.3040976869789832, "grad_norm": 3.873307704925537, "learning_rate": 9.458219316528175e-05, "loss": 0.0388, "step": 5180 }, { "epoch": 0.3046847481507573, "grad_norm": 1.6399500370025635, "learning_rate": 9.456123797386771e-05, "loss": 0.0602, "step": 5190 }, { "epoch": 0.3052718093225314, "grad_norm": 1.9389368295669556, "learning_rate": 9.45402446656032e-05, "loss": 0.0426, "step": 5200 }, { "epoch": 0.3058588704943055, "grad_norm": 4.763432025909424, "learning_rate": 9.451921325844551e-05, "loss": 0.0593, "step": 5210 }, { "epoch": 0.3064459316660796, "grad_norm": 2.46632981300354, "learning_rate": 9.449814377038452e-05, "loss": 0.039, "step": 5220 }, { "epoch": 0.3070329928378537, "grad_norm": 2.234233856201172, "learning_rate": 9.447703621944264e-05, "loss": 0.0419, "step": 5230 }, { "epoch": 0.30762005400962783, "grad_norm": 1.779569149017334, "learning_rate": 9.445589062367491e-05, "loss": 0.0483, "step": 5240 }, { "epoch": 0.3082071151814019, "grad_norm": 2.168407917022705, "learning_rate": 9.443470700116887e-05, "loss": 0.0557, "step": 5250 }, { "epoch": 0.308794176353176, "grad_norm": 1.59275221824646, "learning_rate": 9.441348537004459e-05, "loss": 0.0371, "step": 5260 }, { "epoch": 0.3093812375249501, "grad_norm": 2.929985761642456, "learning_rate": 9.439222574845465e-05, "loss": 0.0408, "step": 5270 }, { "epoch": 0.3099682986967242, "grad_norm": 2.250945806503296, "learning_rate": 9.437092815458415e-05, "loss": 0.0342, "step": 5280 }, { "epoch": 0.3105553598684983, "grad_norm": 3.255429983139038, "learning_rate": 9.434959260665064e-05, "loss": 0.0642, "step": 5290 }, { "epoch": 0.3111424210402724, "grad_norm": 1.4585903882980347, "learning_rate": 9.432821912290414e-05, "loss": 0.0609, "step": 5300 }, { "epoch": 0.3117294822120465, "grad_norm": 2.1435985565185547, "learning_rate": 9.430680772162716e-05, "loss": 0.0427, "step": 5310 }, { "epoch": 0.3123165433838206, "grad_norm": 2.1471781730651855, "learning_rate": 9.428535842113459e-05, "loss": 0.0386, "step": 5320 }, { "epoch": 0.3129036045555947, "grad_norm": 2.391707420349121, "learning_rate": 9.426387123977378e-05, "loss": 0.0416, "step": 5330 }, { "epoch": 0.3134906657273688, "grad_norm": 1.038680076599121, "learning_rate": 9.424234619592442e-05, "loss": 0.0342, "step": 5340 }, { "epoch": 0.3140777268991429, "grad_norm": 4.7923808097839355, "learning_rate": 9.422078330799868e-05, "loss": 0.027, "step": 5350 }, { "epoch": 0.31466478807091697, "grad_norm": 2.100338935852051, "learning_rate": 9.419918259444104e-05, "loss": 0.062, "step": 5360 }, { "epoch": 0.31525184924269106, "grad_norm": 4.071483135223389, "learning_rate": 9.417754407372832e-05, "loss": 0.0329, "step": 5370 }, { "epoch": 0.3158389104144652, "grad_norm": 1.9182132482528687, "learning_rate": 9.415586776436973e-05, "loss": 0.0441, "step": 5380 }, { "epoch": 0.3164259715862393, "grad_norm": 2.150156259536743, "learning_rate": 9.413415368490678e-05, "loss": 0.0367, "step": 5390 }, { "epoch": 0.3170130327580134, "grad_norm": 0.9288743734359741, "learning_rate": 9.411240185391327e-05, "loss": 0.077, "step": 5400 }, { "epoch": 0.3176000939297875, "grad_norm": 3.840372323989868, "learning_rate": 9.409061228999533e-05, "loss": 0.034, "step": 5410 }, { "epoch": 0.3181871551015616, "grad_norm": 1.9792379140853882, "learning_rate": 9.406878501179135e-05, "loss": 0.052, "step": 5420 }, { "epoch": 0.3187742162733357, "grad_norm": 2.395099401473999, "learning_rate": 9.404692003797196e-05, "loss": 0.0381, "step": 5430 }, { "epoch": 0.3193612774451098, "grad_norm": 4.288181781768799, "learning_rate": 9.402501738724004e-05, "loss": 0.0565, "step": 5440 }, { "epoch": 0.3199483386168839, "grad_norm": 1.6563748121261597, "learning_rate": 9.400307707833074e-05, "loss": 0.0548, "step": 5450 }, { "epoch": 0.32053539978865797, "grad_norm": 1.2033774852752686, "learning_rate": 9.398109913001136e-05, "loss": 0.0386, "step": 5460 }, { "epoch": 0.32112246096043207, "grad_norm": 1.8615103960037231, "learning_rate": 9.395908356108145e-05, "loss": 0.074, "step": 5470 }, { "epoch": 0.32170952213220616, "grad_norm": 1.8725007772445679, "learning_rate": 9.393703039037269e-05, "loss": 0.0544, "step": 5480 }, { "epoch": 0.32229658330398026, "grad_norm": 0.7631556987762451, "learning_rate": 9.391493963674899e-05, "loss": 0.0394, "step": 5490 }, { "epoch": 0.32288364447575435, "grad_norm": 2.4178078174591064, "learning_rate": 9.389281131910633e-05, "loss": 0.0319, "step": 5500 }, { "epoch": 0.32347070564752844, "grad_norm": 2.9834976196289062, "learning_rate": 9.387064545637287e-05, "loss": 0.073, "step": 5510 }, { "epoch": 0.3240577668193026, "grad_norm": 0.6918954849243164, "learning_rate": 9.384844206750889e-05, "loss": 0.0328, "step": 5520 }, { "epoch": 0.3246448279910767, "grad_norm": 1.4635518789291382, "learning_rate": 9.382620117150673e-05, "loss": 0.0397, "step": 5530 }, { "epoch": 0.3252318891628508, "grad_norm": 1.4960331916809082, "learning_rate": 9.380392278739085e-05, "loss": 0.0516, "step": 5540 }, { "epoch": 0.3258189503346249, "grad_norm": 1.3857059478759766, "learning_rate": 9.378160693421778e-05, "loss": 0.0373, "step": 5550 }, { "epoch": 0.326406011506399, "grad_norm": 6.356856822967529, "learning_rate": 9.375925363107604e-05, "loss": 0.0595, "step": 5560 }, { "epoch": 0.32699307267817307, "grad_norm": 1.6493016481399536, "learning_rate": 9.373686289708629e-05, "loss": 0.07, "step": 5570 }, { "epoch": 0.32758013384994716, "grad_norm": 1.416532278060913, "learning_rate": 9.371443475140108e-05, "loss": 0.0448, "step": 5580 }, { "epoch": 0.32816719502172126, "grad_norm": 1.5396021604537964, "learning_rate": 9.369196921320506e-05, "loss": 0.0352, "step": 5590 }, { "epoch": 0.32875425619349535, "grad_norm": 2.6996428966522217, "learning_rate": 9.366946630171485e-05, "loss": 0.069, "step": 5600 }, { "epoch": 0.32934131736526945, "grad_norm": 1.9140021800994873, "learning_rate": 9.364692603617899e-05, "loss": 0.0608, "step": 5610 }, { "epoch": 0.32992837853704354, "grad_norm": 0.6642828583717346, "learning_rate": 9.3624348435878e-05, "loss": 0.0351, "step": 5620 }, { "epoch": 0.33051543970881764, "grad_norm": 2.6053738594055176, "learning_rate": 9.360173352012436e-05, "loss": 0.0583, "step": 5630 }, { "epoch": 0.33110250088059173, "grad_norm": 2.699401378631592, "learning_rate": 9.357908130826243e-05, "loss": 0.0466, "step": 5640 }, { "epoch": 0.3316895620523659, "grad_norm": 3.195594549179077, "learning_rate": 9.355639181966849e-05, "loss": 0.0535, "step": 5650 }, { "epoch": 0.33227662322414, "grad_norm": 2.714934825897217, "learning_rate": 9.353366507375072e-05, "loss": 0.052, "step": 5660 }, { "epoch": 0.3328636843959141, "grad_norm": 1.0885695219039917, "learning_rate": 9.351090108994913e-05, "loss": 0.0692, "step": 5670 }, { "epoch": 0.33345074556768817, "grad_norm": 3.0591847896575928, "learning_rate": 9.348809988773564e-05, "loss": 0.05, "step": 5680 }, { "epoch": 0.33403780673946226, "grad_norm": 2.081336259841919, "learning_rate": 9.346526148661392e-05, "loss": 0.048, "step": 5690 }, { "epoch": 0.33462486791123636, "grad_norm": 1.2867697477340698, "learning_rate": 9.344238590611955e-05, "loss": 0.0555, "step": 5700 }, { "epoch": 0.33521192908301045, "grad_norm": 3.112689733505249, "learning_rate": 9.341947316581989e-05, "loss": 0.0373, "step": 5710 }, { "epoch": 0.33579899025478455, "grad_norm": 4.347721099853516, "learning_rate": 9.339652328531403e-05, "loss": 0.0418, "step": 5720 }, { "epoch": 0.33638605142655864, "grad_norm": 2.7465453147888184, "learning_rate": 9.337353628423288e-05, "loss": 0.0381, "step": 5730 }, { "epoch": 0.33697311259833274, "grad_norm": 1.6170954704284668, "learning_rate": 9.335051218223912e-05, "loss": 0.0579, "step": 5740 }, { "epoch": 0.33756017377010683, "grad_norm": 5.724897384643555, "learning_rate": 9.332745099902709e-05, "loss": 0.0482, "step": 5750 }, { "epoch": 0.3381472349418809, "grad_norm": 3.405139684677124, "learning_rate": 9.330435275432293e-05, "loss": 0.0331, "step": 5760 }, { "epoch": 0.338734296113655, "grad_norm": 3.682948350906372, "learning_rate": 9.328121746788444e-05, "loss": 0.0878, "step": 5770 }, { "epoch": 0.3393213572854291, "grad_norm": 4.555486679077148, "learning_rate": 9.325804515950109e-05, "loss": 0.0339, "step": 5780 }, { "epoch": 0.33990841845720327, "grad_norm": 3.4030396938323975, "learning_rate": 9.323483584899409e-05, "loss": 0.0442, "step": 5790 }, { "epoch": 0.34049547962897736, "grad_norm": 1.8589022159576416, "learning_rate": 9.321158955621621e-05, "loss": 0.0286, "step": 5800 }, { "epoch": 0.34108254080075145, "grad_norm": 2.960686445236206, "learning_rate": 9.318830630105188e-05, "loss": 0.0522, "step": 5810 }, { "epoch": 0.34166960197252555, "grad_norm": 3.721616744995117, "learning_rate": 9.31649861034172e-05, "loss": 0.0385, "step": 5820 }, { "epoch": 0.34225666314429964, "grad_norm": 1.4774328470230103, "learning_rate": 9.314162898325981e-05, "loss": 0.058, "step": 5830 }, { "epoch": 0.34284372431607374, "grad_norm": 1.2000024318695068, "learning_rate": 9.311823496055896e-05, "loss": 0.0348, "step": 5840 }, { "epoch": 0.34343078548784783, "grad_norm": 2.8643956184387207, "learning_rate": 9.309480405532547e-05, "loss": 0.0294, "step": 5850 }, { "epoch": 0.34401784665962193, "grad_norm": 1.839704990386963, "learning_rate": 9.307133628760168e-05, "loss": 0.0514, "step": 5860 }, { "epoch": 0.344604907831396, "grad_norm": 1.7238141298294067, "learning_rate": 9.30478316774615e-05, "loss": 0.0361, "step": 5870 }, { "epoch": 0.3451919690031701, "grad_norm": 1.615378975868225, "learning_rate": 9.302429024501031e-05, "loss": 0.0633, "step": 5880 }, { "epoch": 0.3457790301749442, "grad_norm": 2.1228907108306885, "learning_rate": 9.300071201038503e-05, "loss": 0.0517, "step": 5890 }, { "epoch": 0.3463660913467183, "grad_norm": 3.311354637145996, "learning_rate": 9.297709699375403e-05, "loss": 0.0399, "step": 5900 }, { "epoch": 0.3469531525184924, "grad_norm": 2.3087093830108643, "learning_rate": 9.295344521531717e-05, "loss": 0.0353, "step": 5910 }, { "epoch": 0.34754021369026655, "grad_norm": 2.478135108947754, "learning_rate": 9.292975669530573e-05, "loss": 0.0482, "step": 5920 }, { "epoch": 0.34812727486204065, "grad_norm": 1.8561735153198242, "learning_rate": 9.290603145398243e-05, "loss": 0.0275, "step": 5930 }, { "epoch": 0.34871433603381474, "grad_norm": 0.9207778573036194, "learning_rate": 9.288226951164138e-05, "loss": 0.0306, "step": 5940 }, { "epoch": 0.34930139720558884, "grad_norm": 1.6693058013916016, "learning_rate": 9.285847088860813e-05, "loss": 0.0386, "step": 5950 }, { "epoch": 0.34988845837736293, "grad_norm": 2.181859254837036, "learning_rate": 9.283463560523956e-05, "loss": 0.0371, "step": 5960 }, { "epoch": 0.350475519549137, "grad_norm": 1.718574047088623, "learning_rate": 9.281076368192392e-05, "loss": 0.067, "step": 5970 }, { "epoch": 0.3510625807209111, "grad_norm": 2.0255677700042725, "learning_rate": 9.278685513908083e-05, "loss": 0.0326, "step": 5980 }, { "epoch": 0.3516496418926852, "grad_norm": 0.9796348214149475, "learning_rate": 9.276290999716119e-05, "loss": 0.0487, "step": 5990 }, { "epoch": 0.3522367030644593, "grad_norm": 1.1461082696914673, "learning_rate": 9.273892827664725e-05, "loss": 0.0438, "step": 6000 }, { "epoch": 0.3522367030644593, "eval_loss": 0.44839778542518616, "eval_runtime": 269.5068, "eval_samples_per_second": 3.506, "eval_steps_per_second": 3.506, "step": 6000 }, { "epoch": 0.3528237642362334, "grad_norm": 5.4352521896362305, "learning_rate": 9.27149099980525e-05, "loss": 0.042, "step": 6010 }, { "epoch": 0.3534108254080075, "grad_norm": 1.9308388233184814, "learning_rate": 9.269085518192175e-05, "loss": 0.0837, "step": 6020 }, { "epoch": 0.3539978865797816, "grad_norm": 1.8994724750518799, "learning_rate": 9.266676384883101e-05, "loss": 0.0508, "step": 6030 }, { "epoch": 0.3545849477515557, "grad_norm": 1.7783081531524658, "learning_rate": 9.264263601938759e-05, "loss": 0.0614, "step": 6040 }, { "epoch": 0.3551720089233298, "grad_norm": 2.032818555831909, "learning_rate": 9.261847171422996e-05, "loss": 0.0293, "step": 6050 }, { "epoch": 0.35575907009510394, "grad_norm": 3.544628381729126, "learning_rate": 9.259427095402782e-05, "loss": 0.069, "step": 6060 }, { "epoch": 0.35634613126687803, "grad_norm": 3.1308882236480713, "learning_rate": 9.257003375948207e-05, "loss": 0.0309, "step": 6070 }, { "epoch": 0.3569331924386521, "grad_norm": 1.8078138828277588, "learning_rate": 9.254576015132473e-05, "loss": 0.0441, "step": 6080 }, { "epoch": 0.3575202536104262, "grad_norm": 4.272693157196045, "learning_rate": 9.252145015031899e-05, "loss": 0.0871, "step": 6090 }, { "epoch": 0.3581073147822003, "grad_norm": 1.6641491651535034, "learning_rate": 9.249710377725917e-05, "loss": 0.0547, "step": 6100 }, { "epoch": 0.3586943759539744, "grad_norm": 1.8841357231140137, "learning_rate": 9.247272105297074e-05, "loss": 0.0542, "step": 6110 }, { "epoch": 0.3592814371257485, "grad_norm": 2.427034378051758, "learning_rate": 9.244830199831016e-05, "loss": 0.0411, "step": 6120 }, { "epoch": 0.3598684982975226, "grad_norm": 3.289883613586426, "learning_rate": 9.24238466341651e-05, "loss": 0.0389, "step": 6130 }, { "epoch": 0.3604555594692967, "grad_norm": 3.284317970275879, "learning_rate": 9.239935498145418e-05, "loss": 0.0446, "step": 6140 }, { "epoch": 0.3610426206410708, "grad_norm": 2.9233837127685547, "learning_rate": 9.237482706112712e-05, "loss": 0.0341, "step": 6150 }, { "epoch": 0.3616296818128449, "grad_norm": 2.011007308959961, "learning_rate": 9.235026289416463e-05, "loss": 0.0177, "step": 6160 }, { "epoch": 0.362216742984619, "grad_norm": 1.3153204917907715, "learning_rate": 9.232566250157845e-05, "loss": 0.0493, "step": 6170 }, { "epoch": 0.3628038041563931, "grad_norm": 1.6673120260238647, "learning_rate": 9.23010259044113e-05, "loss": 0.0307, "step": 6180 }, { "epoch": 0.36339086532816717, "grad_norm": 2.577718734741211, "learning_rate": 9.227635312373686e-05, "loss": 0.0457, "step": 6190 }, { "epoch": 0.3639779264999413, "grad_norm": 1.3342965841293335, "learning_rate": 9.225164418065976e-05, "loss": 0.0247, "step": 6200 }, { "epoch": 0.3645649876717154, "grad_norm": 3.58207631111145, "learning_rate": 9.222689909631557e-05, "loss": 0.0608, "step": 6210 }, { "epoch": 0.3651520488434895, "grad_norm": 5.798762798309326, "learning_rate": 9.220211789187078e-05, "loss": 0.047, "step": 6220 }, { "epoch": 0.3657391100152636, "grad_norm": 2.2851603031158447, "learning_rate": 9.217730058852276e-05, "loss": 0.0401, "step": 6230 }, { "epoch": 0.3663261711870377, "grad_norm": 1.917502999305725, "learning_rate": 9.215244720749979e-05, "loss": 0.0406, "step": 6240 }, { "epoch": 0.3669132323588118, "grad_norm": 3.289642095565796, "learning_rate": 9.212755777006097e-05, "loss": 0.0531, "step": 6250 }, { "epoch": 0.3675002935305859, "grad_norm": 2.104628801345825, "learning_rate": 9.210263229749626e-05, "loss": 0.0447, "step": 6260 }, { "epoch": 0.36808735470236, "grad_norm": 2.4235751628875732, "learning_rate": 9.207767081112642e-05, "loss": 0.0956, "step": 6270 }, { "epoch": 0.3686744158741341, "grad_norm": 2.3653531074523926, "learning_rate": 9.20526733323031e-05, "loss": 0.0709, "step": 6280 }, { "epoch": 0.36926147704590817, "grad_norm": 2.998399257659912, "learning_rate": 9.202763988240861e-05, "loss": 0.0372, "step": 6290 }, { "epoch": 0.36984853821768227, "grad_norm": 1.8805534839630127, "learning_rate": 9.200257048285615e-05, "loss": 0.0451, "step": 6300 }, { "epoch": 0.37043559938945636, "grad_norm": 0.5904949903488159, "learning_rate": 9.197746515508955e-05, "loss": 0.0405, "step": 6310 }, { "epoch": 0.37102266056123046, "grad_norm": 2.2496538162231445, "learning_rate": 9.195232392058353e-05, "loss": 0.0553, "step": 6320 }, { "epoch": 0.3716097217330046, "grad_norm": 1.8346590995788574, "learning_rate": 9.192714680084336e-05, "loss": 0.0338, "step": 6330 }, { "epoch": 0.3721967829047787, "grad_norm": 1.4481120109558105, "learning_rate": 9.19019338174051e-05, "loss": 0.0387, "step": 6340 }, { "epoch": 0.3727838440765528, "grad_norm": 1.3911155462265015, "learning_rate": 9.187668499183546e-05, "loss": 0.0384, "step": 6350 }, { "epoch": 0.3733709052483269, "grad_norm": 4.437960624694824, "learning_rate": 9.185140034573182e-05, "loss": 0.0691, "step": 6360 }, { "epoch": 0.373957966420101, "grad_norm": 1.329298734664917, "learning_rate": 9.182607990072221e-05, "loss": 0.054, "step": 6370 }, { "epoch": 0.3745450275918751, "grad_norm": 1.1170673370361328, "learning_rate": 9.180072367846523e-05, "loss": 0.0598, "step": 6380 }, { "epoch": 0.3751320887636492, "grad_norm": 2.182213068008423, "learning_rate": 9.177533170065014e-05, "loss": 0.0373, "step": 6390 }, { "epoch": 0.37571914993542327, "grad_norm": 2.554008960723877, "learning_rate": 9.174990398899677e-05, "loss": 0.0936, "step": 6400 }, { "epoch": 0.37630621110719736, "grad_norm": 1.768218994140625, "learning_rate": 9.172444056525549e-05, "loss": 0.0455, "step": 6410 }, { "epoch": 0.37689327227897146, "grad_norm": 1.6359037160873413, "learning_rate": 9.169894145120725e-05, "loss": 0.0367, "step": 6420 }, { "epoch": 0.37748033345074555, "grad_norm": 1.5694235563278198, "learning_rate": 9.167340666866351e-05, "loss": 0.0623, "step": 6430 }, { "epoch": 0.37806739462251965, "grad_norm": 3.2600700855255127, "learning_rate": 9.164783623946626e-05, "loss": 0.0541, "step": 6440 }, { "epoch": 0.37865445579429374, "grad_norm": 2.224238157272339, "learning_rate": 9.162223018548795e-05, "loss": 0.036, "step": 6450 }, { "epoch": 0.37924151696606784, "grad_norm": 3.2793667316436768, "learning_rate": 9.15965885286315e-05, "loss": 0.0601, "step": 6460 }, { "epoch": 0.379828578137842, "grad_norm": 0.3255864083766937, "learning_rate": 9.157091129083037e-05, "loss": 0.0454, "step": 6470 }, { "epoch": 0.3804156393096161, "grad_norm": 4.192474842071533, "learning_rate": 9.154519849404834e-05, "loss": 0.0481, "step": 6480 }, { "epoch": 0.3810027004813902, "grad_norm": 2.8192543983459473, "learning_rate": 9.151945016027965e-05, "loss": 0.0426, "step": 6490 }, { "epoch": 0.38158976165316427, "grad_norm": 3.663104772567749, "learning_rate": 9.149366631154899e-05, "loss": 0.0485, "step": 6500 }, { "epoch": 0.38217682282493837, "grad_norm": 2.2148454189300537, "learning_rate": 9.146784696991132e-05, "loss": 0.0432, "step": 6510 }, { "epoch": 0.38276388399671246, "grad_norm": 0.6129756569862366, "learning_rate": 9.144199215745206e-05, "loss": 0.0525, "step": 6520 }, { "epoch": 0.38335094516848656, "grad_norm": 3.0547399520874023, "learning_rate": 9.141610189628695e-05, "loss": 0.0611, "step": 6530 }, { "epoch": 0.38393800634026065, "grad_norm": 1.918065071105957, "learning_rate": 9.1390176208562e-05, "loss": 0.0419, "step": 6540 }, { "epoch": 0.38452506751203475, "grad_norm": 2.8584654331207275, "learning_rate": 9.136421511645357e-05, "loss": 0.0435, "step": 6550 }, { "epoch": 0.38511212868380884, "grad_norm": 2.5423402786254883, "learning_rate": 9.133821864216829e-05, "loss": 0.0523, "step": 6560 }, { "epoch": 0.38569918985558294, "grad_norm": 3.4923906326293945, "learning_rate": 9.131218680794308e-05, "loss": 0.045, "step": 6570 }, { "epoch": 0.38628625102735703, "grad_norm": 1.9875587224960327, "learning_rate": 9.128611963604507e-05, "loss": 0.0555, "step": 6580 }, { "epoch": 0.3868733121991311, "grad_norm": 2.863887071609497, "learning_rate": 9.126001714877161e-05, "loss": 0.0502, "step": 6590 }, { "epoch": 0.3874603733709053, "grad_norm": 3.5061120986938477, "learning_rate": 9.123387936845032e-05, "loss": 0.0422, "step": 6600 }, { "epoch": 0.38804743454267937, "grad_norm": 2.144218921661377, "learning_rate": 9.120770631743894e-05, "loss": 0.0693, "step": 6610 }, { "epoch": 0.38863449571445347, "grad_norm": 2.683347225189209, "learning_rate": 9.118149801812543e-05, "loss": 0.038, "step": 6620 }, { "epoch": 0.38922155688622756, "grad_norm": 0.9793893098831177, "learning_rate": 9.115525449292786e-05, "loss": 0.0708, "step": 6630 }, { "epoch": 0.38980861805800165, "grad_norm": 2.2941360473632812, "learning_rate": 9.112897576429446e-05, "loss": 0.0376, "step": 6640 }, { "epoch": 0.39039567922977575, "grad_norm": 2.4669976234436035, "learning_rate": 9.110266185470358e-05, "loss": 0.0649, "step": 6650 }, { "epoch": 0.39098274040154984, "grad_norm": 1.426784634590149, "learning_rate": 9.10763127866636e-05, "loss": 0.0546, "step": 6660 }, { "epoch": 0.39156980157332394, "grad_norm": 1.440475583076477, "learning_rate": 9.104992858271307e-05, "loss": 0.0359, "step": 6670 }, { "epoch": 0.39215686274509803, "grad_norm": 0.6029365062713623, "learning_rate": 9.102350926542051e-05, "loss": 0.0472, "step": 6680 }, { "epoch": 0.39274392391687213, "grad_norm": 2.413081645965576, "learning_rate": 9.099705485738454e-05, "loss": 0.0334, "step": 6690 }, { "epoch": 0.3933309850886462, "grad_norm": 2.308234214782715, "learning_rate": 9.097056538123376e-05, "loss": 0.0515, "step": 6700 }, { "epoch": 0.3939180462604203, "grad_norm": 1.7056807279586792, "learning_rate": 9.094404085962676e-05, "loss": 0.0325, "step": 6710 }, { "epoch": 0.3945051074321944, "grad_norm": 2.2686026096343994, "learning_rate": 9.091748131525212e-05, "loss": 0.0264, "step": 6720 }, { "epoch": 0.3950921686039685, "grad_norm": 3.3668978214263916, "learning_rate": 9.089088677082838e-05, "loss": 0.03, "step": 6730 }, { "epoch": 0.39567922977574266, "grad_norm": 2.45164155960083, "learning_rate": 9.086425724910403e-05, "loss": 0.0306, "step": 6740 }, { "epoch": 0.39626629094751675, "grad_norm": 1.7831369638442993, "learning_rate": 9.083759277285745e-05, "loss": 0.053, "step": 6750 }, { "epoch": 0.39685335211929085, "grad_norm": 0.3770430386066437, "learning_rate": 9.081089336489694e-05, "loss": 0.0267, "step": 6760 }, { "epoch": 0.39744041329106494, "grad_norm": 2.116936206817627, "learning_rate": 9.078415904806068e-05, "loss": 0.0644, "step": 6770 }, { "epoch": 0.39802747446283904, "grad_norm": 2.9683051109313965, "learning_rate": 9.07573898452167e-05, "loss": 0.0488, "step": 6780 }, { "epoch": 0.39861453563461313, "grad_norm": 1.8044674396514893, "learning_rate": 9.073058577926287e-05, "loss": 0.0438, "step": 6790 }, { "epoch": 0.3992015968063872, "grad_norm": 1.5561860799789429, "learning_rate": 9.070374687312689e-05, "loss": 0.052, "step": 6800 }, { "epoch": 0.3997886579781613, "grad_norm": 2.254608631134033, "learning_rate": 9.067687314976627e-05, "loss": 0.0403, "step": 6810 }, { "epoch": 0.4003757191499354, "grad_norm": 2.078115463256836, "learning_rate": 9.064996463216828e-05, "loss": 0.0319, "step": 6820 }, { "epoch": 0.4009627803217095, "grad_norm": 2.6184616088867188, "learning_rate": 9.062302134334998e-05, "loss": 0.0419, "step": 6830 }, { "epoch": 0.4015498414934836, "grad_norm": 1.869310975074768, "learning_rate": 9.059604330635813e-05, "loss": 0.0411, "step": 6840 }, { "epoch": 0.4021369026652577, "grad_norm": 1.1796845197677612, "learning_rate": 9.056903054426927e-05, "loss": 0.0379, "step": 6850 }, { "epoch": 0.4027239638370318, "grad_norm": 1.6515157222747803, "learning_rate": 9.054198308018957e-05, "loss": 0.0372, "step": 6860 }, { "epoch": 0.4033110250088059, "grad_norm": 1.957228183746338, "learning_rate": 9.051490093725494e-05, "loss": 0.0495, "step": 6870 }, { "epoch": 0.40389808618058004, "grad_norm": 1.7962557077407837, "learning_rate": 9.048778413863097e-05, "loss": 0.0314, "step": 6880 }, { "epoch": 0.40448514735235414, "grad_norm": 2.7375149726867676, "learning_rate": 9.046063270751283e-05, "loss": 0.0635, "step": 6890 }, { "epoch": 0.40507220852412823, "grad_norm": 2.7061636447906494, "learning_rate": 9.043344666712537e-05, "loss": 0.069, "step": 6900 }, { "epoch": 0.4056592696959023, "grad_norm": 5.234796047210693, "learning_rate": 9.0406226040723e-05, "loss": 0.0421, "step": 6910 }, { "epoch": 0.4062463308676764, "grad_norm": 3.509838581085205, "learning_rate": 9.037897085158976e-05, "loss": 0.0471, "step": 6920 }, { "epoch": 0.4068333920394505, "grad_norm": 2.576282024383545, "learning_rate": 9.03516811230392e-05, "loss": 0.0873, "step": 6930 }, { "epoch": 0.4074204532112246, "grad_norm": 2.2423412799835205, "learning_rate": 9.032435687841445e-05, "loss": 0.0394, "step": 6940 }, { "epoch": 0.4080075143829987, "grad_norm": 3.413788318634033, "learning_rate": 9.029699814108818e-05, "loss": 0.0897, "step": 6950 }, { "epoch": 0.4085945755547728, "grad_norm": 3.610041618347168, "learning_rate": 9.026960493446252e-05, "loss": 0.0648, "step": 6960 }, { "epoch": 0.4091816367265469, "grad_norm": 3.931255340576172, "learning_rate": 9.024217728196913e-05, "loss": 0.0553, "step": 6970 }, { "epoch": 0.409768697898321, "grad_norm": 1.4837568998336792, "learning_rate": 9.02147152070691e-05, "loss": 0.0418, "step": 6980 }, { "epoch": 0.4103557590700951, "grad_norm": 4.453707695007324, "learning_rate": 9.018721873325295e-05, "loss": 0.0511, "step": 6990 }, { "epoch": 0.4109428202418692, "grad_norm": 4.750669479370117, "learning_rate": 9.015968788404069e-05, "loss": 0.061, "step": 7000 }, { "epoch": 0.41152988141364333, "grad_norm": 1.7518789768218994, "learning_rate": 9.013212268298168e-05, "loss": 0.0347, "step": 7010 }, { "epoch": 0.4121169425854174, "grad_norm": 1.1140742301940918, "learning_rate": 9.010452315365466e-05, "loss": 0.0689, "step": 7020 }, { "epoch": 0.4127040037571915, "grad_norm": 1.4189730882644653, "learning_rate": 9.007688931966778e-05, "loss": 0.0551, "step": 7030 }, { "epoch": 0.4132910649289656, "grad_norm": 2.33185076713562, "learning_rate": 9.004922120465849e-05, "loss": 0.0578, "step": 7040 }, { "epoch": 0.4138781261007397, "grad_norm": 2.9722962379455566, "learning_rate": 9.00215188322936e-05, "loss": 0.0406, "step": 7050 }, { "epoch": 0.4144651872725138, "grad_norm": 3.8774149417877197, "learning_rate": 8.999378222626915e-05, "loss": 0.0435, "step": 7060 }, { "epoch": 0.4150522484442879, "grad_norm": 1.6501243114471436, "learning_rate": 8.996601141031056e-05, "loss": 0.029, "step": 7070 }, { "epoch": 0.415639309616062, "grad_norm": 4.021002769470215, "learning_rate": 8.993820640817246e-05, "loss": 0.07, "step": 7080 }, { "epoch": 0.4162263707878361, "grad_norm": 3.8996949195861816, "learning_rate": 8.991036724363872e-05, "loss": 0.0368, "step": 7090 }, { "epoch": 0.4168134319596102, "grad_norm": 0.3041614592075348, "learning_rate": 8.988249394052247e-05, "loss": 0.033, "step": 7100 }, { "epoch": 0.4174004931313843, "grad_norm": 4.124356269836426, "learning_rate": 8.985458652266595e-05, "loss": 0.0487, "step": 7110 }, { "epoch": 0.41798755430315837, "grad_norm": 3.1162426471710205, "learning_rate": 8.98266450139407e-05, "loss": 0.0765, "step": 7120 }, { "epoch": 0.41857461547493247, "grad_norm": 1.3574343919754028, "learning_rate": 8.979866943824735e-05, "loss": 0.049, "step": 7130 }, { "epoch": 0.41916167664670656, "grad_norm": 4.351559638977051, "learning_rate": 8.977065981951566e-05, "loss": 0.044, "step": 7140 }, { "epoch": 0.4197487378184807, "grad_norm": 3.6652841567993164, "learning_rate": 8.974261618170459e-05, "loss": 0.0481, "step": 7150 }, { "epoch": 0.4203357989902548, "grad_norm": 2.909571409225464, "learning_rate": 8.97145385488021e-05, "loss": 0.0596, "step": 7160 }, { "epoch": 0.4209228601620289, "grad_norm": 2.387852668762207, "learning_rate": 8.968642694482527e-05, "loss": 0.0454, "step": 7170 }, { "epoch": 0.421509921333803, "grad_norm": 3.3645102977752686, "learning_rate": 8.965828139382026e-05, "loss": 0.0509, "step": 7180 }, { "epoch": 0.4220969825055771, "grad_norm": 3.189063787460327, "learning_rate": 8.963010191986225e-05, "loss": 0.0626, "step": 7190 }, { "epoch": 0.4226840436773512, "grad_norm": 4.78650426864624, "learning_rate": 8.960188854705543e-05, "loss": 0.0828, "step": 7200 }, { "epoch": 0.4232711048491253, "grad_norm": 3.422757625579834, "learning_rate": 8.957364129953297e-05, "loss": 0.0424, "step": 7210 }, { "epoch": 0.4238581660208994, "grad_norm": 2.0187454223632812, "learning_rate": 8.954536020145708e-05, "loss": 0.04, "step": 7220 }, { "epoch": 0.42444522719267347, "grad_norm": 1.3502254486083984, "learning_rate": 8.951704527701883e-05, "loss": 0.0317, "step": 7230 }, { "epoch": 0.42503228836444756, "grad_norm": 1.648254156112671, "learning_rate": 8.948869655043835e-05, "loss": 0.0397, "step": 7240 }, { "epoch": 0.42561934953622166, "grad_norm": 6.5923261642456055, "learning_rate": 8.946031404596453e-05, "loss": 0.0565, "step": 7250 }, { "epoch": 0.42620641070799575, "grad_norm": 2.213615655899048, "learning_rate": 8.943189778787528e-05, "loss": 0.051, "step": 7260 }, { "epoch": 0.42679347187976985, "grad_norm": 1.5767220258712769, "learning_rate": 8.940344780047736e-05, "loss": 0.0733, "step": 7270 }, { "epoch": 0.427380533051544, "grad_norm": 2.043565273284912, "learning_rate": 8.937496410810631e-05, "loss": 0.0604, "step": 7280 }, { "epoch": 0.4279675942233181, "grad_norm": 0.9879446625709534, "learning_rate": 8.934644673512656e-05, "loss": 0.0681, "step": 7290 }, { "epoch": 0.4285546553950922, "grad_norm": 2.74680757522583, "learning_rate": 8.931789570593134e-05, "loss": 0.0515, "step": 7300 }, { "epoch": 0.4291417165668663, "grad_norm": 2.332568645477295, "learning_rate": 8.928931104494267e-05, "loss": 0.0681, "step": 7310 }, { "epoch": 0.4297287777386404, "grad_norm": 1.6378368139266968, "learning_rate": 8.926069277661134e-05, "loss": 0.0606, "step": 7320 }, { "epoch": 0.43031583891041447, "grad_norm": 3.3819258213043213, "learning_rate": 8.923204092541688e-05, "loss": 0.041, "step": 7330 }, { "epoch": 0.43090290008218857, "grad_norm": 1.0741076469421387, "learning_rate": 8.920335551586755e-05, "loss": 0.054, "step": 7340 }, { "epoch": 0.43148996125396266, "grad_norm": 2.399531602859497, "learning_rate": 8.91746365725003e-05, "loss": 0.0392, "step": 7350 }, { "epoch": 0.43207702242573676, "grad_norm": 2.9005095958709717, "learning_rate": 8.914588411988078e-05, "loss": 0.0507, "step": 7360 }, { "epoch": 0.43266408359751085, "grad_norm": 2.7736499309539795, "learning_rate": 8.911709818260333e-05, "loss": 0.0509, "step": 7370 }, { "epoch": 0.43325114476928495, "grad_norm": 2.2151551246643066, "learning_rate": 8.908827878529087e-05, "loss": 0.0344, "step": 7380 }, { "epoch": 0.43383820594105904, "grad_norm": 1.2291438579559326, "learning_rate": 8.905942595259498e-05, "loss": 0.0263, "step": 7390 }, { "epoch": 0.43442526711283314, "grad_norm": 2.7212166786193848, "learning_rate": 8.903053970919585e-05, "loss": 0.0492, "step": 7400 }, { "epoch": 0.43501232828460723, "grad_norm": 3.3400795459747314, "learning_rate": 8.900162007980221e-05, "loss": 0.0421, "step": 7410 }, { "epoch": 0.4355993894563814, "grad_norm": 1.6024249792099, "learning_rate": 8.897266708915139e-05, "loss": 0.0447, "step": 7420 }, { "epoch": 0.4361864506281555, "grad_norm": 1.3425102233886719, "learning_rate": 8.894368076200923e-05, "loss": 0.0282, "step": 7430 }, { "epoch": 0.43677351179992957, "grad_norm": 1.875572919845581, "learning_rate": 8.891466112317008e-05, "loss": 0.0521, "step": 7440 }, { "epoch": 0.43736057297170366, "grad_norm": 6.709931373596191, "learning_rate": 8.888560819745682e-05, "loss": 0.0498, "step": 7450 }, { "epoch": 0.43794763414347776, "grad_norm": 2.244819402694702, "learning_rate": 8.885652200972077e-05, "loss": 0.0392, "step": 7460 }, { "epoch": 0.43853469531525185, "grad_norm": 2.1222212314605713, "learning_rate": 8.88274025848417e-05, "loss": 0.0499, "step": 7470 }, { "epoch": 0.43912175648702595, "grad_norm": 3.548957586288452, "learning_rate": 8.879824994772785e-05, "loss": 0.0544, "step": 7480 }, { "epoch": 0.43970881765880004, "grad_norm": 1.4378647804260254, "learning_rate": 8.876906412331582e-05, "loss": 0.0385, "step": 7490 }, { "epoch": 0.44029587883057414, "grad_norm": 2.783017158508301, "learning_rate": 8.873984513657061e-05, "loss": 0.0574, "step": 7500 }, { "epoch": 0.44088294000234823, "grad_norm": 5.680621147155762, "learning_rate": 8.871059301248563e-05, "loss": 0.0567, "step": 7510 }, { "epoch": 0.44147000117412233, "grad_norm": 2.7961585521698, "learning_rate": 8.868130777608256e-05, "loss": 0.0715, "step": 7520 }, { "epoch": 0.4420570623458964, "grad_norm": 5.537487983703613, "learning_rate": 8.865198945241147e-05, "loss": 0.0381, "step": 7530 }, { "epoch": 0.4426441235176705, "grad_norm": 1.4201655387878418, "learning_rate": 8.86226380665507e-05, "loss": 0.0256, "step": 7540 }, { "epoch": 0.4432311846894446, "grad_norm": 1.8118871450424194, "learning_rate": 8.859325364360687e-05, "loss": 0.0401, "step": 7550 }, { "epoch": 0.44381824586121876, "grad_norm": 1.7263232469558716, "learning_rate": 8.856383620871489e-05, "loss": 0.0249, "step": 7560 }, { "epoch": 0.44440530703299286, "grad_norm": 1.8279962539672852, "learning_rate": 8.853438578703786e-05, "loss": 0.0715, "step": 7570 }, { "epoch": 0.44499236820476695, "grad_norm": 1.603935956954956, "learning_rate": 8.850490240376711e-05, "loss": 0.0679, "step": 7580 }, { "epoch": 0.44557942937654105, "grad_norm": 2.0516746044158936, "learning_rate": 8.84753860841222e-05, "loss": 0.0672, "step": 7590 }, { "epoch": 0.44616649054831514, "grad_norm": 0.3904460668563843, "learning_rate": 8.844583685335084e-05, "loss": 0.0799, "step": 7600 }, { "epoch": 0.44675355172008924, "grad_norm": 1.5879621505737305, "learning_rate": 8.841625473672888e-05, "loss": 0.053, "step": 7610 }, { "epoch": 0.44734061289186333, "grad_norm": 1.8831861019134521, "learning_rate": 8.838663975956031e-05, "loss": 0.0579, "step": 7620 }, { "epoch": 0.4479276740636374, "grad_norm": 2.4368581771850586, "learning_rate": 8.835699194717724e-05, "loss": 0.0558, "step": 7630 }, { "epoch": 0.4485147352354115, "grad_norm": 1.7504124641418457, "learning_rate": 8.832731132493982e-05, "loss": 0.0529, "step": 7640 }, { "epoch": 0.4491017964071856, "grad_norm": 2.665923595428467, "learning_rate": 8.829759791823632e-05, "loss": 0.0364, "step": 7650 }, { "epoch": 0.4496888575789597, "grad_norm": 1.9041054248809814, "learning_rate": 8.826785175248308e-05, "loss": 0.0504, "step": 7660 }, { "epoch": 0.4502759187507338, "grad_norm": 3.0372838973999023, "learning_rate": 8.823807285312434e-05, "loss": 0.0517, "step": 7670 }, { "epoch": 0.4508629799225079, "grad_norm": 0.7600299119949341, "learning_rate": 8.820826124563245e-05, "loss": 0.052, "step": 7680 }, { "epoch": 0.45145004109428205, "grad_norm": 2.599184274673462, "learning_rate": 8.81784169555077e-05, "loss": 0.0604, "step": 7690 }, { "epoch": 0.45203710226605615, "grad_norm": 2.0229201316833496, "learning_rate": 8.814854000827832e-05, "loss": 0.0625, "step": 7700 }, { "epoch": 0.45262416343783024, "grad_norm": 0.2576180100440979, "learning_rate": 8.811863042950053e-05, "loss": 0.0472, "step": 7710 }, { "epoch": 0.45321122460960434, "grad_norm": 1.7950199842453003, "learning_rate": 8.80886882447584e-05, "loss": 0.0578, "step": 7720 }, { "epoch": 0.45379828578137843, "grad_norm": 2.8848981857299805, "learning_rate": 8.805871347966393e-05, "loss": 0.0431, "step": 7730 }, { "epoch": 0.4543853469531525, "grad_norm": 2.17931866645813, "learning_rate": 8.802870615985694e-05, "loss": 0.0449, "step": 7740 }, { "epoch": 0.4549724081249266, "grad_norm": 1.8323543071746826, "learning_rate": 8.799866631100516e-05, "loss": 0.0474, "step": 7750 }, { "epoch": 0.4555594692967007, "grad_norm": 3.3510382175445557, "learning_rate": 8.79685939588041e-05, "loss": 0.0513, "step": 7760 }, { "epoch": 0.4561465304684748, "grad_norm": 1.8694850206375122, "learning_rate": 8.79384891289771e-05, "loss": 0.066, "step": 7770 }, { "epoch": 0.4567335916402489, "grad_norm": 3.482624053955078, "learning_rate": 8.790835184727529e-05, "loss": 0.0741, "step": 7780 }, { "epoch": 0.457320652812023, "grad_norm": 3.115752935409546, "learning_rate": 8.787818213947749e-05, "loss": 0.036, "step": 7790 }, { "epoch": 0.4579077139837971, "grad_norm": 1.0774741172790527, "learning_rate": 8.784798003139034e-05, "loss": 0.0691, "step": 7800 }, { "epoch": 0.4584947751555712, "grad_norm": 0.8824712038040161, "learning_rate": 8.781774554884814e-05, "loss": 0.0548, "step": 7810 }, { "epoch": 0.4590818363273453, "grad_norm": 0.9113776087760925, "learning_rate": 8.778747871771292e-05, "loss": 0.0275, "step": 7820 }, { "epoch": 0.45966889749911943, "grad_norm": 1.0432971715927124, "learning_rate": 8.775717956387434e-05, "loss": 0.0255, "step": 7830 }, { "epoch": 0.46025595867089353, "grad_norm": 3.4298136234283447, "learning_rate": 8.772684811324975e-05, "loss": 0.0499, "step": 7840 }, { "epoch": 0.4608430198426676, "grad_norm": 5.1340837478637695, "learning_rate": 8.76964843917841e-05, "loss": 0.0538, "step": 7850 }, { "epoch": 0.4614300810144417, "grad_norm": 1.2287262678146362, "learning_rate": 8.766608842544993e-05, "loss": 0.0414, "step": 7860 }, { "epoch": 0.4620171421862158, "grad_norm": 1.3155686855316162, "learning_rate": 8.763566024024741e-05, "loss": 0.0185, "step": 7870 }, { "epoch": 0.4626042033579899, "grad_norm": 1.028479814529419, "learning_rate": 8.760519986220423e-05, "loss": 0.0441, "step": 7880 }, { "epoch": 0.463191264529764, "grad_norm": 1.9284944534301758, "learning_rate": 8.757470731737562e-05, "loss": 0.0283, "step": 7890 }, { "epoch": 0.4637783257015381, "grad_norm": 0.6915614008903503, "learning_rate": 8.754418263184437e-05, "loss": 0.0613, "step": 7900 }, { "epoch": 0.4643653868733122, "grad_norm": 1.5400358438491821, "learning_rate": 8.751362583172068e-05, "loss": 0.0352, "step": 7910 }, { "epoch": 0.4649524480450863, "grad_norm": 1.5949668884277344, "learning_rate": 8.748303694314227e-05, "loss": 0.0478, "step": 7920 }, { "epoch": 0.4655395092168604, "grad_norm": 2.550201416015625, "learning_rate": 8.745241599227433e-05, "loss": 0.0384, "step": 7930 }, { "epoch": 0.4661265703886345, "grad_norm": 2.6728291511535645, "learning_rate": 8.742176300530944e-05, "loss": 0.0604, "step": 7940 }, { "epoch": 0.46671363156040857, "grad_norm": 1.924499750137329, "learning_rate": 8.739107800846757e-05, "loss": 0.0594, "step": 7950 }, { "epoch": 0.4673006927321827, "grad_norm": 0.8227196931838989, "learning_rate": 8.736036102799614e-05, "loss": 0.0378, "step": 7960 }, { "epoch": 0.4678877539039568, "grad_norm": 2.2939882278442383, "learning_rate": 8.732961209016983e-05, "loss": 0.0438, "step": 7970 }, { "epoch": 0.4684748150757309, "grad_norm": 1.3413143157958984, "learning_rate": 8.729883122129075e-05, "loss": 0.0418, "step": 7980 }, { "epoch": 0.469061876247505, "grad_norm": 2.554335832595825, "learning_rate": 8.726801844768825e-05, "loss": 0.0261, "step": 7990 }, { "epoch": 0.4696489374192791, "grad_norm": 4.016257286071777, "learning_rate": 8.7237173795719e-05, "loss": 0.037, "step": 8000 }, { "epoch": 0.4702359985910532, "grad_norm": 1.072961688041687, "learning_rate": 8.720629729176697e-05, "loss": 0.0335, "step": 8010 }, { "epoch": 0.4708230597628273, "grad_norm": 2.696843385696411, "learning_rate": 8.717538896224332e-05, "loss": 0.0373, "step": 8020 }, { "epoch": 0.4714101209346014, "grad_norm": 1.040705680847168, "learning_rate": 8.714444883358646e-05, "loss": 0.0493, "step": 8030 }, { "epoch": 0.4719971821063755, "grad_norm": 1.6960396766662598, "learning_rate": 8.711347693226201e-05, "loss": 0.0435, "step": 8040 }, { "epoch": 0.4725842432781496, "grad_norm": 1.921654224395752, "learning_rate": 8.708247328476273e-05, "loss": 0.0376, "step": 8050 }, { "epoch": 0.47317130444992367, "grad_norm": 1.4431146383285522, "learning_rate": 8.705143791760859e-05, "loss": 0.0467, "step": 8060 }, { "epoch": 0.47375836562169776, "grad_norm": 4.363770484924316, "learning_rate": 8.702037085734664e-05, "loss": 0.0633, "step": 8070 }, { "epoch": 0.47434542679347186, "grad_norm": 0.7148435711860657, "learning_rate": 8.698927213055107e-05, "loss": 0.045, "step": 8080 }, { "epoch": 0.47493248796524595, "grad_norm": 1.6291950941085815, "learning_rate": 8.695814176382318e-05, "loss": 0.0345, "step": 8090 }, { "epoch": 0.4755195491370201, "grad_norm": 2.1824052333831787, "learning_rate": 8.692697978379125e-05, "loss": 0.05, "step": 8100 }, { "epoch": 0.4761066103087942, "grad_norm": 0.47206181287765503, "learning_rate": 8.68957862171107e-05, "loss": 0.0892, "step": 8110 }, { "epoch": 0.4766936714805683, "grad_norm": 2.351390838623047, "learning_rate": 8.68645610904639e-05, "loss": 0.0387, "step": 8120 }, { "epoch": 0.4772807326523424, "grad_norm": 1.7469288110733032, "learning_rate": 8.683330443056026e-05, "loss": 0.0432, "step": 8130 }, { "epoch": 0.4778677938241165, "grad_norm": 1.3828728199005127, "learning_rate": 8.680201626413612e-05, "loss": 0.0565, "step": 8140 }, { "epoch": 0.4784548549958906, "grad_norm": 2.343574047088623, "learning_rate": 8.677069661795479e-05, "loss": 0.0431, "step": 8150 }, { "epoch": 0.47904191616766467, "grad_norm": 0.9054991602897644, "learning_rate": 8.673934551880654e-05, "loss": 0.0439, "step": 8160 }, { "epoch": 0.47962897733943877, "grad_norm": 1.9883683919906616, "learning_rate": 8.67079629935085e-05, "loss": 0.0344, "step": 8170 }, { "epoch": 0.48021603851121286, "grad_norm": 4.34842586517334, "learning_rate": 8.667654906890469e-05, "loss": 0.0422, "step": 8180 }, { "epoch": 0.48080309968298696, "grad_norm": 2.466914415359497, "learning_rate": 8.664510377186599e-05, "loss": 0.0428, "step": 8190 }, { "epoch": 0.48139016085476105, "grad_norm": 3.4293391704559326, "learning_rate": 8.661362712929013e-05, "loss": 0.044, "step": 8200 }, { "epoch": 0.48197722202653515, "grad_norm": 1.3960778713226318, "learning_rate": 8.658211916810165e-05, "loss": 0.0493, "step": 8210 }, { "epoch": 0.48256428319830924, "grad_norm": 3.3009042739868164, "learning_rate": 8.655057991525186e-05, "loss": 0.0518, "step": 8220 }, { "epoch": 0.48315134437008334, "grad_norm": 2.5035552978515625, "learning_rate": 8.651900939771884e-05, "loss": 0.0276, "step": 8230 }, { "epoch": 0.4837384055418575, "grad_norm": 3.129765272140503, "learning_rate": 8.648740764250745e-05, "loss": 0.0557, "step": 8240 }, { "epoch": 0.4843254667136316, "grad_norm": 3.4605374336242676, "learning_rate": 8.645577467664919e-05, "loss": 0.04, "step": 8250 }, { "epoch": 0.4849125278854057, "grad_norm": 3.6204116344451904, "learning_rate": 8.642411052720235e-05, "loss": 0.0464, "step": 8260 }, { "epoch": 0.48549958905717977, "grad_norm": 4.532052516937256, "learning_rate": 8.639241522125185e-05, "loss": 0.0689, "step": 8270 }, { "epoch": 0.48608665022895386, "grad_norm": 1.2190687656402588, "learning_rate": 8.636068878590924e-05, "loss": 0.0387, "step": 8280 }, { "epoch": 0.48667371140072796, "grad_norm": 2.1063272953033447, "learning_rate": 8.632893124831273e-05, "loss": 0.0677, "step": 8290 }, { "epoch": 0.48726077257250205, "grad_norm": 1.011233925819397, "learning_rate": 8.629714263562716e-05, "loss": 0.0424, "step": 8300 }, { "epoch": 0.48784783374427615, "grad_norm": 0.7251833081245422, "learning_rate": 8.626532297504386e-05, "loss": 0.0513, "step": 8310 }, { "epoch": 0.48843489491605024, "grad_norm": 1.6562292575836182, "learning_rate": 8.62334722937808e-05, "loss": 0.0686, "step": 8320 }, { "epoch": 0.48902195608782434, "grad_norm": 1.2332732677459717, "learning_rate": 8.620159061908245e-05, "loss": 0.0625, "step": 8330 }, { "epoch": 0.48960901725959843, "grad_norm": 1.7628116607666016, "learning_rate": 8.61696779782198e-05, "loss": 0.02, "step": 8340 }, { "epoch": 0.49019607843137253, "grad_norm": 2.3033394813537598, "learning_rate": 8.613773439849034e-05, "loss": 0.0586, "step": 8350 }, { "epoch": 0.4907831396031466, "grad_norm": 0.787381112575531, "learning_rate": 8.610575990721799e-05, "loss": 0.0489, "step": 8360 }, { "epoch": 0.4913702007749208, "grad_norm": 4.453810691833496, "learning_rate": 8.607375453175316e-05, "loss": 0.0537, "step": 8370 }, { "epoch": 0.49195726194669487, "grad_norm": 1.4812476634979248, "learning_rate": 8.604171829947263e-05, "loss": 0.0489, "step": 8380 }, { "epoch": 0.49254432311846896, "grad_norm": 1.9119701385498047, "learning_rate": 8.600965123777957e-05, "loss": 0.0457, "step": 8390 }, { "epoch": 0.49313138429024306, "grad_norm": 1.4203485250473022, "learning_rate": 8.59775533741036e-05, "loss": 0.0472, "step": 8400 }, { "epoch": 0.49371844546201715, "grad_norm": 1.6807453632354736, "learning_rate": 8.594542473590062e-05, "loss": 0.0393, "step": 8410 }, { "epoch": 0.49430550663379125, "grad_norm": 1.0950032472610474, "learning_rate": 8.591326535065283e-05, "loss": 0.0455, "step": 8420 }, { "epoch": 0.49489256780556534, "grad_norm": 2.4319326877593994, "learning_rate": 8.58810752458688e-05, "loss": 0.0638, "step": 8430 }, { "epoch": 0.49547962897733944, "grad_norm": 1.7978637218475342, "learning_rate": 8.584885444908333e-05, "loss": 0.0314, "step": 8440 }, { "epoch": 0.49606669014911353, "grad_norm": 2.297290802001953, "learning_rate": 8.58166029878575e-05, "loss": 0.0723, "step": 8450 }, { "epoch": 0.4966537513208876, "grad_norm": 0.6186020374298096, "learning_rate": 8.578432088977859e-05, "loss": 0.061, "step": 8460 }, { "epoch": 0.4972408124926617, "grad_norm": 0.46362197399139404, "learning_rate": 8.575200818246012e-05, "loss": 0.0401, "step": 8470 }, { "epoch": 0.4978278736644358, "grad_norm": 2.507780075073242, "learning_rate": 8.571966489354178e-05, "loss": 0.0735, "step": 8480 }, { "epoch": 0.4984149348362099, "grad_norm": 1.0863014459609985, "learning_rate": 8.568729105068939e-05, "loss": 0.0345, "step": 8490 }, { "epoch": 0.499001996007984, "grad_norm": 2.749016523361206, "learning_rate": 8.565488668159496e-05, "loss": 0.0429, "step": 8500 }, { "epoch": 0.49958905717975816, "grad_norm": 3.3777594566345215, "learning_rate": 8.562245181397655e-05, "loss": 0.0377, "step": 8510 }, { "epoch": 0.5001761183515322, "grad_norm": 2.7208309173583984, "learning_rate": 8.558998647557837e-05, "loss": 0.035, "step": 8520 }, { "epoch": 0.5007631795233063, "grad_norm": 1.5163689851760864, "learning_rate": 8.555749069417065e-05, "loss": 0.0461, "step": 8530 }, { "epoch": 0.5013502406950804, "grad_norm": 1.4222984313964844, "learning_rate": 8.552496449754967e-05, "loss": 0.0585, "step": 8540 }, { "epoch": 0.5019373018668545, "grad_norm": 2.8832669258117676, "learning_rate": 8.549240791353775e-05, "loss": 0.0655, "step": 8550 }, { "epoch": 0.5025243630386286, "grad_norm": 2.0506932735443115, "learning_rate": 8.545982096998315e-05, "loss": 0.0449, "step": 8560 }, { "epoch": 0.5031114242104027, "grad_norm": 6.83305025100708, "learning_rate": 8.542720369476016e-05, "loss": 0.0413, "step": 8570 }, { "epoch": 0.5036984853821769, "grad_norm": 1.7522202730178833, "learning_rate": 8.539455611576898e-05, "loss": 0.0411, "step": 8580 }, { "epoch": 0.504285546553951, "grad_norm": 0.4490013122558594, "learning_rate": 8.536187826093576e-05, "loss": 0.0509, "step": 8590 }, { "epoch": 0.5048726077257251, "grad_norm": 1.3018290996551514, "learning_rate": 8.53291701582125e-05, "loss": 0.0633, "step": 8600 }, { "epoch": 0.5054596688974992, "grad_norm": 3.179797887802124, "learning_rate": 8.529643183557708e-05, "loss": 0.0498, "step": 8610 }, { "epoch": 0.5060467300692733, "grad_norm": 4.824548721313477, "learning_rate": 8.52636633210333e-05, "loss": 0.0539, "step": 8620 }, { "epoch": 0.5066337912410473, "grad_norm": 2.7731845378875732, "learning_rate": 8.52308646426107e-05, "loss": 0.0432, "step": 8630 }, { "epoch": 0.5072208524128214, "grad_norm": 2.331210136413574, "learning_rate": 8.519803582836467e-05, "loss": 0.0688, "step": 8640 }, { "epoch": 0.5078079135845955, "grad_norm": 2.3192410469055176, "learning_rate": 8.516517690637638e-05, "loss": 0.0453, "step": 8650 }, { "epoch": 0.5083949747563696, "grad_norm": 3.357856273651123, "learning_rate": 8.513228790475269e-05, "loss": 0.0641, "step": 8660 }, { "epoch": 0.5089820359281437, "grad_norm": 2.9671857357025146, "learning_rate": 8.509936885162629e-05, "loss": 0.0401, "step": 8670 }, { "epoch": 0.5095690970999178, "grad_norm": 2.087934732437134, "learning_rate": 8.50664197751555e-05, "loss": 0.0291, "step": 8680 }, { "epoch": 0.5101561582716919, "grad_norm": 0.8386021256446838, "learning_rate": 8.503344070352434e-05, "loss": 0.068, "step": 8690 }, { "epoch": 0.510743219443466, "grad_norm": 2.6947085857391357, "learning_rate": 8.50004316649425e-05, "loss": 0.0363, "step": 8700 }, { "epoch": 0.5113302806152401, "grad_norm": 2.5281970500946045, "learning_rate": 8.496739268764529e-05, "loss": 0.0473, "step": 8710 }, { "epoch": 0.5119173417870142, "grad_norm": 1.7762385606765747, "learning_rate": 8.493432379989365e-05, "loss": 0.0427, "step": 8720 }, { "epoch": 0.5125044029587883, "grad_norm": 3.140371799468994, "learning_rate": 8.490122502997406e-05, "loss": 0.0347, "step": 8730 }, { "epoch": 0.5130914641305624, "grad_norm": 1.5524842739105225, "learning_rate": 8.486809640619859e-05, "loss": 0.0272, "step": 8740 }, { "epoch": 0.5136785253023365, "grad_norm": 1.11879563331604, "learning_rate": 8.483493795690489e-05, "loss": 0.0409, "step": 8750 }, { "epoch": 0.5142655864741106, "grad_norm": 1.88312566280365, "learning_rate": 8.480174971045603e-05, "loss": 0.0711, "step": 8760 }, { "epoch": 0.5148526476458847, "grad_norm": 2.326946496963501, "learning_rate": 8.476853169524065e-05, "loss": 0.0409, "step": 8770 }, { "epoch": 0.5154397088176588, "grad_norm": 3.046353578567505, "learning_rate": 8.473528393967278e-05, "loss": 0.0588, "step": 8780 }, { "epoch": 0.5160267699894329, "grad_norm": 0.6722098588943481, "learning_rate": 8.470200647219198e-05, "loss": 0.0395, "step": 8790 }, { "epoch": 0.516613831161207, "grad_norm": 0.5529523491859436, "learning_rate": 8.466869932126314e-05, "loss": 0.0637, "step": 8800 }, { "epoch": 0.517200892332981, "grad_norm": 2.5950729846954346, "learning_rate": 8.463536251537656e-05, "loss": 0.0444, "step": 8810 }, { "epoch": 0.5177879535047551, "grad_norm": 4.526224136352539, "learning_rate": 8.460199608304797e-05, "loss": 0.0414, "step": 8820 }, { "epoch": 0.5183750146765292, "grad_norm": 2.620432138442993, "learning_rate": 8.456860005281835e-05, "loss": 0.0439, "step": 8830 }, { "epoch": 0.5189620758483033, "grad_norm": 2.836040496826172, "learning_rate": 8.453517445325405e-05, "loss": 0.0615, "step": 8840 }, { "epoch": 0.5195491370200775, "grad_norm": 2.2003190517425537, "learning_rate": 8.450171931294673e-05, "loss": 0.0298, "step": 8850 }, { "epoch": 0.5201361981918516, "grad_norm": 2.7201812267303467, "learning_rate": 8.446823466051326e-05, "loss": 0.0331, "step": 8860 }, { "epoch": 0.5207232593636257, "grad_norm": 2.7622556686401367, "learning_rate": 8.44347205245958e-05, "loss": 0.0555, "step": 8870 }, { "epoch": 0.5213103205353998, "grad_norm": 2.5372259616851807, "learning_rate": 8.440117693386171e-05, "loss": 0.0417, "step": 8880 }, { "epoch": 0.5218973817071739, "grad_norm": 4.218869209289551, "learning_rate": 8.436760391700355e-05, "loss": 0.0486, "step": 8890 }, { "epoch": 0.522484442878948, "grad_norm": 0.7023425102233887, "learning_rate": 8.433400150273906e-05, "loss": 0.0334, "step": 8900 }, { "epoch": 0.5230715040507221, "grad_norm": 1.4348467588424683, "learning_rate": 8.430036971981112e-05, "loss": 0.0404, "step": 8910 }, { "epoch": 0.5236585652224962, "grad_norm": 2.7175183296203613, "learning_rate": 8.426670859698771e-05, "loss": 0.0634, "step": 8920 }, { "epoch": 0.5242456263942703, "grad_norm": 2.485643148422241, "learning_rate": 8.423301816306193e-05, "loss": 0.0328, "step": 8930 }, { "epoch": 0.5248326875660444, "grad_norm": 1.6880441904067993, "learning_rate": 8.419929844685197e-05, "loss": 0.056, "step": 8940 }, { "epoch": 0.5254197487378185, "grad_norm": 1.1850073337554932, "learning_rate": 8.416554947720104e-05, "loss": 0.0263, "step": 8950 }, { "epoch": 0.5260068099095926, "grad_norm": 2.0261285305023193, "learning_rate": 8.413177128297734e-05, "loss": 0.0743, "step": 8960 }, { "epoch": 0.5265938710813667, "grad_norm": 1.2257660627365112, "learning_rate": 8.409796389307417e-05, "loss": 0.0352, "step": 8970 }, { "epoch": 0.5271809322531408, "grad_norm": 3.7465314865112305, "learning_rate": 8.406412733640967e-05, "loss": 0.0339, "step": 8980 }, { "epoch": 0.5277679934249149, "grad_norm": 1.9524476528167725, "learning_rate": 8.403026164192704e-05, "loss": 0.0683, "step": 8990 }, { "epoch": 0.528355054596689, "grad_norm": 3.1063032150268555, "learning_rate": 8.399636683859437e-05, "loss": 0.0487, "step": 9000 }, { "epoch": 0.528355054596689, "eval_loss": 0.44510969519615173, "eval_runtime": 269.6504, "eval_samples_per_second": 3.505, "eval_steps_per_second": 3.505, "step": 9000 }, { "epoch": 0.5289421157684631, "grad_norm": 1.7905889749526978, "learning_rate": 8.396244295540462e-05, "loss": 0.0762, "step": 9010 }, { "epoch": 0.5295291769402372, "grad_norm": 0.9049012660980225, "learning_rate": 8.392849002137566e-05, "loss": 0.0471, "step": 9020 }, { "epoch": 0.5301162381120113, "grad_norm": 1.638697624206543, "learning_rate": 8.389450806555017e-05, "loss": 0.0526, "step": 9030 }, { "epoch": 0.5307032992837853, "grad_norm": 1.720030426979065, "learning_rate": 8.386049711699571e-05, "loss": 0.0532, "step": 9040 }, { "epoch": 0.5312903604555594, "grad_norm": 2.473928689956665, "learning_rate": 8.38264572048046e-05, "loss": 0.0332, "step": 9050 }, { "epoch": 0.5318774216273335, "grad_norm": 2.829211950302124, "learning_rate": 8.379238835809393e-05, "loss": 0.0586, "step": 9060 }, { "epoch": 0.5324644827991076, "grad_norm": 1.035154104232788, "learning_rate": 8.37582906060056e-05, "loss": 0.065, "step": 9070 }, { "epoch": 0.5330515439708817, "grad_norm": 1.434273362159729, "learning_rate": 8.372416397770613e-05, "loss": 0.0353, "step": 9080 }, { "epoch": 0.5336386051426558, "grad_norm": 0.6541376113891602, "learning_rate": 8.369000850238683e-05, "loss": 0.047, "step": 9090 }, { "epoch": 0.5342256663144299, "grad_norm": 2.1451282501220703, "learning_rate": 8.365582420926366e-05, "loss": 0.0411, "step": 9100 }, { "epoch": 0.534812727486204, "grad_norm": 3.246711492538452, "learning_rate": 8.362161112757723e-05, "loss": 0.0321, "step": 9110 }, { "epoch": 0.5353997886579782, "grad_norm": 3.385925054550171, "learning_rate": 8.358736928659274e-05, "loss": 0.0315, "step": 9120 }, { "epoch": 0.5359868498297523, "grad_norm": 2.667985200881958, "learning_rate": 8.355309871560006e-05, "loss": 0.0525, "step": 9130 }, { "epoch": 0.5365739110015264, "grad_norm": 0.6501687169075012, "learning_rate": 8.351879944391357e-05, "loss": 0.0386, "step": 9140 }, { "epoch": 0.5371609721733005, "grad_norm": 1.7853676080703735, "learning_rate": 8.348447150087223e-05, "loss": 0.0481, "step": 9150 }, { "epoch": 0.5377480333450746, "grad_norm": 0.8585736751556396, "learning_rate": 8.345011491583954e-05, "loss": 0.0327, "step": 9160 }, { "epoch": 0.5383350945168487, "grad_norm": 1.4739607572555542, "learning_rate": 8.341572971820344e-05, "loss": 0.0356, "step": 9170 }, { "epoch": 0.5389221556886228, "grad_norm": 0.4403301775455475, "learning_rate": 8.338131593737643e-05, "loss": 0.0458, "step": 9180 }, { "epoch": 0.5395092168603969, "grad_norm": 2.5945067405700684, "learning_rate": 8.33468736027954e-05, "loss": 0.0429, "step": 9190 }, { "epoch": 0.540096278032171, "grad_norm": 2.551771640777588, "learning_rate": 8.331240274392167e-05, "loss": 0.0286, "step": 9200 }, { "epoch": 0.5406833392039451, "grad_norm": 0.41668054461479187, "learning_rate": 8.327790339024097e-05, "loss": 0.0459, "step": 9210 }, { "epoch": 0.5412704003757192, "grad_norm": 3.3726375102996826, "learning_rate": 8.324337557126342e-05, "loss": 0.0612, "step": 9220 }, { "epoch": 0.5418574615474933, "grad_norm": 2.2610208988189697, "learning_rate": 8.320881931652347e-05, "loss": 0.0482, "step": 9230 }, { "epoch": 0.5424445227192674, "grad_norm": 3.177828311920166, "learning_rate": 8.317423465557987e-05, "loss": 0.051, "step": 9240 }, { "epoch": 0.5430315838910414, "grad_norm": 0.9072557091712952, "learning_rate": 8.313962161801569e-05, "loss": 0.0479, "step": 9250 }, { "epoch": 0.5436186450628155, "grad_norm": 0.45060959458351135, "learning_rate": 8.310498023343832e-05, "loss": 0.0389, "step": 9260 }, { "epoch": 0.5442057062345896, "grad_norm": 3.351407527923584, "learning_rate": 8.307031053147932e-05, "loss": 0.0598, "step": 9270 }, { "epoch": 0.5447927674063637, "grad_norm": 2.226046562194824, "learning_rate": 8.30356125417945e-05, "loss": 0.0416, "step": 9280 }, { "epoch": 0.5453798285781378, "grad_norm": 2.5164947509765625, "learning_rate": 8.300088629406391e-05, "loss": 0.0609, "step": 9290 }, { "epoch": 0.5459668897499119, "grad_norm": 2.2050812244415283, "learning_rate": 8.296613181799168e-05, "loss": 0.0544, "step": 9300 }, { "epoch": 0.546553950921686, "grad_norm": 1.6396516561508179, "learning_rate": 8.293134914330618e-05, "loss": 0.0577, "step": 9310 }, { "epoch": 0.5471410120934601, "grad_norm": 0.5308670997619629, "learning_rate": 8.289653829975983e-05, "loss": 0.0249, "step": 9320 }, { "epoch": 0.5477280732652342, "grad_norm": 2.1728813648223877, "learning_rate": 8.286169931712921e-05, "loss": 0.0487, "step": 9330 }, { "epoch": 0.5483151344370083, "grad_norm": 1.4126052856445312, "learning_rate": 8.28268322252149e-05, "loss": 0.0242, "step": 9340 }, { "epoch": 0.5489021956087824, "grad_norm": 1.1930028200149536, "learning_rate": 8.279193705384159e-05, "loss": 0.0456, "step": 9350 }, { "epoch": 0.5494892567805565, "grad_norm": 2.592273235321045, "learning_rate": 8.275701383285795e-05, "loss": 0.0603, "step": 9360 }, { "epoch": 0.5500763179523306, "grad_norm": 4.0102128982543945, "learning_rate": 8.272206259213662e-05, "loss": 0.0591, "step": 9370 }, { "epoch": 0.5506633791241047, "grad_norm": 3.490880012512207, "learning_rate": 8.268708336157428e-05, "loss": 0.0384, "step": 9380 }, { "epoch": 0.5512504402958789, "grad_norm": 3.501849889755249, "learning_rate": 8.265207617109148e-05, "loss": 0.0397, "step": 9390 }, { "epoch": 0.551837501467653, "grad_norm": 1.536889910697937, "learning_rate": 8.261704105063275e-05, "loss": 0.0364, "step": 9400 }, { "epoch": 0.5524245626394271, "grad_norm": 1.7452349662780762, "learning_rate": 8.258197803016646e-05, "loss": 0.0466, "step": 9410 }, { "epoch": 0.5530116238112012, "grad_norm": 4.080146789550781, "learning_rate": 8.254688713968484e-05, "loss": 0.0566, "step": 9420 }, { "epoch": 0.5535986849829753, "grad_norm": 1.8155280351638794, "learning_rate": 8.2511768409204e-05, "loss": 0.0258, "step": 9430 }, { "epoch": 0.5541857461547494, "grad_norm": 0.7214668989181519, "learning_rate": 8.247662186876386e-05, "loss": 0.0609, "step": 9440 }, { "epoch": 0.5547728073265235, "grad_norm": 2.605099678039551, "learning_rate": 8.244144754842809e-05, "loss": 0.0304, "step": 9450 }, { "epoch": 0.5553598684982975, "grad_norm": 2.348184823989868, "learning_rate": 8.240624547828417e-05, "loss": 0.0601, "step": 9460 }, { "epoch": 0.5559469296700716, "grad_norm": 1.3189982175827026, "learning_rate": 8.237101568844328e-05, "loss": 0.0498, "step": 9470 }, { "epoch": 0.5565339908418457, "grad_norm": 2.5813090801239014, "learning_rate": 8.233575820904032e-05, "loss": 0.0587, "step": 9480 }, { "epoch": 0.5571210520136198, "grad_norm": 3.1107876300811768, "learning_rate": 8.23004730702339e-05, "loss": 0.0665, "step": 9490 }, { "epoch": 0.5577081131853939, "grad_norm": 0.7311292290687561, "learning_rate": 8.226516030220623e-05, "loss": 0.0451, "step": 9500 }, { "epoch": 0.558295174357168, "grad_norm": 1.5079210996627808, "learning_rate": 8.222981993516324e-05, "loss": 0.0621, "step": 9510 }, { "epoch": 0.5588822355289421, "grad_norm": 2.4844210147857666, "learning_rate": 8.219445199933437e-05, "loss": 0.0425, "step": 9520 }, { "epoch": 0.5594692967007162, "grad_norm": 1.9239799976348877, "learning_rate": 8.215905652497273e-05, "loss": 0.0581, "step": 9530 }, { "epoch": 0.5600563578724903, "grad_norm": 1.2899315357208252, "learning_rate": 8.212363354235494e-05, "loss": 0.0308, "step": 9540 }, { "epoch": 0.5606434190442644, "grad_norm": 1.7348955869674683, "learning_rate": 8.208818308178114e-05, "loss": 0.0491, "step": 9550 }, { "epoch": 0.5612304802160385, "grad_norm": 1.6830767393112183, "learning_rate": 8.205270517357502e-05, "loss": 0.0495, "step": 9560 }, { "epoch": 0.5618175413878126, "grad_norm": 2.3700168132781982, "learning_rate": 8.201719984808369e-05, "loss": 0.0423, "step": 9570 }, { "epoch": 0.5624046025595867, "grad_norm": 3.7524573802948, "learning_rate": 8.198166713567777e-05, "loss": 0.0618, "step": 9580 }, { "epoch": 0.5629916637313608, "grad_norm": 0.6316405534744263, "learning_rate": 8.194610706675125e-05, "loss": 0.0315, "step": 9590 }, { "epoch": 0.5635787249031349, "grad_norm": 1.4591659307479858, "learning_rate": 8.191051967172157e-05, "loss": 0.0439, "step": 9600 }, { "epoch": 0.564165786074909, "grad_norm": 0.6856974363327026, "learning_rate": 8.18749049810295e-05, "loss": 0.0483, "step": 9610 }, { "epoch": 0.5647528472466831, "grad_norm": 3.4433891773223877, "learning_rate": 8.183926302513923e-05, "loss": 0.0445, "step": 9620 }, { "epoch": 0.5653399084184572, "grad_norm": 1.6874566078186035, "learning_rate": 8.180359383453815e-05, "loss": 0.0468, "step": 9630 }, { "epoch": 0.5659269695902313, "grad_norm": 2.5820140838623047, "learning_rate": 8.176789743973707e-05, "loss": 0.0367, "step": 9640 }, { "epoch": 0.5665140307620053, "grad_norm": 3.7962241172790527, "learning_rate": 8.173217387127004e-05, "loss": 0.0392, "step": 9650 }, { "epoch": 0.5671010919337796, "grad_norm": 2.679847240447998, "learning_rate": 8.169642315969427e-05, "loss": 0.0253, "step": 9660 }, { "epoch": 0.5676881531055537, "grad_norm": 2.028122901916504, "learning_rate": 8.166064533559028e-05, "loss": 0.0734, "step": 9670 }, { "epoch": 0.5682752142773277, "grad_norm": 2.9593400955200195, "learning_rate": 8.162484042956178e-05, "loss": 0.0219, "step": 9680 }, { "epoch": 0.5688622754491018, "grad_norm": 1.2236310243606567, "learning_rate": 8.158900847223556e-05, "loss": 0.0289, "step": 9690 }, { "epoch": 0.5694493366208759, "grad_norm": 1.4059587717056274, "learning_rate": 8.155314949426167e-05, "loss": 0.0306, "step": 9700 }, { "epoch": 0.57003639779265, "grad_norm": 4.031301975250244, "learning_rate": 8.151726352631316e-05, "loss": 0.0364, "step": 9710 }, { "epoch": 0.5706234589644241, "grad_norm": 3.6776328086853027, "learning_rate": 8.148135059908624e-05, "loss": 0.0351, "step": 9720 }, { "epoch": 0.5712105201361982, "grad_norm": 0.8490133285522461, "learning_rate": 8.144541074330015e-05, "loss": 0.0398, "step": 9730 }, { "epoch": 0.5717975813079723, "grad_norm": 1.5910861492156982, "learning_rate": 8.140944398969717e-05, "loss": 0.0396, "step": 9740 }, { "epoch": 0.5723846424797464, "grad_norm": 0.7195848822593689, "learning_rate": 8.13734503690426e-05, "loss": 0.0429, "step": 9750 }, { "epoch": 0.5729717036515205, "grad_norm": 1.655400037765503, "learning_rate": 8.13374299121247e-05, "loss": 0.0551, "step": 9760 }, { "epoch": 0.5735587648232946, "grad_norm": 0.9556081891059875, "learning_rate": 8.130138264975471e-05, "loss": 0.0505, "step": 9770 }, { "epoch": 0.5741458259950687, "grad_norm": 1.830175757408142, "learning_rate": 8.126530861276677e-05, "loss": 0.0361, "step": 9780 }, { "epoch": 0.5747328871668428, "grad_norm": 3.475700855255127, "learning_rate": 8.122920783201793e-05, "loss": 0.0206, "step": 9790 }, { "epoch": 0.5753199483386169, "grad_norm": 1.2503710985183716, "learning_rate": 8.119308033838814e-05, "loss": 0.0685, "step": 9800 }, { "epoch": 0.575907009510391, "grad_norm": 5.604869842529297, "learning_rate": 8.115692616278018e-05, "loss": 0.0688, "step": 9810 }, { "epoch": 0.5764940706821651, "grad_norm": 2.3936383724212646, "learning_rate": 8.112074533611967e-05, "loss": 0.0639, "step": 9820 }, { "epoch": 0.5770811318539392, "grad_norm": 3.6807427406311035, "learning_rate": 8.108453788935498e-05, "loss": 0.0428, "step": 9830 }, { "epoch": 0.5776681930257133, "grad_norm": 1.175811767578125, "learning_rate": 8.10483038534573e-05, "loss": 0.0579, "step": 9840 }, { "epoch": 0.5782552541974874, "grad_norm": 3.283430337905884, "learning_rate": 8.101204325942056e-05, "loss": 0.0532, "step": 9850 }, { "epoch": 0.5788423153692615, "grad_norm": 4.026200771331787, "learning_rate": 8.097575613826136e-05, "loss": 0.0328, "step": 9860 }, { "epoch": 0.5794293765410355, "grad_norm": 2.251068353652954, "learning_rate": 8.093944252101907e-05, "loss": 0.0543, "step": 9870 }, { "epoch": 0.5800164377128096, "grad_norm": 3.2121071815490723, "learning_rate": 8.090310243875565e-05, "loss": 0.0619, "step": 9880 }, { "epoch": 0.5806034988845837, "grad_norm": 1.9805493354797363, "learning_rate": 8.086673592255573e-05, "loss": 0.0463, "step": 9890 }, { "epoch": 0.5811905600563578, "grad_norm": 2.9728682041168213, "learning_rate": 8.083034300352657e-05, "loss": 0.0575, "step": 9900 }, { "epoch": 0.5817776212281319, "grad_norm": 1.6561614274978638, "learning_rate": 8.079392371279797e-05, "loss": 0.0478, "step": 9910 }, { "epoch": 0.582364682399906, "grad_norm": 2.3436312675476074, "learning_rate": 8.075747808152231e-05, "loss": 0.0742, "step": 9920 }, { "epoch": 0.5829517435716801, "grad_norm": 2.0365006923675537, "learning_rate": 8.072100614087453e-05, "loss": 0.0355, "step": 9930 }, { "epoch": 0.5835388047434543, "grad_norm": 2.1946043968200684, "learning_rate": 8.068450792205202e-05, "loss": 0.022, "step": 9940 }, { "epoch": 0.5841258659152284, "grad_norm": 3.2829816341400146, "learning_rate": 8.064798345627468e-05, "loss": 0.045, "step": 9950 }, { "epoch": 0.5847129270870025, "grad_norm": 1.0017696619033813, "learning_rate": 8.061143277478486e-05, "loss": 0.0498, "step": 9960 }, { "epoch": 0.5852999882587766, "grad_norm": 1.9448410272598267, "learning_rate": 8.057485590884733e-05, "loss": 0.0428, "step": 9970 }, { "epoch": 0.5858870494305507, "grad_norm": 1.5764429569244385, "learning_rate": 8.053825288974924e-05, "loss": 0.058, "step": 9980 }, { "epoch": 0.5864741106023248, "grad_norm": 3.9513072967529297, "learning_rate": 8.050162374880015e-05, "loss": 0.0606, "step": 9990 }, { "epoch": 0.5870611717740989, "grad_norm": 2.3251681327819824, "learning_rate": 8.046496851733193e-05, "loss": 0.0294, "step": 10000 }, { "epoch": 0.587648232945873, "grad_norm": 1.103040099143982, "learning_rate": 8.042828722669882e-05, "loss": 0.0439, "step": 10010 }, { "epoch": 0.5882352941176471, "grad_norm": 5.0034685134887695, "learning_rate": 8.039157990827727e-05, "loss": 0.0339, "step": 10020 }, { "epoch": 0.5888223552894212, "grad_norm": 1.0279072523117065, "learning_rate": 8.0354846593466e-05, "loss": 0.0549, "step": 10030 }, { "epoch": 0.5894094164611953, "grad_norm": 3.0541296005249023, "learning_rate": 8.031808731368608e-05, "loss": 0.0656, "step": 10040 }, { "epoch": 0.5899964776329694, "grad_norm": 1.1822483539581299, "learning_rate": 8.028130210038067e-05, "loss": 0.0358, "step": 10050 }, { "epoch": 0.5905835388047435, "grad_norm": 1.4830501079559326, "learning_rate": 8.024449098501514e-05, "loss": 0.0472, "step": 10060 }, { "epoch": 0.5911705999765176, "grad_norm": 0.9620664715766907, "learning_rate": 8.020765399907707e-05, "loss": 0.0366, "step": 10070 }, { "epoch": 0.5917576611482916, "grad_norm": 2.6279451847076416, "learning_rate": 8.017079117407611e-05, "loss": 0.031, "step": 10080 }, { "epoch": 0.5923447223200657, "grad_norm": 1.0248329639434814, "learning_rate": 8.013390254154402e-05, "loss": 0.0302, "step": 10090 }, { "epoch": 0.5929317834918398, "grad_norm": 4.71444034576416, "learning_rate": 8.009698813303465e-05, "loss": 0.0609, "step": 10100 }, { "epoch": 0.5935188446636139, "grad_norm": 1.409204363822937, "learning_rate": 8.006004798012393e-05, "loss": 0.0267, "step": 10110 }, { "epoch": 0.594105905835388, "grad_norm": 5.488358974456787, "learning_rate": 8.002308211440974e-05, "loss": 0.0621, "step": 10120 }, { "epoch": 0.5946929670071621, "grad_norm": 1.6564887762069702, "learning_rate": 7.998609056751199e-05, "loss": 0.0386, "step": 10130 }, { "epoch": 0.5952800281789362, "grad_norm": 2.0165855884552, "learning_rate": 7.994907337107258e-05, "loss": 0.0417, "step": 10140 }, { "epoch": 0.5958670893507103, "grad_norm": 1.646742343902588, "learning_rate": 7.991203055675532e-05, "loss": 0.0542, "step": 10150 }, { "epoch": 0.5964541505224844, "grad_norm": 2.7396490573883057, "learning_rate": 7.987496215624593e-05, "loss": 0.0617, "step": 10160 }, { "epoch": 0.5970412116942585, "grad_norm": 4.674464702606201, "learning_rate": 7.983786820125204e-05, "loss": 0.0545, "step": 10170 }, { "epoch": 0.5976282728660326, "grad_norm": 1.177596092224121, "learning_rate": 7.980074872350312e-05, "loss": 0.0369, "step": 10180 }, { "epoch": 0.5982153340378067, "grad_norm": 1.4684791564941406, "learning_rate": 7.976360375475047e-05, "loss": 0.04, "step": 10190 }, { "epoch": 0.5988023952095808, "grad_norm": 4.104825496673584, "learning_rate": 7.972643332676723e-05, "loss": 0.0356, "step": 10200 }, { "epoch": 0.599389456381355, "grad_norm": 1.0452766418457031, "learning_rate": 7.968923747134825e-05, "loss": 0.037, "step": 10210 }, { "epoch": 0.5999765175531291, "grad_norm": 1.7485004663467407, "learning_rate": 7.965201622031021e-05, "loss": 0.033, "step": 10220 }, { "epoch": 0.6005635787249032, "grad_norm": 1.3193644285202026, "learning_rate": 7.961476960549145e-05, "loss": 0.0455, "step": 10230 }, { "epoch": 0.6011506398966773, "grad_norm": 1.8149995803833008, "learning_rate": 7.957749765875204e-05, "loss": 0.0353, "step": 10240 }, { "epoch": 0.6017377010684514, "grad_norm": 3.0644371509552, "learning_rate": 7.954020041197369e-05, "loss": 0.0354, "step": 10250 }, { "epoch": 0.6023247622402255, "grad_norm": 2.376058340072632, "learning_rate": 7.950287789705977e-05, "loss": 0.0554, "step": 10260 }, { "epoch": 0.6029118234119996, "grad_norm": 2.0536141395568848, "learning_rate": 7.94655301459353e-05, "loss": 0.05, "step": 10270 }, { "epoch": 0.6034988845837737, "grad_norm": 1.821018934249878, "learning_rate": 7.942815719054679e-05, "loss": 0.0471, "step": 10280 }, { "epoch": 0.6040859457555477, "grad_norm": 1.5516284704208374, "learning_rate": 7.939075906286241e-05, "loss": 0.0455, "step": 10290 }, { "epoch": 0.6046730069273218, "grad_norm": 4.660377025604248, "learning_rate": 7.935333579487179e-05, "loss": 0.0454, "step": 10300 }, { "epoch": 0.6052600680990959, "grad_norm": 1.256773591041565, "learning_rate": 7.931588741858612e-05, "loss": 0.0406, "step": 10310 }, { "epoch": 0.60584712927087, "grad_norm": 4.528842926025391, "learning_rate": 7.927841396603804e-05, "loss": 0.0528, "step": 10320 }, { "epoch": 0.6064341904426441, "grad_norm": 1.1366862058639526, "learning_rate": 7.924091546928163e-05, "loss": 0.038, "step": 10330 }, { "epoch": 0.6070212516144182, "grad_norm": 1.158761978149414, "learning_rate": 7.920339196039239e-05, "loss": 0.0514, "step": 10340 }, { "epoch": 0.6076083127861923, "grad_norm": 2.8276240825653076, "learning_rate": 7.916584347146728e-05, "loss": 0.0446, "step": 10350 }, { "epoch": 0.6081953739579664, "grad_norm": 1.485958218574524, "learning_rate": 7.912827003462451e-05, "loss": 0.048, "step": 10360 }, { "epoch": 0.6087824351297405, "grad_norm": 2.347153425216675, "learning_rate": 7.909067168200375e-05, "loss": 0.0504, "step": 10370 }, { "epoch": 0.6093694963015146, "grad_norm": 2.4298806190490723, "learning_rate": 7.905304844576589e-05, "loss": 0.0303, "step": 10380 }, { "epoch": 0.6099565574732887, "grad_norm": 1.4049865007400513, "learning_rate": 7.901540035809316e-05, "loss": 0.0595, "step": 10390 }, { "epoch": 0.6105436186450628, "grad_norm": 1.0412266254425049, "learning_rate": 7.897772745118903e-05, "loss": 0.0341, "step": 10400 }, { "epoch": 0.6111306798168369, "grad_norm": 4.007213592529297, "learning_rate": 7.89400297572782e-05, "loss": 0.0698, "step": 10410 }, { "epoch": 0.611717740988611, "grad_norm": 2.98750376701355, "learning_rate": 7.890230730860657e-05, "loss": 0.0546, "step": 10420 }, { "epoch": 0.6123048021603851, "grad_norm": 4.189070224761963, "learning_rate": 7.886456013744124e-05, "loss": 0.0641, "step": 10430 }, { "epoch": 0.6128918633321592, "grad_norm": 3.3929412364959717, "learning_rate": 7.88267882760704e-05, "loss": 0.0735, "step": 10440 }, { "epoch": 0.6134789245039333, "grad_norm": 1.317287564277649, "learning_rate": 7.878899175680341e-05, "loss": 0.0211, "step": 10450 }, { "epoch": 0.6140659856757074, "grad_norm": 0.4969087839126587, "learning_rate": 7.875117061197071e-05, "loss": 0.0454, "step": 10460 }, { "epoch": 0.6146530468474815, "grad_norm": 4.775953769683838, "learning_rate": 7.87133248739238e-05, "loss": 0.0457, "step": 10470 }, { "epoch": 0.6152401080192557, "grad_norm": 2.372785806655884, "learning_rate": 7.867545457503521e-05, "loss": 0.0398, "step": 10480 }, { "epoch": 0.6158271691910298, "grad_norm": 2.851694107055664, "learning_rate": 7.863755974769851e-05, "loss": 0.0601, "step": 10490 }, { "epoch": 0.6164142303628038, "grad_norm": 1.553295612335205, "learning_rate": 7.859964042432819e-05, "loss": 0.0742, "step": 10500 }, { "epoch": 0.617001291534578, "grad_norm": 2.219014883041382, "learning_rate": 7.856169663735975e-05, "loss": 0.0496, "step": 10510 }, { "epoch": 0.617588352706352, "grad_norm": 3.396904230117798, "learning_rate": 7.852372841924961e-05, "loss": 0.0375, "step": 10520 }, { "epoch": 0.6181754138781261, "grad_norm": 1.845604658126831, "learning_rate": 7.848573580247505e-05, "loss": 0.0423, "step": 10530 }, { "epoch": 0.6187624750499002, "grad_norm": 2.3468329906463623, "learning_rate": 7.844771881953425e-05, "loss": 0.0428, "step": 10540 }, { "epoch": 0.6193495362216743, "grad_norm": 3.744361162185669, "learning_rate": 7.840967750294626e-05, "loss": 0.0537, "step": 10550 }, { "epoch": 0.6199365973934484, "grad_norm": 2.348733425140381, "learning_rate": 7.837161188525087e-05, "loss": 0.0297, "step": 10560 }, { "epoch": 0.6205236585652225, "grad_norm": 1.5336097478866577, "learning_rate": 7.83335219990087e-05, "loss": 0.0325, "step": 10570 }, { "epoch": 0.6211107197369966, "grad_norm": 2.7004475593566895, "learning_rate": 7.829540787680114e-05, "loss": 0.0458, "step": 10580 }, { "epoch": 0.6216977809087707, "grad_norm": 0.5332560539245605, "learning_rate": 7.82572695512303e-05, "loss": 0.0283, "step": 10590 }, { "epoch": 0.6222848420805448, "grad_norm": 3.654900550842285, "learning_rate": 7.8219107054919e-05, "loss": 0.0369, "step": 10600 }, { "epoch": 0.6228719032523189, "grad_norm": 2.0149872303009033, "learning_rate": 7.818092042051071e-05, "loss": 0.0563, "step": 10610 }, { "epoch": 0.623458964424093, "grad_norm": 1.153555154800415, "learning_rate": 7.814270968066956e-05, "loss": 0.0538, "step": 10620 }, { "epoch": 0.6240460255958671, "grad_norm": 4.492584705352783, "learning_rate": 7.810447486808032e-05, "loss": 0.0557, "step": 10630 }, { "epoch": 0.6246330867676412, "grad_norm": 4.326468467712402, "learning_rate": 7.806621601544832e-05, "loss": 0.0405, "step": 10640 }, { "epoch": 0.6252201479394153, "grad_norm": 2.076035499572754, "learning_rate": 7.802793315549948e-05, "loss": 0.0468, "step": 10650 }, { "epoch": 0.6258072091111894, "grad_norm": 3.4883980751037598, "learning_rate": 7.798962632098024e-05, "loss": 0.0497, "step": 10660 }, { "epoch": 0.6263942702829635, "grad_norm": 2.0908708572387695, "learning_rate": 7.795129554465754e-05, "loss": 0.0551, "step": 10670 }, { "epoch": 0.6269813314547376, "grad_norm": 3.256268262863159, "learning_rate": 7.791294085931882e-05, "loss": 0.0359, "step": 10680 }, { "epoch": 0.6275683926265117, "grad_norm": 2.078904867172241, "learning_rate": 7.787456229777196e-05, "loss": 0.059, "step": 10690 }, { "epoch": 0.6281554537982857, "grad_norm": 0.706121563911438, "learning_rate": 7.783615989284527e-05, "loss": 0.0184, "step": 10700 }, { "epoch": 0.6287425149700598, "grad_norm": 2.253751754760742, "learning_rate": 7.779773367738743e-05, "loss": 0.0463, "step": 10710 }, { "epoch": 0.6293295761418339, "grad_norm": 1.3410460948944092, "learning_rate": 7.775928368426751e-05, "loss": 0.0303, "step": 10720 }, { "epoch": 0.629916637313608, "grad_norm": 2.1979212760925293, "learning_rate": 7.772080994637494e-05, "loss": 0.0611, "step": 10730 }, { "epoch": 0.6305036984853821, "grad_norm": 2.3805341720581055, "learning_rate": 7.768231249661942e-05, "loss": 0.0401, "step": 10740 }, { "epoch": 0.6310907596571563, "grad_norm": 1.889790654182434, "learning_rate": 7.764379136793096e-05, "loss": 0.0368, "step": 10750 }, { "epoch": 0.6316778208289304, "grad_norm": 2.839601516723633, "learning_rate": 7.760524659325981e-05, "loss": 0.0513, "step": 10760 }, { "epoch": 0.6322648820007045, "grad_norm": 0.5804122686386108, "learning_rate": 7.756667820557644e-05, "loss": 0.0489, "step": 10770 }, { "epoch": 0.6328519431724786, "grad_norm": 0.8815114498138428, "learning_rate": 7.752808623787152e-05, "loss": 0.0525, "step": 10780 }, { "epoch": 0.6334390043442527, "grad_norm": 1.7328076362609863, "learning_rate": 7.748947072315592e-05, "loss": 0.0329, "step": 10790 }, { "epoch": 0.6340260655160268, "grad_norm": 1.2176601886749268, "learning_rate": 7.745083169446064e-05, "loss": 0.0308, "step": 10800 }, { "epoch": 0.6346131266878009, "grad_norm": 4.364448547363281, "learning_rate": 7.741216918483674e-05, "loss": 0.0378, "step": 10810 }, { "epoch": 0.635200187859575, "grad_norm": 3.1577255725860596, "learning_rate": 7.737348322735545e-05, "loss": 0.0526, "step": 10820 }, { "epoch": 0.6357872490313491, "grad_norm": 1.5735069513320923, "learning_rate": 7.7334773855108e-05, "loss": 0.0534, "step": 10830 }, { "epoch": 0.6363743102031232, "grad_norm": 1.8642213344573975, "learning_rate": 7.729604110120564e-05, "loss": 0.0411, "step": 10840 }, { "epoch": 0.6369613713748973, "grad_norm": 2.4640445709228516, "learning_rate": 7.725728499877967e-05, "loss": 0.046, "step": 10850 }, { "epoch": 0.6375484325466714, "grad_norm": 2.915814161300659, "learning_rate": 7.721850558098136e-05, "loss": 0.0391, "step": 10860 }, { "epoch": 0.6381354937184455, "grad_norm": 3.9695358276367188, "learning_rate": 7.717970288098184e-05, "loss": 0.0569, "step": 10870 }, { "epoch": 0.6387225548902196, "grad_norm": 1.0098638534545898, "learning_rate": 7.714087693197227e-05, "loss": 0.0359, "step": 10880 }, { "epoch": 0.6393096160619937, "grad_norm": 2.7655649185180664, "learning_rate": 7.710202776716362e-05, "loss": 0.0463, "step": 10890 }, { "epoch": 0.6398966772337678, "grad_norm": 2.5925843715667725, "learning_rate": 7.706315541978673e-05, "loss": 0.0474, "step": 10900 }, { "epoch": 0.6404837384055418, "grad_norm": 1.8666685819625854, "learning_rate": 7.702425992309229e-05, "loss": 0.0515, "step": 10910 }, { "epoch": 0.6410707995773159, "grad_norm": 0.22611987590789795, "learning_rate": 7.698534131035077e-05, "loss": 0.0389, "step": 10920 }, { "epoch": 0.64165786074909, "grad_norm": 1.4322317838668823, "learning_rate": 7.694639961485246e-05, "loss": 0.0311, "step": 10930 }, { "epoch": 0.6422449219208641, "grad_norm": 1.4518626928329468, "learning_rate": 7.69074348699073e-05, "loss": 0.0391, "step": 10940 }, { "epoch": 0.6428319830926382, "grad_norm": 2.698389768600464, "learning_rate": 7.686844710884506e-05, "loss": 0.0352, "step": 10950 }, { "epoch": 0.6434190442644123, "grad_norm": 1.6068532466888428, "learning_rate": 7.682943636501512e-05, "loss": 0.0476, "step": 10960 }, { "epoch": 0.6440061054361864, "grad_norm": 1.8866565227508545, "learning_rate": 7.679040267178653e-05, "loss": 0.0215, "step": 10970 }, { "epoch": 0.6445931666079605, "grad_norm": 2.0535993576049805, "learning_rate": 7.675134606254799e-05, "loss": 0.0689, "step": 10980 }, { "epoch": 0.6451802277797346, "grad_norm": 0.8703666925430298, "learning_rate": 7.67122665707078e-05, "loss": 0.0336, "step": 10990 }, { "epoch": 0.6457672889515087, "grad_norm": 3.0260536670684814, "learning_rate": 7.667316422969383e-05, "loss": 0.053, "step": 11000 }, { "epoch": 0.6463543501232828, "grad_norm": 0.925990641117096, "learning_rate": 7.663403907295348e-05, "loss": 0.0223, "step": 11010 }, { "epoch": 0.6469414112950569, "grad_norm": 2.501262664794922, "learning_rate": 7.65948911339537e-05, "loss": 0.0484, "step": 11020 }, { "epoch": 0.6475284724668311, "grad_norm": 2.3232693672180176, "learning_rate": 7.655572044618086e-05, "loss": 0.0504, "step": 11030 }, { "epoch": 0.6481155336386052, "grad_norm": 3.446190357208252, "learning_rate": 7.65165270431409e-05, "loss": 0.0275, "step": 11040 }, { "epoch": 0.6487025948103793, "grad_norm": 2.269209861755371, "learning_rate": 7.647731095835906e-05, "loss": 0.0401, "step": 11050 }, { "epoch": 0.6492896559821534, "grad_norm": 1.462114930152893, "learning_rate": 7.64380722253801e-05, "loss": 0.0355, "step": 11060 }, { "epoch": 0.6498767171539275, "grad_norm": 2.42257022857666, "learning_rate": 7.639881087776807e-05, "loss": 0.0463, "step": 11070 }, { "epoch": 0.6504637783257016, "grad_norm": 5.560107231140137, "learning_rate": 7.635952694910637e-05, "loss": 0.035, "step": 11080 }, { "epoch": 0.6510508394974757, "grad_norm": 2.6349921226501465, "learning_rate": 7.632022047299781e-05, "loss": 0.0316, "step": 11090 }, { "epoch": 0.6516379006692498, "grad_norm": 2.049649715423584, "learning_rate": 7.628089148306434e-05, "loss": 0.0467, "step": 11100 }, { "epoch": 0.6522249618410239, "grad_norm": 1.0041853189468384, "learning_rate": 7.624154001294729e-05, "loss": 0.0445, "step": 11110 }, { "epoch": 0.652812023012798, "grad_norm": 0.8886867165565491, "learning_rate": 7.620216609630715e-05, "loss": 0.0431, "step": 11120 }, { "epoch": 0.653399084184572, "grad_norm": 0.880000114440918, "learning_rate": 7.616276976682365e-05, "loss": 0.0664, "step": 11130 }, { "epoch": 0.6539861453563461, "grad_norm": 1.4624862670898438, "learning_rate": 7.612335105819565e-05, "loss": 0.0207, "step": 11140 }, { "epoch": 0.6545732065281202, "grad_norm": 1.1052141189575195, "learning_rate": 7.608391000414118e-05, "loss": 0.0447, "step": 11150 }, { "epoch": 0.6551602676998943, "grad_norm": 2.376422166824341, "learning_rate": 7.604444663839743e-05, "loss": 0.0379, "step": 11160 }, { "epoch": 0.6557473288716684, "grad_norm": 1.6930925846099854, "learning_rate": 7.600496099472057e-05, "loss": 0.0634, "step": 11170 }, { "epoch": 0.6563343900434425, "grad_norm": 0.8461235165596008, "learning_rate": 7.59654531068859e-05, "loss": 0.0459, "step": 11180 }, { "epoch": 0.6569214512152166, "grad_norm": 1.6959190368652344, "learning_rate": 7.592592300868774e-05, "loss": 0.0364, "step": 11190 }, { "epoch": 0.6575085123869907, "grad_norm": 3.24035906791687, "learning_rate": 7.588637073393935e-05, "loss": 0.0803, "step": 11200 }, { "epoch": 0.6580955735587648, "grad_norm": 1.1434074640274048, "learning_rate": 7.58467963164731e-05, "loss": 0.0604, "step": 11210 }, { "epoch": 0.6586826347305389, "grad_norm": 3.274583578109741, "learning_rate": 7.580719979014012e-05, "loss": 0.0367, "step": 11220 }, { "epoch": 0.659269695902313, "grad_norm": 2.5319130420684814, "learning_rate": 7.576758118881056e-05, "loss": 0.0811, "step": 11230 }, { "epoch": 0.6598567570740871, "grad_norm": 3.002586603164673, "learning_rate": 7.572794054637347e-05, "loss": 0.0592, "step": 11240 }, { "epoch": 0.6604438182458612, "grad_norm": 2.6163065433502197, "learning_rate": 7.568827789673665e-05, "loss": 0.046, "step": 11250 }, { "epoch": 0.6610308794176353, "grad_norm": 3.5569369792938232, "learning_rate": 7.564859327382685e-05, "loss": 0.0442, "step": 11260 }, { "epoch": 0.6616179405894094, "grad_norm": 2.348433017730713, "learning_rate": 7.560888671158953e-05, "loss": 0.0534, "step": 11270 }, { "epoch": 0.6622050017611835, "grad_norm": 2.007448196411133, "learning_rate": 7.556915824398894e-05, "loss": 0.0336, "step": 11280 }, { "epoch": 0.6627920629329576, "grad_norm": 1.9800559282302856, "learning_rate": 7.552940790500806e-05, "loss": 0.0425, "step": 11290 }, { "epoch": 0.6633791241047318, "grad_norm": 1.264578104019165, "learning_rate": 7.54896357286486e-05, "loss": 0.0261, "step": 11300 }, { "epoch": 0.6639661852765059, "grad_norm": 0.8466734290122986, "learning_rate": 7.544984174893095e-05, "loss": 0.04, "step": 11310 }, { "epoch": 0.66455324644828, "grad_norm": 1.571954369544983, "learning_rate": 7.54100259998941e-05, "loss": 0.0585, "step": 11320 }, { "epoch": 0.665140307620054, "grad_norm": 2.239985227584839, "learning_rate": 7.537018851559576e-05, "loss": 0.0498, "step": 11330 }, { "epoch": 0.6657273687918281, "grad_norm": 1.2417446374893188, "learning_rate": 7.533032933011209e-05, "loss": 0.0339, "step": 11340 }, { "epoch": 0.6663144299636022, "grad_norm": 2.6729910373687744, "learning_rate": 7.529044847753795e-05, "loss": 0.0284, "step": 11350 }, { "epoch": 0.6669014911353763, "grad_norm": 2.4202144145965576, "learning_rate": 7.525054599198666e-05, "loss": 0.0509, "step": 11360 }, { "epoch": 0.6674885523071504, "grad_norm": 1.1169862747192383, "learning_rate": 7.521062190759005e-05, "loss": 0.0324, "step": 11370 }, { "epoch": 0.6680756134789245, "grad_norm": 1.7308237552642822, "learning_rate": 7.517067625849846e-05, "loss": 0.0264, "step": 11380 }, { "epoch": 0.6686626746506986, "grad_norm": 3.76955509185791, "learning_rate": 7.513070907888065e-05, "loss": 0.0498, "step": 11390 }, { "epoch": 0.6692497358224727, "grad_norm": 0.26449015736579895, "learning_rate": 7.509072040292376e-05, "loss": 0.0293, "step": 11400 }, { "epoch": 0.6698367969942468, "grad_norm": 1.2897920608520508, "learning_rate": 7.505071026483337e-05, "loss": 0.0454, "step": 11410 }, { "epoch": 0.6704238581660209, "grad_norm": 2.9363067150115967, "learning_rate": 7.501067869883344e-05, "loss": 0.0548, "step": 11420 }, { "epoch": 0.671010919337795, "grad_norm": 3.353597640991211, "learning_rate": 7.49706257391662e-05, "loss": 0.041, "step": 11430 }, { "epoch": 0.6715979805095691, "grad_norm": 0.747282087802887, "learning_rate": 7.49305514200922e-05, "loss": 0.0512, "step": 11440 }, { "epoch": 0.6721850416813432, "grad_norm": 5.312231063842773, "learning_rate": 7.489045577589026e-05, "loss": 0.0655, "step": 11450 }, { "epoch": 0.6727721028531173, "grad_norm": 1.659082293510437, "learning_rate": 7.485033884085746e-05, "loss": 0.042, "step": 11460 }, { "epoch": 0.6733591640248914, "grad_norm": 2.6412506103515625, "learning_rate": 7.481020064930908e-05, "loss": 0.0392, "step": 11470 }, { "epoch": 0.6739462251966655, "grad_norm": 1.4856112003326416, "learning_rate": 7.477004123557855e-05, "loss": 0.0382, "step": 11480 }, { "epoch": 0.6745332863684396, "grad_norm": 1.4802236557006836, "learning_rate": 7.472986063401751e-05, "loss": 0.0414, "step": 11490 }, { "epoch": 0.6751203475402137, "grad_norm": 2.9699227809906006, "learning_rate": 7.46896588789957e-05, "loss": 0.0329, "step": 11500 }, { "epoch": 0.6757074087119878, "grad_norm": 1.5497137308120728, "learning_rate": 7.464943600490094e-05, "loss": 0.0432, "step": 11510 }, { "epoch": 0.6762944698837619, "grad_norm": 0.049564044922590256, "learning_rate": 7.46091920461391e-05, "loss": 0.0538, "step": 11520 }, { "epoch": 0.676881531055536, "grad_norm": 0.6027920246124268, "learning_rate": 7.456892703713415e-05, "loss": 0.0335, "step": 11530 }, { "epoch": 0.67746859222731, "grad_norm": 2.308122158050537, "learning_rate": 7.452864101232798e-05, "loss": 0.0637, "step": 11540 }, { "epoch": 0.6780556533990841, "grad_norm": 1.7618777751922607, "learning_rate": 7.448833400618055e-05, "loss": 0.0603, "step": 11550 }, { "epoch": 0.6786427145708582, "grad_norm": 0.8958557844161987, "learning_rate": 7.44480060531697e-05, "loss": 0.0281, "step": 11560 }, { "epoch": 0.6792297757426324, "grad_norm": 1.6657615900039673, "learning_rate": 7.440765718779124e-05, "loss": 0.0419, "step": 11570 }, { "epoch": 0.6798168369144065, "grad_norm": 0.36698830127716064, "learning_rate": 7.436728744455877e-05, "loss": 0.0448, "step": 11580 }, { "epoch": 0.6804038980861806, "grad_norm": 1.7312005758285522, "learning_rate": 7.432689685800386e-05, "loss": 0.0454, "step": 11590 }, { "epoch": 0.6809909592579547, "grad_norm": 1.5802057981491089, "learning_rate": 7.428648546267586e-05, "loss": 0.0315, "step": 11600 }, { "epoch": 0.6815780204297288, "grad_norm": 0.3204239308834076, "learning_rate": 7.42460532931419e-05, "loss": 0.0267, "step": 11610 }, { "epoch": 0.6821650816015029, "grad_norm": 1.8985480070114136, "learning_rate": 7.420560038398694e-05, "loss": 0.0274, "step": 11620 }, { "epoch": 0.682752142773277, "grad_norm": 2.6278510093688965, "learning_rate": 7.416512676981359e-05, "loss": 0.0369, "step": 11630 }, { "epoch": 0.6833392039450511, "grad_norm": 4.143795490264893, "learning_rate": 7.412463248524229e-05, "loss": 0.0383, "step": 11640 }, { "epoch": 0.6839262651168252, "grad_norm": 1.462797999382019, "learning_rate": 7.408411756491104e-05, "loss": 0.0269, "step": 11650 }, { "epoch": 0.6845133262885993, "grad_norm": 3.339895725250244, "learning_rate": 7.404358204347557e-05, "loss": 0.0673, "step": 11660 }, { "epoch": 0.6851003874603734, "grad_norm": 3.2953529357910156, "learning_rate": 7.400302595560919e-05, "loss": 0.0327, "step": 11670 }, { "epoch": 0.6856874486321475, "grad_norm": 2.8305249214172363, "learning_rate": 7.396244933600285e-05, "loss": 0.0425, "step": 11680 }, { "epoch": 0.6862745098039216, "grad_norm": 2.100759983062744, "learning_rate": 7.3921852219365e-05, "loss": 0.0421, "step": 11690 }, { "epoch": 0.6868615709756957, "grad_norm": 0.9254026412963867, "learning_rate": 7.388123464042167e-05, "loss": 0.0458, "step": 11700 }, { "epoch": 0.6874486321474698, "grad_norm": 2.050107955932617, "learning_rate": 7.38405966339164e-05, "loss": 0.036, "step": 11710 }, { "epoch": 0.6880356933192439, "grad_norm": 3.631403923034668, "learning_rate": 7.379993823461014e-05, "loss": 0.052, "step": 11720 }, { "epoch": 0.688622754491018, "grad_norm": 1.1665562391281128, "learning_rate": 7.375925947728135e-05, "loss": 0.043, "step": 11730 }, { "epoch": 0.689209815662792, "grad_norm": 0.6579986214637756, "learning_rate": 7.371856039672586e-05, "loss": 0.0312, "step": 11740 }, { "epoch": 0.6897968768345661, "grad_norm": 2.9885151386260986, "learning_rate": 7.367784102775694e-05, "loss": 0.0826, "step": 11750 }, { "epoch": 0.6903839380063402, "grad_norm": 1.8481273651123047, "learning_rate": 7.363710140520514e-05, "loss": 0.024, "step": 11760 }, { "epoch": 0.6909709991781143, "grad_norm": 2.1871728897094727, "learning_rate": 7.35963415639184e-05, "loss": 0.0322, "step": 11770 }, { "epoch": 0.6915580603498884, "grad_norm": 3.232172727584839, "learning_rate": 7.35555615387619e-05, "loss": 0.0445, "step": 11780 }, { "epoch": 0.6921451215216625, "grad_norm": 0.968634843826294, "learning_rate": 7.351476136461814e-05, "loss": 0.0682, "step": 11790 }, { "epoch": 0.6927321826934366, "grad_norm": 0.8169238567352295, "learning_rate": 7.34739410763868e-05, "loss": 0.0445, "step": 11800 }, { "epoch": 0.6933192438652107, "grad_norm": 1.6446173191070557, "learning_rate": 7.34331007089848e-05, "loss": 0.0387, "step": 11810 }, { "epoch": 0.6939063050369848, "grad_norm": 1.1081836223602295, "learning_rate": 7.339224029734623e-05, "loss": 0.0327, "step": 11820 }, { "epoch": 0.6944933662087589, "grad_norm": 2.7306418418884277, "learning_rate": 7.335135987642233e-05, "loss": 0.029, "step": 11830 }, { "epoch": 0.6950804273805331, "grad_norm": 1.2643860578536987, "learning_rate": 7.331045948118144e-05, "loss": 0.0325, "step": 11840 }, { "epoch": 0.6956674885523072, "grad_norm": 3.4605374336242676, "learning_rate": 7.3269539146609e-05, "loss": 0.0443, "step": 11850 }, { "epoch": 0.6962545497240813, "grad_norm": 2.64870548248291, "learning_rate": 7.32285989077075e-05, "loss": 0.0234, "step": 11860 }, { "epoch": 0.6968416108958554, "grad_norm": 1.5398218631744385, "learning_rate": 7.318763879949644e-05, "loss": 0.025, "step": 11870 }, { "epoch": 0.6974286720676295, "grad_norm": 3.69907546043396, "learning_rate": 7.314665885701234e-05, "loss": 0.0346, "step": 11880 }, { "epoch": 0.6980157332394036, "grad_norm": 1.5674400329589844, "learning_rate": 7.310565911530869e-05, "loss": 0.04, "step": 11890 }, { "epoch": 0.6986027944111777, "grad_norm": 4.008990287780762, "learning_rate": 7.30646396094559e-05, "loss": 0.0437, "step": 11900 }, { "epoch": 0.6991898555829518, "grad_norm": 1.03583824634552, "learning_rate": 7.302360037454128e-05, "loss": 0.0426, "step": 11910 }, { "epoch": 0.6997769167547259, "grad_norm": 1.5053350925445557, "learning_rate": 7.298254144566901e-05, "loss": 0.0393, "step": 11920 }, { "epoch": 0.7003639779265, "grad_norm": 2.7704758644104004, "learning_rate": 7.294146285796015e-05, "loss": 0.0271, "step": 11930 }, { "epoch": 0.700951039098274, "grad_norm": 1.0304514169692993, "learning_rate": 7.290036464655257e-05, "loss": 0.0513, "step": 11940 }, { "epoch": 0.7015381002700481, "grad_norm": 2.5655267238616943, "learning_rate": 7.285924684660089e-05, "loss": 0.032, "step": 11950 }, { "epoch": 0.7021251614418222, "grad_norm": 1.0380624532699585, "learning_rate": 7.281810949327651e-05, "loss": 0.0326, "step": 11960 }, { "epoch": 0.7027122226135963, "grad_norm": 3.0011343955993652, "learning_rate": 7.277695262176756e-05, "loss": 0.0214, "step": 11970 }, { "epoch": 0.7032992837853704, "grad_norm": 0.4370494782924652, "learning_rate": 7.273577626727884e-05, "loss": 0.0373, "step": 11980 }, { "epoch": 0.7038863449571445, "grad_norm": 3.3337936401367188, "learning_rate": 7.269458046503187e-05, "loss": 0.0315, "step": 11990 }, { "epoch": 0.7044734061289186, "grad_norm": 3.230440378189087, "learning_rate": 7.265336525026476e-05, "loss": 0.03, "step": 12000 }, { "epoch": 0.7044734061289186, "eval_loss": 0.44979673624038696, "eval_runtime": 269.5683, "eval_samples_per_second": 3.506, "eval_steps_per_second": 3.506, "step": 12000 }, { "epoch": 0.7050604673006927, "grad_norm": 1.5325340032577515, "learning_rate": 7.26121306582322e-05, "loss": 0.0634, "step": 12010 }, { "epoch": 0.7056475284724668, "grad_norm": 0.4213601350784302, "learning_rate": 7.257087672420553e-05, "loss": 0.0311, "step": 12020 }, { "epoch": 0.7062345896442409, "grad_norm": 1.2271337509155273, "learning_rate": 7.252960348347258e-05, "loss": 0.0705, "step": 12030 }, { "epoch": 0.706821650816015, "grad_norm": 1.3212566375732422, "learning_rate": 7.24883109713377e-05, "loss": 0.0293, "step": 12040 }, { "epoch": 0.7074087119877891, "grad_norm": 1.8340243101119995, "learning_rate": 7.244699922312176e-05, "loss": 0.0405, "step": 12050 }, { "epoch": 0.7079957731595632, "grad_norm": 0.9552738666534424, "learning_rate": 7.240566827416204e-05, "loss": 0.0315, "step": 12060 }, { "epoch": 0.7085828343313373, "grad_norm": 2.8367209434509277, "learning_rate": 7.236431815981223e-05, "loss": 0.061, "step": 12070 }, { "epoch": 0.7091698955031114, "grad_norm": 4.737795352935791, "learning_rate": 7.23229489154425e-05, "loss": 0.0753, "step": 12080 }, { "epoch": 0.7097569566748855, "grad_norm": 2.4975411891937256, "learning_rate": 7.22815605764393e-05, "loss": 0.0332, "step": 12090 }, { "epoch": 0.7103440178466596, "grad_norm": 1.2418122291564941, "learning_rate": 7.224015317820544e-05, "loss": 0.0576, "step": 12100 }, { "epoch": 0.7109310790184338, "grad_norm": 1.065011978149414, "learning_rate": 7.219872675616006e-05, "loss": 0.0344, "step": 12110 }, { "epoch": 0.7115181401902079, "grad_norm": 4.0738067626953125, "learning_rate": 7.215728134573852e-05, "loss": 0.0333, "step": 12120 }, { "epoch": 0.712105201361982, "grad_norm": 1.6276997327804565, "learning_rate": 7.211581698239245e-05, "loss": 0.0349, "step": 12130 }, { "epoch": 0.7126922625337561, "grad_norm": 4.080755233764648, "learning_rate": 7.207433370158972e-05, "loss": 0.0611, "step": 12140 }, { "epoch": 0.7132793237055302, "grad_norm": 1.5337743759155273, "learning_rate": 7.203283153881432e-05, "loss": 0.0603, "step": 12150 }, { "epoch": 0.7138663848773042, "grad_norm": 5.253373146057129, "learning_rate": 7.199131052956644e-05, "loss": 0.055, "step": 12160 }, { "epoch": 0.7144534460490783, "grad_norm": 1.5085833072662354, "learning_rate": 7.194977070936239e-05, "loss": 0.0312, "step": 12170 }, { "epoch": 0.7150405072208524, "grad_norm": 0.32057294249534607, "learning_rate": 7.190821211373453e-05, "loss": 0.0339, "step": 12180 }, { "epoch": 0.7156275683926265, "grad_norm": 1.0474879741668701, "learning_rate": 7.18666347782313e-05, "loss": 0.0597, "step": 12190 }, { "epoch": 0.7162146295644006, "grad_norm": 2.558286666870117, "learning_rate": 7.182503873841722e-05, "loss": 0.0569, "step": 12200 }, { "epoch": 0.7168016907361747, "grad_norm": 2.7508225440979004, "learning_rate": 7.178342402987272e-05, "loss": 0.0298, "step": 12210 }, { "epoch": 0.7173887519079488, "grad_norm": 0.7895553112030029, "learning_rate": 7.174179068819428e-05, "loss": 0.0598, "step": 12220 }, { "epoch": 0.7179758130797229, "grad_norm": 3.0784595012664795, "learning_rate": 7.170013874899426e-05, "loss": 0.05, "step": 12230 }, { "epoch": 0.718562874251497, "grad_norm": 0.4165332317352295, "learning_rate": 7.165846824790095e-05, "loss": 0.0266, "step": 12240 }, { "epoch": 0.7191499354232711, "grad_norm": 3.0013482570648193, "learning_rate": 7.161677922055853e-05, "loss": 0.0255, "step": 12250 }, { "epoch": 0.7197369965950452, "grad_norm": 1.8503293991088867, "learning_rate": 7.157507170262701e-05, "loss": 0.0402, "step": 12260 }, { "epoch": 0.7203240577668193, "grad_norm": 3.0309407711029053, "learning_rate": 7.153334572978221e-05, "loss": 0.0534, "step": 12270 }, { "epoch": 0.7209111189385934, "grad_norm": 2.4800071716308594, "learning_rate": 7.149160133771577e-05, "loss": 0.0406, "step": 12280 }, { "epoch": 0.7214981801103675, "grad_norm": 0.8500409126281738, "learning_rate": 7.144983856213507e-05, "loss": 0.0289, "step": 12290 }, { "epoch": 0.7220852412821416, "grad_norm": 1.53009831905365, "learning_rate": 7.140805743876317e-05, "loss": 0.0235, "step": 12300 }, { "epoch": 0.7226723024539157, "grad_norm": 2.7796554565429688, "learning_rate": 7.136625800333887e-05, "loss": 0.0256, "step": 12310 }, { "epoch": 0.7232593636256898, "grad_norm": 1.4432597160339355, "learning_rate": 7.132444029161667e-05, "loss": 0.0232, "step": 12320 }, { "epoch": 0.7238464247974639, "grad_norm": 1.8305975198745728, "learning_rate": 7.12826043393666e-05, "loss": 0.062, "step": 12330 }, { "epoch": 0.724433485969238, "grad_norm": 1.6505351066589355, "learning_rate": 7.12407501823744e-05, "loss": 0.0345, "step": 12340 }, { "epoch": 0.725020547141012, "grad_norm": 4.550604820251465, "learning_rate": 7.11988778564413e-05, "loss": 0.0348, "step": 12350 }, { "epoch": 0.7256076083127861, "grad_norm": 0.8290696740150452, "learning_rate": 7.115698739738412e-05, "loss": 0.0171, "step": 12360 }, { "epoch": 0.7261946694845602, "grad_norm": 2.516306161880493, "learning_rate": 7.111507884103518e-05, "loss": 0.0363, "step": 12370 }, { "epoch": 0.7267817306563343, "grad_norm": 1.364721655845642, "learning_rate": 7.107315222324227e-05, "loss": 0.0453, "step": 12380 }, { "epoch": 0.7273687918281085, "grad_norm": 1.1648774147033691, "learning_rate": 7.103120757986864e-05, "loss": 0.0161, "step": 12390 }, { "epoch": 0.7279558529998826, "grad_norm": 2.6282575130462646, "learning_rate": 7.098924494679295e-05, "loss": 0.0609, "step": 12400 }, { "epoch": 0.7285429141716567, "grad_norm": 1.9648375511169434, "learning_rate": 7.094726435990926e-05, "loss": 0.0375, "step": 12410 }, { "epoch": 0.7291299753434308, "grad_norm": 1.3704135417938232, "learning_rate": 7.090526585512696e-05, "loss": 0.0608, "step": 12420 }, { "epoch": 0.7297170365152049, "grad_norm": 0.3298032581806183, "learning_rate": 7.086324946837081e-05, "loss": 0.0334, "step": 12430 }, { "epoch": 0.730304097686979, "grad_norm": 0.2856060564517975, "learning_rate": 7.082121523558083e-05, "loss": 0.0272, "step": 12440 }, { "epoch": 0.7308911588587531, "grad_norm": 3.429799795150757, "learning_rate": 7.077916319271232e-05, "loss": 0.0704, "step": 12450 }, { "epoch": 0.7314782200305272, "grad_norm": 1.299608826637268, "learning_rate": 7.073709337573581e-05, "loss": 0.0311, "step": 12460 }, { "epoch": 0.7320652812023013, "grad_norm": 1.4074573516845703, "learning_rate": 7.069500582063702e-05, "loss": 0.0386, "step": 12470 }, { "epoch": 0.7326523423740754, "grad_norm": 2.08955979347229, "learning_rate": 7.06529005634169e-05, "loss": 0.0289, "step": 12480 }, { "epoch": 0.7332394035458495, "grad_norm": 3.107790946960449, "learning_rate": 7.061077764009147e-05, "loss": 0.0339, "step": 12490 }, { "epoch": 0.7338264647176236, "grad_norm": 1.7810473442077637, "learning_rate": 7.05686370866919e-05, "loss": 0.0369, "step": 12500 }, { "epoch": 0.7344135258893977, "grad_norm": 2.4785356521606445, "learning_rate": 7.052647893926442e-05, "loss": 0.0699, "step": 12510 }, { "epoch": 0.7350005870611718, "grad_norm": 5.314114570617676, "learning_rate": 7.048430323387034e-05, "loss": 0.0359, "step": 12520 }, { "epoch": 0.7355876482329459, "grad_norm": 1.8828675746917725, "learning_rate": 7.044211000658595e-05, "loss": 0.0446, "step": 12530 }, { "epoch": 0.73617470940472, "grad_norm": 1.7500027418136597, "learning_rate": 7.039989929350257e-05, "loss": 0.0357, "step": 12540 }, { "epoch": 0.7367617705764941, "grad_norm": 0.32012036442756653, "learning_rate": 7.035767113072645e-05, "loss": 0.022, "step": 12550 }, { "epoch": 0.7373488317482682, "grad_norm": 3.6539933681488037, "learning_rate": 7.031542555437876e-05, "loss": 0.0401, "step": 12560 }, { "epoch": 0.7379358929200422, "grad_norm": 2.1202316284179688, "learning_rate": 7.027316260059558e-05, "loss": 0.0385, "step": 12570 }, { "epoch": 0.7385229540918163, "grad_norm": 1.3387161493301392, "learning_rate": 7.023088230552787e-05, "loss": 0.0468, "step": 12580 }, { "epoch": 0.7391100152635904, "grad_norm": 1.752645492553711, "learning_rate": 7.018858470534138e-05, "loss": 0.0502, "step": 12590 }, { "epoch": 0.7396970764353645, "grad_norm": 0.23137418925762177, "learning_rate": 7.014626983621669e-05, "loss": 0.0514, "step": 12600 }, { "epoch": 0.7402841376071386, "grad_norm": 2.1703407764434814, "learning_rate": 7.010393773434917e-05, "loss": 0.0334, "step": 12610 }, { "epoch": 0.7408711987789127, "grad_norm": 3.549792528152466, "learning_rate": 7.006158843594887e-05, "loss": 0.0662, "step": 12620 }, { "epoch": 0.7414582599506868, "grad_norm": 2.0263309478759766, "learning_rate": 7.001922197724063e-05, "loss": 0.0223, "step": 12630 }, { "epoch": 0.7420453211224609, "grad_norm": 2.800609588623047, "learning_rate": 6.997683839446392e-05, "loss": 0.0482, "step": 12640 }, { "epoch": 0.742632382294235, "grad_norm": 2.8997108936309814, "learning_rate": 6.993443772387284e-05, "loss": 0.0386, "step": 12650 }, { "epoch": 0.7432194434660092, "grad_norm": 6.5187153816223145, "learning_rate": 6.989202000173614e-05, "loss": 0.0499, "step": 12660 }, { "epoch": 0.7438065046377833, "grad_norm": 2.201885223388672, "learning_rate": 6.984958526433716e-05, "loss": 0.0508, "step": 12670 }, { "epoch": 0.7443935658095574, "grad_norm": 1.613524317741394, "learning_rate": 6.980713354797376e-05, "loss": 0.0248, "step": 12680 }, { "epoch": 0.7449806269813315, "grad_norm": 0.9656441807746887, "learning_rate": 6.97646648889584e-05, "loss": 0.0284, "step": 12690 }, { "epoch": 0.7455676881531056, "grad_norm": 0.754646897315979, "learning_rate": 6.972217932361792e-05, "loss": 0.0527, "step": 12700 }, { "epoch": 0.7461547493248797, "grad_norm": 3.3350586891174316, "learning_rate": 6.967967688829369e-05, "loss": 0.0664, "step": 12710 }, { "epoch": 0.7467418104966538, "grad_norm": 1.6783524751663208, "learning_rate": 6.963715761934151e-05, "loss": 0.0622, "step": 12720 }, { "epoch": 0.7473288716684279, "grad_norm": 1.228952407836914, "learning_rate": 6.959462155313155e-05, "loss": 0.0325, "step": 12730 }, { "epoch": 0.747915932840202, "grad_norm": 0.7153427004814148, "learning_rate": 6.955206872604839e-05, "loss": 0.0251, "step": 12740 }, { "epoch": 0.7485029940119761, "grad_norm": 0.10894730687141418, "learning_rate": 6.950949917449093e-05, "loss": 0.04, "step": 12750 }, { "epoch": 0.7490900551837502, "grad_norm": 1.343906044960022, "learning_rate": 6.946691293487233e-05, "loss": 0.0341, "step": 12760 }, { "epoch": 0.7496771163555243, "grad_norm": 1.4371585845947266, "learning_rate": 6.94243100436201e-05, "loss": 0.0346, "step": 12770 }, { "epoch": 0.7502641775272983, "grad_norm": 1.6158758401870728, "learning_rate": 6.938169053717593e-05, "loss": 0.0272, "step": 12780 }, { "epoch": 0.7508512386990724, "grad_norm": 2.5721545219421387, "learning_rate": 6.933905445199578e-05, "loss": 0.055, "step": 12790 }, { "epoch": 0.7514382998708465, "grad_norm": 1.3039095401763916, "learning_rate": 6.929640182454973e-05, "loss": 0.0466, "step": 12800 }, { "epoch": 0.7520253610426206, "grad_norm": 1.7179279327392578, "learning_rate": 6.925373269132207e-05, "loss": 0.0486, "step": 12810 }, { "epoch": 0.7526124222143947, "grad_norm": 2.3523173332214355, "learning_rate": 6.921104708881115e-05, "loss": 0.023, "step": 12820 }, { "epoch": 0.7531994833861688, "grad_norm": 1.761407732963562, "learning_rate": 6.916834505352945e-05, "loss": 0.055, "step": 12830 }, { "epoch": 0.7537865445579429, "grad_norm": 3.2720420360565186, "learning_rate": 6.91256266220035e-05, "loss": 0.0669, "step": 12840 }, { "epoch": 0.754373605729717, "grad_norm": 1.0557044744491577, "learning_rate": 6.908289183077385e-05, "loss": 0.0441, "step": 12850 }, { "epoch": 0.7549606669014911, "grad_norm": 2.7804226875305176, "learning_rate": 6.904014071639503e-05, "loss": 0.033, "step": 12860 }, { "epoch": 0.7555477280732652, "grad_norm": 1.608482837677002, "learning_rate": 6.899737331543555e-05, "loss": 0.0374, "step": 12870 }, { "epoch": 0.7561347892450393, "grad_norm": 3.039224624633789, "learning_rate": 6.895458966447784e-05, "loss": 0.0286, "step": 12880 }, { "epoch": 0.7567218504168134, "grad_norm": 0.5459122657775879, "learning_rate": 6.891178980011826e-05, "loss": 0.0312, "step": 12890 }, { "epoch": 0.7573089115885875, "grad_norm": 1.7158164978027344, "learning_rate": 6.886897375896697e-05, "loss": 0.045, "step": 12900 }, { "epoch": 0.7578959727603616, "grad_norm": 1.8184658288955688, "learning_rate": 6.882614157764804e-05, "loss": 0.0411, "step": 12910 }, { "epoch": 0.7584830339321357, "grad_norm": 0.8906010985374451, "learning_rate": 6.878329329279933e-05, "loss": 0.0486, "step": 12920 }, { "epoch": 0.7590700951039099, "grad_norm": 1.9838579893112183, "learning_rate": 6.874042894107245e-05, "loss": 0.0673, "step": 12930 }, { "epoch": 0.759657156275684, "grad_norm": 0.6923233270645142, "learning_rate": 6.869754855913273e-05, "loss": 0.0211, "step": 12940 }, { "epoch": 0.7602442174474581, "grad_norm": 2.112013578414917, "learning_rate": 6.86546521836593e-05, "loss": 0.0315, "step": 12950 }, { "epoch": 0.7608312786192322, "grad_norm": 1.9117103815078735, "learning_rate": 6.86117398513449e-05, "loss": 0.0326, "step": 12960 }, { "epoch": 0.7614183397910063, "grad_norm": 2.3542299270629883, "learning_rate": 6.856881159889593e-05, "loss": 0.0689, "step": 12970 }, { "epoch": 0.7620054009627804, "grad_norm": 3.010636806488037, "learning_rate": 6.852586746303243e-05, "loss": 0.0496, "step": 12980 }, { "epoch": 0.7625924621345544, "grad_norm": 5.607231140136719, "learning_rate": 6.848290748048801e-05, "loss": 0.0321, "step": 12990 }, { "epoch": 0.7631795233063285, "grad_norm": 0.8411577343940735, "learning_rate": 6.843993168800982e-05, "loss": 0.0493, "step": 13000 }, { "epoch": 0.7637665844781026, "grad_norm": 3.7046091556549072, "learning_rate": 6.839694012235856e-05, "loss": 0.0399, "step": 13010 }, { "epoch": 0.7643536456498767, "grad_norm": 1.6446720361709595, "learning_rate": 6.835393282030841e-05, "loss": 0.0472, "step": 13020 }, { "epoch": 0.7649407068216508, "grad_norm": 1.5894074440002441, "learning_rate": 6.8310909818647e-05, "loss": 0.0415, "step": 13030 }, { "epoch": 0.7655277679934249, "grad_norm": 1.3392444849014282, "learning_rate": 6.826787115417544e-05, "loss": 0.0532, "step": 13040 }, { "epoch": 0.766114829165199, "grad_norm": 2.6800129413604736, "learning_rate": 6.822481686370815e-05, "loss": 0.0462, "step": 13050 }, { "epoch": 0.7667018903369731, "grad_norm": 1.4358599185943604, "learning_rate": 6.818174698407302e-05, "loss": 0.05, "step": 13060 }, { "epoch": 0.7672889515087472, "grad_norm": 1.4766582250595093, "learning_rate": 6.813866155211118e-05, "loss": 0.0421, "step": 13070 }, { "epoch": 0.7678760126805213, "grad_norm": 1.8147733211517334, "learning_rate": 6.80955606046771e-05, "loss": 0.0508, "step": 13080 }, { "epoch": 0.7684630738522954, "grad_norm": 1.277007818222046, "learning_rate": 6.805244417863854e-05, "loss": 0.0344, "step": 13090 }, { "epoch": 0.7690501350240695, "grad_norm": 1.3057037591934204, "learning_rate": 6.80093123108765e-05, "loss": 0.0383, "step": 13100 }, { "epoch": 0.7696371961958436, "grad_norm": 1.6316132545471191, "learning_rate": 6.796616503828515e-05, "loss": 0.0333, "step": 13110 }, { "epoch": 0.7702242573676177, "grad_norm": 3.621436595916748, "learning_rate": 6.79230023977719e-05, "loss": 0.0521, "step": 13120 }, { "epoch": 0.7708113185393918, "grad_norm": 0.9022402167320251, "learning_rate": 6.787982442625721e-05, "loss": 0.0237, "step": 13130 }, { "epoch": 0.7713983797111659, "grad_norm": 1.7554409503936768, "learning_rate": 6.783663116067473e-05, "loss": 0.0364, "step": 13140 }, { "epoch": 0.77198544088294, "grad_norm": 4.405052661895752, "learning_rate": 6.779342263797119e-05, "loss": 0.0514, "step": 13150 }, { "epoch": 0.7725725020547141, "grad_norm": 3.3128652572631836, "learning_rate": 6.775019889510635e-05, "loss": 0.0453, "step": 13160 }, { "epoch": 0.7731595632264882, "grad_norm": 0.12712036073207855, "learning_rate": 6.770695996905297e-05, "loss": 0.0302, "step": 13170 }, { "epoch": 0.7737466243982623, "grad_norm": 2.2140140533447266, "learning_rate": 6.766370589679685e-05, "loss": 0.0427, "step": 13180 }, { "epoch": 0.7743336855700363, "grad_norm": 0.7173200845718384, "learning_rate": 6.762043671533668e-05, "loss": 0.0274, "step": 13190 }, { "epoch": 0.7749207467418106, "grad_norm": 5.099856853485107, "learning_rate": 6.757715246168414e-05, "loss": 0.0405, "step": 13200 }, { "epoch": 0.7755078079135846, "grad_norm": 0.9245136380195618, "learning_rate": 6.753385317286377e-05, "loss": 0.0514, "step": 13210 }, { "epoch": 0.7760948690853587, "grad_norm": 2.752617597579956, "learning_rate": 6.749053888591295e-05, "loss": 0.0358, "step": 13220 }, { "epoch": 0.7766819302571328, "grad_norm": 1.602553367614746, "learning_rate": 6.744720963788193e-05, "loss": 0.0397, "step": 13230 }, { "epoch": 0.7772689914289069, "grad_norm": 2.3663432598114014, "learning_rate": 6.740386546583373e-05, "loss": 0.0375, "step": 13240 }, { "epoch": 0.777856052600681, "grad_norm": 0.5158788561820984, "learning_rate": 6.736050640684416e-05, "loss": 0.0359, "step": 13250 }, { "epoch": 0.7784431137724551, "grad_norm": 1.9949820041656494, "learning_rate": 6.731713249800173e-05, "loss": 0.0454, "step": 13260 }, { "epoch": 0.7790301749442292, "grad_norm": 2.5277843475341797, "learning_rate": 6.727374377640768e-05, "loss": 0.0381, "step": 13270 }, { "epoch": 0.7796172361160033, "grad_norm": 1.7521203756332397, "learning_rate": 6.723034027917592e-05, "loss": 0.0267, "step": 13280 }, { "epoch": 0.7802042972877774, "grad_norm": 0.489425390958786, "learning_rate": 6.718692204343298e-05, "loss": 0.0451, "step": 13290 }, { "epoch": 0.7807913584595515, "grad_norm": 1.8275169134140015, "learning_rate": 6.7143489106318e-05, "loss": 0.0503, "step": 13300 }, { "epoch": 0.7813784196313256, "grad_norm": 4.13484001159668, "learning_rate": 6.710004150498271e-05, "loss": 0.0627, "step": 13310 }, { "epoch": 0.7819654808030997, "grad_norm": 2.0780868530273438, "learning_rate": 6.70565792765914e-05, "loss": 0.0244, "step": 13320 }, { "epoch": 0.7825525419748738, "grad_norm": 2.7539408206939697, "learning_rate": 6.701310245832082e-05, "loss": 0.0354, "step": 13330 }, { "epoch": 0.7831396031466479, "grad_norm": 1.69619619846344, "learning_rate": 6.696961108736024e-05, "loss": 0.0504, "step": 13340 }, { "epoch": 0.783726664318422, "grad_norm": 1.7762585878372192, "learning_rate": 6.692610520091137e-05, "loss": 0.0439, "step": 13350 }, { "epoch": 0.7843137254901961, "grad_norm": 0.22033926844596863, "learning_rate": 6.68825848361883e-05, "loss": 0.0481, "step": 13360 }, { "epoch": 0.7849007866619702, "grad_norm": 0.562747597694397, "learning_rate": 6.683905003041757e-05, "loss": 0.0161, "step": 13370 }, { "epoch": 0.7854878478337443, "grad_norm": 4.002884387969971, "learning_rate": 6.679550082083803e-05, "loss": 0.0419, "step": 13380 }, { "epoch": 0.7860749090055184, "grad_norm": 2.3743350505828857, "learning_rate": 6.675193724470087e-05, "loss": 0.0293, "step": 13390 }, { "epoch": 0.7866619701772924, "grad_norm": 1.1087560653686523, "learning_rate": 6.670835933926955e-05, "loss": 0.0451, "step": 13400 }, { "epoch": 0.7872490313490665, "grad_norm": 1.0285279750823975, "learning_rate": 6.666476714181979e-05, "loss": 0.0304, "step": 13410 }, { "epoch": 0.7878360925208406, "grad_norm": 0.6177504658699036, "learning_rate": 6.662116068963954e-05, "loss": 0.0242, "step": 13420 }, { "epoch": 0.7884231536926147, "grad_norm": 2.7186331748962402, "learning_rate": 6.657754002002898e-05, "loss": 0.0391, "step": 13430 }, { "epoch": 0.7890102148643888, "grad_norm": 1.8687515258789062, "learning_rate": 6.653390517030038e-05, "loss": 0.0358, "step": 13440 }, { "epoch": 0.7895972760361629, "grad_norm": 0.6915972232818604, "learning_rate": 6.649025617777818e-05, "loss": 0.0238, "step": 13450 }, { "epoch": 0.790184337207937, "grad_norm": 1.9850742816925049, "learning_rate": 6.64465930797989e-05, "loss": 0.0592, "step": 13460 }, { "epoch": 0.7907713983797112, "grad_norm": 2.312723159790039, "learning_rate": 6.640291591371117e-05, "loss": 0.037, "step": 13470 }, { "epoch": 0.7913584595514853, "grad_norm": 1.8763644695281982, "learning_rate": 6.635922471687561e-05, "loss": 0.034, "step": 13480 }, { "epoch": 0.7919455207232594, "grad_norm": 0.05788072198629379, "learning_rate": 6.631551952666484e-05, "loss": 0.0253, "step": 13490 }, { "epoch": 0.7925325818950335, "grad_norm": 2.2860443592071533, "learning_rate": 6.627180038046347e-05, "loss": 0.0405, "step": 13500 }, { "epoch": 0.7931196430668076, "grad_norm": 1.2735824584960938, "learning_rate": 6.622806731566807e-05, "loss": 0.0343, "step": 13510 }, { "epoch": 0.7937067042385817, "grad_norm": 2.2331037521362305, "learning_rate": 6.618432036968705e-05, "loss": 0.0216, "step": 13520 }, { "epoch": 0.7942937654103558, "grad_norm": 2.2226316928863525, "learning_rate": 6.614055957994075e-05, "loss": 0.0382, "step": 13530 }, { "epoch": 0.7948808265821299, "grad_norm": 2.079427480697632, "learning_rate": 6.60967849838613e-05, "loss": 0.0363, "step": 13540 }, { "epoch": 0.795467887753904, "grad_norm": 2.138134241104126, "learning_rate": 6.60529966188927e-05, "loss": 0.028, "step": 13550 }, { "epoch": 0.7960549489256781, "grad_norm": 0.8080530166625977, "learning_rate": 6.60091945224907e-05, "loss": 0.0157, "step": 13560 }, { "epoch": 0.7966420100974522, "grad_norm": 2.0846288204193115, "learning_rate": 6.596537873212281e-05, "loss": 0.041, "step": 13570 }, { "epoch": 0.7972290712692263, "grad_norm": 2.216050148010254, "learning_rate": 6.592154928526818e-05, "loss": 0.0312, "step": 13580 }, { "epoch": 0.7978161324410004, "grad_norm": 1.5155831575393677, "learning_rate": 6.587770621941776e-05, "loss": 0.052, "step": 13590 }, { "epoch": 0.7984031936127745, "grad_norm": 2.175265312194824, "learning_rate": 6.583384957207406e-05, "loss": 0.0444, "step": 13600 }, { "epoch": 0.7989902547845485, "grad_norm": 2.381720542907715, "learning_rate": 6.578997938075125e-05, "loss": 0.0364, "step": 13610 }, { "epoch": 0.7995773159563226, "grad_norm": 1.3376045227050781, "learning_rate": 6.574609568297507e-05, "loss": 0.0365, "step": 13620 }, { "epoch": 0.8001643771280967, "grad_norm": 3.4189717769622803, "learning_rate": 6.57021985162828e-05, "loss": 0.0478, "step": 13630 }, { "epoch": 0.8007514382998708, "grad_norm": 0.809241771697998, "learning_rate": 6.565828791822327e-05, "loss": 0.0358, "step": 13640 }, { "epoch": 0.8013384994716449, "grad_norm": 1.9690725803375244, "learning_rate": 6.56143639263568e-05, "loss": 0.0489, "step": 13650 }, { "epoch": 0.801925560643419, "grad_norm": 3.3723227977752686, "learning_rate": 6.557042657825511e-05, "loss": 0.052, "step": 13660 }, { "epoch": 0.8025126218151931, "grad_norm": 2.127072811126709, "learning_rate": 6.552647591150143e-05, "loss": 0.0386, "step": 13670 }, { "epoch": 0.8030996829869672, "grad_norm": 1.5079299211502075, "learning_rate": 6.548251196369031e-05, "loss": 0.0251, "step": 13680 }, { "epoch": 0.8036867441587413, "grad_norm": 0.4091831147670746, "learning_rate": 6.54385347724277e-05, "loss": 0.021, "step": 13690 }, { "epoch": 0.8042738053305154, "grad_norm": 2.712334156036377, "learning_rate": 6.539454437533088e-05, "loss": 0.0523, "step": 13700 }, { "epoch": 0.8048608665022895, "grad_norm": 3.098822593688965, "learning_rate": 6.535054081002841e-05, "loss": 0.045, "step": 13710 }, { "epoch": 0.8054479276740636, "grad_norm": 2.6743454933166504, "learning_rate": 6.530652411416007e-05, "loss": 0.056, "step": 13720 }, { "epoch": 0.8060349888458377, "grad_norm": 0.3644692003726959, "learning_rate": 6.5262494325377e-05, "loss": 0.0292, "step": 13730 }, { "epoch": 0.8066220500176118, "grad_norm": 2.4092676639556885, "learning_rate": 6.52184514813414e-05, "loss": 0.0463, "step": 13740 }, { "epoch": 0.807209111189386, "grad_norm": 0.7954262495040894, "learning_rate": 6.517439561972671e-05, "loss": 0.0514, "step": 13750 }, { "epoch": 0.8077961723611601, "grad_norm": 1.604623794555664, "learning_rate": 6.513032677821752e-05, "loss": 0.0261, "step": 13760 }, { "epoch": 0.8083832335329342, "grad_norm": 2.0205390453338623, "learning_rate": 6.508624499450944e-05, "loss": 0.0497, "step": 13770 }, { "epoch": 0.8089702947047083, "grad_norm": 1.3266966342926025, "learning_rate": 6.504215030630925e-05, "loss": 0.0378, "step": 13780 }, { "epoch": 0.8095573558764824, "grad_norm": 0.645121157169342, "learning_rate": 6.49980427513347e-05, "loss": 0.026, "step": 13790 }, { "epoch": 0.8101444170482565, "grad_norm": 3.148008346557617, "learning_rate": 6.495392236731458e-05, "loss": 0.0506, "step": 13800 }, { "epoch": 0.8107314782200306, "grad_norm": 1.275623083114624, "learning_rate": 6.490978919198863e-05, "loss": 0.0382, "step": 13810 }, { "epoch": 0.8113185393918046, "grad_norm": 0.44733619689941406, "learning_rate": 6.486564326310754e-05, "loss": 0.0347, "step": 13820 }, { "epoch": 0.8119056005635787, "grad_norm": 1.5372415781021118, "learning_rate": 6.482148461843294e-05, "loss": 0.031, "step": 13830 }, { "epoch": 0.8124926617353528, "grad_norm": 3.134258508682251, "learning_rate": 6.477731329573729e-05, "loss": 0.0398, "step": 13840 }, { "epoch": 0.8130797229071269, "grad_norm": 3.234987735748291, "learning_rate": 6.473312933280391e-05, "loss": 0.0726, "step": 13850 }, { "epoch": 0.813666784078901, "grad_norm": 1.5991384983062744, "learning_rate": 6.468893276742695e-05, "loss": 0.0229, "step": 13860 }, { "epoch": 0.8142538452506751, "grad_norm": 1.283582329750061, "learning_rate": 6.464472363741132e-05, "loss": 0.0415, "step": 13870 }, { "epoch": 0.8148409064224492, "grad_norm": 1.0967763662338257, "learning_rate": 6.460050198057268e-05, "loss": 0.0353, "step": 13880 }, { "epoch": 0.8154279675942233, "grad_norm": 1.3352218866348267, "learning_rate": 6.45562678347374e-05, "loss": 0.0305, "step": 13890 }, { "epoch": 0.8160150287659974, "grad_norm": 0.7509410977363586, "learning_rate": 6.451202123774258e-05, "loss": 0.0478, "step": 13900 }, { "epoch": 0.8166020899377715, "grad_norm": 1.6695328950881958, "learning_rate": 6.446776222743589e-05, "loss": 0.0248, "step": 13910 }, { "epoch": 0.8171891511095456, "grad_norm": 0.5678431987762451, "learning_rate": 6.442349084167568e-05, "loss": 0.0352, "step": 13920 }, { "epoch": 0.8177762122813197, "grad_norm": 2.2780115604400635, "learning_rate": 6.437920711833086e-05, "loss": 0.0507, "step": 13930 }, { "epoch": 0.8183632734530938, "grad_norm": 1.653802752494812, "learning_rate": 6.433491109528091e-05, "loss": 0.0339, "step": 13940 }, { "epoch": 0.8189503346248679, "grad_norm": 3.931762218475342, "learning_rate": 6.429060281041581e-05, "loss": 0.0672, "step": 13950 }, { "epoch": 0.819537395796642, "grad_norm": 1.88666832447052, "learning_rate": 6.424628230163606e-05, "loss": 0.0328, "step": 13960 }, { "epoch": 0.8201244569684161, "grad_norm": 2.9051437377929688, "learning_rate": 6.420194960685255e-05, "loss": 0.0275, "step": 13970 }, { "epoch": 0.8207115181401902, "grad_norm": 2.4517440795898438, "learning_rate": 6.41576047639867e-05, "loss": 0.0327, "step": 13980 }, { "epoch": 0.8212985793119643, "grad_norm": 1.2846941947937012, "learning_rate": 6.41132478109702e-05, "loss": 0.0511, "step": 13990 }, { "epoch": 0.8218856404837384, "grad_norm": 0.5321528911590576, "learning_rate": 6.406887878574519e-05, "loss": 0.0709, "step": 14000 }, { "epoch": 0.8224727016555125, "grad_norm": 1.4567052125930786, "learning_rate": 6.402449772626412e-05, "loss": 0.0367, "step": 14010 }, { "epoch": 0.8230597628272867, "grad_norm": 0.714896023273468, "learning_rate": 6.398010467048968e-05, "loss": 0.0317, "step": 14020 }, { "epoch": 0.8236468239990608, "grad_norm": 0.9767372608184814, "learning_rate": 6.39356996563949e-05, "loss": 0.0279, "step": 14030 }, { "epoch": 0.8242338851708348, "grad_norm": 2.217620611190796, "learning_rate": 6.389128272196296e-05, "loss": 0.0564, "step": 14040 }, { "epoch": 0.8248209463426089, "grad_norm": 0.9955437779426575, "learning_rate": 6.38468539051873e-05, "loss": 0.0323, "step": 14050 }, { "epoch": 0.825408007514383, "grad_norm": 2.635800838470459, "learning_rate": 6.38024132440715e-05, "loss": 0.057, "step": 14060 }, { "epoch": 0.8259950686861571, "grad_norm": 1.7019506692886353, "learning_rate": 6.375796077662928e-05, "loss": 0.0404, "step": 14070 }, { "epoch": 0.8265821298579312, "grad_norm": 0.9067420959472656, "learning_rate": 6.371349654088442e-05, "loss": 0.031, "step": 14080 }, { "epoch": 0.8271691910297053, "grad_norm": 0.594925582408905, "learning_rate": 6.366902057487083e-05, "loss": 0.0822, "step": 14090 }, { "epoch": 0.8277562522014794, "grad_norm": 2.47068452835083, "learning_rate": 6.36245329166324e-05, "loss": 0.0256, "step": 14100 }, { "epoch": 0.8283433133732535, "grad_norm": 2.7607803344726562, "learning_rate": 6.358003360422304e-05, "loss": 0.0359, "step": 14110 }, { "epoch": 0.8289303745450276, "grad_norm": 1.8484448194503784, "learning_rate": 6.353552267570666e-05, "loss": 0.0425, "step": 14120 }, { "epoch": 0.8295174357168017, "grad_norm": 3.1313066482543945, "learning_rate": 6.349100016915703e-05, "loss": 0.0467, "step": 14130 }, { "epoch": 0.8301044968885758, "grad_norm": 2.5662596225738525, "learning_rate": 6.34464661226579e-05, "loss": 0.0448, "step": 14140 }, { "epoch": 0.8306915580603499, "grad_norm": 0.4227585792541504, "learning_rate": 6.340192057430286e-05, "loss": 0.034, "step": 14150 }, { "epoch": 0.831278619232124, "grad_norm": 0.47378337383270264, "learning_rate": 6.335736356219533e-05, "loss": 0.0234, "step": 14160 }, { "epoch": 0.8318656804038981, "grad_norm": 2.1253409385681152, "learning_rate": 6.331279512444855e-05, "loss": 0.0378, "step": 14170 }, { "epoch": 0.8324527415756722, "grad_norm": 2.14623761177063, "learning_rate": 6.326821529918553e-05, "loss": 0.0536, "step": 14180 }, { "epoch": 0.8330398027474463, "grad_norm": 1.6848366260528564, "learning_rate": 6.322362412453903e-05, "loss": 0.041, "step": 14190 }, { "epoch": 0.8336268639192204, "grad_norm": 3.853180408477783, "learning_rate": 6.31790216386515e-05, "loss": 0.0439, "step": 14200 }, { "epoch": 0.8342139250909945, "grad_norm": 1.1675387620925903, "learning_rate": 6.313440787967506e-05, "loss": 0.0475, "step": 14210 }, { "epoch": 0.8348009862627686, "grad_norm": 2.579005718231201, "learning_rate": 6.30897828857715e-05, "loss": 0.052, "step": 14220 }, { "epoch": 0.8353880474345426, "grad_norm": 0.5900012850761414, "learning_rate": 6.30451466951122e-05, "loss": 0.0197, "step": 14230 }, { "epoch": 0.8359751086063167, "grad_norm": 2.4965133666992188, "learning_rate": 6.300049934587812e-05, "loss": 0.0379, "step": 14240 }, { "epoch": 0.8365621697780908, "grad_norm": 2.155104637145996, "learning_rate": 6.295584087625979e-05, "loss": 0.0317, "step": 14250 }, { "epoch": 0.8371492309498649, "grad_norm": 0.8006505966186523, "learning_rate": 6.291117132445722e-05, "loss": 0.033, "step": 14260 }, { "epoch": 0.837736292121639, "grad_norm": 3.3270461559295654, "learning_rate": 6.286649072867988e-05, "loss": 0.047, "step": 14270 }, { "epoch": 0.8383233532934131, "grad_norm": 3.7353594303131104, "learning_rate": 6.282179912714677e-05, "loss": 0.0266, "step": 14280 }, { "epoch": 0.8389104144651873, "grad_norm": 2.0424246788024902, "learning_rate": 6.277709655808622e-05, "loss": 0.0282, "step": 14290 }, { "epoch": 0.8394974756369614, "grad_norm": 2.0882630348205566, "learning_rate": 6.273238305973596e-05, "loss": 0.0401, "step": 14300 }, { "epoch": 0.8400845368087355, "grad_norm": 1.401748776435852, "learning_rate": 6.268765867034311e-05, "loss": 0.0357, "step": 14310 }, { "epoch": 0.8406715979805096, "grad_norm": 0.7414683103561401, "learning_rate": 6.264292342816407e-05, "loss": 0.0339, "step": 14320 }, { "epoch": 0.8412586591522837, "grad_norm": 0.1633201390504837, "learning_rate": 6.25981773714645e-05, "loss": 0.04, "step": 14330 }, { "epoch": 0.8418457203240578, "grad_norm": 1.5481064319610596, "learning_rate": 6.255342053851938e-05, "loss": 0.0412, "step": 14340 }, { "epoch": 0.8424327814958319, "grad_norm": 1.6022753715515137, "learning_rate": 6.250865296761286e-05, "loss": 0.029, "step": 14350 }, { "epoch": 0.843019842667606, "grad_norm": 2.521550416946411, "learning_rate": 6.246387469703826e-05, "loss": 0.0211, "step": 14360 }, { "epoch": 0.8436069038393801, "grad_norm": 4.940478801727295, "learning_rate": 6.241908576509812e-05, "loss": 0.0318, "step": 14370 }, { "epoch": 0.8441939650111542, "grad_norm": 2.262380838394165, "learning_rate": 6.237428621010402e-05, "loss": 0.0287, "step": 14380 }, { "epoch": 0.8447810261829283, "grad_norm": 0.9917835593223572, "learning_rate": 6.232947607037666e-05, "loss": 0.0334, "step": 14390 }, { "epoch": 0.8453680873547024, "grad_norm": 2.2069954872131348, "learning_rate": 6.228465538424583e-05, "loss": 0.0256, "step": 14400 }, { "epoch": 0.8459551485264765, "grad_norm": 3.83937406539917, "learning_rate": 6.223982419005027e-05, "loss": 0.034, "step": 14410 }, { "epoch": 0.8465422096982506, "grad_norm": 3.3192458152770996, "learning_rate": 6.219498252613777e-05, "loss": 0.0314, "step": 14420 }, { "epoch": 0.8471292708700247, "grad_norm": 2.14279842376709, "learning_rate": 6.215013043086504e-05, "loss": 0.0436, "step": 14430 }, { "epoch": 0.8477163320417987, "grad_norm": 1.3427451848983765, "learning_rate": 6.210526794259772e-05, "loss": 0.0212, "step": 14440 }, { "epoch": 0.8483033932135728, "grad_norm": 6.0217790603637695, "learning_rate": 6.206039509971038e-05, "loss": 0.0338, "step": 14450 }, { "epoch": 0.8488904543853469, "grad_norm": 4.078960418701172, "learning_rate": 6.201551194058637e-05, "loss": 0.042, "step": 14460 }, { "epoch": 0.849477515557121, "grad_norm": 0.48227864503860474, "learning_rate": 6.19706185036179e-05, "loss": 0.0181, "step": 14470 }, { "epoch": 0.8500645767288951, "grad_norm": 1.5753501653671265, "learning_rate": 6.192571482720601e-05, "loss": 0.0308, "step": 14480 }, { "epoch": 0.8506516379006692, "grad_norm": 4.5797438621521, "learning_rate": 6.188080094976046e-05, "loss": 0.0435, "step": 14490 }, { "epoch": 0.8512386990724433, "grad_norm": 0.8167878985404968, "learning_rate": 6.183587690969974e-05, "loss": 0.0209, "step": 14500 }, { "epoch": 0.8518257602442174, "grad_norm": 0.5837923288345337, "learning_rate": 6.179094274545102e-05, "loss": 0.0251, "step": 14510 }, { "epoch": 0.8524128214159915, "grad_norm": 1.6540395021438599, "learning_rate": 6.174599849545015e-05, "loss": 0.0329, "step": 14520 }, { "epoch": 0.8529998825877656, "grad_norm": 3.803781747817993, "learning_rate": 6.170104419814162e-05, "loss": 0.0556, "step": 14530 }, { "epoch": 0.8535869437595397, "grad_norm": 2.439143657684326, "learning_rate": 6.165607989197847e-05, "loss": 0.0221, "step": 14540 }, { "epoch": 0.8541740049313138, "grad_norm": 2.360718011856079, "learning_rate": 6.161110561542235e-05, "loss": 0.03, "step": 14550 }, { "epoch": 0.854761066103088, "grad_norm": 4.080716133117676, "learning_rate": 6.156612140694339e-05, "loss": 0.059, "step": 14560 }, { "epoch": 0.8553481272748621, "grad_norm": 1.6676440238952637, "learning_rate": 6.152112730502027e-05, "loss": 0.0214, "step": 14570 }, { "epoch": 0.8559351884466362, "grad_norm": 1.4322593212127686, "learning_rate": 6.147612334814008e-05, "loss": 0.06, "step": 14580 }, { "epoch": 0.8565222496184103, "grad_norm": 2.004303455352783, "learning_rate": 6.143110957479839e-05, "loss": 0.0544, "step": 14590 }, { "epoch": 0.8571093107901844, "grad_norm": 1.1425613164901733, "learning_rate": 6.138608602349911e-05, "loss": 0.0443, "step": 14600 }, { "epoch": 0.8576963719619585, "grad_norm": 2.01226544380188, "learning_rate": 6.134105273275457e-05, "loss": 0.0273, "step": 14610 }, { "epoch": 0.8582834331337326, "grad_norm": 2.711271286010742, "learning_rate": 6.129600974108538e-05, "loss": 0.0527, "step": 14620 }, { "epoch": 0.8588704943055067, "grad_norm": 2.209378719329834, "learning_rate": 6.125095708702052e-05, "loss": 0.0438, "step": 14630 }, { "epoch": 0.8594575554772808, "grad_norm": 0.835532546043396, "learning_rate": 6.120589480909715e-05, "loss": 0.0151, "step": 14640 }, { "epoch": 0.8600446166490548, "grad_norm": 2.318243980407715, "learning_rate": 6.116082294586068e-05, "loss": 0.0286, "step": 14650 }, { "epoch": 0.8606316778208289, "grad_norm": 3.605604648590088, "learning_rate": 6.11157415358648e-05, "loss": 0.0468, "step": 14660 }, { "epoch": 0.861218738992603, "grad_norm": 0.7488554120063782, "learning_rate": 6.107065061767127e-05, "loss": 0.0264, "step": 14670 }, { "epoch": 0.8618058001643771, "grad_norm": 6.506105899810791, "learning_rate": 6.1025550229850004e-05, "loss": 0.037, "step": 14680 }, { "epoch": 0.8623928613361512, "grad_norm": 4.626204490661621, "learning_rate": 6.098044041097907e-05, "loss": 0.039, "step": 14690 }, { "epoch": 0.8629799225079253, "grad_norm": 2.7244555950164795, "learning_rate": 6.0935321199644544e-05, "loss": 0.0304, "step": 14700 }, { "epoch": 0.8635669836796994, "grad_norm": 2.2430191040039062, "learning_rate": 6.0890192634440546e-05, "loss": 0.0363, "step": 14710 }, { "epoch": 0.8641540448514735, "grad_norm": 2.716965913772583, "learning_rate": 6.084505475396923e-05, "loss": 0.0303, "step": 14720 }, { "epoch": 0.8647411060232476, "grad_norm": 1.0491752624511719, "learning_rate": 6.079990759684068e-05, "loss": 0.0364, "step": 14730 }, { "epoch": 0.8653281671950217, "grad_norm": 1.1998114585876465, "learning_rate": 6.075475120167293e-05, "loss": 0.0413, "step": 14740 }, { "epoch": 0.8659152283667958, "grad_norm": 1.2094826698303223, "learning_rate": 6.070958560709194e-05, "loss": 0.041, "step": 14750 }, { "epoch": 0.8665022895385699, "grad_norm": 3.386530637741089, "learning_rate": 6.066441085173149e-05, "loss": 0.0251, "step": 14760 }, { "epoch": 0.867089350710344, "grad_norm": 1.256798505783081, "learning_rate": 6.061922697423322e-05, "loss": 0.0336, "step": 14770 }, { "epoch": 0.8676764118821181, "grad_norm": 1.8139320611953735, "learning_rate": 6.057403401324659e-05, "loss": 0.0225, "step": 14780 }, { "epoch": 0.8682634730538922, "grad_norm": 1.8259650468826294, "learning_rate": 6.052883200742883e-05, "loss": 0.0399, "step": 14790 }, { "epoch": 0.8688505342256663, "grad_norm": 2.8148298263549805, "learning_rate": 6.0483620995444835e-05, "loss": 0.0458, "step": 14800 }, { "epoch": 0.8694375953974404, "grad_norm": 2.914865493774414, "learning_rate": 6.043840101596731e-05, "loss": 0.0484, "step": 14810 }, { "epoch": 0.8700246565692145, "grad_norm": 0.6122520565986633, "learning_rate": 6.0393172107676576e-05, "loss": 0.0472, "step": 14820 }, { "epoch": 0.8706117177409887, "grad_norm": 0.9464847445487976, "learning_rate": 6.034793430926058e-05, "loss": 0.0557, "step": 14830 }, { "epoch": 0.8711987789127628, "grad_norm": 2.066427707672119, "learning_rate": 6.0302687659414904e-05, "loss": 0.0414, "step": 14840 }, { "epoch": 0.8717858400845369, "grad_norm": 1.8651105165481567, "learning_rate": 6.025743219684267e-05, "loss": 0.0374, "step": 14850 }, { "epoch": 0.872372901256311, "grad_norm": 0.6826112270355225, "learning_rate": 6.021216796025456e-05, "loss": 0.0384, "step": 14860 }, { "epoch": 0.872959962428085, "grad_norm": 2.0334994792938232, "learning_rate": 6.016689498836877e-05, "loss": 0.026, "step": 14870 }, { "epoch": 0.8735470235998591, "grad_norm": 1.9022555351257324, "learning_rate": 6.012161331991093e-05, "loss": 0.0486, "step": 14880 }, { "epoch": 0.8741340847716332, "grad_norm": 0.45631638169288635, "learning_rate": 6.007632299361417e-05, "loss": 0.037, "step": 14890 }, { "epoch": 0.8747211459434073, "grad_norm": 2.617318630218506, "learning_rate": 6.003102404821895e-05, "loss": 0.0273, "step": 14900 }, { "epoch": 0.8753082071151814, "grad_norm": 2.712918519973755, "learning_rate": 5.9985716522473166e-05, "loss": 0.0337, "step": 14910 }, { "epoch": 0.8758952682869555, "grad_norm": 0.9799612164497375, "learning_rate": 5.9940400455132025e-05, "loss": 0.0613, "step": 14920 }, { "epoch": 0.8764823294587296, "grad_norm": 2.20308780670166, "learning_rate": 5.989507588495804e-05, "loss": 0.043, "step": 14930 }, { "epoch": 0.8770693906305037, "grad_norm": 2.622640371322632, "learning_rate": 5.984974285072099e-05, "loss": 0.0376, "step": 14940 }, { "epoch": 0.8776564518022778, "grad_norm": 2.3511288166046143, "learning_rate": 5.980440139119794e-05, "loss": 0.0397, "step": 14950 }, { "epoch": 0.8782435129740519, "grad_norm": 1.1025187969207764, "learning_rate": 5.975905154517309e-05, "loss": 0.0589, "step": 14960 }, { "epoch": 0.878830574145826, "grad_norm": 1.562609314918518, "learning_rate": 5.971369335143787e-05, "loss": 0.0357, "step": 14970 }, { "epoch": 0.8794176353176001, "grad_norm": 1.2567226886749268, "learning_rate": 5.966832684879084e-05, "loss": 0.0828, "step": 14980 }, { "epoch": 0.8800046964893742, "grad_norm": 3.298917055130005, "learning_rate": 5.962295207603764e-05, "loss": 0.0225, "step": 14990 }, { "epoch": 0.8805917576611483, "grad_norm": 3.532365560531616, "learning_rate": 5.9577569071991e-05, "loss": 0.0286, "step": 15000 }, { "epoch": 0.8805917576611483, "eval_loss": 0.45234647393226624, "eval_runtime": 269.6106, "eval_samples_per_second": 3.505, "eval_steps_per_second": 3.505, "step": 15000 }, { "epoch": 0.8811788188329224, "grad_norm": 1.6025224924087524, "learning_rate": 5.953217787547072e-05, "loss": 0.0241, "step": 15010 }, { "epoch": 0.8817658800046965, "grad_norm": 1.0338939428329468, "learning_rate": 5.9486778525303556e-05, "loss": 0.0221, "step": 15020 }, { "epoch": 0.8823529411764706, "grad_norm": 2.463163137435913, "learning_rate": 5.944137106032327e-05, "loss": 0.0434, "step": 15030 }, { "epoch": 0.8829400023482447, "grad_norm": 2.0624144077301025, "learning_rate": 5.939595551937057e-05, "loss": 0.0414, "step": 15040 }, { "epoch": 0.8835270635200188, "grad_norm": 2.9850263595581055, "learning_rate": 5.9350531941293056e-05, "loss": 0.0569, "step": 15050 }, { "epoch": 0.8841141246917928, "grad_norm": 1.0960736274719238, "learning_rate": 5.93051003649452e-05, "loss": 0.0256, "step": 15060 }, { "epoch": 0.8847011858635669, "grad_norm": 5.331958293914795, "learning_rate": 5.9259660829188337e-05, "loss": 0.0284, "step": 15070 }, { "epoch": 0.885288247035341, "grad_norm": 0.8948186039924622, "learning_rate": 5.921421337289059e-05, "loss": 0.0318, "step": 15080 }, { "epoch": 0.8858753082071151, "grad_norm": 1.8375474214553833, "learning_rate": 5.9168758034926855e-05, "loss": 0.0333, "step": 15090 }, { "epoch": 0.8864623693788892, "grad_norm": 1.2407913208007812, "learning_rate": 5.912329485417879e-05, "loss": 0.0458, "step": 15100 }, { "epoch": 0.8870494305506634, "grad_norm": 2.3570520877838135, "learning_rate": 5.9077823869534745e-05, "loss": 0.0365, "step": 15110 }, { "epoch": 0.8876364917224375, "grad_norm": 2.152998447418213, "learning_rate": 5.9032345119889765e-05, "loss": 0.0382, "step": 15120 }, { "epoch": 0.8882235528942116, "grad_norm": 3.138258218765259, "learning_rate": 5.8986858644145505e-05, "loss": 0.0367, "step": 15130 }, { "epoch": 0.8888106140659857, "grad_norm": 2.184736967086792, "learning_rate": 5.8941364481210245e-05, "loss": 0.0264, "step": 15140 }, { "epoch": 0.8893976752377598, "grad_norm": 3.6055047512054443, "learning_rate": 5.889586266999887e-05, "loss": 0.0331, "step": 15150 }, { "epoch": 0.8899847364095339, "grad_norm": 4.376768112182617, "learning_rate": 5.8850353249432744e-05, "loss": 0.0703, "step": 15160 }, { "epoch": 0.890571797581308, "grad_norm": 1.2885960340499878, "learning_rate": 5.880483625843979e-05, "loss": 0.0532, "step": 15170 }, { "epoch": 0.8911588587530821, "grad_norm": 0.7660097479820251, "learning_rate": 5.8759311735954404e-05, "loss": 0.0279, "step": 15180 }, { "epoch": 0.8917459199248562, "grad_norm": 0.7138040661811829, "learning_rate": 5.8713779720917395e-05, "loss": 0.0542, "step": 15190 }, { "epoch": 0.8923329810966303, "grad_norm": 0.8894451856613159, "learning_rate": 5.8668240252275995e-05, "loss": 0.0277, "step": 15200 }, { "epoch": 0.8929200422684044, "grad_norm": 2.522827625274658, "learning_rate": 5.8622693368983847e-05, "loss": 0.053, "step": 15210 }, { "epoch": 0.8935071034401785, "grad_norm": 1.9021809101104736, "learning_rate": 5.857713911000087e-05, "loss": 0.0349, "step": 15220 }, { "epoch": 0.8940941646119526, "grad_norm": 0.3319094181060791, "learning_rate": 5.8531577514293324e-05, "loss": 0.0354, "step": 15230 }, { "epoch": 0.8946812257837267, "grad_norm": 2.2337393760681152, "learning_rate": 5.848600862083378e-05, "loss": 0.042, "step": 15240 }, { "epoch": 0.8952682869555008, "grad_norm": 2.5856029987335205, "learning_rate": 5.844043246860098e-05, "loss": 0.0369, "step": 15250 }, { "epoch": 0.8958553481272749, "grad_norm": 1.4416180849075317, "learning_rate": 5.839484909657993e-05, "loss": 0.0401, "step": 15260 }, { "epoch": 0.896442409299049, "grad_norm": 2.852252721786499, "learning_rate": 5.834925854376181e-05, "loss": 0.0373, "step": 15270 }, { "epoch": 0.897029470470823, "grad_norm": 1.1797451972961426, "learning_rate": 5.83036608491439e-05, "loss": 0.0283, "step": 15280 }, { "epoch": 0.8976165316425971, "grad_norm": 2.4308016300201416, "learning_rate": 5.8258056051729634e-05, "loss": 0.0159, "step": 15290 }, { "epoch": 0.8982035928143712, "grad_norm": 1.619468331336975, "learning_rate": 5.821244419052849e-05, "loss": 0.0385, "step": 15300 }, { "epoch": 0.8987906539861453, "grad_norm": 1.6315438747406006, "learning_rate": 5.816682530455602e-05, "loss": 0.0422, "step": 15310 }, { "epoch": 0.8993777151579194, "grad_norm": 1.3823318481445312, "learning_rate": 5.8121199432833754e-05, "loss": 0.0248, "step": 15320 }, { "epoch": 0.8999647763296935, "grad_norm": 1.8854079246520996, "learning_rate": 5.807556661438922e-05, "loss": 0.028, "step": 15330 }, { "epoch": 0.9005518375014676, "grad_norm": 3.2190797328948975, "learning_rate": 5.802992688825587e-05, "loss": 0.0582, "step": 15340 }, { "epoch": 0.9011388986732417, "grad_norm": 2.959430694580078, "learning_rate": 5.798428029347306e-05, "loss": 0.0194, "step": 15350 }, { "epoch": 0.9017259598450158, "grad_norm": 1.8747074604034424, "learning_rate": 5.7938626869086066e-05, "loss": 0.0285, "step": 15360 }, { "epoch": 0.9023130210167899, "grad_norm": 1.2437909841537476, "learning_rate": 5.7892966654145944e-05, "loss": 0.0316, "step": 15370 }, { "epoch": 0.9029000821885641, "grad_norm": 2.640223264694214, "learning_rate": 5.784729968770961e-05, "loss": 0.0302, "step": 15380 }, { "epoch": 0.9034871433603382, "grad_norm": 1.4964799880981445, "learning_rate": 5.780162600883974e-05, "loss": 0.0348, "step": 15390 }, { "epoch": 0.9040742045321123, "grad_norm": 1.8708909749984741, "learning_rate": 5.775594565660472e-05, "loss": 0.0275, "step": 15400 }, { "epoch": 0.9046612657038864, "grad_norm": 1.6548596620559692, "learning_rate": 5.771025867007868e-05, "loss": 0.0307, "step": 15410 }, { "epoch": 0.9052483268756605, "grad_norm": 1.6874966621398926, "learning_rate": 5.766456508834142e-05, "loss": 0.0279, "step": 15420 }, { "epoch": 0.9058353880474346, "grad_norm": 2.4529452323913574, "learning_rate": 5.761886495047837e-05, "loss": 0.0344, "step": 15430 }, { "epoch": 0.9064224492192087, "grad_norm": 1.2588340044021606, "learning_rate": 5.757315829558057e-05, "loss": 0.0392, "step": 15440 }, { "epoch": 0.9070095103909828, "grad_norm": 0.4524109661579132, "learning_rate": 5.752744516274465e-05, "loss": 0.0399, "step": 15450 }, { "epoch": 0.9075965715627569, "grad_norm": 0.8751776814460754, "learning_rate": 5.748172559107277e-05, "loss": 0.044, "step": 15460 }, { "epoch": 0.908183632734531, "grad_norm": 1.0942742824554443, "learning_rate": 5.7435999619672595e-05, "loss": 0.0424, "step": 15470 }, { "epoch": 0.908770693906305, "grad_norm": 2.3219096660614014, "learning_rate": 5.739026728765726e-05, "loss": 0.0278, "step": 15480 }, { "epoch": 0.9093577550780791, "grad_norm": 1.6318626403808594, "learning_rate": 5.7344528634145354e-05, "loss": 0.0322, "step": 15490 }, { "epoch": 0.9099448162498532, "grad_norm": 3.0432188510894775, "learning_rate": 5.7298783698260874e-05, "loss": 0.0259, "step": 15500 }, { "epoch": 0.9105318774216273, "grad_norm": 2.1943719387054443, "learning_rate": 5.725303251913317e-05, "loss": 0.0435, "step": 15510 }, { "epoch": 0.9111189385934014, "grad_norm": 4.016179084777832, "learning_rate": 5.7207275135896945e-05, "loss": 0.0267, "step": 15520 }, { "epoch": 0.9117059997651755, "grad_norm": 1.7722045183181763, "learning_rate": 5.7161511587692216e-05, "loss": 0.0351, "step": 15530 }, { "epoch": 0.9122930609369496, "grad_norm": 3.096362590789795, "learning_rate": 5.7115741913664264e-05, "loss": 0.0359, "step": 15540 }, { "epoch": 0.9128801221087237, "grad_norm": 7.351997375488281, "learning_rate": 5.7069966152963614e-05, "loss": 0.0431, "step": 15550 }, { "epoch": 0.9134671832804978, "grad_norm": 2.606060743331909, "learning_rate": 5.702418434474601e-05, "loss": 0.0344, "step": 15560 }, { "epoch": 0.9140542444522719, "grad_norm": 0.09482274204492569, "learning_rate": 5.6978396528172326e-05, "loss": 0.0246, "step": 15570 }, { "epoch": 0.914641305624046, "grad_norm": 2.3057878017425537, "learning_rate": 5.693260274240863e-05, "loss": 0.0504, "step": 15580 }, { "epoch": 0.9152283667958201, "grad_norm": 1.0935826301574707, "learning_rate": 5.688680302662607e-05, "loss": 0.0288, "step": 15590 }, { "epoch": 0.9158154279675942, "grad_norm": 2.7622785568237305, "learning_rate": 5.6840997420000865e-05, "loss": 0.0312, "step": 15600 }, { "epoch": 0.9164024891393683, "grad_norm": 4.465653896331787, "learning_rate": 5.679518596171425e-05, "loss": 0.0343, "step": 15610 }, { "epoch": 0.9169895503111424, "grad_norm": 3.447021007537842, "learning_rate": 5.674936869095252e-05, "loss": 0.0465, "step": 15620 }, { "epoch": 0.9175766114829165, "grad_norm": 3.203032970428467, "learning_rate": 5.670354564690692e-05, "loss": 0.0325, "step": 15630 }, { "epoch": 0.9181636726546906, "grad_norm": 1.981807827949524, "learning_rate": 5.665771686877358e-05, "loss": 0.0538, "step": 15640 }, { "epoch": 0.9187507338264648, "grad_norm": 1.0509763956069946, "learning_rate": 5.661188239575364e-05, "loss": 0.0291, "step": 15650 }, { "epoch": 0.9193377949982389, "grad_norm": 3.080498218536377, "learning_rate": 5.6566042267052997e-05, "loss": 0.0178, "step": 15660 }, { "epoch": 0.919924856170013, "grad_norm": 1.2986247539520264, "learning_rate": 5.6520196521882475e-05, "loss": 0.0472, "step": 15670 }, { "epoch": 0.9205119173417871, "grad_norm": 1.8869589567184448, "learning_rate": 5.647434519945767e-05, "loss": 0.0501, "step": 15680 }, { "epoch": 0.9210989785135612, "grad_norm": 1.710890531539917, "learning_rate": 5.642848833899891e-05, "loss": 0.0526, "step": 15690 }, { "epoch": 0.9216860396853352, "grad_norm": 1.9326368570327759, "learning_rate": 5.638262597973134e-05, "loss": 0.0386, "step": 15700 }, { "epoch": 0.9222731008571093, "grad_norm": 3.3880348205566406, "learning_rate": 5.633675816088475e-05, "loss": 0.0213, "step": 15710 }, { "epoch": 0.9228601620288834, "grad_norm": 0.6809608936309814, "learning_rate": 5.62908849216936e-05, "loss": 0.0335, "step": 15720 }, { "epoch": 0.9234472232006575, "grad_norm": 1.8033440113067627, "learning_rate": 5.624500630139702e-05, "loss": 0.0289, "step": 15730 }, { "epoch": 0.9240342843724316, "grad_norm": 3.8078174591064453, "learning_rate": 5.619912233923872e-05, "loss": 0.032, "step": 15740 }, { "epoch": 0.9246213455442057, "grad_norm": 0.1132720559835434, "learning_rate": 5.615323307446697e-05, "loss": 0.0487, "step": 15750 }, { "epoch": 0.9252084067159798, "grad_norm": 2.2745766639709473, "learning_rate": 5.610733854633462e-05, "loss": 0.0627, "step": 15760 }, { "epoch": 0.9257954678877539, "grad_norm": 1.6652629375457764, "learning_rate": 5.6061438794098974e-05, "loss": 0.0248, "step": 15770 }, { "epoch": 0.926382529059528, "grad_norm": 1.28446626663208, "learning_rate": 5.601553385702182e-05, "loss": 0.0255, "step": 15780 }, { "epoch": 0.9269695902313021, "grad_norm": 2.2817840576171875, "learning_rate": 5.5969623774369396e-05, "loss": 0.0362, "step": 15790 }, { "epoch": 0.9275566514030762, "grad_norm": 1.6001989841461182, "learning_rate": 5.592370858541232e-05, "loss": 0.0257, "step": 15800 }, { "epoch": 0.9281437125748503, "grad_norm": 1.3455231189727783, "learning_rate": 5.587778832942556e-05, "loss": 0.0362, "step": 15810 }, { "epoch": 0.9287307737466244, "grad_norm": 1.5758661031723022, "learning_rate": 5.583186304568849e-05, "loss": 0.0451, "step": 15820 }, { "epoch": 0.9293178349183985, "grad_norm": 4.79602575302124, "learning_rate": 5.578593277348473e-05, "loss": 0.0241, "step": 15830 }, { "epoch": 0.9299048960901726, "grad_norm": 2.943692207336426, "learning_rate": 5.573999755210215e-05, "loss": 0.0197, "step": 15840 }, { "epoch": 0.9304919572619467, "grad_norm": 1.6917659044265747, "learning_rate": 5.56940574208329e-05, "loss": 0.0289, "step": 15850 }, { "epoch": 0.9310790184337208, "grad_norm": 3.5683882236480713, "learning_rate": 5.564811241897333e-05, "loss": 0.0483, "step": 15860 }, { "epoch": 0.9316660796054949, "grad_norm": 1.0227922201156616, "learning_rate": 5.5602162585823894e-05, "loss": 0.0286, "step": 15870 }, { "epoch": 0.932253140777269, "grad_norm": 1.409133791923523, "learning_rate": 5.555620796068925e-05, "loss": 0.038, "step": 15880 }, { "epoch": 0.932840201949043, "grad_norm": 2.680568218231201, "learning_rate": 5.551024858287812e-05, "loss": 0.0354, "step": 15890 }, { "epoch": 0.9334272631208171, "grad_norm": 2.6215765476226807, "learning_rate": 5.546428449170329e-05, "loss": 0.0361, "step": 15900 }, { "epoch": 0.9340143242925912, "grad_norm": 1.0609687566757202, "learning_rate": 5.54183157264816e-05, "loss": 0.027, "step": 15910 }, { "epoch": 0.9346013854643654, "grad_norm": 2.0819435119628906, "learning_rate": 5.537234232653386e-05, "loss": 0.0165, "step": 15920 }, { "epoch": 0.9351884466361395, "grad_norm": 1.355362057685852, "learning_rate": 5.532636433118484e-05, "loss": 0.0381, "step": 15930 }, { "epoch": 0.9357755078079136, "grad_norm": 1.5677322149276733, "learning_rate": 5.52803817797633e-05, "loss": 0.027, "step": 15940 }, { "epoch": 0.9363625689796877, "grad_norm": 2.1124112606048584, "learning_rate": 5.523439471160181e-05, "loss": 0.0269, "step": 15950 }, { "epoch": 0.9369496301514618, "grad_norm": 1.3802322149276733, "learning_rate": 5.518840316603689e-05, "loss": 0.0253, "step": 15960 }, { "epoch": 0.9375366913232359, "grad_norm": 1.363916039466858, "learning_rate": 5.514240718240884e-05, "loss": 0.0335, "step": 15970 }, { "epoch": 0.93812375249501, "grad_norm": 1.7724204063415527, "learning_rate": 5.509640680006175e-05, "loss": 0.049, "step": 15980 }, { "epoch": 0.9387108136667841, "grad_norm": 1.3622511625289917, "learning_rate": 5.5050402058343476e-05, "loss": 0.0259, "step": 15990 }, { "epoch": 0.9392978748385582, "grad_norm": 1.2243056297302246, "learning_rate": 5.500439299660566e-05, "loss": 0.0325, "step": 16000 }, { "epoch": 0.9398849360103323, "grad_norm": 0.9930700063705444, "learning_rate": 5.495837965420356e-05, "loss": 0.0272, "step": 16010 }, { "epoch": 0.9404719971821064, "grad_norm": 0.009694311767816544, "learning_rate": 5.491236207049613e-05, "loss": 0.0179, "step": 16020 }, { "epoch": 0.9410590583538805, "grad_norm": 3.36891770362854, "learning_rate": 5.4866340284845955e-05, "loss": 0.0151, "step": 16030 }, { "epoch": 0.9416461195256546, "grad_norm": 2.4878575801849365, "learning_rate": 5.4820314336619214e-05, "loss": 0.0406, "step": 16040 }, { "epoch": 0.9422331806974287, "grad_norm": 1.6236851215362549, "learning_rate": 5.477428426518565e-05, "loss": 0.0439, "step": 16050 }, { "epoch": 0.9428202418692028, "grad_norm": 2.8964767456054688, "learning_rate": 5.472825010991852e-05, "loss": 0.0363, "step": 16060 }, { "epoch": 0.9434073030409769, "grad_norm": 1.8873333930969238, "learning_rate": 5.468221191019457e-05, "loss": 0.0245, "step": 16070 }, { "epoch": 0.943994364212751, "grad_norm": 3.159353017807007, "learning_rate": 5.463616970539403e-05, "loss": 0.0413, "step": 16080 }, { "epoch": 0.944581425384525, "grad_norm": 2.711113452911377, "learning_rate": 5.459012353490054e-05, "loss": 0.0408, "step": 16090 }, { "epoch": 0.9451684865562991, "grad_norm": 2.389240264892578, "learning_rate": 5.454407343810112e-05, "loss": 0.0256, "step": 16100 }, { "epoch": 0.9457555477280732, "grad_norm": 0.7224491238594055, "learning_rate": 5.449801945438619e-05, "loss": 0.026, "step": 16110 }, { "epoch": 0.9463426088998473, "grad_norm": 0.4595985412597656, "learning_rate": 5.445196162314944e-05, "loss": 0.0868, "step": 16120 }, { "epoch": 0.9469296700716214, "grad_norm": 2.019042730331421, "learning_rate": 5.440589998378788e-05, "loss": 0.0402, "step": 16130 }, { "epoch": 0.9475167312433955, "grad_norm": 0.10347907990217209, "learning_rate": 5.435983457570179e-05, "loss": 0.0232, "step": 16140 }, { "epoch": 0.9481037924151696, "grad_norm": 1.186431646347046, "learning_rate": 5.431376543829467e-05, "loss": 0.0155, "step": 16150 }, { "epoch": 0.9486908535869437, "grad_norm": 3.5603644847869873, "learning_rate": 5.426769261097317e-05, "loss": 0.0635, "step": 16160 }, { "epoch": 0.9492779147587178, "grad_norm": 1.3630090951919556, "learning_rate": 5.422161613314715e-05, "loss": 0.0289, "step": 16170 }, { "epoch": 0.9498649759304919, "grad_norm": 0.08817755430936813, "learning_rate": 5.4175536044229555e-05, "loss": 0.0487, "step": 16180 }, { "epoch": 0.950452037102266, "grad_norm": 1.633805274963379, "learning_rate": 5.412945238363643e-05, "loss": 0.0301, "step": 16190 }, { "epoch": 0.9510390982740402, "grad_norm": 2.0507123470306396, "learning_rate": 5.408336519078688e-05, "loss": 0.0401, "step": 16200 }, { "epoch": 0.9516261594458143, "grad_norm": 1.2019133567810059, "learning_rate": 5.403727450510304e-05, "loss": 0.0485, "step": 16210 }, { "epoch": 0.9522132206175884, "grad_norm": 2.6347334384918213, "learning_rate": 5.399118036601001e-05, "loss": 0.0501, "step": 16220 }, { "epoch": 0.9528002817893625, "grad_norm": 0.49063390493392944, "learning_rate": 5.3945082812935857e-05, "loss": 0.0157, "step": 16230 }, { "epoch": 0.9533873429611366, "grad_norm": 0.9728673696517944, "learning_rate": 5.389898188531156e-05, "loss": 0.0261, "step": 16240 }, { "epoch": 0.9539744041329107, "grad_norm": 1.6232997179031372, "learning_rate": 5.385287762257101e-05, "loss": 0.0252, "step": 16250 }, { "epoch": 0.9545614653046848, "grad_norm": 3.001466751098633, "learning_rate": 5.380677006415093e-05, "loss": 0.0366, "step": 16260 }, { "epoch": 0.9551485264764589, "grad_norm": 4.364900588989258, "learning_rate": 5.376065924949083e-05, "loss": 0.0494, "step": 16270 }, { "epoch": 0.955735587648233, "grad_norm": 1.7932524681091309, "learning_rate": 5.3714545218033076e-05, "loss": 0.0382, "step": 16280 }, { "epoch": 0.9563226488200071, "grad_norm": 2.1642203330993652, "learning_rate": 5.366842800922274e-05, "loss": 0.0172, "step": 16290 }, { "epoch": 0.9569097099917812, "grad_norm": 1.7561495304107666, "learning_rate": 5.362230766250761e-05, "loss": 0.0262, "step": 16300 }, { "epoch": 0.9574967711635552, "grad_norm": 0.9882838129997253, "learning_rate": 5.3576184217338185e-05, "loss": 0.0287, "step": 16310 }, { "epoch": 0.9580838323353293, "grad_norm": 1.2872587442398071, "learning_rate": 5.35300577131676e-05, "loss": 0.0305, "step": 16320 }, { "epoch": 0.9586708935071034, "grad_norm": 3.7865941524505615, "learning_rate": 5.3483928189451585e-05, "loss": 0.042, "step": 16330 }, { "epoch": 0.9592579546788775, "grad_norm": 0.8969483375549316, "learning_rate": 5.343779568564848e-05, "loss": 0.0551, "step": 16340 }, { "epoch": 0.9598450158506516, "grad_norm": 3.019232988357544, "learning_rate": 5.339166024121919e-05, "loss": 0.015, "step": 16350 }, { "epoch": 0.9604320770224257, "grad_norm": 0.903888463973999, "learning_rate": 5.334552189562707e-05, "loss": 0.0369, "step": 16360 }, { "epoch": 0.9610191381941998, "grad_norm": 2.1453328132629395, "learning_rate": 5.329938068833803e-05, "loss": 0.0372, "step": 16370 }, { "epoch": 0.9616061993659739, "grad_norm": 1.4702128171920776, "learning_rate": 5.3253236658820396e-05, "loss": 0.0214, "step": 16380 }, { "epoch": 0.962193260537748, "grad_norm": 0.5381457805633545, "learning_rate": 5.320708984654489e-05, "loss": 0.0245, "step": 16390 }, { "epoch": 0.9627803217095221, "grad_norm": 1.984392523765564, "learning_rate": 5.316094029098465e-05, "loss": 0.0327, "step": 16400 }, { "epoch": 0.9633673828812962, "grad_norm": 1.3957103490829468, "learning_rate": 5.311478803161513e-05, "loss": 0.0307, "step": 16410 }, { "epoch": 0.9639544440530703, "grad_norm": 3.6839656829833984, "learning_rate": 5.306863310791411e-05, "loss": 0.0268, "step": 16420 }, { "epoch": 0.9645415052248444, "grad_norm": 1.6362731456756592, "learning_rate": 5.302247555936168e-05, "loss": 0.0291, "step": 16430 }, { "epoch": 0.9651285663966185, "grad_norm": 2.45932936668396, "learning_rate": 5.2976315425440123e-05, "loss": 0.0424, "step": 16440 }, { "epoch": 0.9657156275683926, "grad_norm": 1.0357054471969604, "learning_rate": 5.293015274563394e-05, "loss": 0.0336, "step": 16450 }, { "epoch": 0.9663026887401667, "grad_norm": 3.641936779022217, "learning_rate": 5.288398755942985e-05, "loss": 0.0303, "step": 16460 }, { "epoch": 0.9668897499119409, "grad_norm": 2.652996778488159, "learning_rate": 5.283781990631668e-05, "loss": 0.0325, "step": 16470 }, { "epoch": 0.967476811083715, "grad_norm": 2.6950008869171143, "learning_rate": 5.279164982578536e-05, "loss": 0.0167, "step": 16480 }, { "epoch": 0.9680638722554891, "grad_norm": 2.224520444869995, "learning_rate": 5.2745477357328955e-05, "loss": 0.0267, "step": 16490 }, { "epoch": 0.9686509334272632, "grad_norm": 0.9650734066963196, "learning_rate": 5.26993025404425e-05, "loss": 0.0111, "step": 16500 }, { "epoch": 0.9692379945990373, "grad_norm": 3.18520450592041, "learning_rate": 5.265312541462308e-05, "loss": 0.037, "step": 16510 }, { "epoch": 0.9698250557708114, "grad_norm": 3.1743884086608887, "learning_rate": 5.260694601936975e-05, "loss": 0.0426, "step": 16520 }, { "epoch": 0.9704121169425854, "grad_norm": 3.833486557006836, "learning_rate": 5.2560764394183494e-05, "loss": 0.0479, "step": 16530 }, { "epoch": 0.9709991781143595, "grad_norm": 0.6492809653282166, "learning_rate": 5.2514580578567216e-05, "loss": 0.0337, "step": 16540 }, { "epoch": 0.9715862392861336, "grad_norm": 2.115954875946045, "learning_rate": 5.2468394612025686e-05, "loss": 0.0286, "step": 16550 }, { "epoch": 0.9721733004579077, "grad_norm": 0.4123615324497223, "learning_rate": 5.242220653406553e-05, "loss": 0.0276, "step": 16560 }, { "epoch": 0.9727603616296818, "grad_norm": 1.5601547956466675, "learning_rate": 5.2376016384195136e-05, "loss": 0.0153, "step": 16570 }, { "epoch": 0.9733474228014559, "grad_norm": 0.18316815793514252, "learning_rate": 5.232982420192474e-05, "loss": 0.0352, "step": 16580 }, { "epoch": 0.97393448397323, "grad_norm": 1.990715503692627, "learning_rate": 5.2283630026766225e-05, "loss": 0.0365, "step": 16590 }, { "epoch": 0.9745215451450041, "grad_norm": 1.6039259433746338, "learning_rate": 5.223743389823327e-05, "loss": 0.0394, "step": 16600 }, { "epoch": 0.9751086063167782, "grad_norm": 0.5257664322853088, "learning_rate": 5.2191235855841146e-05, "loss": 0.0366, "step": 16610 }, { "epoch": 0.9756956674885523, "grad_norm": 0.4390442371368408, "learning_rate": 5.21450359391068e-05, "loss": 0.0357, "step": 16620 }, { "epoch": 0.9762827286603264, "grad_norm": 1.3897521495819092, "learning_rate": 5.2098834187548805e-05, "loss": 0.0362, "step": 16630 }, { "epoch": 0.9768697898321005, "grad_norm": 7.759096622467041, "learning_rate": 5.205263064068725e-05, "loss": 0.0402, "step": 16640 }, { "epoch": 0.9774568510038746, "grad_norm": 2.1783852577209473, "learning_rate": 5.200642533804379e-05, "loss": 0.0263, "step": 16650 }, { "epoch": 0.9780439121756487, "grad_norm": 2.72682785987854, "learning_rate": 5.196021831914157e-05, "loss": 0.029, "step": 16660 }, { "epoch": 0.9786309733474228, "grad_norm": 1.675666093826294, "learning_rate": 5.191400962350523e-05, "loss": 0.0346, "step": 16670 }, { "epoch": 0.9792180345191969, "grad_norm": 0.6754209995269775, "learning_rate": 5.1867799290660815e-05, "loss": 0.0467, "step": 16680 }, { "epoch": 0.979805095690971, "grad_norm": 0.1948157697916031, "learning_rate": 5.182158736013577e-05, "loss": 0.0374, "step": 16690 }, { "epoch": 0.9803921568627451, "grad_norm": 0.401326984167099, "learning_rate": 5.177537387145894e-05, "loss": 0.0579, "step": 16700 }, { "epoch": 0.9809792180345192, "grad_norm": 2.034702777862549, "learning_rate": 5.1729158864160466e-05, "loss": 0.0561, "step": 16710 }, { "epoch": 0.9815662792062932, "grad_norm": 3.014551877975464, "learning_rate": 5.16829423777718e-05, "loss": 0.0195, "step": 16720 }, { "epoch": 0.9821533403780673, "grad_norm": 0.5864536166191101, "learning_rate": 5.163672445182568e-05, "loss": 0.0139, "step": 16730 }, { "epoch": 0.9827404015498415, "grad_norm": 0.5289451479911804, "learning_rate": 5.1590505125856025e-05, "loss": 0.0288, "step": 16740 }, { "epoch": 0.9833274627216156, "grad_norm": 1.2977737188339233, "learning_rate": 5.1544284439398006e-05, "loss": 0.0436, "step": 16750 }, { "epoch": 0.9839145238933897, "grad_norm": 2.5961172580718994, "learning_rate": 5.149806243198794e-05, "loss": 0.0236, "step": 16760 }, { "epoch": 0.9845015850651638, "grad_norm": 1.1894290447235107, "learning_rate": 5.1451839143163226e-05, "loss": 0.0288, "step": 16770 }, { "epoch": 0.9850886462369379, "grad_norm": 0.8295955657958984, "learning_rate": 5.140561461246246e-05, "loss": 0.0374, "step": 16780 }, { "epoch": 0.985675707408712, "grad_norm": 3.0590317249298096, "learning_rate": 5.13593888794252e-05, "loss": 0.0418, "step": 16790 }, { "epoch": 0.9862627685804861, "grad_norm": 1.7654225826263428, "learning_rate": 5.1313161983592096e-05, "loss": 0.0246, "step": 16800 }, { "epoch": 0.9868498297522602, "grad_norm": 2.3438401222229004, "learning_rate": 5.126693396450476e-05, "loss": 0.0398, "step": 16810 }, { "epoch": 0.9874368909240343, "grad_norm": 3.1865170001983643, "learning_rate": 5.1220704861705774e-05, "loss": 0.0475, "step": 16820 }, { "epoch": 0.9880239520958084, "grad_norm": 0.756794273853302, "learning_rate": 5.117447471473865e-05, "loss": 0.0131, "step": 16830 }, { "epoch": 0.9886110132675825, "grad_norm": 3.404741048812866, "learning_rate": 5.1128243563147816e-05, "loss": 0.0352, "step": 16840 }, { "epoch": 0.9891980744393566, "grad_norm": 3.5811452865600586, "learning_rate": 5.108201144647851e-05, "loss": 0.0219, "step": 16850 }, { "epoch": 0.9897851356111307, "grad_norm": 2.3117177486419678, "learning_rate": 5.1035778404276815e-05, "loss": 0.0413, "step": 16860 }, { "epoch": 0.9903721967829048, "grad_norm": 1.9933563470840454, "learning_rate": 5.098954447608964e-05, "loss": 0.0286, "step": 16870 }, { "epoch": 0.9909592579546789, "grad_norm": 2.5051896572113037, "learning_rate": 5.0943309701464584e-05, "loss": 0.0225, "step": 16880 }, { "epoch": 0.991546319126453, "grad_norm": 2.6229381561279297, "learning_rate": 5.089707411995005e-05, "loss": 0.0341, "step": 16890 }, { "epoch": 0.9921333802982271, "grad_norm": 1.0904333591461182, "learning_rate": 5.0850837771095074e-05, "loss": 0.0527, "step": 16900 }, { "epoch": 0.9927204414700012, "grad_norm": 1.3770349025726318, "learning_rate": 5.080460069444936e-05, "loss": 0.0301, "step": 16910 }, { "epoch": 0.9933075026417753, "grad_norm": 1.2007999420166016, "learning_rate": 5.0758362929563244e-05, "loss": 0.0345, "step": 16920 }, { "epoch": 0.9938945638135493, "grad_norm": 2.2936861515045166, "learning_rate": 5.071212451598765e-05, "loss": 0.0531, "step": 16930 }, { "epoch": 0.9944816249853234, "grad_norm": 2.625542640686035, "learning_rate": 5.066588549327403e-05, "loss": 0.0427, "step": 16940 }, { "epoch": 0.9950686861570975, "grad_norm": 3.829827308654785, "learning_rate": 5.061964590097442e-05, "loss": 0.043, "step": 16950 }, { "epoch": 0.9956557473288716, "grad_norm": 1.3408961296081543, "learning_rate": 5.057340577864127e-05, "loss": 0.0471, "step": 16960 }, { "epoch": 0.9962428085006457, "grad_norm": 1.6706929206848145, "learning_rate": 5.052716516582753e-05, "loss": 0.0285, "step": 16970 }, { "epoch": 0.9968298696724198, "grad_norm": 0.8629145622253418, "learning_rate": 5.048092410208656e-05, "loss": 0.0317, "step": 16980 }, { "epoch": 0.9974169308441939, "grad_norm": 0.9367380738258362, "learning_rate": 5.0434682626972105e-05, "loss": 0.0344, "step": 16990 }, { "epoch": 0.998003992015968, "grad_norm": 2.3083112239837646, "learning_rate": 5.0388440780038235e-05, "loss": 0.0218, "step": 17000 }, { "epoch": 0.9985910531877422, "grad_norm": 1.1551401615142822, "learning_rate": 5.0342198600839394e-05, "loss": 0.0372, "step": 17010 }, { "epoch": 0.9991781143595163, "grad_norm": 1.9685473442077637, "learning_rate": 5.029595612893027e-05, "loss": 0.0516, "step": 17020 }, { "epoch": 0.9997651755312904, "grad_norm": 2.787243127822876, "learning_rate": 5.024971340386577e-05, "loss": 0.0446, "step": 17030 }, { "epoch": 1.0003522367030644, "grad_norm": 1.3453413248062134, "learning_rate": 5.020347046520112e-05, "loss": 0.0236, "step": 17040 }, { "epoch": 1.0009392978748386, "grad_norm": 0.5187767148017883, "learning_rate": 5.015722735249163e-05, "loss": 0.0238, "step": 17050 }, { "epoch": 1.0015263590466126, "grad_norm": 0.7118642330169678, "learning_rate": 5.0110984105292793e-05, "loss": 0.0219, "step": 17060 }, { "epoch": 1.0021134202183868, "grad_norm": 0.29987266659736633, "learning_rate": 5.0064740763160224e-05, "loss": 0.0198, "step": 17070 }, { "epoch": 1.0027004813901608, "grad_norm": 1.877467155456543, "learning_rate": 5.001849736564961e-05, "loss": 0.0416, "step": 17080 }, { "epoch": 1.003287542561935, "grad_norm": 0.24835659563541412, "learning_rate": 4.99722539523167e-05, "loss": 0.0258, "step": 17090 }, { "epoch": 1.003874603733709, "grad_norm": 1.0357491970062256, "learning_rate": 4.9926010562717255e-05, "loss": 0.0224, "step": 17100 }, { "epoch": 1.0044616649054832, "grad_norm": 2.34885311126709, "learning_rate": 4.987976723640698e-05, "loss": 0.0048, "step": 17110 }, { "epoch": 1.0050487260772571, "grad_norm": 0.7573207020759583, "learning_rate": 4.983352401294157e-05, "loss": 0.0131, "step": 17120 }, { "epoch": 1.0056357872490314, "grad_norm": 1.1532341241836548, "learning_rate": 4.97872809318766e-05, "loss": 0.013, "step": 17130 }, { "epoch": 1.0062228484208053, "grad_norm": 0.11428312957286835, "learning_rate": 4.974103803276756e-05, "loss": 0.0192, "step": 17140 }, { "epoch": 1.0068099095925795, "grad_norm": 0.08842333406209946, "learning_rate": 4.9694795355169734e-05, "loss": 0.0042, "step": 17150 }, { "epoch": 1.0073969707643537, "grad_norm": 1.681483507156372, "learning_rate": 4.964855293863828e-05, "loss": 0.0105, "step": 17160 }, { "epoch": 1.0079840319361277, "grad_norm": 1.8980822563171387, "learning_rate": 4.960231082272805e-05, "loss": 0.016, "step": 17170 }, { "epoch": 1.008571093107902, "grad_norm": 4.2246246337890625, "learning_rate": 4.955606904699371e-05, "loss": 0.0209, "step": 17180 }, { "epoch": 1.009158154279676, "grad_norm": 0.0430564284324646, "learning_rate": 4.950982765098965e-05, "loss": 0.013, "step": 17190 }, { "epoch": 1.0097452154514501, "grad_norm": 0.5098704099655151, "learning_rate": 4.946358667426984e-05, "loss": 0.015, "step": 17200 }, { "epoch": 1.0103322766232241, "grad_norm": 1.0906519889831543, "learning_rate": 4.941734615638797e-05, "loss": 0.0214, "step": 17210 }, { "epoch": 1.0109193377949983, "grad_norm": 3.039210081100464, "learning_rate": 4.93711061368973e-05, "loss": 0.0182, "step": 17220 }, { "epoch": 1.0115063989667723, "grad_norm": 2.438016176223755, "learning_rate": 4.9324866655350706e-05, "loss": 0.0399, "step": 17230 }, { "epoch": 1.0120934601385465, "grad_norm": 1.5391802787780762, "learning_rate": 4.927862775130055e-05, "loss": 0.0259, "step": 17240 }, { "epoch": 1.0126805213103205, "grad_norm": 0.10367631912231445, "learning_rate": 4.923238946429876e-05, "loss": 0.0084, "step": 17250 }, { "epoch": 1.0132675824820947, "grad_norm": 0.026676097884774208, "learning_rate": 4.918615183389665e-05, "loss": 0.0143, "step": 17260 }, { "epoch": 1.0138546436538687, "grad_norm": 0.3782263994216919, "learning_rate": 4.9139914899645096e-05, "loss": 0.0095, "step": 17270 }, { "epoch": 1.0144417048256429, "grad_norm": 1.0386979579925537, "learning_rate": 4.909367870109424e-05, "loss": 0.0133, "step": 17280 }, { "epoch": 1.0150287659974169, "grad_norm": 1.7258808612823486, "learning_rate": 4.90474432777937e-05, "loss": 0.0163, "step": 17290 }, { "epoch": 1.015615827169191, "grad_norm": 0.11780045926570892, "learning_rate": 4.900120866929238e-05, "loss": 0.014, "step": 17300 }, { "epoch": 1.016202888340965, "grad_norm": 0.9828274846076965, "learning_rate": 4.89549749151385e-05, "loss": 0.0162, "step": 17310 }, { "epoch": 1.0167899495127393, "grad_norm": 0.4108709692955017, "learning_rate": 4.890874205487957e-05, "loss": 0.032, "step": 17320 }, { "epoch": 1.0173770106845132, "grad_norm": 0.6657062768936157, "learning_rate": 4.8862510128062284e-05, "loss": 0.0198, "step": 17330 }, { "epoch": 1.0179640718562875, "grad_norm": 0.655598521232605, "learning_rate": 4.881627917423261e-05, "loss": 0.0099, "step": 17340 }, { "epoch": 1.0185511330280614, "grad_norm": 0.5765482783317566, "learning_rate": 4.8770049232935575e-05, "loss": 0.012, "step": 17350 }, { "epoch": 1.0191381941998356, "grad_norm": 0.2732393145561218, "learning_rate": 4.8723820343715484e-05, "loss": 0.0265, "step": 17360 }, { "epoch": 1.0197252553716096, "grad_norm": 1.265217661857605, "learning_rate": 4.867759254611561e-05, "loss": 0.0215, "step": 17370 }, { "epoch": 1.0203123165433838, "grad_norm": 2.5435831546783447, "learning_rate": 4.8631365879678384e-05, "loss": 0.018, "step": 17380 }, { "epoch": 1.0208993777151578, "grad_norm": 1.1228276491165161, "learning_rate": 4.85851403839452e-05, "loss": 0.0135, "step": 17390 }, { "epoch": 1.021486438886932, "grad_norm": 1.6964856386184692, "learning_rate": 4.85389160984565e-05, "loss": 0.0135, "step": 17400 }, { "epoch": 1.022073500058706, "grad_norm": 2.8319220542907715, "learning_rate": 4.8492693062751675e-05, "loss": 0.0078, "step": 17410 }, { "epoch": 1.0226605612304802, "grad_norm": 1.0305736064910889, "learning_rate": 4.844647131636907e-05, "loss": 0.012, "step": 17420 }, { "epoch": 1.0232476224022544, "grad_norm": 0.9862768650054932, "learning_rate": 4.840025089884583e-05, "loss": 0.0285, "step": 17430 }, { "epoch": 1.0238346835740284, "grad_norm": 1.3408020734786987, "learning_rate": 4.8354031849718126e-05, "loss": 0.0184, "step": 17440 }, { "epoch": 1.0244217447458026, "grad_norm": 0.399573415517807, "learning_rate": 4.8307814208520806e-05, "loss": 0.0214, "step": 17450 }, { "epoch": 1.0250088059175766, "grad_norm": 3.679910182952881, "learning_rate": 4.82615980147876e-05, "loss": 0.0093, "step": 17460 }, { "epoch": 1.0255958670893508, "grad_norm": 1.3835197687149048, "learning_rate": 4.821538330805098e-05, "loss": 0.0151, "step": 17470 }, { "epoch": 1.0261829282611248, "grad_norm": 0.055284079164266586, "learning_rate": 4.816917012784213e-05, "loss": 0.0202, "step": 17480 }, { "epoch": 1.026769989432899, "grad_norm": 0.9927650094032288, "learning_rate": 4.812295851369096e-05, "loss": 0.0129, "step": 17490 }, { "epoch": 1.027357050604673, "grad_norm": 1.693311333656311, "learning_rate": 4.807674850512601e-05, "loss": 0.0172, "step": 17500 }, { "epoch": 1.0279441117764472, "grad_norm": 1.8846806287765503, "learning_rate": 4.803054014167447e-05, "loss": 0.0226, "step": 17510 }, { "epoch": 1.0285311729482212, "grad_norm": 1.4450979232788086, "learning_rate": 4.7984333462862066e-05, "loss": 0.0227, "step": 17520 }, { "epoch": 1.0291182341199954, "grad_norm": 0.4159143567085266, "learning_rate": 4.793812850821319e-05, "loss": 0.0224, "step": 17530 }, { "epoch": 1.0297052952917694, "grad_norm": 0.6583952307701111, "learning_rate": 4.789192531725066e-05, "loss": 0.0194, "step": 17540 }, { "epoch": 1.0302923564635436, "grad_norm": 1.147028923034668, "learning_rate": 4.784572392949583e-05, "loss": 0.0385, "step": 17550 }, { "epoch": 1.0308794176353175, "grad_norm": 1.676843523979187, "learning_rate": 4.77995243844685e-05, "loss": 0.023, "step": 17560 }, { "epoch": 1.0314664788070917, "grad_norm": 2.3612804412841797, "learning_rate": 4.775332672168691e-05, "loss": 0.016, "step": 17570 }, { "epoch": 1.0320535399788657, "grad_norm": 0.7377216815948486, "learning_rate": 4.770713098066765e-05, "loss": 0.0137, "step": 17580 }, { "epoch": 1.03264060115064, "grad_norm": 0.525375485420227, "learning_rate": 4.7660937200925726e-05, "loss": 0.0116, "step": 17590 }, { "epoch": 1.033227662322414, "grad_norm": 0.29516294598579407, "learning_rate": 4.7614745421974447e-05, "loss": 0.0085, "step": 17600 }, { "epoch": 1.0338147234941881, "grad_norm": 1.0876867771148682, "learning_rate": 4.7568555683325325e-05, "loss": 0.028, "step": 17610 }, { "epoch": 1.034401784665962, "grad_norm": 2.252126932144165, "learning_rate": 4.752236802448829e-05, "loss": 0.0176, "step": 17620 }, { "epoch": 1.0349888458377363, "grad_norm": 0.04606296122074127, "learning_rate": 4.747618248497134e-05, "loss": 0.0158, "step": 17630 }, { "epoch": 1.0355759070095103, "grad_norm": 2.971982717514038, "learning_rate": 4.742999910428075e-05, "loss": 0.0142, "step": 17640 }, { "epoch": 1.0361629681812845, "grad_norm": 0.5577654838562012, "learning_rate": 4.73838179219209e-05, "loss": 0.0238, "step": 17650 }, { "epoch": 1.0367500293530585, "grad_norm": 0.7843875288963318, "learning_rate": 4.7337638977394336e-05, "loss": 0.0108, "step": 17660 }, { "epoch": 1.0373370905248327, "grad_norm": 0.3187924325466156, "learning_rate": 4.729146231020164e-05, "loss": 0.0183, "step": 17670 }, { "epoch": 1.0379241516966067, "grad_norm": 2.111429214477539, "learning_rate": 4.724528795984151e-05, "loss": 0.0226, "step": 17680 }, { "epoch": 1.0385112128683809, "grad_norm": 0.011057616211473942, "learning_rate": 4.719911596581057e-05, "loss": 0.0071, "step": 17690 }, { "epoch": 1.039098274040155, "grad_norm": 1.7219300270080566, "learning_rate": 4.715294636760352e-05, "loss": 0.0129, "step": 17700 }, { "epoch": 1.039685335211929, "grad_norm": 0.33654457330703735, "learning_rate": 4.7106779204712946e-05, "loss": 0.0133, "step": 17710 }, { "epoch": 1.0402723963837033, "grad_norm": 0.4673077464103699, "learning_rate": 4.7060614516629396e-05, "loss": 0.0127, "step": 17720 }, { "epoch": 1.0408594575554773, "grad_norm": 1.9815365076065063, "learning_rate": 4.701445234284127e-05, "loss": 0.0263, "step": 17730 }, { "epoch": 1.0414465187272515, "grad_norm": 1.209466814994812, "learning_rate": 4.696829272283483e-05, "loss": 0.0153, "step": 17740 }, { "epoch": 1.0420335798990255, "grad_norm": 1.3966259956359863, "learning_rate": 4.6922135696094175e-05, "loss": 0.0092, "step": 17750 }, { "epoch": 1.0426206410707997, "grad_norm": 1.4317821264266968, "learning_rate": 4.687598130210112e-05, "loss": 0.0253, "step": 17760 }, { "epoch": 1.0432077022425736, "grad_norm": 0.2546725273132324, "learning_rate": 4.682982958033533e-05, "loss": 0.016, "step": 17770 }, { "epoch": 1.0437947634143478, "grad_norm": 2.701737880706787, "learning_rate": 4.678368057027407e-05, "loss": 0.0245, "step": 17780 }, { "epoch": 1.0443818245861218, "grad_norm": 2.189082145690918, "learning_rate": 4.6737534311392375e-05, "loss": 0.0196, "step": 17790 }, { "epoch": 1.044968885757896, "grad_norm": 4.402464389801025, "learning_rate": 4.669139084316286e-05, "loss": 0.0162, "step": 17800 }, { "epoch": 1.04555594692967, "grad_norm": 1.7337652444839478, "learning_rate": 4.664525020505582e-05, "loss": 0.0258, "step": 17810 }, { "epoch": 1.0461430081014442, "grad_norm": 0.08800406754016876, "learning_rate": 4.6599112436539075e-05, "loss": 0.012, "step": 17820 }, { "epoch": 1.0467300692732182, "grad_norm": 1.18183434009552, "learning_rate": 4.6552977577078035e-05, "loss": 0.0123, "step": 17830 }, { "epoch": 1.0473171304449924, "grad_norm": 0.5991613268852234, "learning_rate": 4.6506845666135546e-05, "loss": 0.0137, "step": 17840 }, { "epoch": 1.0479041916167664, "grad_norm": 0.9787329435348511, "learning_rate": 4.646071674317204e-05, "loss": 0.0205, "step": 17850 }, { "epoch": 1.0484912527885406, "grad_norm": 4.429852485656738, "learning_rate": 4.6414590847645305e-05, "loss": 0.0099, "step": 17860 }, { "epoch": 1.0490783139603146, "grad_norm": 0.5103155970573425, "learning_rate": 4.636846801901056e-05, "loss": 0.0348, "step": 17870 }, { "epoch": 1.0496653751320888, "grad_norm": 2.0082802772521973, "learning_rate": 4.632234829672045e-05, "loss": 0.0172, "step": 17880 }, { "epoch": 1.0502524363038628, "grad_norm": 0.44900891184806824, "learning_rate": 4.6276231720224885e-05, "loss": 0.0139, "step": 17890 }, { "epoch": 1.050839497475637, "grad_norm": 2.634812831878662, "learning_rate": 4.6230118328971156e-05, "loss": 0.0254, "step": 17900 }, { "epoch": 1.051426558647411, "grad_norm": 1.2553977966308594, "learning_rate": 4.618400816240376e-05, "loss": 0.0131, "step": 17910 }, { "epoch": 1.0520136198191852, "grad_norm": 0.8278115391731262, "learning_rate": 4.613790125996451e-05, "loss": 0.023, "step": 17920 }, { "epoch": 1.0526006809909592, "grad_norm": 0.04974241927266121, "learning_rate": 4.609179766109236e-05, "loss": 0.0121, "step": 17930 }, { "epoch": 1.0531877421627334, "grad_norm": 0.170747309923172, "learning_rate": 4.604569740522349e-05, "loss": 0.0142, "step": 17940 }, { "epoch": 1.0537748033345073, "grad_norm": 0.12599213421344757, "learning_rate": 4.599960053179117e-05, "loss": 0.0204, "step": 17950 }, { "epoch": 1.0543618645062816, "grad_norm": 1.180310845375061, "learning_rate": 4.595350708022583e-05, "loss": 0.0297, "step": 17960 }, { "epoch": 1.0549489256780558, "grad_norm": 0.9493981599807739, "learning_rate": 4.5907417089954926e-05, "loss": 0.0064, "step": 17970 }, { "epoch": 1.0555359868498297, "grad_norm": 2.6346616744995117, "learning_rate": 4.5861330600403e-05, "loss": 0.0104, "step": 17980 }, { "epoch": 1.056123048021604, "grad_norm": 1.2605183124542236, "learning_rate": 4.581524765099154e-05, "loss": 0.0128, "step": 17990 }, { "epoch": 1.056710109193378, "grad_norm": 0.6190035939216614, "learning_rate": 4.5769168281139066e-05, "loss": 0.0227, "step": 18000 }, { "epoch": 1.056710109193378, "eval_loss": 0.4979991912841797, "eval_runtime": 269.5923, "eval_samples_per_second": 3.505, "eval_steps_per_second": 3.505, "step": 18000 }, { "epoch": 1.0572971703651521, "grad_norm": 0.5389529466629028, "learning_rate": 4.572309253026101e-05, "loss": 0.0098, "step": 18010 }, { "epoch": 1.0578842315369261, "grad_norm": 1.576887607574463, "learning_rate": 4.56770204377697e-05, "loss": 0.025, "step": 18020 }, { "epoch": 1.0584712927087003, "grad_norm": 1.8304966688156128, "learning_rate": 4.5630952043074356e-05, "loss": 0.0161, "step": 18030 }, { "epoch": 1.0590583538804743, "grad_norm": 0.27343544363975525, "learning_rate": 4.5584887385581e-05, "loss": 0.0142, "step": 18040 }, { "epoch": 1.0596454150522485, "grad_norm": 2.4599697589874268, "learning_rate": 4.5538826504692496e-05, "loss": 0.0203, "step": 18050 }, { "epoch": 1.0602324762240225, "grad_norm": 0.04068051651120186, "learning_rate": 4.549276943980845e-05, "loss": 0.0294, "step": 18060 }, { "epoch": 1.0608195373957967, "grad_norm": 0.5054083466529846, "learning_rate": 4.544671623032522e-05, "loss": 0.008, "step": 18070 }, { "epoch": 1.0614065985675707, "grad_norm": 0.6975878477096558, "learning_rate": 4.540066691563587e-05, "loss": 0.007, "step": 18080 }, { "epoch": 1.061993659739345, "grad_norm": 0.965081512928009, "learning_rate": 4.535462153513012e-05, "loss": 0.0126, "step": 18090 }, { "epoch": 1.0625807209111189, "grad_norm": 0.7705340385437012, "learning_rate": 4.53085801281943e-05, "loss": 0.0104, "step": 18100 }, { "epoch": 1.063167782082893, "grad_norm": 0.8081434369087219, "learning_rate": 4.526254273421143e-05, "loss": 0.0131, "step": 18110 }, { "epoch": 1.063754843254667, "grad_norm": 0.7814608812332153, "learning_rate": 4.521650939256097e-05, "loss": 0.0089, "step": 18120 }, { "epoch": 1.0643419044264413, "grad_norm": 0.01808895170688629, "learning_rate": 4.517048014261902e-05, "loss": 0.0145, "step": 18130 }, { "epoch": 1.0649289655982153, "grad_norm": 1.0433502197265625, "learning_rate": 4.512445502375813e-05, "loss": 0.0093, "step": 18140 }, { "epoch": 1.0655160267699895, "grad_norm": 0.3834342658519745, "learning_rate": 4.507843407534732e-05, "loss": 0.0124, "step": 18150 }, { "epoch": 1.0661030879417634, "grad_norm": 0.3785993158817291, "learning_rate": 4.503241733675207e-05, "loss": 0.0299, "step": 18160 }, { "epoch": 1.0666901491135377, "grad_norm": 0.4766404330730438, "learning_rate": 4.498640484733421e-05, "loss": 0.0167, "step": 18170 }, { "epoch": 1.0672772102853116, "grad_norm": 1.4658899307250977, "learning_rate": 4.494039664645201e-05, "loss": 0.0163, "step": 18180 }, { "epoch": 1.0678642714570858, "grad_norm": 1.9271597862243652, "learning_rate": 4.4894392773459957e-05, "loss": 0.0112, "step": 18190 }, { "epoch": 1.0684513326288598, "grad_norm": 0.38466793298721313, "learning_rate": 4.4848393267708974e-05, "loss": 0.0179, "step": 18200 }, { "epoch": 1.069038393800634, "grad_norm": 1.8587846755981445, "learning_rate": 4.480239816854613e-05, "loss": 0.0275, "step": 18210 }, { "epoch": 1.069625454972408, "grad_norm": 0.025200162082910538, "learning_rate": 4.4756407515314804e-05, "loss": 0.0221, "step": 18220 }, { "epoch": 1.0702125161441822, "grad_norm": 0.8217595815658569, "learning_rate": 4.471042134735451e-05, "loss": 0.018, "step": 18230 }, { "epoch": 1.0707995773159564, "grad_norm": 0.7045888304710388, "learning_rate": 4.466443970400099e-05, "loss": 0.0091, "step": 18240 }, { "epoch": 1.0713866384877304, "grad_norm": 2.5180864334106445, "learning_rate": 4.461846262458606e-05, "loss": 0.0254, "step": 18250 }, { "epoch": 1.0719736996595046, "grad_norm": 0.5709795951843262, "learning_rate": 4.4572490148437686e-05, "loss": 0.005, "step": 18260 }, { "epoch": 1.0725607608312786, "grad_norm": 3.664907693862915, "learning_rate": 4.452652231487982e-05, "loss": 0.0124, "step": 18270 }, { "epoch": 1.0731478220030528, "grad_norm": 0.7889499664306641, "learning_rate": 4.448055916323249e-05, "loss": 0.0208, "step": 18280 }, { "epoch": 1.0737348831748268, "grad_norm": 2.400569438934326, "learning_rate": 4.443460073281178e-05, "loss": 0.0147, "step": 18290 }, { "epoch": 1.074321944346601, "grad_norm": 0.731299102306366, "learning_rate": 4.43886470629296e-05, "loss": 0.0149, "step": 18300 }, { "epoch": 1.074909005518375, "grad_norm": 0.11451888084411621, "learning_rate": 4.4342698192893904e-05, "loss": 0.0157, "step": 18310 }, { "epoch": 1.0754960666901492, "grad_norm": 2.5761590003967285, "learning_rate": 4.429675416200848e-05, "loss": 0.0304, "step": 18320 }, { "epoch": 1.0760831278619232, "grad_norm": 1.9438437223434448, "learning_rate": 4.4250815009573e-05, "loss": 0.0141, "step": 18330 }, { "epoch": 1.0766701890336974, "grad_norm": 0.7531579732894897, "learning_rate": 4.420488077488295e-05, "loss": 0.0378, "step": 18340 }, { "epoch": 1.0772572502054714, "grad_norm": 2.7101943492889404, "learning_rate": 4.415895149722964e-05, "loss": 0.0142, "step": 18350 }, { "epoch": 1.0778443113772456, "grad_norm": 0.9626311659812927, "learning_rate": 4.411302721590007e-05, "loss": 0.0174, "step": 18360 }, { "epoch": 1.0784313725490196, "grad_norm": 0.23843607306480408, "learning_rate": 4.406710797017706e-05, "loss": 0.0104, "step": 18370 }, { "epoch": 1.0790184337207938, "grad_norm": 1.6074140071868896, "learning_rate": 4.402119379933904e-05, "loss": 0.0164, "step": 18380 }, { "epoch": 1.0796054948925677, "grad_norm": 0.29586324095726013, "learning_rate": 4.3975284742660153e-05, "loss": 0.0195, "step": 18390 }, { "epoch": 1.080192556064342, "grad_norm": 1.8798375129699707, "learning_rate": 4.392938083941014e-05, "loss": 0.0043, "step": 18400 }, { "epoch": 1.080779617236116, "grad_norm": 0.8807462453842163, "learning_rate": 4.388348212885435e-05, "loss": 0.0142, "step": 18410 }, { "epoch": 1.0813666784078901, "grad_norm": 3.3922951221466064, "learning_rate": 4.383758865025368e-05, "loss": 0.0229, "step": 18420 }, { "epoch": 1.0819537395796641, "grad_norm": 0.4539323151111603, "learning_rate": 4.379170044286454e-05, "loss": 0.0177, "step": 18430 }, { "epoch": 1.0825408007514383, "grad_norm": 1.919954776763916, "learning_rate": 4.3745817545938874e-05, "loss": 0.0231, "step": 18440 }, { "epoch": 1.0831278619232123, "grad_norm": 3.1497795581817627, "learning_rate": 4.369993999872402e-05, "loss": 0.024, "step": 18450 }, { "epoch": 1.0837149230949865, "grad_norm": 0.14825113117694855, "learning_rate": 4.365406784046282e-05, "loss": 0.0156, "step": 18460 }, { "epoch": 1.0843019842667605, "grad_norm": 1.4273031949996948, "learning_rate": 4.360820111039341e-05, "loss": 0.0109, "step": 18470 }, { "epoch": 1.0848890454385347, "grad_norm": 0.04318694397807121, "learning_rate": 4.3562339847749376e-05, "loss": 0.0104, "step": 18480 }, { "epoch": 1.085476106610309, "grad_norm": 0.4969497621059418, "learning_rate": 4.3516484091759545e-05, "loss": 0.0149, "step": 18490 }, { "epoch": 1.086063167782083, "grad_norm": 2.145045042037964, "learning_rate": 4.347063388164812e-05, "loss": 0.0277, "step": 18500 }, { "epoch": 1.0866502289538569, "grad_norm": 0.2778224050998688, "learning_rate": 4.342478925663447e-05, "loss": 0.0181, "step": 18510 }, { "epoch": 1.087237290125631, "grad_norm": 4.340620994567871, "learning_rate": 4.3378950255933284e-05, "loss": 0.0125, "step": 18520 }, { "epoch": 1.0878243512974053, "grad_norm": 3.725788116455078, "learning_rate": 4.333311691875433e-05, "loss": 0.0214, "step": 18530 }, { "epoch": 1.0884114124691793, "grad_norm": 1.5069423913955688, "learning_rate": 4.3287289284302615e-05, "loss": 0.0317, "step": 18540 }, { "epoch": 1.0889984736409535, "grad_norm": 0.7962707281112671, "learning_rate": 4.324146739177824e-05, "loss": 0.0153, "step": 18550 }, { "epoch": 1.0895855348127275, "grad_norm": 0.17169342935085297, "learning_rate": 4.319565128037639e-05, "loss": 0.0155, "step": 18560 }, { "epoch": 1.0901725959845017, "grad_norm": 1.0419448614120483, "learning_rate": 4.314984098928733e-05, "loss": 0.0094, "step": 18570 }, { "epoch": 1.0907596571562757, "grad_norm": 0.0351492241024971, "learning_rate": 4.3104036557696295e-05, "loss": 0.0098, "step": 18580 }, { "epoch": 1.0913467183280499, "grad_norm": 1.296635389328003, "learning_rate": 4.305823802478357e-05, "loss": 0.022, "step": 18590 }, { "epoch": 1.0919337794998238, "grad_norm": 0.0528903566300869, "learning_rate": 4.301244542972435e-05, "loss": 0.0227, "step": 18600 }, { "epoch": 1.092520840671598, "grad_norm": 1.5245438814163208, "learning_rate": 4.2966658811688785e-05, "loss": 0.0143, "step": 18610 }, { "epoch": 1.093107901843372, "grad_norm": 0.5972466468811035, "learning_rate": 4.292087820984185e-05, "loss": 0.0081, "step": 18620 }, { "epoch": 1.0936949630151462, "grad_norm": 1.2663933038711548, "learning_rate": 4.287510366334346e-05, "loss": 0.0132, "step": 18630 }, { "epoch": 1.0942820241869202, "grad_norm": 1.925042986869812, "learning_rate": 4.282933521134827e-05, "loss": 0.018, "step": 18640 }, { "epoch": 1.0948690853586944, "grad_norm": 0.5491015315055847, "learning_rate": 4.2783572893005794e-05, "loss": 0.021, "step": 18650 }, { "epoch": 1.0954561465304684, "grad_norm": 0.7465754747390747, "learning_rate": 4.273781674746023e-05, "loss": 0.013, "step": 18660 }, { "epoch": 1.0960432077022426, "grad_norm": 0.1883556991815567, "learning_rate": 4.269206681385058e-05, "loss": 0.0157, "step": 18670 }, { "epoch": 1.0966302688740166, "grad_norm": 1.0537993907928467, "learning_rate": 4.264632313131041e-05, "loss": 0.0119, "step": 18680 }, { "epoch": 1.0972173300457908, "grad_norm": 1.8062794208526611, "learning_rate": 4.260058573896809e-05, "loss": 0.0274, "step": 18690 }, { "epoch": 1.0978043912175648, "grad_norm": 0.9525005221366882, "learning_rate": 4.255485467594647e-05, "loss": 0.0079, "step": 18700 }, { "epoch": 1.098391452389339, "grad_norm": 4.0878801345825195, "learning_rate": 4.250912998136307e-05, "loss": 0.008, "step": 18710 }, { "epoch": 1.098978513561113, "grad_norm": 1.1421128511428833, "learning_rate": 4.246341169432994e-05, "loss": 0.0169, "step": 18720 }, { "epoch": 1.0995655747328872, "grad_norm": 0.7710453867912292, "learning_rate": 4.241769985395365e-05, "loss": 0.03, "step": 18730 }, { "epoch": 1.1001526359046612, "grad_norm": 1.7933598756790161, "learning_rate": 4.2371994499335264e-05, "loss": 0.0208, "step": 18740 }, { "epoch": 1.1007396970764354, "grad_norm": 2.1244444847106934, "learning_rate": 4.232629566957026e-05, "loss": 0.0295, "step": 18750 }, { "epoch": 1.1013267582482094, "grad_norm": 1.4720721244812012, "learning_rate": 4.2280603403748606e-05, "loss": 0.0263, "step": 18760 }, { "epoch": 1.1019138194199836, "grad_norm": 2.264559507369995, "learning_rate": 4.223491774095455e-05, "loss": 0.0185, "step": 18770 }, { "epoch": 1.1025008805917578, "grad_norm": 0.14515529572963715, "learning_rate": 4.2189238720266826e-05, "loss": 0.0103, "step": 18780 }, { "epoch": 1.1030879417635318, "grad_norm": 0.01907486841082573, "learning_rate": 4.214356638075836e-05, "loss": 0.006, "step": 18790 }, { "epoch": 1.103675002935306, "grad_norm": 0.2551787197589874, "learning_rate": 4.2097900761496445e-05, "loss": 0.0114, "step": 18800 }, { "epoch": 1.10426206410708, "grad_norm": 0.6972945332527161, "learning_rate": 4.2052241901542576e-05, "loss": 0.0263, "step": 18810 }, { "epoch": 1.1048491252788541, "grad_norm": 0.8127758502960205, "learning_rate": 4.2006589839952526e-05, "loss": 0.0047, "step": 18820 }, { "epoch": 1.1054361864506281, "grad_norm": 0.3596905767917633, "learning_rate": 4.1960944615776175e-05, "loss": 0.0083, "step": 18830 }, { "epoch": 1.1060232476224023, "grad_norm": 0.02701782062649727, "learning_rate": 4.191530626805762e-05, "loss": 0.0136, "step": 18840 }, { "epoch": 1.1066103087941763, "grad_norm": 1.7618811130523682, "learning_rate": 4.186967483583505e-05, "loss": 0.0181, "step": 18850 }, { "epoch": 1.1071973699659505, "grad_norm": 1.2107560634613037, "learning_rate": 4.1824050358140724e-05, "loss": 0.0091, "step": 18860 }, { "epoch": 1.1077844311377245, "grad_norm": 0.12104026973247528, "learning_rate": 4.1778432874001006e-05, "loss": 0.0155, "step": 18870 }, { "epoch": 1.1083714923094987, "grad_norm": 0.543029248714447, "learning_rate": 4.173282242243618e-05, "loss": 0.0237, "step": 18880 }, { "epoch": 1.1089585534812727, "grad_norm": 0.8624221086502075, "learning_rate": 4.168721904246063e-05, "loss": 0.0101, "step": 18890 }, { "epoch": 1.109545614653047, "grad_norm": 2.7473998069763184, "learning_rate": 4.164162277308259e-05, "loss": 0.0217, "step": 18900 }, { "epoch": 1.110132675824821, "grad_norm": 2.060375928878784, "learning_rate": 4.15960336533043e-05, "loss": 0.0106, "step": 18910 }, { "epoch": 1.110719736996595, "grad_norm": 0.04229264706373215, "learning_rate": 4.1550451722121806e-05, "loss": 0.013, "step": 18920 }, { "epoch": 1.111306798168369, "grad_norm": 1.526538372039795, "learning_rate": 4.1504877018525065e-05, "loss": 0.0388, "step": 18930 }, { "epoch": 1.1118938593401433, "grad_norm": 1.4976556301116943, "learning_rate": 4.14593095814978e-05, "loss": 0.0176, "step": 18940 }, { "epoch": 1.1124809205119173, "grad_norm": 0.9924315214157104, "learning_rate": 4.141374945001758e-05, "loss": 0.0298, "step": 18950 }, { "epoch": 1.1130679816836915, "grad_norm": 0.9775465130805969, "learning_rate": 4.136819666305566e-05, "loss": 0.0219, "step": 18960 }, { "epoch": 1.1136550428554655, "grad_norm": 5.021305084228516, "learning_rate": 4.1322651259577064e-05, "loss": 0.0167, "step": 18970 }, { "epoch": 1.1142421040272397, "grad_norm": 1.727529525756836, "learning_rate": 4.1277113278540456e-05, "loss": 0.0064, "step": 18980 }, { "epoch": 1.1148291651990136, "grad_norm": 1.548048973083496, "learning_rate": 4.123158275889819e-05, "loss": 0.0134, "step": 18990 }, { "epoch": 1.1154162263707879, "grad_norm": 3.1047353744506836, "learning_rate": 4.118605973959623e-05, "loss": 0.0128, "step": 19000 }, { "epoch": 1.1160032875425618, "grad_norm": 0.6265288591384888, "learning_rate": 4.11405442595741e-05, "loss": 0.0108, "step": 19010 }, { "epoch": 1.116590348714336, "grad_norm": 1.5217883586883545, "learning_rate": 4.1095036357764915e-05, "loss": 0.0109, "step": 19020 }, { "epoch": 1.11717740988611, "grad_norm": 0.46668195724487305, "learning_rate": 4.104953607309524e-05, "loss": 0.0198, "step": 19030 }, { "epoch": 1.1177644710578842, "grad_norm": 1.9134923219680786, "learning_rate": 4.100404344448522e-05, "loss": 0.0127, "step": 19040 }, { "epoch": 1.1183515322296582, "grad_norm": 0.603248119354248, "learning_rate": 4.095855851084836e-05, "loss": 0.0154, "step": 19050 }, { "epoch": 1.1189385934014324, "grad_norm": 1.7319767475128174, "learning_rate": 4.091308131109165e-05, "loss": 0.0137, "step": 19060 }, { "epoch": 1.1195256545732066, "grad_norm": 0.5494726896286011, "learning_rate": 4.086761188411541e-05, "loss": 0.0087, "step": 19070 }, { "epoch": 1.1201127157449806, "grad_norm": 0.5562161207199097, "learning_rate": 4.082215026881337e-05, "loss": 0.0085, "step": 19080 }, { "epoch": 1.1206997769167548, "grad_norm": 1.4518531560897827, "learning_rate": 4.0776696504072506e-05, "loss": 0.0089, "step": 19090 }, { "epoch": 1.1212868380885288, "grad_norm": 0.7333649396896362, "learning_rate": 4.073125062877317e-05, "loss": 0.014, "step": 19100 }, { "epoch": 1.121873899260303, "grad_norm": 0.05590398982167244, "learning_rate": 4.068581268178886e-05, "loss": 0.0162, "step": 19110 }, { "epoch": 1.122460960432077, "grad_norm": 0.5153812170028687, "learning_rate": 4.064038270198638e-05, "loss": 0.0231, "step": 19120 }, { "epoch": 1.1230480216038512, "grad_norm": 0.26842015981674194, "learning_rate": 4.05949607282257e-05, "loss": 0.0228, "step": 19130 }, { "epoch": 1.1236350827756252, "grad_norm": 0.20137840509414673, "learning_rate": 4.054954679935988e-05, "loss": 0.0166, "step": 19140 }, { "epoch": 1.1242221439473994, "grad_norm": 0.9210729002952576, "learning_rate": 4.05041409542352e-05, "loss": 0.0129, "step": 19150 }, { "epoch": 1.1248092051191734, "grad_norm": 0.48231643438339233, "learning_rate": 4.0458743231690925e-05, "loss": 0.0107, "step": 19160 }, { "epoch": 1.1253962662909476, "grad_norm": 4.762882232666016, "learning_rate": 4.041335367055945e-05, "loss": 0.0263, "step": 19170 }, { "epoch": 1.1259833274627216, "grad_norm": 3.452972650527954, "learning_rate": 4.0367972309666145e-05, "loss": 0.0136, "step": 19180 }, { "epoch": 1.1265703886344958, "grad_norm": 0.3142324388027191, "learning_rate": 4.03225991878294e-05, "loss": 0.0109, "step": 19190 }, { "epoch": 1.1271574498062698, "grad_norm": 0.8441796898841858, "learning_rate": 4.027723434386049e-05, "loss": 0.0143, "step": 19200 }, { "epoch": 1.127744510978044, "grad_norm": 2.2673232555389404, "learning_rate": 4.0231877816563695e-05, "loss": 0.0229, "step": 19210 }, { "epoch": 1.128331572149818, "grad_norm": 0.729567289352417, "learning_rate": 4.0186529644736114e-05, "loss": 0.015, "step": 19220 }, { "epoch": 1.1289186333215921, "grad_norm": 0.5330530405044556, "learning_rate": 4.014118986716776e-05, "loss": 0.0157, "step": 19230 }, { "epoch": 1.1295056944933661, "grad_norm": 0.33814314007759094, "learning_rate": 4.0095858522641394e-05, "loss": 0.0214, "step": 19240 }, { "epoch": 1.1300927556651403, "grad_norm": 2.5041515827178955, "learning_rate": 4.005053564993261e-05, "loss": 0.0258, "step": 19250 }, { "epoch": 1.1306798168369143, "grad_norm": 0.012002293951809406, "learning_rate": 4.000522128780978e-05, "loss": 0.0186, "step": 19260 }, { "epoch": 1.1312668780086885, "grad_norm": 0.7148932218551636, "learning_rate": 3.995991547503392e-05, "loss": 0.0102, "step": 19270 }, { "epoch": 1.1318539391804625, "grad_norm": 2.050722122192383, "learning_rate": 3.991461825035882e-05, "loss": 0.0107, "step": 19280 }, { "epoch": 1.1324410003522367, "grad_norm": 0.2507849633693695, "learning_rate": 3.986932965253081e-05, "loss": 0.0095, "step": 19290 }, { "epoch": 1.1330280615240107, "grad_norm": 1.851148009300232, "learning_rate": 3.9824049720289e-05, "loss": 0.0164, "step": 19300 }, { "epoch": 1.133615122695785, "grad_norm": 1.9028648138046265, "learning_rate": 3.9778778492364924e-05, "loss": 0.0159, "step": 19310 }, { "epoch": 1.134202183867559, "grad_norm": 1.4246652126312256, "learning_rate": 3.973351600748278e-05, "loss": 0.0145, "step": 19320 }, { "epoch": 1.134789245039333, "grad_norm": 0.08715999126434326, "learning_rate": 3.968826230435923e-05, "loss": 0.0141, "step": 19330 }, { "epoch": 1.135376306211107, "grad_norm": 2.2939882278442383, "learning_rate": 3.964301742170349e-05, "loss": 0.0158, "step": 19340 }, { "epoch": 1.1359633673828813, "grad_norm": 0.3845359683036804, "learning_rate": 3.9597781398217135e-05, "loss": 0.0194, "step": 19350 }, { "epoch": 1.1365504285546555, "grad_norm": 0.37646591663360596, "learning_rate": 3.9552554272594256e-05, "loss": 0.0114, "step": 19360 }, { "epoch": 1.1371374897264295, "grad_norm": 1.6800992488861084, "learning_rate": 3.9507336083521256e-05, "loss": 0.0188, "step": 19370 }, { "epoch": 1.1377245508982037, "grad_norm": 2.078392267227173, "learning_rate": 3.946212686967696e-05, "loss": 0.0309, "step": 19380 }, { "epoch": 1.1383116120699777, "grad_norm": 4.602158546447754, "learning_rate": 3.9416926669732454e-05, "loss": 0.0238, "step": 19390 }, { "epoch": 1.1388986732417519, "grad_norm": 2.036029577255249, "learning_rate": 3.937173552235117e-05, "loss": 0.0252, "step": 19400 }, { "epoch": 1.1394857344135259, "grad_norm": 1.749767541885376, "learning_rate": 3.932655346618876e-05, "loss": 0.013, "step": 19410 }, { "epoch": 1.1400727955853, "grad_norm": 1.1671046018600464, "learning_rate": 3.9281380539893114e-05, "loss": 0.031, "step": 19420 }, { "epoch": 1.140659856757074, "grad_norm": 0.6031616926193237, "learning_rate": 3.923621678210432e-05, "loss": 0.0217, "step": 19430 }, { "epoch": 1.1412469179288482, "grad_norm": 2.466723680496216, "learning_rate": 3.9191062231454586e-05, "loss": 0.0246, "step": 19440 }, { "epoch": 1.1418339791006222, "grad_norm": 1.7094677686691284, "learning_rate": 3.914591692656831e-05, "loss": 0.0166, "step": 19450 }, { "epoch": 1.1424210402723964, "grad_norm": 7.691342830657959, "learning_rate": 3.9100780906061896e-05, "loss": 0.0207, "step": 19460 }, { "epoch": 1.1430081014441704, "grad_norm": 2.8758277893066406, "learning_rate": 3.905565420854388e-05, "loss": 0.0282, "step": 19470 }, { "epoch": 1.1435951626159446, "grad_norm": 1.252706527709961, "learning_rate": 3.901053687261479e-05, "loss": 0.0086, "step": 19480 }, { "epoch": 1.1441822237877186, "grad_norm": 0.2504454553127289, "learning_rate": 3.896542893686716e-05, "loss": 0.0151, "step": 19490 }, { "epoch": 1.1447692849594928, "grad_norm": 1.5650849342346191, "learning_rate": 3.892033043988547e-05, "loss": 0.0175, "step": 19500 }, { "epoch": 1.1453563461312668, "grad_norm": 0.6675151586532593, "learning_rate": 3.887524142024614e-05, "loss": 0.0259, "step": 19510 }, { "epoch": 1.145943407303041, "grad_norm": 0.17080461978912354, "learning_rate": 3.883016191651744e-05, "loss": 0.0042, "step": 19520 }, { "epoch": 1.146530468474815, "grad_norm": 2.4723546504974365, "learning_rate": 3.878509196725957e-05, "loss": 0.0171, "step": 19530 }, { "epoch": 1.1471175296465892, "grad_norm": 0.2739218473434448, "learning_rate": 3.874003161102453e-05, "loss": 0.0092, "step": 19540 }, { "epoch": 1.1477045908183632, "grad_norm": 0.8519219756126404, "learning_rate": 3.869498088635608e-05, "loss": 0.0179, "step": 19550 }, { "epoch": 1.1482916519901374, "grad_norm": 1.4947580099105835, "learning_rate": 3.864993983178978e-05, "loss": 0.0196, "step": 19560 }, { "epoch": 1.1488787131619116, "grad_norm": 0.4917875826358795, "learning_rate": 3.860490848585291e-05, "loss": 0.0158, "step": 19570 }, { "epoch": 1.1494657743336856, "grad_norm": 3.007902145385742, "learning_rate": 3.8559886887064434e-05, "loss": 0.0156, "step": 19580 }, { "epoch": 1.1500528355054596, "grad_norm": 0.5176740288734436, "learning_rate": 3.851487507393498e-05, "loss": 0.017, "step": 19590 }, { "epoch": 1.1506398966772338, "grad_norm": 0.7068972587585449, "learning_rate": 3.846987308496686e-05, "loss": 0.0112, "step": 19600 }, { "epoch": 1.151226957849008, "grad_norm": 0.1920938789844513, "learning_rate": 3.8424880958653855e-05, "loss": 0.0118, "step": 19610 }, { "epoch": 1.151814019020782, "grad_norm": 1.1475507020950317, "learning_rate": 3.8379898733481455e-05, "loss": 0.0177, "step": 19620 }, { "epoch": 1.1524010801925562, "grad_norm": 0.03571141138672829, "learning_rate": 3.8334926447926576e-05, "loss": 0.0098, "step": 19630 }, { "epoch": 1.1529881413643301, "grad_norm": 0.08643414825201035, "learning_rate": 3.82899641404577e-05, "loss": 0.004, "step": 19640 }, { "epoch": 1.1535752025361043, "grad_norm": 0.038801468908786774, "learning_rate": 3.8245011849534724e-05, "loss": 0.0373, "step": 19650 }, { "epoch": 1.1541622637078783, "grad_norm": 0.48830196261405945, "learning_rate": 3.820006961360901e-05, "loss": 0.0254, "step": 19660 }, { "epoch": 1.1547493248796525, "grad_norm": 2.7519326210021973, "learning_rate": 3.8155137471123294e-05, "loss": 0.0173, "step": 19670 }, { "epoch": 1.1553363860514265, "grad_norm": 0.008496723137795925, "learning_rate": 3.8110215460511696e-05, "loss": 0.01, "step": 19680 }, { "epoch": 1.1559234472232007, "grad_norm": 2.1455023288726807, "learning_rate": 3.806530362019969e-05, "loss": 0.0092, "step": 19690 }, { "epoch": 1.1565105083949747, "grad_norm": 1.0149089097976685, "learning_rate": 3.802040198860397e-05, "loss": 0.0043, "step": 19700 }, { "epoch": 1.157097569566749, "grad_norm": 3.4099552631378174, "learning_rate": 3.7975510604132626e-05, "loss": 0.0148, "step": 19710 }, { "epoch": 1.157684630738523, "grad_norm": 0.7367317080497742, "learning_rate": 3.793062950518484e-05, "loss": 0.0236, "step": 19720 }, { "epoch": 1.158271691910297, "grad_norm": 0.44443872570991516, "learning_rate": 3.788575873015111e-05, "loss": 0.0103, "step": 19730 }, { "epoch": 1.158858753082071, "grad_norm": 2.0920252799987793, "learning_rate": 3.7840898317413034e-05, "loss": 0.0193, "step": 19740 }, { "epoch": 1.1594458142538453, "grad_norm": 1.7599064111709595, "learning_rate": 3.7796048305343383e-05, "loss": 0.035, "step": 19750 }, { "epoch": 1.1600328754256193, "grad_norm": 0.644555926322937, "learning_rate": 3.7751208732306015e-05, "loss": 0.0139, "step": 19760 }, { "epoch": 1.1606199365973935, "grad_norm": 0.03693939745426178, "learning_rate": 3.770637963665589e-05, "loss": 0.0046, "step": 19770 }, { "epoch": 1.1612069977691675, "grad_norm": 0.42754417657852173, "learning_rate": 3.766156105673891e-05, "loss": 0.0121, "step": 19780 }, { "epoch": 1.1617940589409417, "grad_norm": 0.2325069159269333, "learning_rate": 3.761675303089213e-05, "loss": 0.0165, "step": 19790 }, { "epoch": 1.1623811201127157, "grad_norm": 2.635518789291382, "learning_rate": 3.757195559744345e-05, "loss": 0.0196, "step": 19800 }, { "epoch": 1.1629681812844899, "grad_norm": 0.5041676759719849, "learning_rate": 3.7527168794711764e-05, "loss": 0.0189, "step": 19810 }, { "epoch": 1.1635552424562638, "grad_norm": 2.850172281265259, "learning_rate": 3.748239266100689e-05, "loss": 0.0192, "step": 19820 }, { "epoch": 1.164142303628038, "grad_norm": 0.2840389907360077, "learning_rate": 3.7437627234629464e-05, "loss": 0.0224, "step": 19830 }, { "epoch": 1.164729364799812, "grad_norm": 0.43667224049568176, "learning_rate": 3.7392872553871025e-05, "loss": 0.0141, "step": 19840 }, { "epoch": 1.1653164259715862, "grad_norm": 2.1170408725738525, "learning_rate": 3.7348128657013864e-05, "loss": 0.0112, "step": 19850 }, { "epoch": 1.1659034871433605, "grad_norm": 2.672410249710083, "learning_rate": 3.730339558233111e-05, "loss": 0.01, "step": 19860 }, { "epoch": 1.1664905483151344, "grad_norm": 2.02648663520813, "learning_rate": 3.7258673368086545e-05, "loss": 0.0269, "step": 19870 }, { "epoch": 1.1670776094869084, "grad_norm": 1.8439985513687134, "learning_rate": 3.721396205253478e-05, "loss": 0.0119, "step": 19880 }, { "epoch": 1.1676646706586826, "grad_norm": 0.4652295410633087, "learning_rate": 3.716926167392098e-05, "loss": 0.0123, "step": 19890 }, { "epoch": 1.1682517318304568, "grad_norm": 2.3301098346710205, "learning_rate": 3.7124572270481056e-05, "loss": 0.0155, "step": 19900 }, { "epoch": 1.1688387930022308, "grad_norm": 4.59347677230835, "learning_rate": 3.707989388044146e-05, "loss": 0.0175, "step": 19910 }, { "epoch": 1.169425854174005, "grad_norm": 0.5529350638389587, "learning_rate": 3.7035226542019275e-05, "loss": 0.0174, "step": 19920 }, { "epoch": 1.170012915345779, "grad_norm": 2.0525143146514893, "learning_rate": 3.699057029342209e-05, "loss": 0.0401, "step": 19930 }, { "epoch": 1.1705999765175532, "grad_norm": 0.05721009150147438, "learning_rate": 3.6945925172848054e-05, "loss": 0.0107, "step": 19940 }, { "epoch": 1.1711870376893272, "grad_norm": 1.8482041358947754, "learning_rate": 3.6901291218485725e-05, "loss": 0.0263, "step": 19950 }, { "epoch": 1.1717740988611014, "grad_norm": 2.0639731884002686, "learning_rate": 3.685666846851417e-05, "loss": 0.0158, "step": 19960 }, { "epoch": 1.1723611600328754, "grad_norm": 1.4494773149490356, "learning_rate": 3.6812056961102894e-05, "loss": 0.0161, "step": 19970 }, { "epoch": 1.1729482212046496, "grad_norm": 0.05280623957514763, "learning_rate": 3.67674567344117e-05, "loss": 0.0091, "step": 19980 }, { "epoch": 1.1735352823764236, "grad_norm": 0.8740899562835693, "learning_rate": 3.672286782659081e-05, "loss": 0.0123, "step": 19990 }, { "epoch": 1.1741223435481978, "grad_norm": 1.8102203607559204, "learning_rate": 3.6678290275780724e-05, "loss": 0.0191, "step": 20000 }, { "epoch": 1.1747094047199718, "grad_norm": 1.0259355306625366, "learning_rate": 3.6633724120112274e-05, "loss": 0.0257, "step": 20010 }, { "epoch": 1.175296465891746, "grad_norm": 1.0387524366378784, "learning_rate": 3.658916939770649e-05, "loss": 0.019, "step": 20020 }, { "epoch": 1.17588352706352, "grad_norm": 0.5713001489639282, "learning_rate": 3.6544626146674685e-05, "loss": 0.0115, "step": 20030 }, { "epoch": 1.1764705882352942, "grad_norm": 0.6265963912010193, "learning_rate": 3.650009440511828e-05, "loss": 0.0149, "step": 20040 }, { "epoch": 1.1770576494070681, "grad_norm": 0.13561226427555084, "learning_rate": 3.645557421112893e-05, "loss": 0.0274, "step": 20050 }, { "epoch": 1.1776447105788423, "grad_norm": 2.2962963581085205, "learning_rate": 3.641106560278834e-05, "loss": 0.0159, "step": 20060 }, { "epoch": 1.1782317717506163, "grad_norm": 2.1667020320892334, "learning_rate": 3.636656861816838e-05, "loss": 0.0137, "step": 20070 }, { "epoch": 1.1788188329223905, "grad_norm": 2.4629743099212646, "learning_rate": 3.632208329533092e-05, "loss": 0.0284, "step": 20080 }, { "epoch": 1.1794058940941645, "grad_norm": 1.6178267002105713, "learning_rate": 3.627760967232788e-05, "loss": 0.0083, "step": 20090 }, { "epoch": 1.1799929552659387, "grad_norm": 1.4193296432495117, "learning_rate": 3.6233147787201175e-05, "loss": 0.0068, "step": 20100 }, { "epoch": 1.180580016437713, "grad_norm": 0.6253151893615723, "learning_rate": 3.618869767798263e-05, "loss": 0.0141, "step": 20110 }, { "epoch": 1.181167077609487, "grad_norm": 3.456434726715088, "learning_rate": 3.6144259382694114e-05, "loss": 0.0264, "step": 20120 }, { "epoch": 1.181754138781261, "grad_norm": 0.8652618527412415, "learning_rate": 3.6099832939347237e-05, "loss": 0.0113, "step": 20130 }, { "epoch": 1.182341199953035, "grad_norm": 2.624691963195801, "learning_rate": 3.605541838594359e-05, "loss": 0.0165, "step": 20140 }, { "epoch": 1.1829282611248093, "grad_norm": 0.5651686191558838, "learning_rate": 3.6011015760474534e-05, "loss": 0.003, "step": 20150 }, { "epoch": 1.1835153222965833, "grad_norm": 0.3492438793182373, "learning_rate": 3.596662510092126e-05, "loss": 0.0137, "step": 20160 }, { "epoch": 1.1841023834683573, "grad_norm": 1.9067304134368896, "learning_rate": 3.5922246445254706e-05, "loss": 0.0134, "step": 20170 }, { "epoch": 1.1846894446401315, "grad_norm": 0.31494903564453125, "learning_rate": 3.587787983143554e-05, "loss": 0.0054, "step": 20180 }, { "epoch": 1.1852765058119057, "grad_norm": 0.13532504439353943, "learning_rate": 3.583352529741413e-05, "loss": 0.0109, "step": 20190 }, { "epoch": 1.1858635669836797, "grad_norm": 1.6830731630325317, "learning_rate": 3.578918288113055e-05, "loss": 0.014, "step": 20200 }, { "epoch": 1.1864506281554539, "grad_norm": 0.5527680516242981, "learning_rate": 3.5744852620514415e-05, "loss": 0.0125, "step": 20210 }, { "epoch": 1.1870376893272279, "grad_norm": 1.511231541633606, "learning_rate": 3.570053455348502e-05, "loss": 0.0137, "step": 20220 }, { "epoch": 1.187624750499002, "grad_norm": 0.7616772055625916, "learning_rate": 3.565622871795127e-05, "loss": 0.0256, "step": 20230 }, { "epoch": 1.188211811670776, "grad_norm": 2.778029203414917, "learning_rate": 3.561193515181147e-05, "loss": 0.0195, "step": 20240 }, { "epoch": 1.1887988728425503, "grad_norm": 2.1403579711914062, "learning_rate": 3.5567653892953564e-05, "loss": 0.0198, "step": 20250 }, { "epoch": 1.1893859340143242, "grad_norm": 0.7490493655204773, "learning_rate": 3.552338497925488e-05, "loss": 0.0281, "step": 20260 }, { "epoch": 1.1899729951860984, "grad_norm": 1.7620073556900024, "learning_rate": 3.5479128448582246e-05, "loss": 0.0113, "step": 20270 }, { "epoch": 1.1905600563578724, "grad_norm": 2.1982274055480957, "learning_rate": 3.543488433879184e-05, "loss": 0.018, "step": 20280 }, { "epoch": 1.1911471175296466, "grad_norm": 1.6664766073226929, "learning_rate": 3.539065268772929e-05, "loss": 0.0135, "step": 20290 }, { "epoch": 1.1917341787014206, "grad_norm": 4.160362243652344, "learning_rate": 3.5346433533229474e-05, "loss": 0.0184, "step": 20300 }, { "epoch": 1.1923212398731948, "grad_norm": 0.6458218693733215, "learning_rate": 3.530222691311666e-05, "loss": 0.0259, "step": 20310 }, { "epoch": 1.1929083010449688, "grad_norm": 1.137535810470581, "learning_rate": 3.525803286520437e-05, "loss": 0.0142, "step": 20320 }, { "epoch": 1.193495362216743, "grad_norm": 0.21634048223495483, "learning_rate": 3.521385142729535e-05, "loss": 0.0139, "step": 20330 }, { "epoch": 1.194082423388517, "grad_norm": 4.346113204956055, "learning_rate": 3.516968263718159e-05, "loss": 0.0234, "step": 20340 }, { "epoch": 1.1946694845602912, "grad_norm": 1.4190869331359863, "learning_rate": 3.512552653264425e-05, "loss": 0.0157, "step": 20350 }, { "epoch": 1.1952565457320652, "grad_norm": 1.1790575981140137, "learning_rate": 3.5081383151453604e-05, "loss": 0.0126, "step": 20360 }, { "epoch": 1.1958436069038394, "grad_norm": 0.6182461977005005, "learning_rate": 3.5037252531369104e-05, "loss": 0.0161, "step": 20370 }, { "epoch": 1.1964306680756134, "grad_norm": 1.2333548069000244, "learning_rate": 3.499313471013928e-05, "loss": 0.0134, "step": 20380 }, { "epoch": 1.1970177292473876, "grad_norm": 0.6007595658302307, "learning_rate": 3.494902972550165e-05, "loss": 0.0224, "step": 20390 }, { "epoch": 1.1976047904191618, "grad_norm": 1.6859315633773804, "learning_rate": 3.490493761518281e-05, "loss": 0.018, "step": 20400 }, { "epoch": 1.1981918515909358, "grad_norm": 0.025474315509200096, "learning_rate": 3.486085841689832e-05, "loss": 0.0114, "step": 20410 }, { "epoch": 1.1987789127627098, "grad_norm": 1.8451062440872192, "learning_rate": 3.481679216835273e-05, "loss": 0.0178, "step": 20420 }, { "epoch": 1.199365973934484, "grad_norm": 2.9829752445220947, "learning_rate": 3.477273890723944e-05, "loss": 0.0374, "step": 20430 }, { "epoch": 1.1999530351062582, "grad_norm": 0.15790514647960663, "learning_rate": 3.4728698671240854e-05, "loss": 0.0233, "step": 20440 }, { "epoch": 1.2005400962780322, "grad_norm": 0.18786346912384033, "learning_rate": 3.468467149802808e-05, "loss": 0.0217, "step": 20450 }, { "epoch": 1.2011271574498064, "grad_norm": 2.4212985038757324, "learning_rate": 3.4640657425261224e-05, "loss": 0.0141, "step": 20460 }, { "epoch": 1.2017142186215803, "grad_norm": 1.8529967069625854, "learning_rate": 3.459665649058904e-05, "loss": 0.0146, "step": 20470 }, { "epoch": 1.2023012797933545, "grad_norm": 0.6335251927375793, "learning_rate": 3.455266873164914e-05, "loss": 0.02, "step": 20480 }, { "epoch": 1.2028883409651285, "grad_norm": 2.4057374000549316, "learning_rate": 3.45086941860678e-05, "loss": 0.0341, "step": 20490 }, { "epoch": 1.2034754021369027, "grad_norm": 2.200655937194824, "learning_rate": 3.446473289146006e-05, "loss": 0.0188, "step": 20500 }, { "epoch": 1.2040624633086767, "grad_norm": 0.0019765030592679977, "learning_rate": 3.442078488542957e-05, "loss": 0.0069, "step": 20510 }, { "epoch": 1.204649524480451, "grad_norm": 0.9872438311576843, "learning_rate": 3.437685020556864e-05, "loss": 0.0168, "step": 20520 }, { "epoch": 1.205236585652225, "grad_norm": 1.3100894689559937, "learning_rate": 3.433292888945818e-05, "loss": 0.0077, "step": 20530 }, { "epoch": 1.2058236468239991, "grad_norm": 1.1229948997497559, "learning_rate": 3.428902097466764e-05, "loss": 0.0103, "step": 20540 }, { "epoch": 1.206410707995773, "grad_norm": 1.789831280708313, "learning_rate": 3.424512649875506e-05, "loss": 0.0126, "step": 20550 }, { "epoch": 1.2069977691675473, "grad_norm": 0.010228688828647137, "learning_rate": 3.420124549926693e-05, "loss": 0.016, "step": 20560 }, { "epoch": 1.2075848303393213, "grad_norm": 1.4194481372833252, "learning_rate": 3.4157378013738264e-05, "loss": 0.0193, "step": 20570 }, { "epoch": 1.2081718915110955, "grad_norm": 2.608931303024292, "learning_rate": 3.411352407969245e-05, "loss": 0.0083, "step": 20580 }, { "epoch": 1.2087589526828695, "grad_norm": 2.1073362827301025, "learning_rate": 3.406968373464137e-05, "loss": 0.0138, "step": 20590 }, { "epoch": 1.2093460138546437, "grad_norm": 1.5467884540557861, "learning_rate": 3.402585701608519e-05, "loss": 0.0229, "step": 20600 }, { "epoch": 1.2099330750264177, "grad_norm": 0.020903684198856354, "learning_rate": 3.398204396151251e-05, "loss": 0.0077, "step": 20610 }, { "epoch": 1.2105201361981919, "grad_norm": 0.7192597985267639, "learning_rate": 3.3938244608400164e-05, "loss": 0.0156, "step": 20620 }, { "epoch": 1.2111071973699659, "grad_norm": 0.054821472615003586, "learning_rate": 3.389445899421332e-05, "loss": 0.0069, "step": 20630 }, { "epoch": 1.21169425854174, "grad_norm": 2.3048887252807617, "learning_rate": 3.385068715640536e-05, "loss": 0.0111, "step": 20640 }, { "epoch": 1.2122813197135143, "grad_norm": 0.7605149149894714, "learning_rate": 3.380692913241791e-05, "loss": 0.0277, "step": 20650 }, { "epoch": 1.2128683808852883, "grad_norm": 1.7066093683242798, "learning_rate": 3.376318495968076e-05, "loss": 0.0124, "step": 20660 }, { "epoch": 1.2134554420570622, "grad_norm": 1.971728801727295, "learning_rate": 3.371945467561186e-05, "loss": 0.0226, "step": 20670 }, { "epoch": 1.2140425032288364, "grad_norm": 0.8244488835334778, "learning_rate": 3.367573831761728e-05, "loss": 0.0118, "step": 20680 }, { "epoch": 1.2146295644006107, "grad_norm": 0.2647351324558258, "learning_rate": 3.363203592309117e-05, "loss": 0.0118, "step": 20690 }, { "epoch": 1.2152166255723846, "grad_norm": 5.359185218811035, "learning_rate": 3.358834752941576e-05, "loss": 0.0209, "step": 20700 }, { "epoch": 1.2158036867441586, "grad_norm": 2.0620193481445312, "learning_rate": 3.354467317396124e-05, "loss": 0.0158, "step": 20710 }, { "epoch": 1.2163907479159328, "grad_norm": 0.27595534920692444, "learning_rate": 3.35010128940859e-05, "loss": 0.018, "step": 20720 }, { "epoch": 1.216977809087707, "grad_norm": 0.4861520528793335, "learning_rate": 3.345736672713588e-05, "loss": 0.0154, "step": 20730 }, { "epoch": 1.217564870259481, "grad_norm": 5.281528949737549, "learning_rate": 3.341373471044531e-05, "loss": 0.0187, "step": 20740 }, { "epoch": 1.2181519314312552, "grad_norm": 2.6023237705230713, "learning_rate": 3.33701168813362e-05, "loss": 0.016, "step": 20750 }, { "epoch": 1.2187389926030292, "grad_norm": 1.3135600090026855, "learning_rate": 3.3326513277118446e-05, "loss": 0.0141, "step": 20760 }, { "epoch": 1.2193260537748034, "grad_norm": 1.202734112739563, "learning_rate": 3.328292393508972e-05, "loss": 0.0191, "step": 20770 }, { "epoch": 1.2199131149465774, "grad_norm": 1.1474391222000122, "learning_rate": 3.323934889253556e-05, "loss": 0.0163, "step": 20780 }, { "epoch": 1.2205001761183516, "grad_norm": 3.3902037143707275, "learning_rate": 3.3195788186729245e-05, "loss": 0.0087, "step": 20790 }, { "epoch": 1.2210872372901256, "grad_norm": 1.333632230758667, "learning_rate": 3.315224185493176e-05, "loss": 0.0159, "step": 20800 }, { "epoch": 1.2216742984618998, "grad_norm": 0.49751606583595276, "learning_rate": 3.310870993439187e-05, "loss": 0.0128, "step": 20810 }, { "epoch": 1.2222613596336738, "grad_norm": 0.791757345199585, "learning_rate": 3.3065192462345915e-05, "loss": 0.0151, "step": 20820 }, { "epoch": 1.222848420805448, "grad_norm": 0.08929609507322311, "learning_rate": 3.302168947601797e-05, "loss": 0.0184, "step": 20830 }, { "epoch": 1.223435481977222, "grad_norm": 0.20553001761436462, "learning_rate": 3.297820101261964e-05, "loss": 0.022, "step": 20840 }, { "epoch": 1.2240225431489962, "grad_norm": 0.8915656805038452, "learning_rate": 3.293472710935017e-05, "loss": 0.0148, "step": 20850 }, { "epoch": 1.2246096043207702, "grad_norm": 0.42689603567123413, "learning_rate": 3.289126780339631e-05, "loss": 0.0131, "step": 20860 }, { "epoch": 1.2251966654925444, "grad_norm": 0.42955946922302246, "learning_rate": 3.2847823131932365e-05, "loss": 0.0217, "step": 20870 }, { "epoch": 1.2257837266643183, "grad_norm": 0.8357498049736023, "learning_rate": 3.280439313212006e-05, "loss": 0.0161, "step": 20880 }, { "epoch": 1.2263707878360925, "grad_norm": 0.27838730812072754, "learning_rate": 3.276097784110862e-05, "loss": 0.0203, "step": 20890 }, { "epoch": 1.2269578490078665, "grad_norm": 0.32219842076301575, "learning_rate": 3.271757729603467e-05, "loss": 0.0126, "step": 20900 }, { "epoch": 1.2275449101796407, "grad_norm": 0.9408592581748962, "learning_rate": 3.267419153402225e-05, "loss": 0.0143, "step": 20910 }, { "epoch": 1.2281319713514147, "grad_norm": 1.6443122625350952, "learning_rate": 3.2630820592182696e-05, "loss": 0.014, "step": 20920 }, { "epoch": 1.228719032523189, "grad_norm": 1.9469926357269287, "learning_rate": 3.258746450761471e-05, "loss": 0.0183, "step": 20930 }, { "epoch": 1.2293060936949631, "grad_norm": 2.3205080032348633, "learning_rate": 3.25441233174043e-05, "loss": 0.0334, "step": 20940 }, { "epoch": 1.2298931548667371, "grad_norm": 0.05783454701304436, "learning_rate": 3.250079705862468e-05, "loss": 0.0178, "step": 20950 }, { "epoch": 1.230480216038511, "grad_norm": 1.6985958814620972, "learning_rate": 3.245748576833636e-05, "loss": 0.0067, "step": 20960 }, { "epoch": 1.2310672772102853, "grad_norm": 0.7721903324127197, "learning_rate": 3.241418948358696e-05, "loss": 0.015, "step": 20970 }, { "epoch": 1.2316543383820595, "grad_norm": 0.9906013011932373, "learning_rate": 3.237090824141134e-05, "loss": 0.0149, "step": 20980 }, { "epoch": 1.2322413995538335, "grad_norm": 0.5731455683708191, "learning_rate": 3.2327642078831466e-05, "loss": 0.0097, "step": 20990 }, { "epoch": 1.2328284607256077, "grad_norm": 0.218977689743042, "learning_rate": 3.228439103285641e-05, "loss": 0.0198, "step": 21000 }, { "epoch": 1.2328284607256077, "eval_loss": 0.5017571449279785, "eval_runtime": 269.8345, "eval_samples_per_second": 3.502, "eval_steps_per_second": 3.502, "step": 21000 }, { "epoch": 1.2334155218973817, "grad_norm": 0.7364138960838318, "learning_rate": 3.2241155140482294e-05, "loss": 0.0115, "step": 21010 }, { "epoch": 1.234002583069156, "grad_norm": 0.007412275765091181, "learning_rate": 3.2197934438692314e-05, "loss": 0.0148, "step": 21020 }, { "epoch": 1.2345896442409299, "grad_norm": 0.6939229965209961, "learning_rate": 3.2154728964456605e-05, "loss": 0.0159, "step": 21030 }, { "epoch": 1.235176705412704, "grad_norm": 0.28469404578208923, "learning_rate": 3.211153875473239e-05, "loss": 0.0129, "step": 21040 }, { "epoch": 1.235763766584478, "grad_norm": 0.1567525714635849, "learning_rate": 3.206836384646371e-05, "loss": 0.0092, "step": 21050 }, { "epoch": 1.2363508277562523, "grad_norm": 2.098961353302002, "learning_rate": 3.202520427658159e-05, "loss": 0.0155, "step": 21060 }, { "epoch": 1.2369378889280263, "grad_norm": 1.6644279956817627, "learning_rate": 3.1982060082003954e-05, "loss": 0.0334, "step": 21070 }, { "epoch": 1.2375249500998005, "grad_norm": 0.6723320484161377, "learning_rate": 3.1938931299635484e-05, "loss": 0.009, "step": 21080 }, { "epoch": 1.2381120112715744, "grad_norm": 1.654505968093872, "learning_rate": 3.189581796636778e-05, "loss": 0.01, "step": 21090 }, { "epoch": 1.2386990724433486, "grad_norm": 0.6577444672584534, "learning_rate": 3.185272011907915e-05, "loss": 0.0187, "step": 21100 }, { "epoch": 1.2392861336151226, "grad_norm": 1.0867955684661865, "learning_rate": 3.180963779463472e-05, "loss": 0.0104, "step": 21110 }, { "epoch": 1.2398731947868968, "grad_norm": 0.9444228410720825, "learning_rate": 3.176657102988628e-05, "loss": 0.0114, "step": 21120 }, { "epoch": 1.2404602559586708, "grad_norm": 3.534825325012207, "learning_rate": 3.1723519861672354e-05, "loss": 0.0234, "step": 21130 }, { "epoch": 1.241047317130445, "grad_norm": 0.574012041091919, "learning_rate": 3.168048432681808e-05, "loss": 0.0085, "step": 21140 }, { "epoch": 1.241634378302219, "grad_norm": 0.6430829763412476, "learning_rate": 3.1637464462135286e-05, "loss": 0.013, "step": 21150 }, { "epoch": 1.2422214394739932, "grad_norm": 0.3673301041126251, "learning_rate": 3.159446030442232e-05, "loss": 0.0283, "step": 21160 }, { "epoch": 1.2428085006457672, "grad_norm": 0.6621904969215393, "learning_rate": 3.155147189046418e-05, "loss": 0.0124, "step": 21170 }, { "epoch": 1.2433955618175414, "grad_norm": 0.03779918700456619, "learning_rate": 3.1508499257032306e-05, "loss": 0.016, "step": 21180 }, { "epoch": 1.2439826229893154, "grad_norm": 0.7762832641601562, "learning_rate": 3.1465542440884736e-05, "loss": 0.0079, "step": 21190 }, { "epoch": 1.2445696841610896, "grad_norm": 0.8843396902084351, "learning_rate": 3.1422601478765874e-05, "loss": 0.011, "step": 21200 }, { "epoch": 1.2451567453328636, "grad_norm": 3.4436707496643066, "learning_rate": 3.137967640740665e-05, "loss": 0.0151, "step": 21210 }, { "epoch": 1.2457438065046378, "grad_norm": 0.6071956753730774, "learning_rate": 3.133676726352438e-05, "loss": 0.0126, "step": 21220 }, { "epoch": 1.246330867676412, "grad_norm": 1.4261529445648193, "learning_rate": 3.12938740838227e-05, "loss": 0.0172, "step": 21230 }, { "epoch": 1.246917928848186, "grad_norm": 1.1258330345153809, "learning_rate": 3.125099690499168e-05, "loss": 0.0117, "step": 21240 }, { "epoch": 1.24750499001996, "grad_norm": 0.3895932137966156, "learning_rate": 3.120813576370763e-05, "loss": 0.0055, "step": 21250 }, { "epoch": 1.2480920511917342, "grad_norm": 0.2762930691242218, "learning_rate": 3.1165290696633185e-05, "loss": 0.0098, "step": 21260 }, { "epoch": 1.2486791123635084, "grad_norm": 0.023398570716381073, "learning_rate": 3.11224617404172e-05, "loss": 0.0059, "step": 21270 }, { "epoch": 1.2492661735352824, "grad_norm": 0.11404114961624146, "learning_rate": 3.1079648931694796e-05, "loss": 0.047, "step": 21280 }, { "epoch": 1.2498532347070566, "grad_norm": 0.21508830785751343, "learning_rate": 3.1036852307087183e-05, "loss": 0.0171, "step": 21290 }, { "epoch": 1.2504402958788305, "grad_norm": 3.046077251434326, "learning_rate": 3.099407190320188e-05, "loss": 0.0375, "step": 21300 }, { "epoch": 1.2510273570506047, "grad_norm": 0.38301554322242737, "learning_rate": 3.095130775663237e-05, "loss": 0.0105, "step": 21310 }, { "epoch": 1.2516144182223787, "grad_norm": 2.3761250972747803, "learning_rate": 3.090855990395836e-05, "loss": 0.0122, "step": 21320 }, { "epoch": 1.252201479394153, "grad_norm": 0.27872663736343384, "learning_rate": 3.086582838174551e-05, "loss": 0.0187, "step": 21330 }, { "epoch": 1.252788540565927, "grad_norm": 2.158463478088379, "learning_rate": 3.082311322654562e-05, "loss": 0.0453, "step": 21340 }, { "epoch": 1.2533756017377011, "grad_norm": 0.7334088683128357, "learning_rate": 3.0780414474896414e-05, "loss": 0.0233, "step": 21350 }, { "epoch": 1.2539626629094751, "grad_norm": 1.0348256826400757, "learning_rate": 3.0737732163321596e-05, "loss": 0.0152, "step": 21360 }, { "epoch": 1.2545497240812493, "grad_norm": 0.04808567464351654, "learning_rate": 3.0695066328330845e-05, "loss": 0.0176, "step": 21370 }, { "epoch": 1.2551367852530233, "grad_norm": 1.2392550706863403, "learning_rate": 3.0652417006419674e-05, "loss": 0.014, "step": 21380 }, { "epoch": 1.2557238464247975, "grad_norm": 0.5190914273262024, "learning_rate": 3.0609784234069575e-05, "loss": 0.0101, "step": 21390 }, { "epoch": 1.2563109075965715, "grad_norm": 0.9737521409988403, "learning_rate": 3.0567168047747776e-05, "loss": 0.016, "step": 21400 }, { "epoch": 1.2568979687683457, "grad_norm": 0.3394123315811157, "learning_rate": 3.052456848390739e-05, "loss": 0.0171, "step": 21410 }, { "epoch": 1.2574850299401197, "grad_norm": 0.7681484222412109, "learning_rate": 3.048198557898727e-05, "loss": 0.0111, "step": 21420 }, { "epoch": 1.2580720911118939, "grad_norm": 3.3313300609588623, "learning_rate": 3.043941936941207e-05, "loss": 0.0257, "step": 21430 }, { "epoch": 1.258659152283668, "grad_norm": 2.9273416996002197, "learning_rate": 3.0396869891592093e-05, "loss": 0.0101, "step": 21440 }, { "epoch": 1.259246213455442, "grad_norm": 0.5039469599723816, "learning_rate": 3.035433718192341e-05, "loss": 0.004, "step": 21450 }, { "epoch": 1.259833274627216, "grad_norm": 0.8747548460960388, "learning_rate": 3.0311821276787654e-05, "loss": 0.0208, "step": 21460 }, { "epoch": 1.2604203357989903, "grad_norm": 1.177300214767456, "learning_rate": 3.0269322212552153e-05, "loss": 0.0215, "step": 21470 }, { "epoch": 1.2610073969707645, "grad_norm": 1.727304458618164, "learning_rate": 3.0226840025569857e-05, "loss": 0.0152, "step": 21480 }, { "epoch": 1.2615944581425385, "grad_norm": 0.13978305459022522, "learning_rate": 3.0184374752179183e-05, "loss": 0.0095, "step": 21490 }, { "epoch": 1.2621815193143124, "grad_norm": 0.845206081867218, "learning_rate": 3.014192642870416e-05, "loss": 0.0064, "step": 21500 }, { "epoch": 1.2627685804860866, "grad_norm": 1.3671202659606934, "learning_rate": 3.0099495091454268e-05, "loss": 0.0064, "step": 21510 }, { "epoch": 1.2633556416578609, "grad_norm": 1.586337685585022, "learning_rate": 3.00570807767245e-05, "loss": 0.0128, "step": 21520 }, { "epoch": 1.2639427028296348, "grad_norm": 0.500262439250946, "learning_rate": 3.0014683520795256e-05, "loss": 0.0179, "step": 21530 }, { "epoch": 1.2645297640014088, "grad_norm": 0.5322319269180298, "learning_rate": 2.9972303359932386e-05, "loss": 0.0098, "step": 21540 }, { "epoch": 1.265116825173183, "grad_norm": 1.2702996730804443, "learning_rate": 2.992994033038704e-05, "loss": 0.0176, "step": 21550 }, { "epoch": 1.2657038863449572, "grad_norm": 1.5824235677719116, "learning_rate": 2.9887594468395798e-05, "loss": 0.0122, "step": 21560 }, { "epoch": 1.2662909475167312, "grad_norm": 0.1891796886920929, "learning_rate": 2.984526581018049e-05, "loss": 0.0161, "step": 21570 }, { "epoch": 1.2668780086885054, "grad_norm": 0.02000442147254944, "learning_rate": 2.9802954391948294e-05, "loss": 0.0098, "step": 21580 }, { "epoch": 1.2674650698602794, "grad_norm": 2.0282230377197266, "learning_rate": 2.976066024989158e-05, "loss": 0.0182, "step": 21590 }, { "epoch": 1.2680521310320536, "grad_norm": 0.06673241406679153, "learning_rate": 2.9718383420187983e-05, "loss": 0.0082, "step": 21600 }, { "epoch": 1.2686391922038276, "grad_norm": 1.3715343475341797, "learning_rate": 2.96761239390003e-05, "loss": 0.0194, "step": 21610 }, { "epoch": 1.2692262533756018, "grad_norm": 2.4239096641540527, "learning_rate": 2.963388184247651e-05, "loss": 0.0145, "step": 21620 }, { "epoch": 1.2698133145473758, "grad_norm": 0.1532154381275177, "learning_rate": 2.959165716674973e-05, "loss": 0.0105, "step": 21630 }, { "epoch": 1.27040037571915, "grad_norm": 0.31090089678764343, "learning_rate": 2.9549449947938108e-05, "loss": 0.0083, "step": 21640 }, { "epoch": 1.270987436890924, "grad_norm": 0.05951221287250519, "learning_rate": 2.9507260222144973e-05, "loss": 0.0084, "step": 21650 }, { "epoch": 1.2715744980626982, "grad_norm": 2.5123090744018555, "learning_rate": 2.9465088025458586e-05, "loss": 0.0124, "step": 21660 }, { "epoch": 1.2721615592344722, "grad_norm": 0.24349847435951233, "learning_rate": 2.942293339395227e-05, "loss": 0.0056, "step": 21670 }, { "epoch": 1.2727486204062464, "grad_norm": 0.0393960103392601, "learning_rate": 2.9380796363684303e-05, "loss": 0.0151, "step": 21680 }, { "epoch": 1.2733356815780204, "grad_norm": 0.1087142676115036, "learning_rate": 2.9338676970697926e-05, "loss": 0.0083, "step": 21690 }, { "epoch": 1.2739227427497946, "grad_norm": 0.1377180814743042, "learning_rate": 2.9296575251021265e-05, "loss": 0.0116, "step": 21700 }, { "epoch": 1.2745098039215685, "grad_norm": 0.7107512950897217, "learning_rate": 2.925449124066737e-05, "loss": 0.01, "step": 21710 }, { "epoch": 1.2750968650933427, "grad_norm": 1.46084725856781, "learning_rate": 2.9212424975634078e-05, "loss": 0.0111, "step": 21720 }, { "epoch": 1.275683926265117, "grad_norm": 0.9276548027992249, "learning_rate": 2.9170376491904127e-05, "loss": 0.0143, "step": 21730 }, { "epoch": 1.276270987436891, "grad_norm": 1.665923833847046, "learning_rate": 2.912834582544497e-05, "loss": 0.0204, "step": 21740 }, { "epoch": 1.276858048608665, "grad_norm": 0.3215520679950714, "learning_rate": 2.9086333012208865e-05, "loss": 0.0217, "step": 21750 }, { "epoch": 1.2774451097804391, "grad_norm": 0.5315148234367371, "learning_rate": 2.9044338088132816e-05, "loss": 0.0176, "step": 21760 }, { "epoch": 1.2780321709522133, "grad_norm": 3.6604113578796387, "learning_rate": 2.9002361089138453e-05, "loss": 0.0093, "step": 21770 }, { "epoch": 1.2786192321239873, "grad_norm": 0.34918174147605896, "learning_rate": 2.896040205113214e-05, "loss": 0.0088, "step": 21780 }, { "epoch": 1.2792062932957613, "grad_norm": 1.7474963665008545, "learning_rate": 2.8918461010004842e-05, "loss": 0.0219, "step": 21790 }, { "epoch": 1.2797933544675355, "grad_norm": 2.510493516921997, "learning_rate": 2.887653800163218e-05, "loss": 0.0153, "step": 21800 }, { "epoch": 1.2803804156393097, "grad_norm": 4.96450662612915, "learning_rate": 2.8834633061874256e-05, "loss": 0.0179, "step": 21810 }, { "epoch": 1.2809674768110837, "grad_norm": 0.05940713733434677, "learning_rate": 2.87927462265758e-05, "loss": 0.0116, "step": 21820 }, { "epoch": 1.2815545379828577, "grad_norm": 0.5804621577262878, "learning_rate": 2.875087753156603e-05, "loss": 0.0116, "step": 21830 }, { "epoch": 1.2821415991546319, "grad_norm": 0.5918548107147217, "learning_rate": 2.8709027012658663e-05, "loss": 0.013, "step": 21840 }, { "epoch": 1.282728660326406, "grad_norm": 0.11733749508857727, "learning_rate": 2.8667194705651807e-05, "loss": 0.0233, "step": 21850 }, { "epoch": 1.28331572149818, "grad_norm": 0.25058963894844055, "learning_rate": 2.862538064632808e-05, "loss": 0.0153, "step": 21860 }, { "epoch": 1.2839027826699543, "grad_norm": 1.7111525535583496, "learning_rate": 2.858358487045441e-05, "loss": 0.0276, "step": 21870 }, { "epoch": 1.2844898438417283, "grad_norm": 1.1210541725158691, "learning_rate": 2.854180741378214e-05, "loss": 0.0048, "step": 21880 }, { "epoch": 1.2850769050135025, "grad_norm": 2.3163340091705322, "learning_rate": 2.8500048312046927e-05, "loss": 0.0256, "step": 21890 }, { "epoch": 1.2856639661852765, "grad_norm": 0.0008968147449195385, "learning_rate": 2.8458307600968725e-05, "loss": 0.0158, "step": 21900 }, { "epoch": 1.2862510273570507, "grad_norm": 1.1192528009414673, "learning_rate": 2.8416585316251776e-05, "loss": 0.0064, "step": 21910 }, { "epoch": 1.2868380885288246, "grad_norm": 3.4580559730529785, "learning_rate": 2.8374881493584516e-05, "loss": 0.0165, "step": 21920 }, { "epoch": 1.2874251497005988, "grad_norm": 2.1928298473358154, "learning_rate": 2.8333196168639632e-05, "loss": 0.0136, "step": 21930 }, { "epoch": 1.2880122108723728, "grad_norm": 0.12867216765880585, "learning_rate": 2.8291529377073956e-05, "loss": 0.0099, "step": 21940 }, { "epoch": 1.288599272044147, "grad_norm": 0.6393283605575562, "learning_rate": 2.824988115452849e-05, "loss": 0.0263, "step": 21950 }, { "epoch": 1.289186333215921, "grad_norm": 1.985285997390747, "learning_rate": 2.8208251536628344e-05, "loss": 0.0208, "step": 21960 }, { "epoch": 1.2897733943876952, "grad_norm": 0.04826156795024872, "learning_rate": 2.8166640558982743e-05, "loss": 0.0117, "step": 21970 }, { "epoch": 1.2903604555594694, "grad_norm": 0.5573629140853882, "learning_rate": 2.8125048257184896e-05, "loss": 0.0066, "step": 21980 }, { "epoch": 1.2909475167312434, "grad_norm": 3.0405917167663574, "learning_rate": 2.8083474666812127e-05, "loss": 0.0108, "step": 21990 }, { "epoch": 1.2915345779030174, "grad_norm": 0.31120508909225464, "learning_rate": 2.8041919823425633e-05, "loss": 0.0081, "step": 22000 }, { "epoch": 1.2921216390747916, "grad_norm": 0.12164461612701416, "learning_rate": 2.800038376257075e-05, "loss": 0.0214, "step": 22010 }, { "epoch": 1.2927087002465658, "grad_norm": 0.35867759585380554, "learning_rate": 2.7958866519776572e-05, "loss": 0.0056, "step": 22020 }, { "epoch": 1.2932957614183398, "grad_norm": 0.20889237523078918, "learning_rate": 2.791736813055621e-05, "loss": 0.0195, "step": 22030 }, { "epoch": 1.2938828225901138, "grad_norm": 1.40049147605896, "learning_rate": 2.787588863040661e-05, "loss": 0.0198, "step": 22040 }, { "epoch": 1.294469883761888, "grad_norm": 1.6490122079849243, "learning_rate": 2.7834428054808543e-05, "loss": 0.0043, "step": 22050 }, { "epoch": 1.2950569449336622, "grad_norm": 4.0196990966796875, "learning_rate": 2.7792986439226615e-05, "loss": 0.0158, "step": 22060 }, { "epoch": 1.2956440061054362, "grad_norm": 2.3238790035247803, "learning_rate": 2.7751563819109218e-05, "loss": 0.0171, "step": 22070 }, { "epoch": 1.2962310672772102, "grad_norm": 1.6786881685256958, "learning_rate": 2.7710160229888504e-05, "loss": 0.0195, "step": 22080 }, { "epoch": 1.2968181284489844, "grad_norm": 0.03470773994922638, "learning_rate": 2.7668775706980288e-05, "loss": 0.0244, "step": 22090 }, { "epoch": 1.2974051896207586, "grad_norm": 0.21821852028369904, "learning_rate": 2.7627410285784163e-05, "loss": 0.0053, "step": 22100 }, { "epoch": 1.2979922507925326, "grad_norm": 1.1397291421890259, "learning_rate": 2.7586064001683286e-05, "loss": 0.004, "step": 22110 }, { "epoch": 1.2985793119643068, "grad_norm": 1.734389066696167, "learning_rate": 2.754473689004453e-05, "loss": 0.0085, "step": 22120 }, { "epoch": 1.2991663731360807, "grad_norm": 2.2286031246185303, "learning_rate": 2.750342898621833e-05, "loss": 0.012, "step": 22130 }, { "epoch": 1.299753434307855, "grad_norm": 0.5030475854873657, "learning_rate": 2.7462140325538714e-05, "loss": 0.0134, "step": 22140 }, { "epoch": 1.300340495479629, "grad_norm": 0.7477059364318848, "learning_rate": 2.7420870943323197e-05, "loss": 0.0067, "step": 22150 }, { "epoch": 1.3009275566514031, "grad_norm": 0.13905635476112366, "learning_rate": 2.7379620874872856e-05, "loss": 0.0042, "step": 22160 }, { "epoch": 1.3015146178231771, "grad_norm": 1.4355264902114868, "learning_rate": 2.7338390155472215e-05, "loss": 0.0107, "step": 22170 }, { "epoch": 1.3021016789949513, "grad_norm": 1.68543541431427, "learning_rate": 2.729717882038925e-05, "loss": 0.0037, "step": 22180 }, { "epoch": 1.3026887401667253, "grad_norm": 1.7076072692871094, "learning_rate": 2.725598690487543e-05, "loss": 0.0069, "step": 22190 }, { "epoch": 1.3032758013384995, "grad_norm": 0.7237100601196289, "learning_rate": 2.721481444416548e-05, "loss": 0.0096, "step": 22200 }, { "epoch": 1.3038628625102735, "grad_norm": 0.8334245681762695, "learning_rate": 2.7173661473477608e-05, "loss": 0.0178, "step": 22210 }, { "epoch": 1.3044499236820477, "grad_norm": 2.110283851623535, "learning_rate": 2.7132528028013248e-05, "loss": 0.018, "step": 22220 }, { "epoch": 1.3050369848538217, "grad_norm": 2.716474771499634, "learning_rate": 2.7091414142957204e-05, "loss": 0.0135, "step": 22230 }, { "epoch": 1.305624046025596, "grad_norm": 0.5923571586608887, "learning_rate": 2.7050319853477522e-05, "loss": 0.016, "step": 22240 }, { "epoch": 1.3062111071973699, "grad_norm": 0.10442431271076202, "learning_rate": 2.7009245194725507e-05, "loss": 0.0083, "step": 22250 }, { "epoch": 1.306798168369144, "grad_norm": 0.6889320015907288, "learning_rate": 2.6968190201835625e-05, "loss": 0.0144, "step": 22260 }, { "epoch": 1.3073852295409183, "grad_norm": 0.5491931438446045, "learning_rate": 2.6927154909925577e-05, "loss": 0.008, "step": 22270 }, { "epoch": 1.3079722907126923, "grad_norm": 0.1568625420331955, "learning_rate": 2.6886139354096164e-05, "loss": 0.0103, "step": 22280 }, { "epoch": 1.3085593518844663, "grad_norm": 0.10188999027013779, "learning_rate": 2.684514356943132e-05, "loss": 0.0048, "step": 22290 }, { "epoch": 1.3091464130562405, "grad_norm": 2.802391529083252, "learning_rate": 2.6804167590998096e-05, "loss": 0.0183, "step": 22300 }, { "epoch": 1.3097334742280147, "grad_norm": 1.9018369913101196, "learning_rate": 2.676321145384657e-05, "loss": 0.0154, "step": 22310 }, { "epoch": 1.3103205353997887, "grad_norm": 3.2427594661712646, "learning_rate": 2.6722275193009872e-05, "loss": 0.0334, "step": 22320 }, { "epoch": 1.3109075965715626, "grad_norm": 0.24874800443649292, "learning_rate": 2.668135884350408e-05, "loss": 0.0327, "step": 22330 }, { "epoch": 1.3114946577433368, "grad_norm": 1.3628442287445068, "learning_rate": 2.664046244032832e-05, "loss": 0.0112, "step": 22340 }, { "epoch": 1.312081718915111, "grad_norm": 0.3787357807159424, "learning_rate": 2.659958601846454e-05, "loss": 0.0068, "step": 22350 }, { "epoch": 1.312668780086885, "grad_norm": 0.7853474020957947, "learning_rate": 2.6558729612877753e-05, "loss": 0.0106, "step": 22360 }, { "epoch": 1.313255841258659, "grad_norm": 0.7099056243896484, "learning_rate": 2.6517893258515702e-05, "loss": 0.0129, "step": 22370 }, { "epoch": 1.3138429024304332, "grad_norm": 2.357701301574707, "learning_rate": 2.647707699030909e-05, "loss": 0.0093, "step": 22380 }, { "epoch": 1.3144299636022074, "grad_norm": 2.000049114227295, "learning_rate": 2.6436280843171346e-05, "loss": 0.0129, "step": 22390 }, { "epoch": 1.3150170247739814, "grad_norm": 0.21669819951057434, "learning_rate": 2.639550485199874e-05, "loss": 0.0135, "step": 22400 }, { "epoch": 1.3156040859457556, "grad_norm": 1.1428183317184448, "learning_rate": 2.635474905167032e-05, "loss": 0.0173, "step": 22410 }, { "epoch": 1.3161911471175296, "grad_norm": 0.4165472388267517, "learning_rate": 2.631401347704783e-05, "loss": 0.0169, "step": 22420 }, { "epoch": 1.3167782082893038, "grad_norm": 3.154531717300415, "learning_rate": 2.627329816297569e-05, "loss": 0.0094, "step": 22430 }, { "epoch": 1.3173652694610778, "grad_norm": 0.2751377522945404, "learning_rate": 2.6232603144281066e-05, "loss": 0.007, "step": 22440 }, { "epoch": 1.317952330632852, "grad_norm": 0.9876483082771301, "learning_rate": 2.6191928455773662e-05, "loss": 0.0088, "step": 22450 }, { "epoch": 1.318539391804626, "grad_norm": 0.5597164034843445, "learning_rate": 2.615127413224588e-05, "loss": 0.0081, "step": 22460 }, { "epoch": 1.3191264529764002, "grad_norm": 1.2955923080444336, "learning_rate": 2.611064020847266e-05, "loss": 0.0173, "step": 22470 }, { "epoch": 1.3197135141481742, "grad_norm": 1.323433756828308, "learning_rate": 2.6070026719211505e-05, "loss": 0.0195, "step": 22480 }, { "epoch": 1.3203005753199484, "grad_norm": 0.15278884768486023, "learning_rate": 2.6029433699202454e-05, "loss": 0.027, "step": 22490 }, { "epoch": 1.3208876364917224, "grad_norm": 2.438828229904175, "learning_rate": 2.598886118316798e-05, "loss": 0.0103, "step": 22500 }, { "epoch": 1.3214746976634966, "grad_norm": 2.481713056564331, "learning_rate": 2.5948309205813094e-05, "loss": 0.0161, "step": 22510 }, { "epoch": 1.3220617588352708, "grad_norm": 0.1825207769870758, "learning_rate": 2.590777780182515e-05, "loss": 0.0155, "step": 22520 }, { "epoch": 1.3226488200070448, "grad_norm": 0.00188036251347512, "learning_rate": 2.5867267005873996e-05, "loss": 0.0092, "step": 22530 }, { "epoch": 1.3232358811788187, "grad_norm": 0.37991979718208313, "learning_rate": 2.582677685261179e-05, "loss": 0.0087, "step": 22540 }, { "epoch": 1.323822942350593, "grad_norm": 1.4987987279891968, "learning_rate": 2.578630737667308e-05, "loss": 0.0061, "step": 22550 }, { "epoch": 1.3244100035223672, "grad_norm": 0.6876380443572998, "learning_rate": 2.574585861267466e-05, "loss": 0.0281, "step": 22560 }, { "epoch": 1.3249970646941411, "grad_norm": 0.09047604352235794, "learning_rate": 2.570543059521569e-05, "loss": 0.0185, "step": 22570 }, { "epoch": 1.3255841258659151, "grad_norm": 3.7617619037628174, "learning_rate": 2.566502335887747e-05, "loss": 0.0184, "step": 22580 }, { "epoch": 1.3261711870376893, "grad_norm": 0.9839680194854736, "learning_rate": 2.5624636938223675e-05, "loss": 0.014, "step": 22590 }, { "epoch": 1.3267582482094635, "grad_norm": 0.8359461426734924, "learning_rate": 2.5584271367800072e-05, "loss": 0.011, "step": 22600 }, { "epoch": 1.3273453093812375, "grad_norm": 3.1844050884246826, "learning_rate": 2.5543926682134588e-05, "loss": 0.0076, "step": 22610 }, { "epoch": 1.3279323705530115, "grad_norm": 0.13429057598114014, "learning_rate": 2.550360291573735e-05, "loss": 0.0169, "step": 22620 }, { "epoch": 1.3285194317247857, "grad_norm": 0.23351898789405823, "learning_rate": 2.546330010310052e-05, "loss": 0.0119, "step": 22630 }, { "epoch": 1.32910649289656, "grad_norm": 0.8100205659866333, "learning_rate": 2.5423018278698386e-05, "loss": 0.0222, "step": 22640 }, { "epoch": 1.329693554068334, "grad_norm": 0.8847802877426147, "learning_rate": 2.5382757476987268e-05, "loss": 0.0112, "step": 22650 }, { "epoch": 1.330280615240108, "grad_norm": 0.1871187388896942, "learning_rate": 2.5342517732405523e-05, "loss": 0.0041, "step": 22660 }, { "epoch": 1.330867676411882, "grad_norm": 0.09971107542514801, "learning_rate": 2.530229907937344e-05, "loss": 0.0098, "step": 22670 }, { "epoch": 1.3314547375836563, "grad_norm": 0.7304926514625549, "learning_rate": 2.5262101552293345e-05, "loss": 0.0163, "step": 22680 }, { "epoch": 1.3320417987554303, "grad_norm": 1.4849220514297485, "learning_rate": 2.52219251855494e-05, "loss": 0.0081, "step": 22690 }, { "epoch": 1.3326288599272045, "grad_norm": 7.254383087158203, "learning_rate": 2.5181770013507754e-05, "loss": 0.0161, "step": 22700 }, { "epoch": 1.3332159210989785, "grad_norm": 0.7523486614227295, "learning_rate": 2.5141636070516382e-05, "loss": 0.0136, "step": 22710 }, { "epoch": 1.3338029822707527, "grad_norm": 2.257286787033081, "learning_rate": 2.5101523390905112e-05, "loss": 0.0138, "step": 22720 }, { "epoch": 1.3343900434425267, "grad_norm": 0.23006044328212738, "learning_rate": 2.5061432008985598e-05, "loss": 0.0158, "step": 22730 }, { "epoch": 1.3349771046143009, "grad_norm": 0.6245858669281006, "learning_rate": 2.5021361959051226e-05, "loss": 0.0455, "step": 22740 }, { "epoch": 1.3355641657860748, "grad_norm": 0.9592316746711731, "learning_rate": 2.4981313275377177e-05, "loss": 0.005, "step": 22750 }, { "epoch": 1.336151226957849, "grad_norm": 1.2934941053390503, "learning_rate": 2.4941285992220354e-05, "loss": 0.0172, "step": 22760 }, { "epoch": 1.336738288129623, "grad_norm": 4.774064064025879, "learning_rate": 2.4901280143819368e-05, "loss": 0.0157, "step": 22770 }, { "epoch": 1.3373253493013972, "grad_norm": 3.1580262184143066, "learning_rate": 2.4861295764394426e-05, "loss": 0.0224, "step": 22780 }, { "epoch": 1.3379124104731712, "grad_norm": 0.2299729436635971, "learning_rate": 2.482133288814747e-05, "loss": 0.0076, "step": 22790 }, { "epoch": 1.3384994716449454, "grad_norm": 0.013382161036133766, "learning_rate": 2.4781391549261955e-05, "loss": 0.0111, "step": 22800 }, { "epoch": 1.3390865328167196, "grad_norm": 0.6786443591117859, "learning_rate": 2.4741471781902975e-05, "loss": 0.0047, "step": 22810 }, { "epoch": 1.3396735939884936, "grad_norm": 0.010400927625596523, "learning_rate": 2.470157362021715e-05, "loss": 0.0061, "step": 22820 }, { "epoch": 1.3402606551602676, "grad_norm": 1.5406306982040405, "learning_rate": 2.4661697098332648e-05, "loss": 0.0178, "step": 22830 }, { "epoch": 1.3408477163320418, "grad_norm": 0.1092362105846405, "learning_rate": 2.462184225035905e-05, "loss": 0.0153, "step": 22840 }, { "epoch": 1.341434777503816, "grad_norm": 2.1447012424468994, "learning_rate": 2.4582009110387506e-05, "loss": 0.0172, "step": 22850 }, { "epoch": 1.34202183867559, "grad_norm": 2.2344930171966553, "learning_rate": 2.4542197712490483e-05, "loss": 0.0075, "step": 22860 }, { "epoch": 1.342608899847364, "grad_norm": 0.5468419194221497, "learning_rate": 2.4502408090721934e-05, "loss": 0.0165, "step": 22870 }, { "epoch": 1.3431959610191382, "grad_norm": 1.6260221004486084, "learning_rate": 2.446264027911716e-05, "loss": 0.0333, "step": 22880 }, { "epoch": 1.3437830221909124, "grad_norm": 0.43379461765289307, "learning_rate": 2.4422894311692807e-05, "loss": 0.0044, "step": 22890 }, { "epoch": 1.3443700833626864, "grad_norm": 1.4105299711227417, "learning_rate": 2.438317022244684e-05, "loss": 0.0206, "step": 22900 }, { "epoch": 1.3449571445344604, "grad_norm": 0.469251811504364, "learning_rate": 2.4343468045358476e-05, "loss": 0.0212, "step": 22910 }, { "epoch": 1.3455442057062346, "grad_norm": 0.9096851944923401, "learning_rate": 2.4303787814388247e-05, "loss": 0.0126, "step": 22920 }, { "epoch": 1.3461312668780088, "grad_norm": 1.3153198957443237, "learning_rate": 2.4264129563477822e-05, "loss": 0.0221, "step": 22930 }, { "epoch": 1.3467183280497828, "grad_norm": 1.9034795761108398, "learning_rate": 2.4224493326550214e-05, "loss": 0.0174, "step": 22940 }, { "epoch": 1.347305389221557, "grad_norm": 0.34362080693244934, "learning_rate": 2.418487913750946e-05, "loss": 0.0102, "step": 22950 }, { "epoch": 1.347892450393331, "grad_norm": 0.43925729393959045, "learning_rate": 2.4145287030240826e-05, "loss": 0.0096, "step": 22960 }, { "epoch": 1.3484795115651051, "grad_norm": 0.2632419764995575, "learning_rate": 2.410571703861063e-05, "loss": 0.0029, "step": 22970 }, { "epoch": 1.3490665727368791, "grad_norm": 0.3569221496582031, "learning_rate": 2.4066169196466326e-05, "loss": 0.0064, "step": 22980 }, { "epoch": 1.3496536339086533, "grad_norm": 0.8634092211723328, "learning_rate": 2.4026643537636395e-05, "loss": 0.0111, "step": 22990 }, { "epoch": 1.3502406950804273, "grad_norm": 2.1359846591949463, "learning_rate": 2.3987140095930343e-05, "loss": 0.0106, "step": 23000 }, { "epoch": 1.3508277562522015, "grad_norm": 0.8999951481819153, "learning_rate": 2.3947658905138702e-05, "loss": 0.0095, "step": 23010 }, { "epoch": 1.3514148174239755, "grad_norm": 4.4071526527404785, "learning_rate": 2.3908199999032904e-05, "loss": 0.0182, "step": 23020 }, { "epoch": 1.3520018785957497, "grad_norm": 2.316470146179199, "learning_rate": 2.3868763411365396e-05, "loss": 0.0225, "step": 23030 }, { "epoch": 1.3525889397675237, "grad_norm": 0.017609085887670517, "learning_rate": 2.382934917586947e-05, "loss": 0.0055, "step": 23040 }, { "epoch": 1.353176000939298, "grad_norm": 1.3149291276931763, "learning_rate": 2.378995732625933e-05, "loss": 0.0051, "step": 23050 }, { "epoch": 1.353763062111072, "grad_norm": 0.7345719933509827, "learning_rate": 2.375058789623004e-05, "loss": 0.0119, "step": 23060 }, { "epoch": 1.354350123282846, "grad_norm": 1.8294070959091187, "learning_rate": 2.3711240919457493e-05, "loss": 0.035, "step": 23070 }, { "epoch": 1.35493718445462, "grad_norm": 0.23536460101604462, "learning_rate": 2.367191642959832e-05, "loss": 0.0137, "step": 23080 }, { "epoch": 1.3555242456263943, "grad_norm": 0.8988510966300964, "learning_rate": 2.3632614460289985e-05, "loss": 0.0106, "step": 23090 }, { "epoch": 1.3561113067981685, "grad_norm": 0.06487403064966202, "learning_rate": 2.3593335045150626e-05, "loss": 0.014, "step": 23100 }, { "epoch": 1.3566983679699425, "grad_norm": 0.8209073543548584, "learning_rate": 2.3554078217779145e-05, "loss": 0.0156, "step": 23110 }, { "epoch": 1.3572854291417165, "grad_norm": 0.07240058481693268, "learning_rate": 2.3514844011755087e-05, "loss": 0.0033, "step": 23120 }, { "epoch": 1.3578724903134907, "grad_norm": 1.495160698890686, "learning_rate": 2.3475632460638692e-05, "loss": 0.0036, "step": 23130 }, { "epoch": 1.3584595514852649, "grad_norm": 0.2292553335428238, "learning_rate": 2.3436443597970735e-05, "loss": 0.0262, "step": 23140 }, { "epoch": 1.3590466126570389, "grad_norm": 0.006750187836587429, "learning_rate": 2.3397277457272665e-05, "loss": 0.0119, "step": 23150 }, { "epoch": 1.3596336738288128, "grad_norm": 0.023953752592206, "learning_rate": 2.3358134072046466e-05, "loss": 0.0137, "step": 23160 }, { "epoch": 1.360220735000587, "grad_norm": 1.2029908895492554, "learning_rate": 2.331901347577466e-05, "loss": 0.0134, "step": 23170 }, { "epoch": 1.3608077961723613, "grad_norm": 2.6604292392730713, "learning_rate": 2.327991570192029e-05, "loss": 0.0098, "step": 23180 }, { "epoch": 1.3613948573441352, "grad_norm": 1.4938769340515137, "learning_rate": 2.3240840783926827e-05, "loss": 0.014, "step": 23190 }, { "epoch": 1.3619819185159094, "grad_norm": 0.8838220238685608, "learning_rate": 2.320178875521826e-05, "loss": 0.0113, "step": 23200 }, { "epoch": 1.3625689796876834, "grad_norm": 1.2276970148086548, "learning_rate": 2.3162759649198928e-05, "loss": 0.0059, "step": 23210 }, { "epoch": 1.3631560408594576, "grad_norm": 3.7173707485198975, "learning_rate": 2.3123753499253618e-05, "loss": 0.0206, "step": 23220 }, { "epoch": 1.3637431020312316, "grad_norm": 2.413621187210083, "learning_rate": 2.3084770338747464e-05, "loss": 0.0074, "step": 23230 }, { "epoch": 1.3643301632030058, "grad_norm": 0.09392088651657104, "learning_rate": 2.3045810201025946e-05, "loss": 0.0068, "step": 23240 }, { "epoch": 1.3649172243747798, "grad_norm": 1.4007784128189087, "learning_rate": 2.300687311941481e-05, "loss": 0.009, "step": 23250 }, { "epoch": 1.365504285546554, "grad_norm": 0.1655406653881073, "learning_rate": 2.296795912722014e-05, "loss": 0.0099, "step": 23260 }, { "epoch": 1.366091346718328, "grad_norm": 1.608480453491211, "learning_rate": 2.29290682577282e-05, "loss": 0.0215, "step": 23270 }, { "epoch": 1.3666784078901022, "grad_norm": 0.3631249666213989, "learning_rate": 2.2890200544205516e-05, "loss": 0.0135, "step": 23280 }, { "epoch": 1.3672654690618762, "grad_norm": 2.106283664703369, "learning_rate": 2.285135601989885e-05, "loss": 0.0062, "step": 23290 }, { "epoch": 1.3678525302336504, "grad_norm": 1.1901888847351074, "learning_rate": 2.2812534718035046e-05, "loss": 0.0165, "step": 23300 }, { "epoch": 1.3684395914054244, "grad_norm": 0.20205309987068176, "learning_rate": 2.277373667182114e-05, "loss": 0.0103, "step": 23310 }, { "epoch": 1.3690266525771986, "grad_norm": 0.9821584820747375, "learning_rate": 2.2734961914444225e-05, "loss": 0.0125, "step": 23320 }, { "epoch": 1.3696137137489726, "grad_norm": 0.27546176314353943, "learning_rate": 2.2696210479071524e-05, "loss": 0.0118, "step": 23330 }, { "epoch": 1.3702007749207468, "grad_norm": 0.06945142149925232, "learning_rate": 2.2657482398850287e-05, "loss": 0.0089, "step": 23340 }, { "epoch": 1.370787836092521, "grad_norm": 0.027274999767541885, "learning_rate": 2.261877770690781e-05, "loss": 0.009, "step": 23350 }, { "epoch": 1.371374897264295, "grad_norm": 1.9020315408706665, "learning_rate": 2.2580096436351333e-05, "loss": 0.0123, "step": 23360 }, { "epoch": 1.371961958436069, "grad_norm": 0.11881349980831146, "learning_rate": 2.2541438620268124e-05, "loss": 0.0116, "step": 23370 }, { "epoch": 1.3725490196078431, "grad_norm": 0.720493733882904, "learning_rate": 2.2502804291725315e-05, "loss": 0.0147, "step": 23380 }, { "epoch": 1.3731360807796174, "grad_norm": 0.09386742860078812, "learning_rate": 2.246419348377001e-05, "loss": 0.0091, "step": 23390 }, { "epoch": 1.3737231419513913, "grad_norm": 1.8141838312149048, "learning_rate": 2.242560622942918e-05, "loss": 0.0304, "step": 23400 }, { "epoch": 1.3743102031231653, "grad_norm": 2.572324275970459, "learning_rate": 2.2387042561709654e-05, "loss": 0.0196, "step": 23410 }, { "epoch": 1.3748972642949395, "grad_norm": 1.1925442218780518, "learning_rate": 2.2348502513598035e-05, "loss": 0.0103, "step": 23420 }, { "epoch": 1.3754843254667137, "grad_norm": 2.4145772457122803, "learning_rate": 2.2309986118060784e-05, "loss": 0.0187, "step": 23430 }, { "epoch": 1.3760713866384877, "grad_norm": 1.7221524715423584, "learning_rate": 2.227149340804412e-05, "loss": 0.0118, "step": 23440 }, { "epoch": 1.3766584478102617, "grad_norm": 0.2988492548465729, "learning_rate": 2.2233024416473948e-05, "loss": 0.0124, "step": 23450 }, { "epoch": 1.377245508982036, "grad_norm": 1.0698436498641968, "learning_rate": 2.2194579176255954e-05, "loss": 0.0214, "step": 23460 }, { "epoch": 1.37783257015381, "grad_norm": 0.5979824066162109, "learning_rate": 2.215615772027546e-05, "loss": 0.0147, "step": 23470 }, { "epoch": 1.378419631325584, "grad_norm": 0.4322218596935272, "learning_rate": 2.2117760081397506e-05, "loss": 0.0318, "step": 23480 }, { "epoch": 1.3790066924973583, "grad_norm": 0.012686857022345066, "learning_rate": 2.2079386292466652e-05, "loss": 0.0117, "step": 23490 }, { "epoch": 1.3795937536691323, "grad_norm": 0.010577654466032982, "learning_rate": 2.2041036386307173e-05, "loss": 0.0051, "step": 23500 }, { "epoch": 1.3801808148409065, "grad_norm": 1.2929799556732178, "learning_rate": 2.2002710395722805e-05, "loss": 0.0105, "step": 23510 }, { "epoch": 1.3807678760126805, "grad_norm": 0.059585027396678925, "learning_rate": 2.196440835349695e-05, "loss": 0.0108, "step": 23520 }, { "epoch": 1.3813549371844547, "grad_norm": 0.1867661476135254, "learning_rate": 2.192613029239241e-05, "loss": 0.0102, "step": 23530 }, { "epoch": 1.3819419983562287, "grad_norm": 0.35511553287506104, "learning_rate": 2.188787624515156e-05, "loss": 0.0054, "step": 23540 }, { "epoch": 1.3825290595280029, "grad_norm": 2.3536529541015625, "learning_rate": 2.184964624449617e-05, "loss": 0.0071, "step": 23550 }, { "epoch": 1.3831161206997769, "grad_norm": 0.15178915858268738, "learning_rate": 2.181144032312747e-05, "loss": 0.0091, "step": 23560 }, { "epoch": 1.383703181871551, "grad_norm": 1.1093653440475464, "learning_rate": 2.1773258513726098e-05, "loss": 0.006, "step": 23570 }, { "epoch": 1.384290243043325, "grad_norm": 0.16067326068878174, "learning_rate": 2.173510084895206e-05, "loss": 0.0151, "step": 23580 }, { "epoch": 1.3848773042150992, "grad_norm": 1.6919939517974854, "learning_rate": 2.1696967361444733e-05, "loss": 0.0138, "step": 23590 }, { "epoch": 1.3854643653868732, "grad_norm": 0.9082183837890625, "learning_rate": 2.165885808382275e-05, "loss": 0.0139, "step": 23600 }, { "epoch": 1.3860514265586474, "grad_norm": 0.01875101402401924, "learning_rate": 2.16207730486841e-05, "loss": 0.0036, "step": 23610 }, { "epoch": 1.3866384877304214, "grad_norm": 1.4035495519638062, "learning_rate": 2.1582712288605994e-05, "loss": 0.0097, "step": 23620 }, { "epoch": 1.3872255489021956, "grad_norm": 0.23025992512702942, "learning_rate": 2.1544675836144907e-05, "loss": 0.0157, "step": 23630 }, { "epoch": 1.3878126100739698, "grad_norm": 0.8536239862442017, "learning_rate": 2.1506663723836502e-05, "loss": 0.008, "step": 23640 }, { "epoch": 1.3883996712457438, "grad_norm": 0.34466981887817383, "learning_rate": 2.146867598419565e-05, "loss": 0.0099, "step": 23650 }, { "epoch": 1.3889867324175178, "grad_norm": 0.10123822093009949, "learning_rate": 2.1430712649716328e-05, "loss": 0.0083, "step": 23660 }, { "epoch": 1.389573793589292, "grad_norm": 0.6164001822471619, "learning_rate": 2.1392773752871685e-05, "loss": 0.0082, "step": 23670 }, { "epoch": 1.3901608547610662, "grad_norm": 0.12118466943502426, "learning_rate": 2.13548593261139e-05, "loss": 0.0064, "step": 23680 }, { "epoch": 1.3907479159328402, "grad_norm": 1.2965704202651978, "learning_rate": 2.1316969401874316e-05, "loss": 0.0186, "step": 23690 }, { "epoch": 1.3913349771046142, "grad_norm": 0.15385189652442932, "learning_rate": 2.1279104012563266e-05, "loss": 0.017, "step": 23700 }, { "epoch": 1.3919220382763884, "grad_norm": 2.4474310874938965, "learning_rate": 2.1241263190570065e-05, "loss": 0.0103, "step": 23710 }, { "epoch": 1.3925090994481626, "grad_norm": 2.3678784370422363, "learning_rate": 2.120344696826308e-05, "loss": 0.0143, "step": 23720 }, { "epoch": 1.3930961606199366, "grad_norm": 2.978165626525879, "learning_rate": 2.1165655377989557e-05, "loss": 0.0144, "step": 23730 }, { "epoch": 1.3936832217917106, "grad_norm": 1.1261640787124634, "learning_rate": 2.112788845207574e-05, "loss": 0.0099, "step": 23740 }, { "epoch": 1.3942702829634848, "grad_norm": 2.0954904556274414, "learning_rate": 2.1090146222826758e-05, "loss": 0.0294, "step": 23750 }, { "epoch": 1.394857344135259, "grad_norm": 2.7357470989227295, "learning_rate": 2.1052428722526614e-05, "loss": 0.0124, "step": 23760 }, { "epoch": 1.395444405307033, "grad_norm": 0.5364775657653809, "learning_rate": 2.1014735983438126e-05, "loss": 0.0232, "step": 23770 }, { "epoch": 1.3960314664788072, "grad_norm": 1.5720865726470947, "learning_rate": 2.0977068037802994e-05, "loss": 0.0194, "step": 23780 }, { "epoch": 1.3966185276505811, "grad_norm": 0.08704604208469391, "learning_rate": 2.093942491784164e-05, "loss": 0.0071, "step": 23790 }, { "epoch": 1.3972055888223553, "grad_norm": 0.5620222687721252, "learning_rate": 2.090180665575329e-05, "loss": 0.0057, "step": 23800 }, { "epoch": 1.3977926499941293, "grad_norm": 0.7073603868484497, "learning_rate": 2.0864213283715927e-05, "loss": 0.0063, "step": 23810 }, { "epoch": 1.3983797111659035, "grad_norm": 0.9657589793205261, "learning_rate": 2.0826644833886215e-05, "loss": 0.0118, "step": 23820 }, { "epoch": 1.3989667723376775, "grad_norm": 2.1152420043945312, "learning_rate": 2.0789101338399485e-05, "loss": 0.0181, "step": 23830 }, { "epoch": 1.3995538335094517, "grad_norm": 0.03757241368293762, "learning_rate": 2.075158282936975e-05, "loss": 0.0104, "step": 23840 }, { "epoch": 1.4001408946812257, "grad_norm": 0.20126187801361084, "learning_rate": 2.0714089338889658e-05, "loss": 0.0084, "step": 23850 }, { "epoch": 1.400727955853, "grad_norm": 2.3680176734924316, "learning_rate": 2.067662089903039e-05, "loss": 0.0177, "step": 23860 }, { "epoch": 1.401315017024774, "grad_norm": 0.948137640953064, "learning_rate": 2.063917754184182e-05, "loss": 0.0171, "step": 23870 }, { "epoch": 1.401902078196548, "grad_norm": 3.399534225463867, "learning_rate": 2.0601759299352246e-05, "loss": 0.0115, "step": 23880 }, { "epoch": 1.4024891393683223, "grad_norm": 0.6425442695617676, "learning_rate": 2.056436620356857e-05, "loss": 0.0189, "step": 23890 }, { "epoch": 1.4030762005400963, "grad_norm": 1.1237084865570068, "learning_rate": 2.05269982864761e-05, "loss": 0.014, "step": 23900 }, { "epoch": 1.4036632617118703, "grad_norm": 0.026308121159672737, "learning_rate": 2.048965558003869e-05, "loss": 0.0045, "step": 23910 }, { "epoch": 1.4042503228836445, "grad_norm": 1.1034443378448486, "learning_rate": 2.0452338116198576e-05, "loss": 0.0079, "step": 23920 }, { "epoch": 1.4048373840554187, "grad_norm": 0.3975376486778259, "learning_rate": 2.041504592687645e-05, "loss": 0.0129, "step": 23930 }, { "epoch": 1.4054244452271927, "grad_norm": 0.2686179578304291, "learning_rate": 2.037777904397132e-05, "loss": 0.0125, "step": 23940 }, { "epoch": 1.4060115063989667, "grad_norm": 1.6290830373764038, "learning_rate": 2.03405374993606e-05, "loss": 0.0107, "step": 23950 }, { "epoch": 1.4065985675707409, "grad_norm": 1.4251973628997803, "learning_rate": 2.0303321324899992e-05, "loss": 0.0101, "step": 23960 }, { "epoch": 1.407185628742515, "grad_norm": 1.4456449747085571, "learning_rate": 2.026613055242353e-05, "loss": 0.0024, "step": 23970 }, { "epoch": 1.407772689914289, "grad_norm": 0.5611891746520996, "learning_rate": 2.0228965213743506e-05, "loss": 0.0071, "step": 23980 }, { "epoch": 1.408359751086063, "grad_norm": 1.505138635635376, "learning_rate": 2.019182534065045e-05, "loss": 0.0197, "step": 23990 }, { "epoch": 1.4089468122578372, "grad_norm": 1.6551071405410767, "learning_rate": 2.0154710964913143e-05, "loss": 0.0102, "step": 24000 }, { "epoch": 1.4089468122578372, "eval_loss": 0.5137258768081665, "eval_runtime": 269.7363, "eval_samples_per_second": 3.503, "eval_steps_per_second": 3.503, "step": 24000 }, { "epoch": 1.4095338734296115, "grad_norm": 2.8127646446228027, "learning_rate": 2.0117622118278484e-05, "loss": 0.0196, "step": 24010 }, { "epoch": 1.4101209346013854, "grad_norm": 1.3914660215377808, "learning_rate": 2.0080558832471625e-05, "loss": 0.0083, "step": 24020 }, { "epoch": 1.4107079957731596, "grad_norm": 0.17611494660377502, "learning_rate": 2.0043521139195763e-05, "loss": 0.0113, "step": 24030 }, { "epoch": 1.4112950569449336, "grad_norm": 3.4943318367004395, "learning_rate": 2.000650907013228e-05, "loss": 0.02, "step": 24040 }, { "epoch": 1.4118821181167078, "grad_norm": 0.7097033858299255, "learning_rate": 1.9969522656940593e-05, "loss": 0.0058, "step": 24050 }, { "epoch": 1.4124691792884818, "grad_norm": 1.8921847343444824, "learning_rate": 1.9932561931258213e-05, "loss": 0.0122, "step": 24060 }, { "epoch": 1.413056240460256, "grad_norm": 2.1589457988739014, "learning_rate": 1.9895626924700618e-05, "loss": 0.0113, "step": 24070 }, { "epoch": 1.41364330163203, "grad_norm": 2.850785255432129, "learning_rate": 1.985871766886136e-05, "loss": 0.0088, "step": 24080 }, { "epoch": 1.4142303628038042, "grad_norm": 0.8964874148368835, "learning_rate": 1.982183419531188e-05, "loss": 0.0119, "step": 24090 }, { "epoch": 1.4148174239755782, "grad_norm": 0.07366857677698135, "learning_rate": 1.978497653560167e-05, "loss": 0.0287, "step": 24100 }, { "epoch": 1.4154044851473524, "grad_norm": 0.5417582988739014, "learning_rate": 1.9748144721258033e-05, "loss": 0.0166, "step": 24110 }, { "epoch": 1.4159915463191264, "grad_norm": 2.048412799835205, "learning_rate": 1.9711338783786237e-05, "loss": 0.0111, "step": 24120 }, { "epoch": 1.4165786074909006, "grad_norm": 0.4081864058971405, "learning_rate": 1.9674558754669413e-05, "loss": 0.0213, "step": 24130 }, { "epoch": 1.4171656686626746, "grad_norm": 0.44963279366493225, "learning_rate": 1.963780466536847e-05, "loss": 0.0044, "step": 24140 }, { "epoch": 1.4177527298344488, "grad_norm": 0.10618758946657181, "learning_rate": 1.960107654732219e-05, "loss": 0.01, "step": 24150 }, { "epoch": 1.4183397910062228, "grad_norm": 0.6178563237190247, "learning_rate": 1.956437443194712e-05, "loss": 0.0128, "step": 24160 }, { "epoch": 1.418926852177997, "grad_norm": 0.036462098360061646, "learning_rate": 1.952769835063758e-05, "loss": 0.0069, "step": 24170 }, { "epoch": 1.4195139133497712, "grad_norm": 2.371694326400757, "learning_rate": 1.9491048334765566e-05, "loss": 0.0108, "step": 24180 }, { "epoch": 1.4201009745215452, "grad_norm": 0.01989881508052349, "learning_rate": 1.9454424415680857e-05, "loss": 0.0157, "step": 24190 }, { "epoch": 1.4206880356933191, "grad_norm": 3.062727212905884, "learning_rate": 1.9417826624710834e-05, "loss": 0.0164, "step": 24200 }, { "epoch": 1.4212750968650933, "grad_norm": 0.6262339949607849, "learning_rate": 1.938125499316058e-05, "loss": 0.0061, "step": 24210 }, { "epoch": 1.4218621580368676, "grad_norm": 0.24331939220428467, "learning_rate": 1.9344709552312783e-05, "loss": 0.0113, "step": 24220 }, { "epoch": 1.4224492192086415, "grad_norm": 0.49620071053504944, "learning_rate": 1.930819033342775e-05, "loss": 0.0331, "step": 24230 }, { "epoch": 1.4230362803804155, "grad_norm": 0.1770806908607483, "learning_rate": 1.9271697367743304e-05, "loss": 0.0181, "step": 24240 }, { "epoch": 1.4236233415521897, "grad_norm": 1.1065704822540283, "learning_rate": 1.9235230686474864e-05, "loss": 0.027, "step": 24250 }, { "epoch": 1.424210402723964, "grad_norm": 0.7962722182273865, "learning_rate": 1.9198790320815347e-05, "loss": 0.0088, "step": 24260 }, { "epoch": 1.424797463895738, "grad_norm": 1.1967579126358032, "learning_rate": 1.916237630193516e-05, "loss": 0.0072, "step": 24270 }, { "epoch": 1.425384525067512, "grad_norm": 1.5202269554138184, "learning_rate": 1.912598866098219e-05, "loss": 0.013, "step": 24280 }, { "epoch": 1.425971586239286, "grad_norm": 0.05356645584106445, "learning_rate": 1.908962742908172e-05, "loss": 0.0116, "step": 24290 }, { "epoch": 1.4265586474110603, "grad_norm": 0.032990165054798126, "learning_rate": 1.905329263733649e-05, "loss": 0.0238, "step": 24300 }, { "epoch": 1.4271457085828343, "grad_norm": 0.04489932209253311, "learning_rate": 1.901698431682658e-05, "loss": 0.009, "step": 24310 }, { "epoch": 1.4277327697546085, "grad_norm": 1.7742148637771606, "learning_rate": 1.8980702498609453e-05, "loss": 0.0228, "step": 24320 }, { "epoch": 1.4283198309263825, "grad_norm": 0.05356493964791298, "learning_rate": 1.8944447213719914e-05, "loss": 0.0095, "step": 24330 }, { "epoch": 1.4289068920981567, "grad_norm": 2.683162212371826, "learning_rate": 1.890821849317006e-05, "loss": 0.0114, "step": 24340 }, { "epoch": 1.4294939532699307, "grad_norm": 1.9921892881393433, "learning_rate": 1.8872016367949237e-05, "loss": 0.007, "step": 24350 }, { "epoch": 1.4300810144417049, "grad_norm": 1.2890311479568481, "learning_rate": 1.883584086902409e-05, "loss": 0.0161, "step": 24360 }, { "epoch": 1.4306680756134789, "grad_norm": 0.2484143227338791, "learning_rate": 1.8799692027338446e-05, "loss": 0.0164, "step": 24370 }, { "epoch": 1.431255136785253, "grad_norm": 0.40440618991851807, "learning_rate": 1.8763569873813354e-05, "loss": 0.014, "step": 24380 }, { "epoch": 1.431842197957027, "grad_norm": 0.03348202630877495, "learning_rate": 1.8727474439347027e-05, "loss": 0.0099, "step": 24390 }, { "epoch": 1.4324292591288013, "grad_norm": 1.390295386314392, "learning_rate": 1.8691405754814833e-05, "loss": 0.0052, "step": 24400 }, { "epoch": 1.4330163203005752, "grad_norm": 0.5037054419517517, "learning_rate": 1.865536385106927e-05, "loss": 0.0062, "step": 24410 }, { "epoch": 1.4336033814723494, "grad_norm": 0.09097137302160263, "learning_rate": 1.861934875893987e-05, "loss": 0.0213, "step": 24420 }, { "epoch": 1.4341904426441237, "grad_norm": 0.8542649149894714, "learning_rate": 1.85833605092333e-05, "loss": 0.0187, "step": 24430 }, { "epoch": 1.4347775038158976, "grad_norm": 0.8738613128662109, "learning_rate": 1.8547399132733195e-05, "loss": 0.0084, "step": 24440 }, { "epoch": 1.4353645649876716, "grad_norm": 0.20260785520076752, "learning_rate": 1.8511464660200307e-05, "loss": 0.0037, "step": 24450 }, { "epoch": 1.4359516261594458, "grad_norm": 0.8802362680435181, "learning_rate": 1.847555712237226e-05, "loss": 0.0075, "step": 24460 }, { "epoch": 1.43653868733122, "grad_norm": 2.442185401916504, "learning_rate": 1.8439676549963737e-05, "loss": 0.0173, "step": 24470 }, { "epoch": 1.437125748502994, "grad_norm": 0.18825408816337585, "learning_rate": 1.840382297366626e-05, "loss": 0.013, "step": 24480 }, { "epoch": 1.437712809674768, "grad_norm": 0.2534600794315338, "learning_rate": 1.8367996424148326e-05, "loss": 0.0078, "step": 24490 }, { "epoch": 1.4382998708465422, "grad_norm": 0.049735572189092636, "learning_rate": 1.8332196932055305e-05, "loss": 0.0186, "step": 24500 }, { "epoch": 1.4388869320183164, "grad_norm": 1.9720195531845093, "learning_rate": 1.8296424528009425e-05, "loss": 0.0241, "step": 24510 }, { "epoch": 1.4394739931900904, "grad_norm": 0.06996825337409973, "learning_rate": 1.8260679242609703e-05, "loss": 0.0098, "step": 24520 }, { "epoch": 1.4400610543618644, "grad_norm": 2.268746852874756, "learning_rate": 1.8224961106432003e-05, "loss": 0.0163, "step": 24530 }, { "epoch": 1.4406481155336386, "grad_norm": 0.08396535366773605, "learning_rate": 1.818927015002897e-05, "loss": 0.0168, "step": 24540 }, { "epoch": 1.4412351767054128, "grad_norm": 1.3653311729431152, "learning_rate": 1.815360640392994e-05, "loss": 0.0047, "step": 24550 }, { "epoch": 1.4418222378771868, "grad_norm": 0.1528441607952118, "learning_rate": 1.8117969898641042e-05, "loss": 0.0039, "step": 24560 }, { "epoch": 1.442409299048961, "grad_norm": 0.49751341342926025, "learning_rate": 1.8082360664645065e-05, "loss": 0.0115, "step": 24570 }, { "epoch": 1.442996360220735, "grad_norm": 1.538364291191101, "learning_rate": 1.8046778732401513e-05, "loss": 0.0111, "step": 24580 }, { "epoch": 1.4435834213925092, "grad_norm": 1.63816237449646, "learning_rate": 1.8011224132346466e-05, "loss": 0.0119, "step": 24590 }, { "epoch": 1.4441704825642832, "grad_norm": 0.10889382660388947, "learning_rate": 1.7975696894892698e-05, "loss": 0.0159, "step": 24600 }, { "epoch": 1.4447575437360574, "grad_norm": 0.11068199574947357, "learning_rate": 1.7940197050429492e-05, "loss": 0.0055, "step": 24610 }, { "epoch": 1.4453446049078313, "grad_norm": 3.096181869506836, "learning_rate": 1.7904724629322817e-05, "loss": 0.0193, "step": 24620 }, { "epoch": 1.4459316660796055, "grad_norm": 0.008653839118778706, "learning_rate": 1.7869279661915077e-05, "loss": 0.0062, "step": 24630 }, { "epoch": 1.4465187272513795, "grad_norm": 0.18575824797153473, "learning_rate": 1.7833862178525267e-05, "loss": 0.0068, "step": 24640 }, { "epoch": 1.4471057884231537, "grad_norm": 1.3114022016525269, "learning_rate": 1.77984722094488e-05, "loss": 0.0125, "step": 24650 }, { "epoch": 1.4476928495949277, "grad_norm": 0.4251237213611603, "learning_rate": 1.776310978495762e-05, "loss": 0.0183, "step": 24660 }, { "epoch": 1.448279910766702, "grad_norm": 1.0775110721588135, "learning_rate": 1.7727774935300078e-05, "loss": 0.0099, "step": 24670 }, { "epoch": 1.448866971938476, "grad_norm": 0.15549612045288086, "learning_rate": 1.769246769070095e-05, "loss": 0.0084, "step": 24680 }, { "epoch": 1.4494540331102501, "grad_norm": 0.00905792135745287, "learning_rate": 1.7657188081361402e-05, "loss": 0.0075, "step": 24690 }, { "epoch": 1.450041094282024, "grad_norm": 0.10242454707622528, "learning_rate": 1.762193613745893e-05, "loss": 0.0193, "step": 24700 }, { "epoch": 1.4506281554537983, "grad_norm": 0.28730252385139465, "learning_rate": 1.7586711889147407e-05, "loss": 0.0103, "step": 24710 }, { "epoch": 1.4512152166255725, "grad_norm": 0.29514777660369873, "learning_rate": 1.7551515366556975e-05, "loss": 0.0065, "step": 24720 }, { "epoch": 1.4518022777973465, "grad_norm": 1.5189207792282104, "learning_rate": 1.7516346599794092e-05, "loss": 0.0145, "step": 24730 }, { "epoch": 1.4523893389691205, "grad_norm": 0.1768386960029602, "learning_rate": 1.748120561894147e-05, "loss": 0.0096, "step": 24740 }, { "epoch": 1.4529764001408947, "grad_norm": 2.094917058944702, "learning_rate": 1.7446092454058066e-05, "loss": 0.0223, "step": 24750 }, { "epoch": 1.453563461312669, "grad_norm": 1.3558257818222046, "learning_rate": 1.7411007135178987e-05, "loss": 0.0319, "step": 24760 }, { "epoch": 1.4541505224844429, "grad_norm": 0.13693387806415558, "learning_rate": 1.7375949692315584e-05, "loss": 0.0084, "step": 24770 }, { "epoch": 1.4547375836562169, "grad_norm": 2.2927207946777344, "learning_rate": 1.7340920155455327e-05, "loss": 0.0136, "step": 24780 }, { "epoch": 1.455324644827991, "grad_norm": 0.8705666065216064, "learning_rate": 1.7305918554561824e-05, "loss": 0.0227, "step": 24790 }, { "epoch": 1.4559117059997653, "grad_norm": 1.1699771881103516, "learning_rate": 1.72709449195748e-05, "loss": 0.0098, "step": 24800 }, { "epoch": 1.4564987671715393, "grad_norm": 0.04188217222690582, "learning_rate": 1.7235999280410047e-05, "loss": 0.0126, "step": 24810 }, { "epoch": 1.4570858283433132, "grad_norm": 1.9177348613739014, "learning_rate": 1.720108166695943e-05, "loss": 0.0115, "step": 24820 }, { "epoch": 1.4576728895150874, "grad_norm": 0.24380716681480408, "learning_rate": 1.716619210909079e-05, "loss": 0.0075, "step": 24830 }, { "epoch": 1.4582599506868617, "grad_norm": 1.21640145778656, "learning_rate": 1.7131330636648014e-05, "loss": 0.0105, "step": 24840 }, { "epoch": 1.4588470118586356, "grad_norm": 1.1568200588226318, "learning_rate": 1.709649727945096e-05, "loss": 0.011, "step": 24850 }, { "epoch": 1.4594340730304098, "grad_norm": 2.6545119285583496, "learning_rate": 1.7061692067295447e-05, "loss": 0.0255, "step": 24860 }, { "epoch": 1.4600211342021838, "grad_norm": 0.8861936330795288, "learning_rate": 1.7026915029953168e-05, "loss": 0.0162, "step": 24870 }, { "epoch": 1.460608195373958, "grad_norm": 0.3077358901500702, "learning_rate": 1.6992166197171787e-05, "loss": 0.0199, "step": 24880 }, { "epoch": 1.461195256545732, "grad_norm": 1.2515180110931396, "learning_rate": 1.695744559867477e-05, "loss": 0.0101, "step": 24890 }, { "epoch": 1.4617823177175062, "grad_norm": 2.070460796356201, "learning_rate": 1.692275326416149e-05, "loss": 0.0188, "step": 24900 }, { "epoch": 1.4623693788892802, "grad_norm": 1.6355761289596558, "learning_rate": 1.6888089223307113e-05, "loss": 0.0179, "step": 24910 }, { "epoch": 1.4629564400610544, "grad_norm": 1.6046183109283447, "learning_rate": 1.685345350576264e-05, "loss": 0.0143, "step": 24920 }, { "epoch": 1.4635435012328284, "grad_norm": 0.4053262770175934, "learning_rate": 1.681884614115477e-05, "loss": 0.0118, "step": 24930 }, { "epoch": 1.4641305624046026, "grad_norm": 1.1307713985443115, "learning_rate": 1.6784267159086026e-05, "loss": 0.0169, "step": 24940 }, { "epoch": 1.4647176235763766, "grad_norm": 0.4161185324192047, "learning_rate": 1.6749716589134627e-05, "loss": 0.0086, "step": 24950 }, { "epoch": 1.4653046847481508, "grad_norm": 0.278931587934494, "learning_rate": 1.6715194460854468e-05, "loss": 0.0094, "step": 24960 }, { "epoch": 1.465891745919925, "grad_norm": 3.9699208736419678, "learning_rate": 1.6680700803775135e-05, "loss": 0.0181, "step": 24970 }, { "epoch": 1.466478807091699, "grad_norm": 0.42187848687171936, "learning_rate": 1.6646235647401863e-05, "loss": 0.0083, "step": 24980 }, { "epoch": 1.467065868263473, "grad_norm": 2.352708101272583, "learning_rate": 1.6611799021215525e-05, "loss": 0.011, "step": 24990 }, { "epoch": 1.4676529294352472, "grad_norm": 2.2313830852508545, "learning_rate": 1.6577390954672523e-05, "loss": 0.0093, "step": 25000 }, { "epoch": 1.4682399906070214, "grad_norm": 0.010144324973225594, "learning_rate": 1.6543011477204912e-05, "loss": 0.007, "step": 25010 }, { "epoch": 1.4688270517787954, "grad_norm": 0.9201246500015259, "learning_rate": 1.650866061822021e-05, "loss": 0.0086, "step": 25020 }, { "epoch": 1.4694141129505693, "grad_norm": 0.18726426362991333, "learning_rate": 1.6474338407101564e-05, "loss": 0.0079, "step": 25030 }, { "epoch": 1.4700011741223435, "grad_norm": 0.7087125778198242, "learning_rate": 1.6440044873207494e-05, "loss": 0.0071, "step": 25040 }, { "epoch": 1.4705882352941178, "grad_norm": 1.518667459487915, "learning_rate": 1.6405780045872092e-05, "loss": 0.0035, "step": 25050 }, { "epoch": 1.4711752964658917, "grad_norm": 3.9029223918914795, "learning_rate": 1.637154395440482e-05, "loss": 0.0125, "step": 25060 }, { "epoch": 1.4717623576376657, "grad_norm": 0.14753930270671844, "learning_rate": 1.63373366280906e-05, "loss": 0.0036, "step": 25070 }, { "epoch": 1.47234941880944, "grad_norm": 0.307205468416214, "learning_rate": 1.6303158096189734e-05, "loss": 0.0051, "step": 25080 }, { "epoch": 1.4729364799812141, "grad_norm": 0.09061327576637268, "learning_rate": 1.6269008387937917e-05, "loss": 0.014, "step": 25090 }, { "epoch": 1.4735235411529881, "grad_norm": 0.4797089099884033, "learning_rate": 1.623488753254618e-05, "loss": 0.0105, "step": 25100 }, { "epoch": 1.4741106023247623, "grad_norm": 1.880612850189209, "learning_rate": 1.620079555920082e-05, "loss": 0.0132, "step": 25110 }, { "epoch": 1.4746976634965363, "grad_norm": 0.2213641256093979, "learning_rate": 1.6166732497063524e-05, "loss": 0.0042, "step": 25120 }, { "epoch": 1.4752847246683105, "grad_norm": 0.034901391714811325, "learning_rate": 1.6132698375271164e-05, "loss": 0.0098, "step": 25130 }, { "epoch": 1.4758717858400845, "grad_norm": 0.5932609438896179, "learning_rate": 1.60986932229359e-05, "loss": 0.0191, "step": 25140 }, { "epoch": 1.4764588470118587, "grad_norm": 2.2570817470550537, "learning_rate": 1.6064717069145114e-05, "loss": 0.0068, "step": 25150 }, { "epoch": 1.4770459081836327, "grad_norm": 0.00515064038336277, "learning_rate": 1.6030769942961378e-05, "loss": 0.0106, "step": 25160 }, { "epoch": 1.4776329693554069, "grad_norm": 0.6780362129211426, "learning_rate": 1.5996851873422403e-05, "loss": 0.0225, "step": 25170 }, { "epoch": 1.4782200305271809, "grad_norm": 0.8630103468894958, "learning_rate": 1.5962962889541105e-05, "loss": 0.0052, "step": 25180 }, { "epoch": 1.478807091698955, "grad_norm": 1.4598804712295532, "learning_rate": 1.592910302030544e-05, "loss": 0.0139, "step": 25190 }, { "epoch": 1.479394152870729, "grad_norm": 0.2872070074081421, "learning_rate": 1.589527229467857e-05, "loss": 0.0128, "step": 25200 }, { "epoch": 1.4799812140425033, "grad_norm": 0.012290052138268948, "learning_rate": 1.5861470741598618e-05, "loss": 0.0195, "step": 25210 }, { "epoch": 1.4805682752142773, "grad_norm": 2.608578681945801, "learning_rate": 1.582769838997882e-05, "loss": 0.005, "step": 25220 }, { "epoch": 1.4811553363860515, "grad_norm": 0.5421618223190308, "learning_rate": 1.579395526870742e-05, "loss": 0.0126, "step": 25230 }, { "epoch": 1.4817423975578254, "grad_norm": 0.3466702401638031, "learning_rate": 1.576024140664764e-05, "loss": 0.0157, "step": 25240 }, { "epoch": 1.4823294587295996, "grad_norm": 0.07040505111217499, "learning_rate": 1.5726556832637686e-05, "loss": 0.0057, "step": 25250 }, { "epoch": 1.4829165199013739, "grad_norm": 0.9358566999435425, "learning_rate": 1.5692901575490725e-05, "loss": 0.0082, "step": 25260 }, { "epoch": 1.4835035810731478, "grad_norm": 1.6973823308944702, "learning_rate": 1.5659275663994842e-05, "loss": 0.0191, "step": 25270 }, { "epoch": 1.4840906422449218, "grad_norm": 0.9288491010665894, "learning_rate": 1.562567912691299e-05, "loss": 0.0254, "step": 25280 }, { "epoch": 1.484677703416696, "grad_norm": 0.14372193813323975, "learning_rate": 1.5592111992983042e-05, "loss": 0.0084, "step": 25290 }, { "epoch": 1.4852647645884702, "grad_norm": 0.3273877799510956, "learning_rate": 1.5558574290917676e-05, "loss": 0.0085, "step": 25300 }, { "epoch": 1.4858518257602442, "grad_norm": 0.9547355771064758, "learning_rate": 1.5525066049404425e-05, "loss": 0.0205, "step": 25310 }, { "epoch": 1.4864388869320182, "grad_norm": 0.22144687175750732, "learning_rate": 1.5491587297105616e-05, "loss": 0.0133, "step": 25320 }, { "epoch": 1.4870259481037924, "grad_norm": 0.22204560041427612, "learning_rate": 1.5458138062658362e-05, "loss": 0.0088, "step": 25330 }, { "epoch": 1.4876130092755666, "grad_norm": 2.982510805130005, "learning_rate": 1.5424718374674478e-05, "loss": 0.009, "step": 25340 }, { "epoch": 1.4882000704473406, "grad_norm": 3.0312764644622803, "learning_rate": 1.539132826174058e-05, "loss": 0.0106, "step": 25350 }, { "epoch": 1.4887871316191146, "grad_norm": 2.7994961738586426, "learning_rate": 1.5357967752417908e-05, "loss": 0.0096, "step": 25360 }, { "epoch": 1.4893741927908888, "grad_norm": 1.3636599779129028, "learning_rate": 1.5324636875242425e-05, "loss": 0.006, "step": 25370 }, { "epoch": 1.489961253962663, "grad_norm": 1.8163387775421143, "learning_rate": 1.5291335658724787e-05, "loss": 0.0137, "step": 25380 }, { "epoch": 1.490548315134437, "grad_norm": 0.03518426790833473, "learning_rate": 1.5258064131350175e-05, "loss": 0.0134, "step": 25390 }, { "epoch": 1.4911353763062112, "grad_norm": 2.0565054416656494, "learning_rate": 1.522482232157848e-05, "loss": 0.0084, "step": 25400 }, { "epoch": 1.4917224374779852, "grad_norm": 0.9762770533561707, "learning_rate": 1.519161025784408e-05, "loss": 0.0046, "step": 25410 }, { "epoch": 1.4923094986497594, "grad_norm": 0.5391456484794617, "learning_rate": 1.5158427968555977e-05, "loss": 0.0114, "step": 25420 }, { "epoch": 1.4928965598215334, "grad_norm": 1.3391125202178955, "learning_rate": 1.5125275482097678e-05, "loss": 0.01, "step": 25430 }, { "epoch": 1.4934836209933076, "grad_norm": 1.2342358827590942, "learning_rate": 1.5092152826827216e-05, "loss": 0.0178, "step": 25440 }, { "epoch": 1.4940706821650815, "grad_norm": 0.8180807828903198, "learning_rate": 1.5059060031077066e-05, "loss": 0.004, "step": 25450 }, { "epoch": 1.4946577433368557, "grad_norm": 0.21036776900291443, "learning_rate": 1.5025997123154211e-05, "loss": 0.0095, "step": 25460 }, { "epoch": 1.4952448045086297, "grad_norm": 0.1561666578054428, "learning_rate": 1.4992964131340014e-05, "loss": 0.0156, "step": 25470 }, { "epoch": 1.495831865680404, "grad_norm": 0.1823035627603531, "learning_rate": 1.49599610838903e-05, "loss": 0.0056, "step": 25480 }, { "epoch": 1.496418926852178, "grad_norm": 2.063511610031128, "learning_rate": 1.4926988009035258e-05, "loss": 0.0224, "step": 25490 }, { "epoch": 1.4970059880239521, "grad_norm": 0.5182443261146545, "learning_rate": 1.4894044934979435e-05, "loss": 0.009, "step": 25500 }, { "epoch": 1.4975930491957261, "grad_norm": 0.39108607172966003, "learning_rate": 1.4861131889901741e-05, "loss": 0.0146, "step": 25510 }, { "epoch": 1.4981801103675003, "grad_norm": 1.6948814392089844, "learning_rate": 1.4828248901955349e-05, "loss": 0.0073, "step": 25520 }, { "epoch": 1.4987671715392743, "grad_norm": 1.4525011777877808, "learning_rate": 1.4795395999267785e-05, "loss": 0.0336, "step": 25530 }, { "epoch": 1.4993542327110485, "grad_norm": 0.678019642829895, "learning_rate": 1.4762573209940761e-05, "loss": 0.0019, "step": 25540 }, { "epoch": 1.4999412938828227, "grad_norm": 2.3262994289398193, "learning_rate": 1.4729780562050333e-05, "loss": 0.0154, "step": 25550 }, { "epoch": 1.5005283550545967, "grad_norm": 1.7300747632980347, "learning_rate": 1.469701808364668e-05, "loss": 0.0068, "step": 25560 }, { "epoch": 1.5011154162263707, "grad_norm": 0.633484959602356, "learning_rate": 1.466428580275424e-05, "loss": 0.0082, "step": 25570 }, { "epoch": 1.5017024773981449, "grad_norm": 1.1872471570968628, "learning_rate": 1.4631583747371568e-05, "loss": 0.0154, "step": 25580 }, { "epoch": 1.502289538569919, "grad_norm": 0.06257125735282898, "learning_rate": 1.459891194547141e-05, "loss": 0.003, "step": 25590 }, { "epoch": 1.502876599741693, "grad_norm": 0.31022894382476807, "learning_rate": 1.4566270425000605e-05, "loss": 0.0063, "step": 25600 }, { "epoch": 1.503463660913467, "grad_norm": 0.8578490614891052, "learning_rate": 1.4533659213880124e-05, "loss": 0.0115, "step": 25610 }, { "epoch": 1.5040507220852413, "grad_norm": 0.19193144142627716, "learning_rate": 1.4501078340004953e-05, "loss": 0.0183, "step": 25620 }, { "epoch": 1.5046377832570155, "grad_norm": 0.2767280042171478, "learning_rate": 1.4468527831244188e-05, "loss": 0.0096, "step": 25630 }, { "epoch": 1.5052248444287895, "grad_norm": 1.4123883247375488, "learning_rate": 1.4436007715440908e-05, "loss": 0.0125, "step": 25640 }, { "epoch": 1.5058119056005634, "grad_norm": 1.5527786016464233, "learning_rate": 1.4403518020412221e-05, "loss": 0.0068, "step": 25650 }, { "epoch": 1.5063989667723376, "grad_norm": 1.0498160123825073, "learning_rate": 1.4371058773949204e-05, "loss": 0.0115, "step": 25660 }, { "epoch": 1.5069860279441119, "grad_norm": 0.02221166342496872, "learning_rate": 1.4338630003816889e-05, "loss": 0.0126, "step": 25670 }, { "epoch": 1.5075730891158858, "grad_norm": 1.3004329204559326, "learning_rate": 1.430623173775426e-05, "loss": 0.0104, "step": 25680 }, { "epoch": 1.5081601502876598, "grad_norm": 1.0908926725387573, "learning_rate": 1.4273864003474157e-05, "loss": 0.0156, "step": 25690 }, { "epoch": 1.508747211459434, "grad_norm": 2.631584882736206, "learning_rate": 1.4241526828663366e-05, "loss": 0.0122, "step": 25700 }, { "epoch": 1.5093342726312082, "grad_norm": 0.7961341738700867, "learning_rate": 1.4209220240982468e-05, "loss": 0.009, "step": 25710 }, { "epoch": 1.5099213338029824, "grad_norm": 1.766007423400879, "learning_rate": 1.4176944268065928e-05, "loss": 0.0054, "step": 25720 }, { "epoch": 1.5105083949747564, "grad_norm": 1.0545703172683716, "learning_rate": 1.4144698937522022e-05, "loss": 0.0061, "step": 25730 }, { "epoch": 1.5110954561465304, "grad_norm": 3.5069901943206787, "learning_rate": 1.4112484276932808e-05, "loss": 0.0075, "step": 25740 }, { "epoch": 1.5116825173183046, "grad_norm": 0.42936596274375916, "learning_rate": 1.4080300313854072e-05, "loss": 0.0111, "step": 25750 }, { "epoch": 1.5122695784900788, "grad_norm": 0.36388495564460754, "learning_rate": 1.404814707581542e-05, "loss": 0.0114, "step": 25760 }, { "epoch": 1.5128566396618528, "grad_norm": 2.5523219108581543, "learning_rate": 1.401602459032007e-05, "loss": 0.0246, "step": 25770 }, { "epoch": 1.5134437008336268, "grad_norm": 0.4812352657318115, "learning_rate": 1.3983932884845046e-05, "loss": 0.0157, "step": 25780 }, { "epoch": 1.514030762005401, "grad_norm": 0.3735608458518982, "learning_rate": 1.3951871986840997e-05, "loss": 0.0203, "step": 25790 }, { "epoch": 1.5146178231771752, "grad_norm": 1.1593714952468872, "learning_rate": 1.3919841923732186e-05, "loss": 0.0171, "step": 25800 }, { "epoch": 1.5152048843489492, "grad_norm": 0.2563406825065613, "learning_rate": 1.3887842722916555e-05, "loss": 0.0152, "step": 25810 }, { "epoch": 1.5157919455207232, "grad_norm": 1.1414568424224854, "learning_rate": 1.3855874411765602e-05, "loss": 0.0171, "step": 25820 }, { "epoch": 1.5163790066924974, "grad_norm": 0.131794273853302, "learning_rate": 1.3823937017624427e-05, "loss": 0.0171, "step": 25830 }, { "epoch": 1.5169660678642716, "grad_norm": 0.80946284532547, "learning_rate": 1.3792030567811687e-05, "loss": 0.0156, "step": 25840 }, { "epoch": 1.5175531290360456, "grad_norm": 0.5427300930023193, "learning_rate": 1.3760155089619575e-05, "loss": 0.0089, "step": 25850 }, { "epoch": 1.5181401902078195, "grad_norm": 1.4580811262130737, "learning_rate": 1.3728310610313755e-05, "loss": 0.0057, "step": 25860 }, { "epoch": 1.5187272513795937, "grad_norm": 1.8565434217453003, "learning_rate": 1.369649715713342e-05, "loss": 0.0146, "step": 25870 }, { "epoch": 1.519314312551368, "grad_norm": 5.821869850158691, "learning_rate": 1.366471475729118e-05, "loss": 0.0155, "step": 25880 }, { "epoch": 1.519901373723142, "grad_norm": 0.17705050110816956, "learning_rate": 1.3632963437973122e-05, "loss": 0.0072, "step": 25890 }, { "epoch": 1.520488434894916, "grad_norm": 1.7221649885177612, "learning_rate": 1.3601243226338734e-05, "loss": 0.0125, "step": 25900 }, { "epoch": 1.5210754960666901, "grad_norm": 1.9367401599884033, "learning_rate": 1.3569554149520886e-05, "loss": 0.0069, "step": 25910 }, { "epoch": 1.5216625572384643, "grad_norm": 0.06342757493257523, "learning_rate": 1.3537896234625835e-05, "loss": 0.0084, "step": 25920 }, { "epoch": 1.5222496184102383, "grad_norm": 0.30179139971733093, "learning_rate": 1.350626950873315e-05, "loss": 0.0094, "step": 25930 }, { "epoch": 1.5228366795820123, "grad_norm": 2.0123047828674316, "learning_rate": 1.3474673998895764e-05, "loss": 0.0165, "step": 25940 }, { "epoch": 1.5234237407537865, "grad_norm": 0.5998191833496094, "learning_rate": 1.3443109732139841e-05, "loss": 0.0136, "step": 25950 }, { "epoch": 1.5240108019255607, "grad_norm": 0.24649828672409058, "learning_rate": 1.3411576735464925e-05, "loss": 0.0187, "step": 25960 }, { "epoch": 1.5245978630973347, "grad_norm": 0.7969292998313904, "learning_rate": 1.3380075035843714e-05, "loss": 0.0121, "step": 25970 }, { "epoch": 1.5251849242691087, "grad_norm": 1.5419803857803345, "learning_rate": 1.3348604660222198e-05, "loss": 0.0135, "step": 25980 }, { "epoch": 1.5257719854408829, "grad_norm": 0.00748492730781436, "learning_rate": 1.3317165635519518e-05, "loss": 0.0103, "step": 25990 }, { "epoch": 1.526359046612657, "grad_norm": 0.43452805280685425, "learning_rate": 1.3285757988628045e-05, "loss": 0.0115, "step": 26000 }, { "epoch": 1.5269461077844313, "grad_norm": 0.8184288740158081, "learning_rate": 1.3254381746413291e-05, "loss": 0.0067, "step": 26010 }, { "epoch": 1.5275331689562053, "grad_norm": 0.6614131331443787, "learning_rate": 1.3223036935713923e-05, "loss": 0.0078, "step": 26020 }, { "epoch": 1.5281202301279793, "grad_norm": 0.08678902685642242, "learning_rate": 1.3191723583341681e-05, "loss": 0.0125, "step": 26030 }, { "epoch": 1.5287072912997535, "grad_norm": 0.2708864212036133, "learning_rate": 1.3160441716081446e-05, "loss": 0.007, "step": 26040 }, { "epoch": 1.5292943524715277, "grad_norm": 0.5324260592460632, "learning_rate": 1.3129191360691112e-05, "loss": 0.0231, "step": 26050 }, { "epoch": 1.5298814136433017, "grad_norm": 1.7831233739852905, "learning_rate": 1.309797254390167e-05, "loss": 0.0121, "step": 26060 }, { "epoch": 1.5304684748150756, "grad_norm": 0.4662072956562042, "learning_rate": 1.306678529241711e-05, "loss": 0.006, "step": 26070 }, { "epoch": 1.5310555359868498, "grad_norm": 1.2265037298202515, "learning_rate": 1.3035629632914426e-05, "loss": 0.0073, "step": 26080 }, { "epoch": 1.531642597158624, "grad_norm": 1.3609946966171265, "learning_rate": 1.3004505592043598e-05, "loss": 0.024, "step": 26090 }, { "epoch": 1.532229658330398, "grad_norm": 0.26812970638275146, "learning_rate": 1.2973413196427519e-05, "loss": 0.0099, "step": 26100 }, { "epoch": 1.532816719502172, "grad_norm": 0.2851371467113495, "learning_rate": 1.2942352472662078e-05, "loss": 0.0066, "step": 26110 }, { "epoch": 1.5334037806739462, "grad_norm": 0.5341777801513672, "learning_rate": 1.2911323447315993e-05, "loss": 0.0081, "step": 26120 }, { "epoch": 1.5339908418457204, "grad_norm": 0.4923510253429413, "learning_rate": 1.288032614693097e-05, "loss": 0.0149, "step": 26130 }, { "epoch": 1.5345779030174944, "grad_norm": 1.253936767578125, "learning_rate": 1.2849360598021471e-05, "loss": 0.0217, "step": 26140 }, { "epoch": 1.5351649641892684, "grad_norm": 0.6040740609169006, "learning_rate": 1.2818426827074886e-05, "loss": 0.0151, "step": 26150 }, { "epoch": 1.5357520253610426, "grad_norm": 1.0527094602584839, "learning_rate": 1.2787524860551352e-05, "loss": 0.0109, "step": 26160 }, { "epoch": 1.5363390865328168, "grad_norm": 1.06759512424469, "learning_rate": 1.2756654724883849e-05, "loss": 0.0146, "step": 26170 }, { "epoch": 1.5369261477045908, "grad_norm": 0.2116648107767105, "learning_rate": 1.2725816446478112e-05, "loss": 0.0165, "step": 26180 }, { "epoch": 1.5375132088763648, "grad_norm": 1.721382975578308, "learning_rate": 1.2695010051712625e-05, "loss": 0.0075, "step": 26190 }, { "epoch": 1.538100270048139, "grad_norm": 0.4924405515193939, "learning_rate": 1.2664235566938632e-05, "loss": 0.0054, "step": 26200 }, { "epoch": 1.5386873312199132, "grad_norm": 0.15361934900283813, "learning_rate": 1.2633493018480009e-05, "loss": 0.0089, "step": 26210 }, { "epoch": 1.5392743923916872, "grad_norm": 0.08735856413841248, "learning_rate": 1.2602782432633387e-05, "loss": 0.0065, "step": 26220 }, { "epoch": 1.5398614535634612, "grad_norm": 1.3857438564300537, "learning_rate": 1.2572103835668004e-05, "loss": 0.0225, "step": 26230 }, { "epoch": 1.5404485147352354, "grad_norm": 0.17827914655208588, "learning_rate": 1.2541457253825773e-05, "loss": 0.01, "step": 26240 }, { "epoch": 1.5410355759070096, "grad_norm": 0.9525066614151001, "learning_rate": 1.2510842713321208e-05, "loss": 0.0094, "step": 26250 }, { "epoch": 1.5416226370787838, "grad_norm": 0.39047691226005554, "learning_rate": 1.248026024034143e-05, "loss": 0.0144, "step": 26260 }, { "epoch": 1.5422096982505578, "grad_norm": 0.30978187918663025, "learning_rate": 1.2449709861046077e-05, "loss": 0.0208, "step": 26270 }, { "epoch": 1.5427967594223317, "grad_norm": 0.14176547527313232, "learning_rate": 1.2419191601567409e-05, "loss": 0.0139, "step": 26280 }, { "epoch": 1.543383820594106, "grad_norm": 0.9704849123954773, "learning_rate": 1.238870548801015e-05, "loss": 0.0057, "step": 26290 }, { "epoch": 1.5439708817658802, "grad_norm": 0.018119262531399727, "learning_rate": 1.235825154645156e-05, "loss": 0.0054, "step": 26300 }, { "epoch": 1.5445579429376541, "grad_norm": 0.06399316340684891, "learning_rate": 1.232782980294137e-05, "loss": 0.0195, "step": 26310 }, { "epoch": 1.5451450041094281, "grad_norm": 1.6416302919387817, "learning_rate": 1.2297440283501793e-05, "loss": 0.0067, "step": 26320 }, { "epoch": 1.5457320652812023, "grad_norm": 0.5732679963111877, "learning_rate": 1.2267083014127424e-05, "loss": 0.0145, "step": 26330 }, { "epoch": 1.5463191264529765, "grad_norm": 0.017688609659671783, "learning_rate": 1.2236758020785316e-05, "loss": 0.0137, "step": 26340 }, { "epoch": 1.5469061876247505, "grad_norm": 0.17886310815811157, "learning_rate": 1.2206465329414901e-05, "loss": 0.0088, "step": 26350 }, { "epoch": 1.5474932487965245, "grad_norm": 0.06556381285190582, "learning_rate": 1.217620496592799e-05, "loss": 0.0111, "step": 26360 }, { "epoch": 1.5480803099682987, "grad_norm": 0.24476167559623718, "learning_rate": 1.2145976956208738e-05, "loss": 0.0127, "step": 26370 }, { "epoch": 1.548667371140073, "grad_norm": 0.030447032302618027, "learning_rate": 1.211578132611359e-05, "loss": 0.0161, "step": 26380 }, { "epoch": 1.549254432311847, "grad_norm": 0.9274646043777466, "learning_rate": 1.2085618101471363e-05, "loss": 0.0102, "step": 26390 }, { "epoch": 1.5498414934836209, "grad_norm": 1.1633837223052979, "learning_rate": 1.205548730808308e-05, "loss": 0.0079, "step": 26400 }, { "epoch": 1.550428554655395, "grad_norm": 2.194584846496582, "learning_rate": 1.2025388971722068e-05, "loss": 0.0159, "step": 26410 }, { "epoch": 1.5510156158271693, "grad_norm": 3.140812397003174, "learning_rate": 1.1995323118133894e-05, "loss": 0.0096, "step": 26420 }, { "epoch": 1.5516026769989433, "grad_norm": 2.263437509536743, "learning_rate": 1.196528977303633e-05, "loss": 0.0164, "step": 26430 }, { "epoch": 1.5521897381707173, "grad_norm": 0.17677821218967438, "learning_rate": 1.1935288962119317e-05, "loss": 0.0059, "step": 26440 }, { "epoch": 1.5527767993424915, "grad_norm": 1.0963406562805176, "learning_rate": 1.190532071104502e-05, "loss": 0.0071, "step": 26450 }, { "epoch": 1.5533638605142657, "grad_norm": 0.3778134882450104, "learning_rate": 1.1875385045447679e-05, "loss": 0.0072, "step": 26460 }, { "epoch": 1.5539509216860397, "grad_norm": 0.20560497045516968, "learning_rate": 1.1845481990933716e-05, "loss": 0.0074, "step": 26470 }, { "epoch": 1.5545379828578136, "grad_norm": 0.2553133964538574, "learning_rate": 1.1815611573081681e-05, "loss": 0.0086, "step": 26480 }, { "epoch": 1.5551250440295878, "grad_norm": 0.2380114644765854, "learning_rate": 1.1785773817442137e-05, "loss": 0.0108, "step": 26490 }, { "epoch": 1.555712105201362, "grad_norm": 0.011128624901175499, "learning_rate": 1.1755968749537754e-05, "loss": 0.0047, "step": 26500 }, { "epoch": 1.556299166373136, "grad_norm": 1.5508345365524292, "learning_rate": 1.172619639486322e-05, "loss": 0.0313, "step": 26510 }, { "epoch": 1.55688622754491, "grad_norm": 0.7508226633071899, "learning_rate": 1.1696456778885262e-05, "loss": 0.0278, "step": 26520 }, { "epoch": 1.5574732887166842, "grad_norm": 0.5488082766532898, "learning_rate": 1.166674992704258e-05, "loss": 0.0208, "step": 26530 }, { "epoch": 1.5580603498884584, "grad_norm": 1.9381704330444336, "learning_rate": 1.163707586474589e-05, "loss": 0.0197, "step": 26540 }, { "epoch": 1.5586474110602326, "grad_norm": 1.983081340789795, "learning_rate": 1.1607434617377788e-05, "loss": 0.0111, "step": 26550 }, { "epoch": 1.5592344722320066, "grad_norm": 0.04777294397354126, "learning_rate": 1.157782621029288e-05, "loss": 0.0108, "step": 26560 }, { "epoch": 1.5598215334037806, "grad_norm": 0.15073652565479279, "learning_rate": 1.1548250668817612e-05, "loss": 0.0103, "step": 26570 }, { "epoch": 1.5604085945755548, "grad_norm": 2.098924398422241, "learning_rate": 1.1518708018250369e-05, "loss": 0.0057, "step": 26580 }, { "epoch": 1.560995655747329, "grad_norm": 0.07834240049123764, "learning_rate": 1.148919828386138e-05, "loss": 0.0047, "step": 26590 }, { "epoch": 1.561582716919103, "grad_norm": 2.849283456802368, "learning_rate": 1.1459721490892732e-05, "loss": 0.011, "step": 26600 }, { "epoch": 1.562169778090877, "grad_norm": 1.0505820512771606, "learning_rate": 1.1430277664558298e-05, "loss": 0.0088, "step": 26610 }, { "epoch": 1.5627568392626512, "grad_norm": 5.632779598236084, "learning_rate": 1.1400866830043789e-05, "loss": 0.0185, "step": 26620 }, { "epoch": 1.5633439004344254, "grad_norm": 0.5144931674003601, "learning_rate": 1.1371489012506698e-05, "loss": 0.0077, "step": 26630 }, { "epoch": 1.5639309616061994, "grad_norm": 1.3698240518569946, "learning_rate": 1.1342144237076236e-05, "loss": 0.0123, "step": 26640 }, { "epoch": 1.5645180227779734, "grad_norm": 0.12052831053733826, "learning_rate": 1.131283252885338e-05, "loss": 0.006, "step": 26650 }, { "epoch": 1.5651050839497476, "grad_norm": 0.18448656797409058, "learning_rate": 1.1283553912910833e-05, "loss": 0.0124, "step": 26660 }, { "epoch": 1.5656921451215218, "grad_norm": 1.7134002447128296, "learning_rate": 1.1254308414292975e-05, "loss": 0.0027, "step": 26670 }, { "epoch": 1.5662792062932958, "grad_norm": 1.3325996398925781, "learning_rate": 1.1225096058015844e-05, "loss": 0.0113, "step": 26680 }, { "epoch": 1.5668662674650697, "grad_norm": 2.014275074005127, "learning_rate": 1.1195916869067159e-05, "loss": 0.0119, "step": 26690 }, { "epoch": 1.567453328636844, "grad_norm": 0.4575527310371399, "learning_rate": 1.1166770872406223e-05, "loss": 0.0121, "step": 26700 }, { "epoch": 1.5680403898086182, "grad_norm": 2.249737501144409, "learning_rate": 1.1137658092964026e-05, "loss": 0.0115, "step": 26710 }, { "epoch": 1.5686274509803921, "grad_norm": 0.14120075106620789, "learning_rate": 1.1108578555643056e-05, "loss": 0.0219, "step": 26720 }, { "epoch": 1.5692145121521661, "grad_norm": 1.2550865411758423, "learning_rate": 1.1079532285317435e-05, "loss": 0.0047, "step": 26730 }, { "epoch": 1.5698015733239403, "grad_norm": 2.1121370792388916, "learning_rate": 1.1050519306832768e-05, "loss": 0.0133, "step": 26740 }, { "epoch": 1.5703886344957145, "grad_norm": 1.764227032661438, "learning_rate": 1.1021539645006229e-05, "loss": 0.0103, "step": 26750 }, { "epoch": 1.5709756956674885, "grad_norm": 0.009054204449057579, "learning_rate": 1.0992593324626488e-05, "loss": 0.0108, "step": 26760 }, { "epoch": 1.5715627568392625, "grad_norm": 1.0275087356567383, "learning_rate": 1.0963680370453678e-05, "loss": 0.0069, "step": 26770 }, { "epoch": 1.5721498180110367, "grad_norm": 1.6089822053909302, "learning_rate": 1.0934800807219415e-05, "loss": 0.0104, "step": 26780 }, { "epoch": 1.572736879182811, "grad_norm": 2.2903213500976562, "learning_rate": 1.090595465962671e-05, "loss": 0.0386, "step": 26790 }, { "epoch": 1.573323940354585, "grad_norm": 1.5878002643585205, "learning_rate": 1.0877141952350046e-05, "loss": 0.0048, "step": 26800 }, { "epoch": 1.573911001526359, "grad_norm": 1.2515116930007935, "learning_rate": 1.0848362710035253e-05, "loss": 0.0077, "step": 26810 }, { "epoch": 1.574498062698133, "grad_norm": 0.23582957684993744, "learning_rate": 1.0819616957299567e-05, "loss": 0.0118, "step": 26820 }, { "epoch": 1.5750851238699073, "grad_norm": 1.5625358819961548, "learning_rate": 1.0790904718731565e-05, "loss": 0.0165, "step": 26830 }, { "epoch": 1.5756721850416815, "grad_norm": 2.9153378009796143, "learning_rate": 1.0762226018891175e-05, "loss": 0.0073, "step": 26840 }, { "epoch": 1.5762592462134555, "grad_norm": 0.6501205563545227, "learning_rate": 1.0733580882309591e-05, "loss": 0.0176, "step": 26850 }, { "epoch": 1.5768463073852295, "grad_norm": 1.5556343793869019, "learning_rate": 1.0704969333489362e-05, "loss": 0.0057, "step": 26860 }, { "epoch": 1.5774333685570037, "grad_norm": 0.45732811093330383, "learning_rate": 1.0676391396904229e-05, "loss": 0.005, "step": 26870 }, { "epoch": 1.5780204297287779, "grad_norm": 0.17563748359680176, "learning_rate": 1.0647847096999276e-05, "loss": 0.0117, "step": 26880 }, { "epoch": 1.5786074909005519, "grad_norm": 1.6784812211990356, "learning_rate": 1.0619336458190726e-05, "loss": 0.0228, "step": 26890 }, { "epoch": 1.5791945520723258, "grad_norm": 1.1329185962677002, "learning_rate": 1.0590859504866058e-05, "loss": 0.0052, "step": 26900 }, { "epoch": 1.5797816132441, "grad_norm": 1.3960808515548706, "learning_rate": 1.0562416261383945e-05, "loss": 0.017, "step": 26910 }, { "epoch": 1.5803686744158743, "grad_norm": 2.32549786567688, "learning_rate": 1.0534006752074171e-05, "loss": 0.0096, "step": 26920 }, { "epoch": 1.5809557355876482, "grad_norm": 2.072798490524292, "learning_rate": 1.050563100123772e-05, "loss": 0.0111, "step": 26930 }, { "epoch": 1.5815427967594222, "grad_norm": 0.689721941947937, "learning_rate": 1.0477289033146675e-05, "loss": 0.007, "step": 26940 }, { "epoch": 1.5821298579311964, "grad_norm": 0.16267193853855133, "learning_rate": 1.0448980872044239e-05, "loss": 0.0038, "step": 26950 }, { "epoch": 1.5827169191029706, "grad_norm": 0.16729170083999634, "learning_rate": 1.0420706542144664e-05, "loss": 0.0032, "step": 26960 }, { "epoch": 1.5833039802747446, "grad_norm": 1.0334956645965576, "learning_rate": 1.03924660676333e-05, "loss": 0.0188, "step": 26970 }, { "epoch": 1.5838910414465186, "grad_norm": 1.1257108449935913, "learning_rate": 1.0364259472666504e-05, "loss": 0.0032, "step": 26980 }, { "epoch": 1.5844781026182928, "grad_norm": 1.1537364721298218, "learning_rate": 1.0336086781371679e-05, "loss": 0.0074, "step": 26990 }, { "epoch": 1.585065163790067, "grad_norm": 0.2987788915634155, "learning_rate": 1.030794801784722e-05, "loss": 0.009, "step": 27000 }, { "epoch": 1.585065163790067, "eval_loss": 0.5134320259094238, "eval_runtime": 269.7262, "eval_samples_per_second": 3.504, "eval_steps_per_second": 3.504, "step": 27000 }, { "epoch": 1.585652224961841, "grad_norm": 2.068195104598999, "learning_rate": 1.0279843206162509e-05, "loss": 0.0073, "step": 27010 }, { "epoch": 1.586239286133615, "grad_norm": 0.1688322126865387, "learning_rate": 1.0251772370357854e-05, "loss": 0.0084, "step": 27020 }, { "epoch": 1.5868263473053892, "grad_norm": 0.8966720700263977, "learning_rate": 1.022373553444454e-05, "loss": 0.0047, "step": 27030 }, { "epoch": 1.5874134084771634, "grad_norm": 0.21050378680229187, "learning_rate": 1.019573272240476e-05, "loss": 0.0067, "step": 27040 }, { "epoch": 1.5880004696489374, "grad_norm": 0.38126567006111145, "learning_rate": 1.0167763958191556e-05, "loss": 0.0047, "step": 27050 }, { "epoch": 1.5885875308207114, "grad_norm": 0.613869309425354, "learning_rate": 1.013982926572895e-05, "loss": 0.0052, "step": 27060 }, { "epoch": 1.5891745919924856, "grad_norm": 0.20027974247932434, "learning_rate": 1.0111928668911702e-05, "loss": 0.0135, "step": 27070 }, { "epoch": 1.5897616531642598, "grad_norm": 1.9748679399490356, "learning_rate": 1.0084062191605498e-05, "loss": 0.0102, "step": 27080 }, { "epoch": 1.590348714336034, "grad_norm": 0.007248531095683575, "learning_rate": 1.0056229857646771e-05, "loss": 0.0036, "step": 27090 }, { "epoch": 1.590935775507808, "grad_norm": 0.06447096914052963, "learning_rate": 1.0028431690842793e-05, "loss": 0.0135, "step": 27100 }, { "epoch": 1.591522836679582, "grad_norm": 0.022004619240760803, "learning_rate": 1.00006677149716e-05, "loss": 0.005, "step": 27110 }, { "epoch": 1.5921098978513561, "grad_norm": 1.1582072973251343, "learning_rate": 9.972937953781986e-06, "loss": 0.0196, "step": 27120 }, { "epoch": 1.5926969590231304, "grad_norm": 0.04341103509068489, "learning_rate": 9.945242430993446e-06, "loss": 0.0064, "step": 27130 }, { "epoch": 1.5932840201949043, "grad_norm": 1.97472083568573, "learning_rate": 9.917581170296241e-06, "loss": 0.0096, "step": 27140 }, { "epoch": 1.5938710813666783, "grad_norm": 0.1625327169895172, "learning_rate": 9.889954195351276e-06, "loss": 0.0107, "step": 27150 }, { "epoch": 1.5944581425384525, "grad_norm": 0.14234226942062378, "learning_rate": 9.862361529790149e-06, "loss": 0.0049, "step": 27160 }, { "epoch": 1.5950452037102267, "grad_norm": 0.9036107063293457, "learning_rate": 9.83480319721512e-06, "loss": 0.0255, "step": 27170 }, { "epoch": 1.5956322648820007, "grad_norm": 0.05637304484844208, "learning_rate": 9.807279221199067e-06, "loss": 0.0081, "step": 27180 }, { "epoch": 1.5962193260537747, "grad_norm": 0.07056168466806412, "learning_rate": 9.7797896252855e-06, "loss": 0.0013, "step": 27190 }, { "epoch": 1.596806387225549, "grad_norm": 0.43077000975608826, "learning_rate": 9.752334432988485e-06, "loss": 0.018, "step": 27200 }, { "epoch": 1.5973934483973231, "grad_norm": 2.8605754375457764, "learning_rate": 9.724913667792696e-06, "loss": 0.0121, "step": 27210 }, { "epoch": 1.597980509569097, "grad_norm": 0.9835911393165588, "learning_rate": 9.69752735315333e-06, "loss": 0.0054, "step": 27220 }, { "epoch": 1.598567570740871, "grad_norm": 0.20682783424854279, "learning_rate": 9.670175512496127e-06, "loss": 0.0092, "step": 27230 }, { "epoch": 1.5991546319126453, "grad_norm": 3.0163230895996094, "learning_rate": 9.642858169217356e-06, "loss": 0.0055, "step": 27240 }, { "epoch": 1.5997416930844195, "grad_norm": 0.4688768982887268, "learning_rate": 9.615575346683758e-06, "loss": 0.0058, "step": 27250 }, { "epoch": 1.6003287542561935, "grad_norm": 1.2958694696426392, "learning_rate": 9.588327068232539e-06, "loss": 0.0105, "step": 27260 }, { "epoch": 1.6009158154279675, "grad_norm": 0.6211140751838684, "learning_rate": 9.561113357171386e-06, "loss": 0.004, "step": 27270 }, { "epoch": 1.6015028765997417, "grad_norm": 0.8817264437675476, "learning_rate": 9.533934236778364e-06, "loss": 0.0069, "step": 27280 }, { "epoch": 1.6020899377715159, "grad_norm": 2.1518731117248535, "learning_rate": 9.506789730302034e-06, "loss": 0.0185, "step": 27290 }, { "epoch": 1.6026769989432899, "grad_norm": 0.5215532779693604, "learning_rate": 9.47967986096126e-06, "loss": 0.0059, "step": 27300 }, { "epoch": 1.6032640601150638, "grad_norm": 1.0806931257247925, "learning_rate": 9.45260465194533e-06, "loss": 0.0105, "step": 27310 }, { "epoch": 1.603851121286838, "grad_norm": 0.0196642205119133, "learning_rate": 9.425564126413889e-06, "loss": 0.01, "step": 27320 }, { "epoch": 1.6044381824586122, "grad_norm": 0.8702532052993774, "learning_rate": 9.398558307496868e-06, "loss": 0.0118, "step": 27330 }, { "epoch": 1.6050252436303862, "grad_norm": 0.4726807177066803, "learning_rate": 9.37158721829456e-06, "loss": 0.0089, "step": 27340 }, { "epoch": 1.6056123048021604, "grad_norm": 1.2431391477584839, "learning_rate": 9.344650881877515e-06, "loss": 0.0097, "step": 27350 }, { "epoch": 1.6061993659739344, "grad_norm": 0.019758054986596107, "learning_rate": 9.317749321286601e-06, "loss": 0.0048, "step": 27360 }, { "epoch": 1.6067864271457086, "grad_norm": 0.9880382418632507, "learning_rate": 9.290882559532877e-06, "loss": 0.0176, "step": 27370 }, { "epoch": 1.6073734883174828, "grad_norm": 1.799951434135437, "learning_rate": 9.264050619597697e-06, "loss": 0.0076, "step": 27380 }, { "epoch": 1.6079605494892568, "grad_norm": 1.8501851558685303, "learning_rate": 9.23725352443257e-06, "loss": 0.0118, "step": 27390 }, { "epoch": 1.6085476106610308, "grad_norm": 1.811416745185852, "learning_rate": 9.210491296959256e-06, "loss": 0.0201, "step": 27400 }, { "epoch": 1.609134671832805, "grad_norm": 0.3321267366409302, "learning_rate": 9.183763960069652e-06, "loss": 0.0119, "step": 27410 }, { "epoch": 1.6097217330045792, "grad_norm": 1.2952860593795776, "learning_rate": 9.157071536625838e-06, "loss": 0.0061, "step": 27420 }, { "epoch": 1.6103087941763532, "grad_norm": 1.584791660308838, "learning_rate": 9.130414049459995e-06, "loss": 0.0125, "step": 27430 }, { "epoch": 1.6108958553481272, "grad_norm": 0.06565705686807632, "learning_rate": 9.103791521374444e-06, "loss": 0.0113, "step": 27440 }, { "epoch": 1.6114829165199014, "grad_norm": 0.5617465972900391, "learning_rate": 9.077203975141607e-06, "loss": 0.0099, "step": 27450 }, { "epoch": 1.6120699776916756, "grad_norm": 1.095957636833191, "learning_rate": 9.050651433503965e-06, "loss": 0.0071, "step": 27460 }, { "epoch": 1.6126570388634496, "grad_norm": 0.17165639996528625, "learning_rate": 9.024133919174082e-06, "loss": 0.0049, "step": 27470 }, { "epoch": 1.6132441000352236, "grad_norm": 1.358237385749817, "learning_rate": 8.997651454834527e-06, "loss": 0.0142, "step": 27480 }, { "epoch": 1.6138311612069978, "grad_norm": 2.021362543106079, "learning_rate": 8.971204063137916e-06, "loss": 0.0141, "step": 27490 }, { "epoch": 1.614418222378772, "grad_norm": 0.008475619368255138, "learning_rate": 8.944791766706844e-06, "loss": 0.0047, "step": 27500 }, { "epoch": 1.615005283550546, "grad_norm": 0.6366258859634399, "learning_rate": 8.918414588133894e-06, "loss": 0.0141, "step": 27510 }, { "epoch": 1.61559234472232, "grad_norm": 0.19768893718719482, "learning_rate": 8.892072549981622e-06, "loss": 0.0116, "step": 27520 }, { "epoch": 1.6161794058940941, "grad_norm": 0.5427848100662231, "learning_rate": 8.865765674782528e-06, "loss": 0.0142, "step": 27530 }, { "epoch": 1.6167664670658684, "grad_norm": 1.6507068872451782, "learning_rate": 8.839493985038988e-06, "loss": 0.0151, "step": 27540 }, { "epoch": 1.6173535282376423, "grad_norm": 0.026372963562607765, "learning_rate": 8.81325750322335e-06, "loss": 0.0111, "step": 27550 }, { "epoch": 1.6179405894094163, "grad_norm": 0.4352494776248932, "learning_rate": 8.78705625177777e-06, "loss": 0.0059, "step": 27560 }, { "epoch": 1.6185276505811905, "grad_norm": 0.27398261427879333, "learning_rate": 8.76089025311434e-06, "loss": 0.0036, "step": 27570 }, { "epoch": 1.6191147117529647, "grad_norm": 1.2692209482192993, "learning_rate": 8.734759529614956e-06, "loss": 0.0082, "step": 27580 }, { "epoch": 1.6197017729247387, "grad_norm": 0.17485210299491882, "learning_rate": 8.708664103631354e-06, "loss": 0.0089, "step": 27590 }, { "epoch": 1.6202888340965127, "grad_norm": 2.3155698776245117, "learning_rate": 8.682603997485078e-06, "loss": 0.0124, "step": 27600 }, { "epoch": 1.620875895268287, "grad_norm": 1.551294207572937, "learning_rate": 8.656579233467443e-06, "loss": 0.0139, "step": 27610 }, { "epoch": 1.621462956440061, "grad_norm": 0.10387564450502396, "learning_rate": 8.63058983383957e-06, "loss": 0.0125, "step": 27620 }, { "epoch": 1.6220500176118353, "grad_norm": 0.4853443205356598, "learning_rate": 8.604635820832258e-06, "loss": 0.0141, "step": 27630 }, { "epoch": 1.6226370787836093, "grad_norm": 0.006195012014359236, "learning_rate": 8.578717216646143e-06, "loss": 0.0068, "step": 27640 }, { "epoch": 1.6232241399553833, "grad_norm": 0.3198970556259155, "learning_rate": 8.55283404345148e-06, "loss": 0.0132, "step": 27650 }, { "epoch": 1.6238112011271575, "grad_norm": 2.3255319595336914, "learning_rate": 8.526986323388263e-06, "loss": 0.0254, "step": 27660 }, { "epoch": 1.6243982622989317, "grad_norm": 0.06995700299739838, "learning_rate": 8.501174078566143e-06, "loss": 0.022, "step": 27670 }, { "epoch": 1.6249853234707057, "grad_norm": 0.23026344180107117, "learning_rate": 8.475397331064427e-06, "loss": 0.0163, "step": 27680 }, { "epoch": 1.6255723846424797, "grad_norm": 0.19775334000587463, "learning_rate": 8.449656102932075e-06, "loss": 0.0026, "step": 27690 }, { "epoch": 1.6261594458142539, "grad_norm": 0.04183727502822876, "learning_rate": 8.42395041618766e-06, "loss": 0.0027, "step": 27700 }, { "epoch": 1.626746506986028, "grad_norm": 0.1087692603468895, "learning_rate": 8.398280292819321e-06, "loss": 0.0065, "step": 27710 }, { "epoch": 1.627333568157802, "grad_norm": 1.2450783252716064, "learning_rate": 8.37264575478482e-06, "loss": 0.025, "step": 27720 }, { "epoch": 1.627920629329576, "grad_norm": 1.7207362651824951, "learning_rate": 8.347046824011467e-06, "loss": 0.0188, "step": 27730 }, { "epoch": 1.6285076905013502, "grad_norm": 0.5218789577484131, "learning_rate": 8.321483522396084e-06, "loss": 0.0072, "step": 27740 }, { "epoch": 1.6290947516731245, "grad_norm": 0.4121147692203522, "learning_rate": 8.295955871805061e-06, "loss": 0.0144, "step": 27750 }, { "epoch": 1.6296818128448984, "grad_norm": 0.06316855549812317, "learning_rate": 8.27046389407427e-06, "loss": 0.0072, "step": 27760 }, { "epoch": 1.6302688740166724, "grad_norm": 0.19479545950889587, "learning_rate": 8.245007611009087e-06, "loss": 0.0108, "step": 27770 }, { "epoch": 1.6308559351884466, "grad_norm": 0.3882885277271271, "learning_rate": 8.219587044384307e-06, "loss": 0.0165, "step": 27780 }, { "epoch": 1.6314429963602208, "grad_norm": 0.05304262042045593, "learning_rate": 8.194202215944247e-06, "loss": 0.0105, "step": 27790 }, { "epoch": 1.6320300575319948, "grad_norm": 2.962719440460205, "learning_rate": 8.168853147402566e-06, "loss": 0.0072, "step": 27800 }, { "epoch": 1.6326171187037688, "grad_norm": 2.3828048706054688, "learning_rate": 8.14353986044244e-06, "loss": 0.0046, "step": 27810 }, { "epoch": 1.633204179875543, "grad_norm": 0.10702111572027206, "learning_rate": 8.11826237671634e-06, "loss": 0.0064, "step": 27820 }, { "epoch": 1.6337912410473172, "grad_norm": 0.43127015233039856, "learning_rate": 8.093020717846177e-06, "loss": 0.0058, "step": 27830 }, { "epoch": 1.6343783022190912, "grad_norm": 0.07288127392530441, "learning_rate": 8.067814905423176e-06, "loss": 0.0185, "step": 27840 }, { "epoch": 1.6349653633908652, "grad_norm": 0.13415968418121338, "learning_rate": 8.042644961007927e-06, "loss": 0.0048, "step": 27850 }, { "epoch": 1.6355524245626394, "grad_norm": 0.9306196570396423, "learning_rate": 8.017510906130332e-06, "loss": 0.018, "step": 27860 }, { "epoch": 1.6361394857344136, "grad_norm": 0.050222061574459076, "learning_rate": 7.992412762289592e-06, "loss": 0.0035, "step": 27870 }, { "epoch": 1.6367265469061876, "grad_norm": 0.007918142713606358, "learning_rate": 7.967350550954201e-06, "loss": 0.0087, "step": 27880 }, { "epoch": 1.6373136080779616, "grad_norm": 0.5580129623413086, "learning_rate": 7.942324293561876e-06, "loss": 0.0045, "step": 27890 }, { "epoch": 1.6379006692497358, "grad_norm": 0.10453560203313828, "learning_rate": 7.917334011519646e-06, "loss": 0.0049, "step": 27900 }, { "epoch": 1.63848773042151, "grad_norm": 1.7366032600402832, "learning_rate": 7.892379726203702e-06, "loss": 0.0136, "step": 27910 }, { "epoch": 1.6390747915932842, "grad_norm": 0.05727103352546692, "learning_rate": 7.86746145895948e-06, "loss": 0.0127, "step": 27920 }, { "epoch": 1.6396618527650582, "grad_norm": 1.9890248775482178, "learning_rate": 7.84257923110161e-06, "loss": 0.019, "step": 27930 }, { "epoch": 1.6402489139368321, "grad_norm": 0.25739482045173645, "learning_rate": 7.81773306391389e-06, "loss": 0.0089, "step": 27940 }, { "epoch": 1.6408359751086063, "grad_norm": 0.23821336030960083, "learning_rate": 7.792922978649248e-06, "loss": 0.0057, "step": 27950 }, { "epoch": 1.6414230362803806, "grad_norm": 0.026927923783659935, "learning_rate": 7.768148996529789e-06, "loss": 0.0091, "step": 27960 }, { "epoch": 1.6420100974521545, "grad_norm": 0.07286237180233002, "learning_rate": 7.743411138746686e-06, "loss": 0.0041, "step": 27970 }, { "epoch": 1.6425971586239285, "grad_norm": 0.7503321766853333, "learning_rate": 7.718709426460258e-06, "loss": 0.0142, "step": 27980 }, { "epoch": 1.6431842197957027, "grad_norm": 0.8196347951889038, "learning_rate": 7.694043880799889e-06, "loss": 0.0103, "step": 27990 }, { "epoch": 1.643771280967477, "grad_norm": 0.20796020328998566, "learning_rate": 7.669414522864028e-06, "loss": 0.0209, "step": 28000 }, { "epoch": 1.644358342139251, "grad_norm": 0.14734028279781342, "learning_rate": 7.644821373720168e-06, "loss": 0.0077, "step": 28010 }, { "epoch": 1.644945403311025, "grad_norm": 0.03402048721909523, "learning_rate": 7.620264454404819e-06, "loss": 0.0189, "step": 28020 }, { "epoch": 1.645532464482799, "grad_norm": 0.07426488399505615, "learning_rate": 7.595743785923515e-06, "loss": 0.0134, "step": 28030 }, { "epoch": 1.6461195256545733, "grad_norm": 0.16101300716400146, "learning_rate": 7.571259389250779e-06, "loss": 0.009, "step": 28040 }, { "epoch": 1.6467065868263473, "grad_norm": 1.456742525100708, "learning_rate": 7.546811285330119e-06, "loss": 0.0126, "step": 28050 }, { "epoch": 1.6472936479981213, "grad_norm": 0.04736657440662384, "learning_rate": 7.522399495073962e-06, "loss": 0.0101, "step": 28060 }, { "epoch": 1.6478807091698955, "grad_norm": 0.043693553656339645, "learning_rate": 7.4980240393637216e-06, "loss": 0.0077, "step": 28070 }, { "epoch": 1.6484677703416697, "grad_norm": 1.484106421470642, "learning_rate": 7.473684939049685e-06, "loss": 0.0156, "step": 28080 }, { "epoch": 1.6490548315134437, "grad_norm": 0.08472202718257904, "learning_rate": 7.449382214951073e-06, "loss": 0.0081, "step": 28090 }, { "epoch": 1.6496418926852177, "grad_norm": 0.06400839239358902, "learning_rate": 7.425115887855983e-06, "loss": 0.0129, "step": 28100 }, { "epoch": 1.6502289538569919, "grad_norm": 0.02137874811887741, "learning_rate": 7.400885978521393e-06, "loss": 0.0042, "step": 28110 }, { "epoch": 1.650816015028766, "grad_norm": 0.3920609951019287, "learning_rate": 7.376692507673083e-06, "loss": 0.0171, "step": 28120 }, { "epoch": 1.65140307620054, "grad_norm": 2.4398319721221924, "learning_rate": 7.3525354960057195e-06, "loss": 0.0076, "step": 28130 }, { "epoch": 1.651990137372314, "grad_norm": 0.4373569190502167, "learning_rate": 7.328414964182756e-06, "loss": 0.0069, "step": 28140 }, { "epoch": 1.6525771985440882, "grad_norm": 2.4919962882995605, "learning_rate": 7.304330932836434e-06, "loss": 0.0142, "step": 28150 }, { "epoch": 1.6531642597158624, "grad_norm": 0.48688334226608276, "learning_rate": 7.2802834225677905e-06, "loss": 0.011, "step": 28160 }, { "epoch": 1.6537513208876367, "grad_norm": 1.0665478706359863, "learning_rate": 7.256272453946616e-06, "loss": 0.0178, "step": 28170 }, { "epoch": 1.6543383820594106, "grad_norm": 0.024595128372311592, "learning_rate": 7.23229804751146e-06, "loss": 0.0144, "step": 28180 }, { "epoch": 1.6549254432311846, "grad_norm": 0.3491171598434448, "learning_rate": 7.208360223769555e-06, "loss": 0.0084, "step": 28190 }, { "epoch": 1.6555125044029588, "grad_norm": 1.6058762073516846, "learning_rate": 7.184459003196892e-06, "loss": 0.0079, "step": 28200 }, { "epoch": 1.656099565574733, "grad_norm": 0.01545692328363657, "learning_rate": 7.1605944062380916e-06, "loss": 0.0065, "step": 28210 }, { "epoch": 1.656686626746507, "grad_norm": 2.165834903717041, "learning_rate": 7.136766453306537e-06, "loss": 0.0122, "step": 28220 }, { "epoch": 1.657273687918281, "grad_norm": 0.23048318922519684, "learning_rate": 7.112975164784175e-06, "loss": 0.0078, "step": 28230 }, { "epoch": 1.6578607490900552, "grad_norm": 1.7704715728759766, "learning_rate": 7.089220561021648e-06, "loss": 0.011, "step": 28240 }, { "epoch": 1.6584478102618294, "grad_norm": 0.11009885370731354, "learning_rate": 7.065502662338186e-06, "loss": 0.0023, "step": 28250 }, { "epoch": 1.6590348714336034, "grad_norm": 0.6416253447532654, "learning_rate": 7.041821489021639e-06, "loss": 0.0097, "step": 28260 }, { "epoch": 1.6596219326053774, "grad_norm": 1.033013939857483, "learning_rate": 7.018177061328451e-06, "loss": 0.0175, "step": 28270 }, { "epoch": 1.6602089937771516, "grad_norm": 4.628103733062744, "learning_rate": 6.994569399483614e-06, "loss": 0.0085, "step": 28280 }, { "epoch": 1.6607960549489258, "grad_norm": 0.5595627427101135, "learning_rate": 6.9709985236807e-06, "loss": 0.006, "step": 28290 }, { "epoch": 1.6613831161206998, "grad_norm": 1.0199556350708008, "learning_rate": 6.947464454081765e-06, "loss": 0.0033, "step": 28300 }, { "epoch": 1.6619701772924738, "grad_norm": 0.22090382874011993, "learning_rate": 6.923967210817439e-06, "loss": 0.008, "step": 28310 }, { "epoch": 1.662557238464248, "grad_norm": 0.017477478832006454, "learning_rate": 6.900506813986806e-06, "loss": 0.0088, "step": 28320 }, { "epoch": 1.6631442996360222, "grad_norm": 1.4774725437164307, "learning_rate": 6.8770832836574596e-06, "loss": 0.0091, "step": 28330 }, { "epoch": 1.6637313608077962, "grad_norm": 0.5444540977478027, "learning_rate": 6.853696639865448e-06, "loss": 0.0054, "step": 28340 }, { "epoch": 1.6643184219795701, "grad_norm": 1.9333986043930054, "learning_rate": 6.830346902615281e-06, "loss": 0.0223, "step": 28350 }, { "epoch": 1.6649054831513443, "grad_norm": 1.0381861925125122, "learning_rate": 6.807034091879866e-06, "loss": 0.0101, "step": 28360 }, { "epoch": 1.6654925443231186, "grad_norm": 0.144987091422081, "learning_rate": 6.783758227600567e-06, "loss": 0.007, "step": 28370 }, { "epoch": 1.6660796054948925, "grad_norm": 0.34596773982048035, "learning_rate": 6.760519329687099e-06, "loss": 0.0064, "step": 28380 }, { "epoch": 1.6666666666666665, "grad_norm": 0.10488889366388321, "learning_rate": 6.737317418017608e-06, "loss": 0.0112, "step": 28390 }, { "epoch": 1.6672537278384407, "grad_norm": 1.5192146301269531, "learning_rate": 6.7141525124385595e-06, "loss": 0.0049, "step": 28400 }, { "epoch": 1.667840789010215, "grad_norm": 0.5066091418266296, "learning_rate": 6.6910246327647864e-06, "loss": 0.0137, "step": 28410 }, { "epoch": 1.668427850181989, "grad_norm": 0.49690625071525574, "learning_rate": 6.667933798779447e-06, "loss": 0.0034, "step": 28420 }, { "epoch": 1.669014911353763, "grad_norm": 0.579086184501648, "learning_rate": 6.644880030234002e-06, "loss": 0.0054, "step": 28430 }, { "epoch": 1.669601972525537, "grad_norm": 0.1151830330491066, "learning_rate": 6.621863346848217e-06, "loss": 0.0055, "step": 28440 }, { "epoch": 1.6701890336973113, "grad_norm": 0.4732290804386139, "learning_rate": 6.598883768310133e-06, "loss": 0.0145, "step": 28450 }, { "epoch": 1.6707760948690855, "grad_norm": 0.1567830741405487, "learning_rate": 6.575941314276063e-06, "loss": 0.0038, "step": 28460 }, { "epoch": 1.6713631560408595, "grad_norm": 0.04848311096429825, "learning_rate": 6.553036004370533e-06, "loss": 0.0074, "step": 28470 }, { "epoch": 1.6719502172126335, "grad_norm": 0.13380120694637299, "learning_rate": 6.530167858186342e-06, "loss": 0.0083, "step": 28480 }, { "epoch": 1.6725372783844077, "grad_norm": 0.18455231189727783, "learning_rate": 6.507336895284449e-06, "loss": 0.0047, "step": 28490 }, { "epoch": 1.673124339556182, "grad_norm": 0.643710732460022, "learning_rate": 6.484543135194043e-06, "loss": 0.0094, "step": 28500 }, { "epoch": 1.6737114007279559, "grad_norm": 0.23137906193733215, "learning_rate": 6.461786597412489e-06, "loss": 0.0031, "step": 28510 }, { "epoch": 1.6742984618997299, "grad_norm": 2.435506582260132, "learning_rate": 6.439067301405305e-06, "loss": 0.0238, "step": 28520 }, { "epoch": 1.674885523071504, "grad_norm": 0.7083292007446289, "learning_rate": 6.416385266606134e-06, "loss": 0.0108, "step": 28530 }, { "epoch": 1.6754725842432783, "grad_norm": 0.7685606479644775, "learning_rate": 6.393740512416785e-06, "loss": 0.0056, "step": 28540 }, { "epoch": 1.6760596454150523, "grad_norm": 0.1088830903172493, "learning_rate": 6.37113305820714e-06, "loss": 0.0144, "step": 28550 }, { "epoch": 1.6766467065868262, "grad_norm": 0.5206865072250366, "learning_rate": 6.348562923315194e-06, "loss": 0.0057, "step": 28560 }, { "epoch": 1.6772337677586004, "grad_norm": 1.1275566816329956, "learning_rate": 6.326030127047045e-06, "loss": 0.0117, "step": 28570 }, { "epoch": 1.6778208289303747, "grad_norm": 1.9255837202072144, "learning_rate": 6.303534688676799e-06, "loss": 0.0099, "step": 28580 }, { "epoch": 1.6784078901021486, "grad_norm": 1.0019365549087524, "learning_rate": 6.281076627446652e-06, "loss": 0.0356, "step": 28590 }, { "epoch": 1.6789949512739226, "grad_norm": 0.17412810027599335, "learning_rate": 6.25865596256679e-06, "loss": 0.0031, "step": 28600 }, { "epoch": 1.6795820124456968, "grad_norm": 2.0009384155273438, "learning_rate": 6.236272713215441e-06, "loss": 0.0114, "step": 28610 }, { "epoch": 1.680169073617471, "grad_norm": 3.1755568981170654, "learning_rate": 6.213926898538825e-06, "loss": 0.0117, "step": 28620 }, { "epoch": 1.680756134789245, "grad_norm": 2.153243064880371, "learning_rate": 6.1916185376511286e-06, "loss": 0.0109, "step": 28630 }, { "epoch": 1.681343195961019, "grad_norm": 0.05015252158045769, "learning_rate": 6.1693476496344996e-06, "loss": 0.0088, "step": 28640 }, { "epoch": 1.6819302571327932, "grad_norm": 0.577706515789032, "learning_rate": 6.14711425353906e-06, "loss": 0.0025, "step": 28650 }, { "epoch": 1.6825173183045674, "grad_norm": 0.40677276253700256, "learning_rate": 6.124918368382815e-06, "loss": 0.0114, "step": 28660 }, { "epoch": 1.6831043794763414, "grad_norm": 1.0793112516403198, "learning_rate": 6.1027600131517205e-06, "loss": 0.0093, "step": 28670 }, { "epoch": 1.6836914406481154, "grad_norm": 0.9636606574058533, "learning_rate": 6.080639206799626e-06, "loss": 0.0291, "step": 28680 }, { "epoch": 1.6842785018198896, "grad_norm": 0.07298687845468521, "learning_rate": 6.058555968248247e-06, "loss": 0.0133, "step": 28690 }, { "epoch": 1.6848655629916638, "grad_norm": 0.35231101512908936, "learning_rate": 6.036510316387195e-06, "loss": 0.0166, "step": 28700 }, { "epoch": 1.685452624163438, "grad_norm": 0.8759594559669495, "learning_rate": 6.014502270073874e-06, "loss": 0.0076, "step": 28710 }, { "epoch": 1.686039685335212, "grad_norm": 0.03797895833849907, "learning_rate": 5.9925318481335925e-06, "loss": 0.0108, "step": 28720 }, { "epoch": 1.686626746506986, "grad_norm": 1.683166265487671, "learning_rate": 5.970599069359395e-06, "loss": 0.0182, "step": 28730 }, { "epoch": 1.6872138076787602, "grad_norm": 0.07160099595785141, "learning_rate": 5.948703952512214e-06, "loss": 0.0087, "step": 28740 }, { "epoch": 1.6878008688505344, "grad_norm": 0.00314601743593812, "learning_rate": 5.9268465163207e-06, "loss": 0.0104, "step": 28750 }, { "epoch": 1.6883879300223084, "grad_norm": 0.02015577256679535, "learning_rate": 5.9050267794813045e-06, "loss": 0.0073, "step": 28760 }, { "epoch": 1.6889749911940823, "grad_norm": 0.03103172965347767, "learning_rate": 5.883244760658213e-06, "loss": 0.0085, "step": 28770 }, { "epoch": 1.6895620523658565, "grad_norm": 1.1322954893112183, "learning_rate": 5.861500478483362e-06, "loss": 0.0179, "step": 28780 }, { "epoch": 1.6901491135376308, "grad_norm": 0.17238181829452515, "learning_rate": 5.83979395155641e-06, "loss": 0.0035, "step": 28790 }, { "epoch": 1.6907361747094047, "grad_norm": 0.24105772376060486, "learning_rate": 5.818125198444713e-06, "loss": 0.0104, "step": 28800 }, { "epoch": 1.6913232358811787, "grad_norm": 0.44330739974975586, "learning_rate": 5.796494237683309e-06, "loss": 0.0097, "step": 28810 }, { "epoch": 1.691910297052953, "grad_norm": 0.19631382822990417, "learning_rate": 5.774901087774937e-06, "loss": 0.0133, "step": 28820 }, { "epoch": 1.6924973582247271, "grad_norm": 0.09199246764183044, "learning_rate": 5.753345767189949e-06, "loss": 0.0025, "step": 28830 }, { "epoch": 1.6930844193965011, "grad_norm": 0.9267358183860779, "learning_rate": 5.73182829436637e-06, "loss": 0.0114, "step": 28840 }, { "epoch": 1.693671480568275, "grad_norm": 3.330157995223999, "learning_rate": 5.710348687709855e-06, "loss": 0.0036, "step": 28850 }, { "epoch": 1.6942585417400493, "grad_norm": 1.3673113584518433, "learning_rate": 5.688906965593649e-06, "loss": 0.0138, "step": 28860 }, { "epoch": 1.6948456029118235, "grad_norm": 0.03584679961204529, "learning_rate": 5.667503146358616e-06, "loss": 0.0112, "step": 28870 }, { "epoch": 1.6954326640835975, "grad_norm": 0.026026401668787003, "learning_rate": 5.64613724831316e-06, "loss": 0.0166, "step": 28880 }, { "epoch": 1.6960197252553715, "grad_norm": 1.8361384868621826, "learning_rate": 5.624809289733296e-06, "loss": 0.0077, "step": 28890 }, { "epoch": 1.6966067864271457, "grad_norm": 0.43284550309181213, "learning_rate": 5.603519288862536e-06, "loss": 0.0083, "step": 28900 }, { "epoch": 1.69719384759892, "grad_norm": 0.04571918770670891, "learning_rate": 5.582267263911961e-06, "loss": 0.0066, "step": 28910 }, { "epoch": 1.6977809087706939, "grad_norm": 0.05616581812500954, "learning_rate": 5.561053233060154e-06, "loss": 0.0105, "step": 28920 }, { "epoch": 1.6983679699424679, "grad_norm": 0.4561821520328522, "learning_rate": 5.539877214453215e-06, "loss": 0.0103, "step": 28930 }, { "epoch": 1.698955031114242, "grad_norm": 0.29431581497192383, "learning_rate": 5.518739226204689e-06, "loss": 0.0044, "step": 28940 }, { "epoch": 1.6995420922860163, "grad_norm": 1.0841506719589233, "learning_rate": 5.497639286395645e-06, "loss": 0.0124, "step": 28950 }, { "epoch": 1.7001291534577903, "grad_norm": 1.3967875242233276, "learning_rate": 5.476577413074535e-06, "loss": 0.0079, "step": 28960 }, { "epoch": 1.7007162146295642, "grad_norm": 0.29343652725219727, "learning_rate": 5.455553624257331e-06, "loss": 0.022, "step": 28970 }, { "epoch": 1.7013032758013384, "grad_norm": 0.02657872438430786, "learning_rate": 5.434567937927387e-06, "loss": 0.0043, "step": 28980 }, { "epoch": 1.7018903369731126, "grad_norm": 1.2220739126205444, "learning_rate": 5.413620372035449e-06, "loss": 0.0114, "step": 28990 }, { "epoch": 1.7024773981448869, "grad_norm": 0.11572136729955673, "learning_rate": 5.39271094449969e-06, "loss": 0.0058, "step": 29000 }, { "epoch": 1.7030644593166608, "grad_norm": 0.5487588047981262, "learning_rate": 5.371839673205625e-06, "loss": 0.0049, "step": 29010 }, { "epoch": 1.7036515204884348, "grad_norm": 0.038064517080783844, "learning_rate": 5.351006576006162e-06, "loss": 0.0153, "step": 29020 }, { "epoch": 1.704238581660209, "grad_norm": 0.040291767567396164, "learning_rate": 5.330211670721535e-06, "loss": 0.0068, "step": 29030 }, { "epoch": 1.7048256428319832, "grad_norm": 0.7021271586418152, "learning_rate": 5.309454975139338e-06, "loss": 0.0038, "step": 29040 }, { "epoch": 1.7054127040037572, "grad_norm": 2.8790230751037598, "learning_rate": 5.288736507014435e-06, "loss": 0.0345, "step": 29050 }, { "epoch": 1.7059997651755312, "grad_norm": 1.4290226697921753, "learning_rate": 5.26805628406904e-06, "loss": 0.0055, "step": 29060 }, { "epoch": 1.7065868263473054, "grad_norm": 0.519660472869873, "learning_rate": 5.247414323992605e-06, "loss": 0.0048, "step": 29070 }, { "epoch": 1.7071738875190796, "grad_norm": 1.1711570024490356, "learning_rate": 5.2268106444418875e-06, "loss": 0.011, "step": 29080 }, { "epoch": 1.7077609486908536, "grad_norm": 0.897144079208374, "learning_rate": 5.206245263040893e-06, "loss": 0.0265, "step": 29090 }, { "epoch": 1.7083480098626276, "grad_norm": 1.1236882209777832, "learning_rate": 5.1857181973808735e-06, "loss": 0.0096, "step": 29100 }, { "epoch": 1.7089350710344018, "grad_norm": 0.45875832438468933, "learning_rate": 5.165229465020277e-06, "loss": 0.008, "step": 29110 }, { "epoch": 1.709522132206176, "grad_norm": 0.37724724411964417, "learning_rate": 5.144779083484791e-06, "loss": 0.0036, "step": 29120 }, { "epoch": 1.71010919337795, "grad_norm": 1.1803650856018066, "learning_rate": 5.1243670702673e-06, "loss": 0.0059, "step": 29130 }, { "epoch": 1.710696254549724, "grad_norm": 1.0297425985336304, "learning_rate": 5.103993442827831e-06, "loss": 0.0088, "step": 29140 }, { "epoch": 1.7112833157214982, "grad_norm": 0.011964410543441772, "learning_rate": 5.0836582185936456e-06, "loss": 0.0027, "step": 29150 }, { "epoch": 1.7118703768932724, "grad_norm": 1.340688943862915, "learning_rate": 5.063361414959083e-06, "loss": 0.0061, "step": 29160 }, { "epoch": 1.7124574380650464, "grad_norm": 1.056504726409912, "learning_rate": 5.043103049285663e-06, "loss": 0.006, "step": 29170 }, { "epoch": 1.7130444992368203, "grad_norm": 0.5750206708908081, "learning_rate": 5.022883138902007e-06, "loss": 0.0033, "step": 29180 }, { "epoch": 1.7136315604085945, "grad_norm": 0.10166315734386444, "learning_rate": 5.002701701103846e-06, "loss": 0.006, "step": 29190 }, { "epoch": 1.7142186215803688, "grad_norm": 2.292682647705078, "learning_rate": 4.982558753154009e-06, "loss": 0.0072, "step": 29200 }, { "epoch": 1.7148056827521427, "grad_norm": 0.36749956011772156, "learning_rate": 4.962454312282411e-06, "loss": 0.006, "step": 29210 }, { "epoch": 1.7153927439239167, "grad_norm": 0.2949482500553131, "learning_rate": 4.942388395685993e-06, "loss": 0.0075, "step": 29220 }, { "epoch": 1.715979805095691, "grad_norm": 0.047629375010728836, "learning_rate": 4.922361020528782e-06, "loss": 0.0055, "step": 29230 }, { "epoch": 1.7165668662674651, "grad_norm": 0.8646748661994934, "learning_rate": 4.9023722039418015e-06, "loss": 0.0045, "step": 29240 }, { "epoch": 1.7171539274392391, "grad_norm": 0.06569618731737137, "learning_rate": 4.882421963023126e-06, "loss": 0.0045, "step": 29250 }, { "epoch": 1.7177409886110133, "grad_norm": 0.7197391390800476, "learning_rate": 4.86251031483782e-06, "loss": 0.0031, "step": 29260 }, { "epoch": 1.7183280497827873, "grad_norm": 1.2318994998931885, "learning_rate": 4.842637276417927e-06, "loss": 0.0089, "step": 29270 }, { "epoch": 1.7189151109545615, "grad_norm": 0.0834750384092331, "learning_rate": 4.822802864762488e-06, "loss": 0.0053, "step": 29280 }, { "epoch": 1.7195021721263357, "grad_norm": 0.3922010362148285, "learning_rate": 4.80300709683747e-06, "loss": 0.0045, "step": 29290 }, { "epoch": 1.7200892332981097, "grad_norm": 1.6912459135055542, "learning_rate": 4.7832499895758166e-06, "loss": 0.0129, "step": 29300 }, { "epoch": 1.7206762944698837, "grad_norm": 3.4761576652526855, "learning_rate": 4.76353155987736e-06, "loss": 0.0114, "step": 29310 }, { "epoch": 1.7212633556416579, "grad_norm": 0.8199915885925293, "learning_rate": 4.7438518246089245e-06, "loss": 0.0038, "step": 29320 }, { "epoch": 1.721850416813432, "grad_norm": 0.5764483213424683, "learning_rate": 4.724210800604151e-06, "loss": 0.0115, "step": 29330 }, { "epoch": 1.722437477985206, "grad_norm": 0.8750811219215393, "learning_rate": 4.704608504663627e-06, "loss": 0.0288, "step": 29340 }, { "epoch": 1.72302453915698, "grad_norm": 0.31287306547164917, "learning_rate": 4.685044953554768e-06, "loss": 0.0036, "step": 29350 }, { "epoch": 1.7236116003287543, "grad_norm": 1.7458490133285522, "learning_rate": 4.6655201640118775e-06, "loss": 0.0221, "step": 29360 }, { "epoch": 1.7241986615005285, "grad_norm": 0.10908584296703339, "learning_rate": 4.646034152736101e-06, "loss": 0.0082, "step": 29370 }, { "epoch": 1.7247857226723025, "grad_norm": 5.732516765594482, "learning_rate": 4.626586936395411e-06, "loss": 0.0164, "step": 29380 }, { "epoch": 1.7253727838440764, "grad_norm": 0.14866457879543304, "learning_rate": 4.607178531624595e-06, "loss": 0.0053, "step": 29390 }, { "epoch": 1.7259598450158506, "grad_norm": 0.7560992240905762, "learning_rate": 4.5878089550252246e-06, "loss": 0.0149, "step": 29400 }, { "epoch": 1.7265469061876249, "grad_norm": 0.5187071561813354, "learning_rate": 4.568478223165696e-06, "loss": 0.0127, "step": 29410 }, { "epoch": 1.7271339673593988, "grad_norm": 0.7606738805770874, "learning_rate": 4.549186352581131e-06, "loss": 0.005, "step": 29420 }, { "epoch": 1.7277210285311728, "grad_norm": 0.011919623240828514, "learning_rate": 4.529933359773447e-06, "loss": 0.0138, "step": 29430 }, { "epoch": 1.728308089702947, "grad_norm": 2.2456302642822266, "learning_rate": 4.510719261211293e-06, "loss": 0.0097, "step": 29440 }, { "epoch": 1.7288951508747212, "grad_norm": 0.3139123320579529, "learning_rate": 4.491544073330062e-06, "loss": 0.0175, "step": 29450 }, { "epoch": 1.7294822120464952, "grad_norm": 0.6753292083740234, "learning_rate": 4.472407812531831e-06, "loss": 0.0053, "step": 29460 }, { "epoch": 1.7300692732182692, "grad_norm": 0.02748318947851658, "learning_rate": 4.4533104951854255e-06, "loss": 0.0076, "step": 29470 }, { "epoch": 1.7306563343900434, "grad_norm": 0.0061375536024570465, "learning_rate": 4.434252137626305e-06, "loss": 0.0072, "step": 29480 }, { "epoch": 1.7312433955618176, "grad_norm": 1.0382859706878662, "learning_rate": 4.4152327561566455e-06, "loss": 0.008, "step": 29490 }, { "epoch": 1.7318304567335916, "grad_norm": 0.4101167321205139, "learning_rate": 4.3962523670452725e-06, "loss": 0.0043, "step": 29500 }, { "epoch": 1.7324175179053656, "grad_norm": 1.11329984664917, "learning_rate": 4.37731098652766e-06, "loss": 0.0084, "step": 29510 }, { "epoch": 1.7330045790771398, "grad_norm": 0.03493981435894966, "learning_rate": 4.358408630805905e-06, "loss": 0.0094, "step": 29520 }, { "epoch": 1.733591640248914, "grad_norm": 0.4795690178871155, "learning_rate": 4.339545316048721e-06, "loss": 0.0064, "step": 29530 }, { "epoch": 1.7341787014206882, "grad_norm": 0.4783584475517273, "learning_rate": 4.320721058391453e-06, "loss": 0.0092, "step": 29540 }, { "epoch": 1.7347657625924622, "grad_norm": 0.235016867518425, "learning_rate": 4.301935873936003e-06, "loss": 0.0105, "step": 29550 }, { "epoch": 1.7353528237642362, "grad_norm": 2.073943614959717, "learning_rate": 4.28318977875089e-06, "loss": 0.0114, "step": 29560 }, { "epoch": 1.7359398849360104, "grad_norm": 0.09469784796237946, "learning_rate": 4.264482788871149e-06, "loss": 0.0085, "step": 29570 }, { "epoch": 1.7365269461077846, "grad_norm": 0.6828670501708984, "learning_rate": 4.245814920298402e-06, "loss": 0.0066, "step": 29580 }, { "epoch": 1.7371140072795586, "grad_norm": 1.3681111335754395, "learning_rate": 4.227186189000787e-06, "loss": 0.0088, "step": 29590 }, { "epoch": 1.7377010684513325, "grad_norm": 0.7485164999961853, "learning_rate": 4.2085966109129796e-06, "loss": 0.0149, "step": 29600 }, { "epoch": 1.7382881296231067, "grad_norm": 0.06022016331553459, "learning_rate": 4.190046201936154e-06, "loss": 0.0017, "step": 29610 }, { "epoch": 1.738875190794881, "grad_norm": 0.6397087574005127, "learning_rate": 4.171534977937991e-06, "loss": 0.008, "step": 29620 }, { "epoch": 1.739462251966655, "grad_norm": 0.5424182415008545, "learning_rate": 4.153062954752635e-06, "loss": 0.0059, "step": 29630 }, { "epoch": 1.740049313138429, "grad_norm": 0.2838616967201233, "learning_rate": 4.134630148180724e-06, "loss": 0.0057, "step": 29640 }, { "epoch": 1.7406363743102031, "grad_norm": 0.9157633781433105, "learning_rate": 4.1162365739893125e-06, "loss": 0.0068, "step": 29650 }, { "epoch": 1.7412234354819773, "grad_norm": 0.3759089708328247, "learning_rate": 4.0978822479119325e-06, "loss": 0.0123, "step": 29660 }, { "epoch": 1.7418104966537513, "grad_norm": 0.3383537530899048, "learning_rate": 4.0795671856485475e-06, "loss": 0.0051, "step": 29670 }, { "epoch": 1.7423975578255253, "grad_norm": 0.35818520188331604, "learning_rate": 4.061291402865497e-06, "loss": 0.0055, "step": 29680 }, { "epoch": 1.7429846189972995, "grad_norm": 0.48298370838165283, "learning_rate": 4.043054915195566e-06, "loss": 0.006, "step": 29690 }, { "epoch": 1.7435716801690737, "grad_norm": 0.020862950012087822, "learning_rate": 4.024857738237875e-06, "loss": 0.0111, "step": 29700 }, { "epoch": 1.7441587413408477, "grad_norm": 0.2464524656534195, "learning_rate": 4.006699887557974e-06, "loss": 0.0035, "step": 29710 }, { "epoch": 1.7447458025126217, "grad_norm": 0.36007267236709595, "learning_rate": 3.988581378687739e-06, "loss": 0.0125, "step": 29720 }, { "epoch": 1.7453328636843959, "grad_norm": 0.5455897450447083, "learning_rate": 3.970502227125417e-06, "loss": 0.0073, "step": 29730 }, { "epoch": 1.74591992485617, "grad_norm": 4.634277820587158, "learning_rate": 3.952462448335553e-06, "loss": 0.0136, "step": 29740 }, { "epoch": 1.746506986027944, "grad_norm": 0.4345282018184662, "learning_rate": 3.934462057749067e-06, "loss": 0.0085, "step": 29750 }, { "epoch": 1.747094047199718, "grad_norm": 0.7044300436973572, "learning_rate": 3.916501070763124e-06, "loss": 0.0047, "step": 29760 }, { "epoch": 1.7476811083714923, "grad_norm": 1.710367202758789, "learning_rate": 3.898579502741234e-06, "loss": 0.008, "step": 29770 }, { "epoch": 1.7482681695432665, "grad_norm": 1.0720694065093994, "learning_rate": 3.88069736901317e-06, "loss": 0.0053, "step": 29780 }, { "epoch": 1.7488552307150405, "grad_norm": 0.8815639615058899, "learning_rate": 3.8628546848749895e-06, "loss": 0.0106, "step": 29790 }, { "epoch": 1.7494422918868147, "grad_norm": 0.6148375868797302, "learning_rate": 3.845051465588962e-06, "loss": 0.0148, "step": 29800 }, { "epoch": 1.7500293530585886, "grad_norm": 0.7402764558792114, "learning_rate": 3.827287726383644e-06, "loss": 0.0113, "step": 29810 }, { "epoch": 1.7506164142303628, "grad_norm": 0.008104000240564346, "learning_rate": 3.809563482453815e-06, "loss": 0.0087, "step": 29820 }, { "epoch": 1.751203475402137, "grad_norm": 0.34709659218788147, "learning_rate": 3.7918787489604477e-06, "loss": 0.0105, "step": 29830 }, { "epoch": 1.751790536573911, "grad_norm": 1.615087628364563, "learning_rate": 3.7742335410307306e-06, "loss": 0.0073, "step": 29840 }, { "epoch": 1.752377597745685, "grad_norm": 0.2822687327861786, "learning_rate": 3.7566278737580563e-06, "loss": 0.0041, "step": 29850 }, { "epoch": 1.7529646589174592, "grad_norm": 0.0552225224673748, "learning_rate": 3.7390617622019897e-06, "loss": 0.0033, "step": 29860 }, { "epoch": 1.7535517200892334, "grad_norm": 0.9881157875061035, "learning_rate": 3.7215352213882338e-06, "loss": 0.0101, "step": 29870 }, { "epoch": 1.7541387812610074, "grad_norm": 0.030385082587599754, "learning_rate": 3.704048266308685e-06, "loss": 0.0121, "step": 29880 }, { "epoch": 1.7547258424327814, "grad_norm": 0.033014968037605286, "learning_rate": 3.6866009119213283e-06, "loss": 0.0086, "step": 29890 }, { "epoch": 1.7553129036045556, "grad_norm": 0.3687360882759094, "learning_rate": 3.6691931731503425e-06, "loss": 0.0102, "step": 29900 }, { "epoch": 1.7558999647763298, "grad_norm": 0.4449838101863861, "learning_rate": 3.651825064885955e-06, "loss": 0.0165, "step": 29910 }, { "epoch": 1.7564870259481038, "grad_norm": 0.35303670167922974, "learning_rate": 3.6344966019845385e-06, "loss": 0.0044, "step": 29920 }, { "epoch": 1.7570740871198778, "grad_norm": 0.14896978437900543, "learning_rate": 3.6172077992685182e-06, "loss": 0.0031, "step": 29930 }, { "epoch": 1.757661148291652, "grad_norm": 0.06323855370283127, "learning_rate": 3.5999586715264267e-06, "loss": 0.0104, "step": 29940 }, { "epoch": 1.7582482094634262, "grad_norm": 1.1001636981964111, "learning_rate": 3.5827492335128333e-06, "loss": 0.0045, "step": 29950 }, { "epoch": 1.7588352706352002, "grad_norm": 0.923254132270813, "learning_rate": 3.5655794999483847e-06, "loss": 0.01, "step": 29960 }, { "epoch": 1.7594223318069742, "grad_norm": 1.3027571439743042, "learning_rate": 3.5484494855197505e-06, "loss": 0.0047, "step": 29970 }, { "epoch": 1.7600093929787484, "grad_norm": 0.7019350528717041, "learning_rate": 3.5313592048796086e-06, "loss": 0.0105, "step": 29980 }, { "epoch": 1.7605964541505226, "grad_norm": 0.11337191611528397, "learning_rate": 3.514308672646682e-06, "loss": 0.0063, "step": 29990 }, { "epoch": 1.7611835153222966, "grad_norm": 0.44551828503608704, "learning_rate": 3.497297903405666e-06, "loss": 0.0087, "step": 30000 }, { "epoch": 1.7611835153222966, "eval_loss": 0.5214746594429016, "eval_runtime": 269.6304, "eval_samples_per_second": 3.505, "eval_steps_per_second": 3.505, "step": 30000 }, { "epoch": 1.7617705764940705, "grad_norm": 0.008360777050256729, "learning_rate": 3.4803269117072546e-06, "loss": 0.0062, "step": 30010 }, { "epoch": 1.7623576376658447, "grad_norm": 0.7833142876625061, "learning_rate": 3.4633957120681293e-06, "loss": 0.011, "step": 30020 }, { "epoch": 1.762944698837619, "grad_norm": 0.09515856206417084, "learning_rate": 3.4465043189709168e-06, "loss": 0.0099, "step": 30030 }, { "epoch": 1.763531760009393, "grad_norm": 0.44629839062690735, "learning_rate": 3.429652746864187e-06, "loss": 0.0089, "step": 30040 }, { "epoch": 1.764118821181167, "grad_norm": 0.09620211273431778, "learning_rate": 3.4128410101624817e-06, "loss": 0.0093, "step": 30050 }, { "epoch": 1.7647058823529411, "grad_norm": 4.587845325469971, "learning_rate": 3.396069123246226e-06, "loss": 0.0083, "step": 30060 }, { "epoch": 1.7652929435247153, "grad_norm": 1.1827245950698853, "learning_rate": 3.379337100461788e-06, "loss": 0.0076, "step": 30070 }, { "epoch": 1.7658800046964895, "grad_norm": 0.8300924897193909, "learning_rate": 3.3626449561214245e-06, "loss": 0.0062, "step": 30080 }, { "epoch": 1.7664670658682635, "grad_norm": 0.024658210575580597, "learning_rate": 3.3459927045032867e-06, "loss": 0.0094, "step": 30090 }, { "epoch": 1.7670541270400375, "grad_norm": 0.7816444635391235, "learning_rate": 3.3293803598514086e-06, "loss": 0.0219, "step": 30100 }, { "epoch": 1.7676411882118117, "grad_norm": 3.3857486248016357, "learning_rate": 3.312807936375656e-06, "loss": 0.0043, "step": 30110 }, { "epoch": 1.768228249383586, "grad_norm": 0.03308376297354698, "learning_rate": 3.29627544825179e-06, "loss": 0.0066, "step": 30120 }, { "epoch": 1.76881531055536, "grad_norm": 0.03391629457473755, "learning_rate": 3.2797829096213818e-06, "loss": 0.007, "step": 30130 }, { "epoch": 1.7694023717271339, "grad_norm": 2.114734411239624, "learning_rate": 3.263330334591852e-06, "loss": 0.0058, "step": 30140 }, { "epoch": 1.769989432898908, "grad_norm": 0.18660058081150055, "learning_rate": 3.246917737236416e-06, "loss": 0.0022, "step": 30150 }, { "epoch": 1.7705764940706823, "grad_norm": 0.5085205435752869, "learning_rate": 3.2305451315941095e-06, "loss": 0.0037, "step": 30160 }, { "epoch": 1.7711635552424563, "grad_norm": 0.520336389541626, "learning_rate": 3.2142125316697467e-06, "loss": 0.008, "step": 30170 }, { "epoch": 1.7717506164142303, "grad_norm": 0.6636307239532471, "learning_rate": 3.1979199514339307e-06, "loss": 0.0051, "step": 30180 }, { "epoch": 1.7723376775860045, "grad_norm": 0.32753074169158936, "learning_rate": 3.18166740482303e-06, "loss": 0.021, "step": 30190 }, { "epoch": 1.7729247387577787, "grad_norm": 0.15476436913013458, "learning_rate": 3.1654549057391737e-06, "loss": 0.0064, "step": 30200 }, { "epoch": 1.7735117999295527, "grad_norm": 0.4490971267223358, "learning_rate": 3.1492824680502244e-06, "loss": 0.0178, "step": 30210 }, { "epoch": 1.7740988611013266, "grad_norm": 0.7970400452613831, "learning_rate": 3.1331501055897883e-06, "loss": 0.0055, "step": 30220 }, { "epoch": 1.7746859222731008, "grad_norm": 0.03366592898964882, "learning_rate": 3.1170578321571887e-06, "loss": 0.0084, "step": 30230 }, { "epoch": 1.775272983444875, "grad_norm": 0.8563421368598938, "learning_rate": 3.1010056615174365e-06, "loss": 0.0086, "step": 30240 }, { "epoch": 1.775860044616649, "grad_norm": 0.33078980445861816, "learning_rate": 3.084993607401293e-06, "loss": 0.0028, "step": 30250 }, { "epoch": 1.776447105788423, "grad_norm": 1.528594732284546, "learning_rate": 3.069021683505141e-06, "loss": 0.0122, "step": 30260 }, { "epoch": 1.7770341669601972, "grad_norm": 0.8881283402442932, "learning_rate": 3.05308990349108e-06, "loss": 0.015, "step": 30270 }, { "epoch": 1.7776212281319714, "grad_norm": 0.05420268699526787, "learning_rate": 3.0371982809868527e-06, "loss": 0.0049, "step": 30280 }, { "epoch": 1.7782082893037454, "grad_norm": 0.8013023138046265, "learning_rate": 3.021346829585847e-06, "loss": 0.0043, "step": 30290 }, { "epoch": 1.7787953504755194, "grad_norm": 1.3829023838043213, "learning_rate": 3.005535562847117e-06, "loss": 0.0059, "step": 30300 }, { "epoch": 1.7793824116472936, "grad_norm": 0.25038641691207886, "learning_rate": 2.9897644942953162e-06, "loss": 0.0142, "step": 30310 }, { "epoch": 1.7799694728190678, "grad_norm": 1.0973732471466064, "learning_rate": 2.9740336374207147e-06, "loss": 0.0059, "step": 30320 }, { "epoch": 1.7805565339908418, "grad_norm": 3.689282178878784, "learning_rate": 2.9583430056792096e-06, "loss": 0.0098, "step": 30330 }, { "epoch": 1.7811435951626158, "grad_norm": 1.0719764232635498, "learning_rate": 2.9426926124922592e-06, "loss": 0.012, "step": 30340 }, { "epoch": 1.78173065633439, "grad_norm": 0.21206724643707275, "learning_rate": 2.927082471246917e-06, "loss": 0.0115, "step": 30350 }, { "epoch": 1.7823177175061642, "grad_norm": 1.1170448064804077, "learning_rate": 2.911512595295818e-06, "loss": 0.0078, "step": 30360 }, { "epoch": 1.7829047786779384, "grad_norm": 0.8349093198776245, "learning_rate": 2.8959829979571306e-06, "loss": 0.0055, "step": 30370 }, { "epoch": 1.7834918398497124, "grad_norm": 2.214409351348877, "learning_rate": 2.880493692514602e-06, "loss": 0.0079, "step": 30380 }, { "epoch": 1.7840789010214864, "grad_norm": 0.5047364234924316, "learning_rate": 2.8650446922174723e-06, "loss": 0.0042, "step": 30390 }, { "epoch": 1.7846659621932606, "grad_norm": 1.0878037214279175, "learning_rate": 2.849636010280543e-06, "loss": 0.0113, "step": 30400 }, { "epoch": 1.7852530233650348, "grad_norm": 0.621374785900116, "learning_rate": 2.8342676598841044e-06, "loss": 0.0053, "step": 30410 }, { "epoch": 1.7858400845368088, "grad_norm": 1.328795313835144, "learning_rate": 2.818939654173952e-06, "loss": 0.016, "step": 30420 }, { "epoch": 1.7864271457085827, "grad_norm": 0.5891567468643188, "learning_rate": 2.803652006261387e-06, "loss": 0.0082, "step": 30430 }, { "epoch": 1.787014206880357, "grad_norm": 1.2182434797286987, "learning_rate": 2.7884047292231817e-06, "loss": 0.0109, "step": 30440 }, { "epoch": 1.7876012680521312, "grad_norm": 1.2911088466644287, "learning_rate": 2.7731978361015543e-06, "loss": 0.0237, "step": 30450 }, { "epoch": 1.7881883292239051, "grad_norm": 0.37283286452293396, "learning_rate": 2.75803133990421e-06, "loss": 0.0095, "step": 30460 }, { "epoch": 1.7887753903956791, "grad_norm": 0.6658843159675598, "learning_rate": 2.742905253604272e-06, "loss": 0.0017, "step": 30470 }, { "epoch": 1.7893624515674533, "grad_norm": 0.2888129949569702, "learning_rate": 2.727819590140335e-06, "loss": 0.0072, "step": 30480 }, { "epoch": 1.7899495127392275, "grad_norm": 0.9133905172348022, "learning_rate": 2.712774362416376e-06, "loss": 0.0117, "step": 30490 }, { "epoch": 1.7905365739110015, "grad_norm": 0.02645767293870449, "learning_rate": 2.6977695833018014e-06, "loss": 0.0066, "step": 30500 }, { "epoch": 1.7911236350827755, "grad_norm": 0.7585176229476929, "learning_rate": 2.6828052656314384e-06, "loss": 0.004, "step": 30510 }, { "epoch": 1.7917106962545497, "grad_norm": 0.5185384750366211, "learning_rate": 2.6678814222054593e-06, "loss": 0.0022, "step": 30520 }, { "epoch": 1.792297757426324, "grad_norm": 1.7649006843566895, "learning_rate": 2.652998065789453e-06, "loss": 0.003, "step": 30530 }, { "epoch": 1.792884818598098, "grad_norm": 0.07767531275749207, "learning_rate": 2.638155209114368e-06, "loss": 0.0062, "step": 30540 }, { "epoch": 1.7934718797698719, "grad_norm": 1.7642700672149658, "learning_rate": 2.623352864876505e-06, "loss": 0.0042, "step": 30550 }, { "epoch": 1.794058940941646, "grad_norm": 2.1818125247955322, "learning_rate": 2.6085910457375073e-06, "loss": 0.0127, "step": 30560 }, { "epoch": 1.7946460021134203, "grad_norm": 0.2745802402496338, "learning_rate": 2.5938697643243635e-06, "loss": 0.0067, "step": 30570 }, { "epoch": 1.7952330632851943, "grad_norm": 0.0070999846793711185, "learning_rate": 2.5791890332293788e-06, "loss": 0.0073, "step": 30580 }, { "epoch": 1.7958201244569683, "grad_norm": 0.15600904822349548, "learning_rate": 2.56454886501018e-06, "loss": 0.0079, "step": 30590 }, { "epoch": 1.7964071856287425, "grad_norm": 0.30416813492774963, "learning_rate": 2.5499492721896887e-06, "loss": 0.0103, "step": 30600 }, { "epoch": 1.7969942468005167, "grad_norm": 0.14762407541275024, "learning_rate": 2.535390267256138e-06, "loss": 0.0072, "step": 30610 }, { "epoch": 1.7975813079722909, "grad_norm": 0.7133505940437317, "learning_rate": 2.5208718626630045e-06, "loss": 0.0128, "step": 30620 }, { "epoch": 1.7981683691440649, "grad_norm": 0.5686302185058594, "learning_rate": 2.5063940708290823e-06, "loss": 0.0163, "step": 30630 }, { "epoch": 1.7987554303158388, "grad_norm": 1.847533941268921, "learning_rate": 2.491956904138393e-06, "loss": 0.0067, "step": 30640 }, { "epoch": 1.799342491487613, "grad_norm": 1.142364740371704, "learning_rate": 2.4775603749402187e-06, "loss": 0.0143, "step": 30650 }, { "epoch": 1.7999295526593873, "grad_norm": 1.1415369510650635, "learning_rate": 2.4632044955490983e-06, "loss": 0.0041, "step": 30660 }, { "epoch": 1.8005166138311612, "grad_norm": 1.596919298171997, "learning_rate": 2.4488892782447593e-06, "loss": 0.0059, "step": 30670 }, { "epoch": 1.8011036750029352, "grad_norm": 1.4813252687454224, "learning_rate": 2.4346147352721836e-06, "loss": 0.0076, "step": 30680 }, { "epoch": 1.8016907361747094, "grad_norm": 0.3757873475551605, "learning_rate": 2.4203808788415438e-06, "loss": 0.0068, "step": 30690 }, { "epoch": 1.8022777973464836, "grad_norm": 1.8511054515838623, "learning_rate": 2.406187721128217e-06, "loss": 0.0167, "step": 30700 }, { "epoch": 1.8028648585182576, "grad_norm": 0.3293643295764923, "learning_rate": 2.3920352742727636e-06, "loss": 0.0035, "step": 30710 }, { "epoch": 1.8034519196900316, "grad_norm": 3.0010180473327637, "learning_rate": 2.377923550380934e-06, "loss": 0.013, "step": 30720 }, { "epoch": 1.8040389808618058, "grad_norm": 0.07963775843381882, "learning_rate": 2.3638525615236164e-06, "loss": 0.0118, "step": 30730 }, { "epoch": 1.80462604203358, "grad_norm": 0.4447624683380127, "learning_rate": 2.3498223197368828e-06, "loss": 0.0094, "step": 30740 }, { "epoch": 1.805213103205354, "grad_norm": 2.473619222640991, "learning_rate": 2.3358328370219286e-06, "loss": 0.0083, "step": 30750 }, { "epoch": 1.805800164377128, "grad_norm": 0.6622852683067322, "learning_rate": 2.3218841253451084e-06, "loss": 0.0042, "step": 30760 }, { "epoch": 1.8063872255489022, "grad_norm": 1.3558241128921509, "learning_rate": 2.3079761966378787e-06, "loss": 0.0063, "step": 30770 }, { "epoch": 1.8069742867206764, "grad_norm": 0.33775874972343445, "learning_rate": 2.2941090627968287e-06, "loss": 0.002, "step": 30780 }, { "epoch": 1.8075613478924504, "grad_norm": 0.08536079525947571, "learning_rate": 2.280282735683653e-06, "loss": 0.0056, "step": 30790 }, { "epoch": 1.8081484090642244, "grad_norm": 0.5035672187805176, "learning_rate": 2.266497227125114e-06, "loss": 0.01, "step": 30800 }, { "epoch": 1.8087354702359986, "grad_norm": 0.8908909559249878, "learning_rate": 2.2527525489131008e-06, "loss": 0.0125, "step": 30810 }, { "epoch": 1.8093225314077728, "grad_norm": 0.0050659808330237865, "learning_rate": 2.2390487128045256e-06, "loss": 0.0136, "step": 30820 }, { "epoch": 1.8099095925795468, "grad_norm": 1.6174921989440918, "learning_rate": 2.2253857305214233e-06, "loss": 0.0143, "step": 30830 }, { "epoch": 1.8104966537513207, "grad_norm": 1.1043347120285034, "learning_rate": 2.211763613750839e-06, "loss": 0.0058, "step": 30840 }, { "epoch": 1.811083714923095, "grad_norm": 0.1730615496635437, "learning_rate": 2.1981823741448805e-06, "loss": 0.0077, "step": 30850 }, { "epoch": 1.8116707760948692, "grad_norm": 0.040801744908094406, "learning_rate": 2.1846420233206823e-06, "loss": 0.0056, "step": 30860 }, { "epoch": 1.8122578372666431, "grad_norm": 0.040723178535699844, "learning_rate": 2.1711425728604073e-06, "loss": 0.0087, "step": 30870 }, { "epoch": 1.8128448984384171, "grad_norm": 0.3218896985054016, "learning_rate": 2.1576840343112414e-06, "loss": 0.006, "step": 30880 }, { "epoch": 1.8134319596101913, "grad_norm": 1.7537883520126343, "learning_rate": 2.1442664191853645e-06, "loss": 0.0051, "step": 30890 }, { "epoch": 1.8140190207819655, "grad_norm": 1.382644772529602, "learning_rate": 2.130889738959946e-06, "loss": 0.0103, "step": 30900 }, { "epoch": 1.8146060819537397, "grad_norm": 0.9474086165428162, "learning_rate": 2.1175540050771492e-06, "loss": 0.0128, "step": 30910 }, { "epoch": 1.8151931431255137, "grad_norm": 0.175063356757164, "learning_rate": 2.1042592289441277e-06, "loss": 0.0038, "step": 30920 }, { "epoch": 1.8157802042972877, "grad_norm": 1.8063702583312988, "learning_rate": 2.0910054219329624e-06, "loss": 0.0079, "step": 30930 }, { "epoch": 1.816367265469062, "grad_norm": 0.24421249330043793, "learning_rate": 2.0777925953807288e-06, "loss": 0.0115, "step": 30940 }, { "epoch": 1.8169543266408361, "grad_norm": 0.6284900307655334, "learning_rate": 2.0646207605894198e-06, "loss": 0.0171, "step": 30950 }, { "epoch": 1.81754138781261, "grad_norm": 0.24449436366558075, "learning_rate": 2.051489928825995e-06, "loss": 0.0069, "step": 30960 }, { "epoch": 1.818128448984384, "grad_norm": 0.020610427483916283, "learning_rate": 2.0384001113222972e-06, "loss": 0.0046, "step": 30970 }, { "epoch": 1.8187155101561583, "grad_norm": 0.03185024484992027, "learning_rate": 2.0253513192751373e-06, "loss": 0.0107, "step": 30980 }, { "epoch": 1.8193025713279325, "grad_norm": 0.3091789186000824, "learning_rate": 2.0123435638461863e-06, "loss": 0.0079, "step": 30990 }, { "epoch": 1.8198896324997065, "grad_norm": 0.3415491282939911, "learning_rate": 1.999376856162044e-06, "loss": 0.0087, "step": 31000 }, { "epoch": 1.8204766936714805, "grad_norm": 0.35340416431427, "learning_rate": 1.986451207314194e-06, "loss": 0.0058, "step": 31010 }, { "epoch": 1.8210637548432547, "grad_norm": 0.31529369950294495, "learning_rate": 1.9735666283589972e-06, "loss": 0.009, "step": 31020 }, { "epoch": 1.8216508160150289, "grad_norm": 0.44021913409233093, "learning_rate": 1.9607231303176653e-06, "loss": 0.0021, "step": 31030 }, { "epoch": 1.8222378771868029, "grad_norm": 0.567622721195221, "learning_rate": 1.9479207241763055e-06, "loss": 0.0079, "step": 31040 }, { "epoch": 1.8228249383585768, "grad_norm": 0.35683673620224, "learning_rate": 1.9351594208858405e-06, "loss": 0.0025, "step": 31050 }, { "epoch": 1.823411999530351, "grad_norm": 0.4013507664203644, "learning_rate": 1.9224392313620665e-06, "loss": 0.0101, "step": 31060 }, { "epoch": 1.8239990607021253, "grad_norm": 0.07524754852056503, "learning_rate": 1.909760166485586e-06, "loss": 0.0078, "step": 31070 }, { "epoch": 1.8245861218738992, "grad_norm": 0.257982075214386, "learning_rate": 1.8971222371018393e-06, "loss": 0.0039, "step": 31080 }, { "epoch": 1.8251731830456732, "grad_norm": 0.06671323627233505, "learning_rate": 1.8845254540210743e-06, "loss": 0.0076, "step": 31090 }, { "epoch": 1.8257602442174474, "grad_norm": 0.3007630407810211, "learning_rate": 1.8719698280183328e-06, "loss": 0.0098, "step": 31100 }, { "epoch": 1.8263473053892216, "grad_norm": 0.9302312135696411, "learning_rate": 1.8594553698334793e-06, "loss": 0.0077, "step": 31110 }, { "epoch": 1.8269343665609956, "grad_norm": 1.2461236715316772, "learning_rate": 1.8469820901711344e-06, "loss": 0.0079, "step": 31120 }, { "epoch": 1.8275214277327696, "grad_norm": 0.025279074907302856, "learning_rate": 1.8345499997007243e-06, "loss": 0.0033, "step": 31130 }, { "epoch": 1.8281084889045438, "grad_norm": 0.33775296807289124, "learning_rate": 1.8221591090564038e-06, "loss": 0.0105, "step": 31140 }, { "epoch": 1.828695550076318, "grad_norm": 0.12173739075660706, "learning_rate": 1.8098094288371336e-06, "loss": 0.0077, "step": 31150 }, { "epoch": 1.8292826112480922, "grad_norm": 0.01120838150382042, "learning_rate": 1.7975009696065859e-06, "loss": 0.0254, "step": 31160 }, { "epoch": 1.8298696724198662, "grad_norm": 0.2868688106536865, "learning_rate": 1.785233741893183e-06, "loss": 0.003, "step": 31170 }, { "epoch": 1.8304567335916402, "grad_norm": 0.06971792131662369, "learning_rate": 1.7730077561900926e-06, "loss": 0.027, "step": 31180 }, { "epoch": 1.8310437947634144, "grad_norm": 0.09698063135147095, "learning_rate": 1.760823022955188e-06, "loss": 0.01, "step": 31190 }, { "epoch": 1.8316308559351886, "grad_norm": 0.0902014970779419, "learning_rate": 1.748679552611071e-06, "loss": 0.0241, "step": 31200 }, { "epoch": 1.8322179171069626, "grad_norm": 1.3793613910675049, "learning_rate": 1.736577355545027e-06, "loss": 0.0075, "step": 31210 }, { "epoch": 1.8328049782787366, "grad_norm": 1.8001060485839844, "learning_rate": 1.7245164421090533e-06, "loss": 0.0266, "step": 31220 }, { "epoch": 1.8333920394505108, "grad_norm": 0.11762313544750214, "learning_rate": 1.7124968226198357e-06, "loss": 0.0093, "step": 31230 }, { "epoch": 1.833979100622285, "grad_norm": 0.47632017731666565, "learning_rate": 1.7005185073587337e-06, "loss": 0.0097, "step": 31240 }, { "epoch": 1.834566161794059, "grad_norm": 0.12678590416908264, "learning_rate": 1.6885815065717625e-06, "loss": 0.0081, "step": 31250 }, { "epoch": 1.835153222965833, "grad_norm": 0.2040066421031952, "learning_rate": 1.676685830469621e-06, "loss": 0.0047, "step": 31260 }, { "epoch": 1.8357402841376071, "grad_norm": 1.1790368556976318, "learning_rate": 1.6648314892276362e-06, "loss": 0.0102, "step": 31270 }, { "epoch": 1.8363273453093814, "grad_norm": 0.33532023429870605, "learning_rate": 1.6530184929857973e-06, "loss": 0.0092, "step": 31280 }, { "epoch": 1.8369144064811553, "grad_norm": 1.6581460237503052, "learning_rate": 1.6412468518487212e-06, "loss": 0.0061, "step": 31290 }, { "epoch": 1.8375014676529293, "grad_norm": 0.6531566381454468, "learning_rate": 1.629516575885659e-06, "loss": 0.0061, "step": 31300 }, { "epoch": 1.8380885288247035, "grad_norm": 0.014209272339940071, "learning_rate": 1.617827675130451e-06, "loss": 0.0124, "step": 31310 }, { "epoch": 1.8386755899964777, "grad_norm": 0.8777631521224976, "learning_rate": 1.6061801595815774e-06, "loss": 0.0073, "step": 31320 }, { "epoch": 1.8392626511682517, "grad_norm": 0.019044015556573868, "learning_rate": 1.5945740392021013e-06, "loss": 0.0037, "step": 31330 }, { "epoch": 1.8398497123400257, "grad_norm": 0.193769633769989, "learning_rate": 1.5830093239196764e-06, "loss": 0.0148, "step": 31340 }, { "epoch": 1.8404367735118, "grad_norm": 1.4542549848556519, "learning_rate": 1.5714860236265506e-06, "loss": 0.0049, "step": 31350 }, { "epoch": 1.8410238346835741, "grad_norm": 0.3220404088497162, "learning_rate": 1.5600041481795336e-06, "loss": 0.0086, "step": 31360 }, { "epoch": 1.841610895855348, "grad_norm": 1.2098355293273926, "learning_rate": 1.5485637074000247e-06, "loss": 0.0236, "step": 31370 }, { "epoch": 1.842197957027122, "grad_norm": 1.4953199625015259, "learning_rate": 1.5371647110739408e-06, "loss": 0.0068, "step": 31380 }, { "epoch": 1.8427850181988963, "grad_norm": 2.258049249649048, "learning_rate": 1.5258071689517872e-06, "loss": 0.0106, "step": 31390 }, { "epoch": 1.8433720793706705, "grad_norm": 2.7513246536254883, "learning_rate": 1.514491090748571e-06, "loss": 0.0147, "step": 31400 }, { "epoch": 1.8439591405424445, "grad_norm": 0.9112816452980042, "learning_rate": 1.5032164861438825e-06, "loss": 0.0022, "step": 31410 }, { "epoch": 1.8445462017142185, "grad_norm": 0.37844619154930115, "learning_rate": 1.4919833647817905e-06, "loss": 0.0032, "step": 31420 }, { "epoch": 1.8451332628859927, "grad_norm": 1.8407471179962158, "learning_rate": 1.4807917362709033e-06, "loss": 0.0159, "step": 31430 }, { "epoch": 1.8457203240577669, "grad_norm": 1.7563832998275757, "learning_rate": 1.4696416101843246e-06, "loss": 0.0042, "step": 31440 }, { "epoch": 1.846307385229541, "grad_norm": 1.1273623704910278, "learning_rate": 1.4585329960596639e-06, "loss": 0.0227, "step": 31450 }, { "epoch": 1.846894446401315, "grad_norm": 0.02268887870013714, "learning_rate": 1.4474659033990313e-06, "loss": 0.0028, "step": 31460 }, { "epoch": 1.847481507573089, "grad_norm": 0.5669854879379272, "learning_rate": 1.4364403416690042e-06, "loss": 0.0084, "step": 31470 }, { "epoch": 1.8480685687448632, "grad_norm": 0.17324286699295044, "learning_rate": 1.42545632030065e-06, "loss": 0.0087, "step": 31480 }, { "epoch": 1.8486556299166375, "grad_norm": 0.06148910894989967, "learning_rate": 1.4145138486894804e-06, "loss": 0.0085, "step": 31490 }, { "epoch": 1.8492426910884114, "grad_norm": 2.422051191329956, "learning_rate": 1.4036129361954974e-06, "loss": 0.0189, "step": 31500 }, { "epoch": 1.8498297522601854, "grad_norm": 0.03764136880636215, "learning_rate": 1.3927535921431255e-06, "loss": 0.0038, "step": 31510 }, { "epoch": 1.8504168134319596, "grad_norm": 0.2697269320487976, "learning_rate": 1.381935825821251e-06, "loss": 0.0096, "step": 31520 }, { "epoch": 1.8510038746037338, "grad_norm": 0.9931702613830566, "learning_rate": 1.371159646483189e-06, "loss": 0.0193, "step": 31530 }, { "epoch": 1.8515909357755078, "grad_norm": 0.14079973101615906, "learning_rate": 1.360425063346682e-06, "loss": 0.0106, "step": 31540 }, { "epoch": 1.8521779969472818, "grad_norm": 0.838455855846405, "learning_rate": 1.3497320855938855e-06, "loss": 0.0138, "step": 31550 }, { "epoch": 1.852765058119056, "grad_norm": 0.6983908414840698, "learning_rate": 1.3390807223713886e-06, "loss": 0.0088, "step": 31560 }, { "epoch": 1.8533521192908302, "grad_norm": 0.18073470890522003, "learning_rate": 1.328470982790142e-06, "loss": 0.0028, "step": 31570 }, { "epoch": 1.8539391804626042, "grad_norm": 0.08848128467798233, "learning_rate": 1.3179028759255475e-06, "loss": 0.004, "step": 31580 }, { "epoch": 1.8545262416343782, "grad_norm": 0.6067091226577759, "learning_rate": 1.3073764108173459e-06, "loss": 0.0178, "step": 31590 }, { "epoch": 1.8551133028061524, "grad_norm": 0.6836615204811096, "learning_rate": 1.2968915964696904e-06, "loss": 0.0018, "step": 31600 }, { "epoch": 1.8557003639779266, "grad_norm": 0.6487021446228027, "learning_rate": 1.2864484418510959e-06, "loss": 0.0193, "step": 31610 }, { "epoch": 1.8562874251497006, "grad_norm": 0.6123121976852417, "learning_rate": 1.2760469558944277e-06, "loss": 0.0094, "step": 31620 }, { "epoch": 1.8568744863214746, "grad_norm": 2.8882272243499756, "learning_rate": 1.2656871474969357e-06, "loss": 0.0119, "step": 31630 }, { "epoch": 1.8574615474932488, "grad_norm": 1.7780054807662964, "learning_rate": 1.2553690255201977e-06, "loss": 0.0137, "step": 31640 }, { "epoch": 1.858048608665023, "grad_norm": 0.5469017028808594, "learning_rate": 1.2450925987901595e-06, "loss": 0.0081, "step": 31650 }, { "epoch": 1.858635669836797, "grad_norm": 0.0939243733882904, "learning_rate": 1.234857876097062e-06, "loss": 0.0096, "step": 31660 }, { "epoch": 1.859222731008571, "grad_norm": 0.1904410570859909, "learning_rate": 1.224664866195513e-06, "loss": 0.0021, "step": 31670 }, { "epoch": 1.8598097921803451, "grad_norm": 0.26256752014160156, "learning_rate": 1.214513577804416e-06, "loss": 0.0267, "step": 31680 }, { "epoch": 1.8603968533521194, "grad_norm": 0.22087575495243073, "learning_rate": 1.204404019606986e-06, "loss": 0.0191, "step": 31690 }, { "epoch": 1.8609839145238933, "grad_norm": 0.507247805595398, "learning_rate": 1.194336200250762e-06, "loss": 0.0023, "step": 31700 }, { "epoch": 1.8615709756956675, "grad_norm": 1.6338211297988892, "learning_rate": 1.1843101283475655e-06, "loss": 0.0091, "step": 31710 }, { "epoch": 1.8621580368674415, "grad_norm": 0.023288412019610405, "learning_rate": 1.174325812473509e-06, "loss": 0.0101, "step": 31720 }, { "epoch": 1.8627450980392157, "grad_norm": 3.4685113430023193, "learning_rate": 1.1643832611689943e-06, "loss": 0.0055, "step": 31730 }, { "epoch": 1.86333215921099, "grad_norm": 0.18406684696674347, "learning_rate": 1.1544824829386846e-06, "loss": 0.01, "step": 31740 }, { "epoch": 1.863919220382764, "grad_norm": 0.006672476418316364, "learning_rate": 1.1446234862515225e-06, "loss": 0.0034, "step": 31750 }, { "epoch": 1.864506281554538, "grad_norm": 0.5018321871757507, "learning_rate": 1.1348062795407233e-06, "loss": 0.0067, "step": 31760 }, { "epoch": 1.865093342726312, "grad_norm": 1.099258303642273, "learning_rate": 1.1250308712037306e-06, "loss": 0.0115, "step": 31770 }, { "epoch": 1.8656804038980863, "grad_norm": 0.03937417268753052, "learning_rate": 1.1152972696022445e-06, "loss": 0.0028, "step": 31780 }, { "epoch": 1.8662674650698603, "grad_norm": 0.09545544534921646, "learning_rate": 1.105605483062211e-06, "loss": 0.0215, "step": 31790 }, { "epoch": 1.8668545262416343, "grad_norm": 0.21313613653182983, "learning_rate": 1.0959555198738037e-06, "loss": 0.0022, "step": 31800 }, { "epoch": 1.8674415874134085, "grad_norm": 0.9208429455757141, "learning_rate": 1.0863473882914143e-06, "loss": 0.0061, "step": 31810 }, { "epoch": 1.8680286485851827, "grad_norm": 0.745741605758667, "learning_rate": 1.076781096533669e-06, "loss": 0.0231, "step": 31820 }, { "epoch": 1.8686157097569567, "grad_norm": 0.42319658398628235, "learning_rate": 1.0672566527833827e-06, "loss": 0.0133, "step": 31830 }, { "epoch": 1.8692027709287307, "grad_norm": 0.4425826072692871, "learning_rate": 1.0577740651876001e-06, "loss": 0.0118, "step": 31840 }, { "epoch": 1.8697898321005049, "grad_norm": 0.39463743567466736, "learning_rate": 1.048333341857538e-06, "loss": 0.0017, "step": 31850 }, { "epoch": 1.870376893272279, "grad_norm": 0.3169250190258026, "learning_rate": 1.0389344908686205e-06, "loss": 0.0086, "step": 31860 }, { "epoch": 1.870963954444053, "grad_norm": 1.0209932327270508, "learning_rate": 1.0295775202604495e-06, "loss": 0.0105, "step": 31870 }, { "epoch": 1.871551015615827, "grad_norm": 0.009926820173859596, "learning_rate": 1.020262438036801e-06, "loss": 0.0028, "step": 31880 }, { "epoch": 1.8721380767876012, "grad_norm": 1.3148220777511597, "learning_rate": 1.0109892521656283e-06, "loss": 0.0065, "step": 31890 }, { "epoch": 1.8727251379593755, "grad_norm": 1.018448829650879, "learning_rate": 1.0017579705790314e-06, "loss": 0.0122, "step": 31900 }, { "epoch": 1.8733121991311494, "grad_norm": 0.7915276885032654, "learning_rate": 9.925686011732826e-07, "loss": 0.012, "step": 31910 }, { "epoch": 1.8738992603029234, "grad_norm": 0.25855815410614014, "learning_rate": 9.834211518087887e-07, "loss": 0.0028, "step": 31920 }, { "epoch": 1.8744863214746976, "grad_norm": 0.4287143647670746, "learning_rate": 9.743156303101185e-07, "loss": 0.008, "step": 31930 }, { "epoch": 1.8750733826464718, "grad_norm": 1.286320447921753, "learning_rate": 9.652520444659585e-07, "loss": 0.0135, "step": 31940 }, { "epoch": 1.8756604438182458, "grad_norm": 0.2922452688217163, "learning_rate": 9.562304020291346e-07, "loss": 0.0097, "step": 31950 }, { "epoch": 1.8762475049900198, "grad_norm": 0.023812199011445045, "learning_rate": 9.472507107165852e-07, "loss": 0.0032, "step": 31960 }, { "epoch": 1.876834566161794, "grad_norm": 0.014829243533313274, "learning_rate": 9.383129782093713e-07, "loss": 0.0047, "step": 31970 }, { "epoch": 1.8774216273335682, "grad_norm": 0.3453643023967743, "learning_rate": 9.294172121526668e-07, "loss": 0.0108, "step": 31980 }, { "epoch": 1.8780086885053424, "grad_norm": 0.00518003711476922, "learning_rate": 9.205634201557456e-07, "loss": 0.0043, "step": 31990 }, { "epoch": 1.8785957496771164, "grad_norm": 0.12136801332235336, "learning_rate": 9.11751609791972e-07, "loss": 0.0088, "step": 32000 }, { "epoch": 1.8791828108488904, "grad_norm": 1.9268333911895752, "learning_rate": 9.029817885988001e-07, "loss": 0.006, "step": 32010 }, { "epoch": 1.8797698720206646, "grad_norm": 0.05153471976518631, "learning_rate": 8.942539640777792e-07, "loss": 0.0064, "step": 32020 }, { "epoch": 1.8803569331924388, "grad_norm": 1.7470871210098267, "learning_rate": 8.855681436945206e-07, "loss": 0.0045, "step": 32030 }, { "epoch": 1.8809439943642128, "grad_norm": 0.8505917191505432, "learning_rate": 8.769243348787148e-07, "loss": 0.0111, "step": 32040 }, { "epoch": 1.8815310555359868, "grad_norm": 0.028677623718976974, "learning_rate": 8.683225450241139e-07, "loss": 0.0073, "step": 32050 }, { "epoch": 1.882118116707761, "grad_norm": 0.007410742342472076, "learning_rate": 8.597627814885323e-07, "loss": 0.0085, "step": 32060 }, { "epoch": 1.8827051778795352, "grad_norm": 1.685816764831543, "learning_rate": 8.512450515938298e-07, "loss": 0.0139, "step": 32070 }, { "epoch": 1.8832922390513092, "grad_norm": 2.025792360305786, "learning_rate": 8.427693626259114e-07, "loss": 0.0088, "step": 32080 }, { "epoch": 1.8838793002230831, "grad_norm": 0.26576316356658936, "learning_rate": 8.343357218347226e-07, "loss": 0.0056, "step": 32090 }, { "epoch": 1.8844663613948573, "grad_norm": 0.031764790415763855, "learning_rate": 8.25944136434248e-07, "loss": 0.0025, "step": 32100 }, { "epoch": 1.8850534225666316, "grad_norm": 0.006499218754470348, "learning_rate": 8.175946136024792e-07, "loss": 0.0046, "step": 32110 }, { "epoch": 1.8856404837384055, "grad_norm": 1.3531373739242554, "learning_rate": 8.092871604814645e-07, "loss": 0.0119, "step": 32120 }, { "epoch": 1.8862275449101795, "grad_norm": 0.14757950603961945, "learning_rate": 8.01021784177225e-07, "loss": 0.006, "step": 32130 }, { "epoch": 1.8868146060819537, "grad_norm": 0.24395480751991272, "learning_rate": 7.927984917598164e-07, "loss": 0.0121, "step": 32140 }, { "epoch": 1.887401667253728, "grad_norm": 3.652184247970581, "learning_rate": 7.846172902632842e-07, "loss": 0.009, "step": 32150 }, { "epoch": 1.887988728425502, "grad_norm": 3.0004146099090576, "learning_rate": 7.764781866856808e-07, "loss": 0.0117, "step": 32160 }, { "epoch": 1.888575789597276, "grad_norm": 0.6791203022003174, "learning_rate": 7.683811879890479e-07, "loss": 0.0074, "step": 32170 }, { "epoch": 1.88916285076905, "grad_norm": 0.012861160561442375, "learning_rate": 7.603263010993955e-07, "loss": 0.0058, "step": 32180 }, { "epoch": 1.8897499119408243, "grad_norm": 0.12536686658859253, "learning_rate": 7.523135329067343e-07, "loss": 0.0023, "step": 32190 }, { "epoch": 1.8903369731125983, "grad_norm": 0.11532700061798096, "learning_rate": 7.443428902650262e-07, "loss": 0.0074, "step": 32200 }, { "epoch": 1.8909240342843723, "grad_norm": 0.24618609249591827, "learning_rate": 7.364143799922119e-07, "loss": 0.0063, "step": 32210 }, { "epoch": 1.8915110954561465, "grad_norm": 1.8910354375839233, "learning_rate": 7.285280088701996e-07, "loss": 0.0099, "step": 32220 }, { "epoch": 1.8920981566279207, "grad_norm": 0.39975517988204956, "learning_rate": 7.206837836448377e-07, "loss": 0.0044, "step": 32230 }, { "epoch": 1.8926852177996947, "grad_norm": 0.0536993108689785, "learning_rate": 7.128817110259312e-07, "loss": 0.0094, "step": 32240 }, { "epoch": 1.8932722789714689, "grad_norm": 0.5875653624534607, "learning_rate": 7.051217976872248e-07, "loss": 0.0022, "step": 32250 }, { "epoch": 1.8938593401432429, "grad_norm": 0.9675849676132202, "learning_rate": 6.974040502664092e-07, "loss": 0.0065, "step": 32260 }, { "epoch": 1.894446401315017, "grad_norm": 0.06338284909725189, "learning_rate": 6.897284753650924e-07, "loss": 0.0193, "step": 32270 }, { "epoch": 1.8950334624867913, "grad_norm": 1.0650804042816162, "learning_rate": 6.820950795488223e-07, "loss": 0.0064, "step": 32280 }, { "epoch": 1.8956205236585653, "grad_norm": 0.032856181263923645, "learning_rate": 6.745038693470651e-07, "loss": 0.0124, "step": 32290 }, { "epoch": 1.8962075848303392, "grad_norm": 0.3160998821258545, "learning_rate": 6.669548512531986e-07, "loss": 0.0074, "step": 32300 }, { "epoch": 1.8967946460021134, "grad_norm": 0.4787406325340271, "learning_rate": 6.594480317245133e-07, "loss": 0.0076, "step": 32310 }, { "epoch": 1.8973817071738877, "grad_norm": 0.12755951285362244, "learning_rate": 6.519834171822003e-07, "loss": 0.0068, "step": 32320 }, { "epoch": 1.8979687683456616, "grad_norm": 0.119247205555439, "learning_rate": 6.445610140113467e-07, "loss": 0.004, "step": 32330 }, { "epoch": 1.8985558295174356, "grad_norm": 0.19741986691951752, "learning_rate": 6.371808285609515e-07, "loss": 0.0078, "step": 32340 }, { "epoch": 1.8991428906892098, "grad_norm": 2.4395265579223633, "learning_rate": 6.298428671438705e-07, "loss": 0.0024, "step": 32350 }, { "epoch": 1.899729951860984, "grad_norm": 0.23129431903362274, "learning_rate": 6.225471360368773e-07, "loss": 0.007, "step": 32360 }, { "epoch": 1.900317013032758, "grad_norm": 0.9480741620063782, "learning_rate": 6.152936414805854e-07, "loss": 0.0132, "step": 32370 }, { "epoch": 1.900904074204532, "grad_norm": 2.785883903503418, "learning_rate": 6.080823896795095e-07, "loss": 0.0114, "step": 32380 }, { "epoch": 1.9014911353763062, "grad_norm": 3.1118991374969482, "learning_rate": 6.009133868020156e-07, "loss": 0.006, "step": 32390 }, { "epoch": 1.9020781965480804, "grad_norm": 0.6544510126113892, "learning_rate": 5.93786638980337e-07, "loss": 0.0062, "step": 32400 }, { "epoch": 1.9026652577198544, "grad_norm": 0.31970447301864624, "learning_rate": 5.867021523105587e-07, "loss": 0.0047, "step": 32410 }, { "epoch": 1.9032523188916284, "grad_norm": 0.0359501838684082, "learning_rate": 5.796599328526219e-07, "loss": 0.0112, "step": 32420 }, { "epoch": 1.9038393800634026, "grad_norm": 1.2342957258224487, "learning_rate": 5.726599866303084e-07, "loss": 0.005, "step": 32430 }, { "epoch": 1.9044264412351768, "grad_norm": 1.5653058290481567, "learning_rate": 5.657023196312394e-07, "loss": 0.005, "step": 32440 }, { "epoch": 1.9050135024069508, "grad_norm": 0.22834111750125885, "learning_rate": 5.587869378068711e-07, "loss": 0.0074, "step": 32450 }, { "epoch": 1.9056005635787248, "grad_norm": 1.2102018594741821, "learning_rate": 5.519138470724938e-07, "loss": 0.0196, "step": 32460 }, { "epoch": 1.906187624750499, "grad_norm": 0.04645948112010956, "learning_rate": 5.450830533072271e-07, "loss": 0.0037, "step": 32470 }, { "epoch": 1.9067746859222732, "grad_norm": 0.4139319062232971, "learning_rate": 5.38294562353997e-07, "loss": 0.0063, "step": 32480 }, { "epoch": 1.9073617470940472, "grad_norm": 1.47207772731781, "learning_rate": 5.315483800195531e-07, "loss": 0.0055, "step": 32490 }, { "epoch": 1.9079488082658211, "grad_norm": 3.1884827613830566, "learning_rate": 5.248445120744516e-07, "loss": 0.0332, "step": 32500 }, { "epoch": 1.9085358694375953, "grad_norm": 0.4252760708332062, "learning_rate": 5.181829642530667e-07, "loss": 0.0095, "step": 32510 }, { "epoch": 1.9091229306093696, "grad_norm": 1.024160623550415, "learning_rate": 5.115637422535513e-07, "loss": 0.005, "step": 32520 }, { "epoch": 1.9097099917811438, "grad_norm": 1.1204752922058105, "learning_rate": 5.049868517378653e-07, "loss": 0.0119, "step": 32530 }, { "epoch": 1.9102970529529177, "grad_norm": 0.904366135597229, "learning_rate": 4.984522983317641e-07, "loss": 0.0161, "step": 32540 }, { "epoch": 1.9108841141246917, "grad_norm": 0.027965368703007698, "learning_rate": 4.919600876247709e-07, "loss": 0.0086, "step": 32550 }, { "epoch": 1.911471175296466, "grad_norm": 1.2174148559570312, "learning_rate": 4.855102251702159e-07, "loss": 0.0053, "step": 32560 }, { "epoch": 1.9120582364682401, "grad_norm": 0.274185448884964, "learning_rate": 4.791027164851803e-07, "loss": 0.0091, "step": 32570 }, { "epoch": 1.9126452976400141, "grad_norm": 0.632271409034729, "learning_rate": 4.727375670505352e-07, "loss": 0.0193, "step": 32580 }, { "epoch": 1.913232358811788, "grad_norm": 0.3477608263492584, "learning_rate": 4.6641478231090327e-07, "loss": 0.008, "step": 32590 }, { "epoch": 1.9138194199835623, "grad_norm": 0.49516561627388, "learning_rate": 4.6013436767468053e-07, "loss": 0.0051, "step": 32600 }, { "epoch": 1.9144064811553365, "grad_norm": 2.593196392059326, "learning_rate": 4.538963285140141e-07, "loss": 0.0116, "step": 32610 }, { "epoch": 1.9149935423271105, "grad_norm": 0.1109221875667572, "learning_rate": 4.477006701648079e-07, "loss": 0.0056, "step": 32620 }, { "epoch": 1.9155806034988845, "grad_norm": 0.20114627480506897, "learning_rate": 4.4154739792670594e-07, "loss": 0.0055, "step": 32630 }, { "epoch": 1.9161676646706587, "grad_norm": 2.1865549087524414, "learning_rate": 4.3543651706312026e-07, "loss": 0.0217, "step": 32640 }, { "epoch": 1.916754725842433, "grad_norm": 2.8717215061187744, "learning_rate": 4.29368032801164e-07, "loss": 0.0127, "step": 32650 }, { "epoch": 1.9173417870142069, "grad_norm": 0.0360056534409523, "learning_rate": 4.233419503317182e-07, "loss": 0.0053, "step": 32660 }, { "epoch": 1.9179288481859809, "grad_norm": 0.5166193246841431, "learning_rate": 4.1735827480937075e-07, "loss": 0.0122, "step": 32670 }, { "epoch": 1.918515909357755, "grad_norm": 0.6271770000457764, "learning_rate": 4.114170113524496e-07, "loss": 0.0115, "step": 32680 }, { "epoch": 1.9191029705295293, "grad_norm": 0.7508412003517151, "learning_rate": 4.055181650430062e-07, "loss": 0.009, "step": 32690 }, { "epoch": 1.9196900317013033, "grad_norm": 0.10411766171455383, "learning_rate": 3.996617409268044e-07, "loss": 0.0024, "step": 32700 }, { "epoch": 1.9202770928730772, "grad_norm": 1.5734776258468628, "learning_rate": 3.9384774401330924e-07, "loss": 0.0068, "step": 32710 }, { "epoch": 1.9208641540448514, "grad_norm": 2.1649329662323, "learning_rate": 3.880761792757148e-07, "loss": 0.0201, "step": 32720 }, { "epoch": 1.9214512152166257, "grad_norm": 0.21282173693180084, "learning_rate": 3.823470516508998e-07, "loss": 0.0051, "step": 32730 }, { "epoch": 1.9220382763883996, "grad_norm": 2.166048288345337, "learning_rate": 3.766603660394663e-07, "loss": 0.0073, "step": 32740 }, { "epoch": 1.9226253375601736, "grad_norm": 3.5161662101745605, "learning_rate": 3.7101612730569004e-07, "loss": 0.0165, "step": 32750 }, { "epoch": 1.9232123987319478, "grad_norm": 1.045634388923645, "learning_rate": 3.654143402775478e-07, "loss": 0.0428, "step": 32760 }, { "epoch": 1.923799459903722, "grad_norm": 0.6151355504989624, "learning_rate": 3.598550097467068e-07, "loss": 0.0121, "step": 32770 }, { "epoch": 1.924386521075496, "grad_norm": 0.0393298976123333, "learning_rate": 3.543381404685131e-07, "loss": 0.0062, "step": 32780 }, { "epoch": 1.9249735822472702, "grad_norm": 0.01743028312921524, "learning_rate": 3.4886373716199184e-07, "loss": 0.004, "step": 32790 }, { "epoch": 1.9255606434190442, "grad_norm": 0.00625281548127532, "learning_rate": 3.434318045098417e-07, "loss": 0.0077, "step": 32800 }, { "epoch": 1.9261477045908184, "grad_norm": 0.21253876388072968, "learning_rate": 3.380423471584515e-07, "loss": 0.0055, "step": 32810 }, { "epoch": 1.9267347657625926, "grad_norm": 1.02682363986969, "learning_rate": 3.3269536971784474e-07, "loss": 0.0161, "step": 32820 }, { "epoch": 1.9273218269343666, "grad_norm": 0.9867899417877197, "learning_rate": 3.2739087676173506e-07, "loss": 0.0134, "step": 32830 }, { "epoch": 1.9279088881061406, "grad_norm": 0.4015611708164215, "learning_rate": 3.2212887282748737e-07, "loss": 0.0141, "step": 32840 }, { "epoch": 1.9284959492779148, "grad_norm": 0.18810132145881653, "learning_rate": 3.169093624161179e-07, "loss": 0.0056, "step": 32850 }, { "epoch": 1.929083010449689, "grad_norm": 0.024899575859308243, "learning_rate": 3.1173234999229973e-07, "loss": 0.009, "step": 32860 }, { "epoch": 1.929670071621463, "grad_norm": 0.06882507354021072, "learning_rate": 3.0659783998435165e-07, "loss": 0.0177, "step": 32870 }, { "epoch": 1.930257132793237, "grad_norm": 0.6683951616287231, "learning_rate": 3.0150583678423825e-07, "loss": 0.0113, "step": 32880 }, { "epoch": 1.9308441939650112, "grad_norm": 0.11474218219518661, "learning_rate": 2.9645634474756435e-07, "loss": 0.0071, "step": 32890 }, { "epoch": 1.9314312551367854, "grad_norm": 0.10880117863416672, "learning_rate": 2.914493681935693e-07, "loss": 0.0058, "step": 32900 }, { "epoch": 1.9320183163085594, "grad_norm": 0.4817119240760803, "learning_rate": 2.8648491140513266e-07, "loss": 0.0046, "step": 32910 }, { "epoch": 1.9326053774803333, "grad_norm": 0.23212800920009613, "learning_rate": 2.815629786287577e-07, "loss": 0.0118, "step": 32920 }, { "epoch": 1.9331924386521075, "grad_norm": 0.7403205037117004, "learning_rate": 2.766835740745599e-07, "loss": 0.0082, "step": 32930 }, { "epoch": 1.9337794998238818, "grad_norm": 0.22013318538665771, "learning_rate": 2.718467019163118e-07, "loss": 0.0074, "step": 32940 }, { "epoch": 1.9343665609956557, "grad_norm": 1.4403728246688843, "learning_rate": 2.670523662913649e-07, "loss": 0.008, "step": 32950 }, { "epoch": 1.9349536221674297, "grad_norm": 0.07458072155714035, "learning_rate": 2.623005713007165e-07, "loss": 0.0063, "step": 32960 }, { "epoch": 1.935540683339204, "grad_norm": 1.189927339553833, "learning_rate": 2.5759132100895975e-07, "loss": 0.0115, "step": 32970 }, { "epoch": 1.9361277445109781, "grad_norm": 0.8814135193824768, "learning_rate": 2.529246194443002e-07, "loss": 0.0055, "step": 32980 }, { "epoch": 1.9367148056827521, "grad_norm": 0.20815473794937134, "learning_rate": 2.4830047059853924e-07, "loss": 0.0141, "step": 32990 }, { "epoch": 1.937301866854526, "grad_norm": 1.3255386352539062, "learning_rate": 2.4371887842709606e-07, "loss": 0.0047, "step": 33000 }, { "epoch": 1.937301866854526, "eval_loss": 0.5217077732086182, "eval_runtime": 269.5892, "eval_samples_per_second": 3.505, "eval_steps_per_second": 3.505, "step": 33000 }, { "epoch": 1.9378889280263003, "grad_norm": 0.8180147409439087, "learning_rate": 2.391798468489803e-07, "loss": 0.0045, "step": 33010 }, { "epoch": 1.9384759891980745, "grad_norm": 0.17384904623031616, "learning_rate": 2.3468337974678624e-07, "loss": 0.0173, "step": 33020 }, { "epoch": 1.9390630503698485, "grad_norm": 0.11741580069065094, "learning_rate": 2.3022948096672049e-07, "loss": 0.0051, "step": 33030 }, { "epoch": 1.9396501115416225, "grad_norm": 0.29323628544807434, "learning_rate": 2.258181543185467e-07, "loss": 0.0137, "step": 33040 }, { "epoch": 1.9402371727133967, "grad_norm": 0.5180500745773315, "learning_rate": 2.2144940357565203e-07, "loss": 0.0094, "step": 33050 }, { "epoch": 1.940824233885171, "grad_norm": 0.010030320845544338, "learning_rate": 2.1712323247496946e-07, "loss": 0.0012, "step": 33060 }, { "epoch": 1.941411295056945, "grad_norm": 0.08542950451374054, "learning_rate": 2.1283964471703332e-07, "loss": 0.0043, "step": 33070 }, { "epoch": 1.941998356228719, "grad_norm": 0.8036894798278809, "learning_rate": 2.0859864396593488e-07, "loss": 0.0155, "step": 33080 }, { "epoch": 1.942585417400493, "grad_norm": 2.722127914428711, "learning_rate": 2.044002338493556e-07, "loss": 0.0113, "step": 33090 }, { "epoch": 1.9431724785722673, "grad_norm": 0.0801001489162445, "learning_rate": 2.0024441795853388e-07, "loss": 0.0035, "step": 33100 }, { "epoch": 1.9437595397440415, "grad_norm": 1.31703519821167, "learning_rate": 1.961311998482762e-07, "loss": 0.0126, "step": 33110 }, { "epoch": 1.9443466009158155, "grad_norm": 0.5500128865242004, "learning_rate": 1.9206058303695706e-07, "loss": 0.0056, "step": 33120 }, { "epoch": 1.9449336620875894, "grad_norm": 0.6608492136001587, "learning_rate": 1.8803257100649675e-07, "loss": 0.021, "step": 33130 }, { "epoch": 1.9455207232593636, "grad_norm": 1.5913540124893188, "learning_rate": 1.840471672023947e-07, "loss": 0.0244, "step": 33140 }, { "epoch": 1.9461077844311379, "grad_norm": 0.25986772775650024, "learning_rate": 1.801043750336795e-07, "loss": 0.0091, "step": 33150 }, { "epoch": 1.9466948456029118, "grad_norm": 0.010232365690171719, "learning_rate": 1.7620419787294785e-07, "loss": 0.0074, "step": 33160 }, { "epoch": 1.9472819067746858, "grad_norm": 2.475867509841919, "learning_rate": 1.723466390563311e-07, "loss": 0.0249, "step": 33170 }, { "epoch": 1.94786896794646, "grad_norm": 2.9302568435668945, "learning_rate": 1.6853170188352306e-07, "loss": 0.0077, "step": 33180 }, { "epoch": 1.9484560291182342, "grad_norm": 0.9175553321838379, "learning_rate": 1.6475938961774683e-07, "loss": 0.0048, "step": 33190 }, { "epoch": 1.9490430902900082, "grad_norm": 1.592864990234375, "learning_rate": 1.610297054857657e-07, "loss": 0.0061, "step": 33200 }, { "epoch": 1.9496301514617822, "grad_norm": 0.8533622622489929, "learning_rate": 1.5734265267787763e-07, "loss": 0.0179, "step": 33210 }, { "epoch": 1.9502172126335564, "grad_norm": 2.1654791831970215, "learning_rate": 1.5369823434792652e-07, "loss": 0.0068, "step": 33220 }, { "epoch": 1.9508042738053306, "grad_norm": 0.7100071310997009, "learning_rate": 1.5009645361327983e-07, "loss": 0.0105, "step": 33230 }, { "epoch": 1.9513913349771046, "grad_norm": 0.04918665811419487, "learning_rate": 1.465373135548287e-07, "loss": 0.0081, "step": 33240 }, { "epoch": 1.9519783961488786, "grad_norm": 0.3652174770832062, "learning_rate": 1.4302081721699334e-07, "loss": 0.0035, "step": 33250 }, { "epoch": 1.9525654573206528, "grad_norm": 0.0118443313986063, "learning_rate": 1.3954696760772323e-07, "loss": 0.0059, "step": 33260 }, { "epoch": 1.953152518492427, "grad_norm": 0.4527686834335327, "learning_rate": 1.3611576769848034e-07, "loss": 0.0133, "step": 33270 }, { "epoch": 1.953739579664201, "grad_norm": 0.05000549182295799, "learning_rate": 1.3272722042425577e-07, "loss": 0.0113, "step": 33280 }, { "epoch": 1.954326640835975, "grad_norm": 0.4897474944591522, "learning_rate": 1.2938132868354768e-07, "loss": 0.0089, "step": 33290 }, { "epoch": 1.9549137020077492, "grad_norm": 0.08548981696367264, "learning_rate": 1.2607809533836669e-07, "loss": 0.0053, "step": 33300 }, { "epoch": 1.9555007631795234, "grad_norm": 0.01599929668009281, "learning_rate": 1.2281752321423589e-07, "loss": 0.0104, "step": 33310 }, { "epoch": 1.9560878243512974, "grad_norm": 0.350115031003952, "learning_rate": 1.1959961510018546e-07, "loss": 0.0083, "step": 33320 }, { "epoch": 1.9566748855230713, "grad_norm": 1.249320387840271, "learning_rate": 1.1642437374876913e-07, "loss": 0.0139, "step": 33330 }, { "epoch": 1.9572619466948455, "grad_norm": 0.07248983532190323, "learning_rate": 1.1329180187600874e-07, "loss": 0.0046, "step": 33340 }, { "epoch": 1.9578490078666198, "grad_norm": 0.5684255361557007, "learning_rate": 1.1020190216146086e-07, "loss": 0.0042, "step": 33350 }, { "epoch": 1.958436069038394, "grad_norm": 1.957879900932312, "learning_rate": 1.071546772481613e-07, "loss": 0.0054, "step": 33360 }, { "epoch": 1.959023130210168, "grad_norm": 0.04956020042300224, "learning_rate": 1.0415012974265281e-07, "loss": 0.0033, "step": 33370 }, { "epoch": 1.959610191381942, "grad_norm": 0.04298626631498337, "learning_rate": 1.0118826221497401e-07, "loss": 0.0118, "step": 33380 }, { "epoch": 1.9601972525537161, "grad_norm": 0.26160338521003723, "learning_rate": 9.82690771986372e-08, "loss": 0.0045, "step": 33390 }, { "epoch": 1.9607843137254903, "grad_norm": 0.9826566576957703, "learning_rate": 9.539257719067274e-08, "loss": 0.0057, "step": 33400 }, { "epoch": 1.9613713748972643, "grad_norm": 0.45765307545661926, "learning_rate": 9.25587646515791e-08, "loss": 0.0039, "step": 33410 }, { "epoch": 1.9619584360690383, "grad_norm": 0.016645647585392, "learning_rate": 8.976764200534504e-08, "loss": 0.0173, "step": 33420 }, { "epoch": 1.9625454972408125, "grad_norm": 0.41816627979278564, "learning_rate": 8.701921163944415e-08, "loss": 0.0132, "step": 33430 }, { "epoch": 1.9631325584125867, "grad_norm": 0.17216813564300537, "learning_rate": 8.431347590483474e-08, "loss": 0.0094, "step": 33440 }, { "epoch": 1.9637196195843607, "grad_norm": 1.3329750299453735, "learning_rate": 8.165043711595987e-08, "loss": 0.0156, "step": 33450 }, { "epoch": 1.9643066807561347, "grad_norm": 0.669681966304779, "learning_rate": 7.903009755071967e-08, "loss": 0.0057, "step": 33460 }, { "epoch": 1.9648937419279089, "grad_norm": 1.701711654663086, "learning_rate": 7.645245945051005e-08, "loss": 0.012, "step": 33470 }, { "epoch": 1.965480803099683, "grad_norm": 0.06459162384271622, "learning_rate": 7.391752502019512e-08, "loss": 0.0029, "step": 33480 }, { "epoch": 1.966067864271457, "grad_norm": 0.01970849744975567, "learning_rate": 7.142529642810703e-08, "loss": 0.0077, "step": 33490 }, { "epoch": 1.966654925443231, "grad_norm": 0.1700696051120758, "learning_rate": 6.897577580606273e-08, "loss": 0.017, "step": 33500 }, { "epoch": 1.9672419866150053, "grad_norm": 0.00029024932882748544, "learning_rate": 6.656896524931955e-08, "loss": 0.0058, "step": 33510 }, { "epoch": 1.9678290477867795, "grad_norm": 1.4260274171829224, "learning_rate": 6.420486681663062e-08, "loss": 0.0051, "step": 33520 }, { "epoch": 1.9684161089585535, "grad_norm": 0.2738833427429199, "learning_rate": 6.188348253019505e-08, "loss": 0.0147, "step": 33530 }, { "epoch": 1.9690031701303274, "grad_norm": 0.014176618307828903, "learning_rate": 5.960481437568555e-08, "loss": 0.0044, "step": 33540 }, { "epoch": 1.9695902313021016, "grad_norm": 1.1728260517120361, "learning_rate": 5.7368864302226324e-08, "loss": 0.0115, "step": 33550 }, { "epoch": 1.9701772924738759, "grad_norm": 1.0428615808486938, "learning_rate": 5.517563422241523e-08, "loss": 0.0034, "step": 33560 }, { "epoch": 1.9707643536456498, "grad_norm": 0.09201149642467499, "learning_rate": 5.3025126012301586e-08, "loss": 0.0054, "step": 33570 }, { "epoch": 1.9713514148174238, "grad_norm": 0.08367258310317993, "learning_rate": 5.091734151138061e-08, "loss": 0.0045, "step": 33580 }, { "epoch": 1.971938475989198, "grad_norm": 0.30681926012039185, "learning_rate": 4.8852282522615646e-08, "loss": 0.0025, "step": 33590 }, { "epoch": 1.9725255371609722, "grad_norm": 0.10407868772745132, "learning_rate": 4.6829950812421474e-08, "loss": 0.0081, "step": 33600 }, { "epoch": 1.9731125983327464, "grad_norm": 0.018392860889434814, "learning_rate": 4.48503481106588e-08, "loss": 0.0034, "step": 33610 }, { "epoch": 1.9736996595045204, "grad_norm": 0.4253391921520233, "learning_rate": 4.2913476110650887e-08, "loss": 0.0051, "step": 33620 }, { "epoch": 1.9742867206762944, "grad_norm": 0.7737009525299072, "learning_rate": 4.101933646915024e-08, "loss": 0.0093, "step": 33630 }, { "epoch": 1.9748737818480686, "grad_norm": 2.204073429107666, "learning_rate": 3.9167930806377485e-08, "loss": 0.0109, "step": 33640 }, { "epoch": 1.9754608430198428, "grad_norm": 0.09515777230262756, "learning_rate": 3.7359260705993604e-08, "loss": 0.001, "step": 33650 }, { "epoch": 1.9760479041916168, "grad_norm": 0.03296930715441704, "learning_rate": 3.559332771508883e-08, "loss": 0.004, "step": 33660 }, { "epoch": 1.9766349653633908, "grad_norm": 0.04820576310157776, "learning_rate": 3.387013334421596e-08, "loss": 0.0018, "step": 33670 }, { "epoch": 1.977222026535165, "grad_norm": 0.07194074988365173, "learning_rate": 3.2189679067368136e-08, "loss": 0.0103, "step": 33680 }, { "epoch": 1.9778090877069392, "grad_norm": 0.06059109419584274, "learning_rate": 3.055196632196222e-08, "loss": 0.0052, "step": 33690 }, { "epoch": 1.9783961488787132, "grad_norm": 0.038826677948236465, "learning_rate": 2.8956996508883172e-08, "loss": 0.0078, "step": 33700 }, { "epoch": 1.9789832100504872, "grad_norm": 0.8903669118881226, "learning_rate": 2.7404770992423002e-08, "loss": 0.0051, "step": 33710 }, { "epoch": 1.9795702712222614, "grad_norm": 0.08376505970954895, "learning_rate": 2.5895291100336282e-08, "loss": 0.0159, "step": 33720 }, { "epoch": 1.9801573323940356, "grad_norm": 1.5218383073806763, "learning_rate": 2.4428558123795743e-08, "loss": 0.011, "step": 33730 }, { "epoch": 1.9807443935658096, "grad_norm": 1.9102550745010376, "learning_rate": 2.3004573317431112e-08, "loss": 0.0141, "step": 33740 }, { "epoch": 1.9813314547375835, "grad_norm": 0.3464989960193634, "learning_rate": 2.1623337899279173e-08, "loss": 0.0045, "step": 33750 }, { "epoch": 1.9819185159093577, "grad_norm": 2.3063037395477295, "learning_rate": 2.0284853050828166e-08, "loss": 0.0075, "step": 33760 }, { "epoch": 1.982505577081132, "grad_norm": 0.6410544514656067, "learning_rate": 1.898911991699004e-08, "loss": 0.0098, "step": 33770 }, { "epoch": 1.983092638252906, "grad_norm": 0.12403683364391327, "learning_rate": 1.7736139606111534e-08, "loss": 0.009, "step": 33780 }, { "epoch": 1.98367969942468, "grad_norm": 0.5402272939682007, "learning_rate": 1.6525913189974208e-08, "loss": 0.0042, "step": 33790 }, { "epoch": 1.9842667605964541, "grad_norm": 0.9701083898544312, "learning_rate": 1.5358441703777758e-08, "loss": 0.0221, "step": 33800 }, { "epoch": 1.9848538217682283, "grad_norm": 0.9332861304283142, "learning_rate": 1.42337261461567e-08, "loss": 0.0076, "step": 33810 }, { "epoch": 1.9854408829400023, "grad_norm": 0.9768741726875305, "learning_rate": 1.3151767479169241e-08, "loss": 0.0121, "step": 33820 }, { "epoch": 1.9860279441117763, "grad_norm": 0.5883607864379883, "learning_rate": 1.2112566628302846e-08, "loss": 0.0124, "step": 33830 }, { "epoch": 1.9866150052835505, "grad_norm": 0.31778550148010254, "learning_rate": 1.1116124482479784e-08, "loss": 0.0088, "step": 33840 }, { "epoch": 1.9872020664553247, "grad_norm": 0.05427484214305878, "learning_rate": 1.0162441894023822e-08, "loss": 0.0152, "step": 33850 }, { "epoch": 1.9877891276270987, "grad_norm": 0.04062940552830696, "learning_rate": 9.251519678710186e-09, "loss": 0.0044, "step": 33860 }, { "epoch": 1.9883761887988727, "grad_norm": 0.5054002404212952, "learning_rate": 8.383358615715598e-09, "loss": 0.0101, "step": 33870 }, { "epoch": 1.9889632499706469, "grad_norm": 0.013953852467238903, "learning_rate": 7.557959447657137e-09, "loss": 0.0291, "step": 33880 }, { "epoch": 1.989550311142421, "grad_norm": 0.09007053822278976, "learning_rate": 6.775322880553381e-09, "loss": 0.0073, "step": 33890 }, { "epoch": 1.9901373723141953, "grad_norm": 2.0966174602508545, "learning_rate": 6.035449583868813e-09, "loss": 0.0091, "step": 33900 }, { "epoch": 1.9907244334859693, "grad_norm": 0.2234669178724289, "learning_rate": 5.338340190469415e-09, "loss": 0.0063, "step": 33910 }, { "epoch": 1.9913114946577433, "grad_norm": 1.178725242614746, "learning_rate": 4.6839952966559744e-09, "loss": 0.0087, "step": 33920 }, { "epoch": 1.9918985558295175, "grad_norm": 0.22748713195323944, "learning_rate": 4.0724154621418766e-09, "loss": 0.014, "step": 33930 }, { "epoch": 1.9924856170012917, "grad_norm": 0.9043611884117126, "learning_rate": 3.503601210053109e-09, "loss": 0.0053, "step": 33940 }, { "epoch": 1.9930726781730657, "grad_norm": 0.05220215767621994, "learning_rate": 2.9775530269560146e-09, "loss": 0.006, "step": 33950 }, { "epoch": 1.9936597393448396, "grad_norm": 0.29552602767944336, "learning_rate": 2.494271362807332e-09, "loss": 0.0067, "step": 33960 }, { "epoch": 1.9942468005166138, "grad_norm": 0.5266906023025513, "learning_rate": 2.0537566310097065e-09, "loss": 0.0043, "step": 33970 }, { "epoch": 1.994833861688388, "grad_norm": 0.13338631391525269, "learning_rate": 1.6560092083672817e-09, "loss": 0.0062, "step": 33980 }, { "epoch": 1.995420922860162, "grad_norm": 0.04648716747760773, "learning_rate": 1.3010294351023523e-09, "loss": 0.0031, "step": 33990 }, { "epoch": 1.996007984031936, "grad_norm": 0.14625805616378784, "learning_rate": 9.88817614860915e-10, "loss": 0.0054, "step": 34000 }, { "epoch": 1.9965950452037102, "grad_norm": 0.029483526945114136, "learning_rate": 7.193740147015682e-10, "loss": 0.0043, "step": 34010 }, { "epoch": 1.9971821063754844, "grad_norm": 5.174992561340332, "learning_rate": 4.926988651066111e-10, "loss": 0.0135, "step": 34020 }, { "epoch": 1.9977691675472584, "grad_norm": 0.21607564389705658, "learning_rate": 3.087923599598419e-10, "loss": 0.002, "step": 34030 }, { "epoch": 1.9983562287190324, "grad_norm": 0.7728718519210815, "learning_rate": 1.6765465658541424e-10, "loss": 0.0118, "step": 34040 }, { "epoch": 1.9989432898908066, "grad_norm": 0.17154139280319214, "learning_rate": 6.928587569232647e-11, "loss": 0.0162, "step": 34050 }, { "epoch": 1.9995303510625808, "grad_norm": 1.6425248384475708, "learning_rate": 1.3686101441034992e-11, "loss": 0.006, "step": 34060 }, { "epoch": 2.0, "step": 34068, "total_flos": 4.396799291960525e+17, "train_loss": 0.02699765918700505, "train_runtime": 22406.0806, "train_samples_per_second": 1.52, "train_steps_per_second": 1.52 } ], "logging_steps": 10, "max_steps": 34068, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.396799291960525e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }