{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 106935, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00028054425585636136, "grad_norm": 3.485063314437866, "learning_rate": 4.9995324262402396e-05, "loss": 1.3127, "step": 10 }, { "epoch": 0.0005610885117127227, "grad_norm": 2.7161452770233154, "learning_rate": 4.999064852480479e-05, "loss": 0.4695, "step": 20 }, { "epoch": 0.0008416327675690841, "grad_norm": 4.221207618713379, "learning_rate": 4.998597278720719e-05, "loss": 0.2888, "step": 30 }, { "epoch": 0.0011221770234254454, "grad_norm": 4.2419819831848145, "learning_rate": 4.9981297049609575e-05, "loss": 0.2791, "step": 40 }, { "epoch": 0.0014027212792818067, "grad_norm": 3.895155906677246, "learning_rate": 4.9976621312011975e-05, "loss": 0.2172, "step": 50 }, { "epoch": 0.0016832655351381681, "grad_norm": 1.9950965642929077, "learning_rate": 4.997194557441436e-05, "loss": 0.2279, "step": 60 }, { "epoch": 0.0019638097909945294, "grad_norm": 7.5022478103637695, "learning_rate": 4.996726983681676e-05, "loss": 0.1102, "step": 70 }, { "epoch": 0.002244354046850891, "grad_norm": 3.018139600753784, "learning_rate": 4.9962594099219154e-05, "loss": 0.1466, "step": 80 }, { "epoch": 0.002524898302707252, "grad_norm": 4.315135955810547, "learning_rate": 4.995791836162155e-05, "loss": 0.1139, "step": 90 }, { "epoch": 0.0028054425585636133, "grad_norm": 4.203267574310303, "learning_rate": 4.995324262402394e-05, "loss": 0.1019, "step": 100 }, { "epoch": 0.003085986814419975, "grad_norm": 1.9099010229110718, "learning_rate": 4.9948566886426334e-05, "loss": 0.0951, "step": 110 }, { "epoch": 0.0033665310702763363, "grad_norm": 1.9506219625473022, "learning_rate": 4.9943891148828734e-05, "loss": 0.1246, "step": 120 }, { "epoch": 0.0036470753261326973, "grad_norm": 7.600841045379639, "learning_rate": 4.993921541123112e-05, "loss": 0.0905, "step": 130 }, { "epoch": 0.003927619581989059, "grad_norm": 5.199939727783203, "learning_rate": 4.993453967363352e-05, "loss": 0.1099, "step": 140 }, { "epoch": 0.00420816383784542, "grad_norm": 4.315867900848389, "learning_rate": 4.992986393603591e-05, "loss": 0.0459, "step": 150 }, { "epoch": 0.004488708093701782, "grad_norm": 8.081439971923828, "learning_rate": 4.9925188198438306e-05, "loss": 0.0937, "step": 160 }, { "epoch": 0.004769252349558143, "grad_norm": 1.150046944618225, "learning_rate": 4.99205124608407e-05, "loss": 0.0959, "step": 170 }, { "epoch": 0.005049796605414504, "grad_norm": 0.47719326615333557, "learning_rate": 4.991583672324309e-05, "loss": 0.0547, "step": 180 }, { "epoch": 0.005330340861270866, "grad_norm": 0.3071495294570923, "learning_rate": 4.9911160985645486e-05, "loss": 0.0555, "step": 190 }, { "epoch": 0.005610885117127227, "grad_norm": 3.5041260719299316, "learning_rate": 4.990648524804788e-05, "loss": 0.0723, "step": 200 }, { "epoch": 0.005891429372983589, "grad_norm": 0.7606223225593567, "learning_rate": 4.990180951045028e-05, "loss": 0.1066, "step": 210 }, { "epoch": 0.00617197362883995, "grad_norm": 5.26910924911499, "learning_rate": 4.989713377285267e-05, "loss": 0.0694, "step": 220 }, { "epoch": 0.006452517884696311, "grad_norm": 7.516244888305664, "learning_rate": 4.9892458035255065e-05, "loss": 0.0733, "step": 230 }, { "epoch": 0.0067330621405526725, "grad_norm": 0.15132243931293488, "learning_rate": 4.988778229765746e-05, "loss": 0.0466, "step": 240 }, { "epoch": 0.0070136063964090336, "grad_norm": 0.9611673951148987, "learning_rate": 4.988310656005985e-05, "loss": 0.1831, "step": 250 }, { "epoch": 0.007294150652265395, "grad_norm": 4.168492794036865, "learning_rate": 4.9878430822462245e-05, "loss": 0.0438, "step": 260 }, { "epoch": 0.0075746949081217565, "grad_norm": 4.4840569496154785, "learning_rate": 4.987375508486464e-05, "loss": 0.056, "step": 270 }, { "epoch": 0.007855239163978118, "grad_norm": 5.03868293762207, "learning_rate": 4.986907934726703e-05, "loss": 0.0794, "step": 280 }, { "epoch": 0.008135783419834479, "grad_norm": 1.1371992826461792, "learning_rate": 4.986440360966943e-05, "loss": 0.0581, "step": 290 }, { "epoch": 0.00841632767569084, "grad_norm": 2.494056224822998, "learning_rate": 4.9859727872071824e-05, "loss": 0.087, "step": 300 }, { "epoch": 0.008696871931547202, "grad_norm": 2.265977382659912, "learning_rate": 4.985505213447422e-05, "loss": 0.0797, "step": 310 }, { "epoch": 0.008977416187403563, "grad_norm": 6.64182710647583, "learning_rate": 4.985037639687661e-05, "loss": 0.0419, "step": 320 }, { "epoch": 0.009257960443259924, "grad_norm": 1.3805954456329346, "learning_rate": 4.9845700659279004e-05, "loss": 0.0752, "step": 330 }, { "epoch": 0.009538504699116285, "grad_norm": 3.3813865184783936, "learning_rate": 4.98410249216814e-05, "loss": 0.0645, "step": 340 }, { "epoch": 0.009819048954972646, "grad_norm": 0.7843742966651917, "learning_rate": 4.983634918408379e-05, "loss": 0.0092, "step": 350 }, { "epoch": 0.010099593210829007, "grad_norm": 3.9801464080810547, "learning_rate": 4.983167344648619e-05, "loss": 0.0912, "step": 360 }, { "epoch": 0.01038013746668537, "grad_norm": 0.11131809651851654, "learning_rate": 4.9826997708888576e-05, "loss": 0.0383, "step": 370 }, { "epoch": 0.010660681722541731, "grad_norm": 3.289191246032715, "learning_rate": 4.9822321971290976e-05, "loss": 0.0708, "step": 380 }, { "epoch": 0.010941225978398092, "grad_norm": 1.1124424934387207, "learning_rate": 4.981764623369336e-05, "loss": 0.1425, "step": 390 }, { "epoch": 0.011221770234254453, "grad_norm": 1.3309569358825684, "learning_rate": 4.981297049609576e-05, "loss": 0.0622, "step": 400 }, { "epoch": 0.011502314490110814, "grad_norm": 0.19077451527118683, "learning_rate": 4.9808294758498156e-05, "loss": 0.068, "step": 410 }, { "epoch": 0.011782858745967177, "grad_norm": 0.21767109632492065, "learning_rate": 4.980361902090055e-05, "loss": 0.0465, "step": 420 }, { "epoch": 0.012063403001823538, "grad_norm": 0.9607604146003723, "learning_rate": 4.979894328330295e-05, "loss": 0.0715, "step": 430 }, { "epoch": 0.0123439472576799, "grad_norm": 0.42709967494010925, "learning_rate": 4.9794267545705335e-05, "loss": 0.07, "step": 440 }, { "epoch": 0.01262449151353626, "grad_norm": 0.2684326171875, "learning_rate": 4.9789591808107735e-05, "loss": 0.0632, "step": 450 }, { "epoch": 0.012905035769392621, "grad_norm": 1.2392427921295166, "learning_rate": 4.978491607051012e-05, "loss": 0.1238, "step": 460 }, { "epoch": 0.013185580025248982, "grad_norm": 0.6164854168891907, "learning_rate": 4.978024033291252e-05, "loss": 0.0564, "step": 470 }, { "epoch": 0.013466124281105345, "grad_norm": 2.493450880050659, "learning_rate": 4.977556459531491e-05, "loss": 0.0674, "step": 480 }, { "epoch": 0.013746668536961706, "grad_norm": 13.597931861877441, "learning_rate": 4.977088885771731e-05, "loss": 0.0829, "step": 490 }, { "epoch": 0.014027212792818067, "grad_norm": 0.26916855573654175, "learning_rate": 4.97662131201197e-05, "loss": 0.0887, "step": 500 }, { "epoch": 0.014307757048674428, "grad_norm": 0.5365808606147766, "learning_rate": 4.9761537382522094e-05, "loss": 0.014, "step": 510 }, { "epoch": 0.01458830130453079, "grad_norm": 16.74872589111328, "learning_rate": 4.9756861644924494e-05, "loss": 0.0514, "step": 520 }, { "epoch": 0.014868845560387152, "grad_norm": 3.7903811931610107, "learning_rate": 4.975218590732688e-05, "loss": 0.0455, "step": 530 }, { "epoch": 0.015149389816243513, "grad_norm": 4.990943908691406, "learning_rate": 4.974751016972928e-05, "loss": 0.0509, "step": 540 }, { "epoch": 0.015429934072099874, "grad_norm": 7.106185436248779, "learning_rate": 4.9742834432131667e-05, "loss": 0.0533, "step": 550 }, { "epoch": 0.015710478327956235, "grad_norm": 2.7900915145874023, "learning_rate": 4.9738158694534067e-05, "loss": 0.0759, "step": 560 }, { "epoch": 0.015991022583812596, "grad_norm": 0.17137788236141205, "learning_rate": 4.973348295693646e-05, "loss": 0.0499, "step": 570 }, { "epoch": 0.016271566839668957, "grad_norm": 0.047043249011039734, "learning_rate": 4.972880721933885e-05, "loss": 0.0204, "step": 580 }, { "epoch": 0.016552111095525318, "grad_norm": 1.7539491653442383, "learning_rate": 4.9724131481741246e-05, "loss": 0.0882, "step": 590 }, { "epoch": 0.01683265535138168, "grad_norm": 2.82010555267334, "learning_rate": 4.971945574414364e-05, "loss": 0.031, "step": 600 }, { "epoch": 0.01711319960723804, "grad_norm": 0.23794779181480408, "learning_rate": 4.971478000654603e-05, "loss": 0.0703, "step": 610 }, { "epoch": 0.017393743863094405, "grad_norm": 1.8071885108947754, "learning_rate": 4.9710104268948425e-05, "loss": 0.0513, "step": 620 }, { "epoch": 0.017674288118950766, "grad_norm": 0.895754337310791, "learning_rate": 4.9705428531350825e-05, "loss": 0.0288, "step": 630 }, { "epoch": 0.017954832374807127, "grad_norm": 1.231364369392395, "learning_rate": 4.970075279375322e-05, "loss": 0.0927, "step": 640 }, { "epoch": 0.018235376630663488, "grad_norm": 14.500687599182129, "learning_rate": 4.969607705615561e-05, "loss": 0.0553, "step": 650 }, { "epoch": 0.01851592088651985, "grad_norm": 1.886853814125061, "learning_rate": 4.9691401318558005e-05, "loss": 0.0627, "step": 660 }, { "epoch": 0.01879646514237621, "grad_norm": 3.700766086578369, "learning_rate": 4.96867255809604e-05, "loss": 0.0544, "step": 670 }, { "epoch": 0.01907700939823257, "grad_norm": 1.0360431671142578, "learning_rate": 4.968204984336279e-05, "loss": 0.0451, "step": 680 }, { "epoch": 0.019357553654088932, "grad_norm": 0.03618697449564934, "learning_rate": 4.9677374105765184e-05, "loss": 0.0148, "step": 690 }, { "epoch": 0.019638097909945293, "grad_norm": 1.8839340209960938, "learning_rate": 4.967269836816758e-05, "loss": 0.036, "step": 700 }, { "epoch": 0.019918642165801654, "grad_norm": 0.09569858759641647, "learning_rate": 4.966802263056998e-05, "loss": 0.0458, "step": 710 }, { "epoch": 0.020199186421658015, "grad_norm": 1.2312531471252441, "learning_rate": 4.966334689297237e-05, "loss": 0.0803, "step": 720 }, { "epoch": 0.02047973067751438, "grad_norm": 0.6509951949119568, "learning_rate": 4.9658671155374764e-05, "loss": 0.0674, "step": 730 }, { "epoch": 0.02076027493337074, "grad_norm": 0.4668249785900116, "learning_rate": 4.965399541777716e-05, "loss": 0.0558, "step": 740 }, { "epoch": 0.0210408191892271, "grad_norm": 0.04456448182463646, "learning_rate": 4.964931968017955e-05, "loss": 0.0138, "step": 750 }, { "epoch": 0.021321363445083463, "grad_norm": 11.212784767150879, "learning_rate": 4.964464394258194e-05, "loss": 0.044, "step": 760 }, { "epoch": 0.021601907700939824, "grad_norm": 2.8041799068450928, "learning_rate": 4.9639968204984336e-05, "loss": 0.0539, "step": 770 }, { "epoch": 0.021882451956796185, "grad_norm": 0.27477526664733887, "learning_rate": 4.9635292467386736e-05, "loss": 0.059, "step": 780 }, { "epoch": 0.022162996212652546, "grad_norm": 0.8009962439537048, "learning_rate": 4.963061672978912e-05, "loss": 0.0566, "step": 790 }, { "epoch": 0.022443540468508907, "grad_norm": 1.899442434310913, "learning_rate": 4.962594099219152e-05, "loss": 0.0213, "step": 800 }, { "epoch": 0.022724084724365268, "grad_norm": 0.5617396235466003, "learning_rate": 4.9621265254593916e-05, "loss": 0.0676, "step": 810 }, { "epoch": 0.02300462898022163, "grad_norm": 5.031060695648193, "learning_rate": 4.961658951699631e-05, "loss": 0.0461, "step": 820 }, { "epoch": 0.02328517323607799, "grad_norm": 4.896806240081787, "learning_rate": 4.96119137793987e-05, "loss": 0.0667, "step": 830 }, { "epoch": 0.023565717491934354, "grad_norm": 0.16749387979507446, "learning_rate": 4.9607238041801095e-05, "loss": 0.0633, "step": 840 }, { "epoch": 0.023846261747790715, "grad_norm": 4.158395767211914, "learning_rate": 4.9602562304203495e-05, "loss": 0.0659, "step": 850 }, { "epoch": 0.024126806003647076, "grad_norm": 2.365711212158203, "learning_rate": 4.959788656660588e-05, "loss": 0.0453, "step": 860 }, { "epoch": 0.024407350259503437, "grad_norm": 2.832343816757202, "learning_rate": 4.959321082900828e-05, "loss": 0.0678, "step": 870 }, { "epoch": 0.0246878945153598, "grad_norm": 1.5532031059265137, "learning_rate": 4.958853509141067e-05, "loss": 0.0635, "step": 880 }, { "epoch": 0.02496843877121616, "grad_norm": 0.39463192224502563, "learning_rate": 4.958385935381307e-05, "loss": 0.0702, "step": 890 }, { "epoch": 0.02524898302707252, "grad_norm": 1.0820063352584839, "learning_rate": 4.957918361621546e-05, "loss": 0.0838, "step": 900 }, { "epoch": 0.02552952728292888, "grad_norm": 2.4260494709014893, "learning_rate": 4.9574507878617854e-05, "loss": 0.0325, "step": 910 }, { "epoch": 0.025810071538785243, "grad_norm": 1.2428312301635742, "learning_rate": 4.956983214102025e-05, "loss": 0.0492, "step": 920 }, { "epoch": 0.026090615794641604, "grad_norm": 0.04418851062655449, "learning_rate": 4.956515640342264e-05, "loss": 0.0294, "step": 930 }, { "epoch": 0.026371160050497965, "grad_norm": 7.0953240394592285, "learning_rate": 4.956048066582504e-05, "loss": 0.0728, "step": 940 }, { "epoch": 0.02665170430635433, "grad_norm": 0.4869473874568939, "learning_rate": 4.955580492822743e-05, "loss": 0.0594, "step": 950 }, { "epoch": 0.02693224856221069, "grad_norm": 0.22786812484264374, "learning_rate": 4.955112919062983e-05, "loss": 0.0641, "step": 960 }, { "epoch": 0.02721279281806705, "grad_norm": 1.6069670915603638, "learning_rate": 4.954645345303221e-05, "loss": 0.0361, "step": 970 }, { "epoch": 0.027493337073923412, "grad_norm": 0.5924623012542725, "learning_rate": 4.954177771543461e-05, "loss": 0.051, "step": 980 }, { "epoch": 0.027773881329779773, "grad_norm": 1.1237338781356812, "learning_rate": 4.9537101977837006e-05, "loss": 0.0591, "step": 990 }, { "epoch": 0.028054425585636134, "grad_norm": 6.734570026397705, "learning_rate": 4.95324262402394e-05, "loss": 0.0442, "step": 1000 }, { "epoch": 0.028334969841492495, "grad_norm": 1.3949992656707764, "learning_rate": 4.952775050264179e-05, "loss": 0.0771, "step": 1010 }, { "epoch": 0.028615514097348856, "grad_norm": 0.11702559143304825, "learning_rate": 4.9523074765044186e-05, "loss": 0.0461, "step": 1020 }, { "epoch": 0.028896058353205217, "grad_norm": 0.3282710313796997, "learning_rate": 4.9518399027446585e-05, "loss": 0.0452, "step": 1030 }, { "epoch": 0.02917660260906158, "grad_norm": 0.08823459595441818, "learning_rate": 4.951372328984897e-05, "loss": 0.0394, "step": 1040 }, { "epoch": 0.02945714686491794, "grad_norm": 0.5333325266838074, "learning_rate": 4.950904755225137e-05, "loss": 0.0743, "step": 1050 }, { "epoch": 0.029737691120774304, "grad_norm": 2.598162889480591, "learning_rate": 4.9504371814653765e-05, "loss": 0.0366, "step": 1060 }, { "epoch": 0.030018235376630665, "grad_norm": 7.183851718902588, "learning_rate": 4.949969607705616e-05, "loss": 0.0353, "step": 1070 }, { "epoch": 0.030298779632487026, "grad_norm": 1.784040093421936, "learning_rate": 4.949502033945855e-05, "loss": 0.0305, "step": 1080 }, { "epoch": 0.030579323888343387, "grad_norm": 0.6993817090988159, "learning_rate": 4.9490344601860944e-05, "loss": 0.0298, "step": 1090 }, { "epoch": 0.030859868144199748, "grad_norm": 0.027545064687728882, "learning_rate": 4.948566886426334e-05, "loss": 0.0435, "step": 1100 }, { "epoch": 0.03114041240005611, "grad_norm": 3.2517247200012207, "learning_rate": 4.948099312666573e-05, "loss": 0.0913, "step": 1110 }, { "epoch": 0.03142095665591247, "grad_norm": 0.8970758318901062, "learning_rate": 4.947631738906813e-05, "loss": 0.0482, "step": 1120 }, { "epoch": 0.031701500911768835, "grad_norm": 0.12107737362384796, "learning_rate": 4.9471641651470524e-05, "loss": 0.0302, "step": 1130 }, { "epoch": 0.03198204516762519, "grad_norm": 0.042185623198747635, "learning_rate": 4.946696591387292e-05, "loss": 0.0414, "step": 1140 }, { "epoch": 0.03226258942348156, "grad_norm": 2.650482654571533, "learning_rate": 4.946229017627531e-05, "loss": 0.0512, "step": 1150 }, { "epoch": 0.032543133679337914, "grad_norm": 0.996688961982727, "learning_rate": 4.94576144386777e-05, "loss": 0.0518, "step": 1160 }, { "epoch": 0.03282367793519428, "grad_norm": 0.17445626854896545, "learning_rate": 4.9452938701080096e-05, "loss": 0.0311, "step": 1170 }, { "epoch": 0.033104222191050636, "grad_norm": 0.12567052245140076, "learning_rate": 4.944826296348249e-05, "loss": 0.0376, "step": 1180 }, { "epoch": 0.033384766446907, "grad_norm": 1.94346022605896, "learning_rate": 4.944358722588488e-05, "loss": 0.0159, "step": 1190 }, { "epoch": 0.03366531070276336, "grad_norm": 0.08602581918239594, "learning_rate": 4.943891148828728e-05, "loss": 0.0351, "step": 1200 }, { "epoch": 0.03394585495861972, "grad_norm": 0.15599554777145386, "learning_rate": 4.9434235750689676e-05, "loss": 0.0269, "step": 1210 }, { "epoch": 0.03422639921447608, "grad_norm": 0.11351972073316574, "learning_rate": 4.942956001309207e-05, "loss": 0.0686, "step": 1220 }, { "epoch": 0.034506943470332445, "grad_norm": 0.15318536758422852, "learning_rate": 4.942488427549446e-05, "loss": 0.0226, "step": 1230 }, { "epoch": 0.03478748772618881, "grad_norm": 0.8131875395774841, "learning_rate": 4.9420208537896855e-05, "loss": 0.0794, "step": 1240 }, { "epoch": 0.03506803198204517, "grad_norm": 8.834260940551758, "learning_rate": 4.941553280029925e-05, "loss": 0.0678, "step": 1250 }, { "epoch": 0.03534857623790153, "grad_norm": 2.560833692550659, "learning_rate": 4.941085706270164e-05, "loss": 0.0458, "step": 1260 }, { "epoch": 0.03562912049375789, "grad_norm": 1.7907058000564575, "learning_rate": 4.940618132510404e-05, "loss": 0.0811, "step": 1270 }, { "epoch": 0.035909664749614253, "grad_norm": 8.453978538513184, "learning_rate": 4.940150558750643e-05, "loss": 0.035, "step": 1280 }, { "epoch": 0.03619020900547061, "grad_norm": 0.1305462121963501, "learning_rate": 4.939682984990883e-05, "loss": 0.0067, "step": 1290 }, { "epoch": 0.036470753261326976, "grad_norm": 5.13889741897583, "learning_rate": 4.9392154112311214e-05, "loss": 0.0618, "step": 1300 }, { "epoch": 0.03675129751718333, "grad_norm": 9.282419204711914, "learning_rate": 4.9387478374713614e-05, "loss": 0.0572, "step": 1310 }, { "epoch": 0.0370318417730397, "grad_norm": 2.256390333175659, "learning_rate": 4.938280263711601e-05, "loss": 0.0731, "step": 1320 }, { "epoch": 0.037312386028896055, "grad_norm": 0.15080209076404572, "learning_rate": 4.93781268995184e-05, "loss": 0.0342, "step": 1330 }, { "epoch": 0.03759293028475242, "grad_norm": 2.75889253616333, "learning_rate": 4.93734511619208e-05, "loss": 0.0432, "step": 1340 }, { "epoch": 0.037873474540608784, "grad_norm": 1.362248182296753, "learning_rate": 4.936877542432319e-05, "loss": 0.0954, "step": 1350 }, { "epoch": 0.03815401879646514, "grad_norm": 0.965312659740448, "learning_rate": 4.936409968672559e-05, "loss": 0.0175, "step": 1360 }, { "epoch": 0.038434563052321506, "grad_norm": 1.5700958967208862, "learning_rate": 4.935942394912797e-05, "loss": 0.0292, "step": 1370 }, { "epoch": 0.038715107308177864, "grad_norm": 0.3090997636318207, "learning_rate": 4.935474821153037e-05, "loss": 0.0653, "step": 1380 }, { "epoch": 0.03899565156403423, "grad_norm": 0.8210850954055786, "learning_rate": 4.9350072473932766e-05, "loss": 0.0603, "step": 1390 }, { "epoch": 0.039276195819890586, "grad_norm": 0.3750794529914856, "learning_rate": 4.934539673633516e-05, "loss": 0.0322, "step": 1400 }, { "epoch": 0.03955674007574695, "grad_norm": 1.7314468622207642, "learning_rate": 4.934072099873755e-05, "loss": 0.0666, "step": 1410 }, { "epoch": 0.03983728433160331, "grad_norm": 0.34879061579704285, "learning_rate": 4.9336045261139946e-05, "loss": 0.0784, "step": 1420 }, { "epoch": 0.04011782858745967, "grad_norm": 0.31601276993751526, "learning_rate": 4.9331369523542346e-05, "loss": 0.0374, "step": 1430 }, { "epoch": 0.04039837284331603, "grad_norm": 0.5436122417449951, "learning_rate": 4.932669378594473e-05, "loss": 0.0702, "step": 1440 }, { "epoch": 0.040678917099172394, "grad_norm": 0.6000412106513977, "learning_rate": 4.932201804834713e-05, "loss": 0.0315, "step": 1450 }, { "epoch": 0.04095946135502876, "grad_norm": 0.1466413140296936, "learning_rate": 4.9317342310749525e-05, "loss": 0.0251, "step": 1460 }, { "epoch": 0.041240005610885117, "grad_norm": 5.394089221954346, "learning_rate": 4.931266657315192e-05, "loss": 0.0217, "step": 1470 }, { "epoch": 0.04152054986674148, "grad_norm": 0.02628123201429844, "learning_rate": 4.930799083555431e-05, "loss": 0.0255, "step": 1480 }, { "epoch": 0.04180109412259784, "grad_norm": 0.03501041978597641, "learning_rate": 4.9303315097956704e-05, "loss": 0.0421, "step": 1490 }, { "epoch": 0.0420816383784542, "grad_norm": 0.3241165280342102, "learning_rate": 4.92986393603591e-05, "loss": 0.028, "step": 1500 }, { "epoch": 0.04236218263431056, "grad_norm": 1.3652691841125488, "learning_rate": 4.929396362276149e-05, "loss": 0.0684, "step": 1510 }, { "epoch": 0.042642726890166925, "grad_norm": 0.5351733565330505, "learning_rate": 4.9289287885163884e-05, "loss": 0.0304, "step": 1520 }, { "epoch": 0.04292327114602328, "grad_norm": 2.3374054431915283, "learning_rate": 4.9284612147566284e-05, "loss": 0.0706, "step": 1530 }, { "epoch": 0.04320381540187965, "grad_norm": 0.2244783192873001, "learning_rate": 4.927993640996868e-05, "loss": 0.0188, "step": 1540 }, { "epoch": 0.043484359657736005, "grad_norm": 0.5515932440757751, "learning_rate": 4.927526067237107e-05, "loss": 0.0443, "step": 1550 }, { "epoch": 0.04376490391359237, "grad_norm": 1.2662254571914673, "learning_rate": 4.927058493477346e-05, "loss": 0.0541, "step": 1560 }, { "epoch": 0.044045448169448734, "grad_norm": 1.2435505390167236, "learning_rate": 4.9265909197175857e-05, "loss": 0.0212, "step": 1570 }, { "epoch": 0.04432599242530509, "grad_norm": 0.23348264396190643, "learning_rate": 4.926123345957825e-05, "loss": 0.0424, "step": 1580 }, { "epoch": 0.044606536681161456, "grad_norm": 0.08442018181085587, "learning_rate": 4.925655772198064e-05, "loss": 0.0424, "step": 1590 }, { "epoch": 0.04488708093701781, "grad_norm": 2.0251195430755615, "learning_rate": 4.925188198438304e-05, "loss": 0.0502, "step": 1600 }, { "epoch": 0.04516762519287418, "grad_norm": 3.8944458961486816, "learning_rate": 4.924720624678543e-05, "loss": 0.024, "step": 1610 }, { "epoch": 0.045448169448730535, "grad_norm": 3.3959803581237793, "learning_rate": 4.924253050918783e-05, "loss": 0.0486, "step": 1620 }, { "epoch": 0.0457287137045869, "grad_norm": 0.2768227756023407, "learning_rate": 4.923785477159022e-05, "loss": 0.0368, "step": 1630 }, { "epoch": 0.04600925796044326, "grad_norm": 1.4074018001556396, "learning_rate": 4.9233179033992615e-05, "loss": 0.0629, "step": 1640 }, { "epoch": 0.04628980221629962, "grad_norm": 0.7467813491821289, "learning_rate": 4.922850329639501e-05, "loss": 0.0381, "step": 1650 }, { "epoch": 0.04657034647215598, "grad_norm": 0.48309850692749023, "learning_rate": 4.92238275587974e-05, "loss": 0.0311, "step": 1660 }, { "epoch": 0.046850890728012344, "grad_norm": 0.8143037557601929, "learning_rate": 4.92191518211998e-05, "loss": 0.0728, "step": 1670 }, { "epoch": 0.04713143498386871, "grad_norm": 0.07704737037420273, "learning_rate": 4.921447608360219e-05, "loss": 0.0415, "step": 1680 }, { "epoch": 0.047411979239725066, "grad_norm": 0.7125360369682312, "learning_rate": 4.920980034600459e-05, "loss": 0.0549, "step": 1690 }, { "epoch": 0.04769252349558143, "grad_norm": 10.961862564086914, "learning_rate": 4.9205124608406974e-05, "loss": 0.0687, "step": 1700 }, { "epoch": 0.04797306775143779, "grad_norm": 0.7218489646911621, "learning_rate": 4.9200448870809374e-05, "loss": 0.0566, "step": 1710 }, { "epoch": 0.04825361200729415, "grad_norm": 4.515640735626221, "learning_rate": 4.919577313321177e-05, "loss": 0.0638, "step": 1720 }, { "epoch": 0.04853415626315051, "grad_norm": 2.174733877182007, "learning_rate": 4.919109739561416e-05, "loss": 0.042, "step": 1730 }, { "epoch": 0.048814700519006875, "grad_norm": 1.7980859279632568, "learning_rate": 4.918642165801656e-05, "loss": 0.0301, "step": 1740 }, { "epoch": 0.04909524477486323, "grad_norm": 0.07409381121397018, "learning_rate": 4.918174592041895e-05, "loss": 0.0446, "step": 1750 }, { "epoch": 0.0493757890307196, "grad_norm": 0.1970428228378296, "learning_rate": 4.917707018282135e-05, "loss": 0.0245, "step": 1760 }, { "epoch": 0.049656333286575954, "grad_norm": 0.542994499206543, "learning_rate": 4.917239444522373e-05, "loss": 0.0242, "step": 1770 }, { "epoch": 0.04993687754243232, "grad_norm": 6.81209135055542, "learning_rate": 4.916771870762613e-05, "loss": 0.0267, "step": 1780 }, { "epoch": 0.05021742179828868, "grad_norm": 3.0219650268554688, "learning_rate": 4.916304297002852e-05, "loss": 0.0439, "step": 1790 }, { "epoch": 0.05049796605414504, "grad_norm": 0.7817076444625854, "learning_rate": 4.915836723243092e-05, "loss": 0.0557, "step": 1800 }, { "epoch": 0.050778510310001405, "grad_norm": 0.39060816168785095, "learning_rate": 4.915369149483331e-05, "loss": 0.0734, "step": 1810 }, { "epoch": 0.05105905456585776, "grad_norm": 0.043388355523347855, "learning_rate": 4.9149015757235706e-05, "loss": 0.0421, "step": 1820 }, { "epoch": 0.05133959882171413, "grad_norm": 0.2502467930316925, "learning_rate": 4.91443400196381e-05, "loss": 0.031, "step": 1830 }, { "epoch": 0.051620143077570485, "grad_norm": 2.3040125370025635, "learning_rate": 4.913966428204049e-05, "loss": 0.0417, "step": 1840 }, { "epoch": 0.05190068733342685, "grad_norm": 5.510904312133789, "learning_rate": 4.913498854444289e-05, "loss": 0.0582, "step": 1850 }, { "epoch": 0.05218123158928321, "grad_norm": 0.4526715576648712, "learning_rate": 4.913031280684528e-05, "loss": 0.0666, "step": 1860 }, { "epoch": 0.05246177584513957, "grad_norm": 0.5540868639945984, "learning_rate": 4.912563706924768e-05, "loss": 0.0157, "step": 1870 }, { "epoch": 0.05274232010099593, "grad_norm": 1.9101145267486572, "learning_rate": 4.912096133165007e-05, "loss": 0.021, "step": 1880 }, { "epoch": 0.053022864356852294, "grad_norm": 2.581639289855957, "learning_rate": 4.9116285594052465e-05, "loss": 0.0747, "step": 1890 }, { "epoch": 0.05330340861270866, "grad_norm": 0.9686213731765747, "learning_rate": 4.911160985645486e-05, "loss": 0.0752, "step": 1900 }, { "epoch": 0.053583952868565016, "grad_norm": 0.6098451614379883, "learning_rate": 4.910693411885725e-05, "loss": 0.0641, "step": 1910 }, { "epoch": 0.05386449712442138, "grad_norm": 2.0968306064605713, "learning_rate": 4.9102258381259644e-05, "loss": 0.0474, "step": 1920 }, { "epoch": 0.05414504138027774, "grad_norm": 0.10716312378644943, "learning_rate": 4.909758264366204e-05, "loss": 0.0315, "step": 1930 }, { "epoch": 0.0544255856361341, "grad_norm": 1.62767493724823, "learning_rate": 4.909290690606444e-05, "loss": 0.0784, "step": 1940 }, { "epoch": 0.05470612989199046, "grad_norm": 0.10964605957269669, "learning_rate": 4.908823116846683e-05, "loss": 0.0781, "step": 1950 }, { "epoch": 0.054986674147846824, "grad_norm": 0.5861601829528809, "learning_rate": 4.9083555430869223e-05, "loss": 0.0545, "step": 1960 }, { "epoch": 0.05526721840370318, "grad_norm": 0.8333131670951843, "learning_rate": 4.9078879693271617e-05, "loss": 0.0384, "step": 1970 }, { "epoch": 0.055547762659559546, "grad_norm": 0.6890222430229187, "learning_rate": 4.907420395567401e-05, "loss": 0.0492, "step": 1980 }, { "epoch": 0.055828306915415904, "grad_norm": 1.2227866649627686, "learning_rate": 4.90695282180764e-05, "loss": 0.025, "step": 1990 }, { "epoch": 0.05610885117127227, "grad_norm": 5.78419828414917, "learning_rate": 4.9064852480478796e-05, "loss": 0.0485, "step": 2000 }, { "epoch": 0.05638939542712863, "grad_norm": 0.05862471088767052, "learning_rate": 4.906017674288119e-05, "loss": 0.0055, "step": 2010 }, { "epoch": 0.05666993968298499, "grad_norm": 0.7972022294998169, "learning_rate": 4.905550100528359e-05, "loss": 0.0481, "step": 2020 }, { "epoch": 0.056950483938841355, "grad_norm": 0.17803536355495453, "learning_rate": 4.905082526768598e-05, "loss": 0.03, "step": 2030 }, { "epoch": 0.05723102819469771, "grad_norm": 1.46344792842865, "learning_rate": 4.9046149530088375e-05, "loss": 0.1159, "step": 2040 }, { "epoch": 0.05751157245055408, "grad_norm": 0.08029922097921371, "learning_rate": 4.904147379249077e-05, "loss": 0.029, "step": 2050 }, { "epoch": 0.057792116706410435, "grad_norm": 0.09604211896657944, "learning_rate": 4.903679805489316e-05, "loss": 0.0081, "step": 2060 }, { "epoch": 0.0580726609622668, "grad_norm": 0.13537514209747314, "learning_rate": 4.9032122317295555e-05, "loss": 0.0662, "step": 2070 }, { "epoch": 0.05835320521812316, "grad_norm": 0.5971249938011169, "learning_rate": 4.902744657969795e-05, "loss": 0.0533, "step": 2080 }, { "epoch": 0.05863374947397952, "grad_norm": 0.19792938232421875, "learning_rate": 4.902277084210035e-05, "loss": 0.0407, "step": 2090 }, { "epoch": 0.05891429372983588, "grad_norm": 0.07569585740566254, "learning_rate": 4.9018095104502734e-05, "loss": 0.0266, "step": 2100 }, { "epoch": 0.05919483798569224, "grad_norm": 9.030657768249512, "learning_rate": 4.9013419366905134e-05, "loss": 0.0989, "step": 2110 }, { "epoch": 0.05947538224154861, "grad_norm": 0.4570305347442627, "learning_rate": 4.900874362930753e-05, "loss": 0.0171, "step": 2120 }, { "epoch": 0.059755926497404965, "grad_norm": 0.29739150404930115, "learning_rate": 4.900406789170992e-05, "loss": 0.0372, "step": 2130 }, { "epoch": 0.06003647075326133, "grad_norm": 2.43097186088562, "learning_rate": 4.8999392154112314e-05, "loss": 0.0534, "step": 2140 }, { "epoch": 0.06031701500911769, "grad_norm": 0.5041104555130005, "learning_rate": 4.899471641651471e-05, "loss": 0.0566, "step": 2150 }, { "epoch": 0.06059755926497405, "grad_norm": 0.10294745862483978, "learning_rate": 4.899004067891711e-05, "loss": 0.0253, "step": 2160 }, { "epoch": 0.06087810352083041, "grad_norm": 0.2046901434659958, "learning_rate": 4.898536494131949e-05, "loss": 0.0535, "step": 2170 }, { "epoch": 0.061158647776686774, "grad_norm": 0.05003371462225914, "learning_rate": 4.898068920372189e-05, "loss": 0.0549, "step": 2180 }, { "epoch": 0.06143919203254313, "grad_norm": 0.7600542306900024, "learning_rate": 4.897601346612428e-05, "loss": 0.0336, "step": 2190 }, { "epoch": 0.061719736288399496, "grad_norm": 4.76383638381958, "learning_rate": 4.897133772852668e-05, "loss": 0.0454, "step": 2200 }, { "epoch": 0.062000280544255854, "grad_norm": 0.24637632071971893, "learning_rate": 4.8966661990929066e-05, "loss": 0.0786, "step": 2210 }, { "epoch": 0.06228082480011222, "grad_norm": 0.10467786341905594, "learning_rate": 4.8961986253331466e-05, "loss": 0.0181, "step": 2220 }, { "epoch": 0.06256136905596858, "grad_norm": 1.64877188205719, "learning_rate": 4.895731051573386e-05, "loss": 0.0695, "step": 2230 }, { "epoch": 0.06284191331182494, "grad_norm": 2.6138644218444824, "learning_rate": 4.895263477813625e-05, "loss": 0.0552, "step": 2240 }, { "epoch": 0.0631224575676813, "grad_norm": 0.9759430885314941, "learning_rate": 4.894795904053865e-05, "loss": 0.0256, "step": 2250 }, { "epoch": 0.06340300182353767, "grad_norm": 2.029052734375, "learning_rate": 4.894328330294104e-05, "loss": 0.0867, "step": 2260 }, { "epoch": 0.06368354607939403, "grad_norm": 1.7099350690841675, "learning_rate": 4.893860756534344e-05, "loss": 0.0554, "step": 2270 }, { "epoch": 0.06396409033525038, "grad_norm": 0.29285696148872375, "learning_rate": 4.8933931827745825e-05, "loss": 0.0594, "step": 2280 }, { "epoch": 0.06424463459110674, "grad_norm": 0.06312594562768936, "learning_rate": 4.8929256090148225e-05, "loss": 0.0187, "step": 2290 }, { "epoch": 0.06452517884696311, "grad_norm": 0.02897688001394272, "learning_rate": 4.892458035255062e-05, "loss": 0.0289, "step": 2300 }, { "epoch": 0.06480572310281947, "grad_norm": 0.8514787554740906, "learning_rate": 4.891990461495301e-05, "loss": 0.0747, "step": 2310 }, { "epoch": 0.06508626735867583, "grad_norm": 0.1477888822555542, "learning_rate": 4.8915228877355404e-05, "loss": 0.0408, "step": 2320 }, { "epoch": 0.06536681161453219, "grad_norm": 1.8070688247680664, "learning_rate": 4.89105531397578e-05, "loss": 0.0555, "step": 2330 }, { "epoch": 0.06564735587038856, "grad_norm": 1.0699288845062256, "learning_rate": 4.89058774021602e-05, "loss": 0.0319, "step": 2340 }, { "epoch": 0.06592790012624491, "grad_norm": 0.3155403435230255, "learning_rate": 4.8901201664562584e-05, "loss": 0.0208, "step": 2350 }, { "epoch": 0.06620844438210127, "grad_norm": 0.3118574321269989, "learning_rate": 4.8896525926964984e-05, "loss": 0.0371, "step": 2360 }, { "epoch": 0.06648898863795764, "grad_norm": 1.968988060951233, "learning_rate": 4.889185018936738e-05, "loss": 0.0776, "step": 2370 }, { "epoch": 0.066769532893814, "grad_norm": 0.5310500860214233, "learning_rate": 4.888717445176977e-05, "loss": 0.0448, "step": 2380 }, { "epoch": 0.06705007714967036, "grad_norm": 0.5481597781181335, "learning_rate": 4.888249871417216e-05, "loss": 0.0402, "step": 2390 }, { "epoch": 0.06733062140552672, "grad_norm": 0.19095847010612488, "learning_rate": 4.8877822976574556e-05, "loss": 0.0466, "step": 2400 }, { "epoch": 0.06761116566138309, "grad_norm": 0.1806343048810959, "learning_rate": 4.887314723897695e-05, "loss": 0.0343, "step": 2410 }, { "epoch": 0.06789170991723945, "grad_norm": 0.2624540328979492, "learning_rate": 4.886847150137934e-05, "loss": 0.0216, "step": 2420 }, { "epoch": 0.0681722541730958, "grad_norm": 5.191568851470947, "learning_rate": 4.8863795763781736e-05, "loss": 0.048, "step": 2430 }, { "epoch": 0.06845279842895216, "grad_norm": 0.3591744303703308, "learning_rate": 4.8859120026184136e-05, "loss": 0.0356, "step": 2440 }, { "epoch": 0.06873334268480853, "grad_norm": 1.2097914218902588, "learning_rate": 4.885444428858653e-05, "loss": 0.021, "step": 2450 }, { "epoch": 0.06901388694066489, "grad_norm": 0.11320558190345764, "learning_rate": 4.884976855098892e-05, "loss": 0.0227, "step": 2460 }, { "epoch": 0.06929443119652125, "grad_norm": 0.07876001298427582, "learning_rate": 4.8845092813391315e-05, "loss": 0.0399, "step": 2470 }, { "epoch": 0.06957497545237762, "grad_norm": 0.1261557638645172, "learning_rate": 4.884041707579371e-05, "loss": 0.0334, "step": 2480 }, { "epoch": 0.06985551970823398, "grad_norm": 3.014240264892578, "learning_rate": 4.88357413381961e-05, "loss": 0.0306, "step": 2490 }, { "epoch": 0.07013606396409033, "grad_norm": 0.1168297529220581, "learning_rate": 4.8831065600598494e-05, "loss": 0.0386, "step": 2500 }, { "epoch": 0.07041660821994669, "grad_norm": 0.9644504189491272, "learning_rate": 4.8826389863000894e-05, "loss": 0.0347, "step": 2510 }, { "epoch": 0.07069715247580306, "grad_norm": 1.0333285331726074, "learning_rate": 4.882171412540328e-05, "loss": 0.0404, "step": 2520 }, { "epoch": 0.07097769673165942, "grad_norm": 0.11496692150831223, "learning_rate": 4.881703838780568e-05, "loss": 0.0524, "step": 2530 }, { "epoch": 0.07125824098751578, "grad_norm": 1.586235523223877, "learning_rate": 4.8812362650208074e-05, "loss": 0.053, "step": 2540 }, { "epoch": 0.07153878524337214, "grad_norm": 0.07964395731687546, "learning_rate": 4.880768691261047e-05, "loss": 0.0278, "step": 2550 }, { "epoch": 0.07181932949922851, "grad_norm": 0.24821801483631134, "learning_rate": 4.880301117501286e-05, "loss": 0.0593, "step": 2560 }, { "epoch": 0.07209987375508486, "grad_norm": 0.16367913782596588, "learning_rate": 4.879833543741525e-05, "loss": 0.0233, "step": 2570 }, { "epoch": 0.07238041801094122, "grad_norm": 0.37672024965286255, "learning_rate": 4.879365969981765e-05, "loss": 0.0718, "step": 2580 }, { "epoch": 0.0726609622667976, "grad_norm": 0.9394160509109497, "learning_rate": 4.878898396222004e-05, "loss": 0.0337, "step": 2590 }, { "epoch": 0.07294150652265395, "grad_norm": 0.054679933935403824, "learning_rate": 4.878430822462244e-05, "loss": 0.0081, "step": 2600 }, { "epoch": 0.07322205077851031, "grad_norm": 0.09962423145771027, "learning_rate": 4.8779632487024826e-05, "loss": 0.0164, "step": 2610 }, { "epoch": 0.07350259503436667, "grad_norm": 0.023140782490372658, "learning_rate": 4.8774956749427226e-05, "loss": 0.0278, "step": 2620 }, { "epoch": 0.07378313929022304, "grad_norm": 0.03699498251080513, "learning_rate": 4.877028101182962e-05, "loss": 0.0295, "step": 2630 }, { "epoch": 0.0740636835460794, "grad_norm": 3.56437611579895, "learning_rate": 4.876560527423201e-05, "loss": 0.0534, "step": 2640 }, { "epoch": 0.07434422780193575, "grad_norm": 0.14882956445217133, "learning_rate": 4.876092953663441e-05, "loss": 0.0286, "step": 2650 }, { "epoch": 0.07462477205779211, "grad_norm": 3.741131067276001, "learning_rate": 4.87562537990368e-05, "loss": 0.0419, "step": 2660 }, { "epoch": 0.07490531631364848, "grad_norm": 0.12847883999347687, "learning_rate": 4.87515780614392e-05, "loss": 0.0313, "step": 2670 }, { "epoch": 0.07518586056950484, "grad_norm": 3.666363000869751, "learning_rate": 4.8746902323841585e-05, "loss": 0.0539, "step": 2680 }, { "epoch": 0.0754664048253612, "grad_norm": 0.20049823820590973, "learning_rate": 4.8742226586243985e-05, "loss": 0.0292, "step": 2690 }, { "epoch": 0.07574694908121757, "grad_norm": 0.18441879749298096, "learning_rate": 4.873755084864637e-05, "loss": 0.0429, "step": 2700 }, { "epoch": 0.07602749333707393, "grad_norm": 0.34556883573532104, "learning_rate": 4.873287511104877e-05, "loss": 0.0188, "step": 2710 }, { "epoch": 0.07630803759293028, "grad_norm": 0.2911209166049957, "learning_rate": 4.8728199373451164e-05, "loss": 0.101, "step": 2720 }, { "epoch": 0.07658858184878664, "grad_norm": 2.957385778427124, "learning_rate": 4.872352363585356e-05, "loss": 0.0464, "step": 2730 }, { "epoch": 0.07686912610464301, "grad_norm": 0.05579303577542305, "learning_rate": 4.871884789825595e-05, "loss": 0.0414, "step": 2740 }, { "epoch": 0.07714967036049937, "grad_norm": 0.7333822250366211, "learning_rate": 4.8714172160658344e-05, "loss": 0.0517, "step": 2750 }, { "epoch": 0.07743021461635573, "grad_norm": 0.1691836416721344, "learning_rate": 4.8709496423060744e-05, "loss": 0.0185, "step": 2760 }, { "epoch": 0.07771075887221209, "grad_norm": 0.17085903882980347, "learning_rate": 4.870482068546313e-05, "loss": 0.0492, "step": 2770 }, { "epoch": 0.07799130312806846, "grad_norm": 0.7877228856086731, "learning_rate": 4.870014494786553e-05, "loss": 0.0459, "step": 2780 }, { "epoch": 0.07827184738392481, "grad_norm": 11.754535675048828, "learning_rate": 4.869546921026792e-05, "loss": 0.0196, "step": 2790 }, { "epoch": 0.07855239163978117, "grad_norm": 0.07260871678590775, "learning_rate": 4.8690793472670316e-05, "loss": 0.0515, "step": 2800 }, { "epoch": 0.07883293589563754, "grad_norm": 0.39448630809783936, "learning_rate": 4.868611773507271e-05, "loss": 0.0486, "step": 2810 }, { "epoch": 0.0791134801514939, "grad_norm": 2.148465394973755, "learning_rate": 4.86814419974751e-05, "loss": 0.0493, "step": 2820 }, { "epoch": 0.07939402440735026, "grad_norm": 0.6125537157058716, "learning_rate": 4.8676766259877496e-05, "loss": 0.0319, "step": 2830 }, { "epoch": 0.07967456866320662, "grad_norm": 1.008539080619812, "learning_rate": 4.867209052227989e-05, "loss": 0.0276, "step": 2840 }, { "epoch": 0.07995511291906299, "grad_norm": 0.02734805829823017, "learning_rate": 4.866741478468229e-05, "loss": 0.03, "step": 2850 }, { "epoch": 0.08023565717491934, "grad_norm": 0.19615893065929413, "learning_rate": 4.866273904708468e-05, "loss": 0.0641, "step": 2860 }, { "epoch": 0.0805162014307757, "grad_norm": 0.588692307472229, "learning_rate": 4.8658063309487075e-05, "loss": 0.0538, "step": 2870 }, { "epoch": 0.08079674568663206, "grad_norm": 0.14989154040813446, "learning_rate": 4.865338757188947e-05, "loss": 0.0309, "step": 2880 }, { "epoch": 0.08107728994248843, "grad_norm": 0.19508011639118195, "learning_rate": 4.864871183429186e-05, "loss": 0.0537, "step": 2890 }, { "epoch": 0.08135783419834479, "grad_norm": 0.2596297264099121, "learning_rate": 4.8644036096694255e-05, "loss": 0.0296, "step": 2900 }, { "epoch": 0.08163837845420115, "grad_norm": 0.7628892064094543, "learning_rate": 4.863936035909665e-05, "loss": 0.0512, "step": 2910 }, { "epoch": 0.08191892271005752, "grad_norm": 0.576982319355011, "learning_rate": 4.863468462149904e-05, "loss": 0.063, "step": 2920 }, { "epoch": 0.08219946696591388, "grad_norm": 0.34141871333122253, "learning_rate": 4.863000888390144e-05, "loss": 0.0248, "step": 2930 }, { "epoch": 0.08248001122177023, "grad_norm": 0.16526588797569275, "learning_rate": 4.8625333146303834e-05, "loss": 0.0504, "step": 2940 }, { "epoch": 0.08276055547762659, "grad_norm": 0.07919082790613174, "learning_rate": 4.862065740870623e-05, "loss": 0.0371, "step": 2950 }, { "epoch": 0.08304109973348296, "grad_norm": 0.15731894969940186, "learning_rate": 4.861598167110862e-05, "loss": 0.0096, "step": 2960 }, { "epoch": 0.08332164398933932, "grad_norm": 1.0250416994094849, "learning_rate": 4.8611305933511013e-05, "loss": 0.0333, "step": 2970 }, { "epoch": 0.08360218824519568, "grad_norm": 0.06363264471292496, "learning_rate": 4.8606630195913407e-05, "loss": 0.0251, "step": 2980 }, { "epoch": 0.08388273250105203, "grad_norm": 0.030437499284744263, "learning_rate": 4.86019544583158e-05, "loss": 0.0294, "step": 2990 }, { "epoch": 0.0841632767569084, "grad_norm": 0.08592546731233597, "learning_rate": 4.85972787207182e-05, "loss": 0.0381, "step": 3000 }, { "epoch": 0.08444382101276476, "grad_norm": 5.182921409606934, "learning_rate": 4.8592602983120586e-05, "loss": 0.0654, "step": 3010 }, { "epoch": 0.08472436526862112, "grad_norm": 2.778517484664917, "learning_rate": 4.8587927245522986e-05, "loss": 0.0397, "step": 3020 }, { "epoch": 0.08500490952447749, "grad_norm": 0.15203972160816193, "learning_rate": 4.858325150792538e-05, "loss": 0.0264, "step": 3030 }, { "epoch": 0.08528545378033385, "grad_norm": 0.6498059034347534, "learning_rate": 4.857857577032777e-05, "loss": 0.0299, "step": 3040 }, { "epoch": 0.08556599803619021, "grad_norm": 1.7282336950302124, "learning_rate": 4.8573900032730165e-05, "loss": 0.0485, "step": 3050 }, { "epoch": 0.08584654229204657, "grad_norm": 2.7269656658172607, "learning_rate": 4.856922429513256e-05, "loss": 0.0354, "step": 3060 }, { "epoch": 0.08612708654790294, "grad_norm": 0.2325836569070816, "learning_rate": 4.856454855753496e-05, "loss": 0.0427, "step": 3070 }, { "epoch": 0.0864076308037593, "grad_norm": 0.07272963225841522, "learning_rate": 4.8559872819937345e-05, "loss": 0.0189, "step": 3080 }, { "epoch": 0.08668817505961565, "grad_norm": 6.702547550201416, "learning_rate": 4.8555197082339745e-05, "loss": 0.0485, "step": 3090 }, { "epoch": 0.08696871931547201, "grad_norm": 2.2590365409851074, "learning_rate": 4.855052134474213e-05, "loss": 0.0499, "step": 3100 }, { "epoch": 0.08724926357132838, "grad_norm": 0.202682226896286, "learning_rate": 4.854584560714453e-05, "loss": 0.0515, "step": 3110 }, { "epoch": 0.08752980782718474, "grad_norm": 0.19072073698043823, "learning_rate": 4.854116986954692e-05, "loss": 0.0398, "step": 3120 }, { "epoch": 0.0878103520830411, "grad_norm": 0.2808748185634613, "learning_rate": 4.853649413194932e-05, "loss": 0.0097, "step": 3130 }, { "epoch": 0.08809089633889747, "grad_norm": 0.02730753645300865, "learning_rate": 4.853181839435171e-05, "loss": 0.0456, "step": 3140 }, { "epoch": 0.08837144059475383, "grad_norm": 0.048450618982315063, "learning_rate": 4.8527142656754104e-05, "loss": 0.0596, "step": 3150 }, { "epoch": 0.08865198485061018, "grad_norm": 0.12144970148801804, "learning_rate": 4.8522466919156504e-05, "loss": 0.0121, "step": 3160 }, { "epoch": 0.08893252910646654, "grad_norm": 0.14469780027866364, "learning_rate": 4.851779118155889e-05, "loss": 0.0275, "step": 3170 }, { "epoch": 0.08921307336232291, "grad_norm": 0.06013078987598419, "learning_rate": 4.851311544396129e-05, "loss": 0.0409, "step": 3180 }, { "epoch": 0.08949361761817927, "grad_norm": 0.16498319804668427, "learning_rate": 4.8508439706363676e-05, "loss": 0.0383, "step": 3190 }, { "epoch": 0.08977416187403563, "grad_norm": 0.3369481861591339, "learning_rate": 4.8503763968766076e-05, "loss": 0.0347, "step": 3200 }, { "epoch": 0.09005470612989198, "grad_norm": 0.36221784353256226, "learning_rate": 4.849908823116847e-05, "loss": 0.0303, "step": 3210 }, { "epoch": 0.09033525038574836, "grad_norm": 0.0801977813243866, "learning_rate": 4.849441249357086e-05, "loss": 0.0558, "step": 3220 }, { "epoch": 0.09061579464160471, "grad_norm": 0.10571596771478653, "learning_rate": 4.8489736755973256e-05, "loss": 0.0178, "step": 3230 }, { "epoch": 0.09089633889746107, "grad_norm": 0.0968000665307045, "learning_rate": 4.848506101837565e-05, "loss": 0.0093, "step": 3240 }, { "epoch": 0.09117688315331744, "grad_norm": 0.7715722322463989, "learning_rate": 4.848038528077805e-05, "loss": 0.0853, "step": 3250 }, { "epoch": 0.0914574274091738, "grad_norm": 0.0629628598690033, "learning_rate": 4.8475709543180435e-05, "loss": 0.0221, "step": 3260 }, { "epoch": 0.09173797166503016, "grad_norm": 0.15349021553993225, "learning_rate": 4.8471033805582835e-05, "loss": 0.0284, "step": 3270 }, { "epoch": 0.09201851592088651, "grad_norm": 0.11865158379077911, "learning_rate": 4.846635806798523e-05, "loss": 0.0411, "step": 3280 }, { "epoch": 0.09229906017674289, "grad_norm": 0.26707905530929565, "learning_rate": 4.846168233038762e-05, "loss": 0.0189, "step": 3290 }, { "epoch": 0.09257960443259924, "grad_norm": 1.9449294805526733, "learning_rate": 4.8457006592790015e-05, "loss": 0.0094, "step": 3300 }, { "epoch": 0.0928601486884556, "grad_norm": 0.4774859845638275, "learning_rate": 4.845233085519241e-05, "loss": 0.0228, "step": 3310 }, { "epoch": 0.09314069294431196, "grad_norm": 0.11169194430112839, "learning_rate": 4.84476551175948e-05, "loss": 0.0124, "step": 3320 }, { "epoch": 0.09342123720016833, "grad_norm": 0.3349843919277191, "learning_rate": 4.8442979379997194e-05, "loss": 0.0301, "step": 3330 }, { "epoch": 0.09370178145602469, "grad_norm": 0.1993156522512436, "learning_rate": 4.843830364239959e-05, "loss": 0.0341, "step": 3340 }, { "epoch": 0.09398232571188105, "grad_norm": 0.11695757508277893, "learning_rate": 4.843362790480199e-05, "loss": 0.0179, "step": 3350 }, { "epoch": 0.09426286996773742, "grad_norm": 0.16372798383235931, "learning_rate": 4.842895216720438e-05, "loss": 0.0204, "step": 3360 }, { "epoch": 0.09454341422359377, "grad_norm": 0.6103881597518921, "learning_rate": 4.8424276429606774e-05, "loss": 0.0789, "step": 3370 }, { "epoch": 0.09482395847945013, "grad_norm": 0.8173732757568359, "learning_rate": 4.841960069200917e-05, "loss": 0.057, "step": 3380 }, { "epoch": 0.09510450273530649, "grad_norm": 1.3151508569717407, "learning_rate": 4.841492495441156e-05, "loss": 0.0472, "step": 3390 }, { "epoch": 0.09538504699116286, "grad_norm": 0.507369875907898, "learning_rate": 4.841024921681395e-05, "loss": 0.0723, "step": 3400 }, { "epoch": 0.09566559124701922, "grad_norm": 0.3845808207988739, "learning_rate": 4.8405573479216346e-05, "loss": 0.0254, "step": 3410 }, { "epoch": 0.09594613550287558, "grad_norm": 0.9446573257446289, "learning_rate": 4.8400897741618746e-05, "loss": 0.0446, "step": 3420 }, { "epoch": 0.09622667975873193, "grad_norm": 0.9310720562934875, "learning_rate": 4.839622200402113e-05, "loss": 0.0372, "step": 3430 }, { "epoch": 0.0965072240145883, "grad_norm": 0.3630155622959137, "learning_rate": 4.839154626642353e-05, "loss": 0.0478, "step": 3440 }, { "epoch": 0.09678776827044466, "grad_norm": 1.9750092029571533, "learning_rate": 4.8386870528825926e-05, "loss": 0.0162, "step": 3450 }, { "epoch": 0.09706831252630102, "grad_norm": 0.6932790279388428, "learning_rate": 4.838219479122832e-05, "loss": 0.0323, "step": 3460 }, { "epoch": 0.09734885678215739, "grad_norm": 16.956911087036133, "learning_rate": 4.837751905363071e-05, "loss": 0.0621, "step": 3470 }, { "epoch": 0.09762940103801375, "grad_norm": 0.6182325482368469, "learning_rate": 4.8372843316033105e-05, "loss": 0.0566, "step": 3480 }, { "epoch": 0.09790994529387011, "grad_norm": 3.502262592315674, "learning_rate": 4.8368167578435505e-05, "loss": 0.0309, "step": 3490 }, { "epoch": 0.09819048954972646, "grad_norm": 0.07095052301883698, "learning_rate": 4.836349184083789e-05, "loss": 0.0344, "step": 3500 }, { "epoch": 0.09847103380558284, "grad_norm": 0.1703258603811264, "learning_rate": 4.835881610324029e-05, "loss": 0.0256, "step": 3510 }, { "epoch": 0.0987515780614392, "grad_norm": 0.048722609877586365, "learning_rate": 4.835414036564268e-05, "loss": 0.0356, "step": 3520 }, { "epoch": 0.09903212231729555, "grad_norm": 0.06596146523952484, "learning_rate": 4.834946462804508e-05, "loss": 0.0382, "step": 3530 }, { "epoch": 0.09931266657315191, "grad_norm": 3.7993950843811035, "learning_rate": 4.834478889044747e-05, "loss": 0.0323, "step": 3540 }, { "epoch": 0.09959321082900828, "grad_norm": 0.36656859517097473, "learning_rate": 4.8340113152849864e-05, "loss": 0.0322, "step": 3550 }, { "epoch": 0.09987375508486464, "grad_norm": 0.1252017319202423, "learning_rate": 4.8335437415252264e-05, "loss": 0.0376, "step": 3560 }, { "epoch": 0.100154299340721, "grad_norm": 0.1068749949336052, "learning_rate": 4.833076167765465e-05, "loss": 0.0362, "step": 3570 }, { "epoch": 0.10043484359657737, "grad_norm": 0.23751802742481232, "learning_rate": 4.832608594005705e-05, "loss": 0.0407, "step": 3580 }, { "epoch": 0.10071538785243372, "grad_norm": 1.2911545038223267, "learning_rate": 4.8321410202459437e-05, "loss": 0.0795, "step": 3590 }, { "epoch": 0.10099593210829008, "grad_norm": 0.18909408152103424, "learning_rate": 4.8316734464861836e-05, "loss": 0.0455, "step": 3600 }, { "epoch": 0.10127647636414644, "grad_norm": 1.3952144384384155, "learning_rate": 4.831205872726422e-05, "loss": 0.0409, "step": 3610 }, { "epoch": 0.10155702062000281, "grad_norm": 0.377779483795166, "learning_rate": 4.830738298966662e-05, "loss": 0.0382, "step": 3620 }, { "epoch": 0.10183756487585917, "grad_norm": 0.24896253645420074, "learning_rate": 4.8302707252069016e-05, "loss": 0.0615, "step": 3630 }, { "epoch": 0.10211810913171553, "grad_norm": 3.160722017288208, "learning_rate": 4.829803151447141e-05, "loss": 0.0562, "step": 3640 }, { "epoch": 0.10239865338757188, "grad_norm": 0.2485857605934143, "learning_rate": 4.82933557768738e-05, "loss": 0.0573, "step": 3650 }, { "epoch": 0.10267919764342825, "grad_norm": 9.698915481567383, "learning_rate": 4.8288680039276195e-05, "loss": 0.0478, "step": 3660 }, { "epoch": 0.10295974189928461, "grad_norm": 5.956120491027832, "learning_rate": 4.8284004301678595e-05, "loss": 0.0703, "step": 3670 }, { "epoch": 0.10324028615514097, "grad_norm": 0.1764584332704544, "learning_rate": 4.827932856408098e-05, "loss": 0.0445, "step": 3680 }, { "epoch": 0.10352083041099734, "grad_norm": 0.021128475666046143, "learning_rate": 4.827465282648338e-05, "loss": 0.0376, "step": 3690 }, { "epoch": 0.1038013746668537, "grad_norm": 0.560319185256958, "learning_rate": 4.8269977088885775e-05, "loss": 0.0076, "step": 3700 }, { "epoch": 0.10408191892271006, "grad_norm": 0.6247649192810059, "learning_rate": 4.826530135128817e-05, "loss": 0.0431, "step": 3710 }, { "epoch": 0.10436246317856641, "grad_norm": 0.08932629972696304, "learning_rate": 4.826062561369056e-05, "loss": 0.05, "step": 3720 }, { "epoch": 0.10464300743442279, "grad_norm": 0.20253758132457733, "learning_rate": 4.8255949876092954e-05, "loss": 0.0274, "step": 3730 }, { "epoch": 0.10492355169027914, "grad_norm": 0.1929681897163391, "learning_rate": 4.825127413849535e-05, "loss": 0.0614, "step": 3740 }, { "epoch": 0.1052040959461355, "grad_norm": 0.19507913291454315, "learning_rate": 4.824659840089774e-05, "loss": 0.0209, "step": 3750 }, { "epoch": 0.10548464020199186, "grad_norm": 2.405012845993042, "learning_rate": 4.824192266330014e-05, "loss": 0.0109, "step": 3760 }, { "epoch": 0.10576518445784823, "grad_norm": 0.8624261021614075, "learning_rate": 4.8237246925702534e-05, "loss": 0.0395, "step": 3770 }, { "epoch": 0.10604572871370459, "grad_norm": 0.10504943132400513, "learning_rate": 4.823257118810493e-05, "loss": 0.0208, "step": 3780 }, { "epoch": 0.10632627296956094, "grad_norm": 0.41476312279701233, "learning_rate": 4.822789545050732e-05, "loss": 0.0464, "step": 3790 }, { "epoch": 0.10660681722541732, "grad_norm": 0.13577328622341156, "learning_rate": 4.822321971290971e-05, "loss": 0.0333, "step": 3800 }, { "epoch": 0.10688736148127367, "grad_norm": 0.12625598907470703, "learning_rate": 4.8218543975312106e-05, "loss": 0.0355, "step": 3810 }, { "epoch": 0.10716790573713003, "grad_norm": 1.7914620637893677, "learning_rate": 4.82138682377145e-05, "loss": 0.041, "step": 3820 }, { "epoch": 0.10744844999298639, "grad_norm": 5.014822959899902, "learning_rate": 4.820919250011689e-05, "loss": 0.0631, "step": 3830 }, { "epoch": 0.10772899424884276, "grad_norm": 0.31051886081695557, "learning_rate": 4.820451676251929e-05, "loss": 0.056, "step": 3840 }, { "epoch": 0.10800953850469912, "grad_norm": 1.1024903059005737, "learning_rate": 4.8199841024921686e-05, "loss": 0.041, "step": 3850 }, { "epoch": 0.10829008276055548, "grad_norm": 0.43458837270736694, "learning_rate": 4.819516528732408e-05, "loss": 0.0609, "step": 3860 }, { "epoch": 0.10857062701641183, "grad_norm": 1.1420408487319946, "learning_rate": 4.819048954972647e-05, "loss": 0.035, "step": 3870 }, { "epoch": 0.1088511712722682, "grad_norm": 5.284061431884766, "learning_rate": 4.8185813812128865e-05, "loss": 0.0229, "step": 3880 }, { "epoch": 0.10913171552812456, "grad_norm": 0.23013779520988464, "learning_rate": 4.8181138074531265e-05, "loss": 0.0251, "step": 3890 }, { "epoch": 0.10941225978398092, "grad_norm": 0.05622800439596176, "learning_rate": 4.817646233693365e-05, "loss": 0.046, "step": 3900 }, { "epoch": 0.10969280403983729, "grad_norm": 1.0708738565444946, "learning_rate": 4.817178659933605e-05, "loss": 0.0708, "step": 3910 }, { "epoch": 0.10997334829569365, "grad_norm": 0.5900218486785889, "learning_rate": 4.816711086173844e-05, "loss": 0.0305, "step": 3920 }, { "epoch": 0.11025389255155, "grad_norm": 0.22604501247406006, "learning_rate": 4.816243512414084e-05, "loss": 0.0415, "step": 3930 }, { "epoch": 0.11053443680740636, "grad_norm": 0.05938999727368355, "learning_rate": 4.815775938654323e-05, "loss": 0.0324, "step": 3940 }, { "epoch": 0.11081498106326274, "grad_norm": 3.5682740211486816, "learning_rate": 4.8153083648945624e-05, "loss": 0.0404, "step": 3950 }, { "epoch": 0.11109552531911909, "grad_norm": 0.06054426729679108, "learning_rate": 4.814840791134802e-05, "loss": 0.0148, "step": 3960 }, { "epoch": 0.11137606957497545, "grad_norm": 1.173440933227539, "learning_rate": 4.814373217375041e-05, "loss": 0.0317, "step": 3970 }, { "epoch": 0.11165661383083181, "grad_norm": 0.5411765575408936, "learning_rate": 4.813905643615281e-05, "loss": 0.0244, "step": 3980 }, { "epoch": 0.11193715808668818, "grad_norm": 0.3670228123664856, "learning_rate": 4.8134380698555197e-05, "loss": 0.0826, "step": 3990 }, { "epoch": 0.11221770234254454, "grad_norm": 0.25620537996292114, "learning_rate": 4.8129704960957597e-05, "loss": 0.0301, "step": 4000 }, { "epoch": 0.1124982465984009, "grad_norm": 0.12509724497795105, "learning_rate": 4.812502922335998e-05, "loss": 0.0272, "step": 4010 }, { "epoch": 0.11277879085425727, "grad_norm": 0.0660891979932785, "learning_rate": 4.812035348576238e-05, "loss": 0.0225, "step": 4020 }, { "epoch": 0.11305933511011362, "grad_norm": 9.35112190246582, "learning_rate": 4.8115677748164776e-05, "loss": 0.0561, "step": 4030 }, { "epoch": 0.11333987936596998, "grad_norm": 0.23542071878910065, "learning_rate": 4.811100201056717e-05, "loss": 0.0178, "step": 4040 }, { "epoch": 0.11362042362182634, "grad_norm": 0.13370464742183685, "learning_rate": 4.810632627296956e-05, "loss": 0.0285, "step": 4050 }, { "epoch": 0.11390096787768271, "grad_norm": 0.7809809446334839, "learning_rate": 4.8101650535371955e-05, "loss": 0.0401, "step": 4060 }, { "epoch": 0.11418151213353907, "grad_norm": 0.6121453642845154, "learning_rate": 4.8096974797774355e-05, "loss": 0.0601, "step": 4070 }, { "epoch": 0.11446205638939543, "grad_norm": 0.07100296765565872, "learning_rate": 4.809229906017674e-05, "loss": 0.0224, "step": 4080 }, { "epoch": 0.11474260064525178, "grad_norm": 0.05799086391925812, "learning_rate": 4.808762332257914e-05, "loss": 0.0162, "step": 4090 }, { "epoch": 0.11502314490110815, "grad_norm": 1.5650147199630737, "learning_rate": 4.8082947584981535e-05, "loss": 0.0365, "step": 4100 }, { "epoch": 0.11530368915696451, "grad_norm": 0.9093208909034729, "learning_rate": 4.807827184738393e-05, "loss": 0.0488, "step": 4110 }, { "epoch": 0.11558423341282087, "grad_norm": 21.462209701538086, "learning_rate": 4.807359610978632e-05, "loss": 0.0491, "step": 4120 }, { "epoch": 0.11586477766867724, "grad_norm": 0.2652363181114197, "learning_rate": 4.8068920372188714e-05, "loss": 0.0179, "step": 4130 }, { "epoch": 0.1161453219245336, "grad_norm": 0.19232790172100067, "learning_rate": 4.806424463459111e-05, "loss": 0.0442, "step": 4140 }, { "epoch": 0.11642586618038996, "grad_norm": 0.13013720512390137, "learning_rate": 4.80595688969935e-05, "loss": 0.0726, "step": 4150 }, { "epoch": 0.11670641043624631, "grad_norm": 1.9455286264419556, "learning_rate": 4.80548931593959e-05, "loss": 0.0222, "step": 4160 }, { "epoch": 0.11698695469210268, "grad_norm": 1.8125176429748535, "learning_rate": 4.8050217421798294e-05, "loss": 0.0444, "step": 4170 }, { "epoch": 0.11726749894795904, "grad_norm": 0.07561127841472626, "learning_rate": 4.804554168420069e-05, "loss": 0.0339, "step": 4180 }, { "epoch": 0.1175480432038154, "grad_norm": 20.82721710205078, "learning_rate": 4.804086594660308e-05, "loss": 0.0406, "step": 4190 }, { "epoch": 0.11782858745967176, "grad_norm": 0.5166816711425781, "learning_rate": 4.803619020900547e-05, "loss": 0.0732, "step": 4200 }, { "epoch": 0.11810913171552813, "grad_norm": 1.2590614557266235, "learning_rate": 4.8031514471407866e-05, "loss": 0.0777, "step": 4210 }, { "epoch": 0.11838967597138449, "grad_norm": 0.1727217584848404, "learning_rate": 4.802683873381026e-05, "loss": 0.0341, "step": 4220 }, { "epoch": 0.11867022022724084, "grad_norm": 0.4193609356880188, "learning_rate": 4.802216299621265e-05, "loss": 0.0264, "step": 4230 }, { "epoch": 0.11895076448309722, "grad_norm": 0.3894844949245453, "learning_rate": 4.801748725861505e-05, "loss": 0.0444, "step": 4240 }, { "epoch": 0.11923130873895357, "grad_norm": 0.18095438182353973, "learning_rate": 4.801281152101744e-05, "loss": 0.0243, "step": 4250 }, { "epoch": 0.11951185299480993, "grad_norm": 2.326235294342041, "learning_rate": 4.800813578341984e-05, "loss": 0.0256, "step": 4260 }, { "epoch": 0.11979239725066629, "grad_norm": 0.556398868560791, "learning_rate": 4.800346004582223e-05, "loss": 0.0347, "step": 4270 }, { "epoch": 0.12007294150652266, "grad_norm": 0.3265383839607239, "learning_rate": 4.7998784308224625e-05, "loss": 0.0297, "step": 4280 }, { "epoch": 0.12035348576237902, "grad_norm": 0.09152190387248993, "learning_rate": 4.799410857062702e-05, "loss": 0.0325, "step": 4290 }, { "epoch": 0.12063403001823537, "grad_norm": 2.105902910232544, "learning_rate": 4.798943283302941e-05, "loss": 0.0249, "step": 4300 }, { "epoch": 0.12091457427409173, "grad_norm": 0.5893357396125793, "learning_rate": 4.798475709543181e-05, "loss": 0.0416, "step": 4310 }, { "epoch": 0.1211951185299481, "grad_norm": 0.09205744415521622, "learning_rate": 4.79800813578342e-05, "loss": 0.0468, "step": 4320 }, { "epoch": 0.12147566278580446, "grad_norm": 2.674283981323242, "learning_rate": 4.79754056202366e-05, "loss": 0.0485, "step": 4330 }, { "epoch": 0.12175620704166082, "grad_norm": 0.0965878814458847, "learning_rate": 4.7970729882638984e-05, "loss": 0.029, "step": 4340 }, { "epoch": 0.12203675129751718, "grad_norm": 0.03297443687915802, "learning_rate": 4.7966054145041384e-05, "loss": 0.0189, "step": 4350 }, { "epoch": 0.12231729555337355, "grad_norm": 0.13219986855983734, "learning_rate": 4.796137840744378e-05, "loss": 0.0385, "step": 4360 }, { "epoch": 0.1225978398092299, "grad_norm": 0.9125007390975952, "learning_rate": 4.795670266984617e-05, "loss": 0.0563, "step": 4370 }, { "epoch": 0.12287838406508626, "grad_norm": 0.16221286356449127, "learning_rate": 4.795202693224857e-05, "loss": 0.0391, "step": 4380 }, { "epoch": 0.12315892832094263, "grad_norm": 0.09438024461269379, "learning_rate": 4.794735119465096e-05, "loss": 0.0053, "step": 4390 }, { "epoch": 0.12343947257679899, "grad_norm": 0.028559811413288116, "learning_rate": 4.794267545705336e-05, "loss": 0.0292, "step": 4400 }, { "epoch": 0.12372001683265535, "grad_norm": 0.17028838396072388, "learning_rate": 4.793799971945574e-05, "loss": 0.0102, "step": 4410 }, { "epoch": 0.12400056108851171, "grad_norm": 0.18625353276729584, "learning_rate": 4.793332398185814e-05, "loss": 0.0426, "step": 4420 }, { "epoch": 0.12428110534436808, "grad_norm": 0.6952725648880005, "learning_rate": 4.792864824426053e-05, "loss": 0.0489, "step": 4430 }, { "epoch": 0.12456164960022444, "grad_norm": 0.9878958463668823, "learning_rate": 4.792397250666293e-05, "loss": 0.0462, "step": 4440 }, { "epoch": 0.1248421938560808, "grad_norm": 2.835447311401367, "learning_rate": 4.791929676906532e-05, "loss": 0.0335, "step": 4450 }, { "epoch": 0.12512273811193717, "grad_norm": 0.16189166903495789, "learning_rate": 4.7914621031467716e-05, "loss": 0.0209, "step": 4460 }, { "epoch": 0.12540328236779352, "grad_norm": 2.283308267593384, "learning_rate": 4.7909945293870115e-05, "loss": 0.0532, "step": 4470 }, { "epoch": 0.12568382662364988, "grad_norm": 0.39400389790534973, "learning_rate": 4.79052695562725e-05, "loss": 0.0406, "step": 4480 }, { "epoch": 0.12596437087950624, "grad_norm": 0.0988137423992157, "learning_rate": 4.79005938186749e-05, "loss": 0.036, "step": 4490 }, { "epoch": 0.1262449151353626, "grad_norm": 0.11679941415786743, "learning_rate": 4.789591808107729e-05, "loss": 0.0468, "step": 4500 }, { "epoch": 0.12652545939121895, "grad_norm": 1.5879833698272705, "learning_rate": 4.789124234347969e-05, "loss": 0.0306, "step": 4510 }, { "epoch": 0.12680600364707534, "grad_norm": 0.8052906394004822, "learning_rate": 4.788656660588208e-05, "loss": 0.0445, "step": 4520 }, { "epoch": 0.1270865479029317, "grad_norm": 0.4797222316265106, "learning_rate": 4.7881890868284474e-05, "loss": 0.0489, "step": 4530 }, { "epoch": 0.12736709215878805, "grad_norm": 0.08351923525333405, "learning_rate": 4.787721513068687e-05, "loss": 0.0385, "step": 4540 }, { "epoch": 0.1276476364146444, "grad_norm": 0.10643807798624039, "learning_rate": 4.787253939308926e-05, "loss": 0.034, "step": 4550 }, { "epoch": 0.12792818067050077, "grad_norm": 0.18182066082954407, "learning_rate": 4.7867863655491654e-05, "loss": 0.0411, "step": 4560 }, { "epoch": 0.12820872492635713, "grad_norm": 1.3187745809555054, "learning_rate": 4.786318791789405e-05, "loss": 0.0421, "step": 4570 }, { "epoch": 0.12848926918221348, "grad_norm": 1.2165220975875854, "learning_rate": 4.785851218029645e-05, "loss": 0.0151, "step": 4580 }, { "epoch": 0.12876981343806987, "grad_norm": 0.581152081489563, "learning_rate": 4.785383644269884e-05, "loss": 0.0846, "step": 4590 }, { "epoch": 0.12905035769392623, "grad_norm": 0.06336618959903717, "learning_rate": 4.784916070510123e-05, "loss": 0.0456, "step": 4600 }, { "epoch": 0.12933090194978258, "grad_norm": 0.1202174723148346, "learning_rate": 4.7844484967503626e-05, "loss": 0.0415, "step": 4610 }, { "epoch": 0.12961144620563894, "grad_norm": 0.9171125888824463, "learning_rate": 4.783980922990602e-05, "loss": 0.0513, "step": 4620 }, { "epoch": 0.1298919904614953, "grad_norm": 0.226665198802948, "learning_rate": 4.783513349230841e-05, "loss": 0.0476, "step": 4630 }, { "epoch": 0.13017253471735166, "grad_norm": 1.032552719116211, "learning_rate": 4.7830457754710806e-05, "loss": 0.0608, "step": 4640 }, { "epoch": 0.13045307897320801, "grad_norm": 0.10941661149263382, "learning_rate": 4.78257820171132e-05, "loss": 0.0682, "step": 4650 }, { "epoch": 0.13073362322906437, "grad_norm": 0.5409769415855408, "learning_rate": 4.78211062795156e-05, "loss": 0.0504, "step": 4660 }, { "epoch": 0.13101416748492076, "grad_norm": 0.875572144985199, "learning_rate": 4.781643054191799e-05, "loss": 0.0501, "step": 4670 }, { "epoch": 0.13129471174077711, "grad_norm": 0.1165657788515091, "learning_rate": 4.7811754804320385e-05, "loss": 0.0313, "step": 4680 }, { "epoch": 0.13157525599663347, "grad_norm": 0.18573306500911713, "learning_rate": 4.780707906672278e-05, "loss": 0.0512, "step": 4690 }, { "epoch": 0.13185580025248983, "grad_norm": 0.2954259514808655, "learning_rate": 4.780240332912517e-05, "loss": 0.0233, "step": 4700 }, { "epoch": 0.1321363445083462, "grad_norm": 1.2420212030410767, "learning_rate": 4.7797727591527565e-05, "loss": 0.0289, "step": 4710 }, { "epoch": 0.13241688876420254, "grad_norm": 0.0877738744020462, "learning_rate": 4.779305185392996e-05, "loss": 0.0121, "step": 4720 }, { "epoch": 0.1326974330200589, "grad_norm": 1.9130874872207642, "learning_rate": 4.778837611633236e-05, "loss": 0.0476, "step": 4730 }, { "epoch": 0.1329779772759153, "grad_norm": 0.12136524170637131, "learning_rate": 4.7783700378734744e-05, "loss": 0.038, "step": 4740 }, { "epoch": 0.13325852153177165, "grad_norm": 0.1178927943110466, "learning_rate": 4.7779024641137144e-05, "loss": 0.0594, "step": 4750 }, { "epoch": 0.133539065787628, "grad_norm": 0.03250780329108238, "learning_rate": 4.777434890353954e-05, "loss": 0.0541, "step": 4760 }, { "epoch": 0.13381961004348436, "grad_norm": 0.6142797470092773, "learning_rate": 4.776967316594193e-05, "loss": 0.0352, "step": 4770 }, { "epoch": 0.13410015429934072, "grad_norm": 6.727725982666016, "learning_rate": 4.7764997428344324e-05, "loss": 0.0507, "step": 4780 }, { "epoch": 0.13438069855519708, "grad_norm": 0.21084046363830566, "learning_rate": 4.776032169074672e-05, "loss": 0.0352, "step": 4790 }, { "epoch": 0.13466124281105343, "grad_norm": 0.818300724029541, "learning_rate": 4.775564595314912e-05, "loss": 0.0526, "step": 4800 }, { "epoch": 0.13494178706690982, "grad_norm": 1.1570965051651, "learning_rate": 4.77509702155515e-05, "loss": 0.0605, "step": 4810 }, { "epoch": 0.13522233132276618, "grad_norm": 0.3180531859397888, "learning_rate": 4.77462944779539e-05, "loss": 0.0687, "step": 4820 }, { "epoch": 0.13550287557862253, "grad_norm": 0.09996229410171509, "learning_rate": 4.774161874035629e-05, "loss": 0.0618, "step": 4830 }, { "epoch": 0.1357834198344789, "grad_norm": 1.5224186182022095, "learning_rate": 4.773694300275869e-05, "loss": 0.0384, "step": 4840 }, { "epoch": 0.13606396409033525, "grad_norm": 0.10895591974258423, "learning_rate": 4.773226726516108e-05, "loss": 0.0193, "step": 4850 }, { "epoch": 0.1363445083461916, "grad_norm": 0.08084195852279663, "learning_rate": 4.7727591527563476e-05, "loss": 0.0687, "step": 4860 }, { "epoch": 0.13662505260204796, "grad_norm": 1.3311302661895752, "learning_rate": 4.772291578996587e-05, "loss": 0.0679, "step": 4870 }, { "epoch": 0.13690559685790432, "grad_norm": 3.3622066974639893, "learning_rate": 4.771824005236826e-05, "loss": 0.0447, "step": 4880 }, { "epoch": 0.1371861411137607, "grad_norm": 0.13115733861923218, "learning_rate": 4.771356431477066e-05, "loss": 0.0676, "step": 4890 }, { "epoch": 0.13746668536961706, "grad_norm": 0.6186093091964722, "learning_rate": 4.770888857717305e-05, "loss": 0.0431, "step": 4900 }, { "epoch": 0.13774722962547342, "grad_norm": 3.9642820358276367, "learning_rate": 4.770421283957545e-05, "loss": 0.0594, "step": 4910 }, { "epoch": 0.13802777388132978, "grad_norm": 0.42617806792259216, "learning_rate": 4.7699537101977835e-05, "loss": 0.0315, "step": 4920 }, { "epoch": 0.13830831813718614, "grad_norm": 0.4854031205177307, "learning_rate": 4.7694861364380234e-05, "loss": 0.0761, "step": 4930 }, { "epoch": 0.1385888623930425, "grad_norm": 0.6262651085853577, "learning_rate": 4.769018562678263e-05, "loss": 0.0384, "step": 4940 }, { "epoch": 0.13886940664889885, "grad_norm": 1.246474027633667, "learning_rate": 4.768550988918502e-05, "loss": 0.0295, "step": 4950 }, { "epoch": 0.13914995090475524, "grad_norm": 0.06829962134361267, "learning_rate": 4.7680834151587414e-05, "loss": 0.016, "step": 4960 }, { "epoch": 0.1394304951606116, "grad_norm": 0.8612080812454224, "learning_rate": 4.767615841398981e-05, "loss": 0.0484, "step": 4970 }, { "epoch": 0.13971103941646795, "grad_norm": 0.5869552493095398, "learning_rate": 4.767148267639221e-05, "loss": 0.0488, "step": 4980 }, { "epoch": 0.1399915836723243, "grad_norm": 1.1129244565963745, "learning_rate": 4.7666806938794593e-05, "loss": 0.0562, "step": 4990 }, { "epoch": 0.14027212792818067, "grad_norm": 0.1486845761537552, "learning_rate": 4.766213120119699e-05, "loss": 0.0169, "step": 5000 }, { "epoch": 0.14055267218403703, "grad_norm": 0.2459857314825058, "learning_rate": 4.7657455463599387e-05, "loss": 0.0237, "step": 5010 }, { "epoch": 0.14083321643989338, "grad_norm": 1.2295652627944946, "learning_rate": 4.765277972600178e-05, "loss": 0.0212, "step": 5020 }, { "epoch": 0.14111376069574977, "grad_norm": 2.801490545272827, "learning_rate": 4.764810398840417e-05, "loss": 0.0286, "step": 5030 }, { "epoch": 0.14139430495160613, "grad_norm": 0.7344323992729187, "learning_rate": 4.7643428250806566e-05, "loss": 0.0577, "step": 5040 }, { "epoch": 0.14167484920746248, "grad_norm": 0.8320244550704956, "learning_rate": 4.763875251320896e-05, "loss": 0.031, "step": 5050 }, { "epoch": 0.14195539346331884, "grad_norm": 0.18310998380184174, "learning_rate": 4.763407677561135e-05, "loss": 0.0429, "step": 5060 }, { "epoch": 0.1422359377191752, "grad_norm": 0.08151710778474808, "learning_rate": 4.762940103801375e-05, "loss": 0.0218, "step": 5070 }, { "epoch": 0.14251648197503156, "grad_norm": 9.984257698059082, "learning_rate": 4.7624725300416145e-05, "loss": 0.0105, "step": 5080 }, { "epoch": 0.1427970262308879, "grad_norm": 0.5636879205703735, "learning_rate": 4.762004956281854e-05, "loss": 0.0729, "step": 5090 }, { "epoch": 0.14307757048674427, "grad_norm": 5.895951271057129, "learning_rate": 4.761537382522093e-05, "loss": 0.0486, "step": 5100 }, { "epoch": 0.14335811474260066, "grad_norm": 0.06642309576272964, "learning_rate": 4.7610698087623325e-05, "loss": 0.0251, "step": 5110 }, { "epoch": 0.14363865899845701, "grad_norm": 0.059291765093803406, "learning_rate": 4.760602235002572e-05, "loss": 0.0214, "step": 5120 }, { "epoch": 0.14391920325431337, "grad_norm": 0.20821352303028107, "learning_rate": 4.760134661242811e-05, "loss": 0.038, "step": 5130 }, { "epoch": 0.14419974751016973, "grad_norm": 0.4051700830459595, "learning_rate": 4.7596670874830504e-05, "loss": 0.0279, "step": 5140 }, { "epoch": 0.1444802917660261, "grad_norm": 0.20855094492435455, "learning_rate": 4.7591995137232904e-05, "loss": 0.0281, "step": 5150 }, { "epoch": 0.14476083602188244, "grad_norm": 0.07500243932008743, "learning_rate": 4.758731939963529e-05, "loss": 0.0306, "step": 5160 }, { "epoch": 0.1450413802777388, "grad_norm": 0.09851474314928055, "learning_rate": 4.758264366203769e-05, "loss": 0.04, "step": 5170 }, { "epoch": 0.1453219245335952, "grad_norm": 0.11080461740493774, "learning_rate": 4.7577967924440084e-05, "loss": 0.0304, "step": 5180 }, { "epoch": 0.14560246878945154, "grad_norm": 0.3976882994174957, "learning_rate": 4.757329218684248e-05, "loss": 0.0438, "step": 5190 }, { "epoch": 0.1458830130453079, "grad_norm": 0.059920139610767365, "learning_rate": 4.756861644924487e-05, "loss": 0.0417, "step": 5200 }, { "epoch": 0.14616355730116426, "grad_norm": 4.457353115081787, "learning_rate": 4.756394071164726e-05, "loss": 0.0443, "step": 5210 }, { "epoch": 0.14644410155702062, "grad_norm": 2.2652711868286133, "learning_rate": 4.755926497404966e-05, "loss": 0.0224, "step": 5220 }, { "epoch": 0.14672464581287697, "grad_norm": 7.388555526733398, "learning_rate": 4.755458923645205e-05, "loss": 0.034, "step": 5230 }, { "epoch": 0.14700519006873333, "grad_norm": 0.45829102396965027, "learning_rate": 4.754991349885445e-05, "loss": 0.0341, "step": 5240 }, { "epoch": 0.14728573432458972, "grad_norm": 0.04918600246310234, "learning_rate": 4.7545237761256836e-05, "loss": 0.0602, "step": 5250 }, { "epoch": 0.14756627858044608, "grad_norm": 0.556611955165863, "learning_rate": 4.7540562023659236e-05, "loss": 0.0374, "step": 5260 }, { "epoch": 0.14784682283630243, "grad_norm": 0.5755372643470764, "learning_rate": 4.753588628606163e-05, "loss": 0.0334, "step": 5270 }, { "epoch": 0.1481273670921588, "grad_norm": 0.06621954590082169, "learning_rate": 4.753121054846402e-05, "loss": 0.0118, "step": 5280 }, { "epoch": 0.14840791134801515, "grad_norm": 0.6427789330482483, "learning_rate": 4.752653481086642e-05, "loss": 0.0159, "step": 5290 }, { "epoch": 0.1486884556038715, "grad_norm": 0.10230038315057755, "learning_rate": 4.752185907326881e-05, "loss": 0.0348, "step": 5300 }, { "epoch": 0.14896899985972786, "grad_norm": 0.16173231601715088, "learning_rate": 4.751718333567121e-05, "loss": 0.0497, "step": 5310 }, { "epoch": 0.14924954411558422, "grad_norm": 0.22794955968856812, "learning_rate": 4.7512507598073595e-05, "loss": 0.0339, "step": 5320 }, { "epoch": 0.1495300883714406, "grad_norm": 0.7179032564163208, "learning_rate": 4.7507831860475995e-05, "loss": 0.0404, "step": 5330 }, { "epoch": 0.14981063262729696, "grad_norm": 0.18341122567653656, "learning_rate": 4.750315612287838e-05, "loss": 0.0282, "step": 5340 }, { "epoch": 0.15009117688315332, "grad_norm": 1.1655545234680176, "learning_rate": 4.749848038528078e-05, "loss": 0.0264, "step": 5350 }, { "epoch": 0.15037172113900968, "grad_norm": 0.8281294703483582, "learning_rate": 4.7493804647683174e-05, "loss": 0.0181, "step": 5360 }, { "epoch": 0.15065226539486604, "grad_norm": 0.04481811076402664, "learning_rate": 4.748912891008557e-05, "loss": 0.0333, "step": 5370 }, { "epoch": 0.1509328096507224, "grad_norm": 0.6486610174179077, "learning_rate": 4.748445317248797e-05, "loss": 0.0614, "step": 5380 }, { "epoch": 0.15121335390657875, "grad_norm": 0.31229016184806824, "learning_rate": 4.7479777434890354e-05, "loss": 0.0615, "step": 5390 }, { "epoch": 0.15149389816243514, "grad_norm": 0.03932427614927292, "learning_rate": 4.7475101697292753e-05, "loss": 0.0139, "step": 5400 }, { "epoch": 0.1517744424182915, "grad_norm": 1.8958338499069214, "learning_rate": 4.747042595969514e-05, "loss": 0.0365, "step": 5410 }, { "epoch": 0.15205498667414785, "grad_norm": 1.7459425926208496, "learning_rate": 4.746575022209754e-05, "loss": 0.0754, "step": 5420 }, { "epoch": 0.1523355309300042, "grad_norm": 8.055896759033203, "learning_rate": 4.746107448449993e-05, "loss": 0.0482, "step": 5430 }, { "epoch": 0.15261607518586057, "grad_norm": 0.2634962201118469, "learning_rate": 4.7456398746902326e-05, "loss": 0.0467, "step": 5440 }, { "epoch": 0.15289661944171692, "grad_norm": 0.37900716066360474, "learning_rate": 4.745172300930472e-05, "loss": 0.0211, "step": 5450 }, { "epoch": 0.15317716369757328, "grad_norm": 0.31229251623153687, "learning_rate": 4.744704727170711e-05, "loss": 0.0312, "step": 5460 }, { "epoch": 0.15345770795342967, "grad_norm": 0.5199376344680786, "learning_rate": 4.7442371534109506e-05, "loss": 0.0633, "step": 5470 }, { "epoch": 0.15373825220928602, "grad_norm": 0.8114891052246094, "learning_rate": 4.74376957965119e-05, "loss": 0.0321, "step": 5480 }, { "epoch": 0.15401879646514238, "grad_norm": 0.2665819227695465, "learning_rate": 4.74330200589143e-05, "loss": 0.0332, "step": 5490 }, { "epoch": 0.15429934072099874, "grad_norm": 0.12447665631771088, "learning_rate": 4.742834432131669e-05, "loss": 0.0423, "step": 5500 }, { "epoch": 0.1545798849768551, "grad_norm": 0.12392882257699966, "learning_rate": 4.7423668583719085e-05, "loss": 0.0373, "step": 5510 }, { "epoch": 0.15486042923271146, "grad_norm": 0.027480874210596085, "learning_rate": 4.741899284612148e-05, "loss": 0.0216, "step": 5520 }, { "epoch": 0.1551409734885678, "grad_norm": 0.33492496609687805, "learning_rate": 4.741431710852387e-05, "loss": 0.0438, "step": 5530 }, { "epoch": 0.15542151774442417, "grad_norm": 1.4505982398986816, "learning_rate": 4.7409641370926264e-05, "loss": 0.049, "step": 5540 }, { "epoch": 0.15570206200028056, "grad_norm": 0.8603963255882263, "learning_rate": 4.740496563332866e-05, "loss": 0.0671, "step": 5550 }, { "epoch": 0.1559826062561369, "grad_norm": 0.4133463501930237, "learning_rate": 4.740028989573105e-05, "loss": 0.0468, "step": 5560 }, { "epoch": 0.15626315051199327, "grad_norm": 0.5930065512657166, "learning_rate": 4.739561415813345e-05, "loss": 0.0479, "step": 5570 }, { "epoch": 0.15654369476784963, "grad_norm": 0.43751010298728943, "learning_rate": 4.7390938420535844e-05, "loss": 0.0561, "step": 5580 }, { "epoch": 0.15682423902370599, "grad_norm": 0.059232447296381, "learning_rate": 4.738626268293824e-05, "loss": 0.0331, "step": 5590 }, { "epoch": 0.15710478327956234, "grad_norm": 0.05005291849374771, "learning_rate": 4.738158694534063e-05, "loss": 0.0367, "step": 5600 }, { "epoch": 0.1573853275354187, "grad_norm": 4.135753631591797, "learning_rate": 4.737691120774302e-05, "loss": 0.0365, "step": 5610 }, { "epoch": 0.1576658717912751, "grad_norm": 0.07948168367147446, "learning_rate": 4.7372235470145416e-05, "loss": 0.0337, "step": 5620 }, { "epoch": 0.15794641604713144, "grad_norm": 0.6507600545883179, "learning_rate": 4.736755973254781e-05, "loss": 0.0483, "step": 5630 }, { "epoch": 0.1582269603029878, "grad_norm": 0.8163682222366333, "learning_rate": 4.736288399495021e-05, "loss": 0.065, "step": 5640 }, { "epoch": 0.15850750455884416, "grad_norm": 1.0077649354934692, "learning_rate": 4.7358208257352596e-05, "loss": 0.0401, "step": 5650 }, { "epoch": 0.15878804881470052, "grad_norm": 5.004073619842529, "learning_rate": 4.7353532519754996e-05, "loss": 0.0322, "step": 5660 }, { "epoch": 0.15906859307055687, "grad_norm": 0.43733957409858704, "learning_rate": 4.734885678215739e-05, "loss": 0.0562, "step": 5670 }, { "epoch": 0.15934913732641323, "grad_norm": 0.14018604159355164, "learning_rate": 4.734418104455978e-05, "loss": 0.0225, "step": 5680 }, { "epoch": 0.15962968158226962, "grad_norm": 0.14298337697982788, "learning_rate": 4.7339505306962175e-05, "loss": 0.0455, "step": 5690 }, { "epoch": 0.15991022583812597, "grad_norm": 0.08587956428527832, "learning_rate": 4.733482956936457e-05, "loss": 0.0249, "step": 5700 }, { "epoch": 0.16019077009398233, "grad_norm": 0.10780750960111618, "learning_rate": 4.733015383176697e-05, "loss": 0.0471, "step": 5710 }, { "epoch": 0.1604713143498387, "grad_norm": 7.7513837814331055, "learning_rate": 4.7325478094169355e-05, "loss": 0.0398, "step": 5720 }, { "epoch": 0.16075185860569505, "grad_norm": 0.08360306918621063, "learning_rate": 4.7320802356571755e-05, "loss": 0.0279, "step": 5730 }, { "epoch": 0.1610324028615514, "grad_norm": 0.11424875259399414, "learning_rate": 4.731612661897414e-05, "loss": 0.0273, "step": 5740 }, { "epoch": 0.16131294711740776, "grad_norm": 8.575593948364258, "learning_rate": 4.731145088137654e-05, "loss": 0.037, "step": 5750 }, { "epoch": 0.16159349137326412, "grad_norm": 0.27479231357574463, "learning_rate": 4.7306775143778934e-05, "loss": 0.0177, "step": 5760 }, { "epoch": 0.1618740356291205, "grad_norm": 0.5573285222053528, "learning_rate": 4.730209940618133e-05, "loss": 0.0298, "step": 5770 }, { "epoch": 0.16215457988497686, "grad_norm": 1.7300083637237549, "learning_rate": 4.729742366858372e-05, "loss": 0.0386, "step": 5780 }, { "epoch": 0.16243512414083322, "grad_norm": 1.221087098121643, "learning_rate": 4.7292747930986114e-05, "loss": 0.0334, "step": 5790 }, { "epoch": 0.16271566839668958, "grad_norm": 0.6648479700088501, "learning_rate": 4.7288072193388514e-05, "loss": 0.0534, "step": 5800 }, { "epoch": 0.16299621265254594, "grad_norm": 0.6726685762405396, "learning_rate": 4.72833964557909e-05, "loss": 0.074, "step": 5810 }, { "epoch": 0.1632767569084023, "grad_norm": 1.1949827671051025, "learning_rate": 4.72787207181933e-05, "loss": 0.0443, "step": 5820 }, { "epoch": 0.16355730116425865, "grad_norm": 0.4179614186286926, "learning_rate": 4.7274044980595686e-05, "loss": 0.0353, "step": 5830 }, { "epoch": 0.16383784542011504, "grad_norm": 2.410166025161743, "learning_rate": 4.7269369242998086e-05, "loss": 0.057, "step": 5840 }, { "epoch": 0.1641183896759714, "grad_norm": 0.16641810536384583, "learning_rate": 4.726469350540048e-05, "loss": 0.0298, "step": 5850 }, { "epoch": 0.16439893393182775, "grad_norm": 0.27338260412216187, "learning_rate": 4.726001776780287e-05, "loss": 0.0311, "step": 5860 }, { "epoch": 0.1646794781876841, "grad_norm": 0.356157124042511, "learning_rate": 4.7255342030205266e-05, "loss": 0.0304, "step": 5870 }, { "epoch": 0.16496002244354047, "grad_norm": 0.06061682105064392, "learning_rate": 4.725066629260766e-05, "loss": 0.0447, "step": 5880 }, { "epoch": 0.16524056669939682, "grad_norm": 0.07717595249414444, "learning_rate": 4.724599055501006e-05, "loss": 0.0434, "step": 5890 }, { "epoch": 0.16552111095525318, "grad_norm": 0.23718024790287018, "learning_rate": 4.7241314817412445e-05, "loss": 0.0212, "step": 5900 }, { "epoch": 0.16580165521110954, "grad_norm": 0.22147485613822937, "learning_rate": 4.7236639079814845e-05, "loss": 0.0089, "step": 5910 }, { "epoch": 0.16608219946696592, "grad_norm": 0.0643780305981636, "learning_rate": 4.723196334221724e-05, "loss": 0.0507, "step": 5920 }, { "epoch": 0.16636274372282228, "grad_norm": 0.20181897282600403, "learning_rate": 4.722728760461963e-05, "loss": 0.0476, "step": 5930 }, { "epoch": 0.16664328797867864, "grad_norm": 0.24966135621070862, "learning_rate": 4.7222611867022024e-05, "loss": 0.0539, "step": 5940 }, { "epoch": 0.166923832234535, "grad_norm": 0.2861177921295166, "learning_rate": 4.721793612942442e-05, "loss": 0.0364, "step": 5950 }, { "epoch": 0.16720437649039135, "grad_norm": 0.6841849088668823, "learning_rate": 4.721326039182681e-05, "loss": 0.0521, "step": 5960 }, { "epoch": 0.1674849207462477, "grad_norm": 0.12972019612789154, "learning_rate": 4.7208584654229204e-05, "loss": 0.0102, "step": 5970 }, { "epoch": 0.16776546500210407, "grad_norm": 7.357675075531006, "learning_rate": 4.7203908916631604e-05, "loss": 0.0501, "step": 5980 }, { "epoch": 0.16804600925796045, "grad_norm": 1.9694901704788208, "learning_rate": 4.7199233179034e-05, "loss": 0.0294, "step": 5990 }, { "epoch": 0.1683265535138168, "grad_norm": 0.8122125864028931, "learning_rate": 4.719455744143639e-05, "loss": 0.0379, "step": 6000 }, { "epoch": 0.16860709776967317, "grad_norm": 0.8295605182647705, "learning_rate": 4.718988170383878e-05, "loss": 0.0241, "step": 6010 }, { "epoch": 0.16888764202552953, "grad_norm": 0.4163786470890045, "learning_rate": 4.7185205966241177e-05, "loss": 0.0327, "step": 6020 }, { "epoch": 0.16916818628138588, "grad_norm": 0.17876361310482025, "learning_rate": 4.718053022864357e-05, "loss": 0.0334, "step": 6030 }, { "epoch": 0.16944873053724224, "grad_norm": 0.21461613476276398, "learning_rate": 4.717585449104596e-05, "loss": 0.0306, "step": 6040 }, { "epoch": 0.1697292747930986, "grad_norm": 0.20345203578472137, "learning_rate": 4.7171178753448356e-05, "loss": 0.0474, "step": 6050 }, { "epoch": 0.17000981904895499, "grad_norm": 4.265496730804443, "learning_rate": 4.7166503015850756e-05, "loss": 0.0532, "step": 6060 }, { "epoch": 0.17029036330481134, "grad_norm": 0.18898801505565643, "learning_rate": 4.716182727825314e-05, "loss": 0.0171, "step": 6070 }, { "epoch": 0.1705709075606677, "grad_norm": 0.9228972792625427, "learning_rate": 4.715715154065554e-05, "loss": 0.0421, "step": 6080 }, { "epoch": 0.17085145181652406, "grad_norm": 0.0808010846376419, "learning_rate": 4.7152475803057935e-05, "loss": 0.0505, "step": 6090 }, { "epoch": 0.17113199607238042, "grad_norm": 1.5425392389297485, "learning_rate": 4.714780006546033e-05, "loss": 0.0325, "step": 6100 }, { "epoch": 0.17141254032823677, "grad_norm": 1.9795186519622803, "learning_rate": 4.714312432786272e-05, "loss": 0.0543, "step": 6110 }, { "epoch": 0.17169308458409313, "grad_norm": 0.972235381603241, "learning_rate": 4.7138448590265115e-05, "loss": 0.0359, "step": 6120 }, { "epoch": 0.1719736288399495, "grad_norm": 0.14701052010059357, "learning_rate": 4.7133772852667515e-05, "loss": 0.0237, "step": 6130 }, { "epoch": 0.17225417309580587, "grad_norm": 0.23374591767787933, "learning_rate": 4.71290971150699e-05, "loss": 0.0351, "step": 6140 }, { "epoch": 0.17253471735166223, "grad_norm": 3.9817981719970703, "learning_rate": 4.71244213774723e-05, "loss": 0.054, "step": 6150 }, { "epoch": 0.1728152616075186, "grad_norm": 0.4547775387763977, "learning_rate": 4.711974563987469e-05, "loss": 0.0244, "step": 6160 }, { "epoch": 0.17309580586337495, "grad_norm": 1.8608020544052124, "learning_rate": 4.711506990227709e-05, "loss": 0.0275, "step": 6170 }, { "epoch": 0.1733763501192313, "grad_norm": 0.4900125563144684, "learning_rate": 4.711039416467948e-05, "loss": 0.0379, "step": 6180 }, { "epoch": 0.17365689437508766, "grad_norm": 0.939264714717865, "learning_rate": 4.7105718427081874e-05, "loss": 0.0254, "step": 6190 }, { "epoch": 0.17393743863094402, "grad_norm": 0.03782167658209801, "learning_rate": 4.7101042689484274e-05, "loss": 0.0709, "step": 6200 }, { "epoch": 0.1742179828868004, "grad_norm": 1.7012486457824707, "learning_rate": 4.709636695188666e-05, "loss": 0.0871, "step": 6210 }, { "epoch": 0.17449852714265676, "grad_norm": 1.8325848579406738, "learning_rate": 4.709169121428906e-05, "loss": 0.0422, "step": 6220 }, { "epoch": 0.17477907139851312, "grad_norm": 0.539165735244751, "learning_rate": 4.7087015476691446e-05, "loss": 0.028, "step": 6230 }, { "epoch": 0.17505961565436948, "grad_norm": 0.06257840245962143, "learning_rate": 4.7082339739093846e-05, "loss": 0.0394, "step": 6240 }, { "epoch": 0.17534015991022583, "grad_norm": 0.06025228649377823, "learning_rate": 4.707766400149623e-05, "loss": 0.0144, "step": 6250 }, { "epoch": 0.1756207041660822, "grad_norm": 4.634117603302002, "learning_rate": 4.707298826389863e-05, "loss": 0.047, "step": 6260 }, { "epoch": 0.17590124842193855, "grad_norm": 0.6232702136039734, "learning_rate": 4.7068312526301026e-05, "loss": 0.0567, "step": 6270 }, { "epoch": 0.17618179267779494, "grad_norm": 0.22408729791641235, "learning_rate": 4.706363678870342e-05, "loss": 0.0431, "step": 6280 }, { "epoch": 0.1764623369336513, "grad_norm": 0.497755765914917, "learning_rate": 4.705896105110582e-05, "loss": 0.0438, "step": 6290 }, { "epoch": 0.17674288118950765, "grad_norm": 0.2526243329048157, "learning_rate": 4.7054285313508205e-05, "loss": 0.0459, "step": 6300 }, { "epoch": 0.177023425445364, "grad_norm": 0.6737151741981506, "learning_rate": 4.7049609575910605e-05, "loss": 0.0286, "step": 6310 }, { "epoch": 0.17730396970122037, "grad_norm": 0.9993359446525574, "learning_rate": 4.704493383831299e-05, "loss": 0.0546, "step": 6320 }, { "epoch": 0.17758451395707672, "grad_norm": 1.2269740104675293, "learning_rate": 4.704025810071539e-05, "loss": 0.0378, "step": 6330 }, { "epoch": 0.17786505821293308, "grad_norm": 0.9339731931686401, "learning_rate": 4.7035582363117785e-05, "loss": 0.0294, "step": 6340 }, { "epoch": 0.17814560246878944, "grad_norm": 0.5820019841194153, "learning_rate": 4.703090662552018e-05, "loss": 0.0503, "step": 6350 }, { "epoch": 0.17842614672464582, "grad_norm": 3.4704904556274414, "learning_rate": 4.702623088792257e-05, "loss": 0.0274, "step": 6360 }, { "epoch": 0.17870669098050218, "grad_norm": 0.25787457823753357, "learning_rate": 4.7021555150324964e-05, "loss": 0.0376, "step": 6370 }, { "epoch": 0.17898723523635854, "grad_norm": 0.08602514117956161, "learning_rate": 4.701687941272736e-05, "loss": 0.0485, "step": 6380 }, { "epoch": 0.1792677794922149, "grad_norm": 0.041594602167606354, "learning_rate": 4.701220367512975e-05, "loss": 0.04, "step": 6390 }, { "epoch": 0.17954832374807125, "grad_norm": 0.21694275736808777, "learning_rate": 4.700752793753215e-05, "loss": 0.0228, "step": 6400 }, { "epoch": 0.1798288680039276, "grad_norm": 0.06653613597154617, "learning_rate": 4.7002852199934543e-05, "loss": 0.0401, "step": 6410 }, { "epoch": 0.18010941225978397, "grad_norm": 12.887453079223633, "learning_rate": 4.6998176462336937e-05, "loss": 0.0296, "step": 6420 }, { "epoch": 0.18038995651564035, "grad_norm": 0.17218825221061707, "learning_rate": 4.699350072473933e-05, "loss": 0.059, "step": 6430 }, { "epoch": 0.1806705007714967, "grad_norm": 0.23824959993362427, "learning_rate": 4.698882498714172e-05, "loss": 0.0363, "step": 6440 }, { "epoch": 0.18095104502735307, "grad_norm": 0.7475423812866211, "learning_rate": 4.6984149249544116e-05, "loss": 0.0511, "step": 6450 }, { "epoch": 0.18123158928320943, "grad_norm": 5.46797513961792, "learning_rate": 4.6979473511946516e-05, "loss": 0.0648, "step": 6460 }, { "epoch": 0.18151213353906578, "grad_norm": 0.07545479387044907, "learning_rate": 4.69747977743489e-05, "loss": 0.0184, "step": 6470 }, { "epoch": 0.18179267779492214, "grad_norm": 0.18671615421772003, "learning_rate": 4.69701220367513e-05, "loss": 0.0446, "step": 6480 }, { "epoch": 0.1820732220507785, "grad_norm": 0.44119009375572205, "learning_rate": 4.6965446299153695e-05, "loss": 0.0621, "step": 6490 }, { "epoch": 0.18235376630663488, "grad_norm": 0.29963305592536926, "learning_rate": 4.696077056155609e-05, "loss": 0.016, "step": 6500 }, { "epoch": 0.18263431056249124, "grad_norm": 0.03164295479655266, "learning_rate": 4.695609482395848e-05, "loss": 0.028, "step": 6510 }, { "epoch": 0.1829148548183476, "grad_norm": 0.8632977604866028, "learning_rate": 4.6951419086360875e-05, "loss": 0.0376, "step": 6520 }, { "epoch": 0.18319539907420396, "grad_norm": 0.12782613933086395, "learning_rate": 4.6946743348763275e-05, "loss": 0.015, "step": 6530 }, { "epoch": 0.18347594333006031, "grad_norm": 2.0353479385375977, "learning_rate": 4.694206761116566e-05, "loss": 0.0473, "step": 6540 }, { "epoch": 0.18375648758591667, "grad_norm": 0.7008491158485413, "learning_rate": 4.693739187356806e-05, "loss": 0.0393, "step": 6550 }, { "epoch": 0.18403703184177303, "grad_norm": 4.119721412658691, "learning_rate": 4.693271613597045e-05, "loss": 0.0272, "step": 6560 }, { "epoch": 0.1843175760976294, "grad_norm": 0.6680475473403931, "learning_rate": 4.692804039837285e-05, "loss": 0.0157, "step": 6570 }, { "epoch": 0.18459812035348577, "grad_norm": 4.406013011932373, "learning_rate": 4.692336466077524e-05, "loss": 0.0545, "step": 6580 }, { "epoch": 0.18487866460934213, "grad_norm": 0.2665198743343353, "learning_rate": 4.6918688923177634e-05, "loss": 0.0409, "step": 6590 }, { "epoch": 0.1851592088651985, "grad_norm": 0.39308643341064453, "learning_rate": 4.691401318558003e-05, "loss": 0.0314, "step": 6600 }, { "epoch": 0.18543975312105485, "grad_norm": 0.051599904894828796, "learning_rate": 4.690933744798242e-05, "loss": 0.0375, "step": 6610 }, { "epoch": 0.1857202973769112, "grad_norm": 0.034732963889837265, "learning_rate": 4.690466171038482e-05, "loss": 0.024, "step": 6620 }, { "epoch": 0.18600084163276756, "grad_norm": 1.0053609609603882, "learning_rate": 4.6899985972787206e-05, "loss": 0.0342, "step": 6630 }, { "epoch": 0.18628138588862392, "grad_norm": 0.025310009717941284, "learning_rate": 4.6895310235189606e-05, "loss": 0.0152, "step": 6640 }, { "epoch": 0.1865619301444803, "grad_norm": 0.18068285286426544, "learning_rate": 4.689063449759199e-05, "loss": 0.0578, "step": 6650 }, { "epoch": 0.18684247440033666, "grad_norm": 0.6618809103965759, "learning_rate": 4.688595875999439e-05, "loss": 0.0348, "step": 6660 }, { "epoch": 0.18712301865619302, "grad_norm": 2.9176204204559326, "learning_rate": 4.6881283022396786e-05, "loss": 0.0376, "step": 6670 }, { "epoch": 0.18740356291204938, "grad_norm": 0.7167972326278687, "learning_rate": 4.687660728479918e-05, "loss": 0.0638, "step": 6680 }, { "epoch": 0.18768410716790573, "grad_norm": 1.9073143005371094, "learning_rate": 4.687193154720157e-05, "loss": 0.0463, "step": 6690 }, { "epoch": 0.1879646514237621, "grad_norm": 0.11878165602684021, "learning_rate": 4.6867255809603965e-05, "loss": 0.0296, "step": 6700 }, { "epoch": 0.18824519567961845, "grad_norm": 0.09344609081745148, "learning_rate": 4.6862580072006365e-05, "loss": 0.0209, "step": 6710 }, { "epoch": 0.18852573993547483, "grad_norm": 0.6669847965240479, "learning_rate": 4.685790433440875e-05, "loss": 0.0411, "step": 6720 }, { "epoch": 0.1888062841913312, "grad_norm": 0.12092327326536179, "learning_rate": 4.685322859681115e-05, "loss": 0.0272, "step": 6730 }, { "epoch": 0.18908682844718755, "grad_norm": 0.4136148691177368, "learning_rate": 4.6848552859213545e-05, "loss": 0.0303, "step": 6740 }, { "epoch": 0.1893673727030439, "grad_norm": 2.940784454345703, "learning_rate": 4.684387712161594e-05, "loss": 0.0327, "step": 6750 }, { "epoch": 0.18964791695890026, "grad_norm": 0.13432542979717255, "learning_rate": 4.683920138401833e-05, "loss": 0.0221, "step": 6760 }, { "epoch": 0.18992846121475662, "grad_norm": 0.04497196152806282, "learning_rate": 4.6834525646420724e-05, "loss": 0.0299, "step": 6770 }, { "epoch": 0.19020900547061298, "grad_norm": 0.07155454903841019, "learning_rate": 4.682984990882312e-05, "loss": 0.0462, "step": 6780 }, { "epoch": 0.19048954972646934, "grad_norm": 0.23584584891796112, "learning_rate": 4.682517417122551e-05, "loss": 0.0634, "step": 6790 }, { "epoch": 0.19077009398232572, "grad_norm": 0.11481056362390518, "learning_rate": 4.682049843362791e-05, "loss": 0.0387, "step": 6800 }, { "epoch": 0.19105063823818208, "grad_norm": 1.1723856925964355, "learning_rate": 4.6815822696030304e-05, "loss": 0.0462, "step": 6810 }, { "epoch": 0.19133118249403844, "grad_norm": 9.590450286865234, "learning_rate": 4.68111469584327e-05, "loss": 0.0264, "step": 6820 }, { "epoch": 0.1916117267498948, "grad_norm": 0.3102656304836273, "learning_rate": 4.680647122083509e-05, "loss": 0.0181, "step": 6830 }, { "epoch": 0.19189227100575115, "grad_norm": 0.037198539823293686, "learning_rate": 4.680179548323748e-05, "loss": 0.0267, "step": 6840 }, { "epoch": 0.1921728152616075, "grad_norm": 0.2549060881137848, "learning_rate": 4.6797119745639876e-05, "loss": 0.0459, "step": 6850 }, { "epoch": 0.19245335951746387, "grad_norm": 0.11095716059207916, "learning_rate": 4.679244400804227e-05, "loss": 0.0453, "step": 6860 }, { "epoch": 0.19273390377332025, "grad_norm": 0.24384506046772003, "learning_rate": 4.678776827044466e-05, "loss": 0.0682, "step": 6870 }, { "epoch": 0.1930144480291766, "grad_norm": 2.1881957054138184, "learning_rate": 4.678309253284706e-05, "loss": 0.0452, "step": 6880 }, { "epoch": 0.19329499228503297, "grad_norm": 1.3027997016906738, "learning_rate": 4.6778416795249456e-05, "loss": 0.0345, "step": 6890 }, { "epoch": 0.19357553654088933, "grad_norm": 0.6565314531326294, "learning_rate": 4.677374105765185e-05, "loss": 0.0468, "step": 6900 }, { "epoch": 0.19385608079674568, "grad_norm": 1.2551761865615845, "learning_rate": 4.676906532005424e-05, "loss": 0.0426, "step": 6910 }, { "epoch": 0.19413662505260204, "grad_norm": 0.16059139370918274, "learning_rate": 4.6764389582456635e-05, "loss": 0.0459, "step": 6920 }, { "epoch": 0.1944171693084584, "grad_norm": 0.10880900174379349, "learning_rate": 4.675971384485903e-05, "loss": 0.0202, "step": 6930 }, { "epoch": 0.19469771356431478, "grad_norm": 0.028811011463403702, "learning_rate": 4.675503810726142e-05, "loss": 0.0163, "step": 6940 }, { "epoch": 0.19497825782017114, "grad_norm": 0.12592792510986328, "learning_rate": 4.675036236966382e-05, "loss": 0.0249, "step": 6950 }, { "epoch": 0.1952588020760275, "grad_norm": 0.023805342614650726, "learning_rate": 4.674568663206621e-05, "loss": 0.0202, "step": 6960 }, { "epoch": 0.19553934633188386, "grad_norm": 0.040050655603408813, "learning_rate": 4.674101089446861e-05, "loss": 0.0048, "step": 6970 }, { "epoch": 0.19581989058774021, "grad_norm": 0.15072916448116302, "learning_rate": 4.6736335156870994e-05, "loss": 0.0468, "step": 6980 }, { "epoch": 0.19610043484359657, "grad_norm": 26.211589813232422, "learning_rate": 4.6731659419273394e-05, "loss": 0.066, "step": 6990 }, { "epoch": 0.19638097909945293, "grad_norm": 0.2420680820941925, "learning_rate": 4.672698368167579e-05, "loss": 0.034, "step": 7000 }, { "epoch": 0.1966615233553093, "grad_norm": 0.09942755848169327, "learning_rate": 4.672230794407818e-05, "loss": 0.0463, "step": 7010 }, { "epoch": 0.19694206761116567, "grad_norm": 2.0846292972564697, "learning_rate": 4.671763220648058e-05, "loss": 0.062, "step": 7020 }, { "epoch": 0.19722261186702203, "grad_norm": 3.5598971843719482, "learning_rate": 4.6712956468882967e-05, "loss": 0.0458, "step": 7030 }, { "epoch": 0.1975031561228784, "grad_norm": 0.17946495115756989, "learning_rate": 4.6708280731285366e-05, "loss": 0.027, "step": 7040 }, { "epoch": 0.19778370037873474, "grad_norm": 0.3818371593952179, "learning_rate": 4.670360499368775e-05, "loss": 0.0519, "step": 7050 }, { "epoch": 0.1980642446345911, "grad_norm": 0.2225957065820694, "learning_rate": 4.669892925609015e-05, "loss": 0.05, "step": 7060 }, { "epoch": 0.19834478889044746, "grad_norm": 0.2855037748813629, "learning_rate": 4.669425351849254e-05, "loss": 0.0386, "step": 7070 }, { "epoch": 0.19862533314630382, "grad_norm": 0.31101202964782715, "learning_rate": 4.668957778089494e-05, "loss": 0.018, "step": 7080 }, { "epoch": 0.1989058774021602, "grad_norm": 0.27419865131378174, "learning_rate": 4.668490204329733e-05, "loss": 0.0194, "step": 7090 }, { "epoch": 0.19918642165801656, "grad_norm": 0.03294256702065468, "learning_rate": 4.6680226305699725e-05, "loss": 0.0279, "step": 7100 }, { "epoch": 0.19946696591387292, "grad_norm": 2.0270493030548096, "learning_rate": 4.6675550568102125e-05, "loss": 0.0359, "step": 7110 }, { "epoch": 0.19974751016972928, "grad_norm": 7.386188507080078, "learning_rate": 4.667087483050451e-05, "loss": 0.0344, "step": 7120 }, { "epoch": 0.20002805442558563, "grad_norm": 0.7101346850395203, "learning_rate": 4.666619909290691e-05, "loss": 0.0153, "step": 7130 }, { "epoch": 0.200308598681442, "grad_norm": 0.3108483850955963, "learning_rate": 4.66615233553093e-05, "loss": 0.0488, "step": 7140 }, { "epoch": 0.20058914293729835, "grad_norm": 0.09812460839748383, "learning_rate": 4.66568476177117e-05, "loss": 0.0432, "step": 7150 }, { "epoch": 0.20086968719315473, "grad_norm": 0.24985858798027039, "learning_rate": 4.665217188011409e-05, "loss": 0.0663, "step": 7160 }, { "epoch": 0.2011502314490111, "grad_norm": 0.23229022324085236, "learning_rate": 4.6647496142516484e-05, "loss": 0.05, "step": 7170 }, { "epoch": 0.20143077570486745, "grad_norm": 0.47655749320983887, "learning_rate": 4.664282040491888e-05, "loss": 0.0192, "step": 7180 }, { "epoch": 0.2017113199607238, "grad_norm": 2.3708152770996094, "learning_rate": 4.663814466732127e-05, "loss": 0.0467, "step": 7190 }, { "epoch": 0.20199186421658016, "grad_norm": 0.3358774185180664, "learning_rate": 4.663346892972367e-05, "loss": 0.0347, "step": 7200 }, { "epoch": 0.20227240847243652, "grad_norm": 0.49033355712890625, "learning_rate": 4.662879319212606e-05, "loss": 0.0518, "step": 7210 }, { "epoch": 0.20255295272829288, "grad_norm": 0.10854144394397736, "learning_rate": 4.662411745452846e-05, "loss": 0.0311, "step": 7220 }, { "epoch": 0.20283349698414924, "grad_norm": 2.51568603515625, "learning_rate": 4.661944171693085e-05, "loss": 0.0473, "step": 7230 }, { "epoch": 0.20311404124000562, "grad_norm": 0.6842989325523376, "learning_rate": 4.661476597933324e-05, "loss": 0.0149, "step": 7240 }, { "epoch": 0.20339458549586198, "grad_norm": 0.17539720237255096, "learning_rate": 4.6610090241735636e-05, "loss": 0.0346, "step": 7250 }, { "epoch": 0.20367512975171834, "grad_norm": 0.8725373148918152, "learning_rate": 4.660541450413803e-05, "loss": 0.0296, "step": 7260 }, { "epoch": 0.2039556740075747, "grad_norm": 0.8219775557518005, "learning_rate": 4.660073876654042e-05, "loss": 0.0294, "step": 7270 }, { "epoch": 0.20423621826343105, "grad_norm": 0.5718474984169006, "learning_rate": 4.6596063028942816e-05, "loss": 0.0373, "step": 7280 }, { "epoch": 0.2045167625192874, "grad_norm": 2.3700757026672363, "learning_rate": 4.659138729134521e-05, "loss": 0.0624, "step": 7290 }, { "epoch": 0.20479730677514377, "grad_norm": 5.431886196136475, "learning_rate": 4.658671155374761e-05, "loss": 0.0217, "step": 7300 }, { "epoch": 0.20507785103100015, "grad_norm": 0.23218287527561188, "learning_rate": 4.658203581615e-05, "loss": 0.0364, "step": 7310 }, { "epoch": 0.2053583952868565, "grad_norm": 0.6690369248390198, "learning_rate": 4.6577360078552395e-05, "loss": 0.0303, "step": 7320 }, { "epoch": 0.20563893954271287, "grad_norm": 8.929156303405762, "learning_rate": 4.657268434095479e-05, "loss": 0.0281, "step": 7330 }, { "epoch": 0.20591948379856923, "grad_norm": 1.0318032503128052, "learning_rate": 4.656800860335718e-05, "loss": 0.0664, "step": 7340 }, { "epoch": 0.20620002805442558, "grad_norm": 0.46145039796829224, "learning_rate": 4.6563332865759575e-05, "loss": 0.0265, "step": 7350 }, { "epoch": 0.20648057231028194, "grad_norm": 0.06451888382434845, "learning_rate": 4.655865712816197e-05, "loss": 0.0278, "step": 7360 }, { "epoch": 0.2067611165661383, "grad_norm": 1.3881934881210327, "learning_rate": 4.655398139056437e-05, "loss": 0.0696, "step": 7370 }, { "epoch": 0.20704166082199468, "grad_norm": 0.25591862201690674, "learning_rate": 4.6549305652966754e-05, "loss": 0.03, "step": 7380 }, { "epoch": 0.20732220507785104, "grad_norm": 7.247988700866699, "learning_rate": 4.6544629915369154e-05, "loss": 0.0151, "step": 7390 }, { "epoch": 0.2076027493337074, "grad_norm": 0.034858588129282, "learning_rate": 4.653995417777155e-05, "loss": 0.0076, "step": 7400 }, { "epoch": 0.20788329358956376, "grad_norm": 4.018956661224365, "learning_rate": 4.653527844017394e-05, "loss": 0.0615, "step": 7410 }, { "epoch": 0.2081638378454201, "grad_norm": 3.2342798709869385, "learning_rate": 4.6530602702576333e-05, "loss": 0.0323, "step": 7420 }, { "epoch": 0.20844438210127647, "grad_norm": 0.06200911104679108, "learning_rate": 4.6525926964978727e-05, "loss": 0.0254, "step": 7430 }, { "epoch": 0.20872492635713283, "grad_norm": 0.990286648273468, "learning_rate": 4.6521251227381127e-05, "loss": 0.033, "step": 7440 }, { "epoch": 0.20900547061298919, "grad_norm": 2.0407838821411133, "learning_rate": 4.651657548978351e-05, "loss": 0.0645, "step": 7450 }, { "epoch": 0.20928601486884557, "grad_norm": 0.14380258321762085, "learning_rate": 4.651189975218591e-05, "loss": 0.016, "step": 7460 }, { "epoch": 0.20956655912470193, "grad_norm": 2.5360734462738037, "learning_rate": 4.65072240145883e-05, "loss": 0.0405, "step": 7470 }, { "epoch": 0.2098471033805583, "grad_norm": 0.5487495064735413, "learning_rate": 4.65025482769907e-05, "loss": 0.025, "step": 7480 }, { "epoch": 0.21012764763641464, "grad_norm": 0.017219865694642067, "learning_rate": 4.649787253939309e-05, "loss": 0.0276, "step": 7490 }, { "epoch": 0.210408191892271, "grad_norm": 0.08425881713628769, "learning_rate": 4.6493196801795485e-05, "loss": 0.0249, "step": 7500 }, { "epoch": 0.21068873614812736, "grad_norm": 0.031156549230217934, "learning_rate": 4.648852106419788e-05, "loss": 0.0226, "step": 7510 }, { "epoch": 0.21096928040398372, "grad_norm": 0.04901917651295662, "learning_rate": 4.648384532660027e-05, "loss": 0.0248, "step": 7520 }, { "epoch": 0.2112498246598401, "grad_norm": 0.20313845574855804, "learning_rate": 4.647916958900267e-05, "loss": 0.034, "step": 7530 }, { "epoch": 0.21153036891569646, "grad_norm": 0.17315497994422913, "learning_rate": 4.647449385140506e-05, "loss": 0.0165, "step": 7540 }, { "epoch": 0.21181091317155282, "grad_norm": 0.16081887483596802, "learning_rate": 4.646981811380746e-05, "loss": 0.0393, "step": 7550 }, { "epoch": 0.21209145742740917, "grad_norm": 0.17392893135547638, "learning_rate": 4.6465142376209844e-05, "loss": 0.0613, "step": 7560 }, { "epoch": 0.21237200168326553, "grad_norm": 4.782777786254883, "learning_rate": 4.6460466638612244e-05, "loss": 0.0276, "step": 7570 }, { "epoch": 0.2126525459391219, "grad_norm": 0.02271469309926033, "learning_rate": 4.645579090101464e-05, "loss": 0.0117, "step": 7580 }, { "epoch": 0.21293309019497825, "grad_norm": 0.5204528570175171, "learning_rate": 4.645111516341703e-05, "loss": 0.0461, "step": 7590 }, { "epoch": 0.21321363445083463, "grad_norm": 0.2473534494638443, "learning_rate": 4.6446439425819424e-05, "loss": 0.0403, "step": 7600 }, { "epoch": 0.213494178706691, "grad_norm": 0.7283467054367065, "learning_rate": 4.644176368822182e-05, "loss": 0.0469, "step": 7610 }, { "epoch": 0.21377472296254735, "grad_norm": 0.35764235258102417, "learning_rate": 4.643708795062422e-05, "loss": 0.0502, "step": 7620 }, { "epoch": 0.2140552672184037, "grad_norm": 1.7401725053787231, "learning_rate": 4.64324122130266e-05, "loss": 0.0601, "step": 7630 }, { "epoch": 0.21433581147426006, "grad_norm": 0.08021201938390732, "learning_rate": 4.6427736475429e-05, "loss": 0.0189, "step": 7640 }, { "epoch": 0.21461635573011642, "grad_norm": 0.0594768226146698, "learning_rate": 4.6423060737831396e-05, "loss": 0.0414, "step": 7650 }, { "epoch": 0.21489689998597278, "grad_norm": 0.2967206835746765, "learning_rate": 4.641838500023379e-05, "loss": 0.0427, "step": 7660 }, { "epoch": 0.21517744424182914, "grad_norm": 0.6998261213302612, "learning_rate": 4.641370926263618e-05, "loss": 0.0313, "step": 7670 }, { "epoch": 0.21545798849768552, "grad_norm": 1.6872658729553223, "learning_rate": 4.6409033525038576e-05, "loss": 0.0769, "step": 7680 }, { "epoch": 0.21573853275354188, "grad_norm": 0.6886950135231018, "learning_rate": 4.640435778744097e-05, "loss": 0.0513, "step": 7690 }, { "epoch": 0.21601907700939824, "grad_norm": 0.14627604186534882, "learning_rate": 4.639968204984336e-05, "loss": 0.0066, "step": 7700 }, { "epoch": 0.2162996212652546, "grad_norm": 1.561976671218872, "learning_rate": 4.639500631224576e-05, "loss": 0.0465, "step": 7710 }, { "epoch": 0.21658016552111095, "grad_norm": 0.18836070597171783, "learning_rate": 4.6390330574648155e-05, "loss": 0.0535, "step": 7720 }, { "epoch": 0.2168607097769673, "grad_norm": 0.32127833366394043, "learning_rate": 4.638565483705055e-05, "loss": 0.0454, "step": 7730 }, { "epoch": 0.21714125403282367, "grad_norm": 0.8426222205162048, "learning_rate": 4.638097909945294e-05, "loss": 0.022, "step": 7740 }, { "epoch": 0.21742179828868005, "grad_norm": 0.2778621315956116, "learning_rate": 4.6376303361855335e-05, "loss": 0.0306, "step": 7750 }, { "epoch": 0.2177023425445364, "grad_norm": 0.10138051211833954, "learning_rate": 4.637162762425773e-05, "loss": 0.0449, "step": 7760 }, { "epoch": 0.21798288680039277, "grad_norm": 0.45279747247695923, "learning_rate": 4.636695188666012e-05, "loss": 0.0308, "step": 7770 }, { "epoch": 0.21826343105624912, "grad_norm": 1.0773769617080688, "learning_rate": 4.6362276149062514e-05, "loss": 0.0187, "step": 7780 }, { "epoch": 0.21854397531210548, "grad_norm": 0.06355073302984238, "learning_rate": 4.6357600411464914e-05, "loss": 0.0221, "step": 7790 }, { "epoch": 0.21882451956796184, "grad_norm": 2.341646909713745, "learning_rate": 4.635292467386731e-05, "loss": 0.0261, "step": 7800 }, { "epoch": 0.2191050638238182, "grad_norm": 0.053316470235586166, "learning_rate": 4.63482489362697e-05, "loss": 0.0163, "step": 7810 }, { "epoch": 0.21938560807967458, "grad_norm": 1.62228524684906, "learning_rate": 4.6343573198672094e-05, "loss": 0.013, "step": 7820 }, { "epoch": 0.21966615233553094, "grad_norm": 0.03350365161895752, "learning_rate": 4.633889746107449e-05, "loss": 0.0055, "step": 7830 }, { "epoch": 0.2199466965913873, "grad_norm": 0.5745824575424194, "learning_rate": 4.633422172347688e-05, "loss": 0.0371, "step": 7840 }, { "epoch": 0.22022724084724365, "grad_norm": 0.5403650999069214, "learning_rate": 4.632954598587927e-05, "loss": 0.0339, "step": 7850 }, { "epoch": 0.2205077851031, "grad_norm": 0.4439322352409363, "learning_rate": 4.632487024828167e-05, "loss": 0.0228, "step": 7860 }, { "epoch": 0.22078832935895637, "grad_norm": 0.36792007088661194, "learning_rate": 4.632019451068406e-05, "loss": 0.0645, "step": 7870 }, { "epoch": 0.22106887361481273, "grad_norm": 1.0735416412353516, "learning_rate": 4.631551877308646e-05, "loss": 0.0298, "step": 7880 }, { "epoch": 0.22134941787066909, "grad_norm": 2.548311948776245, "learning_rate": 4.6310843035488846e-05, "loss": 0.0587, "step": 7890 }, { "epoch": 0.22162996212652547, "grad_norm": 1.4769843816757202, "learning_rate": 4.6306167297891246e-05, "loss": 0.0108, "step": 7900 }, { "epoch": 0.22191050638238183, "grad_norm": 0.0343247652053833, "learning_rate": 4.630149156029364e-05, "loss": 0.0303, "step": 7910 }, { "epoch": 0.22219105063823819, "grad_norm": 0.030162867158651352, "learning_rate": 4.629681582269603e-05, "loss": 0.0207, "step": 7920 }, { "epoch": 0.22247159489409454, "grad_norm": 1.260990858078003, "learning_rate": 4.629214008509843e-05, "loss": 0.0573, "step": 7930 }, { "epoch": 0.2227521391499509, "grad_norm": 0.03902016952633858, "learning_rate": 4.628746434750082e-05, "loss": 0.0464, "step": 7940 }, { "epoch": 0.22303268340580726, "grad_norm": 0.4719078242778778, "learning_rate": 4.628278860990322e-05, "loss": 0.0329, "step": 7950 }, { "epoch": 0.22331322766166362, "grad_norm": 0.25839582085609436, "learning_rate": 4.6278112872305604e-05, "loss": 0.0683, "step": 7960 }, { "epoch": 0.22359377191752, "grad_norm": 0.41728881001472473, "learning_rate": 4.6273437134708004e-05, "loss": 0.0293, "step": 7970 }, { "epoch": 0.22387431617337636, "grad_norm": 0.03752991929650307, "learning_rate": 4.626876139711039e-05, "loss": 0.0186, "step": 7980 }, { "epoch": 0.22415486042923272, "grad_norm": 2.3540358543395996, "learning_rate": 4.626408565951279e-05, "loss": 0.017, "step": 7990 }, { "epoch": 0.22443540468508907, "grad_norm": 0.04047665745019913, "learning_rate": 4.6259409921915184e-05, "loss": 0.0221, "step": 8000 }, { "epoch": 0.22471594894094543, "grad_norm": 2.1056864261627197, "learning_rate": 4.625473418431758e-05, "loss": 0.0548, "step": 8010 }, { "epoch": 0.2249964931968018, "grad_norm": 5.965839385986328, "learning_rate": 4.625005844671998e-05, "loss": 0.0276, "step": 8020 }, { "epoch": 0.22527703745265815, "grad_norm": 3.0217533111572266, "learning_rate": 4.624538270912236e-05, "loss": 0.0142, "step": 8030 }, { "epoch": 0.22555758170851453, "grad_norm": 0.18710541725158691, "learning_rate": 4.624070697152476e-05, "loss": 0.0282, "step": 8040 }, { "epoch": 0.2258381259643709, "grad_norm": 0.830851137638092, "learning_rate": 4.623603123392715e-05, "loss": 0.0318, "step": 8050 }, { "epoch": 0.22611867022022725, "grad_norm": 0.6966878175735474, "learning_rate": 4.623135549632955e-05, "loss": 0.0449, "step": 8060 }, { "epoch": 0.2263992144760836, "grad_norm": 1.1275516748428345, "learning_rate": 4.622667975873194e-05, "loss": 0.0294, "step": 8070 }, { "epoch": 0.22667975873193996, "grad_norm": 0.32735446095466614, "learning_rate": 4.6222004021134336e-05, "loss": 0.029, "step": 8080 }, { "epoch": 0.22696030298779632, "grad_norm": 0.24957576394081116, "learning_rate": 4.621732828353673e-05, "loss": 0.0516, "step": 8090 }, { "epoch": 0.22724084724365268, "grad_norm": 0.10005732625722885, "learning_rate": 4.621265254593912e-05, "loss": 0.0256, "step": 8100 }, { "epoch": 0.22752139149950903, "grad_norm": 1.1909685134887695, "learning_rate": 4.620797680834152e-05, "loss": 0.0514, "step": 8110 }, { "epoch": 0.22780193575536542, "grad_norm": 0.931831955909729, "learning_rate": 4.620330107074391e-05, "loss": 0.0323, "step": 8120 }, { "epoch": 0.22808248001122178, "grad_norm": 0.8100572824478149, "learning_rate": 4.619862533314631e-05, "loss": 0.0461, "step": 8130 }, { "epoch": 0.22836302426707814, "grad_norm": 0.10492914170026779, "learning_rate": 4.61939495955487e-05, "loss": 0.09, "step": 8140 }, { "epoch": 0.2286435685229345, "grad_norm": 0.4920502007007599, "learning_rate": 4.6189273857951095e-05, "loss": 0.029, "step": 8150 }, { "epoch": 0.22892411277879085, "grad_norm": 0.8532870411872864, "learning_rate": 4.618459812035349e-05, "loss": 0.0371, "step": 8160 }, { "epoch": 0.2292046570346472, "grad_norm": 0.061888329684734344, "learning_rate": 4.617992238275588e-05, "loss": 0.0279, "step": 8170 }, { "epoch": 0.22948520129050357, "grad_norm": 2.0189361572265625, "learning_rate": 4.6175246645158274e-05, "loss": 0.0245, "step": 8180 }, { "epoch": 0.22976574554635995, "grad_norm": 1.2841471433639526, "learning_rate": 4.617057090756067e-05, "loss": 0.016, "step": 8190 }, { "epoch": 0.2300462898022163, "grad_norm": 0.13980869948863983, "learning_rate": 4.616589516996306e-05, "loss": 0.0296, "step": 8200 }, { "epoch": 0.23032683405807267, "grad_norm": 0.9885444045066833, "learning_rate": 4.616121943236546e-05, "loss": 0.0304, "step": 8210 }, { "epoch": 0.23060737831392902, "grad_norm": 0.2004157155752182, "learning_rate": 4.6156543694767854e-05, "loss": 0.0401, "step": 8220 }, { "epoch": 0.23088792256978538, "grad_norm": 0.018936652690172195, "learning_rate": 4.615186795717025e-05, "loss": 0.0199, "step": 8230 }, { "epoch": 0.23116846682564174, "grad_norm": 2.979088544845581, "learning_rate": 4.614719221957264e-05, "loss": 0.0535, "step": 8240 }, { "epoch": 0.2314490110814981, "grad_norm": 0.4403943419456482, "learning_rate": 4.614251648197503e-05, "loss": 0.0354, "step": 8250 }, { "epoch": 0.23172955533735448, "grad_norm": 0.06273607164621353, "learning_rate": 4.6137840744377426e-05, "loss": 0.0165, "step": 8260 }, { "epoch": 0.23201009959321084, "grad_norm": 0.25687527656555176, "learning_rate": 4.613316500677982e-05, "loss": 0.0208, "step": 8270 }, { "epoch": 0.2322906438490672, "grad_norm": 0.569094181060791, "learning_rate": 4.612848926918222e-05, "loss": 0.082, "step": 8280 }, { "epoch": 0.23257118810492355, "grad_norm": 0.09028616547584534, "learning_rate": 4.6123813531584606e-05, "loss": 0.0454, "step": 8290 }, { "epoch": 0.2328517323607799, "grad_norm": 0.605424702167511, "learning_rate": 4.6119137793987006e-05, "loss": 0.0297, "step": 8300 }, { "epoch": 0.23313227661663627, "grad_norm": 0.5404722094535828, "learning_rate": 4.61144620563894e-05, "loss": 0.0642, "step": 8310 }, { "epoch": 0.23341282087249263, "grad_norm": 0.87991863489151, "learning_rate": 4.610978631879179e-05, "loss": 0.0313, "step": 8320 }, { "epoch": 0.23369336512834898, "grad_norm": 3.323627233505249, "learning_rate": 4.6105110581194185e-05, "loss": 0.0294, "step": 8330 }, { "epoch": 0.23397390938420537, "grad_norm": 0.1311703473329544, "learning_rate": 4.610043484359658e-05, "loss": 0.0398, "step": 8340 }, { "epoch": 0.23425445364006173, "grad_norm": 0.3348381519317627, "learning_rate": 4.609575910599898e-05, "loss": 0.0348, "step": 8350 }, { "epoch": 0.23453499789591808, "grad_norm": 3.3416762351989746, "learning_rate": 4.6091083368401365e-05, "loss": 0.0349, "step": 8360 }, { "epoch": 0.23481554215177444, "grad_norm": 0.08779481053352356, "learning_rate": 4.6086407630803764e-05, "loss": 0.046, "step": 8370 }, { "epoch": 0.2350960864076308, "grad_norm": 0.043982066214084625, "learning_rate": 4.608173189320615e-05, "loss": 0.0261, "step": 8380 }, { "epoch": 0.23537663066348716, "grad_norm": 0.5943449139595032, "learning_rate": 4.607705615560855e-05, "loss": 0.0358, "step": 8390 }, { "epoch": 0.23565717491934352, "grad_norm": 0.12147404253482819, "learning_rate": 4.6072380418010944e-05, "loss": 0.0542, "step": 8400 }, { "epoch": 0.2359377191751999, "grad_norm": 0.1174352839589119, "learning_rate": 4.606770468041334e-05, "loss": 0.044, "step": 8410 }, { "epoch": 0.23621826343105626, "grad_norm": 0.10962548851966858, "learning_rate": 4.606302894281573e-05, "loss": 0.043, "step": 8420 }, { "epoch": 0.23649880768691262, "grad_norm": 1.219801902770996, "learning_rate": 4.6058353205218123e-05, "loss": 0.0199, "step": 8430 }, { "epoch": 0.23677935194276897, "grad_norm": 0.7079178690910339, "learning_rate": 4.605367746762052e-05, "loss": 0.056, "step": 8440 }, { "epoch": 0.23705989619862533, "grad_norm": 3.8122708797454834, "learning_rate": 4.604900173002291e-05, "loss": 0.0314, "step": 8450 }, { "epoch": 0.2373404404544817, "grad_norm": 0.3434726297855377, "learning_rate": 4.604432599242531e-05, "loss": 0.0282, "step": 8460 }, { "epoch": 0.23762098471033805, "grad_norm": 1.6829028129577637, "learning_rate": 4.6039650254827696e-05, "loss": 0.0246, "step": 8470 }, { "epoch": 0.23790152896619443, "grad_norm": 3.391624689102173, "learning_rate": 4.6034974517230096e-05, "loss": 0.0142, "step": 8480 }, { "epoch": 0.2381820732220508, "grad_norm": 0.028507916256785393, "learning_rate": 4.603029877963249e-05, "loss": 0.006, "step": 8490 }, { "epoch": 0.23846261747790715, "grad_norm": 0.06434010714292526, "learning_rate": 4.602562304203488e-05, "loss": 0.049, "step": 8500 }, { "epoch": 0.2387431617337635, "grad_norm": 0.6253107786178589, "learning_rate": 4.6020947304437275e-05, "loss": 0.0145, "step": 8510 }, { "epoch": 0.23902370598961986, "grad_norm": 0.32736337184906006, "learning_rate": 4.601627156683967e-05, "loss": 0.0289, "step": 8520 }, { "epoch": 0.23930425024547622, "grad_norm": 1.5797934532165527, "learning_rate": 4.601159582924207e-05, "loss": 0.0179, "step": 8530 }, { "epoch": 0.23958479450133258, "grad_norm": 2.985985040664673, "learning_rate": 4.6006920091644455e-05, "loss": 0.0942, "step": 8540 }, { "epoch": 0.23986533875718893, "grad_norm": 1.9111069440841675, "learning_rate": 4.6002244354046855e-05, "loss": 0.0253, "step": 8550 }, { "epoch": 0.24014588301304532, "grad_norm": 0.03631431981921196, "learning_rate": 4.599756861644925e-05, "loss": 0.0375, "step": 8560 }, { "epoch": 0.24042642726890168, "grad_norm": 4.774301052093506, "learning_rate": 4.599289287885164e-05, "loss": 0.0651, "step": 8570 }, { "epoch": 0.24070697152475803, "grad_norm": 0.1074846163392067, "learning_rate": 4.5988217141254034e-05, "loss": 0.0439, "step": 8580 }, { "epoch": 0.2409875157806144, "grad_norm": 0.06445175409317017, "learning_rate": 4.598354140365643e-05, "loss": 0.0202, "step": 8590 }, { "epoch": 0.24126806003647075, "grad_norm": 0.020588871091604233, "learning_rate": 4.597886566605882e-05, "loss": 0.0401, "step": 8600 }, { "epoch": 0.2415486042923271, "grad_norm": 2.0007176399230957, "learning_rate": 4.5974189928461214e-05, "loss": 0.0729, "step": 8610 }, { "epoch": 0.24182914854818346, "grad_norm": 0.26947805285453796, "learning_rate": 4.5969514190863614e-05, "loss": 0.0262, "step": 8620 }, { "epoch": 0.24210969280403985, "grad_norm": 2.924375295639038, "learning_rate": 4.596483845326601e-05, "loss": 0.0287, "step": 8630 }, { "epoch": 0.2423902370598962, "grad_norm": 0.3329281508922577, "learning_rate": 4.59601627156684e-05, "loss": 0.0351, "step": 8640 }, { "epoch": 0.24267078131575257, "grad_norm": 0.714296817779541, "learning_rate": 4.595548697807079e-05, "loss": 0.0249, "step": 8650 }, { "epoch": 0.24295132557160892, "grad_norm": 2.408857822418213, "learning_rate": 4.5950811240473186e-05, "loss": 0.0481, "step": 8660 }, { "epoch": 0.24323186982746528, "grad_norm": 0.8759765028953552, "learning_rate": 4.594613550287558e-05, "loss": 0.0313, "step": 8670 }, { "epoch": 0.24351241408332164, "grad_norm": 0.21158374845981598, "learning_rate": 4.594145976527797e-05, "loss": 0.0184, "step": 8680 }, { "epoch": 0.243792958339178, "grad_norm": 0.04322494938969612, "learning_rate": 4.5936784027680366e-05, "loss": 0.0191, "step": 8690 }, { "epoch": 0.24407350259503435, "grad_norm": 3.4602067470550537, "learning_rate": 4.5932108290082766e-05, "loss": 0.0507, "step": 8700 }, { "epoch": 0.24435404685089074, "grad_norm": 0.1784239411354065, "learning_rate": 4.592743255248516e-05, "loss": 0.0294, "step": 8710 }, { "epoch": 0.2446345911067471, "grad_norm": 4.166919708251953, "learning_rate": 4.592275681488755e-05, "loss": 0.0339, "step": 8720 }, { "epoch": 0.24491513536260345, "grad_norm": 1.07651686668396, "learning_rate": 4.5918081077289945e-05, "loss": 0.0248, "step": 8730 }, { "epoch": 0.2451956796184598, "grad_norm": 0.037858057767152786, "learning_rate": 4.591340533969234e-05, "loss": 0.0346, "step": 8740 }, { "epoch": 0.24547622387431617, "grad_norm": 1.8313876390457153, "learning_rate": 4.590872960209473e-05, "loss": 0.0389, "step": 8750 }, { "epoch": 0.24575676813017253, "grad_norm": 0.499666303396225, "learning_rate": 4.5904053864497125e-05, "loss": 0.0522, "step": 8760 }, { "epoch": 0.24603731238602888, "grad_norm": 1.7690165042877197, "learning_rate": 4.5899378126899525e-05, "loss": 0.0553, "step": 8770 }, { "epoch": 0.24631785664188527, "grad_norm": 0.03794454038143158, "learning_rate": 4.589470238930191e-05, "loss": 0.0166, "step": 8780 }, { "epoch": 0.24659840089774163, "grad_norm": 1.1681153774261475, "learning_rate": 4.589002665170431e-05, "loss": 0.0734, "step": 8790 }, { "epoch": 0.24687894515359798, "grad_norm": 0.13859564065933228, "learning_rate": 4.58853509141067e-05, "loss": 0.0182, "step": 8800 }, { "epoch": 0.24715948940945434, "grad_norm": 1.3705631494522095, "learning_rate": 4.58806751765091e-05, "loss": 0.0397, "step": 8810 }, { "epoch": 0.2474400336653107, "grad_norm": 0.0759497657418251, "learning_rate": 4.587599943891149e-05, "loss": 0.0146, "step": 8820 }, { "epoch": 0.24772057792116706, "grad_norm": 0.12469780445098877, "learning_rate": 4.5871323701313884e-05, "loss": 0.018, "step": 8830 }, { "epoch": 0.24800112217702341, "grad_norm": 0.7208095192909241, "learning_rate": 4.5866647963716283e-05, "loss": 0.0788, "step": 8840 }, { "epoch": 0.2482816664328798, "grad_norm": 0.592835009098053, "learning_rate": 4.586197222611867e-05, "loss": 0.0666, "step": 8850 }, { "epoch": 0.24856221068873616, "grad_norm": 0.831762969493866, "learning_rate": 4.585729648852107e-05, "loss": 0.0327, "step": 8860 }, { "epoch": 0.24884275494459251, "grad_norm": 0.08925333619117737, "learning_rate": 4.5852620750923456e-05, "loss": 0.0259, "step": 8870 }, { "epoch": 0.24912329920044887, "grad_norm": 0.0636264905333519, "learning_rate": 4.5847945013325856e-05, "loss": 0.0543, "step": 8880 }, { "epoch": 0.24940384345630523, "grad_norm": 0.24854597449302673, "learning_rate": 4.584326927572824e-05, "loss": 0.0659, "step": 8890 }, { "epoch": 0.2496843877121616, "grad_norm": 0.09957975149154663, "learning_rate": 4.583859353813064e-05, "loss": 0.0439, "step": 8900 }, { "epoch": 0.24996493196801794, "grad_norm": 0.29101935029029846, "learning_rate": 4.5833917800533036e-05, "loss": 0.0319, "step": 8910 }, { "epoch": 0.25024547622387433, "grad_norm": 0.14662200212478638, "learning_rate": 4.582924206293543e-05, "loss": 0.0277, "step": 8920 }, { "epoch": 0.25052602047973066, "grad_norm": 0.921671450138092, "learning_rate": 4.582456632533783e-05, "loss": 0.0365, "step": 8930 }, { "epoch": 0.25080656473558705, "grad_norm": 0.2547891139984131, "learning_rate": 4.5819890587740215e-05, "loss": 0.0302, "step": 8940 }, { "epoch": 0.2510871089914434, "grad_norm": 6.152005195617676, "learning_rate": 4.5815214850142615e-05, "loss": 0.0403, "step": 8950 }, { "epoch": 0.25136765324729976, "grad_norm": 1.3469940423965454, "learning_rate": 4.5810539112545e-05, "loss": 0.0585, "step": 8960 }, { "epoch": 0.25164819750315615, "grad_norm": 0.30170056223869324, "learning_rate": 4.58058633749474e-05, "loss": 0.0466, "step": 8970 }, { "epoch": 0.2519287417590125, "grad_norm": 0.32640329003334045, "learning_rate": 4.5801187637349794e-05, "loss": 0.0339, "step": 8980 }, { "epoch": 0.25220928601486886, "grad_norm": 0.6596046090126038, "learning_rate": 4.579651189975219e-05, "loss": 0.0859, "step": 8990 }, { "epoch": 0.2524898302707252, "grad_norm": 0.8016077280044556, "learning_rate": 4.579183616215458e-05, "loss": 0.0574, "step": 9000 }, { "epoch": 0.2527703745265816, "grad_norm": 0.05064346641302109, "learning_rate": 4.5787160424556974e-05, "loss": 0.048, "step": 9010 }, { "epoch": 0.2530509187824379, "grad_norm": 1.1869035959243774, "learning_rate": 4.5782484686959374e-05, "loss": 0.0265, "step": 9020 }, { "epoch": 0.2533314630382943, "grad_norm": 0.4089726507663727, "learning_rate": 4.577780894936177e-05, "loss": 0.0388, "step": 9030 }, { "epoch": 0.2536120072941507, "grad_norm": 1.0432153940200806, "learning_rate": 4.577313321176416e-05, "loss": 0.038, "step": 9040 }, { "epoch": 0.253892551550007, "grad_norm": 2.798095226287842, "learning_rate": 4.576845747416655e-05, "loss": 0.0517, "step": 9050 }, { "epoch": 0.2541730958058634, "grad_norm": 0.17752841114997864, "learning_rate": 4.5763781736568946e-05, "loss": 0.0448, "step": 9060 }, { "epoch": 0.2544536400617197, "grad_norm": 1.2113113403320312, "learning_rate": 4.575910599897134e-05, "loss": 0.0423, "step": 9070 }, { "epoch": 0.2547341843175761, "grad_norm": 1.3355779647827148, "learning_rate": 4.575443026137373e-05, "loss": 0.0555, "step": 9080 }, { "epoch": 0.25501472857343244, "grad_norm": 0.1510849893093109, "learning_rate": 4.5749754523776126e-05, "loss": 0.0104, "step": 9090 }, { "epoch": 0.2552952728292888, "grad_norm": 0.08142032474279404, "learning_rate": 4.5745078786178526e-05, "loss": 0.0677, "step": 9100 }, { "epoch": 0.2555758170851452, "grad_norm": 0.10033871978521347, "learning_rate": 4.574040304858091e-05, "loss": 0.0597, "step": 9110 }, { "epoch": 0.25585636134100154, "grad_norm": 0.1651158183813095, "learning_rate": 4.573572731098331e-05, "loss": 0.044, "step": 9120 }, { "epoch": 0.2561369055968579, "grad_norm": 0.09855394065380096, "learning_rate": 4.5731051573385705e-05, "loss": 0.0706, "step": 9130 }, { "epoch": 0.25641744985271425, "grad_norm": 0.10685568302869797, "learning_rate": 4.57263758357881e-05, "loss": 0.0155, "step": 9140 }, { "epoch": 0.25669799410857064, "grad_norm": 1.7888822555541992, "learning_rate": 4.572170009819049e-05, "loss": 0.0379, "step": 9150 }, { "epoch": 0.25697853836442697, "grad_norm": 3.7465949058532715, "learning_rate": 4.5717024360592885e-05, "loss": 0.0432, "step": 9160 }, { "epoch": 0.25725908262028335, "grad_norm": 0.5240666270256042, "learning_rate": 4.5712348622995285e-05, "loss": 0.0606, "step": 9170 }, { "epoch": 0.25753962687613974, "grad_norm": 2.227897882461548, "learning_rate": 4.570767288539767e-05, "loss": 0.0227, "step": 9180 }, { "epoch": 0.25782017113199607, "grad_norm": 0.036116112023591995, "learning_rate": 4.570299714780007e-05, "loss": 0.0502, "step": 9190 }, { "epoch": 0.25810071538785245, "grad_norm": 0.2771288752555847, "learning_rate": 4.569832141020246e-05, "loss": 0.0183, "step": 9200 }, { "epoch": 0.2583812596437088, "grad_norm": 0.16330064833164215, "learning_rate": 4.569364567260486e-05, "loss": 0.0297, "step": 9210 }, { "epoch": 0.25866180389956517, "grad_norm": 0.11793182045221329, "learning_rate": 4.568896993500725e-05, "loss": 0.0548, "step": 9220 }, { "epoch": 0.2589423481554215, "grad_norm": 0.09999874234199524, "learning_rate": 4.5684294197409644e-05, "loss": 0.0429, "step": 9230 }, { "epoch": 0.2592228924112779, "grad_norm": 0.31000038981437683, "learning_rate": 4.5679618459812044e-05, "loss": 0.038, "step": 9240 }, { "epoch": 0.2595034366671342, "grad_norm": 0.6516706943511963, "learning_rate": 4.567494272221443e-05, "loss": 0.0356, "step": 9250 }, { "epoch": 0.2597839809229906, "grad_norm": 0.05178140103816986, "learning_rate": 4.567026698461683e-05, "loss": 0.0265, "step": 9260 }, { "epoch": 0.260064525178847, "grad_norm": 0.04365124553442001, "learning_rate": 4.5665591247019216e-05, "loss": 0.0429, "step": 9270 }, { "epoch": 0.2603450694347033, "grad_norm": 0.03149515762925148, "learning_rate": 4.5660915509421616e-05, "loss": 0.0174, "step": 9280 }, { "epoch": 0.2606256136905597, "grad_norm": 0.8544738292694092, "learning_rate": 4.5656239771824e-05, "loss": 0.043, "step": 9290 }, { "epoch": 0.26090615794641603, "grad_norm": 0.2793480455875397, "learning_rate": 4.56515640342264e-05, "loss": 0.0386, "step": 9300 }, { "epoch": 0.2611867022022724, "grad_norm": 2.9573519229888916, "learning_rate": 4.5646888296628796e-05, "loss": 0.019, "step": 9310 }, { "epoch": 0.26146724645812874, "grad_norm": 0.7428318858146667, "learning_rate": 4.564221255903119e-05, "loss": 0.0729, "step": 9320 }, { "epoch": 0.26174779071398513, "grad_norm": 0.08965960890054703, "learning_rate": 4.563753682143358e-05, "loss": 0.0091, "step": 9330 }, { "epoch": 0.2620283349698415, "grad_norm": 1.0914405584335327, "learning_rate": 4.5632861083835975e-05, "loss": 0.0898, "step": 9340 }, { "epoch": 0.26230887922569784, "grad_norm": 3.34773588180542, "learning_rate": 4.5628185346238375e-05, "loss": 0.0381, "step": 9350 }, { "epoch": 0.26258942348155423, "grad_norm": 0.5783720016479492, "learning_rate": 4.562350960864076e-05, "loss": 0.0143, "step": 9360 }, { "epoch": 0.26286996773741056, "grad_norm": 0.5146954655647278, "learning_rate": 4.561883387104316e-05, "loss": 0.0388, "step": 9370 }, { "epoch": 0.26315051199326694, "grad_norm": 0.21914218366146088, "learning_rate": 4.5614158133445554e-05, "loss": 0.0199, "step": 9380 }, { "epoch": 0.2634310562491233, "grad_norm": 0.11800365149974823, "learning_rate": 4.560948239584795e-05, "loss": 0.0207, "step": 9390 }, { "epoch": 0.26371160050497966, "grad_norm": 0.5226148366928101, "learning_rate": 4.560480665825034e-05, "loss": 0.0657, "step": 9400 }, { "epoch": 0.26399214476083604, "grad_norm": 0.7890457510948181, "learning_rate": 4.5600130920652734e-05, "loss": 0.0334, "step": 9410 }, { "epoch": 0.2642726890166924, "grad_norm": 1.9329962730407715, "learning_rate": 4.559545518305513e-05, "loss": 0.0251, "step": 9420 }, { "epoch": 0.26455323327254876, "grad_norm": 0.07592236995697021, "learning_rate": 4.559077944545752e-05, "loss": 0.0494, "step": 9430 }, { "epoch": 0.2648337775284051, "grad_norm": 0.8592646718025208, "learning_rate": 4.558610370785992e-05, "loss": 0.0567, "step": 9440 }, { "epoch": 0.2651143217842615, "grad_norm": 0.8554933071136475, "learning_rate": 4.558142797026231e-05, "loss": 0.0253, "step": 9450 }, { "epoch": 0.2653948660401178, "grad_norm": 1.8290337324142456, "learning_rate": 4.5576752232664707e-05, "loss": 0.0551, "step": 9460 }, { "epoch": 0.2656754102959742, "grad_norm": 1.783876895904541, "learning_rate": 4.55720764950671e-05, "loss": 0.0277, "step": 9470 }, { "epoch": 0.2659559545518306, "grad_norm": 0.08131152391433716, "learning_rate": 4.556740075746949e-05, "loss": 0.0119, "step": 9480 }, { "epoch": 0.2662364988076869, "grad_norm": 1.6011152267456055, "learning_rate": 4.5562725019871886e-05, "loss": 0.0124, "step": 9490 }, { "epoch": 0.2665170430635433, "grad_norm": 2.342024803161621, "learning_rate": 4.555804928227428e-05, "loss": 0.0401, "step": 9500 }, { "epoch": 0.2667975873193996, "grad_norm": 1.8107287883758545, "learning_rate": 4.555337354467667e-05, "loss": 0.0377, "step": 9510 }, { "epoch": 0.267078131575256, "grad_norm": 0.20990164577960968, "learning_rate": 4.554869780707907e-05, "loss": 0.0352, "step": 9520 }, { "epoch": 0.26735867583111234, "grad_norm": 0.3588975667953491, "learning_rate": 4.5544022069481465e-05, "loss": 0.0512, "step": 9530 }, { "epoch": 0.2676392200869687, "grad_norm": 0.2696039378643036, "learning_rate": 4.553934633188386e-05, "loss": 0.0887, "step": 9540 }, { "epoch": 0.2679197643428251, "grad_norm": 0.3680812120437622, "learning_rate": 4.553467059428625e-05, "loss": 0.0227, "step": 9550 }, { "epoch": 0.26820030859868144, "grad_norm": 0.3418525457382202, "learning_rate": 4.5529994856688645e-05, "loss": 0.0234, "step": 9560 }, { "epoch": 0.2684808528545378, "grad_norm": 0.17124707996845245, "learning_rate": 4.552531911909104e-05, "loss": 0.0263, "step": 9570 }, { "epoch": 0.26876139711039415, "grad_norm": 2.0457937717437744, "learning_rate": 4.552064338149343e-05, "loss": 0.0565, "step": 9580 }, { "epoch": 0.26904194136625054, "grad_norm": 0.140323668718338, "learning_rate": 4.551596764389583e-05, "loss": 0.0229, "step": 9590 }, { "epoch": 0.26932248562210687, "grad_norm": 0.23667892813682556, "learning_rate": 4.551129190629822e-05, "loss": 0.0334, "step": 9600 }, { "epoch": 0.26960302987796325, "grad_norm": 0.1268153190612793, "learning_rate": 4.550661616870062e-05, "loss": 0.0633, "step": 9610 }, { "epoch": 0.26988357413381964, "grad_norm": 0.2465464472770691, "learning_rate": 4.550194043110301e-05, "loss": 0.025, "step": 9620 }, { "epoch": 0.27016411838967597, "grad_norm": 1.0297490358352661, "learning_rate": 4.5497264693505404e-05, "loss": 0.0272, "step": 9630 }, { "epoch": 0.27044466264553235, "grad_norm": 0.18107712268829346, "learning_rate": 4.54925889559078e-05, "loss": 0.0289, "step": 9640 }, { "epoch": 0.2707252069013887, "grad_norm": 0.1439388245344162, "learning_rate": 4.548791321831019e-05, "loss": 0.0416, "step": 9650 }, { "epoch": 0.27100575115724507, "grad_norm": 2.774142026901245, "learning_rate": 4.548323748071259e-05, "loss": 0.0445, "step": 9660 }, { "epoch": 0.2712862954131014, "grad_norm": 8.740565299987793, "learning_rate": 4.5478561743114976e-05, "loss": 0.0482, "step": 9670 }, { "epoch": 0.2715668396689578, "grad_norm": 0.11226948350667953, "learning_rate": 4.5473886005517376e-05, "loss": 0.0353, "step": 9680 }, { "epoch": 0.2718473839248141, "grad_norm": 0.38369113206863403, "learning_rate": 4.546921026791976e-05, "loss": 0.0189, "step": 9690 }, { "epoch": 0.2721279281806705, "grad_norm": 1.0133671760559082, "learning_rate": 4.546453453032216e-05, "loss": 0.0541, "step": 9700 }, { "epoch": 0.2724084724365269, "grad_norm": 0.10787016898393631, "learning_rate": 4.545985879272455e-05, "loss": 0.0218, "step": 9710 }, { "epoch": 0.2726890166923832, "grad_norm": 0.23844707012176514, "learning_rate": 4.545518305512695e-05, "loss": 0.0171, "step": 9720 }, { "epoch": 0.2729695609482396, "grad_norm": 0.597328245639801, "learning_rate": 4.545050731752934e-05, "loss": 0.021, "step": 9730 }, { "epoch": 0.2732501052040959, "grad_norm": 0.4178081452846527, "learning_rate": 4.5445831579931735e-05, "loss": 0.0561, "step": 9740 }, { "epoch": 0.2735306494599523, "grad_norm": 0.43169835209846497, "learning_rate": 4.5441155842334135e-05, "loss": 0.0161, "step": 9750 }, { "epoch": 0.27381119371580864, "grad_norm": 0.03920268639922142, "learning_rate": 4.543648010473652e-05, "loss": 0.0224, "step": 9760 }, { "epoch": 0.27409173797166503, "grad_norm": 3.104684591293335, "learning_rate": 4.543180436713892e-05, "loss": 0.0374, "step": 9770 }, { "epoch": 0.2743722822275214, "grad_norm": 0.11345984786748886, "learning_rate": 4.542712862954131e-05, "loss": 0.025, "step": 9780 }, { "epoch": 0.27465282648337774, "grad_norm": 6.9114179611206055, "learning_rate": 4.542245289194371e-05, "loss": 0.0466, "step": 9790 }, { "epoch": 0.27493337073923413, "grad_norm": 0.05458330363035202, "learning_rate": 4.54177771543461e-05, "loss": 0.0304, "step": 9800 }, { "epoch": 0.27521391499509046, "grad_norm": 0.16589929163455963, "learning_rate": 4.5413101416748494e-05, "loss": 0.0732, "step": 9810 }, { "epoch": 0.27549445925094684, "grad_norm": 0.1818813681602478, "learning_rate": 4.540842567915089e-05, "loss": 0.0258, "step": 9820 }, { "epoch": 0.2757750035068032, "grad_norm": 0.28271961212158203, "learning_rate": 4.540374994155328e-05, "loss": 0.0254, "step": 9830 }, { "epoch": 0.27605554776265956, "grad_norm": 0.4479694664478302, "learning_rate": 4.539907420395568e-05, "loss": 0.0326, "step": 9840 }, { "epoch": 0.27633609201851594, "grad_norm": 0.0373384952545166, "learning_rate": 4.539439846635807e-05, "loss": 0.0221, "step": 9850 }, { "epoch": 0.2766166362743723, "grad_norm": 0.3001349866390228, "learning_rate": 4.5389722728760467e-05, "loss": 0.0201, "step": 9860 }, { "epoch": 0.27689718053022866, "grad_norm": 0.1808239072561264, "learning_rate": 4.538504699116286e-05, "loss": 0.0371, "step": 9870 }, { "epoch": 0.277177724786085, "grad_norm": 0.06014932692050934, "learning_rate": 4.538037125356525e-05, "loss": 0.0134, "step": 9880 }, { "epoch": 0.2774582690419414, "grad_norm": 8.731575012207031, "learning_rate": 4.5375695515967646e-05, "loss": 0.0199, "step": 9890 }, { "epoch": 0.2777388132977977, "grad_norm": 0.25502559542655945, "learning_rate": 4.537101977837004e-05, "loss": 0.0177, "step": 9900 }, { "epoch": 0.2780193575536541, "grad_norm": 0.7086575031280518, "learning_rate": 4.536634404077243e-05, "loss": 0.04, "step": 9910 }, { "epoch": 0.2782999018095105, "grad_norm": 0.029934577643871307, "learning_rate": 4.5361668303174826e-05, "loss": 0.0398, "step": 9920 }, { "epoch": 0.2785804460653668, "grad_norm": 0.025641515851020813, "learning_rate": 4.5356992565577225e-05, "loss": 0.0144, "step": 9930 }, { "epoch": 0.2788609903212232, "grad_norm": 0.36575037240982056, "learning_rate": 4.535231682797962e-05, "loss": 0.0618, "step": 9940 }, { "epoch": 0.2791415345770795, "grad_norm": 0.25173190236091614, "learning_rate": 4.534764109038201e-05, "loss": 0.048, "step": 9950 }, { "epoch": 0.2794220788329359, "grad_norm": 0.07307543605566025, "learning_rate": 4.5342965352784405e-05, "loss": 0.0377, "step": 9960 }, { "epoch": 0.27970262308879223, "grad_norm": 0.18675723671913147, "learning_rate": 4.53382896151868e-05, "loss": 0.0431, "step": 9970 }, { "epoch": 0.2799831673446486, "grad_norm": 0.03682328015565872, "learning_rate": 4.533361387758919e-05, "loss": 0.0159, "step": 9980 }, { "epoch": 0.280263711600505, "grad_norm": 0.07051067799329758, "learning_rate": 4.5328938139991584e-05, "loss": 0.0366, "step": 9990 }, { "epoch": 0.28054425585636134, "grad_norm": 0.8700583577156067, "learning_rate": 4.532426240239398e-05, "loss": 0.0281, "step": 10000 }, { "epoch": 0.2808248001122177, "grad_norm": 0.0984838604927063, "learning_rate": 4.531958666479638e-05, "loss": 0.0282, "step": 10010 }, { "epoch": 0.28110534436807405, "grad_norm": 0.5598074793815613, "learning_rate": 4.5314910927198764e-05, "loss": 0.0385, "step": 10020 }, { "epoch": 0.28138588862393044, "grad_norm": 1.0545732975006104, "learning_rate": 4.5310235189601164e-05, "loss": 0.0322, "step": 10030 }, { "epoch": 0.28166643287978677, "grad_norm": 1.180093765258789, "learning_rate": 4.530555945200356e-05, "loss": 0.0281, "step": 10040 }, { "epoch": 0.28194697713564315, "grad_norm": 1.3450859785079956, "learning_rate": 4.530088371440595e-05, "loss": 0.0203, "step": 10050 }, { "epoch": 0.28222752139149954, "grad_norm": 0.8095816373825073, "learning_rate": 4.529620797680834e-05, "loss": 0.0253, "step": 10060 }, { "epoch": 0.28250806564735587, "grad_norm": 0.5756889581680298, "learning_rate": 4.5291532239210736e-05, "loss": 0.019, "step": 10070 }, { "epoch": 0.28278860990321225, "grad_norm": 0.27916932106018066, "learning_rate": 4.5286856501613136e-05, "loss": 0.0247, "step": 10080 }, { "epoch": 0.2830691541590686, "grad_norm": 0.24127107858657837, "learning_rate": 4.528218076401552e-05, "loss": 0.0035, "step": 10090 }, { "epoch": 0.28334969841492497, "grad_norm": 0.030426589772105217, "learning_rate": 4.527750502641792e-05, "loss": 0.0512, "step": 10100 }, { "epoch": 0.2836302426707813, "grad_norm": 0.03334679827094078, "learning_rate": 4.527282928882031e-05, "loss": 0.0222, "step": 10110 }, { "epoch": 0.2839107869266377, "grad_norm": 0.1143283024430275, "learning_rate": 4.526815355122271e-05, "loss": 0.0468, "step": 10120 }, { "epoch": 0.284191331182494, "grad_norm": 0.2712286412715912, "learning_rate": 4.52634778136251e-05, "loss": 0.0061, "step": 10130 }, { "epoch": 0.2844718754383504, "grad_norm": 0.7979671955108643, "learning_rate": 4.5258802076027495e-05, "loss": 0.049, "step": 10140 }, { "epoch": 0.2847524196942068, "grad_norm": 1.0310802459716797, "learning_rate": 4.5254126338429895e-05, "loss": 0.0171, "step": 10150 }, { "epoch": 0.2850329639500631, "grad_norm": 0.04600200802087784, "learning_rate": 4.524945060083228e-05, "loss": 0.013, "step": 10160 }, { "epoch": 0.2853135082059195, "grad_norm": 0.23616845905780792, "learning_rate": 4.524477486323468e-05, "loss": 0.0369, "step": 10170 }, { "epoch": 0.2855940524617758, "grad_norm": 0.3314211070537567, "learning_rate": 4.524009912563707e-05, "loss": 0.0287, "step": 10180 }, { "epoch": 0.2858745967176322, "grad_norm": 3.525461435317993, "learning_rate": 4.523542338803947e-05, "loss": 0.0089, "step": 10190 }, { "epoch": 0.28615514097348854, "grad_norm": 0.6265592575073242, "learning_rate": 4.5230747650441854e-05, "loss": 0.0799, "step": 10200 }, { "epoch": 0.2864356852293449, "grad_norm": 0.07122839987277985, "learning_rate": 4.5226071912844254e-05, "loss": 0.0187, "step": 10210 }, { "epoch": 0.2867162294852013, "grad_norm": 0.7596004009246826, "learning_rate": 4.522139617524665e-05, "loss": 0.0296, "step": 10220 }, { "epoch": 0.28699677374105764, "grad_norm": 0.023838823661208153, "learning_rate": 4.521672043764904e-05, "loss": 0.0382, "step": 10230 }, { "epoch": 0.28727731799691403, "grad_norm": 0.7005985379219055, "learning_rate": 4.5212044700051434e-05, "loss": 0.0279, "step": 10240 }, { "epoch": 0.28755786225277036, "grad_norm": 0.4581297039985657, "learning_rate": 4.520736896245383e-05, "loss": 0.0315, "step": 10250 }, { "epoch": 0.28783840650862674, "grad_norm": 0.0172579288482666, "learning_rate": 4.520269322485623e-05, "loss": 0.0081, "step": 10260 }, { "epoch": 0.2881189507644831, "grad_norm": 0.04650239273905754, "learning_rate": 4.519801748725861e-05, "loss": 0.0172, "step": 10270 }, { "epoch": 0.28839949502033946, "grad_norm": 0.012444854713976383, "learning_rate": 4.519334174966101e-05, "loss": 0.0151, "step": 10280 }, { "epoch": 0.28868003927619584, "grad_norm": 0.43255722522735596, "learning_rate": 4.5188666012063406e-05, "loss": 0.0086, "step": 10290 }, { "epoch": 0.2889605835320522, "grad_norm": 0.018341658636927605, "learning_rate": 4.51839902744658e-05, "loss": 0.0225, "step": 10300 }, { "epoch": 0.28924112778790856, "grad_norm": 0.03562648594379425, "learning_rate": 4.517931453686819e-05, "loss": 0.0186, "step": 10310 }, { "epoch": 0.2895216720437649, "grad_norm": 4.554627418518066, "learning_rate": 4.5174638799270586e-05, "loss": 0.0733, "step": 10320 }, { "epoch": 0.2898022162996213, "grad_norm": 0.050022829324007034, "learning_rate": 4.516996306167298e-05, "loss": 0.0084, "step": 10330 }, { "epoch": 0.2900827605554776, "grad_norm": 0.07313530892133713, "learning_rate": 4.516528732407537e-05, "loss": 0.0446, "step": 10340 }, { "epoch": 0.290363304811334, "grad_norm": 0.0844794362783432, "learning_rate": 4.516061158647777e-05, "loss": 0.0461, "step": 10350 }, { "epoch": 0.2906438490671904, "grad_norm": 0.10934924334287643, "learning_rate": 4.5155935848880165e-05, "loss": 0.0409, "step": 10360 }, { "epoch": 0.2909243933230467, "grad_norm": 0.3103056252002716, "learning_rate": 4.515126011128256e-05, "loss": 0.0231, "step": 10370 }, { "epoch": 0.2912049375789031, "grad_norm": 0.12657570838928223, "learning_rate": 4.514658437368495e-05, "loss": 0.015, "step": 10380 }, { "epoch": 0.2914854818347594, "grad_norm": 0.014077355153858662, "learning_rate": 4.5141908636087344e-05, "loss": 0.0093, "step": 10390 }, { "epoch": 0.2917660260906158, "grad_norm": 0.04537412151694298, "learning_rate": 4.513723289848974e-05, "loss": 0.0459, "step": 10400 }, { "epoch": 0.29204657034647213, "grad_norm": 0.1589195430278778, "learning_rate": 4.513255716089213e-05, "loss": 0.0236, "step": 10410 }, { "epoch": 0.2923271146023285, "grad_norm": 0.026674769818782806, "learning_rate": 4.5127881423294524e-05, "loss": 0.0222, "step": 10420 }, { "epoch": 0.2926076588581849, "grad_norm": 0.3200857937335968, "learning_rate": 4.5123205685696924e-05, "loss": 0.0451, "step": 10430 }, { "epoch": 0.29288820311404123, "grad_norm": 0.03775456175208092, "learning_rate": 4.511852994809932e-05, "loss": 0.0187, "step": 10440 }, { "epoch": 0.2931687473698976, "grad_norm": 0.4629463851451874, "learning_rate": 4.511385421050171e-05, "loss": 0.0161, "step": 10450 }, { "epoch": 0.29344929162575395, "grad_norm": 0.07884158194065094, "learning_rate": 4.51091784729041e-05, "loss": 0.0234, "step": 10460 }, { "epoch": 0.29372983588161033, "grad_norm": 1.3700547218322754, "learning_rate": 4.5104502735306497e-05, "loss": 0.0697, "step": 10470 }, { "epoch": 0.29401038013746666, "grad_norm": 0.867210328578949, "learning_rate": 4.509982699770889e-05, "loss": 0.0412, "step": 10480 }, { "epoch": 0.29429092439332305, "grad_norm": 0.11561106890439987, "learning_rate": 4.509515126011128e-05, "loss": 0.0263, "step": 10490 }, { "epoch": 0.29457146864917944, "grad_norm": 0.7751287221908569, "learning_rate": 4.509047552251368e-05, "loss": 0.0236, "step": 10500 }, { "epoch": 0.29485201290503577, "grad_norm": 1.9433412551879883, "learning_rate": 4.508579978491607e-05, "loss": 0.047, "step": 10510 }, { "epoch": 0.29513255716089215, "grad_norm": 2.782356023788452, "learning_rate": 4.508112404731847e-05, "loss": 0.0376, "step": 10520 }, { "epoch": 0.2954131014167485, "grad_norm": 0.3813644349575043, "learning_rate": 4.507644830972086e-05, "loss": 0.0169, "step": 10530 }, { "epoch": 0.29569364567260487, "grad_norm": 0.03207210823893547, "learning_rate": 4.5071772572123255e-05, "loss": 0.0241, "step": 10540 }, { "epoch": 0.2959741899284612, "grad_norm": 0.09116999804973602, "learning_rate": 4.506709683452565e-05, "loss": 0.0561, "step": 10550 }, { "epoch": 0.2962547341843176, "grad_norm": 0.21416139602661133, "learning_rate": 4.506242109692804e-05, "loss": 0.0416, "step": 10560 }, { "epoch": 0.2965352784401739, "grad_norm": 0.06285927444696426, "learning_rate": 4.505774535933044e-05, "loss": 0.0226, "step": 10570 }, { "epoch": 0.2968158226960303, "grad_norm": 0.26141291856765747, "learning_rate": 4.505306962173283e-05, "loss": 0.021, "step": 10580 }, { "epoch": 0.2970963669518867, "grad_norm": 0.032315611839294434, "learning_rate": 4.504839388413523e-05, "loss": 0.0151, "step": 10590 }, { "epoch": 0.297376911207743, "grad_norm": 1.7944958209991455, "learning_rate": 4.5043718146537614e-05, "loss": 0.0366, "step": 10600 }, { "epoch": 0.2976574554635994, "grad_norm": 0.12685810029506683, "learning_rate": 4.5039042408940014e-05, "loss": 0.0035, "step": 10610 }, { "epoch": 0.2979379997194557, "grad_norm": 0.0649770051240921, "learning_rate": 4.50343666713424e-05, "loss": 0.0403, "step": 10620 }, { "epoch": 0.2982185439753121, "grad_norm": 0.6758877635002136, "learning_rate": 4.50296909337448e-05, "loss": 0.0265, "step": 10630 }, { "epoch": 0.29849908823116844, "grad_norm": 0.13403376936912537, "learning_rate": 4.5025015196147194e-05, "loss": 0.0275, "step": 10640 }, { "epoch": 0.2987796324870248, "grad_norm": 0.09899009764194489, "learning_rate": 4.502033945854959e-05, "loss": 0.026, "step": 10650 }, { "epoch": 0.2990601767428812, "grad_norm": 0.5603309869766235, "learning_rate": 4.501566372095199e-05, "loss": 0.0461, "step": 10660 }, { "epoch": 0.29934072099873754, "grad_norm": 0.23153682053089142, "learning_rate": 4.501098798335437e-05, "loss": 0.0172, "step": 10670 }, { "epoch": 0.2996212652545939, "grad_norm": 0.5094754099845886, "learning_rate": 4.500631224575677e-05, "loss": 0.0369, "step": 10680 }, { "epoch": 0.29990180951045026, "grad_norm": 0.16592492163181305, "learning_rate": 4.500163650815916e-05, "loss": 0.0336, "step": 10690 }, { "epoch": 0.30018235376630664, "grad_norm": 0.6575311422348022, "learning_rate": 4.499696077056156e-05, "loss": 0.0396, "step": 10700 }, { "epoch": 0.30046289802216297, "grad_norm": 0.08082019537687302, "learning_rate": 4.499228503296395e-05, "loss": 0.0202, "step": 10710 }, { "epoch": 0.30074344227801936, "grad_norm": 3.1825737953186035, "learning_rate": 4.4987609295366346e-05, "loss": 0.0336, "step": 10720 }, { "epoch": 0.30102398653387574, "grad_norm": 0.0825008824467659, "learning_rate": 4.498293355776874e-05, "loss": 0.0463, "step": 10730 }, { "epoch": 0.3013045307897321, "grad_norm": 0.069666787981987, "learning_rate": 4.497825782017113e-05, "loss": 0.0246, "step": 10740 }, { "epoch": 0.30158507504558846, "grad_norm": 4.7177042961120605, "learning_rate": 4.497358208257353e-05, "loss": 0.0682, "step": 10750 }, { "epoch": 0.3018656193014448, "grad_norm": 0.8739906549453735, "learning_rate": 4.496890634497592e-05, "loss": 0.0375, "step": 10760 }, { "epoch": 0.3021461635573012, "grad_norm": 0.06044824793934822, "learning_rate": 4.496423060737832e-05, "loss": 0.0284, "step": 10770 }, { "epoch": 0.3024267078131575, "grad_norm": 0.15029466152191162, "learning_rate": 4.495955486978071e-05, "loss": 0.0249, "step": 10780 }, { "epoch": 0.3027072520690139, "grad_norm": 0.024806907400488853, "learning_rate": 4.4954879132183105e-05, "loss": 0.0257, "step": 10790 }, { "epoch": 0.3029877963248703, "grad_norm": 0.5186880826950073, "learning_rate": 4.49502033945855e-05, "loss": 0.0403, "step": 10800 }, { "epoch": 0.3032683405807266, "grad_norm": 0.2739562690258026, "learning_rate": 4.494552765698789e-05, "loss": 0.0595, "step": 10810 }, { "epoch": 0.303548884836583, "grad_norm": 1.911788821220398, "learning_rate": 4.4940851919390284e-05, "loss": 0.028, "step": 10820 }, { "epoch": 0.3038294290924393, "grad_norm": 0.14945781230926514, "learning_rate": 4.493617618179268e-05, "loss": 0.0502, "step": 10830 }, { "epoch": 0.3041099733482957, "grad_norm": 0.4811408817768097, "learning_rate": 4.493150044419508e-05, "loss": 0.0561, "step": 10840 }, { "epoch": 0.30439051760415203, "grad_norm": 0.0864325687289238, "learning_rate": 4.492682470659747e-05, "loss": 0.0233, "step": 10850 }, { "epoch": 0.3046710618600084, "grad_norm": 0.26804107427597046, "learning_rate": 4.4922148968999863e-05, "loss": 0.0171, "step": 10860 }, { "epoch": 0.3049516061158648, "grad_norm": 27.48790740966797, "learning_rate": 4.4917473231402257e-05, "loss": 0.0332, "step": 10870 }, { "epoch": 0.30523215037172113, "grad_norm": 0.40822991728782654, "learning_rate": 4.491279749380465e-05, "loss": 0.0104, "step": 10880 }, { "epoch": 0.3055126946275775, "grad_norm": 0.8410980105400085, "learning_rate": 4.490812175620704e-05, "loss": 0.035, "step": 10890 }, { "epoch": 0.30579323888343385, "grad_norm": 0.05546451732516289, "learning_rate": 4.4903446018609436e-05, "loss": 0.027, "step": 10900 }, { "epoch": 0.30607378313929023, "grad_norm": 6.690145969390869, "learning_rate": 4.489877028101183e-05, "loss": 0.046, "step": 10910 }, { "epoch": 0.30635432739514656, "grad_norm": 1.7508790493011475, "learning_rate": 4.489409454341423e-05, "loss": 0.0294, "step": 10920 }, { "epoch": 0.30663487165100295, "grad_norm": 0.08538148552179337, "learning_rate": 4.4889418805816616e-05, "loss": 0.0219, "step": 10930 }, { "epoch": 0.30691541590685933, "grad_norm": 0.9008023142814636, "learning_rate": 4.4884743068219015e-05, "loss": 0.0354, "step": 10940 }, { "epoch": 0.30719596016271566, "grad_norm": 1.6713216304779053, "learning_rate": 4.488006733062141e-05, "loss": 0.0309, "step": 10950 }, { "epoch": 0.30747650441857205, "grad_norm": 0.052632059901952744, "learning_rate": 4.48753915930238e-05, "loss": 0.0538, "step": 10960 }, { "epoch": 0.3077570486744284, "grad_norm": 0.23404726386070251, "learning_rate": 4.4870715855426195e-05, "loss": 0.0386, "step": 10970 }, { "epoch": 0.30803759293028476, "grad_norm": 0.5841366648674011, "learning_rate": 4.486604011782859e-05, "loss": 0.0343, "step": 10980 }, { "epoch": 0.3083181371861411, "grad_norm": 0.298030287027359, "learning_rate": 4.486136438023099e-05, "loss": 0.0488, "step": 10990 }, { "epoch": 0.3085986814419975, "grad_norm": 1.6038436889648438, "learning_rate": 4.4856688642633374e-05, "loss": 0.0638, "step": 11000 }, { "epoch": 0.3088792256978538, "grad_norm": 4.104541301727295, "learning_rate": 4.4852012905035774e-05, "loss": 0.0167, "step": 11010 }, { "epoch": 0.3091597699537102, "grad_norm": 0.4903431832790375, "learning_rate": 4.484733716743816e-05, "loss": 0.0733, "step": 11020 }, { "epoch": 0.3094403142095666, "grad_norm": 0.8616307377815247, "learning_rate": 4.484266142984056e-05, "loss": 0.0448, "step": 11030 }, { "epoch": 0.3097208584654229, "grad_norm": 2.1899027824401855, "learning_rate": 4.4837985692242954e-05, "loss": 0.0392, "step": 11040 }, { "epoch": 0.3100014027212793, "grad_norm": 0.08618391305208206, "learning_rate": 4.483330995464535e-05, "loss": 0.0279, "step": 11050 }, { "epoch": 0.3102819469771356, "grad_norm": 0.4974406957626343, "learning_rate": 4.482863421704775e-05, "loss": 0.0212, "step": 11060 }, { "epoch": 0.310562491232992, "grad_norm": 10.705431938171387, "learning_rate": 4.482395847945013e-05, "loss": 0.0578, "step": 11070 }, { "epoch": 0.31084303548884834, "grad_norm": 1.2377911806106567, "learning_rate": 4.481928274185253e-05, "loss": 0.0515, "step": 11080 }, { "epoch": 0.3111235797447047, "grad_norm": 0.3232360780239105, "learning_rate": 4.481460700425492e-05, "loss": 0.0199, "step": 11090 }, { "epoch": 0.3114041240005611, "grad_norm": 0.07259372621774673, "learning_rate": 4.480993126665732e-05, "loss": 0.0175, "step": 11100 }, { "epoch": 0.31168466825641744, "grad_norm": 0.6504152417182922, "learning_rate": 4.4805255529059706e-05, "loss": 0.0476, "step": 11110 }, { "epoch": 0.3119652125122738, "grad_norm": 0.805053174495697, "learning_rate": 4.4800579791462106e-05, "loss": 0.0233, "step": 11120 }, { "epoch": 0.31224575676813016, "grad_norm": 1.1468241214752197, "learning_rate": 4.47959040538645e-05, "loss": 0.0385, "step": 11130 }, { "epoch": 0.31252630102398654, "grad_norm": 0.20479732751846313, "learning_rate": 4.479122831626689e-05, "loss": 0.0637, "step": 11140 }, { "epoch": 0.31280684527984287, "grad_norm": 9.4793062210083, "learning_rate": 4.4786552578669285e-05, "loss": 0.0412, "step": 11150 }, { "epoch": 0.31308738953569926, "grad_norm": 0.6035559773445129, "learning_rate": 4.478187684107168e-05, "loss": 0.0304, "step": 11160 }, { "epoch": 0.31336793379155564, "grad_norm": 0.6212475299835205, "learning_rate": 4.477720110347408e-05, "loss": 0.0381, "step": 11170 }, { "epoch": 0.31364847804741197, "grad_norm": 0.21315784752368927, "learning_rate": 4.4772525365876465e-05, "loss": 0.0214, "step": 11180 }, { "epoch": 0.31392902230326836, "grad_norm": 0.6692667603492737, "learning_rate": 4.4767849628278865e-05, "loss": 0.0301, "step": 11190 }, { "epoch": 0.3142095665591247, "grad_norm": 0.56908118724823, "learning_rate": 4.476317389068126e-05, "loss": 0.0567, "step": 11200 }, { "epoch": 0.31449011081498107, "grad_norm": 6.630923748016357, "learning_rate": 4.475849815308365e-05, "loss": 0.0412, "step": 11210 }, { "epoch": 0.3147706550708374, "grad_norm": 0.10235074907541275, "learning_rate": 4.4753822415486044e-05, "loss": 0.064, "step": 11220 }, { "epoch": 0.3150511993266938, "grad_norm": 0.07023721188306808, "learning_rate": 4.474914667788844e-05, "loss": 0.0174, "step": 11230 }, { "epoch": 0.3153317435825502, "grad_norm": 0.3392440974712372, "learning_rate": 4.474447094029083e-05, "loss": 0.0101, "step": 11240 }, { "epoch": 0.3156122878384065, "grad_norm": 0.36308401823043823, "learning_rate": 4.4739795202693224e-05, "loss": 0.0334, "step": 11250 }, { "epoch": 0.3158928320942629, "grad_norm": 3.731355905532837, "learning_rate": 4.4735119465095624e-05, "loss": 0.0522, "step": 11260 }, { "epoch": 0.3161733763501192, "grad_norm": 3.1292452812194824, "learning_rate": 4.473044372749802e-05, "loss": 0.032, "step": 11270 }, { "epoch": 0.3164539206059756, "grad_norm": 0.074168361723423, "learning_rate": 4.472576798990041e-05, "loss": 0.0308, "step": 11280 }, { "epoch": 0.31673446486183193, "grad_norm": 0.0923282653093338, "learning_rate": 4.47210922523028e-05, "loss": 0.0415, "step": 11290 }, { "epoch": 0.3170150091176883, "grad_norm": 0.10337740927934647, "learning_rate": 4.4716416514705196e-05, "loss": 0.0259, "step": 11300 }, { "epoch": 0.3172955533735447, "grad_norm": 0.12452614307403564, "learning_rate": 4.471174077710759e-05, "loss": 0.0522, "step": 11310 }, { "epoch": 0.31757609762940103, "grad_norm": 0.7410706281661987, "learning_rate": 4.470706503950998e-05, "loss": 0.022, "step": 11320 }, { "epoch": 0.3178566418852574, "grad_norm": 0.035531748086214066, "learning_rate": 4.4702389301912376e-05, "loss": 0.0274, "step": 11330 }, { "epoch": 0.31813718614111375, "grad_norm": 0.03929581865668297, "learning_rate": 4.4697713564314776e-05, "loss": 0.0569, "step": 11340 }, { "epoch": 0.31841773039697013, "grad_norm": 0.08796443045139313, "learning_rate": 4.469303782671717e-05, "loss": 0.0183, "step": 11350 }, { "epoch": 0.31869827465282646, "grad_norm": 0.1026453897356987, "learning_rate": 4.468836208911956e-05, "loss": 0.0547, "step": 11360 }, { "epoch": 0.31897881890868285, "grad_norm": 0.7954708933830261, "learning_rate": 4.4683686351521955e-05, "loss": 0.0184, "step": 11370 }, { "epoch": 0.31925936316453923, "grad_norm": 0.021890198811888695, "learning_rate": 4.467901061392435e-05, "loss": 0.0213, "step": 11380 }, { "epoch": 0.31953990742039556, "grad_norm": 0.32087641954421997, "learning_rate": 4.467433487632674e-05, "loss": 0.0613, "step": 11390 }, { "epoch": 0.31982045167625195, "grad_norm": 0.5242047309875488, "learning_rate": 4.4669659138729134e-05, "loss": 0.0456, "step": 11400 }, { "epoch": 0.3201009959321083, "grad_norm": 2.0980095863342285, "learning_rate": 4.4664983401131534e-05, "loss": 0.0435, "step": 11410 }, { "epoch": 0.32038154018796466, "grad_norm": 0.5785903334617615, "learning_rate": 4.466030766353392e-05, "loss": 0.0082, "step": 11420 }, { "epoch": 0.320662084443821, "grad_norm": 0.14170975983142853, "learning_rate": 4.465563192593632e-05, "loss": 0.0404, "step": 11430 }, { "epoch": 0.3209426286996774, "grad_norm": 0.37812086939811707, "learning_rate": 4.4650956188338714e-05, "loss": 0.0486, "step": 11440 }, { "epoch": 0.3212231729555337, "grad_norm": 2.7646756172180176, "learning_rate": 4.464628045074111e-05, "loss": 0.0601, "step": 11450 }, { "epoch": 0.3215037172113901, "grad_norm": 2.9735217094421387, "learning_rate": 4.46416047131435e-05, "loss": 0.055, "step": 11460 }, { "epoch": 0.3217842614672465, "grad_norm": 0.19568365812301636, "learning_rate": 4.463692897554589e-05, "loss": 0.0184, "step": 11470 }, { "epoch": 0.3220648057231028, "grad_norm": 0.21962031722068787, "learning_rate": 4.463225323794829e-05, "loss": 0.0339, "step": 11480 }, { "epoch": 0.3223453499789592, "grad_norm": 3.555217981338501, "learning_rate": 4.462757750035068e-05, "loss": 0.0602, "step": 11490 }, { "epoch": 0.3226258942348155, "grad_norm": 0.08062517642974854, "learning_rate": 4.462290176275308e-05, "loss": 0.0442, "step": 11500 }, { "epoch": 0.3229064384906719, "grad_norm": 0.09401097148656845, "learning_rate": 4.4618226025155466e-05, "loss": 0.0488, "step": 11510 }, { "epoch": 0.32318698274652824, "grad_norm": 0.1871889978647232, "learning_rate": 4.4613550287557866e-05, "loss": 0.0484, "step": 11520 }, { "epoch": 0.3234675270023846, "grad_norm": 3.0869736671447754, "learning_rate": 4.460887454996025e-05, "loss": 0.0482, "step": 11530 }, { "epoch": 0.323748071258241, "grad_norm": 0.4347812831401825, "learning_rate": 4.460419881236265e-05, "loss": 0.024, "step": 11540 }, { "epoch": 0.32402861551409734, "grad_norm": 0.2895217537879944, "learning_rate": 4.4599523074765045e-05, "loss": 0.0138, "step": 11550 }, { "epoch": 0.3243091597699537, "grad_norm": 0.19572694599628448, "learning_rate": 4.459484733716744e-05, "loss": 0.0394, "step": 11560 }, { "epoch": 0.32458970402581006, "grad_norm": 0.020016714930534363, "learning_rate": 4.459017159956984e-05, "loss": 0.0244, "step": 11570 }, { "epoch": 0.32487024828166644, "grad_norm": 1.1044505834579468, "learning_rate": 4.4585495861972225e-05, "loss": 0.0694, "step": 11580 }, { "epoch": 0.32515079253752277, "grad_norm": 0.07132922857999802, "learning_rate": 4.4580820124374625e-05, "loss": 0.0166, "step": 11590 }, { "epoch": 0.32543133679337916, "grad_norm": 2.7114641666412354, "learning_rate": 4.457614438677702e-05, "loss": 0.0636, "step": 11600 }, { "epoch": 0.32571188104923554, "grad_norm": 0.19276151061058044, "learning_rate": 4.457146864917941e-05, "loss": 0.0154, "step": 11610 }, { "epoch": 0.32599242530509187, "grad_norm": 0.0995982363820076, "learning_rate": 4.4566792911581804e-05, "loss": 0.0233, "step": 11620 }, { "epoch": 0.32627296956094826, "grad_norm": 0.2788524031639099, "learning_rate": 4.45621171739842e-05, "loss": 0.0574, "step": 11630 }, { "epoch": 0.3265535138168046, "grad_norm": 0.35088348388671875, "learning_rate": 4.455744143638659e-05, "loss": 0.0113, "step": 11640 }, { "epoch": 0.32683405807266097, "grad_norm": 2.0778939723968506, "learning_rate": 4.4552765698788984e-05, "loss": 0.0476, "step": 11650 }, { "epoch": 0.3271146023285173, "grad_norm": 0.5080552697181702, "learning_rate": 4.4548089961191384e-05, "loss": 0.0217, "step": 11660 }, { "epoch": 0.3273951465843737, "grad_norm": 0.39370599389076233, "learning_rate": 4.454341422359378e-05, "loss": 0.0411, "step": 11670 }, { "epoch": 0.32767569084023007, "grad_norm": 0.41876864433288574, "learning_rate": 4.453873848599617e-05, "loss": 0.0365, "step": 11680 }, { "epoch": 0.3279562350960864, "grad_norm": 0.06884843856096268, "learning_rate": 4.453406274839856e-05, "loss": 0.0181, "step": 11690 }, { "epoch": 0.3282367793519428, "grad_norm": 2.596071720123291, "learning_rate": 4.4529387010800956e-05, "loss": 0.0508, "step": 11700 }, { "epoch": 0.3285173236077991, "grad_norm": 1.640618920326233, "learning_rate": 4.452471127320335e-05, "loss": 0.0379, "step": 11710 }, { "epoch": 0.3287978678636555, "grad_norm": 0.651918351650238, "learning_rate": 4.452003553560574e-05, "loss": 0.036, "step": 11720 }, { "epoch": 0.32907841211951183, "grad_norm": 0.6120554804801941, "learning_rate": 4.4515359798008136e-05, "loss": 0.0306, "step": 11730 }, { "epoch": 0.3293589563753682, "grad_norm": 0.6930578947067261, "learning_rate": 4.4510684060410536e-05, "loss": 0.0457, "step": 11740 }, { "epoch": 0.3296395006312246, "grad_norm": 0.08244192600250244, "learning_rate": 4.450600832281293e-05, "loss": 0.0209, "step": 11750 }, { "epoch": 0.32992004488708093, "grad_norm": 0.5137665271759033, "learning_rate": 4.450133258521532e-05, "loss": 0.0127, "step": 11760 }, { "epoch": 0.3302005891429373, "grad_norm": 5.130674839019775, "learning_rate": 4.4496656847617715e-05, "loss": 0.0596, "step": 11770 }, { "epoch": 0.33048113339879365, "grad_norm": 1.514221429824829, "learning_rate": 4.449198111002011e-05, "loss": 0.0424, "step": 11780 }, { "epoch": 0.33076167765465003, "grad_norm": 0.6617045998573303, "learning_rate": 4.44873053724225e-05, "loss": 0.0676, "step": 11790 }, { "epoch": 0.33104222191050636, "grad_norm": 1.0181429386138916, "learning_rate": 4.4482629634824895e-05, "loss": 0.0219, "step": 11800 }, { "epoch": 0.33132276616636275, "grad_norm": 0.6985236406326294, "learning_rate": 4.4477953897227294e-05, "loss": 0.0334, "step": 11810 }, { "epoch": 0.3316033104222191, "grad_norm": 0.08376132696866989, "learning_rate": 4.447327815962968e-05, "loss": 0.0206, "step": 11820 }, { "epoch": 0.33188385467807546, "grad_norm": 0.029797902330756187, "learning_rate": 4.446860242203208e-05, "loss": 0.0262, "step": 11830 }, { "epoch": 0.33216439893393185, "grad_norm": 0.03460313752293587, "learning_rate": 4.446392668443447e-05, "loss": 0.0312, "step": 11840 }, { "epoch": 0.3324449431897882, "grad_norm": 0.9965922236442566, "learning_rate": 4.445925094683687e-05, "loss": 0.0466, "step": 11850 }, { "epoch": 0.33272548744564456, "grad_norm": 0.31251662969589233, "learning_rate": 4.445457520923926e-05, "loss": 0.043, "step": 11860 }, { "epoch": 0.3330060317015009, "grad_norm": 0.02691243588924408, "learning_rate": 4.4449899471641653e-05, "loss": 0.0148, "step": 11870 }, { "epoch": 0.3332865759573573, "grad_norm": 1.5011037588119507, "learning_rate": 4.444522373404405e-05, "loss": 0.0443, "step": 11880 }, { "epoch": 0.3335671202132136, "grad_norm": 0.19395345449447632, "learning_rate": 4.444054799644644e-05, "loss": 0.044, "step": 11890 }, { "epoch": 0.33384766446907, "grad_norm": 2.029663562774658, "learning_rate": 4.443587225884884e-05, "loss": 0.0776, "step": 11900 }, { "epoch": 0.3341282087249264, "grad_norm": 0.3363190293312073, "learning_rate": 4.4431196521251226e-05, "loss": 0.0188, "step": 11910 }, { "epoch": 0.3344087529807827, "grad_norm": 0.3740219175815582, "learning_rate": 4.4426520783653626e-05, "loss": 0.0275, "step": 11920 }, { "epoch": 0.3346892972366391, "grad_norm": 0.11193308234214783, "learning_rate": 4.442184504605601e-05, "loss": 0.0463, "step": 11930 }, { "epoch": 0.3349698414924954, "grad_norm": 0.41483983397483826, "learning_rate": 4.441716930845841e-05, "loss": 0.0513, "step": 11940 }, { "epoch": 0.3352503857483518, "grad_norm": 0.08998782187700272, "learning_rate": 4.4412493570860805e-05, "loss": 0.028, "step": 11950 }, { "epoch": 0.33553093000420814, "grad_norm": 9.570478439331055, "learning_rate": 4.44078178332632e-05, "loss": 0.0221, "step": 11960 }, { "epoch": 0.3358114742600645, "grad_norm": 0.5713987946510315, "learning_rate": 4.44031420956656e-05, "loss": 0.0554, "step": 11970 }, { "epoch": 0.3360920185159209, "grad_norm": 0.21890589594841003, "learning_rate": 4.4398466358067985e-05, "loss": 0.0351, "step": 11980 }, { "epoch": 0.33637256277177724, "grad_norm": 2.022099018096924, "learning_rate": 4.4393790620470385e-05, "loss": 0.0264, "step": 11990 }, { "epoch": 0.3366531070276336, "grad_norm": 0.28418081998825073, "learning_rate": 4.438911488287277e-05, "loss": 0.0419, "step": 12000 }, { "epoch": 0.33693365128348995, "grad_norm": 0.05412141978740692, "learning_rate": 4.438443914527517e-05, "loss": 0.0354, "step": 12010 }, { "epoch": 0.33721419553934634, "grad_norm": 0.035651206970214844, "learning_rate": 4.4379763407677564e-05, "loss": 0.015, "step": 12020 }, { "epoch": 0.33749473979520267, "grad_norm": 0.03530125692486763, "learning_rate": 4.437508767007996e-05, "loss": 0.0151, "step": 12030 }, { "epoch": 0.33777528405105905, "grad_norm": 0.0335795022547245, "learning_rate": 4.437041193248235e-05, "loss": 0.0306, "step": 12040 }, { "epoch": 0.33805582830691544, "grad_norm": 0.3052193820476532, "learning_rate": 4.4365736194884744e-05, "loss": 0.053, "step": 12050 }, { "epoch": 0.33833637256277177, "grad_norm": 0.4238463044166565, "learning_rate": 4.436106045728714e-05, "loss": 0.0208, "step": 12060 }, { "epoch": 0.33861691681862816, "grad_norm": 0.08849052339792252, "learning_rate": 4.435638471968953e-05, "loss": 0.0614, "step": 12070 }, { "epoch": 0.3388974610744845, "grad_norm": 0.3051941394805908, "learning_rate": 4.435170898209193e-05, "loss": 0.0309, "step": 12080 }, { "epoch": 0.33917800533034087, "grad_norm": 0.3268875777721405, "learning_rate": 4.434703324449432e-05, "loss": 0.0747, "step": 12090 }, { "epoch": 0.3394585495861972, "grad_norm": 0.43353646993637085, "learning_rate": 4.4342357506896716e-05, "loss": 0.0524, "step": 12100 }, { "epoch": 0.3397390938420536, "grad_norm": 0.3359992504119873, "learning_rate": 4.433768176929911e-05, "loss": 0.0208, "step": 12110 }, { "epoch": 0.34001963809790997, "grad_norm": 0.050282739102840424, "learning_rate": 4.43330060317015e-05, "loss": 0.0193, "step": 12120 }, { "epoch": 0.3403001823537663, "grad_norm": 1.7415283918380737, "learning_rate": 4.4328330294103896e-05, "loss": 0.0481, "step": 12130 }, { "epoch": 0.3405807266096227, "grad_norm": 3.7176167964935303, "learning_rate": 4.432365455650629e-05, "loss": 0.0458, "step": 12140 }, { "epoch": 0.340861270865479, "grad_norm": 9.670089721679688, "learning_rate": 4.431897881890868e-05, "loss": 0.021, "step": 12150 }, { "epoch": 0.3411418151213354, "grad_norm": 0.05054955184459686, "learning_rate": 4.431430308131108e-05, "loss": 0.0123, "step": 12160 }, { "epoch": 0.34142235937719173, "grad_norm": 0.0422021821141243, "learning_rate": 4.4309627343713475e-05, "loss": 0.0241, "step": 12170 }, { "epoch": 0.3417029036330481, "grad_norm": 0.05054917186498642, "learning_rate": 4.430495160611587e-05, "loss": 0.0552, "step": 12180 }, { "epoch": 0.3419834478889045, "grad_norm": 0.08671228587627411, "learning_rate": 4.430027586851826e-05, "loss": 0.0323, "step": 12190 }, { "epoch": 0.34226399214476083, "grad_norm": 0.8885749578475952, "learning_rate": 4.4295600130920655e-05, "loss": 0.0525, "step": 12200 }, { "epoch": 0.3425445364006172, "grad_norm": 0.18224357068538666, "learning_rate": 4.429092439332305e-05, "loss": 0.0092, "step": 12210 }, { "epoch": 0.34282508065647355, "grad_norm": 7.574018955230713, "learning_rate": 4.428624865572544e-05, "loss": 0.0429, "step": 12220 }, { "epoch": 0.34310562491232993, "grad_norm": 1.3412939310073853, "learning_rate": 4.428157291812784e-05, "loss": 0.0237, "step": 12230 }, { "epoch": 0.34338616916818626, "grad_norm": 0.0665854886174202, "learning_rate": 4.427689718053023e-05, "loss": 0.0286, "step": 12240 }, { "epoch": 0.34366671342404265, "grad_norm": 0.09222022444009781, "learning_rate": 4.427222144293263e-05, "loss": 0.0286, "step": 12250 }, { "epoch": 0.343947257679899, "grad_norm": 0.3606748878955841, "learning_rate": 4.426754570533502e-05, "loss": 0.0121, "step": 12260 }, { "epoch": 0.34422780193575536, "grad_norm": 0.042900461703538895, "learning_rate": 4.4262869967737414e-05, "loss": 0.0319, "step": 12270 }, { "epoch": 0.34450834619161175, "grad_norm": 0.19270245730876923, "learning_rate": 4.425819423013981e-05, "loss": 0.0352, "step": 12280 }, { "epoch": 0.3447888904474681, "grad_norm": 0.18774625658988953, "learning_rate": 4.42535184925422e-05, "loss": 0.0275, "step": 12290 }, { "epoch": 0.34506943470332446, "grad_norm": 0.33066266775131226, "learning_rate": 4.42488427549446e-05, "loss": 0.0312, "step": 12300 }, { "epoch": 0.3453499789591808, "grad_norm": 0.2565407454967499, "learning_rate": 4.4244167017346986e-05, "loss": 0.028, "step": 12310 }, { "epoch": 0.3456305232150372, "grad_norm": 0.07868780940771103, "learning_rate": 4.4239491279749386e-05, "loss": 0.0567, "step": 12320 }, { "epoch": 0.3459110674708935, "grad_norm": 0.9614217281341553, "learning_rate": 4.423481554215177e-05, "loss": 0.0439, "step": 12330 }, { "epoch": 0.3461916117267499, "grad_norm": 0.456121563911438, "learning_rate": 4.423013980455417e-05, "loss": 0.0281, "step": 12340 }, { "epoch": 0.3464721559826063, "grad_norm": 0.10862316936254501, "learning_rate": 4.4225464066956566e-05, "loss": 0.0135, "step": 12350 }, { "epoch": 0.3467527002384626, "grad_norm": 0.13541866838932037, "learning_rate": 4.422078832935896e-05, "loss": 0.0466, "step": 12360 }, { "epoch": 0.347033244494319, "grad_norm": 1.0717734098434448, "learning_rate": 4.421611259176135e-05, "loss": 0.022, "step": 12370 }, { "epoch": 0.3473137887501753, "grad_norm": 0.7362959980964661, "learning_rate": 4.4211436854163745e-05, "loss": 0.0451, "step": 12380 }, { "epoch": 0.3475943330060317, "grad_norm": 0.06724483519792557, "learning_rate": 4.4206761116566145e-05, "loss": 0.0165, "step": 12390 }, { "epoch": 0.34787487726188804, "grad_norm": 0.781307578086853, "learning_rate": 4.420208537896853e-05, "loss": 0.0105, "step": 12400 }, { "epoch": 0.3481554215177444, "grad_norm": 0.6962375640869141, "learning_rate": 4.419740964137093e-05, "loss": 0.0466, "step": 12410 }, { "epoch": 0.3484359657736008, "grad_norm": 0.0777968317270279, "learning_rate": 4.419273390377332e-05, "loss": 0.0517, "step": 12420 }, { "epoch": 0.34871651002945714, "grad_norm": 0.08007251471281052, "learning_rate": 4.418805816617572e-05, "loss": 0.0588, "step": 12430 }, { "epoch": 0.3489970542853135, "grad_norm": 2.256601572036743, "learning_rate": 4.418338242857811e-05, "loss": 0.0787, "step": 12440 }, { "epoch": 0.34927759854116985, "grad_norm": 0.46827468276023865, "learning_rate": 4.4178706690980504e-05, "loss": 0.0447, "step": 12450 }, { "epoch": 0.34955814279702624, "grad_norm": 0.1295192837715149, "learning_rate": 4.41740309533829e-05, "loss": 0.0133, "step": 12460 }, { "epoch": 0.34983868705288257, "grad_norm": 0.46586573123931885, "learning_rate": 4.416935521578529e-05, "loss": 0.0235, "step": 12470 }, { "epoch": 0.35011923130873895, "grad_norm": 0.07293898612260818, "learning_rate": 4.416467947818769e-05, "loss": 0.0092, "step": 12480 }, { "epoch": 0.35039977556459534, "grad_norm": 0.4560152590274811, "learning_rate": 4.4160003740590076e-05, "loss": 0.0124, "step": 12490 }, { "epoch": 0.35068031982045167, "grad_norm": 0.2767658233642578, "learning_rate": 4.4155328002992476e-05, "loss": 0.0584, "step": 12500 }, { "epoch": 0.35096086407630805, "grad_norm": 0.28386837244033813, "learning_rate": 4.415065226539487e-05, "loss": 0.0405, "step": 12510 }, { "epoch": 0.3512414083321644, "grad_norm": 0.9322376251220703, "learning_rate": 4.414597652779726e-05, "loss": 0.0448, "step": 12520 }, { "epoch": 0.35152195258802077, "grad_norm": 0.12581001222133636, "learning_rate": 4.4141300790199656e-05, "loss": 0.0062, "step": 12530 }, { "epoch": 0.3518024968438771, "grad_norm": 3.9783518314361572, "learning_rate": 4.413662505260205e-05, "loss": 0.0573, "step": 12540 }, { "epoch": 0.3520830410997335, "grad_norm": 0.613832950592041, "learning_rate": 4.413194931500444e-05, "loss": 0.0886, "step": 12550 }, { "epoch": 0.35236358535558987, "grad_norm": 0.13585998117923737, "learning_rate": 4.4127273577406835e-05, "loss": 0.0125, "step": 12560 }, { "epoch": 0.3526441296114462, "grad_norm": 0.31029826402664185, "learning_rate": 4.4122597839809235e-05, "loss": 0.0535, "step": 12570 }, { "epoch": 0.3529246738673026, "grad_norm": 2.831186532974243, "learning_rate": 4.411792210221163e-05, "loss": 0.0306, "step": 12580 }, { "epoch": 0.3532052181231589, "grad_norm": 0.1555139422416687, "learning_rate": 4.411324636461402e-05, "loss": 0.0285, "step": 12590 }, { "epoch": 0.3534857623790153, "grad_norm": 0.910742998123169, "learning_rate": 4.4108570627016415e-05, "loss": 0.0266, "step": 12600 }, { "epoch": 0.35376630663487163, "grad_norm": 0.4598408639431, "learning_rate": 4.410389488941881e-05, "loss": 0.013, "step": 12610 }, { "epoch": 0.354046850890728, "grad_norm": 0.026040801778435707, "learning_rate": 4.40992191518212e-05, "loss": 0.036, "step": 12620 }, { "epoch": 0.3543273951465844, "grad_norm": 0.4776458442211151, "learning_rate": 4.4094543414223594e-05, "loss": 0.0354, "step": 12630 }, { "epoch": 0.35460793940244073, "grad_norm": 0.05029388144612312, "learning_rate": 4.408986767662599e-05, "loss": 0.0104, "step": 12640 }, { "epoch": 0.3548884836582971, "grad_norm": 0.24261340498924255, "learning_rate": 4.408519193902839e-05, "loss": 0.0225, "step": 12650 }, { "epoch": 0.35516902791415345, "grad_norm": 3.6779181957244873, "learning_rate": 4.408051620143078e-05, "loss": 0.0143, "step": 12660 }, { "epoch": 0.35544957217000983, "grad_norm": 3.4778430461883545, "learning_rate": 4.4075840463833174e-05, "loss": 0.0213, "step": 12670 }, { "epoch": 0.35573011642586616, "grad_norm": 1.4785399436950684, "learning_rate": 4.407116472623557e-05, "loss": 0.0576, "step": 12680 }, { "epoch": 0.35601066068172255, "grad_norm": 0.4664894938468933, "learning_rate": 4.406648898863796e-05, "loss": 0.0218, "step": 12690 }, { "epoch": 0.3562912049375789, "grad_norm": 0.5503578186035156, "learning_rate": 4.406181325104035e-05, "loss": 0.0152, "step": 12700 }, { "epoch": 0.35657174919343526, "grad_norm": 2.4083175659179688, "learning_rate": 4.4057137513442746e-05, "loss": 0.0441, "step": 12710 }, { "epoch": 0.35685229344929165, "grad_norm": 1.1971263885498047, "learning_rate": 4.4052461775845146e-05, "loss": 0.0373, "step": 12720 }, { "epoch": 0.357132837705148, "grad_norm": 0.44552767276763916, "learning_rate": 4.404778603824753e-05, "loss": 0.0464, "step": 12730 }, { "epoch": 0.35741338196100436, "grad_norm": 0.42653772234916687, "learning_rate": 4.404311030064993e-05, "loss": 0.0266, "step": 12740 }, { "epoch": 0.3576939262168607, "grad_norm": 0.3109067380428314, "learning_rate": 4.403843456305232e-05, "loss": 0.0165, "step": 12750 }, { "epoch": 0.3579744704727171, "grad_norm": 5.099842548370361, "learning_rate": 4.403375882545472e-05, "loss": 0.0294, "step": 12760 }, { "epoch": 0.3582550147285734, "grad_norm": 1.645293951034546, "learning_rate": 4.402908308785711e-05, "loss": 0.0472, "step": 12770 }, { "epoch": 0.3585355589844298, "grad_norm": 0.1678936630487442, "learning_rate": 4.4024407350259505e-05, "loss": 0.0232, "step": 12780 }, { "epoch": 0.3588161032402862, "grad_norm": 1.7252060174942017, "learning_rate": 4.4019731612661905e-05, "loss": 0.0264, "step": 12790 }, { "epoch": 0.3590966474961425, "grad_norm": 0.1917153149843216, "learning_rate": 4.401505587506429e-05, "loss": 0.0181, "step": 12800 }, { "epoch": 0.3593771917519989, "grad_norm": 4.758476734161377, "learning_rate": 4.401038013746669e-05, "loss": 0.028, "step": 12810 }, { "epoch": 0.3596577360078552, "grad_norm": 0.3095281422138214, "learning_rate": 4.400570439986908e-05, "loss": 0.027, "step": 12820 }, { "epoch": 0.3599382802637116, "grad_norm": 0.2171238213777542, "learning_rate": 4.400102866227148e-05, "loss": 0.0129, "step": 12830 }, { "epoch": 0.36021882451956794, "grad_norm": 0.02665984071791172, "learning_rate": 4.3996352924673864e-05, "loss": 0.0173, "step": 12840 }, { "epoch": 0.3604993687754243, "grad_norm": 0.04372847452759743, "learning_rate": 4.3991677187076264e-05, "loss": 0.0221, "step": 12850 }, { "epoch": 0.3607799130312807, "grad_norm": 19.78094482421875, "learning_rate": 4.398700144947866e-05, "loss": 0.0254, "step": 12860 }, { "epoch": 0.36106045728713704, "grad_norm": 0.5027284622192383, "learning_rate": 4.398232571188105e-05, "loss": 0.0234, "step": 12870 }, { "epoch": 0.3613410015429934, "grad_norm": 0.0827377438545227, "learning_rate": 4.397764997428345e-05, "loss": 0.0386, "step": 12880 }, { "epoch": 0.36162154579884975, "grad_norm": 0.3021044433116913, "learning_rate": 4.3972974236685837e-05, "loss": 0.0197, "step": 12890 }, { "epoch": 0.36190209005470614, "grad_norm": 0.205461323261261, "learning_rate": 4.3968298499088237e-05, "loss": 0.0276, "step": 12900 }, { "epoch": 0.36218263431056247, "grad_norm": 0.025400608777999878, "learning_rate": 4.396362276149062e-05, "loss": 0.0255, "step": 12910 }, { "epoch": 0.36246317856641885, "grad_norm": 1.6057789325714111, "learning_rate": 4.395894702389302e-05, "loss": 0.0145, "step": 12920 }, { "epoch": 0.36274372282227524, "grad_norm": 0.6279954314231873, "learning_rate": 4.3954271286295416e-05, "loss": 0.0321, "step": 12930 }, { "epoch": 0.36302426707813157, "grad_norm": 1.5173652172088623, "learning_rate": 4.394959554869781e-05, "loss": 0.0313, "step": 12940 }, { "epoch": 0.36330481133398795, "grad_norm": 0.07000196725130081, "learning_rate": 4.39449198111002e-05, "loss": 0.0343, "step": 12950 }, { "epoch": 0.3635853555898443, "grad_norm": 0.34873461723327637, "learning_rate": 4.3940244073502595e-05, "loss": 0.0315, "step": 12960 }, { "epoch": 0.36386589984570067, "grad_norm": 1.6658135652542114, "learning_rate": 4.393556833590499e-05, "loss": 0.026, "step": 12970 }, { "epoch": 0.364146444101557, "grad_norm": 0.943917453289032, "learning_rate": 4.393089259830738e-05, "loss": 0.0052, "step": 12980 }, { "epoch": 0.3644269883574134, "grad_norm": 0.13365350663661957, "learning_rate": 4.392621686070978e-05, "loss": 0.0239, "step": 12990 }, { "epoch": 0.36470753261326977, "grad_norm": 0.44428759813308716, "learning_rate": 4.3921541123112175e-05, "loss": 0.0183, "step": 13000 }, { "epoch": 0.3649880768691261, "grad_norm": 0.1382153034210205, "learning_rate": 4.391686538551457e-05, "loss": 0.0529, "step": 13010 }, { "epoch": 0.3652686211249825, "grad_norm": 0.028226161375641823, "learning_rate": 4.391218964791696e-05, "loss": 0.0182, "step": 13020 }, { "epoch": 0.3655491653808388, "grad_norm": 0.2208503782749176, "learning_rate": 4.3907513910319354e-05, "loss": 0.0377, "step": 13030 }, { "epoch": 0.3658297096366952, "grad_norm": 0.10205750912427902, "learning_rate": 4.390283817272175e-05, "loss": 0.0282, "step": 13040 }, { "epoch": 0.36611025389255153, "grad_norm": 0.467361181974411, "learning_rate": 4.389816243512414e-05, "loss": 0.0465, "step": 13050 }, { "epoch": 0.3663907981484079, "grad_norm": 0.32740816473960876, "learning_rate": 4.3893486697526534e-05, "loss": 0.027, "step": 13060 }, { "epoch": 0.3666713424042643, "grad_norm": 0.03508472070097923, "learning_rate": 4.3888810959928934e-05, "loss": 0.0335, "step": 13070 }, { "epoch": 0.36695188666012063, "grad_norm": 0.08461808413267136, "learning_rate": 4.388413522233133e-05, "loss": 0.0212, "step": 13080 }, { "epoch": 0.367232430915977, "grad_norm": 0.026610156521201134, "learning_rate": 4.387945948473372e-05, "loss": 0.0481, "step": 13090 }, { "epoch": 0.36751297517183334, "grad_norm": 0.6342524290084839, "learning_rate": 4.387478374713611e-05, "loss": 0.0585, "step": 13100 }, { "epoch": 0.36779351942768973, "grad_norm": 3.702859401702881, "learning_rate": 4.3870108009538506e-05, "loss": 0.0395, "step": 13110 }, { "epoch": 0.36807406368354606, "grad_norm": 0.24608348309993744, "learning_rate": 4.38654322719409e-05, "loss": 0.0439, "step": 13120 }, { "epoch": 0.36835460793940245, "grad_norm": 0.8516562581062317, "learning_rate": 4.386075653434329e-05, "loss": 0.0636, "step": 13130 }, { "epoch": 0.3686351521952588, "grad_norm": 0.37892064452171326, "learning_rate": 4.385608079674569e-05, "loss": 0.0138, "step": 13140 }, { "epoch": 0.36891569645111516, "grad_norm": 4.431915283203125, "learning_rate": 4.385140505914808e-05, "loss": 0.0369, "step": 13150 }, { "epoch": 0.36919624070697155, "grad_norm": 0.23762166500091553, "learning_rate": 4.384672932155048e-05, "loss": 0.0386, "step": 13160 }, { "epoch": 0.3694767849628279, "grad_norm": 0.955282986164093, "learning_rate": 4.384205358395287e-05, "loss": 0.0425, "step": 13170 }, { "epoch": 0.36975732921868426, "grad_norm": 0.47761473059654236, "learning_rate": 4.3837377846355265e-05, "loss": 0.0527, "step": 13180 }, { "epoch": 0.3700378734745406, "grad_norm": 0.3455933630466461, "learning_rate": 4.383270210875766e-05, "loss": 0.0186, "step": 13190 }, { "epoch": 0.370318417730397, "grad_norm": 0.7860179543495178, "learning_rate": 4.382802637116005e-05, "loss": 0.0216, "step": 13200 }, { "epoch": 0.3705989619862533, "grad_norm": 0.024545278400182724, "learning_rate": 4.382335063356245e-05, "loss": 0.036, "step": 13210 }, { "epoch": 0.3708795062421097, "grad_norm": 0.02267581596970558, "learning_rate": 4.381867489596484e-05, "loss": 0.0254, "step": 13220 }, { "epoch": 0.3711600504979661, "grad_norm": 0.2665846049785614, "learning_rate": 4.381399915836724e-05, "loss": 0.0116, "step": 13230 }, { "epoch": 0.3714405947538224, "grad_norm": 0.07397466152906418, "learning_rate": 4.3809323420769624e-05, "loss": 0.0393, "step": 13240 }, { "epoch": 0.3717211390096788, "grad_norm": 0.08274998515844345, "learning_rate": 4.3804647683172024e-05, "loss": 0.0685, "step": 13250 }, { "epoch": 0.3720016832655351, "grad_norm": 0.07083052396774292, "learning_rate": 4.379997194557442e-05, "loss": 0.0233, "step": 13260 }, { "epoch": 0.3722822275213915, "grad_norm": 0.5818612575531006, "learning_rate": 4.379529620797681e-05, "loss": 0.0444, "step": 13270 }, { "epoch": 0.37256277177724784, "grad_norm": 0.07642857730388641, "learning_rate": 4.3790620470379204e-05, "loss": 0.0419, "step": 13280 }, { "epoch": 0.3728433160331042, "grad_norm": 0.4148213267326355, "learning_rate": 4.37859447327816e-05, "loss": 0.016, "step": 13290 }, { "epoch": 0.3731238602889606, "grad_norm": 1.1698962450027466, "learning_rate": 4.3781268995183997e-05, "loss": 0.0088, "step": 13300 }, { "epoch": 0.37340440454481694, "grad_norm": 3.038109302520752, "learning_rate": 4.377659325758638e-05, "loss": 0.0514, "step": 13310 }, { "epoch": 0.3736849488006733, "grad_norm": 0.07528307288885117, "learning_rate": 4.377191751998878e-05, "loss": 0.0204, "step": 13320 }, { "epoch": 0.37396549305652965, "grad_norm": 1.1732488870620728, "learning_rate": 4.376724178239117e-05, "loss": 0.0281, "step": 13330 }, { "epoch": 0.37424603731238604, "grad_norm": 5.806952953338623, "learning_rate": 4.376256604479357e-05, "loss": 0.0312, "step": 13340 }, { "epoch": 0.37452658156824237, "grad_norm": 0.07316266000270844, "learning_rate": 4.375789030719596e-05, "loss": 0.0404, "step": 13350 }, { "epoch": 0.37480712582409875, "grad_norm": 0.19125762581825256, "learning_rate": 4.3753214569598356e-05, "loss": 0.0359, "step": 13360 }, { "epoch": 0.37508767007995514, "grad_norm": 0.6197768449783325, "learning_rate": 4.374853883200075e-05, "loss": 0.0363, "step": 13370 }, { "epoch": 0.37536821433581147, "grad_norm": 0.10674238204956055, "learning_rate": 4.374386309440314e-05, "loss": 0.0353, "step": 13380 }, { "epoch": 0.37564875859166785, "grad_norm": 0.6289215087890625, "learning_rate": 4.373918735680554e-05, "loss": 0.0461, "step": 13390 }, { "epoch": 0.3759293028475242, "grad_norm": 0.3856453597545624, "learning_rate": 4.373451161920793e-05, "loss": 0.0494, "step": 13400 }, { "epoch": 0.37620984710338057, "grad_norm": 0.16540098190307617, "learning_rate": 4.372983588161033e-05, "loss": 0.0464, "step": 13410 }, { "epoch": 0.3764903913592369, "grad_norm": 0.2020624279975891, "learning_rate": 4.372516014401272e-05, "loss": 0.0294, "step": 13420 }, { "epoch": 0.3767709356150933, "grad_norm": 0.0581364780664444, "learning_rate": 4.3720484406415114e-05, "loss": 0.0226, "step": 13430 }, { "epoch": 0.37705147987094967, "grad_norm": 0.24021989107131958, "learning_rate": 4.371580866881751e-05, "loss": 0.0437, "step": 13440 }, { "epoch": 0.377332024126806, "grad_norm": 0.9546102285385132, "learning_rate": 4.37111329312199e-05, "loss": 0.0183, "step": 13450 }, { "epoch": 0.3776125683826624, "grad_norm": 0.5476446747779846, "learning_rate": 4.3706457193622294e-05, "loss": 0.0187, "step": 13460 }, { "epoch": 0.3778931126385187, "grad_norm": 0.5648765563964844, "learning_rate": 4.370178145602469e-05, "loss": 0.015, "step": 13470 }, { "epoch": 0.3781736568943751, "grad_norm": 0.3633228838443756, "learning_rate": 4.369710571842709e-05, "loss": 0.0436, "step": 13480 }, { "epoch": 0.37845420115023143, "grad_norm": 0.46841081976890564, "learning_rate": 4.369242998082948e-05, "loss": 0.0424, "step": 13490 }, { "epoch": 0.3787347454060878, "grad_norm": 0.780561089515686, "learning_rate": 4.368775424323187e-05, "loss": 0.0309, "step": 13500 }, { "epoch": 0.3790152896619442, "grad_norm": 1.0233129262924194, "learning_rate": 4.3683078505634266e-05, "loss": 0.0718, "step": 13510 }, { "epoch": 0.37929583391780053, "grad_norm": 0.14187777042388916, "learning_rate": 4.367840276803666e-05, "loss": 0.023, "step": 13520 }, { "epoch": 0.3795763781736569, "grad_norm": 0.22761359810829163, "learning_rate": 4.367372703043905e-05, "loss": 0.0285, "step": 13530 }, { "epoch": 0.37985692242951324, "grad_norm": 0.13696548342704773, "learning_rate": 4.3669051292841446e-05, "loss": 0.0253, "step": 13540 }, { "epoch": 0.38013746668536963, "grad_norm": 0.13248471915721893, "learning_rate": 4.366437555524384e-05, "loss": 0.047, "step": 13550 }, { "epoch": 0.38041801094122596, "grad_norm": 0.2635922431945801, "learning_rate": 4.365969981764624e-05, "loss": 0.0085, "step": 13560 }, { "epoch": 0.38069855519708234, "grad_norm": 0.7861345410346985, "learning_rate": 4.365502408004863e-05, "loss": 0.0195, "step": 13570 }, { "epoch": 0.3809790994529387, "grad_norm": 1.1126697063446045, "learning_rate": 4.3650348342451025e-05, "loss": 0.0555, "step": 13580 }, { "epoch": 0.38125964370879506, "grad_norm": 0.3643365800380707, "learning_rate": 4.364567260485342e-05, "loss": 0.018, "step": 13590 }, { "epoch": 0.38154018796465144, "grad_norm": 0.06483574211597443, "learning_rate": 4.364099686725581e-05, "loss": 0.0078, "step": 13600 }, { "epoch": 0.3818207322205078, "grad_norm": 0.07161064445972443, "learning_rate": 4.3636321129658205e-05, "loss": 0.0269, "step": 13610 }, { "epoch": 0.38210127647636416, "grad_norm": 10.072480201721191, "learning_rate": 4.36316453920606e-05, "loss": 0.0386, "step": 13620 }, { "epoch": 0.3823818207322205, "grad_norm": 0.5944436192512512, "learning_rate": 4.3626969654463e-05, "loss": 0.02, "step": 13630 }, { "epoch": 0.3826623649880769, "grad_norm": 0.3119417726993561, "learning_rate": 4.3622293916865384e-05, "loss": 0.0437, "step": 13640 }, { "epoch": 0.3829429092439332, "grad_norm": 0.0632445439696312, "learning_rate": 4.3617618179267784e-05, "loss": 0.0345, "step": 13650 }, { "epoch": 0.3832234534997896, "grad_norm": 0.8965043425559998, "learning_rate": 4.361294244167017e-05, "loss": 0.0161, "step": 13660 }, { "epoch": 0.383503997755646, "grad_norm": 0.46853771805763245, "learning_rate": 4.360826670407257e-05, "loss": 0.0522, "step": 13670 }, { "epoch": 0.3837845420115023, "grad_norm": 0.22920112311840057, "learning_rate": 4.3603590966474964e-05, "loss": 0.0597, "step": 13680 }, { "epoch": 0.3840650862673587, "grad_norm": 0.14252960681915283, "learning_rate": 4.359891522887736e-05, "loss": 0.0603, "step": 13690 }, { "epoch": 0.384345630523215, "grad_norm": 0.49566665291786194, "learning_rate": 4.359423949127976e-05, "loss": 0.0145, "step": 13700 }, { "epoch": 0.3846261747790714, "grad_norm": 0.2550899386405945, "learning_rate": 4.358956375368214e-05, "loss": 0.0645, "step": 13710 }, { "epoch": 0.38490671903492774, "grad_norm": 0.0705445259809494, "learning_rate": 4.358488801608454e-05, "loss": 0.0143, "step": 13720 }, { "epoch": 0.3851872632907841, "grad_norm": 0.06925869733095169, "learning_rate": 4.358021227848693e-05, "loss": 0.0233, "step": 13730 }, { "epoch": 0.3854678075466405, "grad_norm": 1.0498765707015991, "learning_rate": 4.357553654088933e-05, "loss": 0.0725, "step": 13740 }, { "epoch": 0.38574835180249684, "grad_norm": 0.37432846426963806, "learning_rate": 4.3570860803291716e-05, "loss": 0.04, "step": 13750 }, { "epoch": 0.3860288960583532, "grad_norm": 0.12108743190765381, "learning_rate": 4.3566185065694116e-05, "loss": 0.0292, "step": 13760 }, { "epoch": 0.38630944031420955, "grad_norm": 0.48337435722351074, "learning_rate": 4.356150932809651e-05, "loss": 0.0378, "step": 13770 }, { "epoch": 0.38658998457006594, "grad_norm": 0.03039627894759178, "learning_rate": 4.35568335904989e-05, "loss": 0.0185, "step": 13780 }, { "epoch": 0.38687052882592227, "grad_norm": 0.3489627540111542, "learning_rate": 4.35521578529013e-05, "loss": 0.0131, "step": 13790 }, { "epoch": 0.38715107308177865, "grad_norm": 0.039507102221250534, "learning_rate": 4.354748211530369e-05, "loss": 0.03, "step": 13800 }, { "epoch": 0.38743161733763504, "grad_norm": 1.2696653604507446, "learning_rate": 4.354280637770609e-05, "loss": 0.0212, "step": 13810 }, { "epoch": 0.38771216159349137, "grad_norm": 0.07573480159044266, "learning_rate": 4.3538130640108475e-05, "loss": 0.0238, "step": 13820 }, { "epoch": 0.38799270584934775, "grad_norm": 0.19145093858242035, "learning_rate": 4.3533454902510874e-05, "loss": 0.023, "step": 13830 }, { "epoch": 0.3882732501052041, "grad_norm": 0.7786852717399597, "learning_rate": 4.352877916491327e-05, "loss": 0.05, "step": 13840 }, { "epoch": 0.38855379436106047, "grad_norm": 0.8379610180854797, "learning_rate": 4.352410342731566e-05, "loss": 0.0256, "step": 13850 }, { "epoch": 0.3888343386169168, "grad_norm": 0.4604332447052002, "learning_rate": 4.3519427689718054e-05, "loss": 0.0357, "step": 13860 }, { "epoch": 0.3891148828727732, "grad_norm": 1.9048043489456177, "learning_rate": 4.351475195212045e-05, "loss": 0.0494, "step": 13870 }, { "epoch": 0.38939542712862957, "grad_norm": 0.2742364704608917, "learning_rate": 4.351007621452284e-05, "loss": 0.0487, "step": 13880 }, { "epoch": 0.3896759713844859, "grad_norm": 0.2560631036758423, "learning_rate": 4.3505400476925233e-05, "loss": 0.0156, "step": 13890 }, { "epoch": 0.3899565156403423, "grad_norm": 0.5404646396636963, "learning_rate": 4.350072473932763e-05, "loss": 0.0385, "step": 13900 }, { "epoch": 0.3902370598961986, "grad_norm": 0.07305291295051575, "learning_rate": 4.3496049001730027e-05, "loss": 0.0288, "step": 13910 }, { "epoch": 0.390517604152055, "grad_norm": 0.11765086650848389, "learning_rate": 4.349137326413242e-05, "loss": 0.0369, "step": 13920 }, { "epoch": 0.3907981484079113, "grad_norm": 0.5487903952598572, "learning_rate": 4.348669752653481e-05, "loss": 0.0236, "step": 13930 }, { "epoch": 0.3910786926637677, "grad_norm": 0.7219108939170837, "learning_rate": 4.3482021788937206e-05, "loss": 0.0229, "step": 13940 }, { "epoch": 0.3913592369196241, "grad_norm": 1.5335205793380737, "learning_rate": 4.34773460513396e-05, "loss": 0.1027, "step": 13950 }, { "epoch": 0.39163978117548043, "grad_norm": 0.1518063098192215, "learning_rate": 4.347267031374199e-05, "loss": 0.0864, "step": 13960 }, { "epoch": 0.3919203254313368, "grad_norm": 3.1402108669281006, "learning_rate": 4.3467994576144385e-05, "loss": 0.0415, "step": 13970 }, { "epoch": 0.39220086968719314, "grad_norm": 0.5202327370643616, "learning_rate": 4.3463318838546785e-05, "loss": 0.0304, "step": 13980 }, { "epoch": 0.39248141394304953, "grad_norm": 0.05773229897022247, "learning_rate": 4.345864310094918e-05, "loss": 0.0212, "step": 13990 }, { "epoch": 0.39276195819890586, "grad_norm": 1.1100130081176758, "learning_rate": 4.345396736335157e-05, "loss": 0.0507, "step": 14000 }, { "epoch": 0.39304250245476224, "grad_norm": 1.6787406206130981, "learning_rate": 4.3449291625753965e-05, "loss": 0.0882, "step": 14010 }, { "epoch": 0.3933230467106186, "grad_norm": 0.1754007637500763, "learning_rate": 4.344461588815636e-05, "loss": 0.0194, "step": 14020 }, { "epoch": 0.39360359096647496, "grad_norm": 0.12770886719226837, "learning_rate": 4.343994015055875e-05, "loss": 0.0402, "step": 14030 }, { "epoch": 0.39388413522233134, "grad_norm": 0.42244189977645874, "learning_rate": 4.3435264412961144e-05, "loss": 0.0301, "step": 14040 }, { "epoch": 0.3941646794781877, "grad_norm": 0.22708222270011902, "learning_rate": 4.3430588675363544e-05, "loss": 0.0596, "step": 14050 }, { "epoch": 0.39444522373404406, "grad_norm": 0.5682022571563721, "learning_rate": 4.342591293776593e-05, "loss": 0.0413, "step": 14060 }, { "epoch": 0.3947257679899004, "grad_norm": 1.5572582483291626, "learning_rate": 4.342123720016833e-05, "loss": 0.0206, "step": 14070 }, { "epoch": 0.3950063122457568, "grad_norm": 0.29592910408973694, "learning_rate": 4.3416561462570724e-05, "loss": 0.0448, "step": 14080 }, { "epoch": 0.3952868565016131, "grad_norm": 0.17884668707847595, "learning_rate": 4.341188572497312e-05, "loss": 0.0643, "step": 14090 }, { "epoch": 0.3955674007574695, "grad_norm": 1.2275217771530151, "learning_rate": 4.340720998737551e-05, "loss": 0.0243, "step": 14100 }, { "epoch": 0.3958479450133259, "grad_norm": 0.1427169144153595, "learning_rate": 4.34025342497779e-05, "loss": 0.0338, "step": 14110 }, { "epoch": 0.3961284892691822, "grad_norm": 0.08354400843381882, "learning_rate": 4.33978585121803e-05, "loss": 0.0766, "step": 14120 }, { "epoch": 0.3964090335250386, "grad_norm": 0.032598234713077545, "learning_rate": 4.339318277458269e-05, "loss": 0.0326, "step": 14130 }, { "epoch": 0.3966895777808949, "grad_norm": 1.1594165563583374, "learning_rate": 4.338850703698509e-05, "loss": 0.0602, "step": 14140 }, { "epoch": 0.3969701220367513, "grad_norm": 1.927049160003662, "learning_rate": 4.3383831299387476e-05, "loss": 0.0716, "step": 14150 }, { "epoch": 0.39725066629260763, "grad_norm": 0.13254521787166595, "learning_rate": 4.3379155561789876e-05, "loss": 0.022, "step": 14160 }, { "epoch": 0.397531210548464, "grad_norm": 0.08947543799877167, "learning_rate": 4.337447982419227e-05, "loss": 0.0317, "step": 14170 }, { "epoch": 0.3978117548043204, "grad_norm": 0.0543549545109272, "learning_rate": 4.336980408659466e-05, "loss": 0.036, "step": 14180 }, { "epoch": 0.39809229906017674, "grad_norm": 0.05777794495224953, "learning_rate": 4.3365128348997055e-05, "loss": 0.0188, "step": 14190 }, { "epoch": 0.3983728433160331, "grad_norm": 0.6932314038276672, "learning_rate": 4.336045261139945e-05, "loss": 0.0279, "step": 14200 }, { "epoch": 0.39865338757188945, "grad_norm": 0.07317094504833221, "learning_rate": 4.335577687380185e-05, "loss": 0.0306, "step": 14210 }, { "epoch": 0.39893393182774584, "grad_norm": 0.10396334528923035, "learning_rate": 4.3351101136204235e-05, "loss": 0.0413, "step": 14220 }, { "epoch": 0.39921447608360217, "grad_norm": 0.23399154841899872, "learning_rate": 4.3346425398606635e-05, "loss": 0.0255, "step": 14230 }, { "epoch": 0.39949502033945855, "grad_norm": 0.38512226939201355, "learning_rate": 4.334174966100903e-05, "loss": 0.052, "step": 14240 }, { "epoch": 0.39977556459531494, "grad_norm": 0.05629371479153633, "learning_rate": 4.333707392341142e-05, "loss": 0.0117, "step": 14250 }, { "epoch": 0.40005610885117127, "grad_norm": 0.33564624190330505, "learning_rate": 4.3332398185813814e-05, "loss": 0.0509, "step": 14260 }, { "epoch": 0.40033665310702765, "grad_norm": 0.46235939860343933, "learning_rate": 4.332772244821621e-05, "loss": 0.0311, "step": 14270 }, { "epoch": 0.400617197362884, "grad_norm": 0.01711125485599041, "learning_rate": 4.33230467106186e-05, "loss": 0.0302, "step": 14280 }, { "epoch": 0.40089774161874037, "grad_norm": 0.30087271332740784, "learning_rate": 4.3318370973020994e-05, "loss": 0.0515, "step": 14290 }, { "epoch": 0.4011782858745967, "grad_norm": 0.41314440965652466, "learning_rate": 4.3313695235423393e-05, "loss": 0.0241, "step": 14300 }, { "epoch": 0.4014588301304531, "grad_norm": 0.06337013840675354, "learning_rate": 4.3309019497825787e-05, "loss": 0.0232, "step": 14310 }, { "epoch": 0.40173937438630947, "grad_norm": 0.32484862208366394, "learning_rate": 4.330434376022818e-05, "loss": 0.0566, "step": 14320 }, { "epoch": 0.4020199186421658, "grad_norm": 0.9340447783470154, "learning_rate": 4.329966802263057e-05, "loss": 0.0278, "step": 14330 }, { "epoch": 0.4023004628980222, "grad_norm": 0.26274749636650085, "learning_rate": 4.3294992285032966e-05, "loss": 0.0238, "step": 14340 }, { "epoch": 0.4025810071538785, "grad_norm": 0.7519007325172424, "learning_rate": 4.329031654743536e-05, "loss": 0.0216, "step": 14350 }, { "epoch": 0.4028615514097349, "grad_norm": 0.018814850598573685, "learning_rate": 4.328564080983775e-05, "loss": 0.021, "step": 14360 }, { "epoch": 0.4031420956655912, "grad_norm": 0.23208698630332947, "learning_rate": 4.3280965072240146e-05, "loss": 0.0287, "step": 14370 }, { "epoch": 0.4034226399214476, "grad_norm": 0.1713247448205948, "learning_rate": 4.3276289334642545e-05, "loss": 0.0299, "step": 14380 }, { "epoch": 0.40370318417730394, "grad_norm": 0.1157640889286995, "learning_rate": 4.327161359704494e-05, "loss": 0.0391, "step": 14390 }, { "epoch": 0.4039837284331603, "grad_norm": 0.5356809496879578, "learning_rate": 4.326693785944733e-05, "loss": 0.035, "step": 14400 }, { "epoch": 0.4042642726890167, "grad_norm": 0.11240236461162567, "learning_rate": 4.3262262121849725e-05, "loss": 0.0431, "step": 14410 }, { "epoch": 0.40454481694487304, "grad_norm": 0.06145598366856575, "learning_rate": 4.325758638425212e-05, "loss": 0.0207, "step": 14420 }, { "epoch": 0.40482536120072943, "grad_norm": 0.5510286092758179, "learning_rate": 4.325291064665451e-05, "loss": 0.0304, "step": 14430 }, { "epoch": 0.40510590545658576, "grad_norm": 0.05586526170372963, "learning_rate": 4.3248234909056904e-05, "loss": 0.0555, "step": 14440 }, { "epoch": 0.40538644971244214, "grad_norm": 0.3236068785190582, "learning_rate": 4.3243559171459304e-05, "loss": 0.0374, "step": 14450 }, { "epoch": 0.4056669939682985, "grad_norm": 0.04102804884314537, "learning_rate": 4.323888343386169e-05, "loss": 0.018, "step": 14460 }, { "epoch": 0.40594753822415486, "grad_norm": 0.43338266015052795, "learning_rate": 4.323420769626409e-05, "loss": 0.0452, "step": 14470 }, { "epoch": 0.40622808248001124, "grad_norm": 1.1338422298431396, "learning_rate": 4.3229531958666484e-05, "loss": 0.0581, "step": 14480 }, { "epoch": 0.4065086267358676, "grad_norm": 0.0229355338960886, "learning_rate": 4.322485622106888e-05, "loss": 0.0061, "step": 14490 }, { "epoch": 0.40678917099172396, "grad_norm": 0.05370306223630905, "learning_rate": 4.322018048347127e-05, "loss": 0.027, "step": 14500 }, { "epoch": 0.4070697152475803, "grad_norm": 0.032873332500457764, "learning_rate": 4.321550474587366e-05, "loss": 0.0309, "step": 14510 }, { "epoch": 0.4073502595034367, "grad_norm": 0.026929769665002823, "learning_rate": 4.321082900827606e-05, "loss": 0.0053, "step": 14520 }, { "epoch": 0.407630803759293, "grad_norm": 1.0514601469039917, "learning_rate": 4.320615327067845e-05, "loss": 0.0272, "step": 14530 }, { "epoch": 0.4079113480151494, "grad_norm": 0.9334607720375061, "learning_rate": 4.320147753308085e-05, "loss": 0.0373, "step": 14540 }, { "epoch": 0.4081918922710058, "grad_norm": 3.415823221206665, "learning_rate": 4.3196801795483236e-05, "loss": 0.0144, "step": 14550 }, { "epoch": 0.4084724365268621, "grad_norm": 5.856870651245117, "learning_rate": 4.3192126057885636e-05, "loss": 0.0207, "step": 14560 }, { "epoch": 0.4087529807827185, "grad_norm": 0.2960977852344513, "learning_rate": 4.318745032028802e-05, "loss": 0.0306, "step": 14570 }, { "epoch": 0.4090335250385748, "grad_norm": 0.04732128977775574, "learning_rate": 4.318277458269042e-05, "loss": 0.0102, "step": 14580 }, { "epoch": 0.4093140692944312, "grad_norm": 0.21067620813846588, "learning_rate": 4.3178098845092815e-05, "loss": 0.0076, "step": 14590 }, { "epoch": 0.40959461355028753, "grad_norm": 0.029671330004930496, "learning_rate": 4.317342310749521e-05, "loss": 0.0094, "step": 14600 }, { "epoch": 0.4098751578061439, "grad_norm": 0.8038507699966431, "learning_rate": 4.316874736989761e-05, "loss": 0.0594, "step": 14610 }, { "epoch": 0.4101557020620003, "grad_norm": 0.10976418852806091, "learning_rate": 4.3164071632299995e-05, "loss": 0.0535, "step": 14620 }, { "epoch": 0.41043624631785663, "grad_norm": 2.2363579273223877, "learning_rate": 4.3159395894702395e-05, "loss": 0.0379, "step": 14630 }, { "epoch": 0.410716790573713, "grad_norm": 0.06948670744895935, "learning_rate": 4.315472015710478e-05, "loss": 0.0333, "step": 14640 }, { "epoch": 0.41099733482956935, "grad_norm": 0.7393234968185425, "learning_rate": 4.315004441950718e-05, "loss": 0.0277, "step": 14650 }, { "epoch": 0.41127787908542573, "grad_norm": 3.222108840942383, "learning_rate": 4.3145368681909574e-05, "loss": 0.0241, "step": 14660 }, { "epoch": 0.41155842334128206, "grad_norm": 3.8794431686401367, "learning_rate": 4.314069294431197e-05, "loss": 0.0625, "step": 14670 }, { "epoch": 0.41183896759713845, "grad_norm": 3.687687397003174, "learning_rate": 4.313601720671436e-05, "loss": 0.092, "step": 14680 }, { "epoch": 0.41211951185299484, "grad_norm": 0.12551464140415192, "learning_rate": 4.3131341469116754e-05, "loss": 0.0356, "step": 14690 }, { "epoch": 0.41240005610885117, "grad_norm": 6.581370830535889, "learning_rate": 4.3126665731519154e-05, "loss": 0.0368, "step": 14700 }, { "epoch": 0.41268060036470755, "grad_norm": 0.9363254308700562, "learning_rate": 4.312198999392154e-05, "loss": 0.0334, "step": 14710 }, { "epoch": 0.4129611446205639, "grad_norm": 0.32192739844322205, "learning_rate": 4.311731425632394e-05, "loss": 0.0298, "step": 14720 }, { "epoch": 0.41324168887642027, "grad_norm": 0.35190048813819885, "learning_rate": 4.311263851872633e-05, "loss": 0.0578, "step": 14730 }, { "epoch": 0.4135222331322766, "grad_norm": 0.4447452425956726, "learning_rate": 4.3107962781128726e-05, "loss": 0.0251, "step": 14740 }, { "epoch": 0.413802777388133, "grad_norm": 0.0272963996976614, "learning_rate": 4.310328704353112e-05, "loss": 0.0072, "step": 14750 }, { "epoch": 0.41408332164398937, "grad_norm": 0.13107824325561523, "learning_rate": 4.309861130593351e-05, "loss": 0.0158, "step": 14760 }, { "epoch": 0.4143638658998457, "grad_norm": 0.39437294006347656, "learning_rate": 4.3093935568335906e-05, "loss": 0.012, "step": 14770 }, { "epoch": 0.4146444101557021, "grad_norm": 0.017383141443133354, "learning_rate": 4.30892598307383e-05, "loss": 0.0017, "step": 14780 }, { "epoch": 0.4149249544115584, "grad_norm": 5.862977504730225, "learning_rate": 4.308458409314069e-05, "loss": 0.0367, "step": 14790 }, { "epoch": 0.4152054986674148, "grad_norm": 0.21818865835666656, "learning_rate": 4.307990835554309e-05, "loss": 0.0241, "step": 14800 }, { "epoch": 0.4154860429232711, "grad_norm": 0.36026325821876526, "learning_rate": 4.3075232617945485e-05, "loss": 0.0693, "step": 14810 }, { "epoch": 0.4157665871791275, "grad_norm": 1.2595579624176025, "learning_rate": 4.307055688034788e-05, "loss": 0.0251, "step": 14820 }, { "epoch": 0.41604713143498384, "grad_norm": 0.6594622731208801, "learning_rate": 4.306588114275027e-05, "loss": 0.0173, "step": 14830 }, { "epoch": 0.4163276756908402, "grad_norm": 0.35436227917671204, "learning_rate": 4.3061205405152664e-05, "loss": 0.0287, "step": 14840 }, { "epoch": 0.4166082199466966, "grad_norm": 0.485017865896225, "learning_rate": 4.305652966755506e-05, "loss": 0.0406, "step": 14850 }, { "epoch": 0.41688876420255294, "grad_norm": 0.5207356214523315, "learning_rate": 4.305185392995745e-05, "loss": 0.0072, "step": 14860 }, { "epoch": 0.4171693084584093, "grad_norm": 0.13898347318172455, "learning_rate": 4.304717819235985e-05, "loss": 0.0348, "step": 14870 }, { "epoch": 0.41744985271426566, "grad_norm": 0.0636134222149849, "learning_rate": 4.304250245476224e-05, "loss": 0.0093, "step": 14880 }, { "epoch": 0.41773039697012204, "grad_norm": 0.4520607888698578, "learning_rate": 4.303782671716464e-05, "loss": 0.0668, "step": 14890 }, { "epoch": 0.41801094122597837, "grad_norm": 1.1253705024719238, "learning_rate": 4.303315097956703e-05, "loss": 0.0324, "step": 14900 }, { "epoch": 0.41829148548183476, "grad_norm": 0.046376846730709076, "learning_rate": 4.302847524196942e-05, "loss": 0.0092, "step": 14910 }, { "epoch": 0.41857202973769114, "grad_norm": 0.4110369384288788, "learning_rate": 4.3023799504371816e-05, "loss": 0.0129, "step": 14920 }, { "epoch": 0.4188525739935475, "grad_norm": 3.5499050617218018, "learning_rate": 4.301912376677421e-05, "loss": 0.0293, "step": 14930 }, { "epoch": 0.41913311824940386, "grad_norm": 0.07788801938295364, "learning_rate": 4.301444802917661e-05, "loss": 0.0111, "step": 14940 }, { "epoch": 0.4194136625052602, "grad_norm": 0.7538636922836304, "learning_rate": 4.3009772291578996e-05, "loss": 0.072, "step": 14950 }, { "epoch": 0.4196942067611166, "grad_norm": 0.0346502847969532, "learning_rate": 4.3005096553981396e-05, "loss": 0.0229, "step": 14960 }, { "epoch": 0.4199747510169729, "grad_norm": 0.09138436615467072, "learning_rate": 4.300042081638378e-05, "loss": 0.0324, "step": 14970 }, { "epoch": 0.4202552952728293, "grad_norm": 0.1685311198234558, "learning_rate": 4.299574507878618e-05, "loss": 0.0324, "step": 14980 }, { "epoch": 0.4205358395286857, "grad_norm": 0.14091050624847412, "learning_rate": 4.2991069341188575e-05, "loss": 0.0365, "step": 14990 }, { "epoch": 0.420816383784542, "grad_norm": 0.40736573934555054, "learning_rate": 4.298639360359097e-05, "loss": 0.0197, "step": 15000 }, { "epoch": 0.4210969280403984, "grad_norm": 0.02823065035045147, "learning_rate": 4.298171786599336e-05, "loss": 0.0121, "step": 15010 }, { "epoch": 0.4213774722962547, "grad_norm": 0.029093654826283455, "learning_rate": 4.2977042128395755e-05, "loss": 0.0423, "step": 15020 }, { "epoch": 0.4216580165521111, "grad_norm": 0.2008821666240692, "learning_rate": 4.2972366390798155e-05, "loss": 0.0122, "step": 15030 }, { "epoch": 0.42193856080796743, "grad_norm": 0.8263328075408936, "learning_rate": 4.296769065320054e-05, "loss": 0.0497, "step": 15040 }, { "epoch": 0.4222191050638238, "grad_norm": 0.01279241219162941, "learning_rate": 4.296301491560294e-05, "loss": 0.0213, "step": 15050 }, { "epoch": 0.4224996493196802, "grad_norm": 0.037039387971162796, "learning_rate": 4.295833917800533e-05, "loss": 0.0078, "step": 15060 }, { "epoch": 0.42278019357553653, "grad_norm": 1.9136275053024292, "learning_rate": 4.295366344040773e-05, "loss": 0.0909, "step": 15070 }, { "epoch": 0.4230607378313929, "grad_norm": 0.10316906869411469, "learning_rate": 4.294898770281012e-05, "loss": 0.0708, "step": 15080 }, { "epoch": 0.42334128208724925, "grad_norm": 0.14156180620193481, "learning_rate": 4.2944311965212514e-05, "loss": 0.0194, "step": 15090 }, { "epoch": 0.42362182634310563, "grad_norm": 0.1581171154975891, "learning_rate": 4.293963622761491e-05, "loss": 0.0236, "step": 15100 }, { "epoch": 0.42390237059896196, "grad_norm": 1.7021753787994385, "learning_rate": 4.29349604900173e-05, "loss": 0.0211, "step": 15110 }, { "epoch": 0.42418291485481835, "grad_norm": 0.08143515139818192, "learning_rate": 4.29302847524197e-05, "loss": 0.0247, "step": 15120 }, { "epoch": 0.42446345911067473, "grad_norm": 0.2551755905151367, "learning_rate": 4.2925609014822086e-05, "loss": 0.0071, "step": 15130 }, { "epoch": 0.42474400336653106, "grad_norm": 0.28798046708106995, "learning_rate": 4.2920933277224486e-05, "loss": 0.0396, "step": 15140 }, { "epoch": 0.42502454762238745, "grad_norm": 0.07206101715564728, "learning_rate": 4.291625753962688e-05, "loss": 0.0139, "step": 15150 }, { "epoch": 0.4253050918782438, "grad_norm": 0.046170346438884735, "learning_rate": 4.291158180202927e-05, "loss": 0.0743, "step": 15160 }, { "epoch": 0.42558563613410016, "grad_norm": 0.3012891113758087, "learning_rate": 4.2906906064431666e-05, "loss": 0.0567, "step": 15170 }, { "epoch": 0.4258661803899565, "grad_norm": 0.12217739969491959, "learning_rate": 4.290223032683406e-05, "loss": 0.0257, "step": 15180 }, { "epoch": 0.4261467246458129, "grad_norm": 1.9362841844558716, "learning_rate": 4.289755458923645e-05, "loss": 0.0838, "step": 15190 }, { "epoch": 0.42642726890166927, "grad_norm": 0.4009726345539093, "learning_rate": 4.2892878851638845e-05, "loss": 0.0481, "step": 15200 }, { "epoch": 0.4267078131575256, "grad_norm": 0.2278011441230774, "learning_rate": 4.2888203114041245e-05, "loss": 0.0487, "step": 15210 }, { "epoch": 0.426988357413382, "grad_norm": 0.8361111283302307, "learning_rate": 4.288352737644364e-05, "loss": 0.0331, "step": 15220 }, { "epoch": 0.4272689016692383, "grad_norm": 0.19043461978435516, "learning_rate": 4.287885163884603e-05, "loss": 0.0186, "step": 15230 }, { "epoch": 0.4275494459250947, "grad_norm": 0.04588304087519646, "learning_rate": 4.2874175901248425e-05, "loss": 0.0062, "step": 15240 }, { "epoch": 0.427829990180951, "grad_norm": 0.04926234856247902, "learning_rate": 4.286950016365082e-05, "loss": 0.0195, "step": 15250 }, { "epoch": 0.4281105344368074, "grad_norm": 1.0731192827224731, "learning_rate": 4.286482442605321e-05, "loss": 0.0363, "step": 15260 }, { "epoch": 0.42839107869266374, "grad_norm": 0.03558899462223053, "learning_rate": 4.2860148688455604e-05, "loss": 0.016, "step": 15270 }, { "epoch": 0.4286716229485201, "grad_norm": 0.5648701190948486, "learning_rate": 4.2855472950858e-05, "loss": 0.0279, "step": 15280 }, { "epoch": 0.4289521672043765, "grad_norm": 0.7525675892829895, "learning_rate": 4.28507972132604e-05, "loss": 0.0551, "step": 15290 }, { "epoch": 0.42923271146023284, "grad_norm": 1.092283010482788, "learning_rate": 4.284612147566279e-05, "loss": 0.0206, "step": 15300 }, { "epoch": 0.4295132557160892, "grad_norm": 0.06365705281496048, "learning_rate": 4.2841445738065183e-05, "loss": 0.0616, "step": 15310 }, { "epoch": 0.42979379997194556, "grad_norm": 4.167857646942139, "learning_rate": 4.2836770000467577e-05, "loss": 0.0415, "step": 15320 }, { "epoch": 0.43007434422780194, "grad_norm": 0.9551361799240112, "learning_rate": 4.283209426286997e-05, "loss": 0.0457, "step": 15330 }, { "epoch": 0.43035488848365827, "grad_norm": 3.3514387607574463, "learning_rate": 4.282741852527236e-05, "loss": 0.0513, "step": 15340 }, { "epoch": 0.43063543273951466, "grad_norm": 0.0676393210887909, "learning_rate": 4.2822742787674756e-05, "loss": 0.0178, "step": 15350 }, { "epoch": 0.43091597699537104, "grad_norm": 0.08345546573400497, "learning_rate": 4.2818067050077156e-05, "loss": 0.0131, "step": 15360 }, { "epoch": 0.43119652125122737, "grad_norm": 0.39428672194480896, "learning_rate": 4.281339131247954e-05, "loss": 0.0419, "step": 15370 }, { "epoch": 0.43147706550708376, "grad_norm": 0.8825334310531616, "learning_rate": 4.280871557488194e-05, "loss": 0.0326, "step": 15380 }, { "epoch": 0.4317576097629401, "grad_norm": 2.7089483737945557, "learning_rate": 4.2804039837284335e-05, "loss": 0.0433, "step": 15390 }, { "epoch": 0.43203815401879647, "grad_norm": 0.2768697440624237, "learning_rate": 4.279936409968673e-05, "loss": 0.021, "step": 15400 }, { "epoch": 0.4323186982746528, "grad_norm": 0.0736827403306961, "learning_rate": 4.279468836208912e-05, "loss": 0.0213, "step": 15410 }, { "epoch": 0.4325992425305092, "grad_norm": 4.0525288581848145, "learning_rate": 4.2790012624491515e-05, "loss": 0.0275, "step": 15420 }, { "epoch": 0.4328797867863656, "grad_norm": 0.04366090148687363, "learning_rate": 4.2785336886893915e-05, "loss": 0.0071, "step": 15430 }, { "epoch": 0.4331603310422219, "grad_norm": 0.03500070795416832, "learning_rate": 4.27806611492963e-05, "loss": 0.0206, "step": 15440 }, { "epoch": 0.4334408752980783, "grad_norm": 0.2471439242362976, "learning_rate": 4.27759854116987e-05, "loss": 0.0535, "step": 15450 }, { "epoch": 0.4337214195539346, "grad_norm": 0.02231910265982151, "learning_rate": 4.277130967410109e-05, "loss": 0.0186, "step": 15460 }, { "epoch": 0.434001963809791, "grad_norm": 0.4148167669773102, "learning_rate": 4.276663393650349e-05, "loss": 0.0591, "step": 15470 }, { "epoch": 0.43428250806564733, "grad_norm": 46.96342849731445, "learning_rate": 4.2761958198905874e-05, "loss": 0.0583, "step": 15480 }, { "epoch": 0.4345630523215037, "grad_norm": 0.19268977642059326, "learning_rate": 4.2757282461308274e-05, "loss": 0.0243, "step": 15490 }, { "epoch": 0.4348435965773601, "grad_norm": 0.03785302862524986, "learning_rate": 4.275260672371067e-05, "loss": 0.0083, "step": 15500 }, { "epoch": 0.43512414083321643, "grad_norm": 1.8729287385940552, "learning_rate": 4.274793098611306e-05, "loss": 0.0458, "step": 15510 }, { "epoch": 0.4354046850890728, "grad_norm": 0.040638167411088943, "learning_rate": 4.274325524851546e-05, "loss": 0.0476, "step": 15520 }, { "epoch": 0.43568522934492915, "grad_norm": 0.9558963179588318, "learning_rate": 4.2738579510917846e-05, "loss": 0.0376, "step": 15530 }, { "epoch": 0.43596577360078553, "grad_norm": 0.5234005451202393, "learning_rate": 4.2733903773320246e-05, "loss": 0.0251, "step": 15540 }, { "epoch": 0.43624631785664186, "grad_norm": 0.19712872803211212, "learning_rate": 4.272922803572263e-05, "loss": 0.0137, "step": 15550 }, { "epoch": 0.43652686211249825, "grad_norm": 0.11582615226507187, "learning_rate": 4.272455229812503e-05, "loss": 0.0515, "step": 15560 }, { "epoch": 0.43680740636835463, "grad_norm": 0.19019515812397003, "learning_rate": 4.2719876560527426e-05, "loss": 0.0305, "step": 15570 }, { "epoch": 0.43708795062421096, "grad_norm": 1.3804701566696167, "learning_rate": 4.271520082292982e-05, "loss": 0.0492, "step": 15580 }, { "epoch": 0.43736849488006735, "grad_norm": 0.31595584750175476, "learning_rate": 4.271052508533221e-05, "loss": 0.0274, "step": 15590 }, { "epoch": 0.4376490391359237, "grad_norm": 1.6626707315444946, "learning_rate": 4.2705849347734605e-05, "loss": 0.0346, "step": 15600 }, { "epoch": 0.43792958339178006, "grad_norm": 0.5194823145866394, "learning_rate": 4.2701173610137005e-05, "loss": 0.0325, "step": 15610 }, { "epoch": 0.4382101276476364, "grad_norm": 0.04885844886302948, "learning_rate": 4.269649787253939e-05, "loss": 0.0273, "step": 15620 }, { "epoch": 0.4384906719034928, "grad_norm": 0.03672458976507187, "learning_rate": 4.269182213494179e-05, "loss": 0.0352, "step": 15630 }, { "epoch": 0.43877121615934916, "grad_norm": 0.19406448304653168, "learning_rate": 4.2687146397344185e-05, "loss": 0.0353, "step": 15640 }, { "epoch": 0.4390517604152055, "grad_norm": 0.2849465608596802, "learning_rate": 4.268247065974658e-05, "loss": 0.0197, "step": 15650 }, { "epoch": 0.4393323046710619, "grad_norm": 0.7740310430526733, "learning_rate": 4.267779492214897e-05, "loss": 0.0427, "step": 15660 }, { "epoch": 0.4396128489269182, "grad_norm": 0.08434375375509262, "learning_rate": 4.2673119184551364e-05, "loss": 0.0166, "step": 15670 }, { "epoch": 0.4398933931827746, "grad_norm": 0.019548090174794197, "learning_rate": 4.266844344695376e-05, "loss": 0.0253, "step": 15680 }, { "epoch": 0.4401739374386309, "grad_norm": 1.4991344213485718, "learning_rate": 4.266376770935615e-05, "loss": 0.0467, "step": 15690 }, { "epoch": 0.4404544816944873, "grad_norm": 0.4878522753715515, "learning_rate": 4.2659091971758544e-05, "loss": 0.0173, "step": 15700 }, { "epoch": 0.44073502595034364, "grad_norm": 0.5918658375740051, "learning_rate": 4.2654416234160944e-05, "loss": 0.0231, "step": 15710 }, { "epoch": 0.4410155702062, "grad_norm": 0.02685694396495819, "learning_rate": 4.264974049656334e-05, "loss": 0.0274, "step": 15720 }, { "epoch": 0.4412961144620564, "grad_norm": 0.6357004642486572, "learning_rate": 4.264506475896573e-05, "loss": 0.0237, "step": 15730 }, { "epoch": 0.44157665871791274, "grad_norm": 1.3880510330200195, "learning_rate": 4.264038902136812e-05, "loss": 0.0139, "step": 15740 }, { "epoch": 0.4418572029737691, "grad_norm": 3.4350461959838867, "learning_rate": 4.2635713283770516e-05, "loss": 0.0988, "step": 15750 }, { "epoch": 0.44213774722962546, "grad_norm": 0.18544664978981018, "learning_rate": 4.263103754617291e-05, "loss": 0.0162, "step": 15760 }, { "epoch": 0.44241829148548184, "grad_norm": 0.03446367010474205, "learning_rate": 4.26263618085753e-05, "loss": 0.0394, "step": 15770 }, { "epoch": 0.44269883574133817, "grad_norm": 0.8012892007827759, "learning_rate": 4.26216860709777e-05, "loss": 0.0488, "step": 15780 }, { "epoch": 0.44297937999719456, "grad_norm": 5.673923492431641, "learning_rate": 4.261701033338009e-05, "loss": 0.0205, "step": 15790 }, { "epoch": 0.44325992425305094, "grad_norm": 0.35115715861320496, "learning_rate": 4.261233459578249e-05, "loss": 0.048, "step": 15800 }, { "epoch": 0.44354046850890727, "grad_norm": 0.022425899282097816, "learning_rate": 4.260765885818488e-05, "loss": 0.0632, "step": 15810 }, { "epoch": 0.44382101276476366, "grad_norm": 0.10749907046556473, "learning_rate": 4.2602983120587275e-05, "loss": 0.0543, "step": 15820 }, { "epoch": 0.44410155702062, "grad_norm": 0.646389901638031, "learning_rate": 4.259830738298967e-05, "loss": 0.0236, "step": 15830 }, { "epoch": 0.44438210127647637, "grad_norm": 1.255700707435608, "learning_rate": 4.259363164539206e-05, "loss": 0.0159, "step": 15840 }, { "epoch": 0.4446626455323327, "grad_norm": 0.4353819191455841, "learning_rate": 4.258895590779446e-05, "loss": 0.014, "step": 15850 }, { "epoch": 0.4449431897881891, "grad_norm": 0.07370178401470184, "learning_rate": 4.258428017019685e-05, "loss": 0.007, "step": 15860 }, { "epoch": 0.44522373404404547, "grad_norm": 0.17158187925815582, "learning_rate": 4.257960443259925e-05, "loss": 0.0163, "step": 15870 }, { "epoch": 0.4455042782999018, "grad_norm": 2.2394912242889404, "learning_rate": 4.2574928695001634e-05, "loss": 0.0417, "step": 15880 }, { "epoch": 0.4457848225557582, "grad_norm": 0.6778597831726074, "learning_rate": 4.2570252957404034e-05, "loss": 0.0436, "step": 15890 }, { "epoch": 0.4460653668116145, "grad_norm": 0.4550546109676361, "learning_rate": 4.256557721980643e-05, "loss": 0.0319, "step": 15900 }, { "epoch": 0.4463459110674709, "grad_norm": 0.10462629050016403, "learning_rate": 4.256090148220882e-05, "loss": 0.0488, "step": 15910 }, { "epoch": 0.44662645532332723, "grad_norm": 0.1800524741411209, "learning_rate": 4.255622574461121e-05, "loss": 0.0168, "step": 15920 }, { "epoch": 0.4469069995791836, "grad_norm": 0.3610716760158539, "learning_rate": 4.2551550007013606e-05, "loss": 0.0237, "step": 15930 }, { "epoch": 0.44718754383504, "grad_norm": 0.3020224869251251, "learning_rate": 4.2546874269416006e-05, "loss": 0.0261, "step": 15940 }, { "epoch": 0.44746808809089633, "grad_norm": 0.11956393718719482, "learning_rate": 4.254219853181839e-05, "loss": 0.0368, "step": 15950 }, { "epoch": 0.4477486323467527, "grad_norm": 0.17265819013118744, "learning_rate": 4.253752279422079e-05, "loss": 0.0485, "step": 15960 }, { "epoch": 0.44802917660260905, "grad_norm": 0.04150797426700592, "learning_rate": 4.253284705662318e-05, "loss": 0.0371, "step": 15970 }, { "epoch": 0.44830972085846543, "grad_norm": 0.14433293044567108, "learning_rate": 4.252817131902558e-05, "loss": 0.019, "step": 15980 }, { "epoch": 0.44859026511432176, "grad_norm": 0.11702004820108414, "learning_rate": 4.252349558142797e-05, "loss": 0.0579, "step": 15990 }, { "epoch": 0.44887080937017815, "grad_norm": 0.7531161904335022, "learning_rate": 4.2518819843830365e-05, "loss": 0.0378, "step": 16000 }, { "epoch": 0.44915135362603453, "grad_norm": 0.02248615212738514, "learning_rate": 4.251414410623276e-05, "loss": 0.0111, "step": 16010 }, { "epoch": 0.44943189788189086, "grad_norm": 3.746797561645508, "learning_rate": 4.250946836863515e-05, "loss": 0.0253, "step": 16020 }, { "epoch": 0.44971244213774725, "grad_norm": 1.3383736610412598, "learning_rate": 4.250479263103755e-05, "loss": 0.0126, "step": 16030 }, { "epoch": 0.4499929863936036, "grad_norm": 0.1505848467350006, "learning_rate": 4.250011689343994e-05, "loss": 0.0122, "step": 16040 }, { "epoch": 0.45027353064945996, "grad_norm": 0.4547758400440216, "learning_rate": 4.249544115584234e-05, "loss": 0.0737, "step": 16050 }, { "epoch": 0.4505540749053163, "grad_norm": 0.269559770822525, "learning_rate": 4.249076541824473e-05, "loss": 0.0183, "step": 16060 }, { "epoch": 0.4508346191611727, "grad_norm": 0.15990687906742096, "learning_rate": 4.2486089680647124e-05, "loss": 0.0495, "step": 16070 }, { "epoch": 0.45111516341702906, "grad_norm": 1.5411970615386963, "learning_rate": 4.248141394304952e-05, "loss": 0.0197, "step": 16080 }, { "epoch": 0.4513957076728854, "grad_norm": 0.456021785736084, "learning_rate": 4.247673820545191e-05, "loss": 0.0132, "step": 16090 }, { "epoch": 0.4516762519287418, "grad_norm": 0.11256060749292374, "learning_rate": 4.2472062467854304e-05, "loss": 0.0635, "step": 16100 }, { "epoch": 0.4519567961845981, "grad_norm": 2.0509939193725586, "learning_rate": 4.24673867302567e-05, "loss": 0.0433, "step": 16110 }, { "epoch": 0.4522373404404545, "grad_norm": 0.4297690987586975, "learning_rate": 4.24627109926591e-05, "loss": 0.0222, "step": 16120 }, { "epoch": 0.4525178846963108, "grad_norm": 0.05348575860261917, "learning_rate": 4.245803525506149e-05, "loss": 0.0329, "step": 16130 }, { "epoch": 0.4527984289521672, "grad_norm": 2.644052743911743, "learning_rate": 4.245335951746388e-05, "loss": 0.0412, "step": 16140 }, { "epoch": 0.45307897320802354, "grad_norm": 0.3958735764026642, "learning_rate": 4.2448683779866276e-05, "loss": 0.0286, "step": 16150 }, { "epoch": 0.4533595174638799, "grad_norm": 0.6521931886672974, "learning_rate": 4.244400804226867e-05, "loss": 0.0139, "step": 16160 }, { "epoch": 0.4536400617197363, "grad_norm": 1.470800757408142, "learning_rate": 4.243933230467106e-05, "loss": 0.0659, "step": 16170 }, { "epoch": 0.45392060597559264, "grad_norm": 0.05280093103647232, "learning_rate": 4.2434656567073456e-05, "loss": 0.031, "step": 16180 }, { "epoch": 0.454201150231449, "grad_norm": 0.8712307214736938, "learning_rate": 4.242998082947585e-05, "loss": 0.0365, "step": 16190 }, { "epoch": 0.45448169448730535, "grad_norm": 0.9813506603240967, "learning_rate": 4.242530509187825e-05, "loss": 0.0204, "step": 16200 }, { "epoch": 0.45476223874316174, "grad_norm": 0.01991022191941738, "learning_rate": 4.242062935428064e-05, "loss": 0.0218, "step": 16210 }, { "epoch": 0.45504278299901807, "grad_norm": 0.7989011406898499, "learning_rate": 4.2415953616683035e-05, "loss": 0.0326, "step": 16220 }, { "epoch": 0.45532332725487445, "grad_norm": 0.03208544850349426, "learning_rate": 4.241127787908543e-05, "loss": 0.0213, "step": 16230 }, { "epoch": 0.45560387151073084, "grad_norm": 0.15202726423740387, "learning_rate": 4.240660214148782e-05, "loss": 0.0482, "step": 16240 }, { "epoch": 0.45588441576658717, "grad_norm": 0.4533897042274475, "learning_rate": 4.2401926403890215e-05, "loss": 0.0327, "step": 16250 }, { "epoch": 0.45616496002244356, "grad_norm": 1.558620572090149, "learning_rate": 4.239725066629261e-05, "loss": 0.0242, "step": 16260 }, { "epoch": 0.4564455042782999, "grad_norm": 0.797541081905365, "learning_rate": 4.239257492869501e-05, "loss": 0.0202, "step": 16270 }, { "epoch": 0.45672604853415627, "grad_norm": 0.05201302841305733, "learning_rate": 4.2387899191097394e-05, "loss": 0.0184, "step": 16280 }, { "epoch": 0.4570065927900126, "grad_norm": 0.041873760521411896, "learning_rate": 4.2383223453499794e-05, "loss": 0.041, "step": 16290 }, { "epoch": 0.457287137045869, "grad_norm": 0.42068132758140564, "learning_rate": 4.237854771590219e-05, "loss": 0.0345, "step": 16300 }, { "epoch": 0.45756768130172537, "grad_norm": 0.03071429580450058, "learning_rate": 4.237387197830458e-05, "loss": 0.0261, "step": 16310 }, { "epoch": 0.4578482255575817, "grad_norm": 0.12259113788604736, "learning_rate": 4.2369196240706973e-05, "loss": 0.0497, "step": 16320 }, { "epoch": 0.4581287698134381, "grad_norm": 0.05341951549053192, "learning_rate": 4.2364520503109367e-05, "loss": 0.0149, "step": 16330 }, { "epoch": 0.4584093140692944, "grad_norm": 0.7608562707901001, "learning_rate": 4.2359844765511767e-05, "loss": 0.0188, "step": 16340 }, { "epoch": 0.4586898583251508, "grad_norm": 0.37219732999801636, "learning_rate": 4.235516902791415e-05, "loss": 0.0229, "step": 16350 }, { "epoch": 0.45897040258100713, "grad_norm": 0.5284112095832825, "learning_rate": 4.235049329031655e-05, "loss": 0.0566, "step": 16360 }, { "epoch": 0.4592509468368635, "grad_norm": 0.2680552899837494, "learning_rate": 4.234581755271894e-05, "loss": 0.0401, "step": 16370 }, { "epoch": 0.4595314910927199, "grad_norm": 0.5841194987297058, "learning_rate": 4.234114181512134e-05, "loss": 0.0439, "step": 16380 }, { "epoch": 0.45981203534857623, "grad_norm": 0.7090057134628296, "learning_rate": 4.2336466077523726e-05, "loss": 0.028, "step": 16390 }, { "epoch": 0.4600925796044326, "grad_norm": 0.4753388464450836, "learning_rate": 4.2331790339926125e-05, "loss": 0.037, "step": 16400 }, { "epoch": 0.46037312386028895, "grad_norm": 0.3719216287136078, "learning_rate": 4.232711460232852e-05, "loss": 0.0416, "step": 16410 }, { "epoch": 0.46065366811614533, "grad_norm": 0.756123960018158, "learning_rate": 4.232243886473091e-05, "loss": 0.0276, "step": 16420 }, { "epoch": 0.46093421237200166, "grad_norm": 0.31988629698753357, "learning_rate": 4.231776312713331e-05, "loss": 0.0427, "step": 16430 }, { "epoch": 0.46121475662785805, "grad_norm": 0.06344209611415863, "learning_rate": 4.23130873895357e-05, "loss": 0.0579, "step": 16440 }, { "epoch": 0.46149530088371443, "grad_norm": 0.5875459909439087, "learning_rate": 4.23084116519381e-05, "loss": 0.0228, "step": 16450 }, { "epoch": 0.46177584513957076, "grad_norm": 0.26258695125579834, "learning_rate": 4.2303735914340484e-05, "loss": 0.0149, "step": 16460 }, { "epoch": 0.46205638939542715, "grad_norm": 1.0615836381912231, "learning_rate": 4.2299060176742884e-05, "loss": 0.0158, "step": 16470 }, { "epoch": 0.4623369336512835, "grad_norm": 1.2003355026245117, "learning_rate": 4.229438443914528e-05, "loss": 0.0081, "step": 16480 }, { "epoch": 0.46261747790713986, "grad_norm": 36.572959899902344, "learning_rate": 4.228970870154767e-05, "loss": 0.0551, "step": 16490 }, { "epoch": 0.4628980221629962, "grad_norm": 0.0593876987695694, "learning_rate": 4.2285032963950064e-05, "loss": 0.0296, "step": 16500 }, { "epoch": 0.4631785664188526, "grad_norm": 0.08607715368270874, "learning_rate": 4.228035722635246e-05, "loss": 0.0305, "step": 16510 }, { "epoch": 0.46345911067470896, "grad_norm": 0.1413111388683319, "learning_rate": 4.227568148875486e-05, "loss": 0.0182, "step": 16520 }, { "epoch": 0.4637396549305653, "grad_norm": 1.2245808839797974, "learning_rate": 4.227100575115724e-05, "loss": 0.0587, "step": 16530 }, { "epoch": 0.4640201991864217, "grad_norm": 0.02594081312417984, "learning_rate": 4.226633001355964e-05, "loss": 0.0254, "step": 16540 }, { "epoch": 0.464300743442278, "grad_norm": 0.24825015664100647, "learning_rate": 4.2261654275962036e-05, "loss": 0.0432, "step": 16550 }, { "epoch": 0.4645812876981344, "grad_norm": 0.05376443639397621, "learning_rate": 4.225697853836443e-05, "loss": 0.0117, "step": 16560 }, { "epoch": 0.4648618319539907, "grad_norm": 0.1931350976228714, "learning_rate": 4.225230280076682e-05, "loss": 0.03, "step": 16570 }, { "epoch": 0.4651423762098471, "grad_norm": 0.17317818105220795, "learning_rate": 4.2247627063169216e-05, "loss": 0.0233, "step": 16580 }, { "epoch": 0.46542292046570344, "grad_norm": 0.12930940091609955, "learning_rate": 4.224295132557161e-05, "loss": 0.0139, "step": 16590 }, { "epoch": 0.4657034647215598, "grad_norm": 0.5952921509742737, "learning_rate": 4.2238275587974e-05, "loss": 0.0422, "step": 16600 }, { "epoch": 0.4659840089774162, "grad_norm": 0.05555728077888489, "learning_rate": 4.2233599850376395e-05, "loss": 0.0177, "step": 16610 }, { "epoch": 0.46626455323327254, "grad_norm": 0.09341336786746979, "learning_rate": 4.2228924112778795e-05, "loss": 0.0214, "step": 16620 }, { "epoch": 0.4665450974891289, "grad_norm": 0.28898558020591736, "learning_rate": 4.222424837518119e-05, "loss": 0.0175, "step": 16630 }, { "epoch": 0.46682564174498525, "grad_norm": 0.3682953715324402, "learning_rate": 4.221957263758358e-05, "loss": 0.039, "step": 16640 }, { "epoch": 0.46710618600084164, "grad_norm": 1.0450959205627441, "learning_rate": 4.2214896899985975e-05, "loss": 0.0461, "step": 16650 }, { "epoch": 0.46738673025669797, "grad_norm": 0.06278929859399796, "learning_rate": 4.221022116238837e-05, "loss": 0.0292, "step": 16660 }, { "epoch": 0.46766727451255435, "grad_norm": 3.7803266048431396, "learning_rate": 4.220554542479076e-05, "loss": 0.0176, "step": 16670 }, { "epoch": 0.46794781876841074, "grad_norm": 0.3671209514141083, "learning_rate": 4.2200869687193154e-05, "loss": 0.0259, "step": 16680 }, { "epoch": 0.46822836302426707, "grad_norm": 0.8124669790267944, "learning_rate": 4.2196193949595554e-05, "loss": 0.0134, "step": 16690 }, { "epoch": 0.46850890728012345, "grad_norm": 0.1872030347585678, "learning_rate": 4.219151821199794e-05, "loss": 0.0054, "step": 16700 }, { "epoch": 0.4687894515359798, "grad_norm": 0.313984215259552, "learning_rate": 4.218684247440034e-05, "loss": 0.0066, "step": 16710 }, { "epoch": 0.46906999579183617, "grad_norm": 0.04176779463887215, "learning_rate": 4.2182166736802734e-05, "loss": 0.0084, "step": 16720 }, { "epoch": 0.4693505400476925, "grad_norm": 0.029339885339140892, "learning_rate": 4.217749099920513e-05, "loss": 0.0822, "step": 16730 }, { "epoch": 0.4696310843035489, "grad_norm": 7.439297199249268, "learning_rate": 4.217281526160752e-05, "loss": 0.0644, "step": 16740 }, { "epoch": 0.46991162855940527, "grad_norm": 2.7808096408843994, "learning_rate": 4.216813952400991e-05, "loss": 0.0325, "step": 16750 }, { "epoch": 0.4701921728152616, "grad_norm": 0.8998562693595886, "learning_rate": 4.216346378641231e-05, "loss": 0.0149, "step": 16760 }, { "epoch": 0.470472717071118, "grad_norm": 0.09924670308828354, "learning_rate": 4.21587880488147e-05, "loss": 0.0601, "step": 16770 }, { "epoch": 0.4707532613269743, "grad_norm": 0.5685606598854065, "learning_rate": 4.21541123112171e-05, "loss": 0.0384, "step": 16780 }, { "epoch": 0.4710338055828307, "grad_norm": 0.2607676684856415, "learning_rate": 4.2149436573619486e-05, "loss": 0.0319, "step": 16790 }, { "epoch": 0.47131434983868703, "grad_norm": 0.8689115643501282, "learning_rate": 4.2144760836021886e-05, "loss": 0.0251, "step": 16800 }, { "epoch": 0.4715948940945434, "grad_norm": 0.07390134036540985, "learning_rate": 4.214008509842428e-05, "loss": 0.0638, "step": 16810 }, { "epoch": 0.4718754383503998, "grad_norm": 4.917212009429932, "learning_rate": 4.213540936082667e-05, "loss": 0.0329, "step": 16820 }, { "epoch": 0.47215598260625613, "grad_norm": 0.07418950647115707, "learning_rate": 4.2130733623229065e-05, "loss": 0.027, "step": 16830 }, { "epoch": 0.4724365268621125, "grad_norm": 0.2954288423061371, "learning_rate": 4.212605788563146e-05, "loss": 0.0411, "step": 16840 }, { "epoch": 0.47271707111796885, "grad_norm": 0.05023728683590889, "learning_rate": 4.212138214803386e-05, "loss": 0.0186, "step": 16850 }, { "epoch": 0.47299761537382523, "grad_norm": 0.18396598100662231, "learning_rate": 4.2116706410436244e-05, "loss": 0.0143, "step": 16860 }, { "epoch": 0.47327815962968156, "grad_norm": 0.2793548107147217, "learning_rate": 4.2112030672838644e-05, "loss": 0.0302, "step": 16870 }, { "epoch": 0.47355870388553795, "grad_norm": 1.2695443630218506, "learning_rate": 4.210735493524104e-05, "loss": 0.0745, "step": 16880 }, { "epoch": 0.47383924814139433, "grad_norm": 0.30449700355529785, "learning_rate": 4.210267919764343e-05, "loss": 0.031, "step": 16890 }, { "epoch": 0.47411979239725066, "grad_norm": 0.3393392264842987, "learning_rate": 4.2098003460045824e-05, "loss": 0.0422, "step": 16900 }, { "epoch": 0.47440033665310705, "grad_norm": 0.29554110765457153, "learning_rate": 4.209332772244822e-05, "loss": 0.0306, "step": 16910 }, { "epoch": 0.4746808809089634, "grad_norm": 0.456367552280426, "learning_rate": 4.208865198485061e-05, "loss": 0.0397, "step": 16920 }, { "epoch": 0.47496142516481976, "grad_norm": 0.18329951167106628, "learning_rate": 4.2083976247253e-05, "loss": 0.0191, "step": 16930 }, { "epoch": 0.4752419694206761, "grad_norm": 1.5122933387756348, "learning_rate": 4.20793005096554e-05, "loss": 0.023, "step": 16940 }, { "epoch": 0.4755225136765325, "grad_norm": 0.13439525663852692, "learning_rate": 4.2074624772057796e-05, "loss": 0.0149, "step": 16950 }, { "epoch": 0.47580305793238886, "grad_norm": 0.021452903747558594, "learning_rate": 4.206994903446019e-05, "loss": 0.0112, "step": 16960 }, { "epoch": 0.4760836021882452, "grad_norm": 0.2112305462360382, "learning_rate": 4.206527329686258e-05, "loss": 0.0726, "step": 16970 }, { "epoch": 0.4763641464441016, "grad_norm": 0.03757209703326225, "learning_rate": 4.2060597559264976e-05, "loss": 0.0226, "step": 16980 }, { "epoch": 0.4766446906999579, "grad_norm": 0.4205664396286011, "learning_rate": 4.205592182166737e-05, "loss": 0.0446, "step": 16990 }, { "epoch": 0.4769252349558143, "grad_norm": 0.15433695912361145, "learning_rate": 4.205124608406976e-05, "loss": 0.0468, "step": 17000 }, { "epoch": 0.4772057792116706, "grad_norm": 0.17866836488246918, "learning_rate": 4.2046570346472155e-05, "loss": 0.0311, "step": 17010 }, { "epoch": 0.477486323467527, "grad_norm": 0.2753507196903229, "learning_rate": 4.2041894608874555e-05, "loss": 0.0133, "step": 17020 }, { "epoch": 0.47776686772338334, "grad_norm": 0.1180872991681099, "learning_rate": 4.203721887127695e-05, "loss": 0.0336, "step": 17030 }, { "epoch": 0.4780474119792397, "grad_norm": 0.49107223749160767, "learning_rate": 4.203254313367934e-05, "loss": 0.0479, "step": 17040 }, { "epoch": 0.4783279562350961, "grad_norm": 0.5057842135429382, "learning_rate": 4.2027867396081735e-05, "loss": 0.0232, "step": 17050 }, { "epoch": 0.47860850049095244, "grad_norm": 0.7506217956542969, "learning_rate": 4.202319165848413e-05, "loss": 0.0568, "step": 17060 }, { "epoch": 0.4788890447468088, "grad_norm": 0.07035574316978455, "learning_rate": 4.201851592088652e-05, "loss": 0.0211, "step": 17070 }, { "epoch": 0.47916958900266515, "grad_norm": 1.648629903793335, "learning_rate": 4.2013840183288914e-05, "loss": 0.0335, "step": 17080 }, { "epoch": 0.47945013325852154, "grad_norm": 0.05572652071714401, "learning_rate": 4.2009164445691314e-05, "loss": 0.019, "step": 17090 }, { "epoch": 0.47973067751437787, "grad_norm": 2.8523404598236084, "learning_rate": 4.20044887080937e-05, "loss": 0.0221, "step": 17100 }, { "epoch": 0.48001122177023425, "grad_norm": 6.3785176277160645, "learning_rate": 4.19998129704961e-05, "loss": 0.036, "step": 17110 }, { "epoch": 0.48029176602609064, "grad_norm": 0.032066840678453445, "learning_rate": 4.1995137232898494e-05, "loss": 0.0608, "step": 17120 }, { "epoch": 0.48057231028194697, "grad_norm": 0.2347474992275238, "learning_rate": 4.199046149530089e-05, "loss": 0.0279, "step": 17130 }, { "epoch": 0.48085285453780335, "grad_norm": 0.13643144071102142, "learning_rate": 4.198578575770328e-05, "loss": 0.0212, "step": 17140 }, { "epoch": 0.4811333987936597, "grad_norm": 1.3557473421096802, "learning_rate": 4.198111002010567e-05, "loss": 0.021, "step": 17150 }, { "epoch": 0.48141394304951607, "grad_norm": 0.3112180531024933, "learning_rate": 4.197643428250807e-05, "loss": 0.0757, "step": 17160 }, { "epoch": 0.4816944873053724, "grad_norm": 0.21570250391960144, "learning_rate": 4.197175854491046e-05, "loss": 0.0407, "step": 17170 }, { "epoch": 0.4819750315612288, "grad_norm": 0.16107025742530823, "learning_rate": 4.196708280731286e-05, "loss": 0.0289, "step": 17180 }, { "epoch": 0.48225557581708517, "grad_norm": 0.5206446051597595, "learning_rate": 4.1962407069715246e-05, "loss": 0.04, "step": 17190 }, { "epoch": 0.4825361200729415, "grad_norm": 0.09906430542469025, "learning_rate": 4.1957731332117646e-05, "loss": 0.0195, "step": 17200 }, { "epoch": 0.4828166643287979, "grad_norm": 0.4998381733894348, "learning_rate": 4.195305559452004e-05, "loss": 0.0389, "step": 17210 }, { "epoch": 0.4830972085846542, "grad_norm": 0.6736046671867371, "learning_rate": 4.194837985692243e-05, "loss": 0.0223, "step": 17220 }, { "epoch": 0.4833777528405106, "grad_norm": 0.023697543889284134, "learning_rate": 4.1943704119324825e-05, "loss": 0.0258, "step": 17230 }, { "epoch": 0.48365829709636693, "grad_norm": 0.686852216720581, "learning_rate": 4.193902838172722e-05, "loss": 0.0534, "step": 17240 }, { "epoch": 0.4839388413522233, "grad_norm": 8.39834976196289, "learning_rate": 4.193435264412962e-05, "loss": 0.0275, "step": 17250 }, { "epoch": 0.4842193856080797, "grad_norm": 1.1261470317840576, "learning_rate": 4.1929676906532005e-05, "loss": 0.0522, "step": 17260 }, { "epoch": 0.48449992986393603, "grad_norm": 13.047738075256348, "learning_rate": 4.1925001168934404e-05, "loss": 0.0186, "step": 17270 }, { "epoch": 0.4847804741197924, "grad_norm": 0.5934122204780579, "learning_rate": 4.192032543133679e-05, "loss": 0.0241, "step": 17280 }, { "epoch": 0.48506101837564874, "grad_norm": 0.11318546533584595, "learning_rate": 4.191564969373919e-05, "loss": 0.049, "step": 17290 }, { "epoch": 0.48534156263150513, "grad_norm": 0.031007491052150726, "learning_rate": 4.1910973956141584e-05, "loss": 0.0098, "step": 17300 }, { "epoch": 0.48562210688736146, "grad_norm": 0.0209357850253582, "learning_rate": 4.190629821854398e-05, "loss": 0.0485, "step": 17310 }, { "epoch": 0.48590265114321785, "grad_norm": 0.34645888209342957, "learning_rate": 4.190162248094637e-05, "loss": 0.0511, "step": 17320 }, { "epoch": 0.48618319539907423, "grad_norm": 0.18795940279960632, "learning_rate": 4.1896946743348763e-05, "loss": 0.0196, "step": 17330 }, { "epoch": 0.48646373965493056, "grad_norm": 0.5592176914215088, "learning_rate": 4.189227100575116e-05, "loss": 0.0416, "step": 17340 }, { "epoch": 0.48674428391078695, "grad_norm": 0.07821041345596313, "learning_rate": 4.188759526815355e-05, "loss": 0.0202, "step": 17350 }, { "epoch": 0.4870248281666433, "grad_norm": 0.04669851064682007, "learning_rate": 4.188291953055595e-05, "loss": 0.0263, "step": 17360 }, { "epoch": 0.48730537242249966, "grad_norm": 0.10989853739738464, "learning_rate": 4.187824379295834e-05, "loss": 0.0158, "step": 17370 }, { "epoch": 0.487585916678356, "grad_norm": 0.5780651569366455, "learning_rate": 4.1873568055360736e-05, "loss": 0.0302, "step": 17380 }, { "epoch": 0.4878664609342124, "grad_norm": 0.23239977657794952, "learning_rate": 4.186889231776313e-05, "loss": 0.0521, "step": 17390 }, { "epoch": 0.4881470051900687, "grad_norm": 1.2335106134414673, "learning_rate": 4.186421658016552e-05, "loss": 0.016, "step": 17400 }, { "epoch": 0.4884275494459251, "grad_norm": 0.04185498505830765, "learning_rate": 4.1859540842567915e-05, "loss": 0.0196, "step": 17410 }, { "epoch": 0.4887080937017815, "grad_norm": 0.3281266689300537, "learning_rate": 4.185486510497031e-05, "loss": 0.0661, "step": 17420 }, { "epoch": 0.4889886379576378, "grad_norm": 0.1078762635588646, "learning_rate": 4.185018936737271e-05, "loss": 0.0276, "step": 17430 }, { "epoch": 0.4892691822134942, "grad_norm": 0.08741113543510437, "learning_rate": 4.18455136297751e-05, "loss": 0.0153, "step": 17440 }, { "epoch": 0.4895497264693505, "grad_norm": 0.12541402876377106, "learning_rate": 4.1840837892177495e-05, "loss": 0.0254, "step": 17450 }, { "epoch": 0.4898302707252069, "grad_norm": 0.05721235275268555, "learning_rate": 4.183616215457989e-05, "loss": 0.022, "step": 17460 }, { "epoch": 0.49011081498106324, "grad_norm": 0.5953645706176758, "learning_rate": 4.183148641698228e-05, "loss": 0.0197, "step": 17470 }, { "epoch": 0.4903913592369196, "grad_norm": 1.0312230587005615, "learning_rate": 4.1826810679384674e-05, "loss": 0.0256, "step": 17480 }, { "epoch": 0.490671903492776, "grad_norm": 0.25078412890434265, "learning_rate": 4.182213494178707e-05, "loss": 0.0357, "step": 17490 }, { "epoch": 0.49095244774863234, "grad_norm": 0.6744020581245422, "learning_rate": 4.181745920418946e-05, "loss": 0.0508, "step": 17500 }, { "epoch": 0.4912329920044887, "grad_norm": 0.054159991443157196, "learning_rate": 4.181278346659186e-05, "loss": 0.0314, "step": 17510 }, { "epoch": 0.49151353626034505, "grad_norm": 0.31371554732322693, "learning_rate": 4.180810772899425e-05, "loss": 0.0381, "step": 17520 }, { "epoch": 0.49179408051620144, "grad_norm": 1.2669843435287476, "learning_rate": 4.180343199139665e-05, "loss": 0.0483, "step": 17530 }, { "epoch": 0.49207462477205777, "grad_norm": 0.25916004180908203, "learning_rate": 4.179875625379904e-05, "loss": 0.02, "step": 17540 }, { "epoch": 0.49235516902791415, "grad_norm": 0.11687786877155304, "learning_rate": 4.179408051620143e-05, "loss": 0.0245, "step": 17550 }, { "epoch": 0.49263571328377054, "grad_norm": 0.03224622830748558, "learning_rate": 4.1789404778603826e-05, "loss": 0.0256, "step": 17560 }, { "epoch": 0.49291625753962687, "grad_norm": 0.04928550124168396, "learning_rate": 4.178472904100622e-05, "loss": 0.0177, "step": 17570 }, { "epoch": 0.49319680179548325, "grad_norm": 0.04701732471585274, "learning_rate": 4.178005330340862e-05, "loss": 0.0463, "step": 17580 }, { "epoch": 0.4934773460513396, "grad_norm": 0.4720054566860199, "learning_rate": 4.1775377565811006e-05, "loss": 0.0158, "step": 17590 }, { "epoch": 0.49375789030719597, "grad_norm": 0.012423539534211159, "learning_rate": 4.1770701828213406e-05, "loss": 0.026, "step": 17600 }, { "epoch": 0.4940384345630523, "grad_norm": 0.23348985612392426, "learning_rate": 4.176602609061579e-05, "loss": 0.0178, "step": 17610 }, { "epoch": 0.4943189788189087, "grad_norm": 0.09793531894683838, "learning_rate": 4.176135035301819e-05, "loss": 0.0218, "step": 17620 }, { "epoch": 0.49459952307476507, "grad_norm": 0.07639726996421814, "learning_rate": 4.1756674615420585e-05, "loss": 0.0137, "step": 17630 }, { "epoch": 0.4948800673306214, "grad_norm": 0.2188856452703476, "learning_rate": 4.175199887782298e-05, "loss": 0.0153, "step": 17640 }, { "epoch": 0.4951606115864778, "grad_norm": 1.0977939367294312, "learning_rate": 4.174732314022538e-05, "loss": 0.0345, "step": 17650 }, { "epoch": 0.4954411558423341, "grad_norm": 0.3636667728424072, "learning_rate": 4.1742647402627765e-05, "loss": 0.0339, "step": 17660 }, { "epoch": 0.4957217000981905, "grad_norm": 0.08186951279640198, "learning_rate": 4.1737971665030165e-05, "loss": 0.0565, "step": 17670 }, { "epoch": 0.49600224435404683, "grad_norm": 0.29397258162498474, "learning_rate": 4.173329592743255e-05, "loss": 0.0225, "step": 17680 }, { "epoch": 0.4962827886099032, "grad_norm": 0.416664183139801, "learning_rate": 4.172862018983495e-05, "loss": 0.0455, "step": 17690 }, { "epoch": 0.4965633328657596, "grad_norm": 0.08441418409347534, "learning_rate": 4.172394445223734e-05, "loss": 0.0185, "step": 17700 }, { "epoch": 0.49684387712161593, "grad_norm": 2.19862699508667, "learning_rate": 4.171926871463974e-05, "loss": 0.0416, "step": 17710 }, { "epoch": 0.4971244213774723, "grad_norm": 0.026288120076060295, "learning_rate": 4.171459297704213e-05, "loss": 0.0058, "step": 17720 }, { "epoch": 0.49740496563332864, "grad_norm": 0.16853895783424377, "learning_rate": 4.1709917239444524e-05, "loss": 0.045, "step": 17730 }, { "epoch": 0.49768550988918503, "grad_norm": 3.352160692214966, "learning_rate": 4.170524150184692e-05, "loss": 0.0652, "step": 17740 }, { "epoch": 0.49796605414504136, "grad_norm": 0.3626142144203186, "learning_rate": 4.170056576424931e-05, "loss": 0.0457, "step": 17750 }, { "epoch": 0.49824659840089774, "grad_norm": 0.027159065008163452, "learning_rate": 4.169589002665171e-05, "loss": 0.0113, "step": 17760 }, { "epoch": 0.49852714265675413, "grad_norm": 0.08831208944320679, "learning_rate": 4.1691214289054096e-05, "loss": 0.0399, "step": 17770 }, { "epoch": 0.49880768691261046, "grad_norm": 0.08027151226997375, "learning_rate": 4.1686538551456496e-05, "loss": 0.0261, "step": 17780 }, { "epoch": 0.49908823116846684, "grad_norm": 0.07789477705955505, "learning_rate": 4.168186281385889e-05, "loss": 0.0577, "step": 17790 }, { "epoch": 0.4993687754243232, "grad_norm": 0.17449888586997986, "learning_rate": 4.167718707626128e-05, "loss": 0.0332, "step": 17800 }, { "epoch": 0.49964931968017956, "grad_norm": 0.6127444505691528, "learning_rate": 4.1672511338663676e-05, "loss": 0.0237, "step": 17810 }, { "epoch": 0.4999298639360359, "grad_norm": 0.3830045759677887, "learning_rate": 4.166783560106607e-05, "loss": 0.0478, "step": 17820 }, { "epoch": 0.5002104081918922, "grad_norm": 1.0371372699737549, "learning_rate": 4.166315986346846e-05, "loss": 0.0272, "step": 17830 }, { "epoch": 0.5004909524477487, "grad_norm": 0.2596052587032318, "learning_rate": 4.1658484125870855e-05, "loss": 0.0281, "step": 17840 }, { "epoch": 0.500771496703605, "grad_norm": 1.0438423156738281, "learning_rate": 4.1653808388273255e-05, "loss": 0.0225, "step": 17850 }, { "epoch": 0.5010520409594613, "grad_norm": 0.21344833076000214, "learning_rate": 4.164913265067565e-05, "loss": 0.0358, "step": 17860 }, { "epoch": 0.5013325852153178, "grad_norm": 0.05447058752179146, "learning_rate": 4.164445691307804e-05, "loss": 0.0308, "step": 17870 }, { "epoch": 0.5016131294711741, "grad_norm": 0.1729852855205536, "learning_rate": 4.1639781175480434e-05, "loss": 0.0216, "step": 17880 }, { "epoch": 0.5018936737270304, "grad_norm": 0.010011528618633747, "learning_rate": 4.163510543788283e-05, "loss": 0.0124, "step": 17890 }, { "epoch": 0.5021742179828868, "grad_norm": 0.23545241355895996, "learning_rate": 4.163042970028522e-05, "loss": 0.0446, "step": 17900 }, { "epoch": 0.5024547622387432, "grad_norm": 0.3392878770828247, "learning_rate": 4.1625753962687614e-05, "loss": 0.0258, "step": 17910 }, { "epoch": 0.5027353064945995, "grad_norm": 0.0773933082818985, "learning_rate": 4.162107822509001e-05, "loss": 0.0172, "step": 17920 }, { "epoch": 0.5030158507504559, "grad_norm": 0.953871488571167, "learning_rate": 4.161640248749241e-05, "loss": 0.0194, "step": 17930 }, { "epoch": 0.5032963950063123, "grad_norm": 0.3890599012374878, "learning_rate": 4.16117267498948e-05, "loss": 0.0214, "step": 17940 }, { "epoch": 0.5035769392621686, "grad_norm": 0.10224391520023346, "learning_rate": 4.160705101229719e-05, "loss": 0.0685, "step": 17950 }, { "epoch": 0.503857483518025, "grad_norm": 0.30397772789001465, "learning_rate": 4.1602375274699586e-05, "loss": 0.06, "step": 17960 }, { "epoch": 0.5041380277738813, "grad_norm": 0.8328806161880493, "learning_rate": 4.159769953710198e-05, "loss": 0.0227, "step": 17970 }, { "epoch": 0.5044185720297377, "grad_norm": 0.2697104811668396, "learning_rate": 4.159302379950437e-05, "loss": 0.0169, "step": 17980 }, { "epoch": 0.504699116285594, "grad_norm": 1.3865892887115479, "learning_rate": 4.1588348061906766e-05, "loss": 0.036, "step": 17990 }, { "epoch": 0.5049796605414504, "grad_norm": 5.449163913726807, "learning_rate": 4.1583672324309166e-05, "loss": 0.0251, "step": 18000 }, { "epoch": 0.5052602047973068, "grad_norm": 0.08892542123794556, "learning_rate": 4.157899658671155e-05, "loss": 0.0418, "step": 18010 }, { "epoch": 0.5055407490531632, "grad_norm": 0.5333160161972046, "learning_rate": 4.157432084911395e-05, "loss": 0.0146, "step": 18020 }, { "epoch": 0.5058212933090195, "grad_norm": 0.4251088798046112, "learning_rate": 4.1569645111516345e-05, "loss": 0.0259, "step": 18030 }, { "epoch": 0.5061018375648758, "grad_norm": 0.11697389930486679, "learning_rate": 4.156496937391874e-05, "loss": 0.0305, "step": 18040 }, { "epoch": 0.5063823818207323, "grad_norm": 0.5998208522796631, "learning_rate": 4.156029363632113e-05, "loss": 0.04, "step": 18050 }, { "epoch": 0.5066629260765886, "grad_norm": 0.8338409662246704, "learning_rate": 4.1555617898723525e-05, "loss": 0.0562, "step": 18060 }, { "epoch": 0.5069434703324449, "grad_norm": 0.4531027376651764, "learning_rate": 4.1550942161125925e-05, "loss": 0.014, "step": 18070 }, { "epoch": 0.5072240145883014, "grad_norm": 0.12524309754371643, "learning_rate": 4.154626642352831e-05, "loss": 0.0322, "step": 18080 }, { "epoch": 0.5075045588441577, "grad_norm": 0.05179356783628464, "learning_rate": 4.154159068593071e-05, "loss": 0.0326, "step": 18090 }, { "epoch": 0.507785103100014, "grad_norm": 0.17437253892421722, "learning_rate": 4.15369149483331e-05, "loss": 0.0466, "step": 18100 }, { "epoch": 0.5080656473558703, "grad_norm": 0.44225847721099854, "learning_rate": 4.15322392107355e-05, "loss": 0.0576, "step": 18110 }, { "epoch": 0.5083461916117268, "grad_norm": 0.2699225842952728, "learning_rate": 4.152756347313789e-05, "loss": 0.0379, "step": 18120 }, { "epoch": 0.5086267358675831, "grad_norm": 0.21832452714443207, "learning_rate": 4.1522887735540284e-05, "loss": 0.016, "step": 18130 }, { "epoch": 0.5089072801234394, "grad_norm": 1.159603238105774, "learning_rate": 4.151821199794268e-05, "loss": 0.0111, "step": 18140 }, { "epoch": 0.5091878243792959, "grad_norm": 0.05638502538204193, "learning_rate": 4.151353626034507e-05, "loss": 0.0196, "step": 18150 }, { "epoch": 0.5094683686351522, "grad_norm": 0.03896774351596832, "learning_rate": 4.150886052274747e-05, "loss": 0.0368, "step": 18160 }, { "epoch": 0.5097489128910085, "grad_norm": 0.7746413350105286, "learning_rate": 4.1504184785149856e-05, "loss": 0.0306, "step": 18170 }, { "epoch": 0.5100294571468649, "grad_norm": 1.3959085941314697, "learning_rate": 4.1499509047552256e-05, "loss": 0.0247, "step": 18180 }, { "epoch": 0.5103100014027213, "grad_norm": 0.10043135285377502, "learning_rate": 4.149483330995464e-05, "loss": 0.0074, "step": 18190 }, { "epoch": 0.5105905456585776, "grad_norm": 0.7144458293914795, "learning_rate": 4.149015757235704e-05, "loss": 0.0614, "step": 18200 }, { "epoch": 0.510871089914434, "grad_norm": 0.24519136548042297, "learning_rate": 4.1485481834759436e-05, "loss": 0.0501, "step": 18210 }, { "epoch": 0.5111516341702904, "grad_norm": 0.3811081051826477, "learning_rate": 4.148080609716183e-05, "loss": 0.0511, "step": 18220 }, { "epoch": 0.5114321784261467, "grad_norm": 0.19128958880901337, "learning_rate": 4.147613035956422e-05, "loss": 0.0508, "step": 18230 }, { "epoch": 0.5117127226820031, "grad_norm": 0.19103066623210907, "learning_rate": 4.1471454621966615e-05, "loss": 0.0354, "step": 18240 }, { "epoch": 0.5119932669378594, "grad_norm": 0.17707452178001404, "learning_rate": 4.1466778884369015e-05, "loss": 0.0319, "step": 18250 }, { "epoch": 0.5122738111937158, "grad_norm": 0.7998449206352234, "learning_rate": 4.14621031467714e-05, "loss": 0.0699, "step": 18260 }, { "epoch": 0.5125543554495722, "grad_norm": 0.7912736535072327, "learning_rate": 4.14574274091738e-05, "loss": 0.0291, "step": 18270 }, { "epoch": 0.5128348997054285, "grad_norm": 1.0893869400024414, "learning_rate": 4.1452751671576194e-05, "loss": 0.0318, "step": 18280 }, { "epoch": 0.513115443961285, "grad_norm": 0.08035493642091751, "learning_rate": 4.144807593397859e-05, "loss": 0.0433, "step": 18290 }, { "epoch": 0.5133959882171413, "grad_norm": 0.037879325449466705, "learning_rate": 4.144340019638098e-05, "loss": 0.0397, "step": 18300 }, { "epoch": 0.5136765324729976, "grad_norm": 0.19323213398456573, "learning_rate": 4.1438724458783374e-05, "loss": 0.0214, "step": 18310 }, { "epoch": 0.5139570767288539, "grad_norm": 0.3570772707462311, "learning_rate": 4.143404872118577e-05, "loss": 0.0458, "step": 18320 }, { "epoch": 0.5142376209847104, "grad_norm": 1.1206166744232178, "learning_rate": 4.142937298358816e-05, "loss": 0.0246, "step": 18330 }, { "epoch": 0.5145181652405667, "grad_norm": 1.5499186515808105, "learning_rate": 4.142469724599056e-05, "loss": 0.0523, "step": 18340 }, { "epoch": 0.514798709496423, "grad_norm": 0.556364119052887, "learning_rate": 4.142002150839295e-05, "loss": 0.0367, "step": 18350 }, { "epoch": 0.5150792537522795, "grad_norm": 0.29686838388442993, "learning_rate": 4.1415345770795346e-05, "loss": 0.0201, "step": 18360 }, { "epoch": 0.5153597980081358, "grad_norm": 0.057743996381759644, "learning_rate": 4.141067003319774e-05, "loss": 0.0327, "step": 18370 }, { "epoch": 0.5156403422639921, "grad_norm": 0.18340657651424408, "learning_rate": 4.140599429560013e-05, "loss": 0.0427, "step": 18380 }, { "epoch": 0.5159208865198485, "grad_norm": 0.04470737650990486, "learning_rate": 4.1401318558002526e-05, "loss": 0.049, "step": 18390 }, { "epoch": 0.5162014307757049, "grad_norm": 0.5653552412986755, "learning_rate": 4.139664282040492e-05, "loss": 0.0859, "step": 18400 }, { "epoch": 0.5164819750315612, "grad_norm": 0.18379637598991394, "learning_rate": 4.139196708280731e-05, "loss": 0.0254, "step": 18410 }, { "epoch": 0.5167625192874176, "grad_norm": 0.14796175062656403, "learning_rate": 4.138729134520971e-05, "loss": 0.0101, "step": 18420 }, { "epoch": 0.517043063543274, "grad_norm": 0.06616916507482529, "learning_rate": 4.13826156076121e-05, "loss": 0.0428, "step": 18430 }, { "epoch": 0.5173236077991303, "grad_norm": 0.021445246413350105, "learning_rate": 4.13779398700145e-05, "loss": 0.0518, "step": 18440 }, { "epoch": 0.5176041520549867, "grad_norm": 0.35098063945770264, "learning_rate": 4.137326413241689e-05, "loss": 0.0245, "step": 18450 }, { "epoch": 0.517884696310843, "grad_norm": 0.06402155756950378, "learning_rate": 4.1368588394819285e-05, "loss": 0.0153, "step": 18460 }, { "epoch": 0.5181652405666994, "grad_norm": 0.24054387211799622, "learning_rate": 4.136391265722168e-05, "loss": 0.0299, "step": 18470 }, { "epoch": 0.5184457848225558, "grad_norm": 0.084052674472332, "learning_rate": 4.135923691962407e-05, "loss": 0.0192, "step": 18480 }, { "epoch": 0.5187263290784121, "grad_norm": 0.6487279534339905, "learning_rate": 4.135456118202647e-05, "loss": 0.0289, "step": 18490 }, { "epoch": 0.5190068733342684, "grad_norm": 1.104028582572937, "learning_rate": 4.134988544442886e-05, "loss": 0.0291, "step": 18500 }, { "epoch": 0.5192874175901249, "grad_norm": 0.02538338117301464, "learning_rate": 4.134520970683126e-05, "loss": 0.0443, "step": 18510 }, { "epoch": 0.5195679618459812, "grad_norm": 1.386893391609192, "learning_rate": 4.1340533969233644e-05, "loss": 0.0141, "step": 18520 }, { "epoch": 0.5198485061018375, "grad_norm": 0.2829553782939911, "learning_rate": 4.1335858231636044e-05, "loss": 0.0635, "step": 18530 }, { "epoch": 0.520129050357694, "grad_norm": 3.235992431640625, "learning_rate": 4.133118249403844e-05, "loss": 0.0532, "step": 18540 }, { "epoch": 0.5204095946135503, "grad_norm": 0.7794919610023499, "learning_rate": 4.132650675644083e-05, "loss": 0.0315, "step": 18550 }, { "epoch": 0.5206901388694066, "grad_norm": 0.08810330182313919, "learning_rate": 4.132183101884323e-05, "loss": 0.0289, "step": 18560 }, { "epoch": 0.520970683125263, "grad_norm": 0.2011886090040207, "learning_rate": 4.1317155281245616e-05, "loss": 0.0394, "step": 18570 }, { "epoch": 0.5212512273811194, "grad_norm": 0.18552415072917938, "learning_rate": 4.1312479543648016e-05, "loss": 0.0144, "step": 18580 }, { "epoch": 0.5215317716369757, "grad_norm": 2.691478729248047, "learning_rate": 4.13078038060504e-05, "loss": 0.0483, "step": 18590 }, { "epoch": 0.5218123158928321, "grad_norm": 2.4503397941589355, "learning_rate": 4.13031280684528e-05, "loss": 0.0143, "step": 18600 }, { "epoch": 0.5220928601486885, "grad_norm": 0.12876451015472412, "learning_rate": 4.129845233085519e-05, "loss": 0.0273, "step": 18610 }, { "epoch": 0.5223734044045448, "grad_norm": 0.1641281098127365, "learning_rate": 4.129377659325759e-05, "loss": 0.0039, "step": 18620 }, { "epoch": 0.5226539486604012, "grad_norm": 1.0715892314910889, "learning_rate": 4.128910085565998e-05, "loss": 0.0616, "step": 18630 }, { "epoch": 0.5229344929162575, "grad_norm": 0.3521519899368286, "learning_rate": 4.1284425118062375e-05, "loss": 0.0406, "step": 18640 }, { "epoch": 0.5232150371721139, "grad_norm": 0.9560930132865906, "learning_rate": 4.127974938046477e-05, "loss": 0.0244, "step": 18650 }, { "epoch": 0.5234955814279703, "grad_norm": 0.5189430713653564, "learning_rate": 4.127507364286716e-05, "loss": 0.0185, "step": 18660 }, { "epoch": 0.5237761256838266, "grad_norm": 2.3286890983581543, "learning_rate": 4.127039790526956e-05, "loss": 0.033, "step": 18670 }, { "epoch": 0.524056669939683, "grad_norm": 0.11130797117948532, "learning_rate": 4.126572216767195e-05, "loss": 0.0393, "step": 18680 }, { "epoch": 0.5243372141955394, "grad_norm": 0.15782012045383453, "learning_rate": 4.126104643007435e-05, "loss": 0.0285, "step": 18690 }, { "epoch": 0.5246177584513957, "grad_norm": 0.2035524547100067, "learning_rate": 4.125637069247674e-05, "loss": 0.0209, "step": 18700 }, { "epoch": 0.524898302707252, "grad_norm": 8.158551216125488, "learning_rate": 4.1251694954879134e-05, "loss": 0.0189, "step": 18710 }, { "epoch": 0.5251788469631085, "grad_norm": 0.28315603733062744, "learning_rate": 4.124701921728153e-05, "loss": 0.022, "step": 18720 }, { "epoch": 0.5254593912189648, "grad_norm": 0.3833126723766327, "learning_rate": 4.124234347968392e-05, "loss": 0.024, "step": 18730 }, { "epoch": 0.5257399354748211, "grad_norm": 0.12821070849895477, "learning_rate": 4.1237667742086314e-05, "loss": 0.016, "step": 18740 }, { "epoch": 0.5260204797306776, "grad_norm": 0.01698862947523594, "learning_rate": 4.123299200448871e-05, "loss": 0.0601, "step": 18750 }, { "epoch": 0.5263010239865339, "grad_norm": 0.10739752650260925, "learning_rate": 4.1228316266891107e-05, "loss": 0.0226, "step": 18760 }, { "epoch": 0.5265815682423902, "grad_norm": 0.048958200961351395, "learning_rate": 4.12236405292935e-05, "loss": 0.0236, "step": 18770 }, { "epoch": 0.5268621124982465, "grad_norm": 0.050478167831897736, "learning_rate": 4.121896479169589e-05, "loss": 0.0049, "step": 18780 }, { "epoch": 0.527142656754103, "grad_norm": 0.08643370866775513, "learning_rate": 4.1214289054098286e-05, "loss": 0.0406, "step": 18790 }, { "epoch": 0.5274232010099593, "grad_norm": 2.6439170837402344, "learning_rate": 4.120961331650068e-05, "loss": 0.0457, "step": 18800 }, { "epoch": 0.5277037452658156, "grad_norm": 1.3615232706069946, "learning_rate": 4.120493757890307e-05, "loss": 0.0255, "step": 18810 }, { "epoch": 0.5279842895216721, "grad_norm": 0.12703204154968262, "learning_rate": 4.1200261841305466e-05, "loss": 0.0296, "step": 18820 }, { "epoch": 0.5282648337775284, "grad_norm": 0.4491965174674988, "learning_rate": 4.119558610370786e-05, "loss": 0.0232, "step": 18830 }, { "epoch": 0.5285453780333847, "grad_norm": 0.02450815588235855, "learning_rate": 4.119091036611026e-05, "loss": 0.059, "step": 18840 }, { "epoch": 0.5288259222892411, "grad_norm": 0.08353116363286972, "learning_rate": 4.118623462851265e-05, "loss": 0.0312, "step": 18850 }, { "epoch": 0.5291064665450975, "grad_norm": 0.3620278537273407, "learning_rate": 4.1181558890915045e-05, "loss": 0.0264, "step": 18860 }, { "epoch": 0.5293870108009538, "grad_norm": 0.07052111625671387, "learning_rate": 4.117688315331744e-05, "loss": 0.0552, "step": 18870 }, { "epoch": 0.5296675550568102, "grad_norm": 0.6023431420326233, "learning_rate": 4.117220741571983e-05, "loss": 0.0224, "step": 18880 }, { "epoch": 0.5299480993126666, "grad_norm": 0.37989896535873413, "learning_rate": 4.1167531678122224e-05, "loss": 0.0239, "step": 18890 }, { "epoch": 0.530228643568523, "grad_norm": 0.31358328461647034, "learning_rate": 4.116285594052462e-05, "loss": 0.0266, "step": 18900 }, { "epoch": 0.5305091878243793, "grad_norm": 0.3863230049610138, "learning_rate": 4.115818020292702e-05, "loss": 0.0422, "step": 18910 }, { "epoch": 0.5307897320802356, "grad_norm": 0.9421529769897461, "learning_rate": 4.1153504465329404e-05, "loss": 0.0303, "step": 18920 }, { "epoch": 0.531070276336092, "grad_norm": 0.9675644636154175, "learning_rate": 4.1148828727731804e-05, "loss": 0.0231, "step": 18930 }, { "epoch": 0.5313508205919484, "grad_norm": 1.7909609079360962, "learning_rate": 4.11441529901342e-05, "loss": 0.0282, "step": 18940 }, { "epoch": 0.5316313648478047, "grad_norm": 1.0362086296081543, "learning_rate": 4.113947725253659e-05, "loss": 0.0408, "step": 18950 }, { "epoch": 0.5319119091036612, "grad_norm": 0.5174074769020081, "learning_rate": 4.113480151493898e-05, "loss": 0.0302, "step": 18960 }, { "epoch": 0.5321924533595175, "grad_norm": 0.23532475531101227, "learning_rate": 4.1130125777341376e-05, "loss": 0.0173, "step": 18970 }, { "epoch": 0.5324729976153738, "grad_norm": 5.37515926361084, "learning_rate": 4.1125450039743776e-05, "loss": 0.0387, "step": 18980 }, { "epoch": 0.5327535418712301, "grad_norm": 2.844709873199463, "learning_rate": 4.112077430214616e-05, "loss": 0.0253, "step": 18990 }, { "epoch": 0.5330340861270866, "grad_norm": 0.9424228072166443, "learning_rate": 4.111609856454856e-05, "loss": 0.0402, "step": 19000 }, { "epoch": 0.5333146303829429, "grad_norm": 1.022470235824585, "learning_rate": 4.111142282695095e-05, "loss": 0.0207, "step": 19010 }, { "epoch": 0.5335951746387992, "grad_norm": 0.04213989898562431, "learning_rate": 4.110674708935335e-05, "loss": 0.0512, "step": 19020 }, { "epoch": 0.5338757188946557, "grad_norm": 0.40505334734916687, "learning_rate": 4.110207135175574e-05, "loss": 0.022, "step": 19030 }, { "epoch": 0.534156263150512, "grad_norm": 0.205605149269104, "learning_rate": 4.1097395614158135e-05, "loss": 0.0105, "step": 19040 }, { "epoch": 0.5344368074063683, "grad_norm": 7.095761299133301, "learning_rate": 4.109271987656053e-05, "loss": 0.0356, "step": 19050 }, { "epoch": 0.5347173516622247, "grad_norm": 1.8398250341415405, "learning_rate": 4.108804413896292e-05, "loss": 0.0544, "step": 19060 }, { "epoch": 0.5349978959180811, "grad_norm": 1.2698255777359009, "learning_rate": 4.108336840136532e-05, "loss": 0.0703, "step": 19070 }, { "epoch": 0.5352784401739374, "grad_norm": 0.3854914903640747, "learning_rate": 4.107869266376771e-05, "loss": 0.0372, "step": 19080 }, { "epoch": 0.5355589844297938, "grad_norm": 1.2675613164901733, "learning_rate": 4.107401692617011e-05, "loss": 0.0162, "step": 19090 }, { "epoch": 0.5358395286856502, "grad_norm": 0.5420184135437012, "learning_rate": 4.1069341188572494e-05, "loss": 0.0457, "step": 19100 }, { "epoch": 0.5361200729415065, "grad_norm": 1.683388590812683, "learning_rate": 4.1064665450974894e-05, "loss": 0.0255, "step": 19110 }, { "epoch": 0.5364006171973629, "grad_norm": 0.04993033781647682, "learning_rate": 4.105998971337729e-05, "loss": 0.0376, "step": 19120 }, { "epoch": 0.5366811614532192, "grad_norm": 0.041381850838661194, "learning_rate": 4.105531397577968e-05, "loss": 0.0295, "step": 19130 }, { "epoch": 0.5369617057090756, "grad_norm": 0.2550196647644043, "learning_rate": 4.1050638238182074e-05, "loss": 0.0226, "step": 19140 }, { "epoch": 0.537242249964932, "grad_norm": 0.9787159562110901, "learning_rate": 4.104596250058447e-05, "loss": 0.0128, "step": 19150 }, { "epoch": 0.5375227942207883, "grad_norm": 0.0707443505525589, "learning_rate": 4.104128676298687e-05, "loss": 0.0407, "step": 19160 }, { "epoch": 0.5378033384766447, "grad_norm": 0.4026000499725342, "learning_rate": 4.103661102538925e-05, "loss": 0.0335, "step": 19170 }, { "epoch": 0.5380838827325011, "grad_norm": 0.943781316280365, "learning_rate": 4.103193528779165e-05, "loss": 0.0263, "step": 19180 }, { "epoch": 0.5383644269883574, "grad_norm": 0.24711377918720245, "learning_rate": 4.1027259550194046e-05, "loss": 0.0333, "step": 19190 }, { "epoch": 0.5386449712442137, "grad_norm": 1.2564129829406738, "learning_rate": 4.102258381259644e-05, "loss": 0.0288, "step": 19200 }, { "epoch": 0.5389255155000702, "grad_norm": 0.04068749025464058, "learning_rate": 4.101790807499883e-05, "loss": 0.0298, "step": 19210 }, { "epoch": 0.5392060597559265, "grad_norm": 2.0394973754882812, "learning_rate": 4.1013232337401226e-05, "loss": 0.0389, "step": 19220 }, { "epoch": 0.5394866040117828, "grad_norm": 0.19447001814842224, "learning_rate": 4.100855659980362e-05, "loss": 0.0372, "step": 19230 }, { "epoch": 0.5397671482676393, "grad_norm": 0.3524860143661499, "learning_rate": 4.100388086220601e-05, "loss": 0.0459, "step": 19240 }, { "epoch": 0.5400476925234956, "grad_norm": 3.690788507461548, "learning_rate": 4.099920512460841e-05, "loss": 0.0228, "step": 19250 }, { "epoch": 0.5403282367793519, "grad_norm": 6.887862682342529, "learning_rate": 4.0994529387010805e-05, "loss": 0.0615, "step": 19260 }, { "epoch": 0.5406087810352083, "grad_norm": 0.07403536885976791, "learning_rate": 4.09898536494132e-05, "loss": 0.0308, "step": 19270 }, { "epoch": 0.5408893252910647, "grad_norm": 0.3218100666999817, "learning_rate": 4.098517791181559e-05, "loss": 0.0205, "step": 19280 }, { "epoch": 0.541169869546921, "grad_norm": 0.45317497849464417, "learning_rate": 4.0980502174217984e-05, "loss": 0.0071, "step": 19290 }, { "epoch": 0.5414504138027774, "grad_norm": 0.459487646818161, "learning_rate": 4.097582643662038e-05, "loss": 0.0731, "step": 19300 }, { "epoch": 0.5417309580586338, "grad_norm": 0.4544183611869812, "learning_rate": 4.097115069902277e-05, "loss": 0.0391, "step": 19310 }, { "epoch": 0.5420115023144901, "grad_norm": 0.8758169412612915, "learning_rate": 4.0966474961425164e-05, "loss": 0.0658, "step": 19320 }, { "epoch": 0.5422920465703465, "grad_norm": 0.15245503187179565, "learning_rate": 4.0961799223827564e-05, "loss": 0.0245, "step": 19330 }, { "epoch": 0.5425725908262028, "grad_norm": 0.28124746680259705, "learning_rate": 4.095712348622995e-05, "loss": 0.0568, "step": 19340 }, { "epoch": 0.5428531350820592, "grad_norm": 1.5100957155227661, "learning_rate": 4.095244774863235e-05, "loss": 0.0159, "step": 19350 }, { "epoch": 0.5431336793379156, "grad_norm": 0.30853787064552307, "learning_rate": 4.094777201103474e-05, "loss": 0.0172, "step": 19360 }, { "epoch": 0.5434142235937719, "grad_norm": 2.775312900543213, "learning_rate": 4.0943096273437136e-05, "loss": 0.0419, "step": 19370 }, { "epoch": 0.5436947678496282, "grad_norm": 0.13331453502178192, "learning_rate": 4.0938420535839536e-05, "loss": 0.0184, "step": 19380 }, { "epoch": 0.5439753121054847, "grad_norm": 0.10220520198345184, "learning_rate": 4.093374479824192e-05, "loss": 0.031, "step": 19390 }, { "epoch": 0.544255856361341, "grad_norm": 0.497125506401062, "learning_rate": 4.092906906064432e-05, "loss": 0.0188, "step": 19400 }, { "epoch": 0.5445364006171973, "grad_norm": 0.6028520464897156, "learning_rate": 4.092439332304671e-05, "loss": 0.0614, "step": 19410 }, { "epoch": 0.5448169448730538, "grad_norm": 0.1577042043209076, "learning_rate": 4.091971758544911e-05, "loss": 0.0827, "step": 19420 }, { "epoch": 0.5450974891289101, "grad_norm": 0.31161630153656006, "learning_rate": 4.0915041847851495e-05, "loss": 0.0261, "step": 19430 }, { "epoch": 0.5453780333847664, "grad_norm": 0.04292500764131546, "learning_rate": 4.0910366110253895e-05, "loss": 0.0237, "step": 19440 }, { "epoch": 0.5456585776406228, "grad_norm": 0.1653280258178711, "learning_rate": 4.090569037265629e-05, "loss": 0.0182, "step": 19450 }, { "epoch": 0.5459391218964792, "grad_norm": 0.08679775893688202, "learning_rate": 4.090101463505868e-05, "loss": 0.0139, "step": 19460 }, { "epoch": 0.5462196661523355, "grad_norm": 0.12007167935371399, "learning_rate": 4.089633889746108e-05, "loss": 0.0235, "step": 19470 }, { "epoch": 0.5465002104081919, "grad_norm": 0.037514958530664444, "learning_rate": 4.089166315986347e-05, "loss": 0.0354, "step": 19480 }, { "epoch": 0.5467807546640483, "grad_norm": 0.7939467430114746, "learning_rate": 4.088698742226587e-05, "loss": 0.0149, "step": 19490 }, { "epoch": 0.5470612989199046, "grad_norm": 0.9538915157318115, "learning_rate": 4.0882311684668254e-05, "loss": 0.0332, "step": 19500 }, { "epoch": 0.547341843175761, "grad_norm": 0.26868224143981934, "learning_rate": 4.0877635947070654e-05, "loss": 0.0339, "step": 19510 }, { "epoch": 0.5476223874316173, "grad_norm": 0.9061500430107117, "learning_rate": 4.087296020947305e-05, "loss": 0.0121, "step": 19520 }, { "epoch": 0.5479029316874737, "grad_norm": 0.062476254999637604, "learning_rate": 4.086828447187544e-05, "loss": 0.0349, "step": 19530 }, { "epoch": 0.5481834759433301, "grad_norm": 0.32579490542411804, "learning_rate": 4.0863608734277834e-05, "loss": 0.0628, "step": 19540 }, { "epoch": 0.5484640201991864, "grad_norm": 0.9794478416442871, "learning_rate": 4.085893299668023e-05, "loss": 0.0632, "step": 19550 }, { "epoch": 0.5487445644550428, "grad_norm": 0.05253671854734421, "learning_rate": 4.085425725908262e-05, "loss": 0.0205, "step": 19560 }, { "epoch": 0.5490251087108992, "grad_norm": 0.26971033215522766, "learning_rate": 4.084958152148501e-05, "loss": 0.0401, "step": 19570 }, { "epoch": 0.5493056529667555, "grad_norm": 0.04770753160119057, "learning_rate": 4.084490578388741e-05, "loss": 0.0304, "step": 19580 }, { "epoch": 0.5495861972226118, "grad_norm": 0.07667747139930725, "learning_rate": 4.0840230046289806e-05, "loss": 0.035, "step": 19590 }, { "epoch": 0.5498667414784683, "grad_norm": 0.10346951335668564, "learning_rate": 4.08355543086922e-05, "loss": 0.0347, "step": 19600 }, { "epoch": 0.5501472857343246, "grad_norm": 0.6135060787200928, "learning_rate": 4.083087857109459e-05, "loss": 0.0358, "step": 19610 }, { "epoch": 0.5504278299901809, "grad_norm": 0.3799002468585968, "learning_rate": 4.0826202833496986e-05, "loss": 0.0442, "step": 19620 }, { "epoch": 0.5507083742460374, "grad_norm": 0.0635291263461113, "learning_rate": 4.082152709589938e-05, "loss": 0.0219, "step": 19630 }, { "epoch": 0.5509889185018937, "grad_norm": 0.5338521003723145, "learning_rate": 4.081685135830177e-05, "loss": 0.0264, "step": 19640 }, { "epoch": 0.55126946275775, "grad_norm": 0.07751046121120453, "learning_rate": 4.0812175620704165e-05, "loss": 0.0218, "step": 19650 }, { "epoch": 0.5515500070136063, "grad_norm": 2.100149393081665, "learning_rate": 4.0807499883106565e-05, "loss": 0.0223, "step": 19660 }, { "epoch": 0.5518305512694628, "grad_norm": 0.24986933171749115, "learning_rate": 4.080282414550896e-05, "loss": 0.0416, "step": 19670 }, { "epoch": 0.5521110955253191, "grad_norm": 5.835756301879883, "learning_rate": 4.079814840791135e-05, "loss": 0.0321, "step": 19680 }, { "epoch": 0.5523916397811754, "grad_norm": 1.536501407623291, "learning_rate": 4.0793472670313745e-05, "loss": 0.0511, "step": 19690 }, { "epoch": 0.5526721840370319, "grad_norm": 0.8779574632644653, "learning_rate": 4.078879693271614e-05, "loss": 0.026, "step": 19700 }, { "epoch": 0.5529527282928882, "grad_norm": 0.6290484666824341, "learning_rate": 4.078412119511853e-05, "loss": 0.0252, "step": 19710 }, { "epoch": 0.5532332725487445, "grad_norm": 1.2037246227264404, "learning_rate": 4.0779445457520924e-05, "loss": 0.0325, "step": 19720 }, { "epoch": 0.5535138168046009, "grad_norm": 0.15251211822032928, "learning_rate": 4.0774769719923324e-05, "loss": 0.0171, "step": 19730 }, { "epoch": 0.5537943610604573, "grad_norm": 1.4458023309707642, "learning_rate": 4.077009398232571e-05, "loss": 0.0301, "step": 19740 }, { "epoch": 0.5540749053163136, "grad_norm": 2.015052556991577, "learning_rate": 4.076541824472811e-05, "loss": 0.0302, "step": 19750 }, { "epoch": 0.55435544957217, "grad_norm": 0.019201157614588737, "learning_rate": 4.0760742507130503e-05, "loss": 0.0167, "step": 19760 }, { "epoch": 0.5546359938280264, "grad_norm": 0.2778054475784302, "learning_rate": 4.0756066769532897e-05, "loss": 0.0427, "step": 19770 }, { "epoch": 0.5549165380838827, "grad_norm": 0.19862985610961914, "learning_rate": 4.075139103193529e-05, "loss": 0.0358, "step": 19780 }, { "epoch": 0.5551970823397391, "grad_norm": 0.09296761453151703, "learning_rate": 4.074671529433768e-05, "loss": 0.0628, "step": 19790 }, { "epoch": 0.5554776265955954, "grad_norm": 3.0911977291107178, "learning_rate": 4.074203955674008e-05, "loss": 0.0551, "step": 19800 }, { "epoch": 0.5557581708514518, "grad_norm": 0.837997317314148, "learning_rate": 4.073736381914247e-05, "loss": 0.0288, "step": 19810 }, { "epoch": 0.5560387151073082, "grad_norm": 0.15092921257019043, "learning_rate": 4.073268808154487e-05, "loss": 0.0325, "step": 19820 }, { "epoch": 0.5563192593631645, "grad_norm": 0.36328646540641785, "learning_rate": 4.0728012343947256e-05, "loss": 0.0314, "step": 19830 }, { "epoch": 0.556599803619021, "grad_norm": 1.7929000854492188, "learning_rate": 4.0723336606349655e-05, "loss": 0.0292, "step": 19840 }, { "epoch": 0.5568803478748773, "grad_norm": 0.23768125474452972, "learning_rate": 4.071866086875205e-05, "loss": 0.013, "step": 19850 }, { "epoch": 0.5571608921307336, "grad_norm": 0.019399071112275124, "learning_rate": 4.071398513115444e-05, "loss": 0.0252, "step": 19860 }, { "epoch": 0.5574414363865899, "grad_norm": 0.025325128808617592, "learning_rate": 4.0709309393556835e-05, "loss": 0.0176, "step": 19870 }, { "epoch": 0.5577219806424464, "grad_norm": 0.4122799038887024, "learning_rate": 4.070463365595923e-05, "loss": 0.0326, "step": 19880 }, { "epoch": 0.5580025248983027, "grad_norm": 0.17568106949329376, "learning_rate": 4.069995791836163e-05, "loss": 0.0418, "step": 19890 }, { "epoch": 0.558283069154159, "grad_norm": 0.2119341939687729, "learning_rate": 4.0695282180764014e-05, "loss": 0.0237, "step": 19900 }, { "epoch": 0.5585636134100155, "grad_norm": 0.3159184157848358, "learning_rate": 4.0690606443166414e-05, "loss": 0.0364, "step": 19910 }, { "epoch": 0.5588441576658718, "grad_norm": 1.2619775533676147, "learning_rate": 4.06859307055688e-05, "loss": 0.0325, "step": 19920 }, { "epoch": 0.5591247019217281, "grad_norm": 0.4267543852329254, "learning_rate": 4.06812549679712e-05, "loss": 0.0281, "step": 19930 }, { "epoch": 0.5594052461775845, "grad_norm": 0.34703579545021057, "learning_rate": 4.0676579230373594e-05, "loss": 0.0695, "step": 19940 }, { "epoch": 0.5596857904334409, "grad_norm": 1.1621999740600586, "learning_rate": 4.067190349277599e-05, "loss": 0.0356, "step": 19950 }, { "epoch": 0.5599663346892972, "grad_norm": 1.1411892175674438, "learning_rate": 4.066722775517838e-05, "loss": 0.0481, "step": 19960 }, { "epoch": 0.5602468789451536, "grad_norm": 0.24931780993938446, "learning_rate": 4.066255201758077e-05, "loss": 0.0565, "step": 19970 }, { "epoch": 0.56052742320101, "grad_norm": 0.6953533887863159, "learning_rate": 4.065787627998317e-05, "loss": 0.0308, "step": 19980 }, { "epoch": 0.5608079674568663, "grad_norm": 1.5652605295181274, "learning_rate": 4.065320054238556e-05, "loss": 0.0589, "step": 19990 }, { "epoch": 0.5610885117127227, "grad_norm": 0.16156072914600372, "learning_rate": 4.064852480478796e-05, "loss": 0.0254, "step": 20000 }, { "epoch": 0.561369055968579, "grad_norm": 0.050837986171245575, "learning_rate": 4.064384906719035e-05, "loss": 0.0202, "step": 20010 }, { "epoch": 0.5616496002244354, "grad_norm": 1.2185817956924438, "learning_rate": 4.0639173329592746e-05, "loss": 0.0146, "step": 20020 }, { "epoch": 0.5619301444802918, "grad_norm": 0.26383906602859497, "learning_rate": 4.063449759199514e-05, "loss": 0.0521, "step": 20030 }, { "epoch": 0.5622106887361481, "grad_norm": 0.4181283712387085, "learning_rate": 4.062982185439753e-05, "loss": 0.0631, "step": 20040 }, { "epoch": 0.5624912329920045, "grad_norm": 0.14508719742298126, "learning_rate": 4.0625146116799925e-05, "loss": 0.0204, "step": 20050 }, { "epoch": 0.5627717772478609, "grad_norm": 2.9861137866973877, "learning_rate": 4.062047037920232e-05, "loss": 0.0188, "step": 20060 }, { "epoch": 0.5630523215037172, "grad_norm": 0.37671130895614624, "learning_rate": 4.061579464160472e-05, "loss": 0.0222, "step": 20070 }, { "epoch": 0.5633328657595735, "grad_norm": 1.2679451704025269, "learning_rate": 4.061111890400711e-05, "loss": 0.0441, "step": 20080 }, { "epoch": 0.56361341001543, "grad_norm": 0.04683038592338562, "learning_rate": 4.0606443166409505e-05, "loss": 0.0163, "step": 20090 }, { "epoch": 0.5638939542712863, "grad_norm": 2.7023072242736816, "learning_rate": 4.06017674288119e-05, "loss": 0.0575, "step": 20100 }, { "epoch": 0.5641744985271426, "grad_norm": 0.455967515707016, "learning_rate": 4.059709169121429e-05, "loss": 0.0356, "step": 20110 }, { "epoch": 0.5644550427829991, "grad_norm": 1.1120338439941406, "learning_rate": 4.0592415953616684e-05, "loss": 0.0475, "step": 20120 }, { "epoch": 0.5647355870388554, "grad_norm": 2.094675302505493, "learning_rate": 4.058774021601908e-05, "loss": 0.0296, "step": 20130 }, { "epoch": 0.5650161312947117, "grad_norm": 0.15356062352657318, "learning_rate": 4.058306447842147e-05, "loss": 0.0327, "step": 20140 }, { "epoch": 0.5652966755505681, "grad_norm": 0.0820997804403305, "learning_rate": 4.057838874082387e-05, "loss": 0.0267, "step": 20150 }, { "epoch": 0.5655772198064245, "grad_norm": 0.9279227256774902, "learning_rate": 4.0573713003226264e-05, "loss": 0.0445, "step": 20160 }, { "epoch": 0.5658577640622808, "grad_norm": 1.3179893493652344, "learning_rate": 4.056903726562866e-05, "loss": 0.0251, "step": 20170 }, { "epoch": 0.5661383083181372, "grad_norm": 0.9021573066711426, "learning_rate": 4.056436152803105e-05, "loss": 0.0349, "step": 20180 }, { "epoch": 0.5664188525739935, "grad_norm": 0.06707983464002609, "learning_rate": 4.055968579043344e-05, "loss": 0.035, "step": 20190 }, { "epoch": 0.5666993968298499, "grad_norm": 0.12450725585222244, "learning_rate": 4.0555010052835836e-05, "loss": 0.0454, "step": 20200 }, { "epoch": 0.5669799410857063, "grad_norm": 0.7484726309776306, "learning_rate": 4.055033431523823e-05, "loss": 0.0413, "step": 20210 }, { "epoch": 0.5672604853415626, "grad_norm": 0.6312171220779419, "learning_rate": 4.054565857764063e-05, "loss": 0.0412, "step": 20220 }, { "epoch": 0.567541029597419, "grad_norm": 0.21045097708702087, "learning_rate": 4.0540982840043016e-05, "loss": 0.0398, "step": 20230 }, { "epoch": 0.5678215738532754, "grad_norm": 2.014679431915283, "learning_rate": 4.0536307102445416e-05, "loss": 0.0419, "step": 20240 }, { "epoch": 0.5681021181091317, "grad_norm": 0.2394479662179947, "learning_rate": 4.05316313648478e-05, "loss": 0.0266, "step": 20250 }, { "epoch": 0.568382662364988, "grad_norm": 0.09074956923723221, "learning_rate": 4.05269556272502e-05, "loss": 0.0167, "step": 20260 }, { "epoch": 0.5686632066208445, "grad_norm": 1.2692089080810547, "learning_rate": 4.0522279889652595e-05, "loss": 0.0206, "step": 20270 }, { "epoch": 0.5689437508767008, "grad_norm": 0.13372349739074707, "learning_rate": 4.051760415205499e-05, "loss": 0.0408, "step": 20280 }, { "epoch": 0.5692242951325571, "grad_norm": 0.39984947443008423, "learning_rate": 4.051292841445739e-05, "loss": 0.0442, "step": 20290 }, { "epoch": 0.5695048393884136, "grad_norm": 0.3942771852016449, "learning_rate": 4.0508252676859774e-05, "loss": 0.0109, "step": 20300 }, { "epoch": 0.5697853836442699, "grad_norm": 0.038874492049217224, "learning_rate": 4.0503576939262174e-05, "loss": 0.05, "step": 20310 }, { "epoch": 0.5700659279001262, "grad_norm": 0.05763343349099159, "learning_rate": 4.049890120166456e-05, "loss": 0.0074, "step": 20320 }, { "epoch": 0.5703464721559826, "grad_norm": 0.12217506766319275, "learning_rate": 4.049422546406696e-05, "loss": 0.0153, "step": 20330 }, { "epoch": 0.570627016411839, "grad_norm": 0.5451664328575134, "learning_rate": 4.048954972646935e-05, "loss": 0.1008, "step": 20340 }, { "epoch": 0.5709075606676953, "grad_norm": 0.5903341770172119, "learning_rate": 4.048487398887175e-05, "loss": 0.0475, "step": 20350 }, { "epoch": 0.5711881049235517, "grad_norm": 0.495495468378067, "learning_rate": 4.048019825127414e-05, "loss": 0.0103, "step": 20360 }, { "epoch": 0.5714686491794081, "grad_norm": 0.059109900146722794, "learning_rate": 4.047552251367653e-05, "loss": 0.0073, "step": 20370 }, { "epoch": 0.5717491934352644, "grad_norm": 0.28177472949028015, "learning_rate": 4.047084677607893e-05, "loss": 0.056, "step": 20380 }, { "epoch": 0.5720297376911208, "grad_norm": 0.5446450114250183, "learning_rate": 4.046617103848132e-05, "loss": 0.0384, "step": 20390 }, { "epoch": 0.5723102819469771, "grad_norm": 0.6659284234046936, "learning_rate": 4.046149530088372e-05, "loss": 0.0294, "step": 20400 }, { "epoch": 0.5725908262028335, "grad_norm": 2.697371482849121, "learning_rate": 4.0456819563286106e-05, "loss": 0.0715, "step": 20410 }, { "epoch": 0.5728713704586899, "grad_norm": 1.3898919820785522, "learning_rate": 4.0452143825688506e-05, "loss": 0.0707, "step": 20420 }, { "epoch": 0.5731519147145462, "grad_norm": 0.06216865032911301, "learning_rate": 4.04474680880909e-05, "loss": 0.0228, "step": 20430 }, { "epoch": 0.5734324589704026, "grad_norm": 0.881115198135376, "learning_rate": 4.044279235049329e-05, "loss": 0.019, "step": 20440 }, { "epoch": 0.573713003226259, "grad_norm": 0.1677897870540619, "learning_rate": 4.0438116612895685e-05, "loss": 0.0172, "step": 20450 }, { "epoch": 0.5739935474821153, "grad_norm": 0.1795552372932434, "learning_rate": 4.043344087529808e-05, "loss": 0.038, "step": 20460 }, { "epoch": 0.5742740917379716, "grad_norm": 0.4600549340248108, "learning_rate": 4.042876513770047e-05, "loss": 0.0345, "step": 20470 }, { "epoch": 0.5745546359938281, "grad_norm": 0.09425505995750427, "learning_rate": 4.0424089400102865e-05, "loss": 0.0205, "step": 20480 }, { "epoch": 0.5748351802496844, "grad_norm": 4.61372184753418, "learning_rate": 4.0419413662505265e-05, "loss": 0.036, "step": 20490 }, { "epoch": 0.5751157245055407, "grad_norm": 0.27028095722198486, "learning_rate": 4.041473792490766e-05, "loss": 0.0029, "step": 20500 }, { "epoch": 0.5753962687613972, "grad_norm": 0.03563789650797844, "learning_rate": 4.041006218731005e-05, "loss": 0.0066, "step": 20510 }, { "epoch": 0.5756768130172535, "grad_norm": 0.023071080446243286, "learning_rate": 4.0405386449712444e-05, "loss": 0.0105, "step": 20520 }, { "epoch": 0.5759573572731098, "grad_norm": 1.8232399225234985, "learning_rate": 4.040071071211484e-05, "loss": 0.0478, "step": 20530 }, { "epoch": 0.5762379015289661, "grad_norm": 1.840932846069336, "learning_rate": 4.039603497451723e-05, "loss": 0.0248, "step": 20540 }, { "epoch": 0.5765184457848226, "grad_norm": 0.04721730947494507, "learning_rate": 4.0391359236919624e-05, "loss": 0.0082, "step": 20550 }, { "epoch": 0.5767989900406789, "grad_norm": 0.11185865849256516, "learning_rate": 4.038668349932202e-05, "loss": 0.025, "step": 20560 }, { "epoch": 0.5770795342965352, "grad_norm": 0.031456075608730316, "learning_rate": 4.038200776172442e-05, "loss": 0.012, "step": 20570 }, { "epoch": 0.5773600785523917, "grad_norm": 0.6339656114578247, "learning_rate": 4.037733202412681e-05, "loss": 0.0255, "step": 20580 }, { "epoch": 0.577640622808248, "grad_norm": 0.20911407470703125, "learning_rate": 4.03726562865292e-05, "loss": 0.0419, "step": 20590 }, { "epoch": 0.5779211670641043, "grad_norm": 0.06142083927989006, "learning_rate": 4.0367980548931596e-05, "loss": 0.0671, "step": 20600 }, { "epoch": 0.5782017113199607, "grad_norm": 0.8960585594177246, "learning_rate": 4.036330481133399e-05, "loss": 0.0358, "step": 20610 }, { "epoch": 0.5784822555758171, "grad_norm": 0.07892145961523056, "learning_rate": 4.035862907373638e-05, "loss": 0.0111, "step": 20620 }, { "epoch": 0.5787627998316734, "grad_norm": 0.44969722628593445, "learning_rate": 4.0353953336138776e-05, "loss": 0.0675, "step": 20630 }, { "epoch": 0.5790433440875298, "grad_norm": 0.273189514875412, "learning_rate": 4.0349277598541176e-05, "loss": 0.0588, "step": 20640 }, { "epoch": 0.5793238883433862, "grad_norm": 0.05255478620529175, "learning_rate": 4.034460186094356e-05, "loss": 0.0282, "step": 20650 }, { "epoch": 0.5796044325992425, "grad_norm": 0.27509427070617676, "learning_rate": 4.033992612334596e-05, "loss": 0.029, "step": 20660 }, { "epoch": 0.5798849768550989, "grad_norm": 0.07424774765968323, "learning_rate": 4.0335250385748355e-05, "loss": 0.0294, "step": 20670 }, { "epoch": 0.5801655211109552, "grad_norm": 0.15513528883457184, "learning_rate": 4.033057464815075e-05, "loss": 0.0294, "step": 20680 }, { "epoch": 0.5804460653668116, "grad_norm": 0.03644031658768654, "learning_rate": 4.032589891055314e-05, "loss": 0.0058, "step": 20690 }, { "epoch": 0.580726609622668, "grad_norm": 0.020484765991568565, "learning_rate": 4.0321223172955535e-05, "loss": 0.0363, "step": 20700 }, { "epoch": 0.5810071538785243, "grad_norm": 3.074047088623047, "learning_rate": 4.0316547435357934e-05, "loss": 0.0578, "step": 20710 }, { "epoch": 0.5812876981343807, "grad_norm": 0.09104529768228531, "learning_rate": 4.031187169776032e-05, "loss": 0.0218, "step": 20720 }, { "epoch": 0.5815682423902371, "grad_norm": 0.1549922674894333, "learning_rate": 4.030719596016272e-05, "loss": 0.0409, "step": 20730 }, { "epoch": 0.5818487866460934, "grad_norm": 0.2108004242181778, "learning_rate": 4.030252022256511e-05, "loss": 0.0279, "step": 20740 }, { "epoch": 0.5821293309019497, "grad_norm": 2.024721145629883, "learning_rate": 4.029784448496751e-05, "loss": 0.0516, "step": 20750 }, { "epoch": 0.5824098751578062, "grad_norm": 0.5967586636543274, "learning_rate": 4.02931687473699e-05, "loss": 0.0162, "step": 20760 }, { "epoch": 0.5826904194136625, "grad_norm": 0.3866839110851288, "learning_rate": 4.0288493009772293e-05, "loss": 0.0193, "step": 20770 }, { "epoch": 0.5829709636695188, "grad_norm": 0.0846666619181633, "learning_rate": 4.0283817272174687e-05, "loss": 0.0351, "step": 20780 }, { "epoch": 0.5832515079253753, "grad_norm": 2.11333966255188, "learning_rate": 4.027914153457708e-05, "loss": 0.0246, "step": 20790 }, { "epoch": 0.5835320521812316, "grad_norm": 0.21556423604488373, "learning_rate": 4.027446579697948e-05, "loss": 0.0409, "step": 20800 }, { "epoch": 0.5838125964370879, "grad_norm": 0.23706954717636108, "learning_rate": 4.0269790059381866e-05, "loss": 0.0167, "step": 20810 }, { "epoch": 0.5840931406929443, "grad_norm": 0.32247593998908997, "learning_rate": 4.0265114321784266e-05, "loss": 0.0276, "step": 20820 }, { "epoch": 0.5843736849488007, "grad_norm": 0.2005029171705246, "learning_rate": 4.026043858418665e-05, "loss": 0.0157, "step": 20830 }, { "epoch": 0.584654229204657, "grad_norm": 0.2338503748178482, "learning_rate": 4.025576284658905e-05, "loss": 0.0152, "step": 20840 }, { "epoch": 0.5849347734605134, "grad_norm": 9.113499641418457, "learning_rate": 4.0251087108991445e-05, "loss": 0.0413, "step": 20850 }, { "epoch": 0.5852153177163698, "grad_norm": 0.5816238522529602, "learning_rate": 4.024641137139384e-05, "loss": 0.0335, "step": 20860 }, { "epoch": 0.5854958619722261, "grad_norm": 6.852818012237549, "learning_rate": 4.024173563379623e-05, "loss": 0.0306, "step": 20870 }, { "epoch": 0.5857764062280825, "grad_norm": 1.200068712234497, "learning_rate": 4.0237059896198625e-05, "loss": 0.2091, "step": 20880 }, { "epoch": 0.5860569504839388, "grad_norm": 0.5409352779388428, "learning_rate": 4.0232384158601025e-05, "loss": 0.1231, "step": 20890 }, { "epoch": 0.5863374947397952, "grad_norm": 0.3164230287075043, "learning_rate": 4.022770842100341e-05, "loss": 0.0624, "step": 20900 }, { "epoch": 0.5866180389956516, "grad_norm": 21.228532791137695, "learning_rate": 4.022303268340581e-05, "loss": 0.0638, "step": 20910 }, { "epoch": 0.5868985832515079, "grad_norm": 0.15498997271060944, "learning_rate": 4.0218356945808204e-05, "loss": 0.0471, "step": 20920 }, { "epoch": 0.5871791275073643, "grad_norm": 0.2108922153711319, "learning_rate": 4.02136812082106e-05, "loss": 0.0568, "step": 20930 }, { "epoch": 0.5874596717632207, "grad_norm": 0.5054469704627991, "learning_rate": 4.020900547061299e-05, "loss": 0.024, "step": 20940 }, { "epoch": 0.587740216019077, "grad_norm": 0.2880743741989136, "learning_rate": 4.0204329733015384e-05, "loss": 0.0179, "step": 20950 }, { "epoch": 0.5880207602749333, "grad_norm": 0.2623721659183502, "learning_rate": 4.019965399541778e-05, "loss": 0.0465, "step": 20960 }, { "epoch": 0.5883013045307898, "grad_norm": 1.1736289262771606, "learning_rate": 4.019497825782017e-05, "loss": 0.0191, "step": 20970 }, { "epoch": 0.5885818487866461, "grad_norm": 1.979690670967102, "learning_rate": 4.019030252022257e-05, "loss": 0.0294, "step": 20980 }, { "epoch": 0.5888623930425024, "grad_norm": 0.35237938165664673, "learning_rate": 4.018562678262496e-05, "loss": 0.0134, "step": 20990 }, { "epoch": 0.5891429372983589, "grad_norm": 0.9665606021881104, "learning_rate": 4.0180951045027356e-05, "loss": 0.0194, "step": 21000 }, { "epoch": 0.5894234815542152, "grad_norm": 0.20433494448661804, "learning_rate": 4.017627530742975e-05, "loss": 0.0222, "step": 21010 }, { "epoch": 0.5897040258100715, "grad_norm": 0.703123152256012, "learning_rate": 4.017159956983214e-05, "loss": 0.016, "step": 21020 }, { "epoch": 0.5899845700659279, "grad_norm": 0.5649062991142273, "learning_rate": 4.0166923832234536e-05, "loss": 0.0323, "step": 21030 }, { "epoch": 0.5902651143217843, "grad_norm": 0.7857903838157654, "learning_rate": 4.016224809463693e-05, "loss": 0.029, "step": 21040 }, { "epoch": 0.5905456585776406, "grad_norm": 1.3822251558303833, "learning_rate": 4.015757235703932e-05, "loss": 0.038, "step": 21050 }, { "epoch": 0.590826202833497, "grad_norm": 0.0858098492026329, "learning_rate": 4.015289661944172e-05, "loss": 0.0247, "step": 21060 }, { "epoch": 0.5911067470893533, "grad_norm": 0.27351874113082886, "learning_rate": 4.0148220881844115e-05, "loss": 0.0201, "step": 21070 }, { "epoch": 0.5913872913452097, "grad_norm": 0.021933024749159813, "learning_rate": 4.014354514424651e-05, "loss": 0.0092, "step": 21080 }, { "epoch": 0.5916678356010661, "grad_norm": 0.4493241310119629, "learning_rate": 4.01388694066489e-05, "loss": 0.0111, "step": 21090 }, { "epoch": 0.5919483798569224, "grad_norm": 0.6204180121421814, "learning_rate": 4.0134193669051295e-05, "loss": 0.0358, "step": 21100 }, { "epoch": 0.5922289241127788, "grad_norm": 0.8422446846961975, "learning_rate": 4.012951793145369e-05, "loss": 0.0237, "step": 21110 }, { "epoch": 0.5925094683686352, "grad_norm": 0.24650032818317413, "learning_rate": 4.012484219385608e-05, "loss": 0.0692, "step": 21120 }, { "epoch": 0.5927900126244915, "grad_norm": 0.04378095269203186, "learning_rate": 4.012016645625848e-05, "loss": 0.0211, "step": 21130 }, { "epoch": 0.5930705568803478, "grad_norm": 6.182351589202881, "learning_rate": 4.011549071866087e-05, "loss": 0.0287, "step": 21140 }, { "epoch": 0.5933511011362043, "grad_norm": 0.07188566029071808, "learning_rate": 4.011081498106327e-05, "loss": 0.0283, "step": 21150 }, { "epoch": 0.5936316453920606, "grad_norm": 0.7525047063827515, "learning_rate": 4.0106139243465654e-05, "loss": 0.0427, "step": 21160 }, { "epoch": 0.5939121896479169, "grad_norm": 0.17341378331184387, "learning_rate": 4.0101463505868054e-05, "loss": 0.0184, "step": 21170 }, { "epoch": 0.5941927339037734, "grad_norm": 0.07128272205591202, "learning_rate": 4.009678776827045e-05, "loss": 0.0564, "step": 21180 }, { "epoch": 0.5944732781596297, "grad_norm": 0.08597701787948608, "learning_rate": 4.009211203067284e-05, "loss": 0.0306, "step": 21190 }, { "epoch": 0.594753822415486, "grad_norm": 0.06127036735415459, "learning_rate": 4.008743629307524e-05, "loss": 0.0078, "step": 21200 }, { "epoch": 0.5950343666713424, "grad_norm": 0.05376275256276131, "learning_rate": 4.0082760555477626e-05, "loss": 0.0293, "step": 21210 }, { "epoch": 0.5953149109271988, "grad_norm": 0.5750849843025208, "learning_rate": 4.0078084817880026e-05, "loss": 0.044, "step": 21220 }, { "epoch": 0.5955954551830551, "grad_norm": 0.047023601830005646, "learning_rate": 4.007340908028241e-05, "loss": 0.0135, "step": 21230 }, { "epoch": 0.5958759994389115, "grad_norm": 5.863286018371582, "learning_rate": 4.006873334268481e-05, "loss": 0.0425, "step": 21240 }, { "epoch": 0.5961565436947679, "grad_norm": 0.4212249517440796, "learning_rate": 4.00640576050872e-05, "loss": 0.0385, "step": 21250 }, { "epoch": 0.5964370879506242, "grad_norm": 0.7207387089729309, "learning_rate": 4.00593818674896e-05, "loss": 0.0465, "step": 21260 }, { "epoch": 0.5967176322064806, "grad_norm": 0.3838941752910614, "learning_rate": 4.005470612989199e-05, "loss": 0.0275, "step": 21270 }, { "epoch": 0.5969981764623369, "grad_norm": 0.09041538834571838, "learning_rate": 4.0050030392294385e-05, "loss": 0.0704, "step": 21280 }, { "epoch": 0.5972787207181933, "grad_norm": 0.3710717558860779, "learning_rate": 4.0045354654696785e-05, "loss": 0.015, "step": 21290 }, { "epoch": 0.5975592649740497, "grad_norm": 0.35654574632644653, "learning_rate": 4.004067891709917e-05, "loss": 0.0174, "step": 21300 }, { "epoch": 0.597839809229906, "grad_norm": 0.5406192541122437, "learning_rate": 4.003600317950157e-05, "loss": 0.0101, "step": 21310 }, { "epoch": 0.5981203534857624, "grad_norm": 0.28688111901283264, "learning_rate": 4.003132744190396e-05, "loss": 0.0457, "step": 21320 }, { "epoch": 0.5984008977416188, "grad_norm": 5.866365432739258, "learning_rate": 4.002665170430636e-05, "loss": 0.0464, "step": 21330 }, { "epoch": 0.5986814419974751, "grad_norm": 1.4806606769561768, "learning_rate": 4.002197596670875e-05, "loss": 0.0244, "step": 21340 }, { "epoch": 0.5989619862533314, "grad_norm": 0.3196195960044861, "learning_rate": 4.0017300229111144e-05, "loss": 0.0239, "step": 21350 }, { "epoch": 0.5992425305091879, "grad_norm": 0.17284809052944183, "learning_rate": 4.001262449151354e-05, "loss": 0.0364, "step": 21360 }, { "epoch": 0.5995230747650442, "grad_norm": 0.04275937005877495, "learning_rate": 4.000794875391593e-05, "loss": 0.0477, "step": 21370 }, { "epoch": 0.5998036190209005, "grad_norm": 0.40519699454307556, "learning_rate": 4.000327301631832e-05, "loss": 0.0053, "step": 21380 }, { "epoch": 0.600084163276757, "grad_norm": 0.49567586183547974, "learning_rate": 3.9998597278720716e-05, "loss": 0.0587, "step": 21390 }, { "epoch": 0.6003647075326133, "grad_norm": 0.2940233051776886, "learning_rate": 3.9993921541123116e-05, "loss": 0.0526, "step": 21400 }, { "epoch": 0.6006452517884696, "grad_norm": 0.0607791393995285, "learning_rate": 3.998924580352551e-05, "loss": 0.0308, "step": 21410 }, { "epoch": 0.6009257960443259, "grad_norm": 0.5268030762672424, "learning_rate": 3.99845700659279e-05, "loss": 0.0267, "step": 21420 }, { "epoch": 0.6012063403001824, "grad_norm": 1.2135268449783325, "learning_rate": 3.9979894328330296e-05, "loss": 0.0321, "step": 21430 }, { "epoch": 0.6014868845560387, "grad_norm": 0.06455528736114502, "learning_rate": 3.997521859073269e-05, "loss": 0.0128, "step": 21440 }, { "epoch": 0.601767428811895, "grad_norm": 2.326077699661255, "learning_rate": 3.997054285313508e-05, "loss": 0.0455, "step": 21450 }, { "epoch": 0.6020479730677515, "grad_norm": 0.1857239007949829, "learning_rate": 3.9965867115537475e-05, "loss": 0.0207, "step": 21460 }, { "epoch": 0.6023285173236078, "grad_norm": 1.4210498332977295, "learning_rate": 3.996119137793987e-05, "loss": 0.05, "step": 21470 }, { "epoch": 0.6026090615794641, "grad_norm": 0.22208678722381592, "learning_rate": 3.995651564034227e-05, "loss": 0.0437, "step": 21480 }, { "epoch": 0.6028896058353205, "grad_norm": 0.48494136333465576, "learning_rate": 3.995183990274466e-05, "loss": 0.0268, "step": 21490 }, { "epoch": 0.6031701500911769, "grad_norm": 1.9521896839141846, "learning_rate": 3.9947164165147055e-05, "loss": 0.0203, "step": 21500 }, { "epoch": 0.6034506943470332, "grad_norm": 0.011543912813067436, "learning_rate": 3.994248842754945e-05, "loss": 0.0499, "step": 21510 }, { "epoch": 0.6037312386028896, "grad_norm": 0.07688061147928238, "learning_rate": 3.993781268995184e-05, "loss": 0.0338, "step": 21520 }, { "epoch": 0.604011782858746, "grad_norm": 1.384993076324463, "learning_rate": 3.9933136952354234e-05, "loss": 0.0312, "step": 21530 }, { "epoch": 0.6042923271146023, "grad_norm": 0.17827478051185608, "learning_rate": 3.992846121475663e-05, "loss": 0.0329, "step": 21540 }, { "epoch": 0.6045728713704587, "grad_norm": 0.2286425232887268, "learning_rate": 3.992378547715903e-05, "loss": 0.0133, "step": 21550 }, { "epoch": 0.604853415626315, "grad_norm": 0.021580735221505165, "learning_rate": 3.9919109739561414e-05, "loss": 0.0148, "step": 21560 }, { "epoch": 0.6051339598821714, "grad_norm": 0.6690900921821594, "learning_rate": 3.9914434001963814e-05, "loss": 0.029, "step": 21570 }, { "epoch": 0.6054145041380278, "grad_norm": 4.744344711303711, "learning_rate": 3.990975826436621e-05, "loss": 0.0486, "step": 21580 }, { "epoch": 0.6056950483938841, "grad_norm": 0.11016254127025604, "learning_rate": 3.99050825267686e-05, "loss": 0.0283, "step": 21590 }, { "epoch": 0.6059755926497405, "grad_norm": 0.13750553131103516, "learning_rate": 3.990040678917099e-05, "loss": 0.0402, "step": 21600 }, { "epoch": 0.6062561369055969, "grad_norm": 0.2076699584722519, "learning_rate": 3.9895731051573386e-05, "loss": 0.0166, "step": 21610 }, { "epoch": 0.6065366811614532, "grad_norm": 0.16609862446784973, "learning_rate": 3.9891055313975786e-05, "loss": 0.0354, "step": 21620 }, { "epoch": 0.6068172254173095, "grad_norm": 0.48489612340927124, "learning_rate": 3.988637957637817e-05, "loss": 0.0572, "step": 21630 }, { "epoch": 0.607097769673166, "grad_norm": 0.9227844476699829, "learning_rate": 3.988170383878057e-05, "loss": 0.0315, "step": 21640 }, { "epoch": 0.6073783139290223, "grad_norm": 0.4963599145412445, "learning_rate": 3.987702810118296e-05, "loss": 0.0212, "step": 21650 }, { "epoch": 0.6076588581848786, "grad_norm": 0.10062550008296967, "learning_rate": 3.987235236358536e-05, "loss": 0.0106, "step": 21660 }, { "epoch": 0.6079394024407351, "grad_norm": 0.16304171085357666, "learning_rate": 3.986767662598775e-05, "loss": 0.01, "step": 21670 }, { "epoch": 0.6082199466965914, "grad_norm": 0.34672847390174866, "learning_rate": 3.9863000888390145e-05, "loss": 0.0327, "step": 21680 }, { "epoch": 0.6085004909524477, "grad_norm": 0.023574165999889374, "learning_rate": 3.985832515079254e-05, "loss": 0.0196, "step": 21690 }, { "epoch": 0.6087810352083041, "grad_norm": 3.7288756370544434, "learning_rate": 3.985364941319493e-05, "loss": 0.0266, "step": 21700 }, { "epoch": 0.6090615794641605, "grad_norm": 1.51796555519104, "learning_rate": 3.984897367559733e-05, "loss": 0.0529, "step": 21710 }, { "epoch": 0.6093421237200168, "grad_norm": 0.06977726519107819, "learning_rate": 3.984429793799972e-05, "loss": 0.0411, "step": 21720 }, { "epoch": 0.6096226679758732, "grad_norm": 0.8014068007469177, "learning_rate": 3.983962220040212e-05, "loss": 0.0306, "step": 21730 }, { "epoch": 0.6099032122317296, "grad_norm": 0.06792863458395004, "learning_rate": 3.9834946462804504e-05, "loss": 0.0392, "step": 21740 }, { "epoch": 0.6101837564875859, "grad_norm": 0.3652034401893616, "learning_rate": 3.9830270725206904e-05, "loss": 0.0426, "step": 21750 }, { "epoch": 0.6104643007434423, "grad_norm": 1.1455293893814087, "learning_rate": 3.98255949876093e-05, "loss": 0.0116, "step": 21760 }, { "epoch": 0.6107448449992986, "grad_norm": 0.35132989287376404, "learning_rate": 3.982091925001169e-05, "loss": 0.0203, "step": 21770 }, { "epoch": 0.611025389255155, "grad_norm": 0.4804457128047943, "learning_rate": 3.9816243512414083e-05, "loss": 0.0329, "step": 21780 }, { "epoch": 0.6113059335110114, "grad_norm": 0.04161791875958443, "learning_rate": 3.9811567774816477e-05, "loss": 0.0236, "step": 21790 }, { "epoch": 0.6115864777668677, "grad_norm": 0.22758284211158752, "learning_rate": 3.9806892037218876e-05, "loss": 0.0188, "step": 21800 }, { "epoch": 0.6118670220227241, "grad_norm": 4.068480968475342, "learning_rate": 3.980221629962126e-05, "loss": 0.0348, "step": 21810 }, { "epoch": 0.6121475662785805, "grad_norm": 0.8404746055603027, "learning_rate": 3.979754056202366e-05, "loss": 0.0179, "step": 21820 }, { "epoch": 0.6124281105344368, "grad_norm": 0.9687708020210266, "learning_rate": 3.9792864824426056e-05, "loss": 0.0309, "step": 21830 }, { "epoch": 0.6127086547902931, "grad_norm": 2.468628168106079, "learning_rate": 3.978818908682845e-05, "loss": 0.0374, "step": 21840 }, { "epoch": 0.6129891990461496, "grad_norm": 1.4577293395996094, "learning_rate": 3.978351334923084e-05, "loss": 0.0186, "step": 21850 }, { "epoch": 0.6132697433020059, "grad_norm": 0.09010326862335205, "learning_rate": 3.9778837611633235e-05, "loss": 0.0188, "step": 21860 }, { "epoch": 0.6135502875578622, "grad_norm": 0.06896397471427917, "learning_rate": 3.977416187403563e-05, "loss": 0.0168, "step": 21870 }, { "epoch": 0.6138308318137187, "grad_norm": 0.7435644865036011, "learning_rate": 3.976948613643802e-05, "loss": 0.0256, "step": 21880 }, { "epoch": 0.614111376069575, "grad_norm": 0.019083252176642418, "learning_rate": 3.976481039884042e-05, "loss": 0.023, "step": 21890 }, { "epoch": 0.6143919203254313, "grad_norm": 0.11787645518779755, "learning_rate": 3.9760134661242815e-05, "loss": 0.0614, "step": 21900 }, { "epoch": 0.6146724645812877, "grad_norm": 0.2707461714744568, "learning_rate": 3.975545892364521e-05, "loss": 0.044, "step": 21910 }, { "epoch": 0.6149530088371441, "grad_norm": 6.019365310668945, "learning_rate": 3.97507831860476e-05, "loss": 0.0174, "step": 21920 }, { "epoch": 0.6152335530930004, "grad_norm": 0.5220757722854614, "learning_rate": 3.9746107448449994e-05, "loss": 0.0406, "step": 21930 }, { "epoch": 0.6155140973488568, "grad_norm": 1.8705945014953613, "learning_rate": 3.974143171085239e-05, "loss": 0.029, "step": 21940 }, { "epoch": 0.6157946416047131, "grad_norm": 0.2765733003616333, "learning_rate": 3.973675597325479e-05, "loss": 0.0149, "step": 21950 }, { "epoch": 0.6160751858605695, "grad_norm": 0.5952902436256409, "learning_rate": 3.9732080235657174e-05, "loss": 0.049, "step": 21960 }, { "epoch": 0.6163557301164259, "grad_norm": 1.748311996459961, "learning_rate": 3.9727404498059574e-05, "loss": 0.0336, "step": 21970 }, { "epoch": 0.6166362743722822, "grad_norm": 0.026521623134613037, "learning_rate": 3.972272876046197e-05, "loss": 0.0296, "step": 21980 }, { "epoch": 0.6169168186281386, "grad_norm": 0.4983319342136383, "learning_rate": 3.971805302286436e-05, "loss": 0.0329, "step": 21990 }, { "epoch": 0.617197362883995, "grad_norm": 1.1243505477905273, "learning_rate": 3.971337728526675e-05, "loss": 0.0253, "step": 22000 }, { "epoch": 0.6174779071398513, "grad_norm": 0.38493651151657104, "learning_rate": 3.9708701547669146e-05, "loss": 0.0184, "step": 22010 }, { "epoch": 0.6177584513957076, "grad_norm": 0.024215789511799812, "learning_rate": 3.9704025810071546e-05, "loss": 0.0156, "step": 22020 }, { "epoch": 0.6180389956515641, "grad_norm": 0.03006119839847088, "learning_rate": 3.969935007247393e-05, "loss": 0.0212, "step": 22030 }, { "epoch": 0.6183195399074204, "grad_norm": 0.49007371068000793, "learning_rate": 3.969467433487633e-05, "loss": 0.0268, "step": 22040 }, { "epoch": 0.6186000841632767, "grad_norm": 0.1435483694076538, "learning_rate": 3.968999859727872e-05, "loss": 0.0716, "step": 22050 }, { "epoch": 0.6188806284191332, "grad_norm": 0.8426799178123474, "learning_rate": 3.968532285968112e-05, "loss": 0.1049, "step": 22060 }, { "epoch": 0.6191611726749895, "grad_norm": 2.2514004707336426, "learning_rate": 3.9680647122083505e-05, "loss": 0.0508, "step": 22070 }, { "epoch": 0.6194417169308458, "grad_norm": 0.9373038411140442, "learning_rate": 3.9675971384485905e-05, "loss": 0.0355, "step": 22080 }, { "epoch": 0.6197222611867022, "grad_norm": 4.923727035522461, "learning_rate": 3.96712956468883e-05, "loss": 0.057, "step": 22090 }, { "epoch": 0.6200028054425586, "grad_norm": 1.830161452293396, "learning_rate": 3.966661990929069e-05, "loss": 0.0497, "step": 22100 }, { "epoch": 0.6202833496984149, "grad_norm": 5.103137969970703, "learning_rate": 3.966194417169309e-05, "loss": 0.0353, "step": 22110 }, { "epoch": 0.6205638939542713, "grad_norm": 0.10001790523529053, "learning_rate": 3.965726843409548e-05, "loss": 0.0279, "step": 22120 }, { "epoch": 0.6208444382101277, "grad_norm": 0.049543965607881546, "learning_rate": 3.965259269649788e-05, "loss": 0.0163, "step": 22130 }, { "epoch": 0.621124982465984, "grad_norm": 0.1824961006641388, "learning_rate": 3.9647916958900264e-05, "loss": 0.0338, "step": 22140 }, { "epoch": 0.6214055267218404, "grad_norm": 2.5861823558807373, "learning_rate": 3.9643241221302664e-05, "loss": 0.0354, "step": 22150 }, { "epoch": 0.6216860709776967, "grad_norm": 0.1265188604593277, "learning_rate": 3.963856548370506e-05, "loss": 0.0387, "step": 22160 }, { "epoch": 0.6219666152335531, "grad_norm": 1.3016831874847412, "learning_rate": 3.963388974610745e-05, "loss": 0.0419, "step": 22170 }, { "epoch": 0.6222471594894095, "grad_norm": 0.21126395463943481, "learning_rate": 3.9629214008509844e-05, "loss": 0.0208, "step": 22180 }, { "epoch": 0.6225277037452658, "grad_norm": 0.09518637508153915, "learning_rate": 3.962453827091224e-05, "loss": 0.035, "step": 22190 }, { "epoch": 0.6228082480011222, "grad_norm": 0.046439483761787415, "learning_rate": 3.9619862533314637e-05, "loss": 0.0238, "step": 22200 }, { "epoch": 0.6230887922569786, "grad_norm": 0.6683565974235535, "learning_rate": 3.961518679571702e-05, "loss": 0.0083, "step": 22210 }, { "epoch": 0.6233693365128349, "grad_norm": 0.02888924442231655, "learning_rate": 3.961051105811942e-05, "loss": 0.0112, "step": 22220 }, { "epoch": 0.6236498807686912, "grad_norm": 0.5166340470314026, "learning_rate": 3.9605835320521816e-05, "loss": 0.0425, "step": 22230 }, { "epoch": 0.6239304250245477, "grad_norm": 3.088540554046631, "learning_rate": 3.960115958292421e-05, "loss": 0.0342, "step": 22240 }, { "epoch": 0.624210969280404, "grad_norm": 1.044143557548523, "learning_rate": 3.95964838453266e-05, "loss": 0.053, "step": 22250 }, { "epoch": 0.6244915135362603, "grad_norm": 1.7977912425994873, "learning_rate": 3.9591808107728996e-05, "loss": 0.0456, "step": 22260 }, { "epoch": 0.6247720577921168, "grad_norm": 0.07540057599544525, "learning_rate": 3.958713237013139e-05, "loss": 0.0185, "step": 22270 }, { "epoch": 0.6250526020479731, "grad_norm": 0.812985360622406, "learning_rate": 3.958245663253378e-05, "loss": 0.0424, "step": 22280 }, { "epoch": 0.6253331463038294, "grad_norm": 0.1378190517425537, "learning_rate": 3.9577780894936175e-05, "loss": 0.0307, "step": 22290 }, { "epoch": 0.6256136905596857, "grad_norm": 0.6924516558647156, "learning_rate": 3.9573105157338575e-05, "loss": 0.0398, "step": 22300 }, { "epoch": 0.6258942348155422, "grad_norm": 0.20190373063087463, "learning_rate": 3.956842941974097e-05, "loss": 0.0381, "step": 22310 }, { "epoch": 0.6261747790713985, "grad_norm": 1.0544099807739258, "learning_rate": 3.956375368214336e-05, "loss": 0.0419, "step": 22320 }, { "epoch": 0.6264553233272548, "grad_norm": 0.413085401058197, "learning_rate": 3.9559077944545754e-05, "loss": 0.0422, "step": 22330 }, { "epoch": 0.6267358675831113, "grad_norm": 0.5120216012001038, "learning_rate": 3.955440220694815e-05, "loss": 0.0337, "step": 22340 }, { "epoch": 0.6270164118389676, "grad_norm": 0.04421548172831535, "learning_rate": 3.954972646935054e-05, "loss": 0.0095, "step": 22350 }, { "epoch": 0.6272969560948239, "grad_norm": 0.6045029163360596, "learning_rate": 3.9545050731752934e-05, "loss": 0.0576, "step": 22360 }, { "epoch": 0.6275775003506803, "grad_norm": 0.18910078704357147, "learning_rate": 3.9540374994155334e-05, "loss": 0.0241, "step": 22370 }, { "epoch": 0.6278580446065367, "grad_norm": 1.073557734489441, "learning_rate": 3.953569925655772e-05, "loss": 0.0461, "step": 22380 }, { "epoch": 0.628138588862393, "grad_norm": 2.694287061691284, "learning_rate": 3.953102351896012e-05, "loss": 0.055, "step": 22390 }, { "epoch": 0.6284191331182494, "grad_norm": 1.1531403064727783, "learning_rate": 3.952634778136251e-05, "loss": 0.0272, "step": 22400 }, { "epoch": 0.6286996773741058, "grad_norm": 1.222733974456787, "learning_rate": 3.9521672043764906e-05, "loss": 0.0214, "step": 22410 }, { "epoch": 0.6289802216299621, "grad_norm": 0.13540247082710266, "learning_rate": 3.95169963061673e-05, "loss": 0.0409, "step": 22420 }, { "epoch": 0.6292607658858185, "grad_norm": 0.3235291540622711, "learning_rate": 3.951232056856969e-05, "loss": 0.0297, "step": 22430 }, { "epoch": 0.6295413101416748, "grad_norm": 0.5261475443840027, "learning_rate": 3.950764483097209e-05, "loss": 0.0297, "step": 22440 }, { "epoch": 0.6298218543975312, "grad_norm": 0.1527194082736969, "learning_rate": 3.950296909337448e-05, "loss": 0.0339, "step": 22450 }, { "epoch": 0.6301023986533876, "grad_norm": 1.6074490547180176, "learning_rate": 3.949829335577688e-05, "loss": 0.0258, "step": 22460 }, { "epoch": 0.6303829429092439, "grad_norm": 0.25659042596817017, "learning_rate": 3.9493617618179265e-05, "loss": 0.0275, "step": 22470 }, { "epoch": 0.6306634871651003, "grad_norm": 0.0665813758969307, "learning_rate": 3.9488941880581665e-05, "loss": 0.0214, "step": 22480 }, { "epoch": 0.6309440314209567, "grad_norm": 0.05181664973497391, "learning_rate": 3.948426614298406e-05, "loss": 0.0433, "step": 22490 }, { "epoch": 0.631224575676813, "grad_norm": 0.41712069511413574, "learning_rate": 3.947959040538645e-05, "loss": 0.0388, "step": 22500 }, { "epoch": 0.6315051199326693, "grad_norm": 0.2174205332994461, "learning_rate": 3.947491466778885e-05, "loss": 0.0226, "step": 22510 }, { "epoch": 0.6317856641885258, "grad_norm": 0.1490168571472168, "learning_rate": 3.947023893019124e-05, "loss": 0.0227, "step": 22520 }, { "epoch": 0.6320662084443821, "grad_norm": 0.17114180326461792, "learning_rate": 3.946556319259364e-05, "loss": 0.0561, "step": 22530 }, { "epoch": 0.6323467527002384, "grad_norm": 0.21539980173110962, "learning_rate": 3.9460887454996024e-05, "loss": 0.0256, "step": 22540 }, { "epoch": 0.6326272969560949, "grad_norm": 1.5799144506454468, "learning_rate": 3.9456211717398424e-05, "loss": 0.0363, "step": 22550 }, { "epoch": 0.6329078412119512, "grad_norm": 0.08618737757205963, "learning_rate": 3.945153597980081e-05, "loss": 0.0449, "step": 22560 }, { "epoch": 0.6331883854678075, "grad_norm": 0.683048665523529, "learning_rate": 3.944686024220321e-05, "loss": 0.0492, "step": 22570 }, { "epoch": 0.6334689297236639, "grad_norm": 0.20372651517391205, "learning_rate": 3.9442184504605604e-05, "loss": 0.0341, "step": 22580 }, { "epoch": 0.6337494739795203, "grad_norm": 0.018305836245417595, "learning_rate": 3.9437508767008e-05, "loss": 0.0186, "step": 22590 }, { "epoch": 0.6340300182353766, "grad_norm": 3.7208476066589355, "learning_rate": 3.943283302941039e-05, "loss": 0.022, "step": 22600 }, { "epoch": 0.634310562491233, "grad_norm": 0.03509819507598877, "learning_rate": 3.942815729181278e-05, "loss": 0.0036, "step": 22610 }, { "epoch": 0.6345911067470894, "grad_norm": 0.17525836825370789, "learning_rate": 3.942348155421518e-05, "loss": 0.0537, "step": 22620 }, { "epoch": 0.6348716510029457, "grad_norm": 0.05362547188997269, "learning_rate": 3.941880581661757e-05, "loss": 0.0115, "step": 22630 }, { "epoch": 0.6351521952588021, "grad_norm": 0.1588001400232315, "learning_rate": 3.941413007901997e-05, "loss": 0.0394, "step": 22640 }, { "epoch": 0.6354327395146584, "grad_norm": 0.03576695919036865, "learning_rate": 3.940945434142236e-05, "loss": 0.0402, "step": 22650 }, { "epoch": 0.6357132837705148, "grad_norm": 0.11014001816511154, "learning_rate": 3.9404778603824756e-05, "loss": 0.0353, "step": 22660 }, { "epoch": 0.6359938280263712, "grad_norm": 0.2830393314361572, "learning_rate": 3.940010286622715e-05, "loss": 0.0391, "step": 22670 }, { "epoch": 0.6362743722822275, "grad_norm": 0.3615638315677643, "learning_rate": 3.939542712862954e-05, "loss": 0.0331, "step": 22680 }, { "epoch": 0.6365549165380839, "grad_norm": 0.14497360587120056, "learning_rate": 3.9390751391031935e-05, "loss": 0.0461, "step": 22690 }, { "epoch": 0.6368354607939403, "grad_norm": 0.2519933581352234, "learning_rate": 3.938607565343433e-05, "loss": 0.0538, "step": 22700 }, { "epoch": 0.6371160050497966, "grad_norm": 1.200700044631958, "learning_rate": 3.938139991583673e-05, "loss": 0.0434, "step": 22710 }, { "epoch": 0.6373965493056529, "grad_norm": 0.053568821400403976, "learning_rate": 3.937672417823912e-05, "loss": 0.0456, "step": 22720 }, { "epoch": 0.6376770935615094, "grad_norm": 0.2737317681312561, "learning_rate": 3.9372048440641514e-05, "loss": 0.0263, "step": 22730 }, { "epoch": 0.6379576378173657, "grad_norm": 0.14805278182029724, "learning_rate": 3.936737270304391e-05, "loss": 0.0451, "step": 22740 }, { "epoch": 0.638238182073222, "grad_norm": 0.09337671846151352, "learning_rate": 3.93626969654463e-05, "loss": 0.0074, "step": 22750 }, { "epoch": 0.6385187263290785, "grad_norm": 0.05338851734995842, "learning_rate": 3.9358021227848694e-05, "loss": 0.0264, "step": 22760 }, { "epoch": 0.6387992705849348, "grad_norm": 0.21563240885734558, "learning_rate": 3.935334549025109e-05, "loss": 0.0636, "step": 22770 }, { "epoch": 0.6390798148407911, "grad_norm": 0.18045790493488312, "learning_rate": 3.934866975265348e-05, "loss": 0.0139, "step": 22780 }, { "epoch": 0.6393603590966475, "grad_norm": 0.29243841767311096, "learning_rate": 3.934399401505588e-05, "loss": 0.0307, "step": 22790 }, { "epoch": 0.6396409033525039, "grad_norm": 0.7087806463241577, "learning_rate": 3.933931827745827e-05, "loss": 0.0542, "step": 22800 }, { "epoch": 0.6399214476083602, "grad_norm": 0.08645330369472504, "learning_rate": 3.9334642539860666e-05, "loss": 0.0184, "step": 22810 }, { "epoch": 0.6402019918642166, "grad_norm": 0.058594148606061935, "learning_rate": 3.932996680226306e-05, "loss": 0.0144, "step": 22820 }, { "epoch": 0.6404825361200729, "grad_norm": 0.5946055054664612, "learning_rate": 3.932529106466545e-05, "loss": 0.0297, "step": 22830 }, { "epoch": 0.6407630803759293, "grad_norm": 0.14408941566944122, "learning_rate": 3.9320615327067846e-05, "loss": 0.0414, "step": 22840 }, { "epoch": 0.6410436246317857, "grad_norm": 0.03707456961274147, "learning_rate": 3.931593958947024e-05, "loss": 0.0327, "step": 22850 }, { "epoch": 0.641324168887642, "grad_norm": 0.04255200922489166, "learning_rate": 3.931126385187264e-05, "loss": 0.0392, "step": 22860 }, { "epoch": 0.6416047131434984, "grad_norm": 0.056467387825250626, "learning_rate": 3.9306588114275025e-05, "loss": 0.0323, "step": 22870 }, { "epoch": 0.6418852573993548, "grad_norm": 0.19214710593223572, "learning_rate": 3.9301912376677425e-05, "loss": 0.0113, "step": 22880 }, { "epoch": 0.6421658016552111, "grad_norm": 0.40138113498687744, "learning_rate": 3.929723663907982e-05, "loss": 0.0088, "step": 22890 }, { "epoch": 0.6424463459110674, "grad_norm": 0.01118597760796547, "learning_rate": 3.929256090148221e-05, "loss": 0.0312, "step": 22900 }, { "epoch": 0.6427268901669239, "grad_norm": 0.21731826663017273, "learning_rate": 3.9287885163884605e-05, "loss": 0.0368, "step": 22910 }, { "epoch": 0.6430074344227802, "grad_norm": 1.3963474035263062, "learning_rate": 3.9283209426287e-05, "loss": 0.029, "step": 22920 }, { "epoch": 0.6432879786786365, "grad_norm": 0.26714321970939636, "learning_rate": 3.92785336886894e-05, "loss": 0.0482, "step": 22930 }, { "epoch": 0.643568522934493, "grad_norm": 0.4825710654258728, "learning_rate": 3.9273857951091784e-05, "loss": 0.0675, "step": 22940 }, { "epoch": 0.6438490671903493, "grad_norm": 0.5260394811630249, "learning_rate": 3.9269182213494184e-05, "loss": 0.0201, "step": 22950 }, { "epoch": 0.6441296114462056, "grad_norm": 0.012510191649198532, "learning_rate": 3.926450647589657e-05, "loss": 0.021, "step": 22960 }, { "epoch": 0.644410155702062, "grad_norm": 0.025387438014149666, "learning_rate": 3.925983073829897e-05, "loss": 0.0608, "step": 22970 }, { "epoch": 0.6446906999579184, "grad_norm": 0.2907581329345703, "learning_rate": 3.925515500070136e-05, "loss": 0.0576, "step": 22980 }, { "epoch": 0.6449712442137747, "grad_norm": 0.04520168900489807, "learning_rate": 3.925047926310376e-05, "loss": 0.0304, "step": 22990 }, { "epoch": 0.645251788469631, "grad_norm": 0.0319281630218029, "learning_rate": 3.924580352550615e-05, "loss": 0.0156, "step": 23000 }, { "epoch": 0.6455323327254875, "grad_norm": 0.596100926399231, "learning_rate": 3.924112778790854e-05, "loss": 0.031, "step": 23010 }, { "epoch": 0.6458128769813438, "grad_norm": 0.06539153307676315, "learning_rate": 3.923645205031094e-05, "loss": 0.0412, "step": 23020 }, { "epoch": 0.6460934212372001, "grad_norm": 0.015490692108869553, "learning_rate": 3.923177631271333e-05, "loss": 0.0271, "step": 23030 }, { "epoch": 0.6463739654930565, "grad_norm": 0.08287902921438217, "learning_rate": 3.922710057511573e-05, "loss": 0.0358, "step": 23040 }, { "epoch": 0.6466545097489129, "grad_norm": 0.0647478923201561, "learning_rate": 3.9222424837518116e-05, "loss": 0.0386, "step": 23050 }, { "epoch": 0.6469350540047692, "grad_norm": 0.2003028690814972, "learning_rate": 3.9217749099920516e-05, "loss": 0.0235, "step": 23060 }, { "epoch": 0.6472155982606256, "grad_norm": 0.5503107905387878, "learning_rate": 3.921307336232291e-05, "loss": 0.0208, "step": 23070 }, { "epoch": 0.647496142516482, "grad_norm": 0.018278826028108597, "learning_rate": 3.92083976247253e-05, "loss": 0.0154, "step": 23080 }, { "epoch": 0.6477766867723384, "grad_norm": 0.2616320550441742, "learning_rate": 3.9203721887127695e-05, "loss": 0.0586, "step": 23090 }, { "epoch": 0.6480572310281947, "grad_norm": 0.10168734192848206, "learning_rate": 3.919904614953009e-05, "loss": 0.0142, "step": 23100 }, { "epoch": 0.648337775284051, "grad_norm": 0.04804458096623421, "learning_rate": 3.919437041193249e-05, "loss": 0.0159, "step": 23110 }, { "epoch": 0.6486183195399075, "grad_norm": 0.38120341300964355, "learning_rate": 3.9189694674334875e-05, "loss": 0.0178, "step": 23120 }, { "epoch": 0.6488988637957638, "grad_norm": 0.3523910939693451, "learning_rate": 3.9185018936737275e-05, "loss": 0.0196, "step": 23130 }, { "epoch": 0.6491794080516201, "grad_norm": 0.9192829132080078, "learning_rate": 3.918034319913967e-05, "loss": 0.028, "step": 23140 }, { "epoch": 0.6494599523074766, "grad_norm": 0.4372856020927429, "learning_rate": 3.917566746154206e-05, "loss": 0.0392, "step": 23150 }, { "epoch": 0.6497404965633329, "grad_norm": 0.06773208826780319, "learning_rate": 3.9170991723944454e-05, "loss": 0.034, "step": 23160 }, { "epoch": 0.6500210408191892, "grad_norm": 0.07751982659101486, "learning_rate": 3.916631598634685e-05, "loss": 0.0261, "step": 23170 }, { "epoch": 0.6503015850750455, "grad_norm": 0.31931230425834656, "learning_rate": 3.916164024874924e-05, "loss": 0.0258, "step": 23180 }, { "epoch": 0.650582129330902, "grad_norm": 0.020156050100922585, "learning_rate": 3.9156964511151633e-05, "loss": 0.0259, "step": 23190 }, { "epoch": 0.6508626735867583, "grad_norm": 1.0714876651763916, "learning_rate": 3.915228877355403e-05, "loss": 0.038, "step": 23200 }, { "epoch": 0.6511432178426146, "grad_norm": 0.6727017760276794, "learning_rate": 3.9147613035956427e-05, "loss": 0.0088, "step": 23210 }, { "epoch": 0.6514237620984711, "grad_norm": 0.17617401480674744, "learning_rate": 3.914293729835882e-05, "loss": 0.0339, "step": 23220 }, { "epoch": 0.6517043063543274, "grad_norm": 0.6850672960281372, "learning_rate": 3.913826156076121e-05, "loss": 0.028, "step": 23230 }, { "epoch": 0.6519848506101837, "grad_norm": 0.02362773008644581, "learning_rate": 3.9133585823163606e-05, "loss": 0.0108, "step": 23240 }, { "epoch": 0.6522653948660401, "grad_norm": 1.4690340757369995, "learning_rate": 3.9128910085566e-05, "loss": 0.0347, "step": 23250 }, { "epoch": 0.6525459391218965, "grad_norm": 0.2357429563999176, "learning_rate": 3.912423434796839e-05, "loss": 0.0088, "step": 23260 }, { "epoch": 0.6528264833777528, "grad_norm": 0.0247210543602705, "learning_rate": 3.9119558610370786e-05, "loss": 0.041, "step": 23270 }, { "epoch": 0.6531070276336092, "grad_norm": 0.026389438658952713, "learning_rate": 3.9114882872773185e-05, "loss": 0.0209, "step": 23280 }, { "epoch": 0.6533875718894656, "grad_norm": 0.08872511237859726, "learning_rate": 3.911020713517557e-05, "loss": 0.0118, "step": 23290 }, { "epoch": 0.6536681161453219, "grad_norm": 0.055517107248306274, "learning_rate": 3.910553139757797e-05, "loss": 0.0183, "step": 23300 }, { "epoch": 0.6539486604011783, "grad_norm": 0.1299661546945572, "learning_rate": 3.9100855659980365e-05, "loss": 0.011, "step": 23310 }, { "epoch": 0.6542292046570346, "grad_norm": 0.1784035861492157, "learning_rate": 3.909617992238276e-05, "loss": 0.0201, "step": 23320 }, { "epoch": 0.654509748912891, "grad_norm": 2.306007146835327, "learning_rate": 3.909150418478515e-05, "loss": 0.0321, "step": 23330 }, { "epoch": 0.6547902931687474, "grad_norm": 0.14436359703540802, "learning_rate": 3.9086828447187544e-05, "loss": 0.0059, "step": 23340 }, { "epoch": 0.6550708374246037, "grad_norm": 0.08367732912302017, "learning_rate": 3.9082152709589944e-05, "loss": 0.0251, "step": 23350 }, { "epoch": 0.6553513816804601, "grad_norm": 0.05964742973446846, "learning_rate": 3.907747697199233e-05, "loss": 0.0082, "step": 23360 }, { "epoch": 0.6556319259363165, "grad_norm": 0.05007968097925186, "learning_rate": 3.907280123439473e-05, "loss": 0.0209, "step": 23370 }, { "epoch": 0.6559124701921728, "grad_norm": 0.12411215156316757, "learning_rate": 3.906812549679712e-05, "loss": 0.0228, "step": 23380 }, { "epoch": 0.6561930144480291, "grad_norm": 0.1322195678949356, "learning_rate": 3.906344975919952e-05, "loss": 0.0259, "step": 23390 }, { "epoch": 0.6564735587038856, "grad_norm": 0.1288941353559494, "learning_rate": 3.905877402160191e-05, "loss": 0.022, "step": 23400 }, { "epoch": 0.6567541029597419, "grad_norm": 0.07441157847642899, "learning_rate": 3.90540982840043e-05, "loss": 0.0742, "step": 23410 }, { "epoch": 0.6570346472155982, "grad_norm": 0.9160907864570618, "learning_rate": 3.90494225464067e-05, "loss": 0.0194, "step": 23420 }, { "epoch": 0.6573151914714547, "grad_norm": 0.12519636750221252, "learning_rate": 3.904474680880909e-05, "loss": 0.0261, "step": 23430 }, { "epoch": 0.657595735727311, "grad_norm": 5.58405876159668, "learning_rate": 3.904007107121149e-05, "loss": 0.0264, "step": 23440 }, { "epoch": 0.6578762799831673, "grad_norm": 0.8530146479606628, "learning_rate": 3.9035395333613876e-05, "loss": 0.0137, "step": 23450 }, { "epoch": 0.6581568242390237, "grad_norm": 0.017040250822901726, "learning_rate": 3.9030719596016276e-05, "loss": 0.0559, "step": 23460 }, { "epoch": 0.6584373684948801, "grad_norm": 0.04614735022187233, "learning_rate": 3.902604385841866e-05, "loss": 0.0429, "step": 23470 }, { "epoch": 0.6587179127507364, "grad_norm": 0.9125344753265381, "learning_rate": 3.902136812082106e-05, "loss": 0.0307, "step": 23480 }, { "epoch": 0.6589984570065928, "grad_norm": 0.07376310974359512, "learning_rate": 3.9016692383223455e-05, "loss": 0.0208, "step": 23490 }, { "epoch": 0.6592790012624492, "grad_norm": 0.3132511079311371, "learning_rate": 3.901201664562585e-05, "loss": 0.0165, "step": 23500 }, { "epoch": 0.6595595455183055, "grad_norm": 0.1298481673002243, "learning_rate": 3.900734090802824e-05, "loss": 0.0278, "step": 23510 }, { "epoch": 0.6598400897741619, "grad_norm": 0.0742935985326767, "learning_rate": 3.9002665170430635e-05, "loss": 0.0323, "step": 23520 }, { "epoch": 0.6601206340300182, "grad_norm": 0.07253038138151169, "learning_rate": 3.8997989432833035e-05, "loss": 0.0259, "step": 23530 }, { "epoch": 0.6604011782858746, "grad_norm": 0.2882924973964691, "learning_rate": 3.899331369523542e-05, "loss": 0.0131, "step": 23540 }, { "epoch": 0.660681722541731, "grad_norm": 0.06673549115657806, "learning_rate": 3.898863795763782e-05, "loss": 0.0265, "step": 23550 }, { "epoch": 0.6609622667975873, "grad_norm": 0.15022172033786774, "learning_rate": 3.8983962220040214e-05, "loss": 0.0129, "step": 23560 }, { "epoch": 0.6612428110534437, "grad_norm": 0.5314601063728333, "learning_rate": 3.897928648244261e-05, "loss": 0.0387, "step": 23570 }, { "epoch": 0.6615233553093001, "grad_norm": 0.12535737454891205, "learning_rate": 3.8974610744845e-05, "loss": 0.0455, "step": 23580 }, { "epoch": 0.6618038995651564, "grad_norm": 0.9155845642089844, "learning_rate": 3.8969935007247394e-05, "loss": 0.0246, "step": 23590 }, { "epoch": 0.6620844438210127, "grad_norm": 0.044574715197086334, "learning_rate": 3.896525926964979e-05, "loss": 0.0391, "step": 23600 }, { "epoch": 0.6623649880768692, "grad_norm": 0.11468854546546936, "learning_rate": 3.896058353205218e-05, "loss": 0.0404, "step": 23610 }, { "epoch": 0.6626455323327255, "grad_norm": 1.3045021295547485, "learning_rate": 3.895590779445458e-05, "loss": 0.0316, "step": 23620 }, { "epoch": 0.6629260765885818, "grad_norm": 0.060792405158281326, "learning_rate": 3.895123205685697e-05, "loss": 0.0324, "step": 23630 }, { "epoch": 0.6632066208444382, "grad_norm": 2.0112738609313965, "learning_rate": 3.8946556319259366e-05, "loss": 0.0238, "step": 23640 }, { "epoch": 0.6634871651002946, "grad_norm": 0.681561291217804, "learning_rate": 3.894188058166176e-05, "loss": 0.0467, "step": 23650 }, { "epoch": 0.6637677093561509, "grad_norm": 0.19573527574539185, "learning_rate": 3.893720484406415e-05, "loss": 0.0311, "step": 23660 }, { "epoch": 0.6640482536120073, "grad_norm": 0.24537953734397888, "learning_rate": 3.8932529106466546e-05, "loss": 0.0092, "step": 23670 }, { "epoch": 0.6643287978678637, "grad_norm": 2.224395513534546, "learning_rate": 3.892785336886894e-05, "loss": 0.0613, "step": 23680 }, { "epoch": 0.66460934212372, "grad_norm": 1.2535046339035034, "learning_rate": 3.892317763127133e-05, "loss": 0.0219, "step": 23690 }, { "epoch": 0.6648898863795764, "grad_norm": 0.637887179851532, "learning_rate": 3.891850189367373e-05, "loss": 0.0376, "step": 23700 }, { "epoch": 0.6651704306354327, "grad_norm": 0.0748092532157898, "learning_rate": 3.8913826156076125e-05, "loss": 0.024, "step": 23710 }, { "epoch": 0.6654509748912891, "grad_norm": 0.6687340140342712, "learning_rate": 3.890915041847852e-05, "loss": 0.0366, "step": 23720 }, { "epoch": 0.6657315191471455, "grad_norm": 0.06030596047639847, "learning_rate": 3.890447468088091e-05, "loss": 0.0241, "step": 23730 }, { "epoch": 0.6660120634030018, "grad_norm": 0.04317037761211395, "learning_rate": 3.8899798943283304e-05, "loss": 0.0192, "step": 23740 }, { "epoch": 0.6662926076588582, "grad_norm": 0.9233769774436951, "learning_rate": 3.88951232056857e-05, "loss": 0.0321, "step": 23750 }, { "epoch": 0.6665731519147146, "grad_norm": 0.5220956206321716, "learning_rate": 3.889044746808809e-05, "loss": 0.045, "step": 23760 }, { "epoch": 0.6668536961705709, "grad_norm": 0.6750909090042114, "learning_rate": 3.888577173049049e-05, "loss": 0.021, "step": 23770 }, { "epoch": 0.6671342404264272, "grad_norm": 0.6590631604194641, "learning_rate": 3.888109599289288e-05, "loss": 0.0454, "step": 23780 }, { "epoch": 0.6674147846822837, "grad_norm": 0.08582155406475067, "learning_rate": 3.887642025529528e-05, "loss": 0.0161, "step": 23790 }, { "epoch": 0.66769532893814, "grad_norm": 0.26870790123939514, "learning_rate": 3.887174451769767e-05, "loss": 0.0505, "step": 23800 }, { "epoch": 0.6679758731939963, "grad_norm": 0.7351797223091125, "learning_rate": 3.886706878010006e-05, "loss": 0.05, "step": 23810 }, { "epoch": 0.6682564174498528, "grad_norm": 0.16261965036392212, "learning_rate": 3.8862393042502456e-05, "loss": 0.0233, "step": 23820 }, { "epoch": 0.6685369617057091, "grad_norm": 0.1786009818315506, "learning_rate": 3.885771730490485e-05, "loss": 0.0198, "step": 23830 }, { "epoch": 0.6688175059615654, "grad_norm": 0.2777544856071472, "learning_rate": 3.885304156730725e-05, "loss": 0.0274, "step": 23840 }, { "epoch": 0.6690980502174217, "grad_norm": 0.05397874116897583, "learning_rate": 3.8848365829709636e-05, "loss": 0.0143, "step": 23850 }, { "epoch": 0.6693785944732782, "grad_norm": 0.5540552139282227, "learning_rate": 3.8843690092112036e-05, "loss": 0.0419, "step": 23860 }, { "epoch": 0.6696591387291345, "grad_norm": 2.916740655899048, "learning_rate": 3.883901435451442e-05, "loss": 0.0351, "step": 23870 }, { "epoch": 0.6699396829849908, "grad_norm": 0.05921197682619095, "learning_rate": 3.883433861691682e-05, "loss": 0.0214, "step": 23880 }, { "epoch": 0.6702202272408473, "grad_norm": 0.3462936580181122, "learning_rate": 3.882966287931921e-05, "loss": 0.0293, "step": 23890 }, { "epoch": 0.6705007714967036, "grad_norm": 2.9831600189208984, "learning_rate": 3.882498714172161e-05, "loss": 0.0453, "step": 23900 }, { "epoch": 0.67078131575256, "grad_norm": 0.5510215759277344, "learning_rate": 3.8820311404124e-05, "loss": 0.0238, "step": 23910 }, { "epoch": 0.6710618600084163, "grad_norm": 0.2387644648551941, "learning_rate": 3.8815635666526395e-05, "loss": 0.0204, "step": 23920 }, { "epoch": 0.6713424042642727, "grad_norm": 0.14639399945735931, "learning_rate": 3.8810959928928795e-05, "loss": 0.0045, "step": 23930 }, { "epoch": 0.671622948520129, "grad_norm": 0.026994291692972183, "learning_rate": 3.880628419133118e-05, "loss": 0.0363, "step": 23940 }, { "epoch": 0.6719034927759854, "grad_norm": 0.06439428776502609, "learning_rate": 3.880160845373358e-05, "loss": 0.0103, "step": 23950 }, { "epoch": 0.6721840370318418, "grad_norm": 0.04344503581523895, "learning_rate": 3.879693271613597e-05, "loss": 0.0023, "step": 23960 }, { "epoch": 0.6724645812876981, "grad_norm": 0.5269049406051636, "learning_rate": 3.879225697853837e-05, "loss": 0.0132, "step": 23970 }, { "epoch": 0.6727451255435545, "grad_norm": 0.44406023621559143, "learning_rate": 3.878758124094076e-05, "loss": 0.0595, "step": 23980 }, { "epoch": 0.6730256697994108, "grad_norm": 0.050921108573675156, "learning_rate": 3.8782905503343154e-05, "loss": 0.0307, "step": 23990 }, { "epoch": 0.6733062140552672, "grad_norm": 0.19342252612113953, "learning_rate": 3.877822976574555e-05, "loss": 0.0716, "step": 24000 }, { "epoch": 0.6735867583111236, "grad_norm": 0.32767409086227417, "learning_rate": 3.877355402814794e-05, "loss": 0.0223, "step": 24010 }, { "epoch": 0.6738673025669799, "grad_norm": 0.09019522368907928, "learning_rate": 3.876887829055034e-05, "loss": 0.0171, "step": 24020 }, { "epoch": 0.6741478468228363, "grad_norm": 0.23583893477916718, "learning_rate": 3.8764202552952726e-05, "loss": 0.0462, "step": 24030 }, { "epoch": 0.6744283910786927, "grad_norm": 0.12758155167102814, "learning_rate": 3.8759526815355126e-05, "loss": 0.0385, "step": 24040 }, { "epoch": 0.674708935334549, "grad_norm": 0.3052281141281128, "learning_rate": 3.875485107775752e-05, "loss": 0.0403, "step": 24050 }, { "epoch": 0.6749894795904053, "grad_norm": 0.22018741071224213, "learning_rate": 3.875017534015991e-05, "loss": 0.0181, "step": 24060 }, { "epoch": 0.6752700238462618, "grad_norm": 0.3298323154449463, "learning_rate": 3.8745499602562306e-05, "loss": 0.0103, "step": 24070 }, { "epoch": 0.6755505681021181, "grad_norm": 0.08226700872182846, "learning_rate": 3.87408238649647e-05, "loss": 0.0746, "step": 24080 }, { "epoch": 0.6758311123579744, "grad_norm": 0.19629395008087158, "learning_rate": 3.873614812736709e-05, "loss": 0.0243, "step": 24090 }, { "epoch": 0.6761116566138309, "grad_norm": 0.437747597694397, "learning_rate": 3.8731472389769485e-05, "loss": 0.0414, "step": 24100 }, { "epoch": 0.6763922008696872, "grad_norm": 0.23963740468025208, "learning_rate": 3.872679665217188e-05, "loss": 0.0088, "step": 24110 }, { "epoch": 0.6766727451255435, "grad_norm": 2.6715383529663086, "learning_rate": 3.872212091457428e-05, "loss": 0.0323, "step": 24120 }, { "epoch": 0.6769532893813999, "grad_norm": 0.041390515863895416, "learning_rate": 3.871744517697667e-05, "loss": 0.0185, "step": 24130 }, { "epoch": 0.6772338336372563, "grad_norm": 0.04639597237110138, "learning_rate": 3.8712769439379065e-05, "loss": 0.0323, "step": 24140 }, { "epoch": 0.6775143778931126, "grad_norm": 0.028561050072312355, "learning_rate": 3.870809370178146e-05, "loss": 0.016, "step": 24150 }, { "epoch": 0.677794922148969, "grad_norm": 0.02894662879407406, "learning_rate": 3.870341796418385e-05, "loss": 0.0407, "step": 24160 }, { "epoch": 0.6780754664048254, "grad_norm": 0.766677975654602, "learning_rate": 3.8698742226586244e-05, "loss": 0.0087, "step": 24170 }, { "epoch": 0.6783560106606817, "grad_norm": 0.8634641766548157, "learning_rate": 3.869406648898864e-05, "loss": 0.0137, "step": 24180 }, { "epoch": 0.6786365549165381, "grad_norm": 0.024998003616929054, "learning_rate": 3.868939075139104e-05, "loss": 0.0381, "step": 24190 }, { "epoch": 0.6789170991723944, "grad_norm": 0.020949946716427803, "learning_rate": 3.8684715013793423e-05, "loss": 0.0283, "step": 24200 }, { "epoch": 0.6791976434282508, "grad_norm": 0.22507625818252563, "learning_rate": 3.8680039276195823e-05, "loss": 0.0211, "step": 24210 }, { "epoch": 0.6794781876841072, "grad_norm": 0.8557853698730469, "learning_rate": 3.8675363538598217e-05, "loss": 0.0123, "step": 24220 }, { "epoch": 0.6797587319399635, "grad_norm": 0.34842729568481445, "learning_rate": 3.867068780100061e-05, "loss": 0.0302, "step": 24230 }, { "epoch": 0.6800392761958199, "grad_norm": 1.8204069137573242, "learning_rate": 3.8666012063403e-05, "loss": 0.0398, "step": 24240 }, { "epoch": 0.6803198204516763, "grad_norm": 0.4551059305667877, "learning_rate": 3.8661336325805396e-05, "loss": 0.038, "step": 24250 }, { "epoch": 0.6806003647075326, "grad_norm": 0.7731631994247437, "learning_rate": 3.8656660588207796e-05, "loss": 0.035, "step": 24260 }, { "epoch": 0.6808809089633889, "grad_norm": 0.029376063495874405, "learning_rate": 3.865198485061018e-05, "loss": 0.0359, "step": 24270 }, { "epoch": 0.6811614532192454, "grad_norm": 0.11395037919282913, "learning_rate": 3.864730911301258e-05, "loss": 0.017, "step": 24280 }, { "epoch": 0.6814419974751017, "grad_norm": 0.051272910088300705, "learning_rate": 3.864263337541497e-05, "loss": 0.0334, "step": 24290 }, { "epoch": 0.681722541730958, "grad_norm": 0.05158807337284088, "learning_rate": 3.863795763781737e-05, "loss": 0.0107, "step": 24300 }, { "epoch": 0.6820030859868145, "grad_norm": 1.3981572389602661, "learning_rate": 3.863328190021976e-05, "loss": 0.0609, "step": 24310 }, { "epoch": 0.6822836302426708, "grad_norm": 0.147599995136261, "learning_rate": 3.8628606162622155e-05, "loss": 0.0179, "step": 24320 }, { "epoch": 0.6825641744985271, "grad_norm": 1.6233779191970825, "learning_rate": 3.8623930425024555e-05, "loss": 0.0241, "step": 24330 }, { "epoch": 0.6828447187543835, "grad_norm": 0.14357320964336395, "learning_rate": 3.861925468742694e-05, "loss": 0.0262, "step": 24340 }, { "epoch": 0.6831252630102399, "grad_norm": 0.5447237491607666, "learning_rate": 3.861457894982934e-05, "loss": 0.0179, "step": 24350 }, { "epoch": 0.6834058072660962, "grad_norm": 0.03752472624182701, "learning_rate": 3.860990321223173e-05, "loss": 0.027, "step": 24360 }, { "epoch": 0.6836863515219526, "grad_norm": 0.09986546635627747, "learning_rate": 3.860522747463413e-05, "loss": 0.027, "step": 24370 }, { "epoch": 0.683966895777809, "grad_norm": 0.7573723196983337, "learning_rate": 3.8600551737036514e-05, "loss": 0.0209, "step": 24380 }, { "epoch": 0.6842474400336653, "grad_norm": 0.705007791519165, "learning_rate": 3.8595875999438914e-05, "loss": 0.0267, "step": 24390 }, { "epoch": 0.6845279842895217, "grad_norm": 0.05524802580475807, "learning_rate": 3.859120026184131e-05, "loss": 0.0435, "step": 24400 }, { "epoch": 0.684808528545378, "grad_norm": 0.16457141935825348, "learning_rate": 3.85865245242437e-05, "loss": 0.0189, "step": 24410 }, { "epoch": 0.6850890728012344, "grad_norm": 0.8156257271766663, "learning_rate": 3.858184878664609e-05, "loss": 0.0206, "step": 24420 }, { "epoch": 0.6853696170570908, "grad_norm": 0.36529305577278137, "learning_rate": 3.8577173049048486e-05, "loss": 0.0249, "step": 24430 }, { "epoch": 0.6856501613129471, "grad_norm": 0.3161088228225708, "learning_rate": 3.8572497311450886e-05, "loss": 0.0452, "step": 24440 }, { "epoch": 0.6859307055688035, "grad_norm": 0.35006165504455566, "learning_rate": 3.856782157385327e-05, "loss": 0.0245, "step": 24450 }, { "epoch": 0.6862112498246599, "grad_norm": 1.0254093408584595, "learning_rate": 3.856314583625567e-05, "loss": 0.0112, "step": 24460 }, { "epoch": 0.6864917940805162, "grad_norm": 1.756962537765503, "learning_rate": 3.8558470098658066e-05, "loss": 0.0369, "step": 24470 }, { "epoch": 0.6867723383363725, "grad_norm": 1.074924111366272, "learning_rate": 3.855379436106046e-05, "loss": 0.0454, "step": 24480 }, { "epoch": 0.687052882592229, "grad_norm": 0.21980790793895721, "learning_rate": 3.854911862346285e-05, "loss": 0.0147, "step": 24490 }, { "epoch": 0.6873334268480853, "grad_norm": 1.1270771026611328, "learning_rate": 3.8544442885865245e-05, "loss": 0.0237, "step": 24500 }, { "epoch": 0.6876139711039416, "grad_norm": 0.03919491171836853, "learning_rate": 3.853976714826764e-05, "loss": 0.0345, "step": 24510 }, { "epoch": 0.687894515359798, "grad_norm": 0.0992208942770958, "learning_rate": 3.853509141067004e-05, "loss": 0.0413, "step": 24520 }, { "epoch": 0.6881750596156544, "grad_norm": 0.8403270244598389, "learning_rate": 3.853041567307243e-05, "loss": 0.0647, "step": 24530 }, { "epoch": 0.6884556038715107, "grad_norm": 0.49726173281669617, "learning_rate": 3.8525739935474825e-05, "loss": 0.0295, "step": 24540 }, { "epoch": 0.688736148127367, "grad_norm": 0.45145663619041443, "learning_rate": 3.852106419787722e-05, "loss": 0.0477, "step": 24550 }, { "epoch": 0.6890166923832235, "grad_norm": 0.03613867983222008, "learning_rate": 3.851638846027961e-05, "loss": 0.0205, "step": 24560 }, { "epoch": 0.6892972366390798, "grad_norm": 0.07926283031702042, "learning_rate": 3.8511712722682004e-05, "loss": 0.0457, "step": 24570 }, { "epoch": 0.6895777808949362, "grad_norm": 2.3708114624023438, "learning_rate": 3.85070369850844e-05, "loss": 0.0256, "step": 24580 }, { "epoch": 0.6898583251507925, "grad_norm": 0.9981784224510193, "learning_rate": 3.85023612474868e-05, "loss": 0.0579, "step": 24590 }, { "epoch": 0.6901388694066489, "grad_norm": 0.2667693495750427, "learning_rate": 3.8497685509889184e-05, "loss": 0.028, "step": 24600 }, { "epoch": 0.6904194136625053, "grad_norm": 0.3489788770675659, "learning_rate": 3.8493009772291584e-05, "loss": 0.0349, "step": 24610 }, { "epoch": 0.6906999579183616, "grad_norm": 0.6099436283111572, "learning_rate": 3.848833403469398e-05, "loss": 0.0723, "step": 24620 }, { "epoch": 0.690980502174218, "grad_norm": 0.130865216255188, "learning_rate": 3.848365829709637e-05, "loss": 0.0373, "step": 24630 }, { "epoch": 0.6912610464300744, "grad_norm": 0.4679802358150482, "learning_rate": 3.847898255949876e-05, "loss": 0.0309, "step": 24640 }, { "epoch": 0.6915415906859307, "grad_norm": 0.2952854633331299, "learning_rate": 3.8474306821901156e-05, "loss": 0.043, "step": 24650 }, { "epoch": 0.691822134941787, "grad_norm": 0.15778489410877228, "learning_rate": 3.8469631084303556e-05, "loss": 0.0328, "step": 24660 }, { "epoch": 0.6921026791976435, "grad_norm": 0.3053245544433594, "learning_rate": 3.846495534670594e-05, "loss": 0.0464, "step": 24670 }, { "epoch": 0.6923832234534998, "grad_norm": 0.19573304057121277, "learning_rate": 3.846027960910834e-05, "loss": 0.0389, "step": 24680 }, { "epoch": 0.6926637677093561, "grad_norm": 0.33845990896224976, "learning_rate": 3.845560387151073e-05, "loss": 0.022, "step": 24690 }, { "epoch": 0.6929443119652126, "grad_norm": 0.12001678347587585, "learning_rate": 3.845092813391313e-05, "loss": 0.0119, "step": 24700 }, { "epoch": 0.6932248562210689, "grad_norm": 0.8399164080619812, "learning_rate": 3.844625239631552e-05, "loss": 0.012, "step": 24710 }, { "epoch": 0.6935054004769252, "grad_norm": 0.18729692697525024, "learning_rate": 3.8441576658717915e-05, "loss": 0.0305, "step": 24720 }, { "epoch": 0.6937859447327815, "grad_norm": 0.07821544259786606, "learning_rate": 3.843690092112031e-05, "loss": 0.0167, "step": 24730 }, { "epoch": 0.694066488988638, "grad_norm": 0.20364460349082947, "learning_rate": 3.84322251835227e-05, "loss": 0.046, "step": 24740 }, { "epoch": 0.6943470332444943, "grad_norm": 0.029393460601568222, "learning_rate": 3.84275494459251e-05, "loss": 0.0312, "step": 24750 }, { "epoch": 0.6946275775003506, "grad_norm": 0.9740104675292969, "learning_rate": 3.842287370832749e-05, "loss": 0.0296, "step": 24760 }, { "epoch": 0.6949081217562071, "grad_norm": 1.5013344287872314, "learning_rate": 3.841819797072989e-05, "loss": 0.0254, "step": 24770 }, { "epoch": 0.6951886660120634, "grad_norm": 1.5424227714538574, "learning_rate": 3.8413522233132274e-05, "loss": 0.0195, "step": 24780 }, { "epoch": 0.6954692102679197, "grad_norm": 0.06564722955226898, "learning_rate": 3.8408846495534674e-05, "loss": 0.029, "step": 24790 }, { "epoch": 0.6957497545237761, "grad_norm": 0.3096904754638672, "learning_rate": 3.840417075793707e-05, "loss": 0.0443, "step": 24800 }, { "epoch": 0.6960302987796325, "grad_norm": 0.20620103180408478, "learning_rate": 3.839949502033946e-05, "loss": 0.0182, "step": 24810 }, { "epoch": 0.6963108430354888, "grad_norm": 3.256781578063965, "learning_rate": 3.839481928274185e-05, "loss": 0.0286, "step": 24820 }, { "epoch": 0.6965913872913452, "grad_norm": 0.6302417516708374, "learning_rate": 3.8390143545144246e-05, "loss": 0.0146, "step": 24830 }, { "epoch": 0.6968719315472016, "grad_norm": 0.03462322801351547, "learning_rate": 3.8385467807546646e-05, "loss": 0.0213, "step": 24840 }, { "epoch": 0.697152475803058, "grad_norm": 0.3534061014652252, "learning_rate": 3.838079206994903e-05, "loss": 0.0153, "step": 24850 }, { "epoch": 0.6974330200589143, "grad_norm": 0.1535591334104538, "learning_rate": 3.837611633235143e-05, "loss": 0.0263, "step": 24860 }, { "epoch": 0.6977135643147706, "grad_norm": 0.15811899304389954, "learning_rate": 3.8371440594753826e-05, "loss": 0.0339, "step": 24870 }, { "epoch": 0.697994108570627, "grad_norm": 1.8170545101165771, "learning_rate": 3.836676485715622e-05, "loss": 0.0427, "step": 24880 }, { "epoch": 0.6982746528264834, "grad_norm": 0.15544533729553223, "learning_rate": 3.836208911955861e-05, "loss": 0.0225, "step": 24890 }, { "epoch": 0.6985551970823397, "grad_norm": 2.075563669204712, "learning_rate": 3.8357413381961005e-05, "loss": 0.011, "step": 24900 }, { "epoch": 0.6988357413381961, "grad_norm": 0.03273499011993408, "learning_rate": 3.83527376443634e-05, "loss": 0.0575, "step": 24910 }, { "epoch": 0.6991162855940525, "grad_norm": 1.2711933851242065, "learning_rate": 3.834806190676579e-05, "loss": 0.0414, "step": 24920 }, { "epoch": 0.6993968298499088, "grad_norm": 3.1688954830169678, "learning_rate": 3.834338616916819e-05, "loss": 0.0289, "step": 24930 }, { "epoch": 0.6996773741057651, "grad_norm": 0.14562727510929108, "learning_rate": 3.8338710431570585e-05, "loss": 0.0571, "step": 24940 }, { "epoch": 0.6999579183616216, "grad_norm": 0.7857149839401245, "learning_rate": 3.833403469397298e-05, "loss": 0.0314, "step": 24950 }, { "epoch": 0.7002384626174779, "grad_norm": 0.5670215487480164, "learning_rate": 3.832935895637537e-05, "loss": 0.0167, "step": 24960 }, { "epoch": 0.7005190068733342, "grad_norm": 0.10155506432056427, "learning_rate": 3.8324683218777764e-05, "loss": 0.0158, "step": 24970 }, { "epoch": 0.7007995511291907, "grad_norm": 0.09284135699272156, "learning_rate": 3.832000748118016e-05, "loss": 0.0246, "step": 24980 }, { "epoch": 0.701080095385047, "grad_norm": 1.7162643671035767, "learning_rate": 3.831533174358255e-05, "loss": 0.0369, "step": 24990 }, { "epoch": 0.7013606396409033, "grad_norm": 0.5675196647644043, "learning_rate": 3.8310656005984944e-05, "loss": 0.047, "step": 25000 }, { "epoch": 0.7016411838967597, "grad_norm": 0.563754677772522, "learning_rate": 3.8305980268387344e-05, "loss": 0.0454, "step": 25010 }, { "epoch": 0.7019217281526161, "grad_norm": 0.09345812350511551, "learning_rate": 3.830130453078973e-05, "loss": 0.0252, "step": 25020 }, { "epoch": 0.7022022724084724, "grad_norm": 1.2149022817611694, "learning_rate": 3.829662879319213e-05, "loss": 0.0413, "step": 25030 }, { "epoch": 0.7024828166643288, "grad_norm": 0.11172030121088028, "learning_rate": 3.829195305559452e-05, "loss": 0.0313, "step": 25040 }, { "epoch": 0.7027633609201852, "grad_norm": 0.07137187570333481, "learning_rate": 3.8287277317996916e-05, "loss": 0.0139, "step": 25050 }, { "epoch": 0.7030439051760415, "grad_norm": 0.6786089539527893, "learning_rate": 3.828260158039931e-05, "loss": 0.0456, "step": 25060 }, { "epoch": 0.7033244494318979, "grad_norm": 0.22062668204307556, "learning_rate": 3.82779258428017e-05, "loss": 0.0676, "step": 25070 }, { "epoch": 0.7036049936877542, "grad_norm": 0.98066246509552, "learning_rate": 3.82732501052041e-05, "loss": 0.0224, "step": 25080 }, { "epoch": 0.7038855379436106, "grad_norm": 0.14956533908843994, "learning_rate": 3.826857436760649e-05, "loss": 0.0343, "step": 25090 }, { "epoch": 0.704166082199467, "grad_norm": 0.15842510759830475, "learning_rate": 3.826389863000889e-05, "loss": 0.0237, "step": 25100 }, { "epoch": 0.7044466264553233, "grad_norm": 0.23285749554634094, "learning_rate": 3.8259222892411275e-05, "loss": 0.0125, "step": 25110 }, { "epoch": 0.7047271707111797, "grad_norm": 0.04391917213797569, "learning_rate": 3.8254547154813675e-05, "loss": 0.0561, "step": 25120 }, { "epoch": 0.7050077149670361, "grad_norm": 0.04765023663640022, "learning_rate": 3.824987141721607e-05, "loss": 0.0216, "step": 25130 }, { "epoch": 0.7052882592228924, "grad_norm": 0.30557870864868164, "learning_rate": 3.824519567961846e-05, "loss": 0.0223, "step": 25140 }, { "epoch": 0.7055688034787487, "grad_norm": 1.787122368812561, "learning_rate": 3.824051994202086e-05, "loss": 0.0216, "step": 25150 }, { "epoch": 0.7058493477346052, "grad_norm": 1.0666066408157349, "learning_rate": 3.823584420442325e-05, "loss": 0.0525, "step": 25160 }, { "epoch": 0.7061298919904615, "grad_norm": 0.05113120377063751, "learning_rate": 3.823116846682565e-05, "loss": 0.0162, "step": 25170 }, { "epoch": 0.7064104362463178, "grad_norm": 0.06852001696825027, "learning_rate": 3.8226492729228034e-05, "loss": 0.0469, "step": 25180 }, { "epoch": 0.7066909805021743, "grad_norm": 0.16661213338375092, "learning_rate": 3.8221816991630434e-05, "loss": 0.0203, "step": 25190 }, { "epoch": 0.7069715247580306, "grad_norm": 0.02860392816364765, "learning_rate": 3.821714125403282e-05, "loss": 0.0529, "step": 25200 }, { "epoch": 0.7072520690138869, "grad_norm": 0.9956585168838501, "learning_rate": 3.821246551643522e-05, "loss": 0.016, "step": 25210 }, { "epoch": 0.7075326132697433, "grad_norm": 0.8469241857528687, "learning_rate": 3.8207789778837613e-05, "loss": 0.0229, "step": 25220 }, { "epoch": 0.7078131575255997, "grad_norm": 0.10392174869775772, "learning_rate": 3.8203114041240007e-05, "loss": 0.025, "step": 25230 }, { "epoch": 0.708093701781456, "grad_norm": 0.12531588971614838, "learning_rate": 3.8198438303642406e-05, "loss": 0.0493, "step": 25240 }, { "epoch": 0.7083742460373124, "grad_norm": 0.13654696941375732, "learning_rate": 3.819376256604479e-05, "loss": 0.0272, "step": 25250 }, { "epoch": 0.7086547902931688, "grad_norm": 0.12876342236995697, "learning_rate": 3.818908682844719e-05, "loss": 0.0453, "step": 25260 }, { "epoch": 0.7089353345490251, "grad_norm": 3.2016987800598145, "learning_rate": 3.818441109084958e-05, "loss": 0.0373, "step": 25270 }, { "epoch": 0.7092158788048815, "grad_norm": 0.207061767578125, "learning_rate": 3.817973535325198e-05, "loss": 0.05, "step": 25280 }, { "epoch": 0.7094964230607378, "grad_norm": 0.23185548186302185, "learning_rate": 3.817505961565437e-05, "loss": 0.0271, "step": 25290 }, { "epoch": 0.7097769673165942, "grad_norm": 0.36338019371032715, "learning_rate": 3.8170383878056765e-05, "loss": 0.0127, "step": 25300 }, { "epoch": 0.7100575115724506, "grad_norm": 0.09704617410898209, "learning_rate": 3.816570814045916e-05, "loss": 0.0301, "step": 25310 }, { "epoch": 0.7103380558283069, "grad_norm": 0.38941988348960876, "learning_rate": 3.816103240286155e-05, "loss": 0.0248, "step": 25320 }, { "epoch": 0.7106186000841633, "grad_norm": 2.230834722518921, "learning_rate": 3.8156356665263945e-05, "loss": 0.0449, "step": 25330 }, { "epoch": 0.7108991443400197, "grad_norm": 1.1818104982376099, "learning_rate": 3.815168092766634e-05, "loss": 0.0184, "step": 25340 }, { "epoch": 0.711179688595876, "grad_norm": 0.10259278118610382, "learning_rate": 3.814700519006874e-05, "loss": 0.0146, "step": 25350 }, { "epoch": 0.7114602328517323, "grad_norm": 0.10248912870883942, "learning_rate": 3.814232945247113e-05, "loss": 0.011, "step": 25360 }, { "epoch": 0.7117407771075888, "grad_norm": 0.25718051195144653, "learning_rate": 3.8137653714873524e-05, "loss": 0.0166, "step": 25370 }, { "epoch": 0.7120213213634451, "grad_norm": 0.11998813599348068, "learning_rate": 3.813297797727592e-05, "loss": 0.0234, "step": 25380 }, { "epoch": 0.7123018656193014, "grad_norm": 1.5589264631271362, "learning_rate": 3.812830223967831e-05, "loss": 0.0443, "step": 25390 }, { "epoch": 0.7125824098751578, "grad_norm": 0.055008940398693085, "learning_rate": 3.8123626502080704e-05, "loss": 0.0173, "step": 25400 }, { "epoch": 0.7128629541310142, "grad_norm": 0.4280003011226654, "learning_rate": 3.81189507644831e-05, "loss": 0.0117, "step": 25410 }, { "epoch": 0.7131434983868705, "grad_norm": 0.8278501629829407, "learning_rate": 3.811427502688549e-05, "loss": 0.0223, "step": 25420 }, { "epoch": 0.7134240426427269, "grad_norm": 0.023428888991475105, "learning_rate": 3.810959928928789e-05, "loss": 0.024, "step": 25430 }, { "epoch": 0.7137045868985833, "grad_norm": 0.03644218295812607, "learning_rate": 3.810492355169028e-05, "loss": 0.0521, "step": 25440 }, { "epoch": 0.7139851311544396, "grad_norm": 0.3178386986255646, "learning_rate": 3.8100247814092676e-05, "loss": 0.0561, "step": 25450 }, { "epoch": 0.714265675410296, "grad_norm": 0.28151699900627136, "learning_rate": 3.809557207649507e-05, "loss": 0.0546, "step": 25460 }, { "epoch": 0.7145462196661523, "grad_norm": 0.20915378630161285, "learning_rate": 3.809089633889746e-05, "loss": 0.0119, "step": 25470 }, { "epoch": 0.7148267639220087, "grad_norm": 0.03310835361480713, "learning_rate": 3.8086220601299856e-05, "loss": 0.0281, "step": 25480 }, { "epoch": 0.715107308177865, "grad_norm": 0.030825814232230186, "learning_rate": 3.808154486370225e-05, "loss": 0.0119, "step": 25490 }, { "epoch": 0.7153878524337214, "grad_norm": 3.129936933517456, "learning_rate": 3.807686912610465e-05, "loss": 0.0275, "step": 25500 }, { "epoch": 0.7156683966895778, "grad_norm": 0.052750229835510254, "learning_rate": 3.8072193388507035e-05, "loss": 0.0117, "step": 25510 }, { "epoch": 0.7159489409454342, "grad_norm": 0.0792463943362236, "learning_rate": 3.8067517650909435e-05, "loss": 0.0395, "step": 25520 }, { "epoch": 0.7162294852012905, "grad_norm": 0.4678553342819214, "learning_rate": 3.806284191331183e-05, "loss": 0.0413, "step": 25530 }, { "epoch": 0.7165100294571468, "grad_norm": 0.26413604617118835, "learning_rate": 3.805816617571422e-05, "loss": 0.0364, "step": 25540 }, { "epoch": 0.7167905737130033, "grad_norm": 1.4924875497817993, "learning_rate": 3.8053490438116615e-05, "loss": 0.0336, "step": 25550 }, { "epoch": 0.7170711179688596, "grad_norm": 0.47927695512771606, "learning_rate": 3.804881470051901e-05, "loss": 0.0388, "step": 25560 }, { "epoch": 0.7173516622247159, "grad_norm": 0.09347701817750931, "learning_rate": 3.804413896292141e-05, "loss": 0.0205, "step": 25570 }, { "epoch": 0.7176322064805724, "grad_norm": 0.3180250823497772, "learning_rate": 3.8039463225323794e-05, "loss": 0.0263, "step": 25580 }, { "epoch": 0.7179127507364287, "grad_norm": 0.015305743552744389, "learning_rate": 3.8034787487726194e-05, "loss": 0.0262, "step": 25590 }, { "epoch": 0.718193294992285, "grad_norm": 0.07041285187005997, "learning_rate": 3.803011175012858e-05, "loss": 0.0114, "step": 25600 }, { "epoch": 0.7184738392481413, "grad_norm": 0.026321861892938614, "learning_rate": 3.802543601253098e-05, "loss": 0.0093, "step": 25610 }, { "epoch": 0.7187543835039978, "grad_norm": 3.717725992202759, "learning_rate": 3.8020760274933374e-05, "loss": 0.0453, "step": 25620 }, { "epoch": 0.7190349277598541, "grad_norm": 2.713118553161621, "learning_rate": 3.801608453733577e-05, "loss": 0.0359, "step": 25630 }, { "epoch": 0.7193154720157104, "grad_norm": 0.1098276823759079, "learning_rate": 3.801140879973816e-05, "loss": 0.015, "step": 25640 }, { "epoch": 0.7195960162715669, "grad_norm": 0.08228477090597153, "learning_rate": 3.800673306214055e-05, "loss": 0.0404, "step": 25650 }, { "epoch": 0.7198765605274232, "grad_norm": 1.2213770151138306, "learning_rate": 3.800205732454295e-05, "loss": 0.0341, "step": 25660 }, { "epoch": 0.7201571047832795, "grad_norm": 0.032245393842458725, "learning_rate": 3.799738158694534e-05, "loss": 0.0164, "step": 25670 }, { "epoch": 0.7204376490391359, "grad_norm": 0.3923555314540863, "learning_rate": 3.799270584934774e-05, "loss": 0.0139, "step": 25680 }, { "epoch": 0.7207181932949923, "grad_norm": 0.18429122865200043, "learning_rate": 3.7988030111750126e-05, "loss": 0.0423, "step": 25690 }, { "epoch": 0.7209987375508486, "grad_norm": 5.105329513549805, "learning_rate": 3.7983354374152526e-05, "loss": 0.0538, "step": 25700 }, { "epoch": 0.721279281806705, "grad_norm": 3.100364923477173, "learning_rate": 3.797867863655492e-05, "loss": 0.0265, "step": 25710 }, { "epoch": 0.7215598260625614, "grad_norm": 0.028914673253893852, "learning_rate": 3.797400289895731e-05, "loss": 0.0054, "step": 25720 }, { "epoch": 0.7218403703184177, "grad_norm": 1.8753808736801147, "learning_rate": 3.7969327161359705e-05, "loss": 0.0185, "step": 25730 }, { "epoch": 0.7221209145742741, "grad_norm": 0.1029067188501358, "learning_rate": 3.79646514237621e-05, "loss": 0.0166, "step": 25740 }, { "epoch": 0.7224014588301304, "grad_norm": 0.020826848223805428, "learning_rate": 3.79599756861645e-05, "loss": 0.0173, "step": 25750 }, { "epoch": 0.7226820030859868, "grad_norm": 0.18941430747509003, "learning_rate": 3.7955299948566884e-05, "loss": 0.0131, "step": 25760 }, { "epoch": 0.7229625473418432, "grad_norm": 0.049623001366853714, "learning_rate": 3.7950624210969284e-05, "loss": 0.0352, "step": 25770 }, { "epoch": 0.7232430915976995, "grad_norm": 0.5616443157196045, "learning_rate": 3.794594847337168e-05, "loss": 0.0127, "step": 25780 }, { "epoch": 0.723523635853556, "grad_norm": 0.020462172105908394, "learning_rate": 3.794127273577407e-05, "loss": 0.0222, "step": 25790 }, { "epoch": 0.7238041801094123, "grad_norm": 0.9225625991821289, "learning_rate": 3.7936596998176464e-05, "loss": 0.0326, "step": 25800 }, { "epoch": 0.7240847243652686, "grad_norm": 0.3638676404953003, "learning_rate": 3.793192126057886e-05, "loss": 0.0105, "step": 25810 }, { "epoch": 0.7243652686211249, "grad_norm": 0.15734779834747314, "learning_rate": 3.792724552298125e-05, "loss": 0.0383, "step": 25820 }, { "epoch": 0.7246458128769814, "grad_norm": 0.4972843527793884, "learning_rate": 3.792256978538364e-05, "loss": 0.0137, "step": 25830 }, { "epoch": 0.7249263571328377, "grad_norm": 0.08208829164505005, "learning_rate": 3.791789404778604e-05, "loss": 0.014, "step": 25840 }, { "epoch": 0.725206901388694, "grad_norm": 0.48373931646347046, "learning_rate": 3.7913218310188436e-05, "loss": 0.029, "step": 25850 }, { "epoch": 0.7254874456445505, "grad_norm": 5.0809807777404785, "learning_rate": 3.790854257259083e-05, "loss": 0.0381, "step": 25860 }, { "epoch": 0.7257679899004068, "grad_norm": 0.5449585318565369, "learning_rate": 3.790386683499322e-05, "loss": 0.0638, "step": 25870 }, { "epoch": 0.7260485341562631, "grad_norm": 0.18042322993278503, "learning_rate": 3.7899191097395616e-05, "loss": 0.0477, "step": 25880 }, { "epoch": 0.7263290784121195, "grad_norm": 0.13749472796916962, "learning_rate": 3.789451535979801e-05, "loss": 0.0535, "step": 25890 }, { "epoch": 0.7266096226679759, "grad_norm": 3.5462870597839355, "learning_rate": 3.78898396222004e-05, "loss": 0.0267, "step": 25900 }, { "epoch": 0.7268901669238322, "grad_norm": 0.571878969669342, "learning_rate": 3.7885163884602795e-05, "loss": 0.0101, "step": 25910 }, { "epoch": 0.7271707111796886, "grad_norm": 0.04007372260093689, "learning_rate": 3.7880488147005195e-05, "loss": 0.022, "step": 25920 }, { "epoch": 0.727451255435545, "grad_norm": 0.4599281847476959, "learning_rate": 3.787581240940758e-05, "loss": 0.0285, "step": 25930 }, { "epoch": 0.7277317996914013, "grad_norm": 0.2238558530807495, "learning_rate": 3.787113667180998e-05, "loss": 0.0086, "step": 25940 }, { "epoch": 0.7280123439472577, "grad_norm": 3.0771617889404297, "learning_rate": 3.7866460934212375e-05, "loss": 0.0656, "step": 25950 }, { "epoch": 0.728292888203114, "grad_norm": 0.33144253492355347, "learning_rate": 3.786178519661477e-05, "loss": 0.0536, "step": 25960 }, { "epoch": 0.7285734324589704, "grad_norm": 0.1116536408662796, "learning_rate": 3.785710945901716e-05, "loss": 0.0221, "step": 25970 }, { "epoch": 0.7288539767148268, "grad_norm": 0.05542154610157013, "learning_rate": 3.7852433721419554e-05, "loss": 0.0187, "step": 25980 }, { "epoch": 0.7291345209706831, "grad_norm": 0.8016694188117981, "learning_rate": 3.7847757983821954e-05, "loss": 0.0669, "step": 25990 }, { "epoch": 0.7294150652265395, "grad_norm": 0.17447489500045776, "learning_rate": 3.784308224622434e-05, "loss": 0.0362, "step": 26000 }, { "epoch": 0.7296956094823959, "grad_norm": 0.12872332334518433, "learning_rate": 3.783840650862674e-05, "loss": 0.0192, "step": 26010 }, { "epoch": 0.7299761537382522, "grad_norm": 0.39369311928749084, "learning_rate": 3.783373077102913e-05, "loss": 0.0276, "step": 26020 }, { "epoch": 0.7302566979941085, "grad_norm": 0.09624532610177994, "learning_rate": 3.782905503343153e-05, "loss": 0.0223, "step": 26030 }, { "epoch": 0.730537242249965, "grad_norm": 0.03231789916753769, "learning_rate": 3.782437929583392e-05, "loss": 0.0381, "step": 26040 }, { "epoch": 0.7308177865058213, "grad_norm": 0.04607568681240082, "learning_rate": 3.781970355823631e-05, "loss": 0.0195, "step": 26050 }, { "epoch": 0.7310983307616776, "grad_norm": 0.225514754652977, "learning_rate": 3.781502782063871e-05, "loss": 0.046, "step": 26060 }, { "epoch": 0.7313788750175341, "grad_norm": 0.3428187668323517, "learning_rate": 3.78103520830411e-05, "loss": 0.0204, "step": 26070 }, { "epoch": 0.7316594192733904, "grad_norm": 0.20102502405643463, "learning_rate": 3.78056763454435e-05, "loss": 0.0372, "step": 26080 }, { "epoch": 0.7319399635292467, "grad_norm": 4.477596282958984, "learning_rate": 3.7801000607845886e-05, "loss": 0.0116, "step": 26090 }, { "epoch": 0.7322205077851031, "grad_norm": 0.8657251000404358, "learning_rate": 3.7796324870248286e-05, "loss": 0.0287, "step": 26100 }, { "epoch": 0.7325010520409595, "grad_norm": 0.03966415300965309, "learning_rate": 3.779164913265067e-05, "loss": 0.0158, "step": 26110 }, { "epoch": 0.7327815962968158, "grad_norm": 0.4963042438030243, "learning_rate": 3.778697339505307e-05, "loss": 0.003, "step": 26120 }, { "epoch": 0.7330621405526722, "grad_norm": 0.030686264857649803, "learning_rate": 3.7782297657455465e-05, "loss": 0.0588, "step": 26130 }, { "epoch": 0.7333426848085286, "grad_norm": 2.0223171710968018, "learning_rate": 3.777762191985786e-05, "loss": 0.0316, "step": 26140 }, { "epoch": 0.7336232290643849, "grad_norm": 0.03536931797862053, "learning_rate": 3.777294618226026e-05, "loss": 0.0235, "step": 26150 }, { "epoch": 0.7339037733202413, "grad_norm": 0.21969729661941528, "learning_rate": 3.7768270444662645e-05, "loss": 0.0116, "step": 26160 }, { "epoch": 0.7341843175760976, "grad_norm": 7.875779151916504, "learning_rate": 3.7763594707065044e-05, "loss": 0.0332, "step": 26170 }, { "epoch": 0.734464861831954, "grad_norm": 7.41851806640625, "learning_rate": 3.775891896946743e-05, "loss": 0.0195, "step": 26180 }, { "epoch": 0.7347454060878104, "grad_norm": 0.03651905432343483, "learning_rate": 3.775424323186983e-05, "loss": 0.0477, "step": 26190 }, { "epoch": 0.7350259503436667, "grad_norm": 0.049879927188158035, "learning_rate": 3.7749567494272224e-05, "loss": 0.032, "step": 26200 }, { "epoch": 0.735306494599523, "grad_norm": 0.2966752350330353, "learning_rate": 3.774489175667462e-05, "loss": 0.0195, "step": 26210 }, { "epoch": 0.7355870388553795, "grad_norm": 0.7365740537643433, "learning_rate": 3.774021601907701e-05, "loss": 0.0195, "step": 26220 }, { "epoch": 0.7358675831112358, "grad_norm": 0.4809545576572418, "learning_rate": 3.7735540281479403e-05, "loss": 0.0242, "step": 26230 }, { "epoch": 0.7361481273670921, "grad_norm": 6.408763408660889, "learning_rate": 3.7730864543881797e-05, "loss": 0.0227, "step": 26240 }, { "epoch": 0.7364286716229486, "grad_norm": 0.2986867427825928, "learning_rate": 3.772618880628419e-05, "loss": 0.0491, "step": 26250 }, { "epoch": 0.7367092158788049, "grad_norm": 0.3062385022640228, "learning_rate": 3.772151306868659e-05, "loss": 0.0386, "step": 26260 }, { "epoch": 0.7369897601346612, "grad_norm": 1.9727858304977417, "learning_rate": 3.771683733108898e-05, "loss": 0.0328, "step": 26270 }, { "epoch": 0.7372703043905176, "grad_norm": 0.17741897702217102, "learning_rate": 3.7712161593491376e-05, "loss": 0.0221, "step": 26280 }, { "epoch": 0.737550848646374, "grad_norm": 0.0623333714902401, "learning_rate": 3.770748585589377e-05, "loss": 0.0453, "step": 26290 }, { "epoch": 0.7378313929022303, "grad_norm": 0.02340639941394329, "learning_rate": 3.770281011829616e-05, "loss": 0.0189, "step": 26300 }, { "epoch": 0.7381119371580867, "grad_norm": 0.6592329144477844, "learning_rate": 3.7698134380698555e-05, "loss": 0.0218, "step": 26310 }, { "epoch": 0.7383924814139431, "grad_norm": 4.04000186920166, "learning_rate": 3.769345864310095e-05, "loss": 0.0212, "step": 26320 }, { "epoch": 0.7386730256697994, "grad_norm": 0.05859946087002754, "learning_rate": 3.768878290550334e-05, "loss": 0.0066, "step": 26330 }, { "epoch": 0.7389535699256558, "grad_norm": 0.03337478265166283, "learning_rate": 3.768410716790574e-05, "loss": 0.0168, "step": 26340 }, { "epoch": 0.7392341141815121, "grad_norm": 0.18210378289222717, "learning_rate": 3.7679431430308135e-05, "loss": 0.0222, "step": 26350 }, { "epoch": 0.7395146584373685, "grad_norm": 0.06714258342981339, "learning_rate": 3.767475569271053e-05, "loss": 0.0238, "step": 26360 }, { "epoch": 0.7397952026932249, "grad_norm": 1.1038806438446045, "learning_rate": 3.767007995511292e-05, "loss": 0.0762, "step": 26370 }, { "epoch": 0.7400757469490812, "grad_norm": 0.19006362557411194, "learning_rate": 3.7665404217515314e-05, "loss": 0.0184, "step": 26380 }, { "epoch": 0.7403562912049376, "grad_norm": 0.029210882261395454, "learning_rate": 3.766072847991771e-05, "loss": 0.0282, "step": 26390 }, { "epoch": 0.740636835460794, "grad_norm": 0.19486764073371887, "learning_rate": 3.76560527423201e-05, "loss": 0.0361, "step": 26400 }, { "epoch": 0.7409173797166503, "grad_norm": 0.06728459149599075, "learning_rate": 3.76513770047225e-05, "loss": 0.0663, "step": 26410 }, { "epoch": 0.7411979239725066, "grad_norm": 0.07872515916824341, "learning_rate": 3.764670126712489e-05, "loss": 0.0179, "step": 26420 }, { "epoch": 0.741478468228363, "grad_norm": 2.836656093597412, "learning_rate": 3.764202552952729e-05, "loss": 0.0703, "step": 26430 }, { "epoch": 0.7417590124842194, "grad_norm": 0.021632375195622444, "learning_rate": 3.763734979192968e-05, "loss": 0.0102, "step": 26440 }, { "epoch": 0.7420395567400757, "grad_norm": 0.04630523920059204, "learning_rate": 3.763267405433207e-05, "loss": 0.0189, "step": 26450 }, { "epoch": 0.7423201009959322, "grad_norm": 0.023378299549221992, "learning_rate": 3.7627998316734466e-05, "loss": 0.0139, "step": 26460 }, { "epoch": 0.7426006452517885, "grad_norm": 0.08844325691461563, "learning_rate": 3.762332257913686e-05, "loss": 0.0087, "step": 26470 }, { "epoch": 0.7428811895076448, "grad_norm": 0.1483190804719925, "learning_rate": 3.761864684153926e-05, "loss": 0.063, "step": 26480 }, { "epoch": 0.7431617337635011, "grad_norm": 0.03482922911643982, "learning_rate": 3.7613971103941646e-05, "loss": 0.0509, "step": 26490 }, { "epoch": 0.7434422780193576, "grad_norm": 0.13988882303237915, "learning_rate": 3.7609295366344046e-05, "loss": 0.0454, "step": 26500 }, { "epoch": 0.7437228222752139, "grad_norm": 0.29128730297088623, "learning_rate": 3.760461962874643e-05, "loss": 0.0455, "step": 26510 }, { "epoch": 0.7440033665310702, "grad_norm": 0.05934809893369675, "learning_rate": 3.759994389114883e-05, "loss": 0.0104, "step": 26520 }, { "epoch": 0.7442839107869267, "grad_norm": 0.6899235844612122, "learning_rate": 3.7595268153551225e-05, "loss": 0.015, "step": 26530 }, { "epoch": 0.744564455042783, "grad_norm": 0.25653648376464844, "learning_rate": 3.759059241595362e-05, "loss": 0.0069, "step": 26540 }, { "epoch": 0.7448449992986393, "grad_norm": 0.12620916962623596, "learning_rate": 3.758591667835601e-05, "loss": 0.0362, "step": 26550 }, { "epoch": 0.7451255435544957, "grad_norm": 0.1511276811361313, "learning_rate": 3.7581240940758405e-05, "loss": 0.0094, "step": 26560 }, { "epoch": 0.7454060878103521, "grad_norm": 0.040361106395721436, "learning_rate": 3.7576565203160805e-05, "loss": 0.0305, "step": 26570 }, { "epoch": 0.7456866320662084, "grad_norm": 0.10724196583032608, "learning_rate": 3.757188946556319e-05, "loss": 0.0262, "step": 26580 }, { "epoch": 0.7459671763220648, "grad_norm": 2.225358247756958, "learning_rate": 3.756721372796559e-05, "loss": 0.0405, "step": 26590 }, { "epoch": 0.7462477205779212, "grad_norm": 1.2523659467697144, "learning_rate": 3.756253799036798e-05, "loss": 0.0337, "step": 26600 }, { "epoch": 0.7465282648337775, "grad_norm": 0.19272585213184357, "learning_rate": 3.755786225277038e-05, "loss": 0.0069, "step": 26610 }, { "epoch": 0.7468088090896339, "grad_norm": 0.042443498969078064, "learning_rate": 3.755318651517277e-05, "loss": 0.0239, "step": 26620 }, { "epoch": 0.7470893533454902, "grad_norm": 1.1210124492645264, "learning_rate": 3.7548510777575163e-05, "loss": 0.0718, "step": 26630 }, { "epoch": 0.7473698976013466, "grad_norm": 0.07175491005182266, "learning_rate": 3.754383503997756e-05, "loss": 0.0448, "step": 26640 }, { "epoch": 0.747650441857203, "grad_norm": 0.03941613808274269, "learning_rate": 3.753915930237995e-05, "loss": 0.0057, "step": 26650 }, { "epoch": 0.7479309861130593, "grad_norm": 0.05413787066936493, "learning_rate": 3.753448356478235e-05, "loss": 0.0526, "step": 26660 }, { "epoch": 0.7482115303689157, "grad_norm": 36.86410903930664, "learning_rate": 3.7529807827184736e-05, "loss": 0.0219, "step": 26670 }, { "epoch": 0.7484920746247721, "grad_norm": 0.34200960397720337, "learning_rate": 3.7525132089587136e-05, "loss": 0.0195, "step": 26680 }, { "epoch": 0.7487726188806284, "grad_norm": 3.6827645301818848, "learning_rate": 3.752045635198953e-05, "loss": 0.0232, "step": 26690 }, { "epoch": 0.7490531631364847, "grad_norm": 0.15568797290325165, "learning_rate": 3.751578061439192e-05, "loss": 0.0452, "step": 26700 }, { "epoch": 0.7493337073923412, "grad_norm": 5.426539421081543, "learning_rate": 3.7511104876794316e-05, "loss": 0.0549, "step": 26710 }, { "epoch": 0.7496142516481975, "grad_norm": 0.34731897711753845, "learning_rate": 3.750642913919671e-05, "loss": 0.0102, "step": 26720 }, { "epoch": 0.7498947959040538, "grad_norm": 0.6350764632225037, "learning_rate": 3.75017534015991e-05, "loss": 0.0199, "step": 26730 }, { "epoch": 0.7501753401599103, "grad_norm": 0.06661983579397202, "learning_rate": 3.7497077664001495e-05, "loss": 0.0216, "step": 26740 }, { "epoch": 0.7504558844157666, "grad_norm": 0.06398290395736694, "learning_rate": 3.7492401926403895e-05, "loss": 0.0405, "step": 26750 }, { "epoch": 0.7507364286716229, "grad_norm": 0.5629951357841492, "learning_rate": 3.748772618880629e-05, "loss": 0.0275, "step": 26760 }, { "epoch": 0.7510169729274793, "grad_norm": 0.12387312203645706, "learning_rate": 3.748305045120868e-05, "loss": 0.0147, "step": 26770 }, { "epoch": 0.7512975171833357, "grad_norm": 0.9628930687904358, "learning_rate": 3.7478374713611074e-05, "loss": 0.0129, "step": 26780 }, { "epoch": 0.751578061439192, "grad_norm": 0.4340348541736603, "learning_rate": 3.747369897601347e-05, "loss": 0.0201, "step": 26790 }, { "epoch": 0.7518586056950484, "grad_norm": 0.07896804064512253, "learning_rate": 3.746902323841586e-05, "loss": 0.0227, "step": 26800 }, { "epoch": 0.7521391499509048, "grad_norm": 0.2294015884399414, "learning_rate": 3.7464347500818254e-05, "loss": 0.0049, "step": 26810 }, { "epoch": 0.7524196942067611, "grad_norm": 2.7425434589385986, "learning_rate": 3.745967176322065e-05, "loss": 0.0186, "step": 26820 }, { "epoch": 0.7527002384626175, "grad_norm": 0.8719572424888611, "learning_rate": 3.745499602562305e-05, "loss": 0.049, "step": 26830 }, { "epoch": 0.7529807827184738, "grad_norm": 0.20256361365318298, "learning_rate": 3.745032028802543e-05, "loss": 0.0161, "step": 26840 }, { "epoch": 0.7532613269743302, "grad_norm": 0.17631684243679047, "learning_rate": 3.744564455042783e-05, "loss": 0.0319, "step": 26850 }, { "epoch": 0.7535418712301866, "grad_norm": 0.12877456843852997, "learning_rate": 3.7440968812830226e-05, "loss": 0.0299, "step": 26860 }, { "epoch": 0.7538224154860429, "grad_norm": 1.0239245891571045, "learning_rate": 3.743629307523262e-05, "loss": 0.023, "step": 26870 }, { "epoch": 0.7541029597418993, "grad_norm": 0.25029927492141724, "learning_rate": 3.743161733763501e-05, "loss": 0.0473, "step": 26880 }, { "epoch": 0.7543835039977557, "grad_norm": 0.03563198447227478, "learning_rate": 3.7426941600037406e-05, "loss": 0.0154, "step": 26890 }, { "epoch": 0.754664048253612, "grad_norm": 0.7351282835006714, "learning_rate": 3.7422265862439806e-05, "loss": 0.0129, "step": 26900 }, { "epoch": 0.7549445925094683, "grad_norm": 0.4094708561897278, "learning_rate": 3.741759012484219e-05, "loss": 0.0108, "step": 26910 }, { "epoch": 0.7552251367653248, "grad_norm": 0.025403369218111038, "learning_rate": 3.741291438724459e-05, "loss": 0.0124, "step": 26920 }, { "epoch": 0.7555056810211811, "grad_norm": 0.06044026464223862, "learning_rate": 3.740823864964698e-05, "loss": 0.0075, "step": 26930 }, { "epoch": 0.7557862252770374, "grad_norm": 0.03622705116868019, "learning_rate": 3.740356291204938e-05, "loss": 0.0234, "step": 26940 }, { "epoch": 0.7560667695328939, "grad_norm": 0.06216801702976227, "learning_rate": 3.739888717445177e-05, "loss": 0.0298, "step": 26950 }, { "epoch": 0.7563473137887502, "grad_norm": 0.2524009048938751, "learning_rate": 3.7394211436854165e-05, "loss": 0.0234, "step": 26960 }, { "epoch": 0.7566278580446065, "grad_norm": 0.13021405041217804, "learning_rate": 3.7389535699256565e-05, "loss": 0.0332, "step": 26970 }, { "epoch": 0.7569084023004629, "grad_norm": 1.8993656635284424, "learning_rate": 3.738485996165895e-05, "loss": 0.0451, "step": 26980 }, { "epoch": 0.7571889465563193, "grad_norm": 0.15022607147693634, "learning_rate": 3.738018422406135e-05, "loss": 0.012, "step": 26990 }, { "epoch": 0.7574694908121756, "grad_norm": 12.631360054016113, "learning_rate": 3.737550848646374e-05, "loss": 0.019, "step": 27000 }, { "epoch": 0.757750035068032, "grad_norm": 0.21184034645557404, "learning_rate": 3.737083274886614e-05, "loss": 0.017, "step": 27010 }, { "epoch": 0.7580305793238884, "grad_norm": 7.576516628265381, "learning_rate": 3.7366157011268524e-05, "loss": 0.0321, "step": 27020 }, { "epoch": 0.7583111235797447, "grad_norm": 0.4154617190361023, "learning_rate": 3.7361481273670924e-05, "loss": 0.0116, "step": 27030 }, { "epoch": 0.7585916678356011, "grad_norm": 0.13087739050388336, "learning_rate": 3.735680553607332e-05, "loss": 0.0103, "step": 27040 }, { "epoch": 0.7588722120914574, "grad_norm": 0.13865117728710175, "learning_rate": 3.735212979847571e-05, "loss": 0.0214, "step": 27050 }, { "epoch": 0.7591527563473138, "grad_norm": 0.006571085192263126, "learning_rate": 3.734745406087811e-05, "loss": 0.0368, "step": 27060 }, { "epoch": 0.7594333006031702, "grad_norm": 1.5164326429367065, "learning_rate": 3.7342778323280496e-05, "loss": 0.0246, "step": 27070 }, { "epoch": 0.7597138448590265, "grad_norm": 0.24317172169685364, "learning_rate": 3.7338102585682896e-05, "loss": 0.0182, "step": 27080 }, { "epoch": 0.7599943891148828, "grad_norm": 0.3111538887023926, "learning_rate": 3.733342684808529e-05, "loss": 0.0248, "step": 27090 }, { "epoch": 0.7602749333707393, "grad_norm": 0.12436956912279129, "learning_rate": 3.732875111048768e-05, "loss": 0.027, "step": 27100 }, { "epoch": 0.7605554776265956, "grad_norm": 0.22552509605884552, "learning_rate": 3.7324075372890076e-05, "loss": 0.0392, "step": 27110 }, { "epoch": 0.7608360218824519, "grad_norm": 0.9430824518203735, "learning_rate": 3.731939963529247e-05, "loss": 0.0328, "step": 27120 }, { "epoch": 0.7611165661383084, "grad_norm": 0.028158167377114296, "learning_rate": 3.731472389769486e-05, "loss": 0.0478, "step": 27130 }, { "epoch": 0.7613971103941647, "grad_norm": 0.051244594156742096, "learning_rate": 3.7310048160097255e-05, "loss": 0.0274, "step": 27140 }, { "epoch": 0.761677654650021, "grad_norm": 0.040446687489748, "learning_rate": 3.730537242249965e-05, "loss": 0.0221, "step": 27150 }, { "epoch": 0.7619581989058773, "grad_norm": 2.6601455211639404, "learning_rate": 3.730069668490205e-05, "loss": 0.0441, "step": 27160 }, { "epoch": 0.7622387431617338, "grad_norm": 1.0299683809280396, "learning_rate": 3.729602094730444e-05, "loss": 0.0394, "step": 27170 }, { "epoch": 0.7625192874175901, "grad_norm": 0.08729609102010727, "learning_rate": 3.7291345209706834e-05, "loss": 0.038, "step": 27180 }, { "epoch": 0.7627998316734464, "grad_norm": 0.2402779757976532, "learning_rate": 3.728666947210923e-05, "loss": 0.041, "step": 27190 }, { "epoch": 0.7630803759293029, "grad_norm": 0.4915527403354645, "learning_rate": 3.728199373451162e-05, "loss": 0.0204, "step": 27200 }, { "epoch": 0.7633609201851592, "grad_norm": 0.060452695935964584, "learning_rate": 3.7277317996914014e-05, "loss": 0.008, "step": 27210 }, { "epoch": 0.7636414644410155, "grad_norm": 0.030694536864757538, "learning_rate": 3.727264225931641e-05, "loss": 0.0255, "step": 27220 }, { "epoch": 0.7639220086968719, "grad_norm": 0.03951748460531235, "learning_rate": 3.726796652171881e-05, "loss": 0.0073, "step": 27230 }, { "epoch": 0.7642025529527283, "grad_norm": 27.54781723022461, "learning_rate": 3.7263290784121193e-05, "loss": 0.0266, "step": 27240 }, { "epoch": 0.7644830972085846, "grad_norm": 0.8542125821113586, "learning_rate": 3.725861504652359e-05, "loss": 0.0535, "step": 27250 }, { "epoch": 0.764763641464441, "grad_norm": 0.587675929069519, "learning_rate": 3.7253939308925986e-05, "loss": 0.0183, "step": 27260 }, { "epoch": 0.7650441857202974, "grad_norm": 0.9682095646858215, "learning_rate": 3.724926357132838e-05, "loss": 0.0503, "step": 27270 }, { "epoch": 0.7653247299761538, "grad_norm": 1.34153413772583, "learning_rate": 3.724458783373077e-05, "loss": 0.0504, "step": 27280 }, { "epoch": 0.7656052742320101, "grad_norm": 0.2908751964569092, "learning_rate": 3.7239912096133166e-05, "loss": 0.0334, "step": 27290 }, { "epoch": 0.7658858184878664, "grad_norm": 0.262579083442688, "learning_rate": 3.7235236358535566e-05, "loss": 0.0589, "step": 27300 }, { "epoch": 0.7661663627437229, "grad_norm": 27.122291564941406, "learning_rate": 3.723056062093795e-05, "loss": 0.0446, "step": 27310 }, { "epoch": 0.7664469069995792, "grad_norm": 0.3148897886276245, "learning_rate": 3.722588488334035e-05, "loss": 0.0342, "step": 27320 }, { "epoch": 0.7667274512554355, "grad_norm": 0.8115214705467224, "learning_rate": 3.722120914574274e-05, "loss": 0.0286, "step": 27330 }, { "epoch": 0.767007995511292, "grad_norm": 0.04909211024641991, "learning_rate": 3.721653340814514e-05, "loss": 0.0467, "step": 27340 }, { "epoch": 0.7672885397671483, "grad_norm": 9.254246711730957, "learning_rate": 3.721185767054753e-05, "loss": 0.0473, "step": 27350 }, { "epoch": 0.7675690840230046, "grad_norm": 0.9693918228149414, "learning_rate": 3.7207181932949925e-05, "loss": 0.049, "step": 27360 }, { "epoch": 0.7678496282788609, "grad_norm": 6.085422992706299, "learning_rate": 3.720250619535232e-05, "loss": 0.0526, "step": 27370 }, { "epoch": 0.7681301725347174, "grad_norm": 0.7007277011871338, "learning_rate": 3.719783045775471e-05, "loss": 0.0213, "step": 27380 }, { "epoch": 0.7684107167905737, "grad_norm": 0.3250722289085388, "learning_rate": 3.719315472015711e-05, "loss": 0.0222, "step": 27390 }, { "epoch": 0.76869126104643, "grad_norm": 1.555074691772461, "learning_rate": 3.71884789825595e-05, "loss": 0.035, "step": 27400 }, { "epoch": 0.7689718053022865, "grad_norm": 3.450939416885376, "learning_rate": 3.71838032449619e-05, "loss": 0.0346, "step": 27410 }, { "epoch": 0.7692523495581428, "grad_norm": 1.7074720859527588, "learning_rate": 3.7179127507364284e-05, "loss": 0.0404, "step": 27420 }, { "epoch": 0.7695328938139991, "grad_norm": 0.35811153054237366, "learning_rate": 3.7174451769766684e-05, "loss": 0.0497, "step": 27430 }, { "epoch": 0.7698134380698555, "grad_norm": 5.113308906555176, "learning_rate": 3.716977603216908e-05, "loss": 0.0689, "step": 27440 }, { "epoch": 0.7700939823257119, "grad_norm": 0.30309638381004333, "learning_rate": 3.716510029457147e-05, "loss": 0.0304, "step": 27450 }, { "epoch": 0.7703745265815682, "grad_norm": 0.03385505452752113, "learning_rate": 3.716042455697386e-05, "loss": 0.0198, "step": 27460 }, { "epoch": 0.7706550708374246, "grad_norm": 0.41263288259506226, "learning_rate": 3.7155748819376256e-05, "loss": 0.0335, "step": 27470 }, { "epoch": 0.770935615093281, "grad_norm": 0.058354564011096954, "learning_rate": 3.7151073081778656e-05, "loss": 0.0129, "step": 27480 }, { "epoch": 0.7712161593491373, "grad_norm": 1.6408504247665405, "learning_rate": 3.714639734418104e-05, "loss": 0.0119, "step": 27490 }, { "epoch": 0.7714967036049937, "grad_norm": 1.3357480764389038, "learning_rate": 3.714172160658344e-05, "loss": 0.0477, "step": 27500 }, { "epoch": 0.77177724786085, "grad_norm": 4.399073600769043, "learning_rate": 3.7137045868985836e-05, "loss": 0.0235, "step": 27510 }, { "epoch": 0.7720577921167064, "grad_norm": 0.01615464687347412, "learning_rate": 3.713237013138823e-05, "loss": 0.0101, "step": 27520 }, { "epoch": 0.7723383363725628, "grad_norm": 0.026705050840973854, "learning_rate": 3.712769439379062e-05, "loss": 0.0597, "step": 27530 }, { "epoch": 0.7726188806284191, "grad_norm": 3.269685745239258, "learning_rate": 3.7123018656193015e-05, "loss": 0.0348, "step": 27540 }, { "epoch": 0.7728994248842755, "grad_norm": 0.0942189171910286, "learning_rate": 3.711834291859541e-05, "loss": 0.0403, "step": 27550 }, { "epoch": 0.7731799691401319, "grad_norm": 0.30513665080070496, "learning_rate": 3.71136671809978e-05, "loss": 0.0553, "step": 27560 }, { "epoch": 0.7734605133959882, "grad_norm": 0.31324502825737, "learning_rate": 3.71089914434002e-05, "loss": 0.0251, "step": 27570 }, { "epoch": 0.7737410576518445, "grad_norm": 0.2786839008331299, "learning_rate": 3.7104315705802595e-05, "loss": 0.0401, "step": 27580 }, { "epoch": 0.774021601907701, "grad_norm": 0.04236029461026192, "learning_rate": 3.709963996820499e-05, "loss": 0.0341, "step": 27590 }, { "epoch": 0.7743021461635573, "grad_norm": 0.8029642105102539, "learning_rate": 3.709496423060738e-05, "loss": 0.056, "step": 27600 }, { "epoch": 0.7745826904194136, "grad_norm": 0.07108563184738159, "learning_rate": 3.7090288493009774e-05, "loss": 0.0286, "step": 27610 }, { "epoch": 0.7748632346752701, "grad_norm": 0.5555779933929443, "learning_rate": 3.708561275541217e-05, "loss": 0.0319, "step": 27620 }, { "epoch": 0.7751437789311264, "grad_norm": 0.6306257843971252, "learning_rate": 3.708093701781456e-05, "loss": 0.0532, "step": 27630 }, { "epoch": 0.7754243231869827, "grad_norm": 0.10778923332691193, "learning_rate": 3.7076261280216953e-05, "loss": 0.0185, "step": 27640 }, { "epoch": 0.7757048674428391, "grad_norm": 0.25072410702705383, "learning_rate": 3.7071585542619353e-05, "loss": 0.0258, "step": 27650 }, { "epoch": 0.7759854116986955, "grad_norm": 1.1592978239059448, "learning_rate": 3.7066909805021747e-05, "loss": 0.05, "step": 27660 }, { "epoch": 0.7762659559545518, "grad_norm": 0.15313997864723206, "learning_rate": 3.706223406742414e-05, "loss": 0.0316, "step": 27670 }, { "epoch": 0.7765465002104082, "grad_norm": 0.16858191788196564, "learning_rate": 3.705755832982653e-05, "loss": 0.0236, "step": 27680 }, { "epoch": 0.7768270444662646, "grad_norm": 0.6954429149627686, "learning_rate": 3.7052882592228926e-05, "loss": 0.0536, "step": 27690 }, { "epoch": 0.7771075887221209, "grad_norm": 0.09030556678771973, "learning_rate": 3.704820685463132e-05, "loss": 0.031, "step": 27700 }, { "epoch": 0.7773881329779773, "grad_norm": 2.854586362838745, "learning_rate": 3.704353111703371e-05, "loss": 0.0165, "step": 27710 }, { "epoch": 0.7776686772338336, "grad_norm": 0.03383293002843857, "learning_rate": 3.703885537943611e-05, "loss": 0.0154, "step": 27720 }, { "epoch": 0.77794922148969, "grad_norm": 0.04871775209903717, "learning_rate": 3.70341796418385e-05, "loss": 0.0503, "step": 27730 }, { "epoch": 0.7782297657455464, "grad_norm": 0.420896977186203, "learning_rate": 3.70295039042409e-05, "loss": 0.0153, "step": 27740 }, { "epoch": 0.7785103100014027, "grad_norm": 0.05850230157375336, "learning_rate": 3.7024828166643285e-05, "loss": 0.0284, "step": 27750 }, { "epoch": 0.7787908542572591, "grad_norm": 0.6952290534973145, "learning_rate": 3.7020152429045685e-05, "loss": 0.0142, "step": 27760 }, { "epoch": 0.7790713985131155, "grad_norm": 0.38102516531944275, "learning_rate": 3.701547669144808e-05, "loss": 0.009, "step": 27770 }, { "epoch": 0.7793519427689718, "grad_norm": 1.0220564603805542, "learning_rate": 3.701080095385047e-05, "loss": 0.0393, "step": 27780 }, { "epoch": 0.7796324870248281, "grad_norm": 1.4025896787643433, "learning_rate": 3.700612521625287e-05, "loss": 0.015, "step": 27790 }, { "epoch": 0.7799130312806846, "grad_norm": 0.32537105679512024, "learning_rate": 3.700144947865526e-05, "loss": 0.0574, "step": 27800 }, { "epoch": 0.7801935755365409, "grad_norm": 1.0380027294158936, "learning_rate": 3.699677374105766e-05, "loss": 0.0323, "step": 27810 }, { "epoch": 0.7804741197923972, "grad_norm": 0.09852369129657745, "learning_rate": 3.6992098003460044e-05, "loss": 0.0613, "step": 27820 }, { "epoch": 0.7807546640482537, "grad_norm": 6.0661211013793945, "learning_rate": 3.6987422265862444e-05, "loss": 0.0157, "step": 27830 }, { "epoch": 0.78103520830411, "grad_norm": 0.05721420794725418, "learning_rate": 3.698274652826483e-05, "loss": 0.0358, "step": 27840 }, { "epoch": 0.7813157525599663, "grad_norm": 0.06576777249574661, "learning_rate": 3.697807079066723e-05, "loss": 0.0689, "step": 27850 }, { "epoch": 0.7815962968158227, "grad_norm": 0.3765275180339813, "learning_rate": 3.697339505306962e-05, "loss": 0.0458, "step": 27860 }, { "epoch": 0.7818768410716791, "grad_norm": 0.09906067699193954, "learning_rate": 3.6968719315472016e-05, "loss": 0.0517, "step": 27870 }, { "epoch": 0.7821573853275354, "grad_norm": 1.9291855096817017, "learning_rate": 3.6964043577874416e-05, "loss": 0.0293, "step": 27880 }, { "epoch": 0.7824379295833918, "grad_norm": 0.13937026262283325, "learning_rate": 3.69593678402768e-05, "loss": 0.0247, "step": 27890 }, { "epoch": 0.7827184738392482, "grad_norm": 0.3195354640483856, "learning_rate": 3.69546921026792e-05, "loss": 0.0431, "step": 27900 }, { "epoch": 0.7829990180951045, "grad_norm": 0.10552779585123062, "learning_rate": 3.695001636508159e-05, "loss": 0.0332, "step": 27910 }, { "epoch": 0.7832795623509609, "grad_norm": 0.8935118317604065, "learning_rate": 3.694534062748399e-05, "loss": 0.0467, "step": 27920 }, { "epoch": 0.7835601066068172, "grad_norm": 0.4555893540382385, "learning_rate": 3.694066488988638e-05, "loss": 0.0233, "step": 27930 }, { "epoch": 0.7838406508626736, "grad_norm": 0.9927309155464172, "learning_rate": 3.6935989152288775e-05, "loss": 0.0193, "step": 27940 }, { "epoch": 0.78412119511853, "grad_norm": 1.2324206829071045, "learning_rate": 3.693131341469117e-05, "loss": 0.0312, "step": 27950 }, { "epoch": 0.7844017393743863, "grad_norm": 4.306303977966309, "learning_rate": 3.692663767709356e-05, "loss": 0.0198, "step": 27960 }, { "epoch": 0.7846822836302426, "grad_norm": 0.09641049057245255, "learning_rate": 3.692196193949596e-05, "loss": 0.0134, "step": 27970 }, { "epoch": 0.7849628278860991, "grad_norm": 0.4662996828556061, "learning_rate": 3.691728620189835e-05, "loss": 0.0192, "step": 27980 }, { "epoch": 0.7852433721419554, "grad_norm": 0.5726356506347656, "learning_rate": 3.691261046430075e-05, "loss": 0.0275, "step": 27990 }, { "epoch": 0.7855239163978117, "grad_norm": 0.15567335486412048, "learning_rate": 3.690793472670314e-05, "loss": 0.027, "step": 28000 }, { "epoch": 0.7858044606536682, "grad_norm": 0.28961095213890076, "learning_rate": 3.6903258989105534e-05, "loss": 0.0245, "step": 28010 }, { "epoch": 0.7860850049095245, "grad_norm": 0.5079836249351501, "learning_rate": 3.689858325150793e-05, "loss": 0.0172, "step": 28020 }, { "epoch": 0.7863655491653808, "grad_norm": 1.773222804069519, "learning_rate": 3.689390751391032e-05, "loss": 0.0376, "step": 28030 }, { "epoch": 0.7866460934212371, "grad_norm": 2.127025604248047, "learning_rate": 3.6889231776312714e-05, "loss": 0.0533, "step": 28040 }, { "epoch": 0.7869266376770936, "grad_norm": 0.028202243149280548, "learning_rate": 3.688455603871511e-05, "loss": 0.0137, "step": 28050 }, { "epoch": 0.7872071819329499, "grad_norm": 0.13025309145450592, "learning_rate": 3.68798803011175e-05, "loss": 0.0439, "step": 28060 }, { "epoch": 0.7874877261888062, "grad_norm": 0.20033010840415955, "learning_rate": 3.68752045635199e-05, "loss": 0.0308, "step": 28070 }, { "epoch": 0.7877682704446627, "grad_norm": 0.060081690549850464, "learning_rate": 3.687052882592229e-05, "loss": 0.0191, "step": 28080 }, { "epoch": 0.788048814700519, "grad_norm": 0.1274302750825882, "learning_rate": 3.6865853088324686e-05, "loss": 0.0137, "step": 28090 }, { "epoch": 0.7883293589563753, "grad_norm": 0.06714648753404617, "learning_rate": 3.686117735072708e-05, "loss": 0.0257, "step": 28100 }, { "epoch": 0.7886099032122317, "grad_norm": 0.057009126991033554, "learning_rate": 3.685650161312947e-05, "loss": 0.0128, "step": 28110 }, { "epoch": 0.7888904474680881, "grad_norm": 31.5870361328125, "learning_rate": 3.6851825875531866e-05, "loss": 0.0183, "step": 28120 }, { "epoch": 0.7891709917239444, "grad_norm": 0.19570565223693848, "learning_rate": 3.684715013793426e-05, "loss": 0.0231, "step": 28130 }, { "epoch": 0.7894515359798008, "grad_norm": 1.255409598350525, "learning_rate": 3.684247440033666e-05, "loss": 0.0442, "step": 28140 }, { "epoch": 0.7897320802356572, "grad_norm": 0.14806464314460754, "learning_rate": 3.6837798662739045e-05, "loss": 0.0223, "step": 28150 }, { "epoch": 0.7900126244915135, "grad_norm": 0.07048120349645615, "learning_rate": 3.6833122925141445e-05, "loss": 0.0306, "step": 28160 }, { "epoch": 0.7902931687473699, "grad_norm": 0.14183765649795532, "learning_rate": 3.682844718754384e-05, "loss": 0.0272, "step": 28170 }, { "epoch": 0.7905737130032262, "grad_norm": 0.7431482076644897, "learning_rate": 3.682377144994623e-05, "loss": 0.0257, "step": 28180 }, { "epoch": 0.7908542572590826, "grad_norm": 0.06243447959423065, "learning_rate": 3.6819095712348624e-05, "loss": 0.0254, "step": 28190 }, { "epoch": 0.791134801514939, "grad_norm": 0.32008516788482666, "learning_rate": 3.681441997475102e-05, "loss": 0.0147, "step": 28200 }, { "epoch": 0.7914153457707953, "grad_norm": 0.19308830797672272, "learning_rate": 3.680974423715342e-05, "loss": 0.03, "step": 28210 }, { "epoch": 0.7916958900266517, "grad_norm": 0.24652168154716492, "learning_rate": 3.6805068499555804e-05, "loss": 0.0554, "step": 28220 }, { "epoch": 0.7919764342825081, "grad_norm": 0.21780957281589508, "learning_rate": 3.6800392761958204e-05, "loss": 0.0512, "step": 28230 }, { "epoch": 0.7922569785383644, "grad_norm": 1.026232361793518, "learning_rate": 3.679571702436059e-05, "loss": 0.0307, "step": 28240 }, { "epoch": 0.7925375227942207, "grad_norm": 0.8803231716156006, "learning_rate": 3.679104128676299e-05, "loss": 0.018, "step": 28250 }, { "epoch": 0.7928180670500772, "grad_norm": 0.07946039736270905, "learning_rate": 3.678636554916538e-05, "loss": 0.0174, "step": 28260 }, { "epoch": 0.7930986113059335, "grad_norm": 0.052700430154800415, "learning_rate": 3.6781689811567776e-05, "loss": 0.0317, "step": 28270 }, { "epoch": 0.7933791555617898, "grad_norm": 21.211715698242188, "learning_rate": 3.677701407397017e-05, "loss": 0.0416, "step": 28280 }, { "epoch": 0.7936596998176463, "grad_norm": 0.020944565534591675, "learning_rate": 3.677233833637256e-05, "loss": 0.0055, "step": 28290 }, { "epoch": 0.7939402440735026, "grad_norm": 0.3166229724884033, "learning_rate": 3.676766259877496e-05, "loss": 0.0404, "step": 28300 }, { "epoch": 0.7942207883293589, "grad_norm": 0.5305492877960205, "learning_rate": 3.676298686117735e-05, "loss": 0.0616, "step": 28310 }, { "epoch": 0.7945013325852153, "grad_norm": 1.4051480293273926, "learning_rate": 3.675831112357975e-05, "loss": 0.0209, "step": 28320 }, { "epoch": 0.7947818768410717, "grad_norm": 0.43227192759513855, "learning_rate": 3.6753635385982135e-05, "loss": 0.0318, "step": 28330 }, { "epoch": 0.795062421096928, "grad_norm": 1.4130369424819946, "learning_rate": 3.6748959648384535e-05, "loss": 0.0778, "step": 28340 }, { "epoch": 0.7953429653527844, "grad_norm": 0.9389885663986206, "learning_rate": 3.674428391078693e-05, "loss": 0.0454, "step": 28350 }, { "epoch": 0.7956235096086408, "grad_norm": 0.740301787853241, "learning_rate": 3.673960817318932e-05, "loss": 0.021, "step": 28360 }, { "epoch": 0.7959040538644971, "grad_norm": 0.14277231693267822, "learning_rate": 3.6734932435591715e-05, "loss": 0.0163, "step": 28370 }, { "epoch": 0.7961845981203535, "grad_norm": 0.5514681339263916, "learning_rate": 3.673025669799411e-05, "loss": 0.0209, "step": 28380 }, { "epoch": 0.7964651423762098, "grad_norm": 0.731052577495575, "learning_rate": 3.672558096039651e-05, "loss": 0.0199, "step": 28390 }, { "epoch": 0.7967456866320662, "grad_norm": 2.921793222427368, "learning_rate": 3.6720905222798894e-05, "loss": 0.0657, "step": 28400 }, { "epoch": 0.7970262308879226, "grad_norm": 4.411715030670166, "learning_rate": 3.6716229485201294e-05, "loss": 0.0384, "step": 28410 }, { "epoch": 0.7973067751437789, "grad_norm": 0.3319967985153198, "learning_rate": 3.671155374760369e-05, "loss": 0.036, "step": 28420 }, { "epoch": 0.7975873193996353, "grad_norm": 0.5410300493240356, "learning_rate": 3.670687801000608e-05, "loss": 0.0344, "step": 28430 }, { "epoch": 0.7978678636554917, "grad_norm": 0.1975550800561905, "learning_rate": 3.6702202272408474e-05, "loss": 0.0404, "step": 28440 }, { "epoch": 0.798148407911348, "grad_norm": 0.53998863697052, "learning_rate": 3.669752653481087e-05, "loss": 0.0175, "step": 28450 }, { "epoch": 0.7984289521672043, "grad_norm": 0.01789248362183571, "learning_rate": 3.669285079721326e-05, "loss": 0.0207, "step": 28460 }, { "epoch": 0.7987094964230608, "grad_norm": 0.3224974572658539, "learning_rate": 3.668817505961565e-05, "loss": 0.0285, "step": 28470 }, { "epoch": 0.7989900406789171, "grad_norm": 0.1464194804430008, "learning_rate": 3.668349932201805e-05, "loss": 0.0525, "step": 28480 }, { "epoch": 0.7992705849347734, "grad_norm": 1.2109301090240479, "learning_rate": 3.6678823584420446e-05, "loss": 0.0249, "step": 28490 }, { "epoch": 0.7995511291906299, "grad_norm": 0.057123150676488876, "learning_rate": 3.667414784682284e-05, "loss": 0.0238, "step": 28500 }, { "epoch": 0.7998316734464862, "grad_norm": 0.771106481552124, "learning_rate": 3.666947210922523e-05, "loss": 0.0199, "step": 28510 }, { "epoch": 0.8001122177023425, "grad_norm": 0.273798406124115, "learning_rate": 3.6664796371627626e-05, "loss": 0.0316, "step": 28520 }, { "epoch": 0.8003927619581989, "grad_norm": 0.18806418776512146, "learning_rate": 3.666012063403002e-05, "loss": 0.0071, "step": 28530 }, { "epoch": 0.8006733062140553, "grad_norm": 0.11173073202371597, "learning_rate": 3.665544489643241e-05, "loss": 0.0194, "step": 28540 }, { "epoch": 0.8009538504699116, "grad_norm": 0.49403640627861023, "learning_rate": 3.6650769158834805e-05, "loss": 0.0411, "step": 28550 }, { "epoch": 0.801234394725768, "grad_norm": 0.11181619018316269, "learning_rate": 3.6646093421237205e-05, "loss": 0.0383, "step": 28560 }, { "epoch": 0.8015149389816244, "grad_norm": 0.7297146916389465, "learning_rate": 3.66414176836396e-05, "loss": 0.0764, "step": 28570 }, { "epoch": 0.8017954832374807, "grad_norm": 0.21124359965324402, "learning_rate": 3.663674194604199e-05, "loss": 0.0292, "step": 28580 }, { "epoch": 0.8020760274933371, "grad_norm": 0.18307043612003326, "learning_rate": 3.6632066208444385e-05, "loss": 0.0141, "step": 28590 }, { "epoch": 0.8023565717491934, "grad_norm": 0.05431456118822098, "learning_rate": 3.662739047084678e-05, "loss": 0.0284, "step": 28600 }, { "epoch": 0.8026371160050498, "grad_norm": 0.05578525736927986, "learning_rate": 3.662271473324917e-05, "loss": 0.021, "step": 28610 }, { "epoch": 0.8029176602609062, "grad_norm": 0.05644829198718071, "learning_rate": 3.6618038995651564e-05, "loss": 0.0249, "step": 28620 }, { "epoch": 0.8031982045167625, "grad_norm": 0.03265247866511345, "learning_rate": 3.6613363258053964e-05, "loss": 0.0685, "step": 28630 }, { "epoch": 0.8034787487726189, "grad_norm": 0.08611705899238586, "learning_rate": 3.660868752045635e-05, "loss": 0.0547, "step": 28640 }, { "epoch": 0.8037592930284753, "grad_norm": 0.5940718650817871, "learning_rate": 3.660401178285875e-05, "loss": 0.0422, "step": 28650 }, { "epoch": 0.8040398372843316, "grad_norm": 0.10812171548604965, "learning_rate": 3.659933604526114e-05, "loss": 0.0401, "step": 28660 }, { "epoch": 0.8043203815401879, "grad_norm": 0.07914704829454422, "learning_rate": 3.6594660307663537e-05, "loss": 0.032, "step": 28670 }, { "epoch": 0.8046009257960444, "grad_norm": 8.703252792358398, "learning_rate": 3.658998457006593e-05, "loss": 0.0424, "step": 28680 }, { "epoch": 0.8048814700519007, "grad_norm": 0.43049120903015137, "learning_rate": 3.658530883246832e-05, "loss": 0.0353, "step": 28690 }, { "epoch": 0.805162014307757, "grad_norm": 0.08419250696897507, "learning_rate": 3.658063309487072e-05, "loss": 0.0257, "step": 28700 }, { "epoch": 0.8054425585636135, "grad_norm": 0.15591859817504883, "learning_rate": 3.657595735727311e-05, "loss": 0.0574, "step": 28710 }, { "epoch": 0.8057231028194698, "grad_norm": 0.08386965841054916, "learning_rate": 3.657128161967551e-05, "loss": 0.0247, "step": 28720 }, { "epoch": 0.8060036470753261, "grad_norm": 1.1751558780670166, "learning_rate": 3.6566605882077895e-05, "loss": 0.0127, "step": 28730 }, { "epoch": 0.8062841913311825, "grad_norm": 0.12280470132827759, "learning_rate": 3.6561930144480295e-05, "loss": 0.0172, "step": 28740 }, { "epoch": 0.8065647355870389, "grad_norm": 0.03873271867632866, "learning_rate": 3.655725440688268e-05, "loss": 0.0616, "step": 28750 }, { "epoch": 0.8068452798428952, "grad_norm": 0.06646531820297241, "learning_rate": 3.655257866928508e-05, "loss": 0.0153, "step": 28760 }, { "epoch": 0.8071258240987516, "grad_norm": 0.5368216633796692, "learning_rate": 3.6547902931687475e-05, "loss": 0.0445, "step": 28770 }, { "epoch": 0.8074063683546079, "grad_norm": 0.29101425409317017, "learning_rate": 3.654322719408987e-05, "loss": 0.0623, "step": 28780 }, { "epoch": 0.8076869126104643, "grad_norm": 0.7413378357887268, "learning_rate": 3.653855145649227e-05, "loss": 0.0641, "step": 28790 }, { "epoch": 0.8079674568663207, "grad_norm": 0.4713384807109833, "learning_rate": 3.6533875718894654e-05, "loss": 0.0271, "step": 28800 }, { "epoch": 0.808248001122177, "grad_norm": 0.07979833334684372, "learning_rate": 3.6529199981297054e-05, "loss": 0.0222, "step": 28810 }, { "epoch": 0.8085285453780334, "grad_norm": 1.5804022550582886, "learning_rate": 3.652452424369944e-05, "loss": 0.0481, "step": 28820 }, { "epoch": 0.8088090896338898, "grad_norm": 0.08860866725444794, "learning_rate": 3.651984850610184e-05, "loss": 0.0673, "step": 28830 }, { "epoch": 0.8090896338897461, "grad_norm": 0.4283328950405121, "learning_rate": 3.6515172768504234e-05, "loss": 0.0178, "step": 28840 }, { "epoch": 0.8093701781456024, "grad_norm": 0.28514963388442993, "learning_rate": 3.651049703090663e-05, "loss": 0.0312, "step": 28850 }, { "epoch": 0.8096507224014589, "grad_norm": 0.020042331889271736, "learning_rate": 3.650582129330902e-05, "loss": 0.0038, "step": 28860 }, { "epoch": 0.8099312666573152, "grad_norm": 0.3714158833026886, "learning_rate": 3.650114555571141e-05, "loss": 0.0077, "step": 28870 }, { "epoch": 0.8102118109131715, "grad_norm": 1.5000306367874146, "learning_rate": 3.649646981811381e-05, "loss": 0.0278, "step": 28880 }, { "epoch": 0.810492355169028, "grad_norm": 0.26519742608070374, "learning_rate": 3.64917940805162e-05, "loss": 0.0161, "step": 28890 }, { "epoch": 0.8107728994248843, "grad_norm": 1.3235005140304565, "learning_rate": 3.64871183429186e-05, "loss": 0.0369, "step": 28900 }, { "epoch": 0.8110534436807406, "grad_norm": 1.2705023288726807, "learning_rate": 3.648244260532099e-05, "loss": 0.0085, "step": 28910 }, { "epoch": 0.811333987936597, "grad_norm": 0.13017940521240234, "learning_rate": 3.6477766867723386e-05, "loss": 0.0119, "step": 28920 }, { "epoch": 0.8116145321924534, "grad_norm": 5.178325176239014, "learning_rate": 3.647309113012578e-05, "loss": 0.0549, "step": 28930 }, { "epoch": 0.8118950764483097, "grad_norm": 0.07990330457687378, "learning_rate": 3.646841539252817e-05, "loss": 0.0317, "step": 28940 }, { "epoch": 0.812175620704166, "grad_norm": 0.3164456784725189, "learning_rate": 3.6463739654930565e-05, "loss": 0.0092, "step": 28950 }, { "epoch": 0.8124561649600225, "grad_norm": 0.0611930713057518, "learning_rate": 3.645906391733296e-05, "loss": 0.0635, "step": 28960 }, { "epoch": 0.8127367092158788, "grad_norm": 0.05763211101293564, "learning_rate": 3.645438817973535e-05, "loss": 0.0119, "step": 28970 }, { "epoch": 0.8130172534717351, "grad_norm": 0.05284639820456505, "learning_rate": 3.644971244213775e-05, "loss": 0.0164, "step": 28980 }, { "epoch": 0.8132977977275915, "grad_norm": 0.29423782229423523, "learning_rate": 3.6445036704540145e-05, "loss": 0.0164, "step": 28990 }, { "epoch": 0.8135783419834479, "grad_norm": 0.9342760443687439, "learning_rate": 3.644036096694254e-05, "loss": 0.0349, "step": 29000 }, { "epoch": 0.8138588862393042, "grad_norm": 0.46735045313835144, "learning_rate": 3.643568522934493e-05, "loss": 0.0388, "step": 29010 }, { "epoch": 0.8141394304951606, "grad_norm": 0.2206730842590332, "learning_rate": 3.6431009491747324e-05, "loss": 0.0596, "step": 29020 }, { "epoch": 0.814419974751017, "grad_norm": 1.2002942562103271, "learning_rate": 3.642633375414972e-05, "loss": 0.045, "step": 29030 }, { "epoch": 0.8147005190068733, "grad_norm": 0.36992180347442627, "learning_rate": 3.642165801655211e-05, "loss": 0.0673, "step": 29040 }, { "epoch": 0.8149810632627297, "grad_norm": 0.08125296235084534, "learning_rate": 3.641698227895451e-05, "loss": 0.0224, "step": 29050 }, { "epoch": 0.815261607518586, "grad_norm": 0.07150746136903763, "learning_rate": 3.64123065413569e-05, "loss": 0.0163, "step": 29060 }, { "epoch": 0.8155421517744424, "grad_norm": 2.370743751525879, "learning_rate": 3.64076308037593e-05, "loss": 0.0772, "step": 29070 }, { "epoch": 0.8158226960302988, "grad_norm": 0.5778293609619141, "learning_rate": 3.640295506616169e-05, "loss": 0.0125, "step": 29080 }, { "epoch": 0.8161032402861551, "grad_norm": 0.24926725029945374, "learning_rate": 3.639827932856408e-05, "loss": 0.036, "step": 29090 }, { "epoch": 0.8163837845420115, "grad_norm": 0.053784407675266266, "learning_rate": 3.6393603590966476e-05, "loss": 0.012, "step": 29100 }, { "epoch": 0.8166643287978679, "grad_norm": 0.2581172287464142, "learning_rate": 3.638892785336887e-05, "loss": 0.0201, "step": 29110 }, { "epoch": 0.8169448730537242, "grad_norm": 2.1292343139648438, "learning_rate": 3.638425211577127e-05, "loss": 0.0407, "step": 29120 }, { "epoch": 0.8172254173095805, "grad_norm": 0.554215133190155, "learning_rate": 3.6379576378173656e-05, "loss": 0.0328, "step": 29130 }, { "epoch": 0.817505961565437, "grad_norm": 0.04432791844010353, "learning_rate": 3.6374900640576056e-05, "loss": 0.0094, "step": 29140 }, { "epoch": 0.8177865058212933, "grad_norm": 0.5823238492012024, "learning_rate": 3.637022490297844e-05, "loss": 0.0342, "step": 29150 }, { "epoch": 0.8180670500771496, "grad_norm": 0.05457804352045059, "learning_rate": 3.636554916538084e-05, "loss": 0.0264, "step": 29160 }, { "epoch": 0.8183475943330061, "grad_norm": 0.23831124603748322, "learning_rate": 3.6360873427783235e-05, "loss": 0.0289, "step": 29170 }, { "epoch": 0.8186281385888624, "grad_norm": 0.6298651695251465, "learning_rate": 3.635619769018563e-05, "loss": 0.0144, "step": 29180 }, { "epoch": 0.8189086828447187, "grad_norm": 0.16165514290332794, "learning_rate": 3.635152195258802e-05, "loss": 0.0486, "step": 29190 }, { "epoch": 0.8191892271005751, "grad_norm": 0.022244835272431374, "learning_rate": 3.6346846214990414e-05, "loss": 0.0123, "step": 29200 }, { "epoch": 0.8194697713564315, "grad_norm": 0.632930338382721, "learning_rate": 3.6342170477392814e-05, "loss": 0.0513, "step": 29210 }, { "epoch": 0.8197503156122878, "grad_norm": 0.062433548271656036, "learning_rate": 3.63374947397952e-05, "loss": 0.0253, "step": 29220 }, { "epoch": 0.8200308598681442, "grad_norm": 0.05796186625957489, "learning_rate": 3.63328190021976e-05, "loss": 0.0211, "step": 29230 }, { "epoch": 0.8203114041240006, "grad_norm": 0.686980128288269, "learning_rate": 3.632814326459999e-05, "loss": 0.0198, "step": 29240 }, { "epoch": 0.8205919483798569, "grad_norm": 0.036353182047605515, "learning_rate": 3.632346752700239e-05, "loss": 0.0124, "step": 29250 }, { "epoch": 0.8208724926357133, "grad_norm": 0.04292697086930275, "learning_rate": 3.631879178940478e-05, "loss": 0.0196, "step": 29260 }, { "epoch": 0.8211530368915696, "grad_norm": 1.3978253602981567, "learning_rate": 3.631411605180717e-05, "loss": 0.0215, "step": 29270 }, { "epoch": 0.821433581147426, "grad_norm": 1.1851123571395874, "learning_rate": 3.6309440314209566e-05, "loss": 0.0429, "step": 29280 }, { "epoch": 0.8217141254032824, "grad_norm": 0.4566071629524231, "learning_rate": 3.630476457661196e-05, "loss": 0.0395, "step": 29290 }, { "epoch": 0.8219946696591387, "grad_norm": 0.1054481565952301, "learning_rate": 3.630008883901436e-05, "loss": 0.0173, "step": 29300 }, { "epoch": 0.8222752139149951, "grad_norm": 0.19729511439800262, "learning_rate": 3.6295413101416746e-05, "loss": 0.012, "step": 29310 }, { "epoch": 0.8225557581708515, "grad_norm": 0.03450625762343407, "learning_rate": 3.6290737363819146e-05, "loss": 0.0177, "step": 29320 }, { "epoch": 0.8228363024267078, "grad_norm": 0.040574390441179276, "learning_rate": 3.628606162622154e-05, "loss": 0.0273, "step": 29330 }, { "epoch": 0.8231168466825641, "grad_norm": 0.048749957233667374, "learning_rate": 3.628138588862393e-05, "loss": 0.0352, "step": 29340 }, { "epoch": 0.8233973909384206, "grad_norm": 0.6932334303855896, "learning_rate": 3.6276710151026325e-05, "loss": 0.0473, "step": 29350 }, { "epoch": 0.8236779351942769, "grad_norm": 0.28600895404815674, "learning_rate": 3.627203441342872e-05, "loss": 0.0054, "step": 29360 }, { "epoch": 0.8239584794501332, "grad_norm": 0.20770083367824554, "learning_rate": 3.626735867583111e-05, "loss": 0.0368, "step": 29370 }, { "epoch": 0.8242390237059897, "grad_norm": 0.44785767793655396, "learning_rate": 3.6262682938233505e-05, "loss": 0.0284, "step": 29380 }, { "epoch": 0.824519567961846, "grad_norm": 0.053628988564014435, "learning_rate": 3.6258007200635905e-05, "loss": 0.0368, "step": 29390 }, { "epoch": 0.8248001122177023, "grad_norm": 0.4464922845363617, "learning_rate": 3.62533314630383e-05, "loss": 0.0401, "step": 29400 }, { "epoch": 0.8250806564735587, "grad_norm": 0.10207971930503845, "learning_rate": 3.624865572544069e-05, "loss": 0.0109, "step": 29410 }, { "epoch": 0.8253612007294151, "grad_norm": 1.6738430261611938, "learning_rate": 3.6243979987843084e-05, "loss": 0.0424, "step": 29420 }, { "epoch": 0.8256417449852714, "grad_norm": 0.05516105145215988, "learning_rate": 3.623930425024548e-05, "loss": 0.0085, "step": 29430 }, { "epoch": 0.8259222892411278, "grad_norm": 0.02250383049249649, "learning_rate": 3.623462851264787e-05, "loss": 0.0193, "step": 29440 }, { "epoch": 0.8262028334969842, "grad_norm": 0.016484679654240608, "learning_rate": 3.6229952775050264e-05, "loss": 0.0198, "step": 29450 }, { "epoch": 0.8264833777528405, "grad_norm": 0.5455631613731384, "learning_rate": 3.622527703745266e-05, "loss": 0.0165, "step": 29460 }, { "epoch": 0.8267639220086969, "grad_norm": 0.29423627257347107, "learning_rate": 3.622060129985506e-05, "loss": 0.0728, "step": 29470 }, { "epoch": 0.8270444662645532, "grad_norm": 0.24890673160552979, "learning_rate": 3.621592556225745e-05, "loss": 0.0236, "step": 29480 }, { "epoch": 0.8273250105204096, "grad_norm": 0.06228170171380043, "learning_rate": 3.621124982465984e-05, "loss": 0.0073, "step": 29490 }, { "epoch": 0.827605554776266, "grad_norm": 1.2168688774108887, "learning_rate": 3.6206574087062236e-05, "loss": 0.0435, "step": 29500 }, { "epoch": 0.8278860990321223, "grad_norm": 1.3562259674072266, "learning_rate": 3.620189834946463e-05, "loss": 0.0088, "step": 29510 }, { "epoch": 0.8281666432879787, "grad_norm": 0.05581644922494888, "learning_rate": 3.619722261186702e-05, "loss": 0.0147, "step": 29520 }, { "epoch": 0.8284471875438351, "grad_norm": 0.2439514696598053, "learning_rate": 3.6192546874269416e-05, "loss": 0.0264, "step": 29530 }, { "epoch": 0.8287277317996914, "grad_norm": 0.05084468796849251, "learning_rate": 3.6187871136671816e-05, "loss": 0.0202, "step": 29540 }, { "epoch": 0.8290082760555477, "grad_norm": 0.6773800253868103, "learning_rate": 3.61831953990742e-05, "loss": 0.0113, "step": 29550 }, { "epoch": 0.8292888203114042, "grad_norm": 1.1327253580093384, "learning_rate": 3.61785196614766e-05, "loss": 0.0319, "step": 29560 }, { "epoch": 0.8295693645672605, "grad_norm": 0.048566512763500214, "learning_rate": 3.617384392387899e-05, "loss": 0.0554, "step": 29570 }, { "epoch": 0.8298499088231168, "grad_norm": 1.1914920806884766, "learning_rate": 3.616916818628139e-05, "loss": 0.0221, "step": 29580 }, { "epoch": 0.8301304530789733, "grad_norm": 0.03400292620062828, "learning_rate": 3.616449244868378e-05, "loss": 0.0131, "step": 29590 }, { "epoch": 0.8304109973348296, "grad_norm": 0.05895378813147545, "learning_rate": 3.6159816711086175e-05, "loss": 0.0421, "step": 29600 }, { "epoch": 0.8306915415906859, "grad_norm": 0.02467329241335392, "learning_rate": 3.6155140973488574e-05, "loss": 0.0095, "step": 29610 }, { "epoch": 0.8309720858465423, "grad_norm": 1.1668699979782104, "learning_rate": 3.615046523589096e-05, "loss": 0.0454, "step": 29620 }, { "epoch": 0.8312526301023987, "grad_norm": 0.3280653953552246, "learning_rate": 3.614578949829336e-05, "loss": 0.0324, "step": 29630 }, { "epoch": 0.831533174358255, "grad_norm": 0.020668091252446175, "learning_rate": 3.614111376069575e-05, "loss": 0.016, "step": 29640 }, { "epoch": 0.8318137186141114, "grad_norm": 0.16564664244651794, "learning_rate": 3.613643802309815e-05, "loss": 0.0084, "step": 29650 }, { "epoch": 0.8320942628699677, "grad_norm": 0.571169376373291, "learning_rate": 3.613176228550054e-05, "loss": 0.0139, "step": 29660 }, { "epoch": 0.8323748071258241, "grad_norm": 0.20671549439430237, "learning_rate": 3.6127086547902933e-05, "loss": 0.0144, "step": 29670 }, { "epoch": 0.8326553513816805, "grad_norm": 0.03262154012918472, "learning_rate": 3.6122410810305327e-05, "loss": 0.0334, "step": 29680 }, { "epoch": 0.8329358956375368, "grad_norm": 0.08572149276733398, "learning_rate": 3.611773507270772e-05, "loss": 0.0217, "step": 29690 }, { "epoch": 0.8332164398933932, "grad_norm": 2.6550278663635254, "learning_rate": 3.611305933511012e-05, "loss": 0.0164, "step": 29700 }, { "epoch": 0.8334969841492496, "grad_norm": 0.19885064661502838, "learning_rate": 3.6108383597512506e-05, "loss": 0.0136, "step": 29710 }, { "epoch": 0.8337775284051059, "grad_norm": 0.7867249846458435, "learning_rate": 3.6103707859914906e-05, "loss": 0.0362, "step": 29720 }, { "epoch": 0.8340580726609622, "grad_norm": 0.20573075115680695, "learning_rate": 3.60990321223173e-05, "loss": 0.0333, "step": 29730 }, { "epoch": 0.8343386169168187, "grad_norm": 1.3160651922225952, "learning_rate": 3.609435638471969e-05, "loss": 0.0344, "step": 29740 }, { "epoch": 0.834619161172675, "grad_norm": 0.017910944297909737, "learning_rate": 3.6089680647122085e-05, "loss": 0.021, "step": 29750 }, { "epoch": 0.8348997054285313, "grad_norm": 0.2955927848815918, "learning_rate": 3.608500490952448e-05, "loss": 0.0145, "step": 29760 }, { "epoch": 0.8351802496843878, "grad_norm": 0.17183011770248413, "learning_rate": 3.608032917192687e-05, "loss": 0.026, "step": 29770 }, { "epoch": 0.8354607939402441, "grad_norm": 0.04272530972957611, "learning_rate": 3.6075653434329265e-05, "loss": 0.0115, "step": 29780 }, { "epoch": 0.8357413381961004, "grad_norm": 0.02554989606142044, "learning_rate": 3.6070977696731665e-05, "loss": 0.0075, "step": 29790 }, { "epoch": 0.8360218824519567, "grad_norm": 1.634757161140442, "learning_rate": 3.606630195913406e-05, "loss": 0.0468, "step": 29800 }, { "epoch": 0.8363024267078132, "grad_norm": 0.33153387904167175, "learning_rate": 3.606162622153645e-05, "loss": 0.0139, "step": 29810 }, { "epoch": 0.8365829709636695, "grad_norm": 0.19119258224964142, "learning_rate": 3.6056950483938844e-05, "loss": 0.0288, "step": 29820 }, { "epoch": 0.8368635152195258, "grad_norm": 0.48115408420562744, "learning_rate": 3.605227474634124e-05, "loss": 0.0308, "step": 29830 }, { "epoch": 0.8371440594753823, "grad_norm": 0.1337699592113495, "learning_rate": 3.604759900874363e-05, "loss": 0.0536, "step": 29840 }, { "epoch": 0.8374246037312386, "grad_norm": 0.02658909559249878, "learning_rate": 3.6042923271146024e-05, "loss": 0.0176, "step": 29850 }, { "epoch": 0.837705147987095, "grad_norm": 0.061247147619724274, "learning_rate": 3.603824753354842e-05, "loss": 0.0542, "step": 29860 }, { "epoch": 0.8379856922429513, "grad_norm": 18.329147338867188, "learning_rate": 3.603357179595082e-05, "loss": 0.0412, "step": 29870 }, { "epoch": 0.8382662364988077, "grad_norm": 0.7268772125244141, "learning_rate": 3.60288960583532e-05, "loss": 0.0151, "step": 29880 }, { "epoch": 0.838546780754664, "grad_norm": 0.8092358112335205, "learning_rate": 3.60242203207556e-05, "loss": 0.0338, "step": 29890 }, { "epoch": 0.8388273250105204, "grad_norm": 0.10288208723068237, "learning_rate": 3.6019544583157996e-05, "loss": 0.0181, "step": 29900 }, { "epoch": 0.8391078692663768, "grad_norm": 0.5785701274871826, "learning_rate": 3.601486884556039e-05, "loss": 0.0261, "step": 29910 }, { "epoch": 0.8393884135222331, "grad_norm": 0.44847530126571655, "learning_rate": 3.601019310796278e-05, "loss": 0.0269, "step": 29920 }, { "epoch": 0.8396689577780895, "grad_norm": 1.0896393060684204, "learning_rate": 3.6005517370365176e-05, "loss": 0.0072, "step": 29930 }, { "epoch": 0.8399495020339458, "grad_norm": 2.7792391777038574, "learning_rate": 3.6000841632767576e-05, "loss": 0.0516, "step": 29940 }, { "epoch": 0.8402300462898022, "grad_norm": 0.05444107949733734, "learning_rate": 3.599616589516996e-05, "loss": 0.0605, "step": 29950 }, { "epoch": 0.8405105905456586, "grad_norm": 0.7889410257339478, "learning_rate": 3.599149015757236e-05, "loss": 0.0367, "step": 29960 }, { "epoch": 0.8407911348015149, "grad_norm": 0.38567662239074707, "learning_rate": 3.598681441997475e-05, "loss": 0.015, "step": 29970 }, { "epoch": 0.8410716790573713, "grad_norm": 2.5946385860443115, "learning_rate": 3.598213868237715e-05, "loss": 0.0418, "step": 29980 }, { "epoch": 0.8413522233132277, "grad_norm": 0.15787290036678314, "learning_rate": 3.597746294477954e-05, "loss": 0.0301, "step": 29990 }, { "epoch": 0.841632767569084, "grad_norm": 0.18147484958171844, "learning_rate": 3.5972787207181935e-05, "loss": 0.0298, "step": 30000 }, { "epoch": 0.8419133118249403, "grad_norm": 0.580398440361023, "learning_rate": 3.5968111469584335e-05, "loss": 0.0608, "step": 30010 }, { "epoch": 0.8421938560807968, "grad_norm": 0.9182026386260986, "learning_rate": 3.596343573198672e-05, "loss": 0.0347, "step": 30020 }, { "epoch": 0.8424744003366531, "grad_norm": 2.8367490768432617, "learning_rate": 3.595875999438912e-05, "loss": 0.034, "step": 30030 }, { "epoch": 0.8427549445925094, "grad_norm": 0.16791965067386627, "learning_rate": 3.595408425679151e-05, "loss": 0.0145, "step": 30040 }, { "epoch": 0.8430354888483659, "grad_norm": 0.22786270081996918, "learning_rate": 3.594940851919391e-05, "loss": 0.0129, "step": 30050 }, { "epoch": 0.8433160331042222, "grad_norm": 0.022452836856245995, "learning_rate": 3.5944732781596294e-05, "loss": 0.0155, "step": 30060 }, { "epoch": 0.8435965773600785, "grad_norm": 0.013101693242788315, "learning_rate": 3.5940057043998693e-05, "loss": 0.027, "step": 30070 }, { "epoch": 0.8438771216159349, "grad_norm": 0.12265991419553757, "learning_rate": 3.593538130640109e-05, "loss": 0.0141, "step": 30080 }, { "epoch": 0.8441576658717913, "grad_norm": 0.07484771311283112, "learning_rate": 3.593070556880348e-05, "loss": 0.0422, "step": 30090 }, { "epoch": 0.8444382101276476, "grad_norm": 0.8667181730270386, "learning_rate": 3.592602983120587e-05, "loss": 0.0338, "step": 30100 }, { "epoch": 0.844718754383504, "grad_norm": 0.3776487410068512, "learning_rate": 3.5921354093608266e-05, "loss": 0.0494, "step": 30110 }, { "epoch": 0.8449992986393604, "grad_norm": 0.28375035524368286, "learning_rate": 3.5916678356010666e-05, "loss": 0.0168, "step": 30120 }, { "epoch": 0.8452798428952167, "grad_norm": 0.11314401775598526, "learning_rate": 3.591200261841305e-05, "loss": 0.0157, "step": 30130 }, { "epoch": 0.8455603871510731, "grad_norm": 0.08915567398071289, "learning_rate": 3.590732688081545e-05, "loss": 0.0175, "step": 30140 }, { "epoch": 0.8458409314069294, "grad_norm": 0.10225645452737808, "learning_rate": 3.5902651143217846e-05, "loss": 0.0273, "step": 30150 }, { "epoch": 0.8461214756627858, "grad_norm": 0.18749882280826569, "learning_rate": 3.589797540562024e-05, "loss": 0.031, "step": 30160 }, { "epoch": 0.8464020199186422, "grad_norm": 0.10905393213033676, "learning_rate": 3.589329966802263e-05, "loss": 0.0332, "step": 30170 }, { "epoch": 0.8466825641744985, "grad_norm": 0.10382351279258728, "learning_rate": 3.5888623930425025e-05, "loss": 0.0088, "step": 30180 }, { "epoch": 0.8469631084303549, "grad_norm": 1.1907367706298828, "learning_rate": 3.588394819282742e-05, "loss": 0.0171, "step": 30190 }, { "epoch": 0.8472436526862113, "grad_norm": 0.5400210618972778, "learning_rate": 3.587927245522981e-05, "loss": 0.0248, "step": 30200 }, { "epoch": 0.8475241969420676, "grad_norm": 0.06663842499256134, "learning_rate": 3.587459671763221e-05, "loss": 0.0437, "step": 30210 }, { "epoch": 0.8478047411979239, "grad_norm": 0.03485949710011482, "learning_rate": 3.5869920980034604e-05, "loss": 0.022, "step": 30220 }, { "epoch": 0.8480852854537804, "grad_norm": 0.043978966772556305, "learning_rate": 3.5865245242437e-05, "loss": 0.0125, "step": 30230 }, { "epoch": 0.8483658297096367, "grad_norm": 0.30580681562423706, "learning_rate": 3.586056950483939e-05, "loss": 0.0426, "step": 30240 }, { "epoch": 0.848646373965493, "grad_norm": 4.125027656555176, "learning_rate": 3.5855893767241784e-05, "loss": 0.0259, "step": 30250 }, { "epoch": 0.8489269182213495, "grad_norm": 0.29937121272087097, "learning_rate": 3.585121802964418e-05, "loss": 0.059, "step": 30260 }, { "epoch": 0.8492074624772058, "grad_norm": 0.43207791447639465, "learning_rate": 3.584654229204657e-05, "loss": 0.0461, "step": 30270 }, { "epoch": 0.8494880067330621, "grad_norm": 0.7818830013275146, "learning_rate": 3.584186655444896e-05, "loss": 0.052, "step": 30280 }, { "epoch": 0.8497685509889185, "grad_norm": 0.6020866632461548, "learning_rate": 3.583719081685136e-05, "loss": 0.0287, "step": 30290 }, { "epoch": 0.8500490952447749, "grad_norm": 0.2922709882259369, "learning_rate": 3.5832515079253756e-05, "loss": 0.0556, "step": 30300 }, { "epoch": 0.8503296395006312, "grad_norm": 0.09735214710235596, "learning_rate": 3.582783934165615e-05, "loss": 0.0074, "step": 30310 }, { "epoch": 0.8506101837564876, "grad_norm": 0.06081646308302879, "learning_rate": 3.582316360405854e-05, "loss": 0.0056, "step": 30320 }, { "epoch": 0.850890728012344, "grad_norm": 0.04296499863266945, "learning_rate": 3.5818487866460936e-05, "loss": 0.0167, "step": 30330 }, { "epoch": 0.8511712722682003, "grad_norm": 0.114081472158432, "learning_rate": 3.581381212886333e-05, "loss": 0.0165, "step": 30340 }, { "epoch": 0.8514518165240567, "grad_norm": 0.025717739015817642, "learning_rate": 3.580913639126572e-05, "loss": 0.0405, "step": 30350 }, { "epoch": 0.851732360779913, "grad_norm": 0.5246291756629944, "learning_rate": 3.580446065366812e-05, "loss": 0.016, "step": 30360 }, { "epoch": 0.8520129050357694, "grad_norm": 0.497477263212204, "learning_rate": 3.579978491607051e-05, "loss": 0.018, "step": 30370 }, { "epoch": 0.8522934492916258, "grad_norm": 1.7477535009384155, "learning_rate": 3.579510917847291e-05, "loss": 0.0289, "step": 30380 }, { "epoch": 0.8525739935474821, "grad_norm": 0.18145568668842316, "learning_rate": 3.57904334408753e-05, "loss": 0.0206, "step": 30390 }, { "epoch": 0.8528545378033385, "grad_norm": 0.14369285106658936, "learning_rate": 3.5785757703277695e-05, "loss": 0.0266, "step": 30400 }, { "epoch": 0.8531350820591949, "grad_norm": 0.10611865669488907, "learning_rate": 3.578108196568009e-05, "loss": 0.049, "step": 30410 }, { "epoch": 0.8534156263150512, "grad_norm": 0.0775410458445549, "learning_rate": 3.577640622808248e-05, "loss": 0.0198, "step": 30420 }, { "epoch": 0.8536961705709075, "grad_norm": 0.01595648005604744, "learning_rate": 3.577173049048488e-05, "loss": 0.0272, "step": 30430 }, { "epoch": 0.853976714826764, "grad_norm": 0.012081784196197987, "learning_rate": 3.576705475288727e-05, "loss": 0.0201, "step": 30440 }, { "epoch": 0.8542572590826203, "grad_norm": 2.614877700805664, "learning_rate": 3.576237901528967e-05, "loss": 0.0205, "step": 30450 }, { "epoch": 0.8545378033384766, "grad_norm": 3.216081380844116, "learning_rate": 3.5757703277692054e-05, "loss": 0.0361, "step": 30460 }, { "epoch": 0.8548183475943331, "grad_norm": 0.07274052500724792, "learning_rate": 3.5753027540094454e-05, "loss": 0.0386, "step": 30470 }, { "epoch": 0.8550988918501894, "grad_norm": 0.22434312105178833, "learning_rate": 3.574835180249684e-05, "loss": 0.0247, "step": 30480 }, { "epoch": 0.8553794361060457, "grad_norm": 7.318889141082764, "learning_rate": 3.574367606489924e-05, "loss": 0.0179, "step": 30490 }, { "epoch": 0.855659980361902, "grad_norm": 2.3447160720825195, "learning_rate": 3.573900032730163e-05, "loss": 0.0275, "step": 30500 }, { "epoch": 0.8559405246177585, "grad_norm": 0.018272938206791878, "learning_rate": 3.5734324589704026e-05, "loss": 0.0377, "step": 30510 }, { "epoch": 0.8562210688736148, "grad_norm": 1.2205066680908203, "learning_rate": 3.5729648852106426e-05, "loss": 0.0535, "step": 30520 }, { "epoch": 0.8565016131294712, "grad_norm": 0.2481471598148346, "learning_rate": 3.572497311450881e-05, "loss": 0.0245, "step": 30530 }, { "epoch": 0.8567821573853275, "grad_norm": 0.06884962320327759, "learning_rate": 3.572029737691121e-05, "loss": 0.0336, "step": 30540 }, { "epoch": 0.8570627016411839, "grad_norm": 0.3211739659309387, "learning_rate": 3.57156216393136e-05, "loss": 0.0202, "step": 30550 }, { "epoch": 0.8573432458970403, "grad_norm": 0.19380131363868713, "learning_rate": 3.5710945901716e-05, "loss": 0.0485, "step": 30560 }, { "epoch": 0.8576237901528966, "grad_norm": 7.924611568450928, "learning_rate": 3.570627016411839e-05, "loss": 0.022, "step": 30570 }, { "epoch": 0.857904334408753, "grad_norm": 0.4021049439907074, "learning_rate": 3.5701594426520785e-05, "loss": 0.0353, "step": 30580 }, { "epoch": 0.8581848786646094, "grad_norm": 0.3619682490825653, "learning_rate": 3.569691868892318e-05, "loss": 0.0215, "step": 30590 }, { "epoch": 0.8584654229204657, "grad_norm": 0.4722469449043274, "learning_rate": 3.569224295132557e-05, "loss": 0.0536, "step": 30600 }, { "epoch": 0.858745967176322, "grad_norm": 1.8866971731185913, "learning_rate": 3.568756721372797e-05, "loss": 0.0317, "step": 30610 }, { "epoch": 0.8590265114321785, "grad_norm": 2.0968058109283447, "learning_rate": 3.568289147613036e-05, "loss": 0.0443, "step": 30620 }, { "epoch": 0.8593070556880348, "grad_norm": 0.8998332619667053, "learning_rate": 3.567821573853276e-05, "loss": 0.0366, "step": 30630 }, { "epoch": 0.8595875999438911, "grad_norm": 0.0494161956012249, "learning_rate": 3.567354000093515e-05, "loss": 0.0138, "step": 30640 }, { "epoch": 0.8598681441997476, "grad_norm": 0.050366971641778946, "learning_rate": 3.5668864263337544e-05, "loss": 0.0468, "step": 30650 }, { "epoch": 0.8601486884556039, "grad_norm": 0.11510326713323593, "learning_rate": 3.566418852573994e-05, "loss": 0.0207, "step": 30660 }, { "epoch": 0.8604292327114602, "grad_norm": 0.24473528563976288, "learning_rate": 3.565951278814233e-05, "loss": 0.019, "step": 30670 }, { "epoch": 0.8607097769673165, "grad_norm": 0.0615515410900116, "learning_rate": 3.5654837050544723e-05, "loss": 0.0263, "step": 30680 }, { "epoch": 0.860990321223173, "grad_norm": 1.2144112586975098, "learning_rate": 3.5650161312947117e-05, "loss": 0.0293, "step": 30690 }, { "epoch": 0.8612708654790293, "grad_norm": 8.20545768737793, "learning_rate": 3.5645485575349516e-05, "loss": 0.0309, "step": 30700 }, { "epoch": 0.8615514097348856, "grad_norm": 0.06514015048742294, "learning_rate": 3.564080983775191e-05, "loss": 0.0297, "step": 30710 }, { "epoch": 0.8618319539907421, "grad_norm": 0.040954798460006714, "learning_rate": 3.56361341001543e-05, "loss": 0.0318, "step": 30720 }, { "epoch": 0.8621124982465984, "grad_norm": 0.03143469616770744, "learning_rate": 3.5631458362556696e-05, "loss": 0.0221, "step": 30730 }, { "epoch": 0.8623930425024547, "grad_norm": 0.023938676342368126, "learning_rate": 3.562678262495909e-05, "loss": 0.0275, "step": 30740 }, { "epoch": 0.8626735867583111, "grad_norm": 0.8647719621658325, "learning_rate": 3.562210688736148e-05, "loss": 0.0483, "step": 30750 }, { "epoch": 0.8629541310141675, "grad_norm": 0.2835013270378113, "learning_rate": 3.5617431149763875e-05, "loss": 0.0245, "step": 30760 }, { "epoch": 0.8632346752700238, "grad_norm": 0.13957823812961578, "learning_rate": 3.561275541216627e-05, "loss": 0.0045, "step": 30770 }, { "epoch": 0.8635152195258802, "grad_norm": 0.38985419273376465, "learning_rate": 3.560807967456867e-05, "loss": 0.0112, "step": 30780 }, { "epoch": 0.8637957637817366, "grad_norm": 0.3683305084705353, "learning_rate": 3.5603403936971055e-05, "loss": 0.0605, "step": 30790 }, { "epoch": 0.8640763080375929, "grad_norm": 0.045323681086301804, "learning_rate": 3.5598728199373455e-05, "loss": 0.0349, "step": 30800 }, { "epoch": 0.8643568522934493, "grad_norm": 0.30414506793022156, "learning_rate": 3.559405246177585e-05, "loss": 0.0051, "step": 30810 }, { "epoch": 0.8646373965493056, "grad_norm": 1.6002205610275269, "learning_rate": 3.558937672417824e-05, "loss": 0.023, "step": 30820 }, { "epoch": 0.864917940805162, "grad_norm": 0.11599962413311005, "learning_rate": 3.5584700986580634e-05, "loss": 0.0383, "step": 30830 }, { "epoch": 0.8651984850610184, "grad_norm": 0.7035638093948364, "learning_rate": 3.558002524898303e-05, "loss": 0.0199, "step": 30840 }, { "epoch": 0.8654790293168747, "grad_norm": 0.699391782283783, "learning_rate": 3.557534951138543e-05, "loss": 0.0308, "step": 30850 }, { "epoch": 0.8657595735727311, "grad_norm": 0.2972467243671417, "learning_rate": 3.5570673773787814e-05, "loss": 0.0285, "step": 30860 }, { "epoch": 0.8660401178285875, "grad_norm": 4.219332695007324, "learning_rate": 3.5565998036190214e-05, "loss": 0.028, "step": 30870 }, { "epoch": 0.8663206620844438, "grad_norm": 0.16109801828861237, "learning_rate": 3.55613222985926e-05, "loss": 0.0215, "step": 30880 }, { "epoch": 0.8666012063403001, "grad_norm": 0.5151497721672058, "learning_rate": 3.5556646560995e-05, "loss": 0.052, "step": 30890 }, { "epoch": 0.8668817505961566, "grad_norm": 0.3207774758338928, "learning_rate": 3.555197082339739e-05, "loss": 0.016, "step": 30900 }, { "epoch": 0.8671622948520129, "grad_norm": 2.297687530517578, "learning_rate": 3.5547295085799786e-05, "loss": 0.0236, "step": 30910 }, { "epoch": 0.8674428391078692, "grad_norm": 0.45593398809432983, "learning_rate": 3.5542619348202186e-05, "loss": 0.0386, "step": 30920 }, { "epoch": 0.8677233833637257, "grad_norm": 1.1154241561889648, "learning_rate": 3.553794361060457e-05, "loss": 0.0391, "step": 30930 }, { "epoch": 0.868003927619582, "grad_norm": 0.02835991606116295, "learning_rate": 3.553326787300697e-05, "loss": 0.0036, "step": 30940 }, { "epoch": 0.8682844718754383, "grad_norm": 0.04611534625291824, "learning_rate": 3.552859213540936e-05, "loss": 0.0297, "step": 30950 }, { "epoch": 0.8685650161312947, "grad_norm": 0.38032543659210205, "learning_rate": 3.552391639781176e-05, "loss": 0.0492, "step": 30960 }, { "epoch": 0.8688455603871511, "grad_norm": 0.0971643477678299, "learning_rate": 3.5519240660214145e-05, "loss": 0.0093, "step": 30970 }, { "epoch": 0.8691261046430074, "grad_norm": 0.029535381123423576, "learning_rate": 3.5514564922616545e-05, "loss": 0.0233, "step": 30980 }, { "epoch": 0.8694066488988638, "grad_norm": 1.3480825424194336, "learning_rate": 3.550988918501894e-05, "loss": 0.0209, "step": 30990 }, { "epoch": 0.8696871931547202, "grad_norm": 0.2621324062347412, "learning_rate": 3.550521344742133e-05, "loss": 0.0357, "step": 31000 }, { "epoch": 0.8699677374105765, "grad_norm": 0.32092562317848206, "learning_rate": 3.5500537709823725e-05, "loss": 0.0356, "step": 31010 }, { "epoch": 0.8702482816664329, "grad_norm": 0.14993628859519958, "learning_rate": 3.549586197222612e-05, "loss": 0.032, "step": 31020 }, { "epoch": 0.8705288259222892, "grad_norm": 0.4680577218532562, "learning_rate": 3.549118623462852e-05, "loss": 0.016, "step": 31030 }, { "epoch": 0.8708093701781456, "grad_norm": 0.11303120851516724, "learning_rate": 3.5486510497030904e-05, "loss": 0.0224, "step": 31040 }, { "epoch": 0.871089914434002, "grad_norm": 0.03655136749148369, "learning_rate": 3.5481834759433304e-05, "loss": 0.0466, "step": 31050 }, { "epoch": 0.8713704586898583, "grad_norm": 0.16470319032669067, "learning_rate": 3.54771590218357e-05, "loss": 0.0414, "step": 31060 }, { "epoch": 0.8716510029457147, "grad_norm": 0.1655784398317337, "learning_rate": 3.547248328423809e-05, "loss": 0.0184, "step": 31070 }, { "epoch": 0.8719315472015711, "grad_norm": 0.8899351954460144, "learning_rate": 3.5467807546640483e-05, "loss": 0.0294, "step": 31080 }, { "epoch": 0.8722120914574274, "grad_norm": 1.358067274093628, "learning_rate": 3.546313180904288e-05, "loss": 0.0498, "step": 31090 }, { "epoch": 0.8724926357132837, "grad_norm": 2.207170009613037, "learning_rate": 3.545845607144527e-05, "loss": 0.0402, "step": 31100 }, { "epoch": 0.8727731799691402, "grad_norm": 0.04117761552333832, "learning_rate": 3.545378033384766e-05, "loss": 0.0157, "step": 31110 }, { "epoch": 0.8730537242249965, "grad_norm": 0.2687360346317291, "learning_rate": 3.544910459625006e-05, "loss": 0.0373, "step": 31120 }, { "epoch": 0.8733342684808528, "grad_norm": 0.15749934315681458, "learning_rate": 3.5444428858652456e-05, "loss": 0.0206, "step": 31130 }, { "epoch": 0.8736148127367093, "grad_norm": 0.2544403076171875, "learning_rate": 3.543975312105485e-05, "loss": 0.04, "step": 31140 }, { "epoch": 0.8738953569925656, "grad_norm": 0.03569091856479645, "learning_rate": 3.543507738345724e-05, "loss": 0.0207, "step": 31150 }, { "epoch": 0.8741759012484219, "grad_norm": 0.5883515477180481, "learning_rate": 3.5430401645859636e-05, "loss": 0.0337, "step": 31160 }, { "epoch": 0.8744564455042783, "grad_norm": 0.46917861700057983, "learning_rate": 3.542572590826203e-05, "loss": 0.013, "step": 31170 }, { "epoch": 0.8747369897601347, "grad_norm": 1.1828718185424805, "learning_rate": 3.542105017066442e-05, "loss": 0.0271, "step": 31180 }, { "epoch": 0.875017534015991, "grad_norm": 0.38903653621673584, "learning_rate": 3.5416374433066815e-05, "loss": 0.0468, "step": 31190 }, { "epoch": 0.8752980782718474, "grad_norm": 0.22639788687229156, "learning_rate": 3.5411698695469215e-05, "loss": 0.0311, "step": 31200 }, { "epoch": 0.8755786225277038, "grad_norm": 0.40202292799949646, "learning_rate": 3.540702295787161e-05, "loss": 0.0497, "step": 31210 }, { "epoch": 0.8758591667835601, "grad_norm": 0.30312126874923706, "learning_rate": 3.5402347220274e-05, "loss": 0.0461, "step": 31220 }, { "epoch": 0.8761397110394165, "grad_norm": 0.09445594251155853, "learning_rate": 3.5397671482676394e-05, "loss": 0.0256, "step": 31230 }, { "epoch": 0.8764202552952728, "grad_norm": 0.06336534023284912, "learning_rate": 3.539299574507879e-05, "loss": 0.0414, "step": 31240 }, { "epoch": 0.8767007995511292, "grad_norm": 0.17509864270687103, "learning_rate": 3.538832000748118e-05, "loss": 0.0476, "step": 31250 }, { "epoch": 0.8769813438069856, "grad_norm": 0.053905412554740906, "learning_rate": 3.5383644269883574e-05, "loss": 0.0171, "step": 31260 }, { "epoch": 0.8772618880628419, "grad_norm": 0.26791754364967346, "learning_rate": 3.5378968532285974e-05, "loss": 0.0389, "step": 31270 }, { "epoch": 0.8775424323186983, "grad_norm": 0.5707160830497742, "learning_rate": 3.537429279468836e-05, "loss": 0.0256, "step": 31280 }, { "epoch": 0.8778229765745547, "grad_norm": 1.1895854473114014, "learning_rate": 3.536961705709076e-05, "loss": 0.0629, "step": 31290 }, { "epoch": 0.878103520830411, "grad_norm": 0.5907300114631653, "learning_rate": 3.536494131949315e-05, "loss": 0.035, "step": 31300 }, { "epoch": 0.8783840650862673, "grad_norm": 0.34717777371406555, "learning_rate": 3.5360265581895546e-05, "loss": 0.0363, "step": 31310 }, { "epoch": 0.8786646093421238, "grad_norm": 0.1749294102191925, "learning_rate": 3.535558984429794e-05, "loss": 0.0149, "step": 31320 }, { "epoch": 0.8789451535979801, "grad_norm": 0.7810256481170654, "learning_rate": 3.535091410670033e-05, "loss": 0.0423, "step": 31330 }, { "epoch": 0.8792256978538364, "grad_norm": 0.19119682908058167, "learning_rate": 3.534623836910273e-05, "loss": 0.0156, "step": 31340 }, { "epoch": 0.8795062421096929, "grad_norm": 0.46008625626564026, "learning_rate": 3.534156263150512e-05, "loss": 0.0226, "step": 31350 }, { "epoch": 0.8797867863655492, "grad_norm": 0.060152411460876465, "learning_rate": 3.533688689390752e-05, "loss": 0.0244, "step": 31360 }, { "epoch": 0.8800673306214055, "grad_norm": 0.029313955456018448, "learning_rate": 3.5332211156309905e-05, "loss": 0.006, "step": 31370 }, { "epoch": 0.8803478748772618, "grad_norm": 3.609966993331909, "learning_rate": 3.5327535418712305e-05, "loss": 0.0488, "step": 31380 }, { "epoch": 0.8806284191331183, "grad_norm": 0.4148963987827301, "learning_rate": 3.532285968111469e-05, "loss": 0.0223, "step": 31390 }, { "epoch": 0.8809089633889746, "grad_norm": 0.34402838349342346, "learning_rate": 3.531818394351709e-05, "loss": 0.0314, "step": 31400 }, { "epoch": 0.881189507644831, "grad_norm": 0.12612654268741608, "learning_rate": 3.5313508205919485e-05, "loss": 0.0338, "step": 31410 }, { "epoch": 0.8814700519006873, "grad_norm": 0.020634634420275688, "learning_rate": 3.530883246832188e-05, "loss": 0.0062, "step": 31420 }, { "epoch": 0.8817505961565437, "grad_norm": 0.5468136072158813, "learning_rate": 3.530415673072428e-05, "loss": 0.0256, "step": 31430 }, { "epoch": 0.8820311404124, "grad_norm": 2.3265364170074463, "learning_rate": 3.5299480993126664e-05, "loss": 0.0392, "step": 31440 }, { "epoch": 0.8823116846682564, "grad_norm": 0.041272446513175964, "learning_rate": 3.5294805255529064e-05, "loss": 0.0113, "step": 31450 }, { "epoch": 0.8825922289241128, "grad_norm": 0.30972427129745483, "learning_rate": 3.529012951793145e-05, "loss": 0.0175, "step": 31460 }, { "epoch": 0.8828727731799692, "grad_norm": 0.03071824088692665, "learning_rate": 3.528545378033385e-05, "loss": 0.0148, "step": 31470 }, { "epoch": 0.8831533174358255, "grad_norm": 0.09254030883312225, "learning_rate": 3.5280778042736244e-05, "loss": 0.0304, "step": 31480 }, { "epoch": 0.8834338616916818, "grad_norm": 0.019978970289230347, "learning_rate": 3.527610230513864e-05, "loss": 0.0263, "step": 31490 }, { "epoch": 0.8837144059475383, "grad_norm": 0.191198468208313, "learning_rate": 3.527142656754103e-05, "loss": 0.0298, "step": 31500 }, { "epoch": 0.8839949502033946, "grad_norm": 0.029923899099230766, "learning_rate": 3.526675082994342e-05, "loss": 0.0126, "step": 31510 }, { "epoch": 0.8842754944592509, "grad_norm": 0.01038662251085043, "learning_rate": 3.526207509234582e-05, "loss": 0.0077, "step": 31520 }, { "epoch": 0.8845560387151074, "grad_norm": 0.2790495455265045, "learning_rate": 3.525739935474821e-05, "loss": 0.0112, "step": 31530 }, { "epoch": 0.8848365829709637, "grad_norm": 2.3881900310516357, "learning_rate": 3.525272361715061e-05, "loss": 0.0291, "step": 31540 }, { "epoch": 0.88511712722682, "grad_norm": 0.6585655212402344, "learning_rate": 3.5248047879553e-05, "loss": 0.0433, "step": 31550 }, { "epoch": 0.8853976714826763, "grad_norm": 0.022996004670858383, "learning_rate": 3.5243372141955396e-05, "loss": 0.0235, "step": 31560 }, { "epoch": 0.8856782157385328, "grad_norm": 0.032366588711738586, "learning_rate": 3.523869640435779e-05, "loss": 0.0137, "step": 31570 }, { "epoch": 0.8859587599943891, "grad_norm": 1.6604695320129395, "learning_rate": 3.523402066676018e-05, "loss": 0.0435, "step": 31580 }, { "epoch": 0.8862393042502454, "grad_norm": 2.3303184509277344, "learning_rate": 3.5229344929162575e-05, "loss": 0.034, "step": 31590 }, { "epoch": 0.8865198485061019, "grad_norm": 0.049291085451841354, "learning_rate": 3.522466919156497e-05, "loss": 0.0182, "step": 31600 }, { "epoch": 0.8868003927619582, "grad_norm": 0.051687173545360565, "learning_rate": 3.521999345396737e-05, "loss": 0.0221, "step": 31610 }, { "epoch": 0.8870809370178145, "grad_norm": 0.6301789283752441, "learning_rate": 3.521531771636976e-05, "loss": 0.074, "step": 31620 }, { "epoch": 0.8873614812736709, "grad_norm": 0.1486501544713974, "learning_rate": 3.5210641978772154e-05, "loss": 0.0331, "step": 31630 }, { "epoch": 0.8876420255295273, "grad_norm": 0.03390931338071823, "learning_rate": 3.520596624117455e-05, "loss": 0.0066, "step": 31640 }, { "epoch": 0.8879225697853836, "grad_norm": 0.04386971890926361, "learning_rate": 3.520129050357694e-05, "loss": 0.0218, "step": 31650 }, { "epoch": 0.88820311404124, "grad_norm": 0.9505913257598877, "learning_rate": 3.5196614765979334e-05, "loss": 0.0118, "step": 31660 }, { "epoch": 0.8884836582970964, "grad_norm": 0.017288019880652428, "learning_rate": 3.519193902838173e-05, "loss": 0.0186, "step": 31670 }, { "epoch": 0.8887642025529527, "grad_norm": 0.167128324508667, "learning_rate": 3.518726329078412e-05, "loss": 0.0371, "step": 31680 }, { "epoch": 0.8890447468088091, "grad_norm": 0.6171634197235107, "learning_rate": 3.518258755318652e-05, "loss": 0.0283, "step": 31690 }, { "epoch": 0.8893252910646654, "grad_norm": 1.6197065114974976, "learning_rate": 3.5177911815588907e-05, "loss": 0.0401, "step": 31700 }, { "epoch": 0.8896058353205218, "grad_norm": 0.07838715612888336, "learning_rate": 3.5173236077991306e-05, "loss": 0.014, "step": 31710 }, { "epoch": 0.8898863795763782, "grad_norm": 0.5231626629829407, "learning_rate": 3.51685603403937e-05, "loss": 0.0101, "step": 31720 }, { "epoch": 0.8901669238322345, "grad_norm": 0.622847855091095, "learning_rate": 3.516388460279609e-05, "loss": 0.0274, "step": 31730 }, { "epoch": 0.8904474680880909, "grad_norm": 0.04970628768205643, "learning_rate": 3.5159208865198486e-05, "loss": 0.0257, "step": 31740 }, { "epoch": 0.8907280123439473, "grad_norm": 0.07161597907543182, "learning_rate": 3.515453312760088e-05, "loss": 0.0204, "step": 31750 }, { "epoch": 0.8910085565998036, "grad_norm": 0.04147350415587425, "learning_rate": 3.514985739000328e-05, "loss": 0.0324, "step": 31760 }, { "epoch": 0.8912891008556599, "grad_norm": 0.03189116343855858, "learning_rate": 3.5145181652405665e-05, "loss": 0.0148, "step": 31770 }, { "epoch": 0.8915696451115164, "grad_norm": 0.042605578899383545, "learning_rate": 3.5140505914808065e-05, "loss": 0.0335, "step": 31780 }, { "epoch": 0.8918501893673727, "grad_norm": 0.4750008285045624, "learning_rate": 3.513583017721045e-05, "loss": 0.0646, "step": 31790 }, { "epoch": 0.892130733623229, "grad_norm": 0.3167915344238281, "learning_rate": 3.513115443961285e-05, "loss": 0.0349, "step": 31800 }, { "epoch": 0.8924112778790855, "grad_norm": 0.2711995542049408, "learning_rate": 3.5126478702015245e-05, "loss": 0.0118, "step": 31810 }, { "epoch": 0.8926918221349418, "grad_norm": 0.3627259433269501, "learning_rate": 3.512180296441764e-05, "loss": 0.0375, "step": 31820 }, { "epoch": 0.8929723663907981, "grad_norm": 0.17752057313919067, "learning_rate": 3.511712722682004e-05, "loss": 0.035, "step": 31830 }, { "epoch": 0.8932529106466545, "grad_norm": 0.22631004452705383, "learning_rate": 3.5112451489222424e-05, "loss": 0.0256, "step": 31840 }, { "epoch": 0.8935334549025109, "grad_norm": 2.118381977081299, "learning_rate": 3.5107775751624824e-05, "loss": 0.0683, "step": 31850 }, { "epoch": 0.8938139991583672, "grad_norm": 0.28216060996055603, "learning_rate": 3.510310001402721e-05, "loss": 0.0407, "step": 31860 }, { "epoch": 0.8940945434142236, "grad_norm": 0.5998006463050842, "learning_rate": 3.509842427642961e-05, "loss": 0.0412, "step": 31870 }, { "epoch": 0.89437508767008, "grad_norm": 3.644585609436035, "learning_rate": 3.5093748538832e-05, "loss": 0.0271, "step": 31880 }, { "epoch": 0.8946556319259363, "grad_norm": 0.07496578991413116, "learning_rate": 3.50890728012344e-05, "loss": 0.0176, "step": 31890 }, { "epoch": 0.8949361761817927, "grad_norm": 0.0318036787211895, "learning_rate": 3.508439706363679e-05, "loss": 0.0315, "step": 31900 }, { "epoch": 0.895216720437649, "grad_norm": 0.40781641006469727, "learning_rate": 3.507972132603918e-05, "loss": 0.0156, "step": 31910 }, { "epoch": 0.8954972646935054, "grad_norm": 0.35372453927993774, "learning_rate": 3.5075045588441576e-05, "loss": 0.0271, "step": 31920 }, { "epoch": 0.8957778089493618, "grad_norm": 0.40806108713150024, "learning_rate": 3.507036985084397e-05, "loss": 0.0341, "step": 31930 }, { "epoch": 0.8960583532052181, "grad_norm": 0.7518731951713562, "learning_rate": 3.506569411324637e-05, "loss": 0.0538, "step": 31940 }, { "epoch": 0.8963388974610745, "grad_norm": 0.5623592734336853, "learning_rate": 3.5061018375648756e-05, "loss": 0.0175, "step": 31950 }, { "epoch": 0.8966194417169309, "grad_norm": 0.2245105355978012, "learning_rate": 3.5056342638051156e-05, "loss": 0.0287, "step": 31960 }, { "epoch": 0.8968999859727872, "grad_norm": 0.028048941865563393, "learning_rate": 3.505166690045355e-05, "loss": 0.0171, "step": 31970 }, { "epoch": 0.8971805302286435, "grad_norm": 0.36929771304130554, "learning_rate": 3.504699116285594e-05, "loss": 0.0173, "step": 31980 }, { "epoch": 0.8974610744845, "grad_norm": 0.012199988588690758, "learning_rate": 3.5042315425258335e-05, "loss": 0.0089, "step": 31990 }, { "epoch": 0.8977416187403563, "grad_norm": 0.40449440479278564, "learning_rate": 3.503763968766073e-05, "loss": 0.054, "step": 32000 }, { "epoch": 0.8980221629962126, "grad_norm": 5.741656303405762, "learning_rate": 3.503296395006312e-05, "loss": 0.0384, "step": 32010 }, { "epoch": 0.8983027072520691, "grad_norm": 0.5501914024353027, "learning_rate": 3.5028288212465515e-05, "loss": 0.0284, "step": 32020 }, { "epoch": 0.8985832515079254, "grad_norm": 0.11916009336709976, "learning_rate": 3.5023612474867915e-05, "loss": 0.048, "step": 32030 }, { "epoch": 0.8988637957637817, "grad_norm": 1.9099422693252563, "learning_rate": 3.501893673727031e-05, "loss": 0.0495, "step": 32040 }, { "epoch": 0.899144340019638, "grad_norm": 0.13561642169952393, "learning_rate": 3.50142609996727e-05, "loss": 0.0263, "step": 32050 }, { "epoch": 0.8994248842754945, "grad_norm": 0.4462505578994751, "learning_rate": 3.5009585262075094e-05, "loss": 0.0303, "step": 32060 }, { "epoch": 0.8997054285313508, "grad_norm": 3.0533151626586914, "learning_rate": 3.500490952447749e-05, "loss": 0.0283, "step": 32070 }, { "epoch": 0.8999859727872072, "grad_norm": 0.10819843411445618, "learning_rate": 3.500023378687988e-05, "loss": 0.0113, "step": 32080 }, { "epoch": 0.9002665170430636, "grad_norm": 0.22741328179836273, "learning_rate": 3.4995558049282273e-05, "loss": 0.0357, "step": 32090 }, { "epoch": 0.9005470612989199, "grad_norm": 0.32422661781311035, "learning_rate": 3.499088231168467e-05, "loss": 0.0111, "step": 32100 }, { "epoch": 0.9008276055547763, "grad_norm": 0.5475156903266907, "learning_rate": 3.4986206574087067e-05, "loss": 0.0318, "step": 32110 }, { "epoch": 0.9011081498106326, "grad_norm": 0.20251651108264923, "learning_rate": 3.498153083648946e-05, "loss": 0.0217, "step": 32120 }, { "epoch": 0.901388694066489, "grad_norm": 0.022954408079385757, "learning_rate": 3.497685509889185e-05, "loss": 0.0305, "step": 32130 }, { "epoch": 0.9016692383223454, "grad_norm": 0.018628856167197227, "learning_rate": 3.4972179361294246e-05, "loss": 0.0136, "step": 32140 }, { "epoch": 0.9019497825782017, "grad_norm": 0.2784214913845062, "learning_rate": 3.496750362369664e-05, "loss": 0.0252, "step": 32150 }, { "epoch": 0.9022303268340581, "grad_norm": 0.02757209725677967, "learning_rate": 3.496282788609903e-05, "loss": 0.016, "step": 32160 }, { "epoch": 0.9025108710899145, "grad_norm": 0.07619437575340271, "learning_rate": 3.4958152148501425e-05, "loss": 0.0177, "step": 32170 }, { "epoch": 0.9027914153457708, "grad_norm": 0.021737340837717056, "learning_rate": 3.4953476410903825e-05, "loss": 0.0405, "step": 32180 }, { "epoch": 0.9030719596016271, "grad_norm": 0.17795999348163605, "learning_rate": 3.494880067330621e-05, "loss": 0.0348, "step": 32190 }, { "epoch": 0.9033525038574836, "grad_norm": 0.019061215221881866, "learning_rate": 3.494412493570861e-05, "loss": 0.0187, "step": 32200 }, { "epoch": 0.9036330481133399, "grad_norm": 0.49143439531326294, "learning_rate": 3.4939449198111005e-05, "loss": 0.032, "step": 32210 }, { "epoch": 0.9039135923691962, "grad_norm": 0.1342395544052124, "learning_rate": 3.49347734605134e-05, "loss": 0.0621, "step": 32220 }, { "epoch": 0.9041941366250525, "grad_norm": 0.06311694532632828, "learning_rate": 3.493009772291579e-05, "loss": 0.0278, "step": 32230 }, { "epoch": 0.904474680880909, "grad_norm": 2.2234206199645996, "learning_rate": 3.4925421985318184e-05, "loss": 0.017, "step": 32240 }, { "epoch": 0.9047552251367653, "grad_norm": 0.058272961527109146, "learning_rate": 3.4920746247720584e-05, "loss": 0.0378, "step": 32250 }, { "epoch": 0.9050357693926216, "grad_norm": 0.2882314920425415, "learning_rate": 3.491607051012297e-05, "loss": 0.0326, "step": 32260 }, { "epoch": 0.9053163136484781, "grad_norm": 1.09280526638031, "learning_rate": 3.491139477252537e-05, "loss": 0.0273, "step": 32270 }, { "epoch": 0.9055968579043344, "grad_norm": 1.0684908628463745, "learning_rate": 3.490671903492776e-05, "loss": 0.047, "step": 32280 }, { "epoch": 0.9058774021601907, "grad_norm": 0.4491412043571472, "learning_rate": 3.490204329733016e-05, "loss": 0.0496, "step": 32290 }, { "epoch": 0.9061579464160471, "grad_norm": 0.07787039875984192, "learning_rate": 3.489736755973255e-05, "loss": 0.0345, "step": 32300 }, { "epoch": 0.9064384906719035, "grad_norm": 0.8960773944854736, "learning_rate": 3.489269182213494e-05, "loss": 0.0291, "step": 32310 }, { "epoch": 0.9067190349277598, "grad_norm": 0.030568260699510574, "learning_rate": 3.4888016084537336e-05, "loss": 0.0309, "step": 32320 }, { "epoch": 0.9069995791836162, "grad_norm": 0.2876034080982208, "learning_rate": 3.488334034693973e-05, "loss": 0.0255, "step": 32330 }, { "epoch": 0.9072801234394726, "grad_norm": 0.14859174191951752, "learning_rate": 3.487866460934213e-05, "loss": 0.0204, "step": 32340 }, { "epoch": 0.907560667695329, "grad_norm": 1.0873287916183472, "learning_rate": 3.4873988871744516e-05, "loss": 0.0193, "step": 32350 }, { "epoch": 0.9078412119511853, "grad_norm": 0.032719891518354416, "learning_rate": 3.4869313134146916e-05, "loss": 0.0447, "step": 32360 }, { "epoch": 0.9081217562070416, "grad_norm": 0.026577472686767578, "learning_rate": 3.486463739654931e-05, "loss": 0.011, "step": 32370 }, { "epoch": 0.908402300462898, "grad_norm": 0.44171908497810364, "learning_rate": 3.48599616589517e-05, "loss": 0.068, "step": 32380 }, { "epoch": 0.9086828447187544, "grad_norm": 0.3652147948741913, "learning_rate": 3.4855285921354095e-05, "loss": 0.0287, "step": 32390 }, { "epoch": 0.9089633889746107, "grad_norm": 0.5903343558311462, "learning_rate": 3.485061018375649e-05, "loss": 0.019, "step": 32400 }, { "epoch": 0.9092439332304671, "grad_norm": 0.43135866522789, "learning_rate": 3.484593444615888e-05, "loss": 0.0188, "step": 32410 }, { "epoch": 0.9095244774863235, "grad_norm": 1.7343169450759888, "learning_rate": 3.4841258708561275e-05, "loss": 0.0328, "step": 32420 }, { "epoch": 0.9098050217421798, "grad_norm": 1.5953190326690674, "learning_rate": 3.4836582970963675e-05, "loss": 0.0281, "step": 32430 }, { "epoch": 0.9100855659980361, "grad_norm": 2.695918083190918, "learning_rate": 3.483190723336607e-05, "loss": 0.0269, "step": 32440 }, { "epoch": 0.9103661102538926, "grad_norm": 1.396700143814087, "learning_rate": 3.482723149576846e-05, "loss": 0.0852, "step": 32450 }, { "epoch": 0.9106466545097489, "grad_norm": 1.6782554388046265, "learning_rate": 3.4822555758170854e-05, "loss": 0.0264, "step": 32460 }, { "epoch": 0.9109271987656052, "grad_norm": 0.10065733641386032, "learning_rate": 3.481788002057325e-05, "loss": 0.0264, "step": 32470 }, { "epoch": 0.9112077430214617, "grad_norm": 0.9336492419242859, "learning_rate": 3.481320428297564e-05, "loss": 0.0337, "step": 32480 }, { "epoch": 0.911488287277318, "grad_norm": 0.38686519861221313, "learning_rate": 3.4808528545378034e-05, "loss": 0.0352, "step": 32490 }, { "epoch": 0.9117688315331743, "grad_norm": 0.17329460382461548, "learning_rate": 3.480385280778043e-05, "loss": 0.023, "step": 32500 }, { "epoch": 0.9120493757890307, "grad_norm": 0.11386283487081528, "learning_rate": 3.479917707018283e-05, "loss": 0.04, "step": 32510 }, { "epoch": 0.9123299200448871, "grad_norm": 0.26792430877685547, "learning_rate": 3.479450133258522e-05, "loss": 0.0289, "step": 32520 }, { "epoch": 0.9126104643007434, "grad_norm": 0.07653316110372543, "learning_rate": 3.478982559498761e-05, "loss": 0.0098, "step": 32530 }, { "epoch": 0.9128910085565998, "grad_norm": 0.7964606881141663, "learning_rate": 3.4785149857390006e-05, "loss": 0.0294, "step": 32540 }, { "epoch": 0.9131715528124562, "grad_norm": 0.023458898067474365, "learning_rate": 3.47804741197924e-05, "loss": 0.0233, "step": 32550 }, { "epoch": 0.9134520970683125, "grad_norm": 0.491149365901947, "learning_rate": 3.477579838219479e-05, "loss": 0.0297, "step": 32560 }, { "epoch": 0.9137326413241689, "grad_norm": 0.0886722207069397, "learning_rate": 3.4771122644597186e-05, "loss": 0.0459, "step": 32570 }, { "epoch": 0.9140131855800252, "grad_norm": 0.12653782963752747, "learning_rate": 3.4766446906999586e-05, "loss": 0.0277, "step": 32580 }, { "epoch": 0.9142937298358816, "grad_norm": 0.4564046561717987, "learning_rate": 3.476177116940197e-05, "loss": 0.0219, "step": 32590 }, { "epoch": 0.914574274091738, "grad_norm": 0.09553050249814987, "learning_rate": 3.475709543180437e-05, "loss": 0.0313, "step": 32600 }, { "epoch": 0.9148548183475943, "grad_norm": 0.20285719633102417, "learning_rate": 3.475241969420676e-05, "loss": 0.0302, "step": 32610 }, { "epoch": 0.9151353626034507, "grad_norm": 0.14079904556274414, "learning_rate": 3.474774395660916e-05, "loss": 0.0191, "step": 32620 }, { "epoch": 0.9154159068593071, "grad_norm": 0.142070934176445, "learning_rate": 3.474306821901155e-05, "loss": 0.021, "step": 32630 }, { "epoch": 0.9156964511151634, "grad_norm": 0.07224003225564957, "learning_rate": 3.4738392481413944e-05, "loss": 0.0276, "step": 32640 }, { "epoch": 0.9159769953710197, "grad_norm": 0.050592925399541855, "learning_rate": 3.4733716743816344e-05, "loss": 0.0303, "step": 32650 }, { "epoch": 0.9162575396268762, "grad_norm": 0.4408785104751587, "learning_rate": 3.472904100621873e-05, "loss": 0.0261, "step": 32660 }, { "epoch": 0.9165380838827325, "grad_norm": 0.4459896385669708, "learning_rate": 3.472436526862113e-05, "loss": 0.0237, "step": 32670 }, { "epoch": 0.9168186281385888, "grad_norm": 1.374921441078186, "learning_rate": 3.471968953102352e-05, "loss": 0.0623, "step": 32680 }, { "epoch": 0.9170991723944453, "grad_norm": 0.3044758439064026, "learning_rate": 3.471501379342592e-05, "loss": 0.034, "step": 32690 }, { "epoch": 0.9173797166503016, "grad_norm": 0.2607450485229492, "learning_rate": 3.47103380558283e-05, "loss": 0.04, "step": 32700 }, { "epoch": 0.9176602609061579, "grad_norm": 0.43549248576164246, "learning_rate": 3.47056623182307e-05, "loss": 0.0251, "step": 32710 }, { "epoch": 0.9179408051620143, "grad_norm": 0.5178442001342773, "learning_rate": 3.4700986580633096e-05, "loss": 0.0227, "step": 32720 }, { "epoch": 0.9182213494178707, "grad_norm": 0.021922487765550613, "learning_rate": 3.469631084303549e-05, "loss": 0.034, "step": 32730 }, { "epoch": 0.918501893673727, "grad_norm": 0.025651460513472557, "learning_rate": 3.469163510543789e-05, "loss": 0.0228, "step": 32740 }, { "epoch": 0.9187824379295834, "grad_norm": 0.5149391889572144, "learning_rate": 3.4686959367840276e-05, "loss": 0.014, "step": 32750 }, { "epoch": 0.9190629821854398, "grad_norm": 1.3091548681259155, "learning_rate": 3.4682283630242676e-05, "loss": 0.0273, "step": 32760 }, { "epoch": 0.9193435264412961, "grad_norm": 0.0135659733787179, "learning_rate": 3.467760789264506e-05, "loss": 0.0457, "step": 32770 }, { "epoch": 0.9196240706971525, "grad_norm": 0.3848720192909241, "learning_rate": 3.467293215504746e-05, "loss": 0.0359, "step": 32780 }, { "epoch": 0.9199046149530088, "grad_norm": 0.21453240513801575, "learning_rate": 3.4668256417449855e-05, "loss": 0.0154, "step": 32790 }, { "epoch": 0.9201851592088652, "grad_norm": 0.5119496583938599, "learning_rate": 3.466358067985225e-05, "loss": 0.0184, "step": 32800 }, { "epoch": 0.9204657034647216, "grad_norm": 0.2494402676820755, "learning_rate": 3.465890494225464e-05, "loss": 0.0216, "step": 32810 }, { "epoch": 0.9207462477205779, "grad_norm": 0.23731490969657898, "learning_rate": 3.4654229204657035e-05, "loss": 0.0584, "step": 32820 }, { "epoch": 0.9210267919764343, "grad_norm": 0.020223403349518776, "learning_rate": 3.464955346705943e-05, "loss": 0.0503, "step": 32830 }, { "epoch": 0.9213073362322907, "grad_norm": 0.6442462801933289, "learning_rate": 3.464487772946182e-05, "loss": 0.0249, "step": 32840 }, { "epoch": 0.921587880488147, "grad_norm": 0.9527145624160767, "learning_rate": 3.464020199186422e-05, "loss": 0.034, "step": 32850 }, { "epoch": 0.9218684247440033, "grad_norm": 0.06079260632395744, "learning_rate": 3.4635526254266614e-05, "loss": 0.0309, "step": 32860 }, { "epoch": 0.9221489689998598, "grad_norm": 2.8579437732696533, "learning_rate": 3.463085051666901e-05, "loss": 0.0139, "step": 32870 }, { "epoch": 0.9224295132557161, "grad_norm": 0.19715115427970886, "learning_rate": 3.46261747790714e-05, "loss": 0.0334, "step": 32880 }, { "epoch": 0.9227100575115724, "grad_norm": 0.5640004277229309, "learning_rate": 3.4621499041473794e-05, "loss": 0.033, "step": 32890 }, { "epoch": 0.9229906017674289, "grad_norm": 0.4114953875541687, "learning_rate": 3.461682330387619e-05, "loss": 0.0179, "step": 32900 }, { "epoch": 0.9232711460232852, "grad_norm": 0.39667701721191406, "learning_rate": 3.461214756627858e-05, "loss": 0.0155, "step": 32910 }, { "epoch": 0.9235516902791415, "grad_norm": 0.30708765983581543, "learning_rate": 3.460747182868097e-05, "loss": 0.0386, "step": 32920 }, { "epoch": 0.9238322345349979, "grad_norm": 0.3409547507762909, "learning_rate": 3.460279609108337e-05, "loss": 0.0517, "step": 32930 }, { "epoch": 0.9241127787908543, "grad_norm": 0.08259677141904831, "learning_rate": 3.4598120353485766e-05, "loss": 0.0111, "step": 32940 }, { "epoch": 0.9243933230467106, "grad_norm": 0.0777939110994339, "learning_rate": 3.459344461588816e-05, "loss": 0.0455, "step": 32950 }, { "epoch": 0.924673867302567, "grad_norm": 0.10427937656641006, "learning_rate": 3.458876887829055e-05, "loss": 0.0295, "step": 32960 }, { "epoch": 0.9249544115584234, "grad_norm": 0.04723535105586052, "learning_rate": 3.4584093140692946e-05, "loss": 0.0316, "step": 32970 }, { "epoch": 0.9252349558142797, "grad_norm": 0.05749201774597168, "learning_rate": 3.457941740309534e-05, "loss": 0.0436, "step": 32980 }, { "epoch": 0.925515500070136, "grad_norm": 0.35107293725013733, "learning_rate": 3.457474166549773e-05, "loss": 0.0357, "step": 32990 }, { "epoch": 0.9257960443259924, "grad_norm": 0.7360305190086365, "learning_rate": 3.457006592790013e-05, "loss": 0.0091, "step": 33000 }, { "epoch": 0.9260765885818488, "grad_norm": 0.9161474108695984, "learning_rate": 3.456539019030252e-05, "loss": 0.0154, "step": 33010 }, { "epoch": 0.9263571328377052, "grad_norm": 0.02092314325273037, "learning_rate": 3.456071445270492e-05, "loss": 0.0117, "step": 33020 }, { "epoch": 0.9266376770935615, "grad_norm": 0.019027473405003548, "learning_rate": 3.455603871510731e-05, "loss": 0.0139, "step": 33030 }, { "epoch": 0.9269182213494179, "grad_norm": 0.030055135488510132, "learning_rate": 3.4551362977509705e-05, "loss": 0.0126, "step": 33040 }, { "epoch": 0.9271987656052743, "grad_norm": 0.6062058210372925, "learning_rate": 3.45466872399121e-05, "loss": 0.018, "step": 33050 }, { "epoch": 0.9274793098611306, "grad_norm": 0.4155551791191101, "learning_rate": 3.454201150231449e-05, "loss": 0.0744, "step": 33060 }, { "epoch": 0.9277598541169869, "grad_norm": 0.24654339253902435, "learning_rate": 3.453733576471689e-05, "loss": 0.0068, "step": 33070 }, { "epoch": 0.9280403983728434, "grad_norm": 0.7765662670135498, "learning_rate": 3.453266002711928e-05, "loss": 0.0299, "step": 33080 }, { "epoch": 0.9283209426286997, "grad_norm": 0.14994202554225922, "learning_rate": 3.452798428952168e-05, "loss": 0.0173, "step": 33090 }, { "epoch": 0.928601486884556, "grad_norm": 0.8562442064285278, "learning_rate": 3.4523308551924063e-05, "loss": 0.0366, "step": 33100 }, { "epoch": 0.9288820311404123, "grad_norm": 0.05200408026576042, "learning_rate": 3.4518632814326463e-05, "loss": 0.0556, "step": 33110 }, { "epoch": 0.9291625753962688, "grad_norm": 1.4735578298568726, "learning_rate": 3.4513957076728857e-05, "loss": 0.0446, "step": 33120 }, { "epoch": 0.9294431196521251, "grad_norm": 0.0689166784286499, "learning_rate": 3.450928133913125e-05, "loss": 0.034, "step": 33130 }, { "epoch": 0.9297236639079814, "grad_norm": 0.050786878913640976, "learning_rate": 3.450460560153364e-05, "loss": 0.0147, "step": 33140 }, { "epoch": 0.9300042081638379, "grad_norm": 0.13700084388256073, "learning_rate": 3.4499929863936036e-05, "loss": 0.023, "step": 33150 }, { "epoch": 0.9302847524196942, "grad_norm": 0.06554930657148361, "learning_rate": 3.4495254126338436e-05, "loss": 0.0481, "step": 33160 }, { "epoch": 0.9305652966755505, "grad_norm": 1.2941927909851074, "learning_rate": 3.449057838874082e-05, "loss": 0.0375, "step": 33170 }, { "epoch": 0.9308458409314069, "grad_norm": 0.1543726921081543, "learning_rate": 3.448590265114322e-05, "loss": 0.0233, "step": 33180 }, { "epoch": 0.9311263851872633, "grad_norm": 0.150149405002594, "learning_rate": 3.448122691354561e-05, "loss": 0.0226, "step": 33190 }, { "epoch": 0.9314069294431196, "grad_norm": 1.7011778354644775, "learning_rate": 3.447655117594801e-05, "loss": 0.0282, "step": 33200 }, { "epoch": 0.931687473698976, "grad_norm": 0.07333791255950928, "learning_rate": 3.44718754383504e-05, "loss": 0.0279, "step": 33210 }, { "epoch": 0.9319680179548324, "grad_norm": 0.6108389496803284, "learning_rate": 3.4467199700752795e-05, "loss": 0.0449, "step": 33220 }, { "epoch": 0.9322485622106887, "grad_norm": 0.03379828482866287, "learning_rate": 3.446252396315519e-05, "loss": 0.0087, "step": 33230 }, { "epoch": 0.9325291064665451, "grad_norm": 0.25742489099502563, "learning_rate": 3.445784822555758e-05, "loss": 0.0307, "step": 33240 }, { "epoch": 0.9328096507224014, "grad_norm": 0.03951835259795189, "learning_rate": 3.445317248795998e-05, "loss": 0.0149, "step": 33250 }, { "epoch": 0.9330901949782578, "grad_norm": 0.2087179720401764, "learning_rate": 3.444849675036237e-05, "loss": 0.0129, "step": 33260 }, { "epoch": 0.9333707392341142, "grad_norm": 0.8732184171676636, "learning_rate": 3.444382101276477e-05, "loss": 0.0558, "step": 33270 }, { "epoch": 0.9336512834899705, "grad_norm": 0.1671009212732315, "learning_rate": 3.443914527516716e-05, "loss": 0.0257, "step": 33280 }, { "epoch": 0.933931827745827, "grad_norm": 0.9922713041305542, "learning_rate": 3.4434469537569554e-05, "loss": 0.0224, "step": 33290 }, { "epoch": 0.9342123720016833, "grad_norm": 0.28043895959854126, "learning_rate": 3.442979379997195e-05, "loss": 0.0311, "step": 33300 }, { "epoch": 0.9344929162575396, "grad_norm": 0.3637234568595886, "learning_rate": 3.442511806237434e-05, "loss": 0.0321, "step": 33310 }, { "epoch": 0.9347734605133959, "grad_norm": 0.028357645496726036, "learning_rate": 3.442044232477673e-05, "loss": 0.0094, "step": 33320 }, { "epoch": 0.9350540047692524, "grad_norm": 0.028301339596509933, "learning_rate": 3.4415766587179126e-05, "loss": 0.0246, "step": 33330 }, { "epoch": 0.9353345490251087, "grad_norm": 0.33280593156814575, "learning_rate": 3.4411090849581526e-05, "loss": 0.0158, "step": 33340 }, { "epoch": 0.935615093280965, "grad_norm": 0.6739158034324646, "learning_rate": 3.440641511198392e-05, "loss": 0.0182, "step": 33350 }, { "epoch": 0.9358956375368215, "grad_norm": 0.09960892796516418, "learning_rate": 3.440173937438631e-05, "loss": 0.0382, "step": 33360 }, { "epoch": 0.9361761817926778, "grad_norm": 0.01251740101724863, "learning_rate": 3.4397063636788706e-05, "loss": 0.0085, "step": 33370 }, { "epoch": 0.9364567260485341, "grad_norm": 0.5382820963859558, "learning_rate": 3.43923878991911e-05, "loss": 0.0499, "step": 33380 }, { "epoch": 0.9367372703043905, "grad_norm": 0.015172847546637058, "learning_rate": 3.438771216159349e-05, "loss": 0.0435, "step": 33390 }, { "epoch": 0.9370178145602469, "grad_norm": 0.17077948153018951, "learning_rate": 3.4383036423995885e-05, "loss": 0.0162, "step": 33400 }, { "epoch": 0.9372983588161032, "grad_norm": 0.2150661051273346, "learning_rate": 3.437836068639828e-05, "loss": 0.0238, "step": 33410 }, { "epoch": 0.9375789030719596, "grad_norm": 0.03462826833128929, "learning_rate": 3.437368494880068e-05, "loss": 0.0372, "step": 33420 }, { "epoch": 0.937859447327816, "grad_norm": 0.15008209645748138, "learning_rate": 3.436900921120307e-05, "loss": 0.0197, "step": 33430 }, { "epoch": 0.9381399915836723, "grad_norm": 0.06205981597304344, "learning_rate": 3.4364333473605465e-05, "loss": 0.0267, "step": 33440 }, { "epoch": 0.9384205358395287, "grad_norm": 0.5469611883163452, "learning_rate": 3.435965773600786e-05, "loss": 0.0329, "step": 33450 }, { "epoch": 0.938701080095385, "grad_norm": 0.031909480690956116, "learning_rate": 3.435498199841025e-05, "loss": 0.0139, "step": 33460 }, { "epoch": 0.9389816243512414, "grad_norm": 0.36626672744750977, "learning_rate": 3.4350306260812644e-05, "loss": 0.067, "step": 33470 }, { "epoch": 0.9392621686070978, "grad_norm": 11.497079849243164, "learning_rate": 3.434563052321504e-05, "loss": 0.0526, "step": 33480 }, { "epoch": 0.9395427128629541, "grad_norm": 6.09824800491333, "learning_rate": 3.434095478561744e-05, "loss": 0.0184, "step": 33490 }, { "epoch": 0.9398232571188105, "grad_norm": 0.0556374229490757, "learning_rate": 3.4336279048019824e-05, "loss": 0.0154, "step": 33500 }, { "epoch": 0.9401038013746669, "grad_norm": 0.1053771898150444, "learning_rate": 3.4331603310422223e-05, "loss": 0.0061, "step": 33510 }, { "epoch": 0.9403843456305232, "grad_norm": 0.16954416036605835, "learning_rate": 3.432692757282461e-05, "loss": 0.0285, "step": 33520 }, { "epoch": 0.9406648898863795, "grad_norm": 0.04177188500761986, "learning_rate": 3.432225183522701e-05, "loss": 0.0479, "step": 33530 }, { "epoch": 0.940945434142236, "grad_norm": 0.03310291841626167, "learning_rate": 3.43175760976294e-05, "loss": 0.0385, "step": 33540 }, { "epoch": 0.9412259783980923, "grad_norm": 1.039929986000061, "learning_rate": 3.4312900360031796e-05, "loss": 0.016, "step": 33550 }, { "epoch": 0.9415065226539486, "grad_norm": 0.30769261717796326, "learning_rate": 3.4308224622434196e-05, "loss": 0.0408, "step": 33560 }, { "epoch": 0.9417870669098051, "grad_norm": 0.0625743716955185, "learning_rate": 3.430354888483658e-05, "loss": 0.0254, "step": 33570 }, { "epoch": 0.9420676111656614, "grad_norm": 0.04679158702492714, "learning_rate": 3.429887314723898e-05, "loss": 0.0264, "step": 33580 }, { "epoch": 0.9423481554215177, "grad_norm": 0.12351346760988235, "learning_rate": 3.429419740964137e-05, "loss": 0.0149, "step": 33590 }, { "epoch": 0.9426286996773741, "grad_norm": 5.036203384399414, "learning_rate": 3.428952167204377e-05, "loss": 0.0443, "step": 33600 }, { "epoch": 0.9429092439332305, "grad_norm": 0.12263604253530502, "learning_rate": 3.4284845934446155e-05, "loss": 0.0354, "step": 33610 }, { "epoch": 0.9431897881890868, "grad_norm": 0.22024713456630707, "learning_rate": 3.4280170196848555e-05, "loss": 0.0178, "step": 33620 }, { "epoch": 0.9434703324449432, "grad_norm": 0.6868940591812134, "learning_rate": 3.427549445925095e-05, "loss": 0.0201, "step": 33630 }, { "epoch": 0.9437508767007996, "grad_norm": 0.049505606293678284, "learning_rate": 3.427081872165334e-05, "loss": 0.0233, "step": 33640 }, { "epoch": 0.9440314209566559, "grad_norm": 0.2380741387605667, "learning_rate": 3.426614298405574e-05, "loss": 0.068, "step": 33650 }, { "epoch": 0.9443119652125123, "grad_norm": 0.7618623971939087, "learning_rate": 3.426146724645813e-05, "loss": 0.0143, "step": 33660 }, { "epoch": 0.9445925094683686, "grad_norm": 0.7121806144714355, "learning_rate": 3.425679150886053e-05, "loss": 0.0345, "step": 33670 }, { "epoch": 0.944873053724225, "grad_norm": 0.05781310796737671, "learning_rate": 3.4252115771262914e-05, "loss": 0.0226, "step": 33680 }, { "epoch": 0.9451535979800814, "grad_norm": 0.4886155128479004, "learning_rate": 3.4247440033665314e-05, "loss": 0.0253, "step": 33690 }, { "epoch": 0.9454341422359377, "grad_norm": 0.09074469655752182, "learning_rate": 3.424276429606771e-05, "loss": 0.023, "step": 33700 }, { "epoch": 0.9457146864917941, "grad_norm": 0.05522184446454048, "learning_rate": 3.42380885584701e-05, "loss": 0.063, "step": 33710 }, { "epoch": 0.9459952307476505, "grad_norm": 0.40347912907600403, "learning_rate": 3.423341282087249e-05, "loss": 0.018, "step": 33720 }, { "epoch": 0.9462757750035068, "grad_norm": 0.03851460665464401, "learning_rate": 3.4228737083274886e-05, "loss": 0.0126, "step": 33730 }, { "epoch": 0.9465563192593631, "grad_norm": 0.05886319279670715, "learning_rate": 3.422406134567728e-05, "loss": 0.0363, "step": 33740 }, { "epoch": 0.9468368635152196, "grad_norm": 0.0707797035574913, "learning_rate": 3.421938560807967e-05, "loss": 0.0148, "step": 33750 }, { "epoch": 0.9471174077710759, "grad_norm": 0.07891687750816345, "learning_rate": 3.421470987048207e-05, "loss": 0.0354, "step": 33760 }, { "epoch": 0.9473979520269322, "grad_norm": 0.2962004840373993, "learning_rate": 3.4210034132884466e-05, "loss": 0.0331, "step": 33770 }, { "epoch": 0.9476784962827887, "grad_norm": 0.03603608161211014, "learning_rate": 3.420535839528686e-05, "loss": 0.0062, "step": 33780 }, { "epoch": 0.947959040538645, "grad_norm": 1.1530264616012573, "learning_rate": 3.420068265768925e-05, "loss": 0.0422, "step": 33790 }, { "epoch": 0.9482395847945013, "grad_norm": 0.06375939399003983, "learning_rate": 3.4196006920091645e-05, "loss": 0.0243, "step": 33800 }, { "epoch": 0.9485201290503577, "grad_norm": 0.03900925815105438, "learning_rate": 3.419133118249404e-05, "loss": 0.0212, "step": 33810 }, { "epoch": 0.9488006733062141, "grad_norm": 0.8729046583175659, "learning_rate": 3.418665544489643e-05, "loss": 0.0404, "step": 33820 }, { "epoch": 0.9490812175620704, "grad_norm": 3.1595542430877686, "learning_rate": 3.4181979707298825e-05, "loss": 0.0209, "step": 33830 }, { "epoch": 0.9493617618179268, "grad_norm": 0.05531243979930878, "learning_rate": 3.4177303969701225e-05, "loss": 0.0153, "step": 33840 }, { "epoch": 0.9496423060737832, "grad_norm": 0.10202372819185257, "learning_rate": 3.417262823210362e-05, "loss": 0.0384, "step": 33850 }, { "epoch": 0.9499228503296395, "grad_norm": 0.29137542843818665, "learning_rate": 3.416795249450601e-05, "loss": 0.0192, "step": 33860 }, { "epoch": 0.9502033945854959, "grad_norm": 0.5616990327835083, "learning_rate": 3.4163276756908404e-05, "loss": 0.0323, "step": 33870 }, { "epoch": 0.9504839388413522, "grad_norm": 0.17313171923160553, "learning_rate": 3.41586010193108e-05, "loss": 0.0381, "step": 33880 }, { "epoch": 0.9507644830972086, "grad_norm": 0.3960915803909302, "learning_rate": 3.415392528171319e-05, "loss": 0.0397, "step": 33890 }, { "epoch": 0.951045027353065, "grad_norm": 0.04947710037231445, "learning_rate": 3.4149249544115584e-05, "loss": 0.0276, "step": 33900 }, { "epoch": 0.9513255716089213, "grad_norm": 0.6679660081863403, "learning_rate": 3.4144573806517984e-05, "loss": 0.0354, "step": 33910 }, { "epoch": 0.9516061158647777, "grad_norm": 0.337552934885025, "learning_rate": 3.413989806892037e-05, "loss": 0.0341, "step": 33920 }, { "epoch": 0.951886660120634, "grad_norm": 0.25453707575798035, "learning_rate": 3.413522233132277e-05, "loss": 0.0373, "step": 33930 }, { "epoch": 0.9521672043764904, "grad_norm": 0.08267179876565933, "learning_rate": 3.413054659372516e-05, "loss": 0.0242, "step": 33940 }, { "epoch": 0.9524477486323467, "grad_norm": 0.04952666163444519, "learning_rate": 3.4125870856127556e-05, "loss": 0.0125, "step": 33950 }, { "epoch": 0.9527282928882032, "grad_norm": 0.08764436095952988, "learning_rate": 3.412119511852995e-05, "loss": 0.0201, "step": 33960 }, { "epoch": 0.9530088371440595, "grad_norm": 0.33322641253471375, "learning_rate": 3.411651938093234e-05, "loss": 0.0494, "step": 33970 }, { "epoch": 0.9532893813999158, "grad_norm": 0.08774102479219437, "learning_rate": 3.411184364333474e-05, "loss": 0.0127, "step": 33980 }, { "epoch": 0.9535699256557721, "grad_norm": 0.016125008463859558, "learning_rate": 3.410716790573713e-05, "loss": 0.0188, "step": 33990 }, { "epoch": 0.9538504699116286, "grad_norm": 0.9714086055755615, "learning_rate": 3.410249216813953e-05, "loss": 0.0216, "step": 34000 }, { "epoch": 0.9541310141674849, "grad_norm": 0.13081766664981842, "learning_rate": 3.4097816430541915e-05, "loss": 0.0066, "step": 34010 }, { "epoch": 0.9544115584233412, "grad_norm": 1.796088457107544, "learning_rate": 3.4093140692944315e-05, "loss": 0.0131, "step": 34020 }, { "epoch": 0.9546921026791977, "grad_norm": 0.03826170042157173, "learning_rate": 3.408846495534671e-05, "loss": 0.0191, "step": 34030 }, { "epoch": 0.954972646935054, "grad_norm": 0.740395188331604, "learning_rate": 3.40837892177491e-05, "loss": 0.0312, "step": 34040 }, { "epoch": 0.9552531911909103, "grad_norm": 0.4121719002723694, "learning_rate": 3.4079113480151495e-05, "loss": 0.0159, "step": 34050 }, { "epoch": 0.9555337354467667, "grad_norm": 0.355532169342041, "learning_rate": 3.407443774255389e-05, "loss": 0.0155, "step": 34060 }, { "epoch": 0.9558142797026231, "grad_norm": 0.14085429906845093, "learning_rate": 3.406976200495629e-05, "loss": 0.057, "step": 34070 }, { "epoch": 0.9560948239584794, "grad_norm": 0.7708171606063843, "learning_rate": 3.4065086267358674e-05, "loss": 0.0218, "step": 34080 }, { "epoch": 0.9563753682143358, "grad_norm": 0.10582605749368668, "learning_rate": 3.4060410529761074e-05, "loss": 0.0121, "step": 34090 }, { "epoch": 0.9566559124701922, "grad_norm": 0.24428705871105194, "learning_rate": 3.405573479216346e-05, "loss": 0.0181, "step": 34100 }, { "epoch": 0.9569364567260485, "grad_norm": 0.044718410819768906, "learning_rate": 3.405105905456586e-05, "loss": 0.008, "step": 34110 }, { "epoch": 0.9572170009819049, "grad_norm": 0.012387290596961975, "learning_rate": 3.4046383316968253e-05, "loss": 0.0458, "step": 34120 }, { "epoch": 0.9574975452377612, "grad_norm": 0.22346779704093933, "learning_rate": 3.4041707579370647e-05, "loss": 0.0028, "step": 34130 }, { "epoch": 0.9577780894936176, "grad_norm": 0.4236675500869751, "learning_rate": 3.403703184177304e-05, "loss": 0.0292, "step": 34140 }, { "epoch": 0.958058633749474, "grad_norm": 0.014831059612333775, "learning_rate": 3.403235610417543e-05, "loss": 0.0056, "step": 34150 }, { "epoch": 0.9583391780053303, "grad_norm": 0.10833359509706497, "learning_rate": 3.402768036657783e-05, "loss": 0.0253, "step": 34160 }, { "epoch": 0.9586197222611867, "grad_norm": 0.18856598436832428, "learning_rate": 3.402300462898022e-05, "loss": 0.0222, "step": 34170 }, { "epoch": 0.9589002665170431, "grad_norm": 0.9978188276290894, "learning_rate": 3.401832889138262e-05, "loss": 0.0177, "step": 34180 }, { "epoch": 0.9591808107728994, "grad_norm": 0.018720494583249092, "learning_rate": 3.401365315378501e-05, "loss": 0.0154, "step": 34190 }, { "epoch": 0.9594613550287557, "grad_norm": 0.017227759584784508, "learning_rate": 3.4008977416187405e-05, "loss": 0.0225, "step": 34200 }, { "epoch": 0.9597418992846122, "grad_norm": 0.16090625524520874, "learning_rate": 3.40043016785898e-05, "loss": 0.0502, "step": 34210 }, { "epoch": 0.9600224435404685, "grad_norm": 0.2116394191980362, "learning_rate": 3.399962594099219e-05, "loss": 0.0069, "step": 34220 }, { "epoch": 0.9603029877963248, "grad_norm": 0.27626413106918335, "learning_rate": 3.3994950203394585e-05, "loss": 0.0174, "step": 34230 }, { "epoch": 0.9605835320521813, "grad_norm": 0.03211115300655365, "learning_rate": 3.399027446579698e-05, "loss": 0.0278, "step": 34240 }, { "epoch": 0.9608640763080376, "grad_norm": 0.04644925519824028, "learning_rate": 3.398559872819938e-05, "loss": 0.0654, "step": 34250 }, { "epoch": 0.9611446205638939, "grad_norm": 1.3107681274414062, "learning_rate": 3.398092299060177e-05, "loss": 0.0378, "step": 34260 }, { "epoch": 0.9614251648197503, "grad_norm": 0.6460570693016052, "learning_rate": 3.3976247253004164e-05, "loss": 0.0209, "step": 34270 }, { "epoch": 0.9617057090756067, "grad_norm": 0.30949804186820984, "learning_rate": 3.397157151540656e-05, "loss": 0.0154, "step": 34280 }, { "epoch": 0.961986253331463, "grad_norm": 0.4250471293926239, "learning_rate": 3.396689577780895e-05, "loss": 0.0428, "step": 34290 }, { "epoch": 0.9622667975873194, "grad_norm": 0.28180640935897827, "learning_rate": 3.3962220040211344e-05, "loss": 0.0184, "step": 34300 }, { "epoch": 0.9625473418431758, "grad_norm": 0.03152800723910332, "learning_rate": 3.395754430261374e-05, "loss": 0.0185, "step": 34310 }, { "epoch": 0.9628278860990321, "grad_norm": 0.021961728110909462, "learning_rate": 3.395286856501613e-05, "loss": 0.0183, "step": 34320 }, { "epoch": 0.9631084303548885, "grad_norm": 0.032754674553871155, "learning_rate": 3.394819282741853e-05, "loss": 0.0089, "step": 34330 }, { "epoch": 0.9633889746107448, "grad_norm": 0.2530365586280823, "learning_rate": 3.394351708982092e-05, "loss": 0.0551, "step": 34340 }, { "epoch": 0.9636695188666012, "grad_norm": 2.257117986679077, "learning_rate": 3.3938841352223316e-05, "loss": 0.0446, "step": 34350 }, { "epoch": 0.9639500631224576, "grad_norm": 0.19267655909061432, "learning_rate": 3.393416561462571e-05, "loss": 0.041, "step": 34360 }, { "epoch": 0.9642306073783139, "grad_norm": 0.4446794092655182, "learning_rate": 3.39294898770281e-05, "loss": 0.05, "step": 34370 }, { "epoch": 0.9645111516341703, "grad_norm": 2.847675085067749, "learning_rate": 3.3924814139430496e-05, "loss": 0.0304, "step": 34380 }, { "epoch": 0.9647916958900267, "grad_norm": 1.7412819862365723, "learning_rate": 3.392013840183289e-05, "loss": 0.0343, "step": 34390 }, { "epoch": 0.965072240145883, "grad_norm": 0.16333647072315216, "learning_rate": 3.391546266423529e-05, "loss": 0.0412, "step": 34400 }, { "epoch": 0.9653527844017393, "grad_norm": 0.1429060995578766, "learning_rate": 3.3910786926637675e-05, "loss": 0.0152, "step": 34410 }, { "epoch": 0.9656333286575958, "grad_norm": 0.09357337653636932, "learning_rate": 3.3906111189040075e-05, "loss": 0.0152, "step": 34420 }, { "epoch": 0.9659138729134521, "grad_norm": 1.0052242279052734, "learning_rate": 3.390143545144246e-05, "loss": 0.0214, "step": 34430 }, { "epoch": 0.9661944171693084, "grad_norm": 0.45452460646629333, "learning_rate": 3.389675971384486e-05, "loss": 0.032, "step": 34440 }, { "epoch": 0.9664749614251649, "grad_norm": 0.056603286415338516, "learning_rate": 3.3892083976247255e-05, "loss": 0.038, "step": 34450 }, { "epoch": 0.9667555056810212, "grad_norm": 5.291423797607422, "learning_rate": 3.388740823864965e-05, "loss": 0.0182, "step": 34460 }, { "epoch": 0.9670360499368775, "grad_norm": 0.013767588883638382, "learning_rate": 3.388273250105205e-05, "loss": 0.031, "step": 34470 }, { "epoch": 0.9673165941927339, "grad_norm": 2.2667112350463867, "learning_rate": 3.3878056763454434e-05, "loss": 0.0202, "step": 34480 }, { "epoch": 0.9675971384485903, "grad_norm": 0.7771583795547485, "learning_rate": 3.3873381025856834e-05, "loss": 0.0147, "step": 34490 }, { "epoch": 0.9678776827044466, "grad_norm": 0.9115377068519592, "learning_rate": 3.386870528825922e-05, "loss": 0.0147, "step": 34500 }, { "epoch": 0.968158226960303, "grad_norm": 0.3541896641254425, "learning_rate": 3.386402955066162e-05, "loss": 0.0147, "step": 34510 }, { "epoch": 0.9684387712161594, "grad_norm": 0.7870844006538391, "learning_rate": 3.385935381306401e-05, "loss": 0.0263, "step": 34520 }, { "epoch": 0.9687193154720157, "grad_norm": 0.6929520964622498, "learning_rate": 3.385467807546641e-05, "loss": 0.0242, "step": 34530 }, { "epoch": 0.9689998597278721, "grad_norm": 0.1713794320821762, "learning_rate": 3.38500023378688e-05, "loss": 0.0117, "step": 34540 }, { "epoch": 0.9692804039837284, "grad_norm": 0.11879829317331314, "learning_rate": 3.384532660027119e-05, "loss": 0.0263, "step": 34550 }, { "epoch": 0.9695609482395848, "grad_norm": 0.10248345136642456, "learning_rate": 3.384065086267359e-05, "loss": 0.0213, "step": 34560 }, { "epoch": 0.9698414924954412, "grad_norm": 0.017601288855075836, "learning_rate": 3.383597512507598e-05, "loss": 0.0174, "step": 34570 }, { "epoch": 0.9701220367512975, "grad_norm": 3.527916669845581, "learning_rate": 3.383129938747838e-05, "loss": 0.0244, "step": 34580 }, { "epoch": 0.9704025810071539, "grad_norm": 0.24460665881633759, "learning_rate": 3.3826623649880766e-05, "loss": 0.0205, "step": 34590 }, { "epoch": 0.9706831252630103, "grad_norm": 0.39178892970085144, "learning_rate": 3.3821947912283166e-05, "loss": 0.0781, "step": 34600 }, { "epoch": 0.9709636695188666, "grad_norm": 0.03423295542597771, "learning_rate": 3.381727217468556e-05, "loss": 0.0077, "step": 34610 }, { "epoch": 0.9712442137747229, "grad_norm": 0.19659049808979034, "learning_rate": 3.381259643708795e-05, "loss": 0.0397, "step": 34620 }, { "epoch": 0.9715247580305794, "grad_norm": 0.198961079120636, "learning_rate": 3.3807920699490345e-05, "loss": 0.0064, "step": 34630 }, { "epoch": 0.9718053022864357, "grad_norm": 0.023699184879660606, "learning_rate": 3.380324496189274e-05, "loss": 0.0355, "step": 34640 }, { "epoch": 0.972085846542292, "grad_norm": 0.11193465441465378, "learning_rate": 3.379856922429513e-05, "loss": 0.0238, "step": 34650 }, { "epoch": 0.9723663907981485, "grad_norm": 0.054319269955158234, "learning_rate": 3.3793893486697524e-05, "loss": 0.0527, "step": 34660 }, { "epoch": 0.9726469350540048, "grad_norm": 0.06779482215642929, "learning_rate": 3.3789217749099924e-05, "loss": 0.0279, "step": 34670 }, { "epoch": 0.9729274793098611, "grad_norm": 0.21323594450950623, "learning_rate": 3.378454201150232e-05, "loss": 0.0163, "step": 34680 }, { "epoch": 0.9732080235657175, "grad_norm": 5.421228885650635, "learning_rate": 3.377986627390471e-05, "loss": 0.0392, "step": 34690 }, { "epoch": 0.9734885678215739, "grad_norm": 0.17626135051250458, "learning_rate": 3.3775190536307104e-05, "loss": 0.0335, "step": 34700 }, { "epoch": 0.9737691120774302, "grad_norm": 0.264110803604126, "learning_rate": 3.37705147987095e-05, "loss": 0.0085, "step": 34710 }, { "epoch": 0.9740496563332866, "grad_norm": 0.6292673945426941, "learning_rate": 3.376583906111189e-05, "loss": 0.0455, "step": 34720 }, { "epoch": 0.974330200589143, "grad_norm": 0.13998596370220184, "learning_rate": 3.376116332351428e-05, "loss": 0.0461, "step": 34730 }, { "epoch": 0.9746107448449993, "grad_norm": 0.13115090131759644, "learning_rate": 3.3756487585916676e-05, "loss": 0.0222, "step": 34740 }, { "epoch": 0.9748912891008557, "grad_norm": 0.03715396299958229, "learning_rate": 3.3751811848319076e-05, "loss": 0.0455, "step": 34750 }, { "epoch": 0.975171833356712, "grad_norm": 0.0656178817152977, "learning_rate": 3.374713611072147e-05, "loss": 0.0347, "step": 34760 }, { "epoch": 0.9754523776125684, "grad_norm": 1.389703631401062, "learning_rate": 3.374246037312386e-05, "loss": 0.0236, "step": 34770 }, { "epoch": 0.9757329218684248, "grad_norm": 0.4597412347793579, "learning_rate": 3.3737784635526256e-05, "loss": 0.0244, "step": 34780 }, { "epoch": 0.9760134661242811, "grad_norm": 0.034126605838537216, "learning_rate": 3.373310889792865e-05, "loss": 0.0168, "step": 34790 }, { "epoch": 0.9762940103801374, "grad_norm": 0.013827863149344921, "learning_rate": 3.372843316033105e-05, "loss": 0.0258, "step": 34800 }, { "epoch": 0.9765745546359939, "grad_norm": 0.022017043083906174, "learning_rate": 3.3723757422733435e-05, "loss": 0.0261, "step": 34810 }, { "epoch": 0.9768550988918502, "grad_norm": 0.02784290909767151, "learning_rate": 3.3719081685135835e-05, "loss": 0.0357, "step": 34820 }, { "epoch": 0.9771356431477065, "grad_norm": 2.9395060539245605, "learning_rate": 3.371440594753822e-05, "loss": 0.0503, "step": 34830 }, { "epoch": 0.977416187403563, "grad_norm": 0.24556654691696167, "learning_rate": 3.370973020994062e-05, "loss": 0.0286, "step": 34840 }, { "epoch": 0.9776967316594193, "grad_norm": 0.34573644399642944, "learning_rate": 3.3705054472343015e-05, "loss": 0.0237, "step": 34850 }, { "epoch": 0.9779772759152756, "grad_norm": 2.210284471511841, "learning_rate": 3.370037873474541e-05, "loss": 0.0318, "step": 34860 }, { "epoch": 0.9782578201711319, "grad_norm": 0.3733750283718109, "learning_rate": 3.369570299714781e-05, "loss": 0.0067, "step": 34870 }, { "epoch": 0.9785383644269884, "grad_norm": 0.7097735404968262, "learning_rate": 3.3691027259550194e-05, "loss": 0.033, "step": 34880 }, { "epoch": 0.9788189086828447, "grad_norm": 0.03433864563703537, "learning_rate": 3.3686351521952594e-05, "loss": 0.0277, "step": 34890 }, { "epoch": 0.979099452938701, "grad_norm": 0.058461569249629974, "learning_rate": 3.368167578435498e-05, "loss": 0.0398, "step": 34900 }, { "epoch": 0.9793799971945575, "grad_norm": 0.5176515579223633, "learning_rate": 3.367700004675738e-05, "loss": 0.0458, "step": 34910 }, { "epoch": 0.9796605414504138, "grad_norm": 0.13194985687732697, "learning_rate": 3.367232430915977e-05, "loss": 0.0235, "step": 34920 }, { "epoch": 0.9799410857062701, "grad_norm": 1.3638771772384644, "learning_rate": 3.366764857156217e-05, "loss": 0.0263, "step": 34930 }, { "epoch": 0.9802216299621265, "grad_norm": 0.5802273750305176, "learning_rate": 3.366297283396456e-05, "loss": 0.032, "step": 34940 }, { "epoch": 0.9805021742179829, "grad_norm": 0.1335442215204239, "learning_rate": 3.365829709636695e-05, "loss": 0.0138, "step": 34950 }, { "epoch": 0.9807827184738392, "grad_norm": 0.46739864349365234, "learning_rate": 3.3653621358769346e-05, "loss": 0.0328, "step": 34960 }, { "epoch": 0.9810632627296956, "grad_norm": 0.06993871182203293, "learning_rate": 3.364894562117174e-05, "loss": 0.009, "step": 34970 }, { "epoch": 0.981343806985552, "grad_norm": 0.07580536603927612, "learning_rate": 3.364426988357414e-05, "loss": 0.0314, "step": 34980 }, { "epoch": 0.9816243512414083, "grad_norm": 0.6694544553756714, "learning_rate": 3.3639594145976526e-05, "loss": 0.0278, "step": 34990 }, { "epoch": 0.9819048954972647, "grad_norm": 0.5129803419113159, "learning_rate": 3.3634918408378926e-05, "loss": 0.0258, "step": 35000 }, { "epoch": 0.982185439753121, "grad_norm": 0.2933781147003174, "learning_rate": 3.363024267078132e-05, "loss": 0.0375, "step": 35010 }, { "epoch": 0.9824659840089774, "grad_norm": 3.1541974544525146, "learning_rate": 3.362556693318371e-05, "loss": 0.031, "step": 35020 }, { "epoch": 0.9827465282648338, "grad_norm": 0.1123121976852417, "learning_rate": 3.3620891195586105e-05, "loss": 0.027, "step": 35030 }, { "epoch": 0.9830270725206901, "grad_norm": 0.2259104698896408, "learning_rate": 3.36162154579885e-05, "loss": 0.0492, "step": 35040 }, { "epoch": 0.9833076167765465, "grad_norm": 2.536456823348999, "learning_rate": 3.361153972039089e-05, "loss": 0.0369, "step": 35050 }, { "epoch": 0.9835881610324029, "grad_norm": 0.7057671546936035, "learning_rate": 3.3606863982793285e-05, "loss": 0.041, "step": 35060 }, { "epoch": 0.9838687052882592, "grad_norm": 1.4222304821014404, "learning_rate": 3.3602188245195684e-05, "loss": 0.0303, "step": 35070 }, { "epoch": 0.9841492495441155, "grad_norm": 0.1870802342891693, "learning_rate": 3.359751250759808e-05, "loss": 0.0292, "step": 35080 }, { "epoch": 0.984429793799972, "grad_norm": 0.32165685296058655, "learning_rate": 3.359283677000047e-05, "loss": 0.0353, "step": 35090 }, { "epoch": 0.9847103380558283, "grad_norm": 0.23240286111831665, "learning_rate": 3.3588161032402864e-05, "loss": 0.0189, "step": 35100 }, { "epoch": 0.9849908823116846, "grad_norm": 0.0491907112300396, "learning_rate": 3.358348529480526e-05, "loss": 0.0133, "step": 35110 }, { "epoch": 0.9852714265675411, "grad_norm": 0.14754743874073029, "learning_rate": 3.357880955720765e-05, "loss": 0.0302, "step": 35120 }, { "epoch": 0.9855519708233974, "grad_norm": 0.03708450123667717, "learning_rate": 3.3574133819610043e-05, "loss": 0.0386, "step": 35130 }, { "epoch": 0.9858325150792537, "grad_norm": 1.4569333791732788, "learning_rate": 3.3569458082012437e-05, "loss": 0.0072, "step": 35140 }, { "epoch": 0.9861130593351101, "grad_norm": 0.6828127503395081, "learning_rate": 3.3564782344414836e-05, "loss": 0.0119, "step": 35150 }, { "epoch": 0.9863936035909665, "grad_norm": 0.3206544518470764, "learning_rate": 3.356010660681723e-05, "loss": 0.0301, "step": 35160 }, { "epoch": 0.9866741478468228, "grad_norm": 0.7284712195396423, "learning_rate": 3.355543086921962e-05, "loss": 0.0572, "step": 35170 }, { "epoch": 0.9869546921026792, "grad_norm": 0.2257654219865799, "learning_rate": 3.3550755131622016e-05, "loss": 0.0226, "step": 35180 }, { "epoch": 0.9872352363585356, "grad_norm": 0.05485226586461067, "learning_rate": 3.354607939402441e-05, "loss": 0.015, "step": 35190 }, { "epoch": 0.9875157806143919, "grad_norm": 0.042265940457582474, "learning_rate": 3.35414036564268e-05, "loss": 0.0258, "step": 35200 }, { "epoch": 0.9877963248702483, "grad_norm": 0.22520388662815094, "learning_rate": 3.3536727918829195e-05, "loss": 0.0216, "step": 35210 }, { "epoch": 0.9880768691261046, "grad_norm": 0.9784408807754517, "learning_rate": 3.3532052181231595e-05, "loss": 0.033, "step": 35220 }, { "epoch": 0.988357413381961, "grad_norm": 0.45057907700538635, "learning_rate": 3.352737644363398e-05, "loss": 0.0255, "step": 35230 }, { "epoch": 0.9886379576378174, "grad_norm": 0.06377924978733063, "learning_rate": 3.352270070603638e-05, "loss": 0.0151, "step": 35240 }, { "epoch": 0.9889185018936737, "grad_norm": 0.4642847776412964, "learning_rate": 3.3518024968438775e-05, "loss": 0.0249, "step": 35250 }, { "epoch": 0.9891990461495301, "grad_norm": 0.1722896248102188, "learning_rate": 3.351334923084117e-05, "loss": 0.0197, "step": 35260 }, { "epoch": 0.9894795904053865, "grad_norm": 0.7765779495239258, "learning_rate": 3.350867349324356e-05, "loss": 0.0321, "step": 35270 }, { "epoch": 0.9897601346612428, "grad_norm": 0.08128935843706131, "learning_rate": 3.3503997755645954e-05, "loss": 0.021, "step": 35280 }, { "epoch": 0.9900406789170991, "grad_norm": 0.5266945362091064, "learning_rate": 3.3499322018048354e-05, "loss": 0.0179, "step": 35290 }, { "epoch": 0.9903212231729556, "grad_norm": 0.26756229996681213, "learning_rate": 3.349464628045074e-05, "loss": 0.019, "step": 35300 }, { "epoch": 0.9906017674288119, "grad_norm": 0.017659109085798264, "learning_rate": 3.348997054285314e-05, "loss": 0.0199, "step": 35310 }, { "epoch": 0.9908823116846682, "grad_norm": 0.2894574999809265, "learning_rate": 3.348529480525553e-05, "loss": 0.0247, "step": 35320 }, { "epoch": 0.9911628559405247, "grad_norm": 0.137266606092453, "learning_rate": 3.348061906765793e-05, "loss": 0.0353, "step": 35330 }, { "epoch": 0.991443400196381, "grad_norm": 0.257712721824646, "learning_rate": 3.347594333006031e-05, "loss": 0.0134, "step": 35340 }, { "epoch": 0.9917239444522373, "grad_norm": 0.10794106870889664, "learning_rate": 3.347126759246271e-05, "loss": 0.053, "step": 35350 }, { "epoch": 0.9920044887080937, "grad_norm": 0.12930890917778015, "learning_rate": 3.3466591854865106e-05, "loss": 0.0316, "step": 35360 }, { "epoch": 0.9922850329639501, "grad_norm": 0.3948620557785034, "learning_rate": 3.34619161172675e-05, "loss": 0.0302, "step": 35370 }, { "epoch": 0.9925655772198064, "grad_norm": 0.05729019641876221, "learning_rate": 3.34572403796699e-05, "loss": 0.0326, "step": 35380 }, { "epoch": 0.9928461214756628, "grad_norm": 0.24863681197166443, "learning_rate": 3.3452564642072286e-05, "loss": 0.0206, "step": 35390 }, { "epoch": 0.9931266657315192, "grad_norm": 0.41474035382270813, "learning_rate": 3.3447888904474686e-05, "loss": 0.0272, "step": 35400 }, { "epoch": 0.9934072099873755, "grad_norm": 0.3796822130680084, "learning_rate": 3.344321316687707e-05, "loss": 0.0229, "step": 35410 }, { "epoch": 0.9936877542432319, "grad_norm": 0.16763517260551453, "learning_rate": 3.343853742927947e-05, "loss": 0.0205, "step": 35420 }, { "epoch": 0.9939682984990882, "grad_norm": 0.020077228546142578, "learning_rate": 3.3433861691681865e-05, "loss": 0.0549, "step": 35430 }, { "epoch": 0.9942488427549446, "grad_norm": 0.13925841450691223, "learning_rate": 3.342918595408426e-05, "loss": 0.0265, "step": 35440 }, { "epoch": 0.994529387010801, "grad_norm": 0.17116087675094604, "learning_rate": 3.342451021648665e-05, "loss": 0.0153, "step": 35450 }, { "epoch": 0.9948099312666573, "grad_norm": 0.07196842133998871, "learning_rate": 3.3419834478889045e-05, "loss": 0.0063, "step": 35460 }, { "epoch": 0.9950904755225137, "grad_norm": 2.867699146270752, "learning_rate": 3.3415158741291445e-05, "loss": 0.0218, "step": 35470 }, { "epoch": 0.9953710197783701, "grad_norm": 0.019605513662099838, "learning_rate": 3.341048300369383e-05, "loss": 0.0374, "step": 35480 }, { "epoch": 0.9956515640342264, "grad_norm": 0.20946018397808075, "learning_rate": 3.340580726609623e-05, "loss": 0.0165, "step": 35490 }, { "epoch": 0.9959321082900827, "grad_norm": 0.2613699436187744, "learning_rate": 3.3401131528498624e-05, "loss": 0.0107, "step": 35500 }, { "epoch": 0.9962126525459392, "grad_norm": 0.3192771077156067, "learning_rate": 3.339645579090102e-05, "loss": 0.0271, "step": 35510 }, { "epoch": 0.9964931968017955, "grad_norm": 0.0848178043961525, "learning_rate": 3.339178005330341e-05, "loss": 0.0147, "step": 35520 }, { "epoch": 0.9967737410576518, "grad_norm": 0.6128907203674316, "learning_rate": 3.3387104315705803e-05, "loss": 0.0355, "step": 35530 }, { "epoch": 0.9970542853135083, "grad_norm": 0.2949044704437256, "learning_rate": 3.33824285781082e-05, "loss": 0.0172, "step": 35540 }, { "epoch": 0.9973348295693646, "grad_norm": 0.10801045596599579, "learning_rate": 3.337775284051059e-05, "loss": 0.0429, "step": 35550 }, { "epoch": 0.9976153738252209, "grad_norm": 0.10415435582399368, "learning_rate": 3.337307710291298e-05, "loss": 0.0204, "step": 35560 }, { "epoch": 0.9978959180810772, "grad_norm": 0.26999589800834656, "learning_rate": 3.336840136531538e-05, "loss": 0.019, "step": 35570 }, { "epoch": 0.9981764623369337, "grad_norm": 0.23641523718833923, "learning_rate": 3.3363725627717776e-05, "loss": 0.046, "step": 35580 }, { "epoch": 0.99845700659279, "grad_norm": 0.07253163307905197, "learning_rate": 3.335904989012017e-05, "loss": 0.0211, "step": 35590 }, { "epoch": 0.9987375508486463, "grad_norm": 0.14435122907161713, "learning_rate": 3.335437415252256e-05, "loss": 0.0264, "step": 35600 }, { "epoch": 0.9990180951045028, "grad_norm": 0.4724274277687073, "learning_rate": 3.3349698414924955e-05, "loss": 0.0372, "step": 35610 }, { "epoch": 0.9992986393603591, "grad_norm": 0.9575018286705017, "learning_rate": 3.334502267732735e-05, "loss": 0.0184, "step": 35620 }, { "epoch": 0.9995791836162154, "grad_norm": 0.2887406051158905, "learning_rate": 3.334034693972974e-05, "loss": 0.0222, "step": 35630 }, { "epoch": 0.9998597278720718, "grad_norm": 0.2981266677379608, "learning_rate": 3.333567120213214e-05, "loss": 0.0238, "step": 35640 }, { "epoch": 1.0, "eval_f1": 0.9928904343760501, "eval_loss": 0.02610321342945099, "eval_precision": 0.9925317619199701, "eval_recall": 0.9932493661536782, "eval_runtime": 361.5828, "eval_samples_per_second": 675.975, "eval_steps_per_second": 42.25, "step": 35645 }, { "epoch": 1.0001402721279282, "grad_norm": 0.1421181857585907, "learning_rate": 3.333099546453453e-05, "loss": 0.0819, "step": 35650 }, { "epoch": 1.0004208163837844, "grad_norm": 0.10504051297903061, "learning_rate": 3.332631972693693e-05, "loss": 0.0209, "step": 35660 }, { "epoch": 1.0007013606396409, "grad_norm": 0.1952250748872757, "learning_rate": 3.332164398933932e-05, "loss": 0.02, "step": 35670 }, { "epoch": 1.0009819048954973, "grad_norm": 1.495906949043274, "learning_rate": 3.3316968251741714e-05, "loss": 0.0167, "step": 35680 }, { "epoch": 1.0012624491513535, "grad_norm": 0.31985142827033997, "learning_rate": 3.331229251414411e-05, "loss": 0.0325, "step": 35690 }, { "epoch": 1.00154299340721, "grad_norm": 2.040388345718384, "learning_rate": 3.33076167765465e-05, "loss": 0.0349, "step": 35700 }, { "epoch": 1.0018235376630664, "grad_norm": 0.10768236964941025, "learning_rate": 3.33029410389489e-05, "loss": 0.0114, "step": 35710 }, { "epoch": 1.0021040819189226, "grad_norm": 0.3001897633075714, "learning_rate": 3.329826530135129e-05, "loss": 0.0483, "step": 35720 }, { "epoch": 1.002384626174779, "grad_norm": 0.06700262427330017, "learning_rate": 3.329358956375369e-05, "loss": 0.016, "step": 35730 }, { "epoch": 1.0026651704306355, "grad_norm": 0.027690095826983452, "learning_rate": 3.328891382615607e-05, "loss": 0.0285, "step": 35740 }, { "epoch": 1.0029457146864917, "grad_norm": 0.028525259345769882, "learning_rate": 3.328423808855847e-05, "loss": 0.0168, "step": 35750 }, { "epoch": 1.0032262589423482, "grad_norm": 0.03694969415664673, "learning_rate": 3.3279562350960866e-05, "loss": 0.0076, "step": 35760 }, { "epoch": 1.0035068031982046, "grad_norm": 0.6371629238128662, "learning_rate": 3.327488661336326e-05, "loss": 0.0701, "step": 35770 }, { "epoch": 1.0037873474540608, "grad_norm": 0.36911827325820923, "learning_rate": 3.327021087576566e-05, "loss": 0.027, "step": 35780 }, { "epoch": 1.0040678917099173, "grad_norm": 0.28664615750312805, "learning_rate": 3.3265535138168046e-05, "loss": 0.0098, "step": 35790 }, { "epoch": 1.0043484359657735, "grad_norm": 0.022248562425374985, "learning_rate": 3.3260859400570446e-05, "loss": 0.0168, "step": 35800 }, { "epoch": 1.00462898022163, "grad_norm": 1.495800495147705, "learning_rate": 3.325618366297283e-05, "loss": 0.0288, "step": 35810 }, { "epoch": 1.0049095244774864, "grad_norm": 0.046243518590927124, "learning_rate": 3.325150792537523e-05, "loss": 0.014, "step": 35820 }, { "epoch": 1.0051900687333426, "grad_norm": 0.32307472825050354, "learning_rate": 3.324683218777762e-05, "loss": 0.0318, "step": 35830 }, { "epoch": 1.005470612989199, "grad_norm": 2.0683112144470215, "learning_rate": 3.324215645018002e-05, "loss": 0.0198, "step": 35840 }, { "epoch": 1.0057511572450555, "grad_norm": 0.4193629324436188, "learning_rate": 3.323748071258241e-05, "loss": 0.0322, "step": 35850 }, { "epoch": 1.0060317015009117, "grad_norm": 0.5428898334503174, "learning_rate": 3.3232804974984805e-05, "loss": 0.0218, "step": 35860 }, { "epoch": 1.0063122457567681, "grad_norm": 0.18935000896453857, "learning_rate": 3.32281292373872e-05, "loss": 0.0317, "step": 35870 }, { "epoch": 1.0065927900126246, "grad_norm": 0.030838435515761375, "learning_rate": 3.322345349978959e-05, "loss": 0.0105, "step": 35880 }, { "epoch": 1.0068733342684808, "grad_norm": 0.3620474636554718, "learning_rate": 3.321877776219199e-05, "loss": 0.0081, "step": 35890 }, { "epoch": 1.0071538785243372, "grad_norm": 0.019685419276356697, "learning_rate": 3.321410202459438e-05, "loss": 0.0038, "step": 35900 }, { "epoch": 1.0074344227801937, "grad_norm": 0.3138695955276489, "learning_rate": 3.320942628699678e-05, "loss": 0.0343, "step": 35910 }, { "epoch": 1.00771496703605, "grad_norm": 0.31325551867485046, "learning_rate": 3.320475054939917e-05, "loss": 0.032, "step": 35920 }, { "epoch": 1.0079955112919063, "grad_norm": 0.20189963281154633, "learning_rate": 3.3200074811801564e-05, "loss": 0.0203, "step": 35930 }, { "epoch": 1.0082760555477626, "grad_norm": 0.0832340344786644, "learning_rate": 3.319539907420396e-05, "loss": 0.0225, "step": 35940 }, { "epoch": 1.008556599803619, "grad_norm": 0.0919489935040474, "learning_rate": 3.319072333660635e-05, "loss": 0.0229, "step": 35950 }, { "epoch": 1.0088371440594754, "grad_norm": 1.813767910003662, "learning_rate": 3.318604759900874e-05, "loss": 0.0256, "step": 35960 }, { "epoch": 1.0091176883153317, "grad_norm": 0.08646406978368759, "learning_rate": 3.3181371861411136e-05, "loss": 0.0068, "step": 35970 }, { "epoch": 1.009398232571188, "grad_norm": 4.660658836364746, "learning_rate": 3.3176696123813536e-05, "loss": 0.0295, "step": 35980 }, { "epoch": 1.0096787768270445, "grad_norm": 0.21942640841007233, "learning_rate": 3.317202038621593e-05, "loss": 0.0113, "step": 35990 }, { "epoch": 1.0099593210829008, "grad_norm": 0.5391904711723328, "learning_rate": 3.316734464861832e-05, "loss": 0.0088, "step": 36000 }, { "epoch": 1.0102398653387572, "grad_norm": 1.1186591386795044, "learning_rate": 3.3162668911020716e-05, "loss": 0.0042, "step": 36010 }, { "epoch": 1.0105204095946136, "grad_norm": 0.14725664258003235, "learning_rate": 3.315799317342311e-05, "loss": 0.0269, "step": 36020 }, { "epoch": 1.0108009538504699, "grad_norm": 0.9896079301834106, "learning_rate": 3.31533174358255e-05, "loss": 0.0442, "step": 36030 }, { "epoch": 1.0110814981063263, "grad_norm": 0.06622561067342758, "learning_rate": 3.3148641698227895e-05, "loss": 0.0277, "step": 36040 }, { "epoch": 1.0113620423621827, "grad_norm": 2.987518787384033, "learning_rate": 3.314396596063029e-05, "loss": 0.0292, "step": 36050 }, { "epoch": 1.011642586618039, "grad_norm": 0.4866114556789398, "learning_rate": 3.313929022303269e-05, "loss": 0.0251, "step": 36060 }, { "epoch": 1.0119231308738954, "grad_norm": 0.012615138664841652, "learning_rate": 3.313461448543508e-05, "loss": 0.0228, "step": 36070 }, { "epoch": 1.0122036751297516, "grad_norm": 2.012239933013916, "learning_rate": 3.3129938747837474e-05, "loss": 0.0421, "step": 36080 }, { "epoch": 1.012484219385608, "grad_norm": 0.01268740464001894, "learning_rate": 3.312526301023987e-05, "loss": 0.0191, "step": 36090 }, { "epoch": 1.0127647636414645, "grad_norm": 2.2619881629943848, "learning_rate": 3.312058727264226e-05, "loss": 0.0355, "step": 36100 }, { "epoch": 1.0130453078973207, "grad_norm": 0.019242819398641586, "learning_rate": 3.3115911535044654e-05, "loss": 0.0109, "step": 36110 }, { "epoch": 1.0133258521531772, "grad_norm": 0.5456796884536743, "learning_rate": 3.311123579744705e-05, "loss": 0.0156, "step": 36120 }, { "epoch": 1.0136063964090336, "grad_norm": 0.12585122883319855, "learning_rate": 3.310656005984945e-05, "loss": 0.0057, "step": 36130 }, { "epoch": 1.0138869406648898, "grad_norm": 0.06721638888120651, "learning_rate": 3.310188432225183e-05, "loss": 0.0646, "step": 36140 }, { "epoch": 1.0141674849207463, "grad_norm": 0.7626860737800598, "learning_rate": 3.309720858465423e-05, "loss": 0.0689, "step": 36150 }, { "epoch": 1.0144480291766027, "grad_norm": 0.42633384466171265, "learning_rate": 3.3092532847056626e-05, "loss": 0.035, "step": 36160 }, { "epoch": 1.014728573432459, "grad_norm": 0.13619805872440338, "learning_rate": 3.308785710945902e-05, "loss": 0.0242, "step": 36170 }, { "epoch": 1.0150091176883154, "grad_norm": 0.3227216899394989, "learning_rate": 3.308318137186141e-05, "loss": 0.0154, "step": 36180 }, { "epoch": 1.0152896619441716, "grad_norm": 0.10811427235603333, "learning_rate": 3.3078505634263806e-05, "loss": 0.0192, "step": 36190 }, { "epoch": 1.015570206200028, "grad_norm": 0.18436482548713684, "learning_rate": 3.3073829896666206e-05, "loss": 0.0203, "step": 36200 }, { "epoch": 1.0158507504558845, "grad_norm": 1.2955257892608643, "learning_rate": 3.306915415906859e-05, "loss": 0.0315, "step": 36210 }, { "epoch": 1.0161312947117407, "grad_norm": 0.45879828929901123, "learning_rate": 3.306447842147099e-05, "loss": 0.0205, "step": 36220 }, { "epoch": 1.0164118389675971, "grad_norm": 0.04352802410721779, "learning_rate": 3.305980268387338e-05, "loss": 0.0295, "step": 36230 }, { "epoch": 1.0166923832234536, "grad_norm": 0.031045805662870407, "learning_rate": 3.305512694627578e-05, "loss": 0.0268, "step": 36240 }, { "epoch": 1.0169729274793098, "grad_norm": 0.9367356896400452, "learning_rate": 3.3050451208678165e-05, "loss": 0.008, "step": 36250 }, { "epoch": 1.0172534717351662, "grad_norm": 2.0639588832855225, "learning_rate": 3.3045775471080565e-05, "loss": 0.0371, "step": 36260 }, { "epoch": 1.0175340159910227, "grad_norm": 0.19018086791038513, "learning_rate": 3.304109973348296e-05, "loss": 0.013, "step": 36270 }, { "epoch": 1.0178145602468789, "grad_norm": 0.01840830035507679, "learning_rate": 3.303642399588535e-05, "loss": 0.0101, "step": 36280 }, { "epoch": 1.0180951045027353, "grad_norm": 0.05319578945636749, "learning_rate": 3.303174825828775e-05, "loss": 0.0558, "step": 36290 }, { "epoch": 1.0183756487585918, "grad_norm": 0.195138081908226, "learning_rate": 3.302707252069014e-05, "loss": 0.046, "step": 36300 }, { "epoch": 1.018656193014448, "grad_norm": 0.028261972591280937, "learning_rate": 3.302239678309254e-05, "loss": 0.0238, "step": 36310 }, { "epoch": 1.0189367372703044, "grad_norm": 0.8763851523399353, "learning_rate": 3.3017721045494924e-05, "loss": 0.0491, "step": 36320 }, { "epoch": 1.0192172815261606, "grad_norm": 0.9472708702087402, "learning_rate": 3.3013045307897324e-05, "loss": 0.0309, "step": 36330 }, { "epoch": 1.019497825782017, "grad_norm": 1.0153595209121704, "learning_rate": 3.300836957029972e-05, "loss": 0.0138, "step": 36340 }, { "epoch": 1.0197783700378735, "grad_norm": 0.41113075613975525, "learning_rate": 3.300369383270211e-05, "loss": 0.0297, "step": 36350 }, { "epoch": 1.0200589142937297, "grad_norm": 0.42150452733039856, "learning_rate": 3.29990180951045e-05, "loss": 0.0074, "step": 36360 }, { "epoch": 1.0203394585495862, "grad_norm": 0.48056986927986145, "learning_rate": 3.2994342357506896e-05, "loss": 0.0583, "step": 36370 }, { "epoch": 1.0206200028054426, "grad_norm": 0.4666799306869507, "learning_rate": 3.2989666619909296e-05, "loss": 0.0341, "step": 36380 }, { "epoch": 1.0209005470612988, "grad_norm": 0.7516776919364929, "learning_rate": 3.298499088231168e-05, "loss": 0.0161, "step": 36390 }, { "epoch": 1.0211810913171553, "grad_norm": 0.08827907592058182, "learning_rate": 3.298031514471408e-05, "loss": 0.0496, "step": 36400 }, { "epoch": 1.0214616355730117, "grad_norm": 0.056040287017822266, "learning_rate": 3.2975639407116476e-05, "loss": 0.0122, "step": 36410 }, { "epoch": 1.021742179828868, "grad_norm": 0.9007468819618225, "learning_rate": 3.297096366951887e-05, "loss": 0.0632, "step": 36420 }, { "epoch": 1.0220227240847244, "grad_norm": 0.18426845967769623, "learning_rate": 3.296628793192126e-05, "loss": 0.0149, "step": 36430 }, { "epoch": 1.0223032683405808, "grad_norm": 0.16343116760253906, "learning_rate": 3.2961612194323655e-05, "loss": 0.0299, "step": 36440 }, { "epoch": 1.022583812596437, "grad_norm": 0.3209187686443329, "learning_rate": 3.295693645672605e-05, "loss": 0.0474, "step": 36450 }, { "epoch": 1.0228643568522935, "grad_norm": 0.08847460150718689, "learning_rate": 3.295226071912844e-05, "loss": 0.0046, "step": 36460 }, { "epoch": 1.0231449011081497, "grad_norm": 0.03098597377538681, "learning_rate": 3.2947584981530835e-05, "loss": 0.0283, "step": 36470 }, { "epoch": 1.0234254453640061, "grad_norm": 0.05331321805715561, "learning_rate": 3.2942909243933235e-05, "loss": 0.0121, "step": 36480 }, { "epoch": 1.0237059896198626, "grad_norm": 5.737972259521484, "learning_rate": 3.293823350633563e-05, "loss": 0.034, "step": 36490 }, { "epoch": 1.0239865338757188, "grad_norm": 2.7633063793182373, "learning_rate": 3.293355776873802e-05, "loss": 0.0429, "step": 36500 }, { "epoch": 1.0242670781315752, "grad_norm": 0.02195909060537815, "learning_rate": 3.2928882031140414e-05, "loss": 0.0066, "step": 36510 }, { "epoch": 1.0245476223874317, "grad_norm": 0.022762805223464966, "learning_rate": 3.292420629354281e-05, "loss": 0.0248, "step": 36520 }, { "epoch": 1.024828166643288, "grad_norm": 0.02227087877690792, "learning_rate": 3.29195305559452e-05, "loss": 0.0136, "step": 36530 }, { "epoch": 1.0251087108991443, "grad_norm": 0.35382401943206787, "learning_rate": 3.2914854818347593e-05, "loss": 0.0271, "step": 36540 }, { "epoch": 1.0253892551550008, "grad_norm": 2.22780179977417, "learning_rate": 3.2910179080749993e-05, "loss": 0.0506, "step": 36550 }, { "epoch": 1.025669799410857, "grad_norm": 0.09076323360204697, "learning_rate": 3.290550334315238e-05, "loss": 0.0175, "step": 36560 }, { "epoch": 1.0259503436667134, "grad_norm": 0.9589269757270813, "learning_rate": 3.290082760555478e-05, "loss": 0.0133, "step": 36570 }, { "epoch": 1.02623088792257, "grad_norm": 0.5247045159339905, "learning_rate": 3.289615186795717e-05, "loss": 0.0261, "step": 36580 }, { "epoch": 1.026511432178426, "grad_norm": 0.016665631905198097, "learning_rate": 3.2891476130359566e-05, "loss": 0.0231, "step": 36590 }, { "epoch": 1.0267919764342825, "grad_norm": 1.3820549249649048, "learning_rate": 3.288680039276196e-05, "loss": 0.0166, "step": 36600 }, { "epoch": 1.0270725206901388, "grad_norm": 0.2054038792848587, "learning_rate": 3.288212465516435e-05, "loss": 0.0116, "step": 36610 }, { "epoch": 1.0273530649459952, "grad_norm": 0.0409415028989315, "learning_rate": 3.287744891756675e-05, "loss": 0.0243, "step": 36620 }, { "epoch": 1.0276336092018516, "grad_norm": 2.0193302631378174, "learning_rate": 3.287277317996914e-05, "loss": 0.0157, "step": 36630 }, { "epoch": 1.0279141534577079, "grad_norm": 0.040815770626068115, "learning_rate": 3.286809744237154e-05, "loss": 0.0045, "step": 36640 }, { "epoch": 1.0281946977135643, "grad_norm": 0.04949631169438362, "learning_rate": 3.2863421704773925e-05, "loss": 0.0181, "step": 36650 }, { "epoch": 1.0284752419694208, "grad_norm": 2.054511308670044, "learning_rate": 3.2858745967176325e-05, "loss": 0.0402, "step": 36660 }, { "epoch": 1.028755786225277, "grad_norm": 0.4941536784172058, "learning_rate": 3.285407022957872e-05, "loss": 0.0121, "step": 36670 }, { "epoch": 1.0290363304811334, "grad_norm": 0.17524076998233795, "learning_rate": 3.284939449198111e-05, "loss": 0.027, "step": 36680 }, { "epoch": 1.0293168747369899, "grad_norm": 0.505326509475708, "learning_rate": 3.284471875438351e-05, "loss": 0.0346, "step": 36690 }, { "epoch": 1.029597418992846, "grad_norm": 0.5295384526252747, "learning_rate": 3.28400430167859e-05, "loss": 0.0159, "step": 36700 }, { "epoch": 1.0298779632487025, "grad_norm": 0.02109752781689167, "learning_rate": 3.28353672791883e-05, "loss": 0.0391, "step": 36710 }, { "epoch": 1.030158507504559, "grad_norm": 0.38162267208099365, "learning_rate": 3.2830691541590684e-05, "loss": 0.0172, "step": 36720 }, { "epoch": 1.0304390517604152, "grad_norm": 0.0261833593249321, "learning_rate": 3.2826015803993084e-05, "loss": 0.0361, "step": 36730 }, { "epoch": 1.0307195960162716, "grad_norm": 0.18121293187141418, "learning_rate": 3.282134006639547e-05, "loss": 0.0346, "step": 36740 }, { "epoch": 1.0310001402721278, "grad_norm": 0.06968209147453308, "learning_rate": 3.281666432879787e-05, "loss": 0.0312, "step": 36750 }, { "epoch": 1.0312806845279843, "grad_norm": 0.11603501439094543, "learning_rate": 3.281198859120026e-05, "loss": 0.0268, "step": 36760 }, { "epoch": 1.0315612287838407, "grad_norm": 0.09477604180574417, "learning_rate": 3.2807312853602656e-05, "loss": 0.0198, "step": 36770 }, { "epoch": 1.031841773039697, "grad_norm": 0.06702425330877304, "learning_rate": 3.280263711600505e-05, "loss": 0.027, "step": 36780 }, { "epoch": 1.0321223172955534, "grad_norm": 2.5688064098358154, "learning_rate": 3.279796137840744e-05, "loss": 0.0286, "step": 36790 }, { "epoch": 1.0324028615514098, "grad_norm": 0.17979586124420166, "learning_rate": 3.279328564080984e-05, "loss": 0.0297, "step": 36800 }, { "epoch": 1.032683405807266, "grad_norm": 1.2377171516418457, "learning_rate": 3.278860990321223e-05, "loss": 0.0258, "step": 36810 }, { "epoch": 1.0329639500631225, "grad_norm": 0.09245448559522629, "learning_rate": 3.278393416561463e-05, "loss": 0.0243, "step": 36820 }, { "epoch": 1.033244494318979, "grad_norm": 0.38766738772392273, "learning_rate": 3.277925842801702e-05, "loss": 0.0154, "step": 36830 }, { "epoch": 1.0335250385748351, "grad_norm": 0.30956342816352844, "learning_rate": 3.2774582690419415e-05, "loss": 0.0268, "step": 36840 }, { "epoch": 1.0338055828306916, "grad_norm": 0.5752271413803101, "learning_rate": 3.276990695282181e-05, "loss": 0.0116, "step": 36850 }, { "epoch": 1.0340861270865478, "grad_norm": 0.12652373313903809, "learning_rate": 3.27652312152242e-05, "loss": 0.0086, "step": 36860 }, { "epoch": 1.0343666713424042, "grad_norm": 0.0962701365351677, "learning_rate": 3.2760555477626595e-05, "loss": 0.0205, "step": 36870 }, { "epoch": 1.0346472155982607, "grad_norm": 0.41454339027404785, "learning_rate": 3.275587974002899e-05, "loss": 0.0435, "step": 36880 }, { "epoch": 1.034927759854117, "grad_norm": 1.1531472206115723, "learning_rate": 3.275120400243139e-05, "loss": 0.0303, "step": 36890 }, { "epoch": 1.0352083041099733, "grad_norm": 0.3897148072719574, "learning_rate": 3.274652826483378e-05, "loss": 0.0405, "step": 36900 }, { "epoch": 1.0354888483658298, "grad_norm": 0.27644672989845276, "learning_rate": 3.2741852527236174e-05, "loss": 0.0352, "step": 36910 }, { "epoch": 1.035769392621686, "grad_norm": 0.3521435558795929, "learning_rate": 3.273717678963857e-05, "loss": 0.0065, "step": 36920 }, { "epoch": 1.0360499368775424, "grad_norm": 0.014237133786082268, "learning_rate": 3.273250105204096e-05, "loss": 0.0122, "step": 36930 }, { "epoch": 1.0363304811333989, "grad_norm": 4.137057304382324, "learning_rate": 3.2727825314443354e-05, "loss": 0.0257, "step": 36940 }, { "epoch": 1.036611025389255, "grad_norm": 1.4371719360351562, "learning_rate": 3.272314957684575e-05, "loss": 0.0271, "step": 36950 }, { "epoch": 1.0368915696451115, "grad_norm": 1.4973350763320923, "learning_rate": 3.271847383924814e-05, "loss": 0.1009, "step": 36960 }, { "epoch": 1.037172113900968, "grad_norm": 0.06464419513940811, "learning_rate": 3.271379810165054e-05, "loss": 0.0486, "step": 36970 }, { "epoch": 1.0374526581568242, "grad_norm": 0.6759933829307556, "learning_rate": 3.270912236405293e-05, "loss": 0.0356, "step": 36980 }, { "epoch": 1.0377332024126806, "grad_norm": 0.2871699631214142, "learning_rate": 3.2704446626455326e-05, "loss": 0.0164, "step": 36990 }, { "epoch": 1.038013746668537, "grad_norm": 0.19184884428977966, "learning_rate": 3.269977088885772e-05, "loss": 0.0312, "step": 37000 }, { "epoch": 1.0382942909243933, "grad_norm": 0.161237433552742, "learning_rate": 3.269509515126011e-05, "loss": 0.0392, "step": 37010 }, { "epoch": 1.0385748351802497, "grad_norm": 0.4622533619403839, "learning_rate": 3.2690419413662506e-05, "loss": 0.019, "step": 37020 }, { "epoch": 1.038855379436106, "grad_norm": 0.14217644929885864, "learning_rate": 3.26857436760649e-05, "loss": 0.0155, "step": 37030 }, { "epoch": 1.0391359236919624, "grad_norm": 0.03371940180659294, "learning_rate": 3.26810679384673e-05, "loss": 0.008, "step": 37040 }, { "epoch": 1.0394164679478188, "grad_norm": 0.01614147052168846, "learning_rate": 3.2676392200869685e-05, "loss": 0.0231, "step": 37050 }, { "epoch": 1.039697012203675, "grad_norm": 0.34849148988723755, "learning_rate": 3.2671716463272085e-05, "loss": 0.0365, "step": 37060 }, { "epoch": 1.0399775564595315, "grad_norm": 0.01327283401042223, "learning_rate": 3.266704072567448e-05, "loss": 0.0221, "step": 37070 }, { "epoch": 1.040258100715388, "grad_norm": 0.3156927227973938, "learning_rate": 3.266236498807687e-05, "loss": 0.0352, "step": 37080 }, { "epoch": 1.0405386449712442, "grad_norm": 0.2390492558479309, "learning_rate": 3.2657689250479264e-05, "loss": 0.0075, "step": 37090 }, { "epoch": 1.0408191892271006, "grad_norm": 0.024152521044015884, "learning_rate": 3.265301351288166e-05, "loss": 0.0539, "step": 37100 }, { "epoch": 1.041099733482957, "grad_norm": 1.0987646579742432, "learning_rate": 3.264833777528406e-05, "loss": 0.0546, "step": 37110 }, { "epoch": 1.0413802777388133, "grad_norm": 1.1776100397109985, "learning_rate": 3.2643662037686444e-05, "loss": 0.0435, "step": 37120 }, { "epoch": 1.0416608219946697, "grad_norm": 0.17868487536907196, "learning_rate": 3.2638986300088844e-05, "loss": 0.0276, "step": 37130 }, { "epoch": 1.041941366250526, "grad_norm": 0.1270407885313034, "learning_rate": 3.263431056249123e-05, "loss": 0.015, "step": 37140 }, { "epoch": 1.0422219105063824, "grad_norm": 0.03417125344276428, "learning_rate": 3.262963482489363e-05, "loss": 0.0286, "step": 37150 }, { "epoch": 1.0425024547622388, "grad_norm": 0.6700997352600098, "learning_rate": 3.2624959087296017e-05, "loss": 0.0129, "step": 37160 }, { "epoch": 1.042782999018095, "grad_norm": 1.5234622955322266, "learning_rate": 3.2620283349698416e-05, "loss": 0.0097, "step": 37170 }, { "epoch": 1.0430635432739515, "grad_norm": 0.07870687544345856, "learning_rate": 3.261560761210081e-05, "loss": 0.02, "step": 37180 }, { "epoch": 1.043344087529808, "grad_norm": 0.047378189861774445, "learning_rate": 3.26109318745032e-05, "loss": 0.0226, "step": 37190 }, { "epoch": 1.0436246317856641, "grad_norm": 0.24132980406284332, "learning_rate": 3.26062561369056e-05, "loss": 0.0057, "step": 37200 }, { "epoch": 1.0439051760415206, "grad_norm": 0.017609596252441406, "learning_rate": 3.260158039930799e-05, "loss": 0.0135, "step": 37210 }, { "epoch": 1.044185720297377, "grad_norm": 0.353364497423172, "learning_rate": 3.259690466171039e-05, "loss": 0.0245, "step": 37220 }, { "epoch": 1.0444662645532332, "grad_norm": 0.47659575939178467, "learning_rate": 3.2592228924112775e-05, "loss": 0.0251, "step": 37230 }, { "epoch": 1.0447468088090897, "grad_norm": 0.11110896617174149, "learning_rate": 3.2587553186515175e-05, "loss": 0.0231, "step": 37240 }, { "epoch": 1.045027353064946, "grad_norm": 0.12179174274206161, "learning_rate": 3.258287744891757e-05, "loss": 0.0125, "step": 37250 }, { "epoch": 1.0453078973208023, "grad_norm": 0.04873015731573105, "learning_rate": 3.257820171131996e-05, "loss": 0.0464, "step": 37260 }, { "epoch": 1.0455884415766588, "grad_norm": 0.4330528676509857, "learning_rate": 3.2573525973722355e-05, "loss": 0.0319, "step": 37270 }, { "epoch": 1.045868985832515, "grad_norm": 0.04617152363061905, "learning_rate": 3.256885023612475e-05, "loss": 0.0315, "step": 37280 }, { "epoch": 1.0461495300883714, "grad_norm": 0.16036340594291687, "learning_rate": 3.256417449852715e-05, "loss": 0.0136, "step": 37290 }, { "epoch": 1.0464300743442279, "grad_norm": 0.06491424143314362, "learning_rate": 3.2559498760929534e-05, "loss": 0.0144, "step": 37300 }, { "epoch": 1.046710618600084, "grad_norm": 0.3396812975406647, "learning_rate": 3.2554823023331934e-05, "loss": 0.0236, "step": 37310 }, { "epoch": 1.0469911628559405, "grad_norm": 0.04545038938522339, "learning_rate": 3.255014728573433e-05, "loss": 0.013, "step": 37320 }, { "epoch": 1.047271707111797, "grad_norm": 0.018178530037403107, "learning_rate": 3.254547154813672e-05, "loss": 0.0339, "step": 37330 }, { "epoch": 1.0475522513676532, "grad_norm": 0.06580556184053421, "learning_rate": 3.2540795810539114e-05, "loss": 0.0524, "step": 37340 }, { "epoch": 1.0478327956235096, "grad_norm": 0.2058963179588318, "learning_rate": 3.253612007294151e-05, "loss": 0.0681, "step": 37350 }, { "epoch": 1.048113339879366, "grad_norm": 0.5772327184677124, "learning_rate": 3.25314443353439e-05, "loss": 0.0501, "step": 37360 }, { "epoch": 1.0483938841352223, "grad_norm": 0.27136990427970886, "learning_rate": 3.25267685977463e-05, "loss": 0.036, "step": 37370 }, { "epoch": 1.0486744283910787, "grad_norm": 0.41546544432640076, "learning_rate": 3.2522092860148686e-05, "loss": 0.0519, "step": 37380 }, { "epoch": 1.0489549726469352, "grad_norm": 0.30083712935447693, "learning_rate": 3.2517417122551086e-05, "loss": 0.0094, "step": 37390 }, { "epoch": 1.0492355169027914, "grad_norm": 0.12271562218666077, "learning_rate": 3.251274138495348e-05, "loss": 0.0259, "step": 37400 }, { "epoch": 1.0495160611586478, "grad_norm": 0.16960835456848145, "learning_rate": 3.250806564735587e-05, "loss": 0.0159, "step": 37410 }, { "epoch": 1.049796605414504, "grad_norm": 0.15716221928596497, "learning_rate": 3.2503389909758266e-05, "loss": 0.0124, "step": 37420 }, { "epoch": 1.0500771496703605, "grad_norm": 0.24983270466327667, "learning_rate": 3.249871417216066e-05, "loss": 0.0388, "step": 37430 }, { "epoch": 1.050357693926217, "grad_norm": 0.345722496509552, "learning_rate": 3.249403843456306e-05, "loss": 0.0127, "step": 37440 }, { "epoch": 1.0506382381820731, "grad_norm": 0.29749226570129395, "learning_rate": 3.2489362696965445e-05, "loss": 0.0065, "step": 37450 }, { "epoch": 1.0509187824379296, "grad_norm": 0.1839456856250763, "learning_rate": 3.2484686959367845e-05, "loss": 0.0168, "step": 37460 }, { "epoch": 1.051199326693786, "grad_norm": 0.20132844150066376, "learning_rate": 3.248001122177023e-05, "loss": 0.0371, "step": 37470 }, { "epoch": 1.0514798709496422, "grad_norm": 0.23855257034301758, "learning_rate": 3.247533548417263e-05, "loss": 0.0404, "step": 37480 }, { "epoch": 1.0517604152054987, "grad_norm": 0.06521467864513397, "learning_rate": 3.2470659746575025e-05, "loss": 0.0181, "step": 37490 }, { "epoch": 1.0520409594613551, "grad_norm": 0.04516049847006798, "learning_rate": 3.246598400897742e-05, "loss": 0.0231, "step": 37500 }, { "epoch": 1.0523215037172113, "grad_norm": 0.03682852163910866, "learning_rate": 3.246130827137982e-05, "loss": 0.0417, "step": 37510 }, { "epoch": 1.0526020479730678, "grad_norm": 0.24349471926689148, "learning_rate": 3.2456632533782204e-05, "loss": 0.0179, "step": 37520 }, { "epoch": 1.0528825922289242, "grad_norm": 0.030726036056876183, "learning_rate": 3.2451956796184604e-05, "loss": 0.0293, "step": 37530 }, { "epoch": 1.0531631364847804, "grad_norm": 0.11771196871995926, "learning_rate": 3.244728105858699e-05, "loss": 0.0262, "step": 37540 }, { "epoch": 1.0534436807406369, "grad_norm": 0.23553359508514404, "learning_rate": 3.244260532098939e-05, "loss": 0.0085, "step": 37550 }, { "epoch": 1.053724224996493, "grad_norm": 0.25927430391311646, "learning_rate": 3.243792958339178e-05, "loss": 0.0185, "step": 37560 }, { "epoch": 1.0540047692523495, "grad_norm": 0.16675959527492523, "learning_rate": 3.2433253845794177e-05, "loss": 0.0163, "step": 37570 }, { "epoch": 1.054285313508206, "grad_norm": 0.045573506504297256, "learning_rate": 3.242857810819657e-05, "loss": 0.0112, "step": 37580 }, { "epoch": 1.0545658577640622, "grad_norm": 0.30782651901245117, "learning_rate": 3.242390237059896e-05, "loss": 0.0241, "step": 37590 }, { "epoch": 1.0548464020199186, "grad_norm": 0.027346976101398468, "learning_rate": 3.241922663300136e-05, "loss": 0.0296, "step": 37600 }, { "epoch": 1.055126946275775, "grad_norm": 0.0711386427283287, "learning_rate": 3.241455089540375e-05, "loss": 0.0256, "step": 37610 }, { "epoch": 1.0554074905316313, "grad_norm": 7.509599208831787, "learning_rate": 3.240987515780615e-05, "loss": 0.0488, "step": 37620 }, { "epoch": 1.0556880347874877, "grad_norm": 0.3598470389842987, "learning_rate": 3.2405199420208535e-05, "loss": 0.0233, "step": 37630 }, { "epoch": 1.0559685790433442, "grad_norm": 0.03895278647542, "learning_rate": 3.2400523682610935e-05, "loss": 0.0383, "step": 37640 }, { "epoch": 1.0562491232992004, "grad_norm": 2.200676441192627, "learning_rate": 3.239584794501333e-05, "loss": 0.024, "step": 37650 }, { "epoch": 1.0565296675550568, "grad_norm": 0.10503794997930527, "learning_rate": 3.239117220741572e-05, "loss": 0.0193, "step": 37660 }, { "epoch": 1.0568102118109133, "grad_norm": 1.0315266847610474, "learning_rate": 3.2386496469818115e-05, "loss": 0.0226, "step": 37670 }, { "epoch": 1.0570907560667695, "grad_norm": 0.41783297061920166, "learning_rate": 3.238182073222051e-05, "loss": 0.0142, "step": 37680 }, { "epoch": 1.057371300322626, "grad_norm": 0.12687799334526062, "learning_rate": 3.23771449946229e-05, "loss": 0.01, "step": 37690 }, { "epoch": 1.0576518445784822, "grad_norm": 0.09017841517925262, "learning_rate": 3.2372469257025294e-05, "loss": 0.0173, "step": 37700 }, { "epoch": 1.0579323888343386, "grad_norm": 0.6903824210166931, "learning_rate": 3.2367793519427694e-05, "loss": 0.0483, "step": 37710 }, { "epoch": 1.058212933090195, "grad_norm": 0.4533435106277466, "learning_rate": 3.236311778183009e-05, "loss": 0.0325, "step": 37720 }, { "epoch": 1.0584934773460513, "grad_norm": 0.06498520076274872, "learning_rate": 3.235844204423248e-05, "loss": 0.0514, "step": 37730 }, { "epoch": 1.0587740216019077, "grad_norm": 0.5057935118675232, "learning_rate": 3.2353766306634874e-05, "loss": 0.0493, "step": 37740 }, { "epoch": 1.0590545658577641, "grad_norm": 0.22679659724235535, "learning_rate": 3.234909056903727e-05, "loss": 0.025, "step": 37750 }, { "epoch": 1.0593351101136204, "grad_norm": 1.1905159950256348, "learning_rate": 3.234441483143966e-05, "loss": 0.0443, "step": 37760 }, { "epoch": 1.0596156543694768, "grad_norm": 0.12720896303653717, "learning_rate": 3.233973909384205e-05, "loss": 0.0159, "step": 37770 }, { "epoch": 1.0598961986253332, "grad_norm": 0.03871278837323189, "learning_rate": 3.2335063356244446e-05, "loss": 0.0346, "step": 37780 }, { "epoch": 1.0601767428811895, "grad_norm": 1.2815848588943481, "learning_rate": 3.2330387618646846e-05, "loss": 0.0706, "step": 37790 }, { "epoch": 1.060457287137046, "grad_norm": 0.2834428548812866, "learning_rate": 3.232571188104924e-05, "loss": 0.0321, "step": 37800 }, { "epoch": 1.0607378313929021, "grad_norm": 0.0877470076084137, "learning_rate": 3.232103614345163e-05, "loss": 0.0422, "step": 37810 }, { "epoch": 1.0610183756487586, "grad_norm": 0.18570876121520996, "learning_rate": 3.2316360405854026e-05, "loss": 0.0247, "step": 37820 }, { "epoch": 1.061298919904615, "grad_norm": 0.07321857661008835, "learning_rate": 3.231168466825642e-05, "loss": 0.0129, "step": 37830 }, { "epoch": 1.0615794641604712, "grad_norm": 0.25527849793434143, "learning_rate": 3.230700893065881e-05, "loss": 0.021, "step": 37840 }, { "epoch": 1.0618600084163277, "grad_norm": 0.01889306679368019, "learning_rate": 3.2302333193061205e-05, "loss": 0.0055, "step": 37850 }, { "epoch": 1.062140552672184, "grad_norm": 1.182529091835022, "learning_rate": 3.2297657455463605e-05, "loss": 0.021, "step": 37860 }, { "epoch": 1.0624210969280403, "grad_norm": 0.06387405842542648, "learning_rate": 3.229298171786599e-05, "loss": 0.038, "step": 37870 }, { "epoch": 1.0627016411838968, "grad_norm": 0.1390107423067093, "learning_rate": 3.228830598026839e-05, "loss": 0.0308, "step": 37880 }, { "epoch": 1.0629821854397532, "grad_norm": 0.04398196190595627, "learning_rate": 3.2283630242670785e-05, "loss": 0.0103, "step": 37890 }, { "epoch": 1.0632627296956094, "grad_norm": 0.03619837015867233, "learning_rate": 3.227895450507318e-05, "loss": 0.0264, "step": 37900 }, { "epoch": 1.0635432739514659, "grad_norm": 0.39745429158210754, "learning_rate": 3.227427876747557e-05, "loss": 0.0142, "step": 37910 }, { "epoch": 1.0638238182073223, "grad_norm": 0.30639031529426575, "learning_rate": 3.2269603029877964e-05, "loss": 0.0291, "step": 37920 }, { "epoch": 1.0641043624631785, "grad_norm": 2.91034197807312, "learning_rate": 3.2264927292280364e-05, "loss": 0.0116, "step": 37930 }, { "epoch": 1.064384906719035, "grad_norm": 0.009525301866233349, "learning_rate": 3.226025155468275e-05, "loss": 0.0201, "step": 37940 }, { "epoch": 1.0646654509748914, "grad_norm": 0.2184104174375534, "learning_rate": 3.225557581708515e-05, "loss": 0.0077, "step": 37950 }, { "epoch": 1.0649459952307476, "grad_norm": 0.012206627987325191, "learning_rate": 3.225090007948754e-05, "loss": 0.0197, "step": 37960 }, { "epoch": 1.065226539486604, "grad_norm": 0.19178053736686707, "learning_rate": 3.224622434188994e-05, "loss": 0.029, "step": 37970 }, { "epoch": 1.0655070837424603, "grad_norm": 0.019813749939203262, "learning_rate": 3.224154860429233e-05, "loss": 0.0155, "step": 37980 }, { "epoch": 1.0657876279983167, "grad_norm": 0.025310534983873367, "learning_rate": 3.223687286669472e-05, "loss": 0.0227, "step": 37990 }, { "epoch": 1.0660681722541732, "grad_norm": 0.0691412016749382, "learning_rate": 3.2232197129097116e-05, "loss": 0.014, "step": 38000 }, { "epoch": 1.0663487165100294, "grad_norm": 0.39578792452812195, "learning_rate": 3.222752139149951e-05, "loss": 0.0141, "step": 38010 }, { "epoch": 1.0666292607658858, "grad_norm": 2.6926932334899902, "learning_rate": 3.222284565390191e-05, "loss": 0.007, "step": 38020 }, { "epoch": 1.0669098050217423, "grad_norm": 0.019812939688563347, "learning_rate": 3.2218169916304296e-05, "loss": 0.0367, "step": 38030 }, { "epoch": 1.0671903492775985, "grad_norm": 0.10109330713748932, "learning_rate": 3.2213494178706696e-05, "loss": 0.0198, "step": 38040 }, { "epoch": 1.067470893533455, "grad_norm": 0.42766040563583374, "learning_rate": 3.220881844110908e-05, "loss": 0.0388, "step": 38050 }, { "epoch": 1.0677514377893114, "grad_norm": 0.08629392832517624, "learning_rate": 3.220414270351148e-05, "loss": 0.0497, "step": 38060 }, { "epoch": 1.0680319820451676, "grad_norm": 0.042397964745759964, "learning_rate": 3.2199466965913875e-05, "loss": 0.0376, "step": 38070 }, { "epoch": 1.068312526301024, "grad_norm": 0.11306595057249069, "learning_rate": 3.219479122831627e-05, "loss": 0.0261, "step": 38080 }, { "epoch": 1.0685930705568802, "grad_norm": 0.22576040029525757, "learning_rate": 3.219011549071866e-05, "loss": 0.0381, "step": 38090 }, { "epoch": 1.0688736148127367, "grad_norm": 1.3831149339675903, "learning_rate": 3.2185439753121054e-05, "loss": 0.0169, "step": 38100 }, { "epoch": 1.0691541590685931, "grad_norm": 0.38497740030288696, "learning_rate": 3.2180764015523454e-05, "loss": 0.0086, "step": 38110 }, { "epoch": 1.0694347033244493, "grad_norm": 0.09900840371847153, "learning_rate": 3.217608827792584e-05, "loss": 0.0167, "step": 38120 }, { "epoch": 1.0697152475803058, "grad_norm": 3.102954387664795, "learning_rate": 3.217141254032824e-05, "loss": 0.0227, "step": 38130 }, { "epoch": 1.0699957918361622, "grad_norm": 0.023840347304940224, "learning_rate": 3.2166736802730634e-05, "loss": 0.0089, "step": 38140 }, { "epoch": 1.0702763360920184, "grad_norm": 0.05388420820236206, "learning_rate": 3.216206106513303e-05, "loss": 0.062, "step": 38150 }, { "epoch": 1.0705568803478749, "grad_norm": 0.4130669832229614, "learning_rate": 3.215738532753542e-05, "loss": 0.0454, "step": 38160 }, { "epoch": 1.0708374246037313, "grad_norm": 0.12304805219173431, "learning_rate": 3.215270958993781e-05, "loss": 0.0387, "step": 38170 }, { "epoch": 1.0711179688595875, "grad_norm": 0.09524090588092804, "learning_rate": 3.2148033852340206e-05, "loss": 0.0243, "step": 38180 }, { "epoch": 1.071398513115444, "grad_norm": 0.04051101207733154, "learning_rate": 3.21433581147426e-05, "loss": 0.008, "step": 38190 }, { "epoch": 1.0716790573713004, "grad_norm": 0.015051962807774544, "learning_rate": 3.2138682377145e-05, "loss": 0.029, "step": 38200 }, { "epoch": 1.0719596016271566, "grad_norm": 0.34718766808509827, "learning_rate": 3.213400663954739e-05, "loss": 0.0276, "step": 38210 }, { "epoch": 1.072240145883013, "grad_norm": 0.026269542053341866, "learning_rate": 3.2129330901949786e-05, "loss": 0.0203, "step": 38220 }, { "epoch": 1.0725206901388693, "grad_norm": 0.3166946768760681, "learning_rate": 3.212465516435218e-05, "loss": 0.0148, "step": 38230 }, { "epoch": 1.0728012343947257, "grad_norm": 0.45876920223236084, "learning_rate": 3.211997942675457e-05, "loss": 0.0362, "step": 38240 }, { "epoch": 1.0730817786505822, "grad_norm": 0.4216490089893341, "learning_rate": 3.2115303689156965e-05, "loss": 0.0361, "step": 38250 }, { "epoch": 1.0733623229064384, "grad_norm": 0.16128559410572052, "learning_rate": 3.211062795155936e-05, "loss": 0.0456, "step": 38260 }, { "epoch": 1.0736428671622948, "grad_norm": 0.4308127164840698, "learning_rate": 3.210595221396175e-05, "loss": 0.0246, "step": 38270 }, { "epoch": 1.0739234114181513, "grad_norm": 0.5795280337333679, "learning_rate": 3.210127647636415e-05, "loss": 0.0127, "step": 38280 }, { "epoch": 1.0742039556740075, "grad_norm": 0.8281629681587219, "learning_rate": 3.209660073876654e-05, "loss": 0.029, "step": 38290 }, { "epoch": 1.074484499929864, "grad_norm": 0.47006532549858093, "learning_rate": 3.209192500116894e-05, "loss": 0.0167, "step": 38300 }, { "epoch": 1.0747650441857204, "grad_norm": 0.13184230029582977, "learning_rate": 3.208724926357133e-05, "loss": 0.0356, "step": 38310 }, { "epoch": 1.0750455884415766, "grad_norm": 0.10467483848333359, "learning_rate": 3.2082573525973724e-05, "loss": 0.0362, "step": 38320 }, { "epoch": 1.075326132697433, "grad_norm": 1.7164437770843506, "learning_rate": 3.207789778837612e-05, "loss": 0.0158, "step": 38330 }, { "epoch": 1.0756066769532895, "grad_norm": 0.2798384428024292, "learning_rate": 3.207322205077851e-05, "loss": 0.0257, "step": 38340 }, { "epoch": 1.0758872212091457, "grad_norm": 0.10919290035963058, "learning_rate": 3.206854631318091e-05, "loss": 0.0132, "step": 38350 }, { "epoch": 1.0761677654650021, "grad_norm": 0.024796968325972557, "learning_rate": 3.20638705755833e-05, "loss": 0.028, "step": 38360 }, { "epoch": 1.0764483097208584, "grad_norm": 1.0766156911849976, "learning_rate": 3.20591948379857e-05, "loss": 0.0416, "step": 38370 }, { "epoch": 1.0767288539767148, "grad_norm": 0.12370863556861877, "learning_rate": 3.205451910038808e-05, "loss": 0.0366, "step": 38380 }, { "epoch": 1.0770093982325712, "grad_norm": 0.054523903876543045, "learning_rate": 3.204984336279048e-05, "loss": 0.0275, "step": 38390 }, { "epoch": 1.0772899424884275, "grad_norm": 0.11888831108808517, "learning_rate": 3.2045167625192876e-05, "loss": 0.0172, "step": 38400 }, { "epoch": 1.077570486744284, "grad_norm": 0.31643885374069214, "learning_rate": 3.204049188759527e-05, "loss": 0.0077, "step": 38410 }, { "epoch": 1.0778510310001403, "grad_norm": 0.08605234324932098, "learning_rate": 3.203581614999767e-05, "loss": 0.0488, "step": 38420 }, { "epoch": 1.0781315752559966, "grad_norm": 0.07211365550756454, "learning_rate": 3.2031140412400056e-05, "loss": 0.0101, "step": 38430 }, { "epoch": 1.078412119511853, "grad_norm": 1.027798056602478, "learning_rate": 3.2026464674802456e-05, "loss": 0.0323, "step": 38440 }, { "epoch": 1.0786926637677094, "grad_norm": 0.16923807561397552, "learning_rate": 3.202178893720484e-05, "loss": 0.0208, "step": 38450 }, { "epoch": 1.0789732080235657, "grad_norm": 0.29867538809776306, "learning_rate": 3.201711319960724e-05, "loss": 0.0245, "step": 38460 }, { "epoch": 1.079253752279422, "grad_norm": 0.041427113115787506, "learning_rate": 3.201243746200963e-05, "loss": 0.0082, "step": 38470 }, { "epoch": 1.0795342965352783, "grad_norm": 0.6235448718070984, "learning_rate": 3.200776172441203e-05, "loss": 0.0283, "step": 38480 }, { "epoch": 1.0798148407911348, "grad_norm": 0.04204362630844116, "learning_rate": 3.200308598681442e-05, "loss": 0.0028, "step": 38490 }, { "epoch": 1.0800953850469912, "grad_norm": 0.033599112182855606, "learning_rate": 3.1998410249216815e-05, "loss": 0.01, "step": 38500 }, { "epoch": 1.0803759293028474, "grad_norm": 0.04225074499845505, "learning_rate": 3.1993734511619214e-05, "loss": 0.0216, "step": 38510 }, { "epoch": 1.0806564735587039, "grad_norm": 0.832991898059845, "learning_rate": 3.19890587740216e-05, "loss": 0.0114, "step": 38520 }, { "epoch": 1.0809370178145603, "grad_norm": 0.027176594361662865, "learning_rate": 3.1984383036424e-05, "loss": 0.007, "step": 38530 }, { "epoch": 1.0812175620704165, "grad_norm": 0.021739063784480095, "learning_rate": 3.197970729882639e-05, "loss": 0.0088, "step": 38540 }, { "epoch": 1.081498106326273, "grad_norm": 4.839587211608887, "learning_rate": 3.197503156122879e-05, "loss": 0.0607, "step": 38550 }, { "epoch": 1.0817786505821294, "grad_norm": 0.42342057824134827, "learning_rate": 3.197035582363118e-05, "loss": 0.0253, "step": 38560 }, { "epoch": 1.0820591948379856, "grad_norm": 0.19921226799488068, "learning_rate": 3.1965680086033573e-05, "loss": 0.0173, "step": 38570 }, { "epoch": 1.082339739093842, "grad_norm": 0.06576795130968094, "learning_rate": 3.1961004348435967e-05, "loss": 0.0351, "step": 38580 }, { "epoch": 1.0826202833496985, "grad_norm": 0.08974946290254593, "learning_rate": 3.195632861083836e-05, "loss": 0.0225, "step": 38590 }, { "epoch": 1.0829008276055547, "grad_norm": 0.3612746596336365, "learning_rate": 3.195165287324075e-05, "loss": 0.0179, "step": 38600 }, { "epoch": 1.0831813718614112, "grad_norm": 0.06528709828853607, "learning_rate": 3.1946977135643146e-05, "loss": 0.0148, "step": 38610 }, { "epoch": 1.0834619161172676, "grad_norm": 0.6435369849205017, "learning_rate": 3.1942301398045546e-05, "loss": 0.0273, "step": 38620 }, { "epoch": 1.0837424603731238, "grad_norm": 0.01706642657518387, "learning_rate": 3.193762566044794e-05, "loss": 0.0107, "step": 38630 }, { "epoch": 1.0840230046289803, "grad_norm": 0.01583273522555828, "learning_rate": 3.193294992285033e-05, "loss": 0.0057, "step": 38640 }, { "epoch": 1.0843035488848365, "grad_norm": 0.015956643968820572, "learning_rate": 3.1928274185252725e-05, "loss": 0.0448, "step": 38650 }, { "epoch": 1.084584093140693, "grad_norm": 0.038071900606155396, "learning_rate": 3.192359844765512e-05, "loss": 0.0341, "step": 38660 }, { "epoch": 1.0848646373965494, "grad_norm": 3.1814231872558594, "learning_rate": 3.191892271005751e-05, "loss": 0.0351, "step": 38670 }, { "epoch": 1.0851451816524056, "grad_norm": 0.6796857714653015, "learning_rate": 3.1914246972459905e-05, "loss": 0.0122, "step": 38680 }, { "epoch": 1.085425725908262, "grad_norm": 0.045471739023923874, "learning_rate": 3.19095712348623e-05, "loss": 0.0309, "step": 38690 }, { "epoch": 1.0857062701641185, "grad_norm": 1.2862850427627563, "learning_rate": 3.19048954972647e-05, "loss": 0.0198, "step": 38700 }, { "epoch": 1.0859868144199747, "grad_norm": 0.08420000225305557, "learning_rate": 3.190021975966709e-05, "loss": 0.0115, "step": 38710 }, { "epoch": 1.0862673586758311, "grad_norm": 1.1180673837661743, "learning_rate": 3.1895544022069484e-05, "loss": 0.0596, "step": 38720 }, { "epoch": 1.0865479029316876, "grad_norm": 0.2795035243034363, "learning_rate": 3.189086828447188e-05, "loss": 0.0153, "step": 38730 }, { "epoch": 1.0868284471875438, "grad_norm": 0.825249433517456, "learning_rate": 3.188619254687427e-05, "loss": 0.0362, "step": 38740 }, { "epoch": 1.0871089914434002, "grad_norm": 1.3892394304275513, "learning_rate": 3.1881516809276664e-05, "loss": 0.0172, "step": 38750 }, { "epoch": 1.0873895356992564, "grad_norm": 0.027452677488327026, "learning_rate": 3.187684107167906e-05, "loss": 0.005, "step": 38760 }, { "epoch": 1.087670079955113, "grad_norm": 0.7906798720359802, "learning_rate": 3.187216533408146e-05, "loss": 0.02, "step": 38770 }, { "epoch": 1.0879506242109693, "grad_norm": 0.06826978176832199, "learning_rate": 3.186748959648384e-05, "loss": 0.0294, "step": 38780 }, { "epoch": 1.0882311684668255, "grad_norm": 0.3735366761684418, "learning_rate": 3.186281385888624e-05, "loss": 0.0304, "step": 38790 }, { "epoch": 1.088511712722682, "grad_norm": 0.5407231450080872, "learning_rate": 3.1858138121288636e-05, "loss": 0.0643, "step": 38800 }, { "epoch": 1.0887922569785384, "grad_norm": 0.06428893655538559, "learning_rate": 3.185346238369103e-05, "loss": 0.0407, "step": 38810 }, { "epoch": 1.0890728012343946, "grad_norm": 0.2848556339740753, "learning_rate": 3.184878664609342e-05, "loss": 0.043, "step": 38820 }, { "epoch": 1.089353345490251, "grad_norm": 0.09837659448385239, "learning_rate": 3.1844110908495816e-05, "loss": 0.0446, "step": 38830 }, { "epoch": 1.0896338897461075, "grad_norm": 1.2355269193649292, "learning_rate": 3.1839435170898216e-05, "loss": 0.025, "step": 38840 }, { "epoch": 1.0899144340019638, "grad_norm": 0.28866279125213623, "learning_rate": 3.18347594333006e-05, "loss": 0.0247, "step": 38850 }, { "epoch": 1.0901949782578202, "grad_norm": 0.06641931086778641, "learning_rate": 3.1830083695703e-05, "loss": 0.0222, "step": 38860 }, { "epoch": 1.0904755225136766, "grad_norm": 0.08772577345371246, "learning_rate": 3.182540795810539e-05, "loss": 0.0694, "step": 38870 }, { "epoch": 1.0907560667695329, "grad_norm": 0.29309430718421936, "learning_rate": 3.182073222050779e-05, "loss": 0.0202, "step": 38880 }, { "epoch": 1.0910366110253893, "grad_norm": 0.30365556478500366, "learning_rate": 3.181605648291018e-05, "loss": 0.0284, "step": 38890 }, { "epoch": 1.0913171552812457, "grad_norm": 0.017209792509675026, "learning_rate": 3.1811380745312575e-05, "loss": 0.0101, "step": 38900 }, { "epoch": 1.091597699537102, "grad_norm": 1.1638493537902832, "learning_rate": 3.180670500771497e-05, "loss": 0.0346, "step": 38910 }, { "epoch": 1.0918782437929584, "grad_norm": 0.8278540372848511, "learning_rate": 3.180202927011736e-05, "loss": 0.0136, "step": 38920 }, { "epoch": 1.0921587880488146, "grad_norm": 0.35239338874816895, "learning_rate": 3.179735353251976e-05, "loss": 0.0291, "step": 38930 }, { "epoch": 1.092439332304671, "grad_norm": 0.4620610177516937, "learning_rate": 3.179267779492215e-05, "loss": 0.0443, "step": 38940 }, { "epoch": 1.0927198765605275, "grad_norm": 0.10770991444587708, "learning_rate": 3.178800205732455e-05, "loss": 0.0878, "step": 38950 }, { "epoch": 1.0930004208163837, "grad_norm": 1.4365309476852417, "learning_rate": 3.1783326319726934e-05, "loss": 0.0389, "step": 38960 }, { "epoch": 1.0932809650722402, "grad_norm": 1.7136322259902954, "learning_rate": 3.1778650582129333e-05, "loss": 0.032, "step": 38970 }, { "epoch": 1.0935615093280966, "grad_norm": 0.3187173902988434, "learning_rate": 3.177397484453173e-05, "loss": 0.0348, "step": 38980 }, { "epoch": 1.0938420535839528, "grad_norm": 0.16569602489471436, "learning_rate": 3.176929910693412e-05, "loss": 0.0162, "step": 38990 }, { "epoch": 1.0941225978398093, "grad_norm": 0.2939872741699219, "learning_rate": 3.176462336933651e-05, "loss": 0.02, "step": 39000 }, { "epoch": 1.0944031420956657, "grad_norm": 0.028925146907567978, "learning_rate": 3.1759947631738906e-05, "loss": 0.011, "step": 39010 }, { "epoch": 1.094683686351522, "grad_norm": 0.034436918795108795, "learning_rate": 3.1755271894141306e-05, "loss": 0.024, "step": 39020 }, { "epoch": 1.0949642306073784, "grad_norm": 0.03518927842378616, "learning_rate": 3.175059615654369e-05, "loss": 0.0182, "step": 39030 }, { "epoch": 1.0952447748632346, "grad_norm": 0.021176448091864586, "learning_rate": 3.174592041894609e-05, "loss": 0.0078, "step": 39040 }, { "epoch": 1.095525319119091, "grad_norm": 0.23874372243881226, "learning_rate": 3.1741244681348485e-05, "loss": 0.0434, "step": 39050 }, { "epoch": 1.0958058633749475, "grad_norm": 3.9102683067321777, "learning_rate": 3.173656894375088e-05, "loss": 0.0378, "step": 39060 }, { "epoch": 1.0960864076308037, "grad_norm": 0.26257383823394775, "learning_rate": 3.173189320615327e-05, "loss": 0.029, "step": 39070 }, { "epoch": 1.0963669518866601, "grad_norm": 0.07040310651063919, "learning_rate": 3.1727217468555665e-05, "loss": 0.0351, "step": 39080 }, { "epoch": 1.0966474961425166, "grad_norm": 0.39112862944602966, "learning_rate": 3.172254173095806e-05, "loss": 0.0307, "step": 39090 }, { "epoch": 1.0969280403983728, "grad_norm": 0.08033863455057144, "learning_rate": 3.171786599336045e-05, "loss": 0.0444, "step": 39100 }, { "epoch": 1.0972085846542292, "grad_norm": 0.1823255866765976, "learning_rate": 3.171319025576285e-05, "loss": 0.0323, "step": 39110 }, { "epoch": 1.0974891289100857, "grad_norm": 0.08910758793354034, "learning_rate": 3.1708514518165244e-05, "loss": 0.0347, "step": 39120 }, { "epoch": 1.0977696731659419, "grad_norm": 1.3000760078430176, "learning_rate": 3.170383878056764e-05, "loss": 0.035, "step": 39130 }, { "epoch": 1.0980502174217983, "grad_norm": 0.055897779762744904, "learning_rate": 3.169916304297003e-05, "loss": 0.0311, "step": 39140 }, { "epoch": 1.0983307616776545, "grad_norm": 0.2985897362232208, "learning_rate": 3.1694487305372424e-05, "loss": 0.0377, "step": 39150 }, { "epoch": 1.098611305933511, "grad_norm": 0.12140852957963943, "learning_rate": 3.168981156777482e-05, "loss": 0.0356, "step": 39160 }, { "epoch": 1.0988918501893674, "grad_norm": 0.18244172632694244, "learning_rate": 3.168513583017721e-05, "loss": 0.032, "step": 39170 }, { "epoch": 1.0991723944452236, "grad_norm": 0.11359795928001404, "learning_rate": 3.16804600925796e-05, "loss": 0.0225, "step": 39180 }, { "epoch": 1.09945293870108, "grad_norm": 0.1942921131849289, "learning_rate": 3.1675784354982e-05, "loss": 0.0301, "step": 39190 }, { "epoch": 1.0997334829569365, "grad_norm": 0.14426743984222412, "learning_rate": 3.167110861738439e-05, "loss": 0.0244, "step": 39200 }, { "epoch": 1.1000140272127927, "grad_norm": 0.11067474633455276, "learning_rate": 3.166643287978679e-05, "loss": 0.006, "step": 39210 }, { "epoch": 1.1002945714686492, "grad_norm": 0.03737741336226463, "learning_rate": 3.166175714218918e-05, "loss": 0.0137, "step": 39220 }, { "epoch": 1.1005751157245056, "grad_norm": 0.02530127950012684, "learning_rate": 3.1657081404591576e-05, "loss": 0.0395, "step": 39230 }, { "epoch": 1.1008556599803618, "grad_norm": 0.06106431782245636, "learning_rate": 3.165240566699397e-05, "loss": 0.0471, "step": 39240 }, { "epoch": 1.1011362042362183, "grad_norm": 0.1912565380334854, "learning_rate": 3.164772992939636e-05, "loss": 0.0403, "step": 39250 }, { "epoch": 1.1014167484920747, "grad_norm": 0.24528957903385162, "learning_rate": 3.164305419179876e-05, "loss": 0.01, "step": 39260 }, { "epoch": 1.101697292747931, "grad_norm": 0.06549382954835892, "learning_rate": 3.163837845420115e-05, "loss": 0.0257, "step": 39270 }, { "epoch": 1.1019778370037874, "grad_norm": 0.2908008396625519, "learning_rate": 3.163370271660355e-05, "loss": 0.0435, "step": 39280 }, { "epoch": 1.1022583812596438, "grad_norm": 0.35475966334342957, "learning_rate": 3.1629026979005935e-05, "loss": 0.0154, "step": 39290 }, { "epoch": 1.1025389255155, "grad_norm": 0.21727296710014343, "learning_rate": 3.1624351241408335e-05, "loss": 0.0141, "step": 39300 }, { "epoch": 1.1028194697713565, "grad_norm": 0.09189644455909729, "learning_rate": 3.161967550381073e-05, "loss": 0.0185, "step": 39310 }, { "epoch": 1.1031000140272127, "grad_norm": 0.6317231059074402, "learning_rate": 3.161499976621312e-05, "loss": 0.0344, "step": 39320 }, { "epoch": 1.1033805582830691, "grad_norm": 0.3619629442691803, "learning_rate": 3.161032402861552e-05, "loss": 0.0259, "step": 39330 }, { "epoch": 1.1036611025389256, "grad_norm": 0.0561172254383564, "learning_rate": 3.160564829101791e-05, "loss": 0.0064, "step": 39340 }, { "epoch": 1.1039416467947818, "grad_norm": 1.3724647760391235, "learning_rate": 3.160097255342031e-05, "loss": 0.0159, "step": 39350 }, { "epoch": 1.1042221910506382, "grad_norm": 21.347896575927734, "learning_rate": 3.1596296815822694e-05, "loss": 0.0441, "step": 39360 }, { "epoch": 1.1045027353064947, "grad_norm": 3.0482656955718994, "learning_rate": 3.1591621078225094e-05, "loss": 0.2043, "step": 39370 }, { "epoch": 1.104783279562351, "grad_norm": 2.459789991378784, "learning_rate": 3.158694534062748e-05, "loss": 0.1729, "step": 39380 }, { "epoch": 1.1050638238182073, "grad_norm": 0.9423958659172058, "learning_rate": 3.158226960302988e-05, "loss": 0.1681, "step": 39390 }, { "epoch": 1.1053443680740638, "grad_norm": 0.21515046060085297, "learning_rate": 3.157759386543227e-05, "loss": 0.0695, "step": 39400 }, { "epoch": 1.10562491232992, "grad_norm": 0.06715723127126694, "learning_rate": 3.1572918127834666e-05, "loss": 0.0301, "step": 39410 }, { "epoch": 1.1059054565857764, "grad_norm": 0.27021324634552, "learning_rate": 3.1568242390237066e-05, "loss": 0.034, "step": 39420 }, { "epoch": 1.1061860008416327, "grad_norm": 0.21384233236312866, "learning_rate": 3.156356665263945e-05, "loss": 0.0136, "step": 39430 }, { "epoch": 1.106466545097489, "grad_norm": 0.04923013970255852, "learning_rate": 3.155889091504185e-05, "loss": 0.0378, "step": 39440 }, { "epoch": 1.1067470893533455, "grad_norm": 2.1231017112731934, "learning_rate": 3.155421517744424e-05, "loss": 0.0409, "step": 39450 }, { "epoch": 1.1070276336092018, "grad_norm": 0.12094170600175858, "learning_rate": 3.154953943984664e-05, "loss": 0.0164, "step": 39460 }, { "epoch": 1.1073081778650582, "grad_norm": 0.31129223108291626, "learning_rate": 3.154486370224903e-05, "loss": 0.0358, "step": 39470 }, { "epoch": 1.1075887221209146, "grad_norm": 0.0489359088242054, "learning_rate": 3.1540187964651425e-05, "loss": 0.0051, "step": 39480 }, { "epoch": 1.1078692663767709, "grad_norm": 0.39590948820114136, "learning_rate": 3.153551222705382e-05, "loss": 0.034, "step": 39490 }, { "epoch": 1.1081498106326273, "grad_norm": 0.3718464970588684, "learning_rate": 3.153083648945621e-05, "loss": 0.0081, "step": 39500 }, { "epoch": 1.1084303548884837, "grad_norm": 0.030222317203879356, "learning_rate": 3.1526160751858605e-05, "loss": 0.0264, "step": 39510 }, { "epoch": 1.10871089914434, "grad_norm": 2.3410298824310303, "learning_rate": 3.1521485014261e-05, "loss": 0.0453, "step": 39520 }, { "epoch": 1.1089914434001964, "grad_norm": 0.20482730865478516, "learning_rate": 3.15168092766634e-05, "loss": 0.0179, "step": 39530 }, { "epoch": 1.1092719876560528, "grad_norm": 0.04418076202273369, "learning_rate": 3.151213353906579e-05, "loss": 0.011, "step": 39540 }, { "epoch": 1.109552531911909, "grad_norm": 0.5078072547912598, "learning_rate": 3.1507457801468184e-05, "loss": 0.0301, "step": 39550 }, { "epoch": 1.1098330761677655, "grad_norm": 0.03301423043012619, "learning_rate": 3.150278206387058e-05, "loss": 0.0297, "step": 39560 }, { "epoch": 1.110113620423622, "grad_norm": 2.044341802597046, "learning_rate": 3.149810632627297e-05, "loss": 0.014, "step": 39570 }, { "epoch": 1.1103941646794782, "grad_norm": 0.03260036185383797, "learning_rate": 3.149343058867536e-05, "loss": 0.0181, "step": 39580 }, { "epoch": 1.1106747089353346, "grad_norm": 0.1856195628643036, "learning_rate": 3.1488754851077757e-05, "loss": 0.0115, "step": 39590 }, { "epoch": 1.1109552531911908, "grad_norm": 1.4175801277160645, "learning_rate": 3.148407911348015e-05, "loss": 0.0285, "step": 39600 }, { "epoch": 1.1112357974470473, "grad_norm": 0.12701056897640228, "learning_rate": 3.147940337588255e-05, "loss": 0.0612, "step": 39610 }, { "epoch": 1.1115163417029037, "grad_norm": 0.07773152738809586, "learning_rate": 3.147472763828494e-05, "loss": 0.0139, "step": 39620 }, { "epoch": 1.11179688595876, "grad_norm": 0.2470003217458725, "learning_rate": 3.1470051900687336e-05, "loss": 0.0245, "step": 39630 }, { "epoch": 1.1120774302146164, "grad_norm": 0.09877359867095947, "learning_rate": 3.146537616308973e-05, "loss": 0.0206, "step": 39640 }, { "epoch": 1.1123579744704728, "grad_norm": 1.4590201377868652, "learning_rate": 3.146070042549212e-05, "loss": 0.0417, "step": 39650 }, { "epoch": 1.112638518726329, "grad_norm": 0.06450987607240677, "learning_rate": 3.1456024687894515e-05, "loss": 0.0292, "step": 39660 }, { "epoch": 1.1129190629821855, "grad_norm": 0.09973527491092682, "learning_rate": 3.145134895029691e-05, "loss": 0.0188, "step": 39670 }, { "epoch": 1.113199607238042, "grad_norm": 0.40180182456970215, "learning_rate": 3.144667321269931e-05, "loss": 0.0255, "step": 39680 }, { "epoch": 1.1134801514938981, "grad_norm": 0.10827098041772842, "learning_rate": 3.1441997475101695e-05, "loss": 0.0113, "step": 39690 }, { "epoch": 1.1137606957497546, "grad_norm": 0.729039192199707, "learning_rate": 3.1437321737504095e-05, "loss": 0.0484, "step": 39700 }, { "epoch": 1.1140412400056108, "grad_norm": 1.2116070985794067, "learning_rate": 3.143264599990649e-05, "loss": 0.0248, "step": 39710 }, { "epoch": 1.1143217842614672, "grad_norm": 0.4043295979499817, "learning_rate": 3.142797026230888e-05, "loss": 0.012, "step": 39720 }, { "epoch": 1.1146023285173237, "grad_norm": 0.03113831952214241, "learning_rate": 3.1423294524711274e-05, "loss": 0.0389, "step": 39730 }, { "epoch": 1.1148828727731799, "grad_norm": 0.045163024216890335, "learning_rate": 3.141861878711367e-05, "loss": 0.0332, "step": 39740 }, { "epoch": 1.1151634170290363, "grad_norm": 0.3480367362499237, "learning_rate": 3.141394304951607e-05, "loss": 0.055, "step": 39750 }, { "epoch": 1.1154439612848928, "grad_norm": 0.2454954832792282, "learning_rate": 3.1409267311918454e-05, "loss": 0.0181, "step": 39760 }, { "epoch": 1.115724505540749, "grad_norm": 0.45176082849502563, "learning_rate": 3.1404591574320854e-05, "loss": 0.0149, "step": 39770 }, { "epoch": 1.1160050497966054, "grad_norm": 0.1919344663619995, "learning_rate": 3.139991583672324e-05, "loss": 0.0321, "step": 39780 }, { "epoch": 1.1162855940524619, "grad_norm": 0.024045893922448158, "learning_rate": 3.139524009912564e-05, "loss": 0.016, "step": 39790 }, { "epoch": 1.116566138308318, "grad_norm": 8.106009483337402, "learning_rate": 3.139056436152803e-05, "loss": 0.0523, "step": 39800 }, { "epoch": 1.1168466825641745, "grad_norm": 0.9173324108123779, "learning_rate": 3.1385888623930426e-05, "loss": 0.047, "step": 39810 }, { "epoch": 1.117127226820031, "grad_norm": 0.041279081255197525, "learning_rate": 3.138121288633282e-05, "loss": 0.0106, "step": 39820 }, { "epoch": 1.1174077710758872, "grad_norm": 0.034472737461328506, "learning_rate": 3.137653714873521e-05, "loss": 0.0197, "step": 39830 }, { "epoch": 1.1176883153317436, "grad_norm": 0.050094667822122574, "learning_rate": 3.137186141113761e-05, "loss": 0.0194, "step": 39840 }, { "epoch": 1.1179688595876, "grad_norm": 4.9925055503845215, "learning_rate": 3.136718567354e-05, "loss": 0.0425, "step": 39850 }, { "epoch": 1.1182494038434563, "grad_norm": 0.08704519271850586, "learning_rate": 3.13625099359424e-05, "loss": 0.0403, "step": 39860 }, { "epoch": 1.1185299480993127, "grad_norm": 0.3535142242908478, "learning_rate": 3.1357834198344785e-05, "loss": 0.0071, "step": 39870 }, { "epoch": 1.118810492355169, "grad_norm": 0.38853660225868225, "learning_rate": 3.1353158460747185e-05, "loss": 0.0094, "step": 39880 }, { "epoch": 1.1190910366110254, "grad_norm": 0.013401197269558907, "learning_rate": 3.134848272314958e-05, "loss": 0.0455, "step": 39890 }, { "epoch": 1.1193715808668818, "grad_norm": 0.032891590148210526, "learning_rate": 3.134380698555197e-05, "loss": 0.0436, "step": 39900 }, { "epoch": 1.119652125122738, "grad_norm": 0.2559138834476471, "learning_rate": 3.1339131247954365e-05, "loss": 0.0041, "step": 39910 }, { "epoch": 1.1199326693785945, "grad_norm": 0.03324023634195328, "learning_rate": 3.133445551035676e-05, "loss": 0.0145, "step": 39920 }, { "epoch": 1.120213213634451, "grad_norm": 0.22259008884429932, "learning_rate": 3.132977977275916e-05, "loss": 0.0238, "step": 39930 }, { "epoch": 1.1204937578903071, "grad_norm": 1.3457562923431396, "learning_rate": 3.132510403516155e-05, "loss": 0.0459, "step": 39940 }, { "epoch": 1.1207743021461636, "grad_norm": 1.177484154701233, "learning_rate": 3.1320428297563944e-05, "loss": 0.0518, "step": 39950 }, { "epoch": 1.12105484640202, "grad_norm": 0.05931893736124039, "learning_rate": 3.131575255996634e-05, "loss": 0.0407, "step": 39960 }, { "epoch": 1.1213353906578762, "grad_norm": 0.07544034719467163, "learning_rate": 3.131107682236873e-05, "loss": 0.025, "step": 39970 }, { "epoch": 1.1216159349137327, "grad_norm": 0.058938439935445786, "learning_rate": 3.1306401084771123e-05, "loss": 0.0203, "step": 39980 }, { "epoch": 1.121896479169589, "grad_norm": 0.13592875003814697, "learning_rate": 3.130172534717352e-05, "loss": 0.0208, "step": 39990 }, { "epoch": 1.1221770234254453, "grad_norm": 0.16436460614204407, "learning_rate": 3.129704960957591e-05, "loss": 0.0129, "step": 40000 }, { "epoch": 1.1224575676813018, "grad_norm": 1.89368736743927, "learning_rate": 3.129237387197831e-05, "loss": 0.0124, "step": 40010 }, { "epoch": 1.122738111937158, "grad_norm": 0.06849294900894165, "learning_rate": 3.12876981343807e-05, "loss": 0.02, "step": 40020 }, { "epoch": 1.1230186561930144, "grad_norm": 7.294360637664795, "learning_rate": 3.1283022396783096e-05, "loss": 0.0313, "step": 40030 }, { "epoch": 1.1232992004488709, "grad_norm": 0.21033242344856262, "learning_rate": 3.127834665918549e-05, "loss": 0.058, "step": 40040 }, { "epoch": 1.123579744704727, "grad_norm": 0.16709277033805847, "learning_rate": 3.127367092158788e-05, "loss": 0.0585, "step": 40050 }, { "epoch": 1.1238602889605835, "grad_norm": 0.2073853313922882, "learning_rate": 3.1268995183990275e-05, "loss": 0.0351, "step": 40060 }, { "epoch": 1.12414083321644, "grad_norm": 0.029949534684419632, "learning_rate": 3.126431944639267e-05, "loss": 0.066, "step": 40070 }, { "epoch": 1.1244213774722962, "grad_norm": 0.9492502212524414, "learning_rate": 3.125964370879507e-05, "loss": 0.0435, "step": 40080 }, { "epoch": 1.1247019217281526, "grad_norm": 2.9352173805236816, "learning_rate": 3.1254967971197455e-05, "loss": 0.0518, "step": 40090 }, { "epoch": 1.1249824659840089, "grad_norm": 0.10137338191270828, "learning_rate": 3.1250292233599855e-05, "loss": 0.0287, "step": 40100 }, { "epoch": 1.1252630102398653, "grad_norm": 0.05506150797009468, "learning_rate": 3.124561649600224e-05, "loss": 0.0234, "step": 40110 }, { "epoch": 1.1255435544957217, "grad_norm": 0.4476780891418457, "learning_rate": 3.124094075840464e-05, "loss": 0.0243, "step": 40120 }, { "epoch": 1.1258240987515782, "grad_norm": 0.2287718653678894, "learning_rate": 3.1236265020807034e-05, "loss": 0.0366, "step": 40130 }, { "epoch": 1.1261046430074344, "grad_norm": 0.04120141640305519, "learning_rate": 3.123158928320943e-05, "loss": 0.0239, "step": 40140 }, { "epoch": 1.1263851872632908, "grad_norm": 0.09892817586660385, "learning_rate": 3.122691354561183e-05, "loss": 0.0191, "step": 40150 }, { "epoch": 1.126665731519147, "grad_norm": 0.2604083716869354, "learning_rate": 3.1222237808014214e-05, "loss": 0.0199, "step": 40160 }, { "epoch": 1.1269462757750035, "grad_norm": 0.49360817670822144, "learning_rate": 3.1217562070416614e-05, "loss": 0.0118, "step": 40170 }, { "epoch": 1.12722682003086, "grad_norm": 0.7936025857925415, "learning_rate": 3.1212886332819e-05, "loss": 0.0423, "step": 40180 }, { "epoch": 1.1275073642867162, "grad_norm": 0.7142085433006287, "learning_rate": 3.12082105952214e-05, "loss": 0.0271, "step": 40190 }, { "epoch": 1.1277879085425726, "grad_norm": 0.7048561573028564, "learning_rate": 3.1203534857623786e-05, "loss": 0.0151, "step": 40200 }, { "epoch": 1.128068452798429, "grad_norm": 0.20984143018722534, "learning_rate": 3.1198859120026186e-05, "loss": 0.0212, "step": 40210 }, { "epoch": 1.1283489970542853, "grad_norm": 0.4386805295944214, "learning_rate": 3.119418338242858e-05, "loss": 0.0146, "step": 40220 }, { "epoch": 1.1286295413101417, "grad_norm": 0.8272977471351624, "learning_rate": 3.118950764483097e-05, "loss": 0.0276, "step": 40230 }, { "epoch": 1.1289100855659981, "grad_norm": 0.5223056077957153, "learning_rate": 3.118483190723337e-05, "loss": 0.0446, "step": 40240 }, { "epoch": 1.1291906298218544, "grad_norm": 0.14102047681808472, "learning_rate": 3.118015616963576e-05, "loss": 0.0141, "step": 40250 }, { "epoch": 1.1294711740777108, "grad_norm": 0.06620679050683975, "learning_rate": 3.117548043203816e-05, "loss": 0.0382, "step": 40260 }, { "epoch": 1.129751718333567, "grad_norm": 0.13686548173427582, "learning_rate": 3.1170804694440545e-05, "loss": 0.0131, "step": 40270 }, { "epoch": 1.1300322625894235, "grad_norm": 0.16658470034599304, "learning_rate": 3.1166128956842945e-05, "loss": 0.0337, "step": 40280 }, { "epoch": 1.13031280684528, "grad_norm": 0.2693808376789093, "learning_rate": 3.116145321924534e-05, "loss": 0.0338, "step": 40290 }, { "epoch": 1.1305933511011361, "grad_norm": 0.2931952476501465, "learning_rate": 3.115677748164773e-05, "loss": 0.0372, "step": 40300 }, { "epoch": 1.1308738953569926, "grad_norm": 0.10438280552625656, "learning_rate": 3.1152101744050125e-05, "loss": 0.025, "step": 40310 }, { "epoch": 1.131154439612849, "grad_norm": 0.2198224514722824, "learning_rate": 3.114742600645252e-05, "loss": 0.0106, "step": 40320 }, { "epoch": 1.1314349838687052, "grad_norm": 0.43018639087677, "learning_rate": 3.114275026885492e-05, "loss": 0.0326, "step": 40330 }, { "epoch": 1.1317155281245617, "grad_norm": 0.14269313216209412, "learning_rate": 3.1138074531257304e-05, "loss": 0.027, "step": 40340 }, { "epoch": 1.131996072380418, "grad_norm": 0.03990757837891579, "learning_rate": 3.1133398793659704e-05, "loss": 0.0354, "step": 40350 }, { "epoch": 1.1322766166362743, "grad_norm": 0.03341352939605713, "learning_rate": 3.11287230560621e-05, "loss": 0.0133, "step": 40360 }, { "epoch": 1.1325571608921308, "grad_norm": 0.7930080890655518, "learning_rate": 3.112404731846449e-05, "loss": 0.0313, "step": 40370 }, { "epoch": 1.132837705147987, "grad_norm": 0.4319385588169098, "learning_rate": 3.1119371580866884e-05, "loss": 0.0277, "step": 40380 }, { "epoch": 1.1331182494038434, "grad_norm": 1.5221823453903198, "learning_rate": 3.111469584326928e-05, "loss": 0.0394, "step": 40390 }, { "epoch": 1.1333987936596999, "grad_norm": 0.017032131552696228, "learning_rate": 3.111002010567167e-05, "loss": 0.0086, "step": 40400 }, { "epoch": 1.133679337915556, "grad_norm": 0.5605892539024353, "learning_rate": 3.110534436807406e-05, "loss": 0.0467, "step": 40410 }, { "epoch": 1.1339598821714125, "grad_norm": 0.2520895302295685, "learning_rate": 3.1100668630476456e-05, "loss": 0.0224, "step": 40420 }, { "epoch": 1.134240426427269, "grad_norm": 0.2369556725025177, "learning_rate": 3.1095992892878856e-05, "loss": 0.0389, "step": 40430 }, { "epoch": 1.1345209706831252, "grad_norm": 0.4116283059120178, "learning_rate": 3.109131715528125e-05, "loss": 0.0249, "step": 40440 }, { "epoch": 1.1348015149389816, "grad_norm": 0.03011297807097435, "learning_rate": 3.108664141768364e-05, "loss": 0.0445, "step": 40450 }, { "epoch": 1.135082059194838, "grad_norm": 0.06955143809318542, "learning_rate": 3.1081965680086036e-05, "loss": 0.0227, "step": 40460 }, { "epoch": 1.1353626034506943, "grad_norm": 0.08161856234073639, "learning_rate": 3.107728994248843e-05, "loss": 0.0255, "step": 40470 }, { "epoch": 1.1356431477065507, "grad_norm": 1.3053884506225586, "learning_rate": 3.107261420489082e-05, "loss": 0.0331, "step": 40480 }, { "epoch": 1.135923691962407, "grad_norm": 0.7304539680480957, "learning_rate": 3.1067938467293215e-05, "loss": 0.0436, "step": 40490 }, { "epoch": 1.1362042362182634, "grad_norm": 0.2945406138896942, "learning_rate": 3.1063262729695615e-05, "loss": 0.0441, "step": 40500 }, { "epoch": 1.1364847804741198, "grad_norm": 0.8267127275466919, "learning_rate": 3.1058586992098e-05, "loss": 0.0373, "step": 40510 }, { "epoch": 1.1367653247299763, "grad_norm": 0.39815521240234375, "learning_rate": 3.10539112545004e-05, "loss": 0.0156, "step": 40520 }, { "epoch": 1.1370458689858325, "grad_norm": 0.034450847655534744, "learning_rate": 3.1049235516902794e-05, "loss": 0.0291, "step": 40530 }, { "epoch": 1.137326413241689, "grad_norm": 1.3007359504699707, "learning_rate": 3.104455977930519e-05, "loss": 0.0421, "step": 40540 }, { "epoch": 1.1376069574975451, "grad_norm": 0.5099307894706726, "learning_rate": 3.103988404170758e-05, "loss": 0.0157, "step": 40550 }, { "epoch": 1.1378875017534016, "grad_norm": 0.31558385491371155, "learning_rate": 3.1035208304109974e-05, "loss": 0.0335, "step": 40560 }, { "epoch": 1.138168046009258, "grad_norm": 0.19178250432014465, "learning_rate": 3.1030532566512374e-05, "loss": 0.0141, "step": 40570 }, { "epoch": 1.1384485902651142, "grad_norm": 0.045801568776369095, "learning_rate": 3.102585682891476e-05, "loss": 0.0373, "step": 40580 }, { "epoch": 1.1387291345209707, "grad_norm": 0.46539878845214844, "learning_rate": 3.102118109131716e-05, "loss": 0.0357, "step": 40590 }, { "epoch": 1.1390096787768271, "grad_norm": 0.2839643061161041, "learning_rate": 3.1016505353719547e-05, "loss": 0.0183, "step": 40600 }, { "epoch": 1.1392902230326833, "grad_norm": 0.2565126121044159, "learning_rate": 3.1011829616121946e-05, "loss": 0.0315, "step": 40610 }, { "epoch": 1.1395707672885398, "grad_norm": 0.9199680089950562, "learning_rate": 3.100715387852434e-05, "loss": 0.0694, "step": 40620 }, { "epoch": 1.1398513115443962, "grad_norm": 0.19522856175899506, "learning_rate": 3.100247814092673e-05, "loss": 0.0134, "step": 40630 }, { "epoch": 1.1401318558002524, "grad_norm": 1.054024338722229, "learning_rate": 3.0997802403329126e-05, "loss": 0.0231, "step": 40640 }, { "epoch": 1.1404124000561089, "grad_norm": 0.3295387625694275, "learning_rate": 3.099312666573152e-05, "loss": 0.026, "step": 40650 }, { "epoch": 1.140692944311965, "grad_norm": 8.725993156433105, "learning_rate": 3.098845092813392e-05, "loss": 0.0606, "step": 40660 }, { "epoch": 1.1409734885678215, "grad_norm": 0.5550515651702881, "learning_rate": 3.0983775190536305e-05, "loss": 0.0427, "step": 40670 }, { "epoch": 1.141254032823678, "grad_norm": 0.2606046795845032, "learning_rate": 3.0979099452938705e-05, "loss": 0.0247, "step": 40680 }, { "epoch": 1.1415345770795342, "grad_norm": 1.0803278684616089, "learning_rate": 3.097442371534109e-05, "loss": 0.0254, "step": 40690 }, { "epoch": 1.1418151213353906, "grad_norm": 0.12127242982387543, "learning_rate": 3.096974797774349e-05, "loss": 0.0466, "step": 40700 }, { "epoch": 1.142095665591247, "grad_norm": 0.3915865421295166, "learning_rate": 3.0965072240145885e-05, "loss": 0.0179, "step": 40710 }, { "epoch": 1.1423762098471033, "grad_norm": 0.07910073548555374, "learning_rate": 3.096039650254828e-05, "loss": 0.04, "step": 40720 }, { "epoch": 1.1426567541029597, "grad_norm": 0.272619366645813, "learning_rate": 3.095572076495067e-05, "loss": 0.0367, "step": 40730 }, { "epoch": 1.1429372983588162, "grad_norm": 0.1723586618900299, "learning_rate": 3.0951045027353064e-05, "loss": 0.0285, "step": 40740 }, { "epoch": 1.1432178426146724, "grad_norm": 1.3218003511428833, "learning_rate": 3.0946369289755464e-05, "loss": 0.0273, "step": 40750 }, { "epoch": 1.1434983868705288, "grad_norm": 1.2977938652038574, "learning_rate": 3.094169355215785e-05, "loss": 0.0483, "step": 40760 }, { "epoch": 1.143778931126385, "grad_norm": 0.6906498074531555, "learning_rate": 3.093701781456025e-05, "loss": 0.036, "step": 40770 }, { "epoch": 1.1440594753822415, "grad_norm": 0.11929251998662949, "learning_rate": 3.0932342076962644e-05, "loss": 0.0234, "step": 40780 }, { "epoch": 1.144340019638098, "grad_norm": 0.41343554854393005, "learning_rate": 3.092766633936504e-05, "loss": 0.0391, "step": 40790 }, { "epoch": 1.1446205638939544, "grad_norm": 0.1065889373421669, "learning_rate": 3.092299060176743e-05, "loss": 0.0283, "step": 40800 }, { "epoch": 1.1449011081498106, "grad_norm": 0.05235530436038971, "learning_rate": 3.091831486416982e-05, "loss": 0.0259, "step": 40810 }, { "epoch": 1.145181652405667, "grad_norm": 0.5291107892990112, "learning_rate": 3.0913639126572216e-05, "loss": 0.0207, "step": 40820 }, { "epoch": 1.1454621966615233, "grad_norm": 0.1653168648481369, "learning_rate": 3.090896338897461e-05, "loss": 0.0095, "step": 40830 }, { "epoch": 1.1457427409173797, "grad_norm": 0.17700497806072235, "learning_rate": 3.090428765137701e-05, "loss": 0.0085, "step": 40840 }, { "epoch": 1.1460232851732362, "grad_norm": 4.96730899810791, "learning_rate": 3.08996119137794e-05, "loss": 0.0423, "step": 40850 }, { "epoch": 1.1463038294290924, "grad_norm": 0.29590147733688354, "learning_rate": 3.0894936176181796e-05, "loss": 0.0948, "step": 40860 }, { "epoch": 1.1465843736849488, "grad_norm": 0.40849030017852783, "learning_rate": 3.089026043858419e-05, "loss": 0.012, "step": 40870 }, { "epoch": 1.1468649179408053, "grad_norm": 0.0300001110881567, "learning_rate": 3.088558470098658e-05, "loss": 0.0194, "step": 40880 }, { "epoch": 1.1471454621966615, "grad_norm": 1.9916130304336548, "learning_rate": 3.0880908963388975e-05, "loss": 0.0205, "step": 40890 }, { "epoch": 1.147426006452518, "grad_norm": 0.17220978438854218, "learning_rate": 3.087623322579137e-05, "loss": 0.0185, "step": 40900 }, { "epoch": 1.1477065507083744, "grad_norm": 0.18987686932086945, "learning_rate": 3.087155748819376e-05, "loss": 0.0391, "step": 40910 }, { "epoch": 1.1479870949642306, "grad_norm": 1.8562965393066406, "learning_rate": 3.086688175059616e-05, "loss": 0.0254, "step": 40920 }, { "epoch": 1.148267639220087, "grad_norm": 0.07380016148090363, "learning_rate": 3.0862206012998555e-05, "loss": 0.0172, "step": 40930 }, { "epoch": 1.1485481834759432, "grad_norm": 0.00852226372808218, "learning_rate": 3.085753027540095e-05, "loss": 0.0144, "step": 40940 }, { "epoch": 1.1488287277317997, "grad_norm": 0.3848504424095154, "learning_rate": 3.085285453780334e-05, "loss": 0.0317, "step": 40950 }, { "epoch": 1.1491092719876561, "grad_norm": 0.4376218318939209, "learning_rate": 3.0848178800205734e-05, "loss": 0.0095, "step": 40960 }, { "epoch": 1.1493898162435123, "grad_norm": 0.4400888979434967, "learning_rate": 3.084350306260813e-05, "loss": 0.0189, "step": 40970 }, { "epoch": 1.1496703604993688, "grad_norm": 1.0479542016983032, "learning_rate": 3.083882732501052e-05, "loss": 0.0671, "step": 40980 }, { "epoch": 1.1499509047552252, "grad_norm": 0.4949779212474823, "learning_rate": 3.083415158741292e-05, "loss": 0.0368, "step": 40990 }, { "epoch": 1.1502314490110814, "grad_norm": 0.2866255044937134, "learning_rate": 3.082947584981531e-05, "loss": 0.0195, "step": 41000 }, { "epoch": 1.1505119932669379, "grad_norm": 0.06336949020624161, "learning_rate": 3.0824800112217707e-05, "loss": 0.0061, "step": 41010 }, { "epoch": 1.1507925375227943, "grad_norm": 0.35892000794410706, "learning_rate": 3.082012437462009e-05, "loss": 0.0231, "step": 41020 }, { "epoch": 1.1510730817786505, "grad_norm": 0.13272161781787872, "learning_rate": 3.081544863702249e-05, "loss": 0.0208, "step": 41030 }, { "epoch": 1.151353626034507, "grad_norm": 10.277231216430664, "learning_rate": 3.0810772899424886e-05, "loss": 0.0137, "step": 41040 }, { "epoch": 1.1516341702903632, "grad_norm": 0.13409313559532166, "learning_rate": 3.080609716182728e-05, "loss": 0.0161, "step": 41050 }, { "epoch": 1.1519147145462196, "grad_norm": 0.040401920676231384, "learning_rate": 3.080142142422968e-05, "loss": 0.045, "step": 41060 }, { "epoch": 1.152195258802076, "grad_norm": 0.9532820582389832, "learning_rate": 3.0796745686632065e-05, "loss": 0.0424, "step": 41070 }, { "epoch": 1.1524758030579325, "grad_norm": 0.45268216729164124, "learning_rate": 3.0792069949034465e-05, "loss": 0.0093, "step": 41080 }, { "epoch": 1.1527563473137887, "grad_norm": 0.11902198940515518, "learning_rate": 3.078739421143685e-05, "loss": 0.0127, "step": 41090 }, { "epoch": 1.1530368915696452, "grad_norm": 0.7235426306724548, "learning_rate": 3.078271847383925e-05, "loss": 0.0213, "step": 41100 }, { "epoch": 1.1533174358255014, "grad_norm": 0.5047782063484192, "learning_rate": 3.077804273624164e-05, "loss": 0.0388, "step": 41110 }, { "epoch": 1.1535979800813578, "grad_norm": 0.5386695265769958, "learning_rate": 3.077336699864404e-05, "loss": 0.0321, "step": 41120 }, { "epoch": 1.1538785243372143, "grad_norm": 0.12533803284168243, "learning_rate": 3.076869126104643e-05, "loss": 0.0139, "step": 41130 }, { "epoch": 1.1541590685930705, "grad_norm": 0.06522999703884125, "learning_rate": 3.0764015523448824e-05, "loss": 0.0393, "step": 41140 }, { "epoch": 1.154439612848927, "grad_norm": 0.5737908482551575, "learning_rate": 3.0759339785851224e-05, "loss": 0.0183, "step": 41150 }, { "epoch": 1.1547201571047834, "grad_norm": 0.04696540907025337, "learning_rate": 3.075466404825361e-05, "loss": 0.0068, "step": 41160 }, { "epoch": 1.1550007013606396, "grad_norm": 1.8018702268600464, "learning_rate": 3.074998831065601e-05, "loss": 0.0639, "step": 41170 }, { "epoch": 1.155281245616496, "grad_norm": 0.8541412353515625, "learning_rate": 3.07453125730584e-05, "loss": 0.0103, "step": 41180 }, { "epoch": 1.1555617898723525, "grad_norm": 0.23260553181171417, "learning_rate": 3.07406368354608e-05, "loss": 0.0666, "step": 41190 }, { "epoch": 1.1558423341282087, "grad_norm": 0.04418720677495003, "learning_rate": 3.073596109786319e-05, "loss": 0.0277, "step": 41200 }, { "epoch": 1.1561228783840651, "grad_norm": 0.6111424565315247, "learning_rate": 3.073128536026558e-05, "loss": 0.0453, "step": 41210 }, { "epoch": 1.1564034226399214, "grad_norm": 0.7818432450294495, "learning_rate": 3.0726609622667976e-05, "loss": 0.0355, "step": 41220 }, { "epoch": 1.1566839668957778, "grad_norm": 4.470350742340088, "learning_rate": 3.072193388507037e-05, "loss": 0.0181, "step": 41230 }, { "epoch": 1.1569645111516342, "grad_norm": 0.39507102966308594, "learning_rate": 3.071725814747277e-05, "loss": 0.0419, "step": 41240 }, { "epoch": 1.1572450554074905, "grad_norm": 0.05933719128370285, "learning_rate": 3.0712582409875156e-05, "loss": 0.0229, "step": 41250 }, { "epoch": 1.157525599663347, "grad_norm": 0.06626400351524353, "learning_rate": 3.0707906672277556e-05, "loss": 0.0573, "step": 41260 }, { "epoch": 1.1578061439192033, "grad_norm": 0.38328489661216736, "learning_rate": 3.070323093467995e-05, "loss": 0.0206, "step": 41270 }, { "epoch": 1.1580866881750596, "grad_norm": 0.08017203956842422, "learning_rate": 3.069855519708234e-05, "loss": 0.0071, "step": 41280 }, { "epoch": 1.158367232430916, "grad_norm": 0.5366840958595276, "learning_rate": 3.0693879459484735e-05, "loss": 0.0217, "step": 41290 }, { "epoch": 1.1586477766867724, "grad_norm": 0.08620418608188629, "learning_rate": 3.068920372188713e-05, "loss": 0.0363, "step": 41300 }, { "epoch": 1.1589283209426287, "grad_norm": 0.8088881373405457, "learning_rate": 3.068452798428952e-05, "loss": 0.0145, "step": 41310 }, { "epoch": 1.159208865198485, "grad_norm": 0.07313146442174911, "learning_rate": 3.0679852246691915e-05, "loss": 0.0368, "step": 41320 }, { "epoch": 1.1594894094543413, "grad_norm": 0.06680617481470108, "learning_rate": 3.067517650909431e-05, "loss": 0.0235, "step": 41330 }, { "epoch": 1.1597699537101978, "grad_norm": 0.6030197143554688, "learning_rate": 3.067050077149671e-05, "loss": 0.0633, "step": 41340 }, { "epoch": 1.1600504979660542, "grad_norm": 0.30119091272354126, "learning_rate": 3.06658250338991e-05, "loss": 0.0422, "step": 41350 }, { "epoch": 1.1603310422219104, "grad_norm": 0.1958097517490387, "learning_rate": 3.0661149296301494e-05, "loss": 0.0524, "step": 41360 }, { "epoch": 1.1606115864777669, "grad_norm": 0.6825196743011475, "learning_rate": 3.065647355870389e-05, "loss": 0.0209, "step": 41370 }, { "epoch": 1.1608921307336233, "grad_norm": 0.0638296976685524, "learning_rate": 3.065179782110628e-05, "loss": 0.0177, "step": 41380 }, { "epoch": 1.1611726749894795, "grad_norm": 0.047916051000356674, "learning_rate": 3.0647122083508674e-05, "loss": 0.0357, "step": 41390 }, { "epoch": 1.161453219245336, "grad_norm": 0.07161860167980194, "learning_rate": 3.064244634591107e-05, "loss": 0.0401, "step": 41400 }, { "epoch": 1.1617337635011924, "grad_norm": 0.0799393355846405, "learning_rate": 3.063777060831347e-05, "loss": 0.0329, "step": 41410 }, { "epoch": 1.1620143077570486, "grad_norm": 0.08931092917919159, "learning_rate": 3.063309487071585e-05, "loss": 0.0152, "step": 41420 }, { "epoch": 1.162294852012905, "grad_norm": 0.7867400050163269, "learning_rate": 3.062841913311825e-05, "loss": 0.0152, "step": 41430 }, { "epoch": 1.1625753962687613, "grad_norm": 0.24794328212738037, "learning_rate": 3.0623743395520646e-05, "loss": 0.019, "step": 41440 }, { "epoch": 1.1628559405246177, "grad_norm": 0.12669771909713745, "learning_rate": 3.061906765792304e-05, "loss": 0.0051, "step": 41450 }, { "epoch": 1.1631364847804742, "grad_norm": 1.8112881183624268, "learning_rate": 3.061439192032543e-05, "loss": 0.0462, "step": 41460 }, { "epoch": 1.1634170290363306, "grad_norm": 0.06503170728683472, "learning_rate": 3.0609716182727826e-05, "loss": 0.0301, "step": 41470 }, { "epoch": 1.1636975732921868, "grad_norm": 0.04639974236488342, "learning_rate": 3.0605040445130226e-05, "loss": 0.0253, "step": 41480 }, { "epoch": 1.1639781175480433, "grad_norm": 3.0375711917877197, "learning_rate": 3.060036470753261e-05, "loss": 0.0377, "step": 41490 }, { "epoch": 1.1642586618038995, "grad_norm": 0.309612900018692, "learning_rate": 3.059568896993501e-05, "loss": 0.0527, "step": 41500 }, { "epoch": 1.164539206059756, "grad_norm": 2.2768380641937256, "learning_rate": 3.05910132323374e-05, "loss": 0.0317, "step": 41510 }, { "epoch": 1.1648197503156124, "grad_norm": 0.1858517974615097, "learning_rate": 3.05863374947398e-05, "loss": 0.0286, "step": 41520 }, { "epoch": 1.1651002945714686, "grad_norm": 0.12398361414670944, "learning_rate": 3.058166175714219e-05, "loss": 0.0417, "step": 41530 }, { "epoch": 1.165380838827325, "grad_norm": 0.6672377586364746, "learning_rate": 3.0576986019544584e-05, "loss": 0.0356, "step": 41540 }, { "epoch": 1.1656613830831815, "grad_norm": 0.32489022612571716, "learning_rate": 3.057231028194698e-05, "loss": 0.0216, "step": 41550 }, { "epoch": 1.1659419273390377, "grad_norm": 0.649763822555542, "learning_rate": 3.056763454434937e-05, "loss": 0.0277, "step": 41560 }, { "epoch": 1.1662224715948941, "grad_norm": 0.10230810195207596, "learning_rate": 3.056295880675177e-05, "loss": 0.0646, "step": 41570 }, { "epoch": 1.1665030158507506, "grad_norm": 2.021017074584961, "learning_rate": 3.055828306915416e-05, "loss": 0.0229, "step": 41580 }, { "epoch": 1.1667835601066068, "grad_norm": 0.0550408773124218, "learning_rate": 3.055360733155656e-05, "loss": 0.041, "step": 41590 }, { "epoch": 1.1670641043624632, "grad_norm": 0.30379518866539, "learning_rate": 3.054893159395894e-05, "loss": 0.0226, "step": 41600 }, { "epoch": 1.1673446486183194, "grad_norm": 2.0201056003570557, "learning_rate": 3.054425585636134e-05, "loss": 0.0102, "step": 41610 }, { "epoch": 1.1676251928741759, "grad_norm": 0.18500864505767822, "learning_rate": 3.0539580118763736e-05, "loss": 0.0323, "step": 41620 }, { "epoch": 1.1679057371300323, "grad_norm": 28.65359115600586, "learning_rate": 3.053490438116613e-05, "loss": 0.0549, "step": 41630 }, { "epoch": 1.1681862813858885, "grad_norm": 0.787277102470398, "learning_rate": 3.053022864356852e-05, "loss": 0.0208, "step": 41640 }, { "epoch": 1.168466825641745, "grad_norm": 0.50407874584198, "learning_rate": 3.0525552905970916e-05, "loss": 0.0347, "step": 41650 }, { "epoch": 1.1687473698976014, "grad_norm": 0.09438513219356537, "learning_rate": 3.0520877168373316e-05, "loss": 0.0232, "step": 41660 }, { "epoch": 1.1690279141534576, "grad_norm": 0.04237792268395424, "learning_rate": 3.0516201430775702e-05, "loss": 0.0189, "step": 41670 }, { "epoch": 1.169308458409314, "grad_norm": 0.5879709720611572, "learning_rate": 3.05115256931781e-05, "loss": 0.042, "step": 41680 }, { "epoch": 1.1695890026651705, "grad_norm": 2.5798118114471436, "learning_rate": 3.0506849955580495e-05, "loss": 0.0379, "step": 41690 }, { "epoch": 1.1698695469210267, "grad_norm": 0.026317963376641273, "learning_rate": 3.050217421798289e-05, "loss": 0.0072, "step": 41700 }, { "epoch": 1.1701500911768832, "grad_norm": 0.16031111776828766, "learning_rate": 3.0497498480385285e-05, "loss": 0.0309, "step": 41710 }, { "epoch": 1.1704306354327394, "grad_norm": 1.2625402212142944, "learning_rate": 3.0492822742787675e-05, "loss": 0.0531, "step": 41720 }, { "epoch": 1.1707111796885958, "grad_norm": 0.19256159663200378, "learning_rate": 3.048814700519007e-05, "loss": 0.0075, "step": 41730 }, { "epoch": 1.1709917239444523, "grad_norm": 0.4663739800453186, "learning_rate": 3.048347126759246e-05, "loss": 0.0348, "step": 41740 }, { "epoch": 1.1712722682003087, "grad_norm": 0.18045170605182648, "learning_rate": 3.0478795529994858e-05, "loss": 0.0269, "step": 41750 }, { "epoch": 1.171552812456165, "grad_norm": 0.1739271730184555, "learning_rate": 3.0474119792397254e-05, "loss": 0.0168, "step": 41760 }, { "epoch": 1.1718333567120214, "grad_norm": 1.02487313747406, "learning_rate": 3.0469444054799644e-05, "loss": 0.0232, "step": 41770 }, { "epoch": 1.1721139009678776, "grad_norm": 0.04316476359963417, "learning_rate": 3.046476831720204e-05, "loss": 0.0247, "step": 41780 }, { "epoch": 1.172394445223734, "grad_norm": 0.4540817439556122, "learning_rate": 3.0460092579604434e-05, "loss": 0.0238, "step": 41790 }, { "epoch": 1.1726749894795905, "grad_norm": 0.6336650252342224, "learning_rate": 3.045541684200683e-05, "loss": 0.0509, "step": 41800 }, { "epoch": 1.1729555337354467, "grad_norm": 0.6201178431510925, "learning_rate": 3.045074110440922e-05, "loss": 0.0474, "step": 41810 }, { "epoch": 1.1732360779913031, "grad_norm": 12.24411678314209, "learning_rate": 3.0446065366811617e-05, "loss": 0.0299, "step": 41820 }, { "epoch": 1.1735166222471596, "grad_norm": 0.13246305286884308, "learning_rate": 3.0441389629214013e-05, "loss": 0.0199, "step": 41830 }, { "epoch": 1.1737971665030158, "grad_norm": 0.0419076643884182, "learning_rate": 3.0436713891616403e-05, "loss": 0.0237, "step": 41840 }, { "epoch": 1.1740777107588722, "grad_norm": 0.06753120571374893, "learning_rate": 3.04320381540188e-05, "loss": 0.0096, "step": 41850 }, { "epoch": 1.1743582550147287, "grad_norm": 0.032033130526542664, "learning_rate": 3.042736241642119e-05, "loss": 0.0328, "step": 41860 }, { "epoch": 1.174638799270585, "grad_norm": 0.04096338897943497, "learning_rate": 3.0422686678823586e-05, "loss": 0.0233, "step": 41870 }, { "epoch": 1.1749193435264413, "grad_norm": 0.3617798388004303, "learning_rate": 3.041801094122598e-05, "loss": 0.0335, "step": 41880 }, { "epoch": 1.1751998877822976, "grad_norm": 0.8843037486076355, "learning_rate": 3.0413335203628372e-05, "loss": 0.0295, "step": 41890 }, { "epoch": 1.175480432038154, "grad_norm": 0.054666873067617416, "learning_rate": 3.040865946603077e-05, "loss": 0.0502, "step": 41900 }, { "epoch": 1.1757609762940104, "grad_norm": 1.545977234840393, "learning_rate": 3.040398372843316e-05, "loss": 0.0923, "step": 41910 }, { "epoch": 1.1760415205498667, "grad_norm": 6.16977596282959, "learning_rate": 3.0399307990835558e-05, "loss": 0.0489, "step": 41920 }, { "epoch": 1.176322064805723, "grad_norm": 0.16742569208145142, "learning_rate": 3.0394632253237948e-05, "loss": 0.0237, "step": 41930 }, { "epoch": 1.1766026090615795, "grad_norm": 3.239952325820923, "learning_rate": 3.0389956515640345e-05, "loss": 0.0222, "step": 41940 }, { "epoch": 1.1768831533174358, "grad_norm": 0.20009079575538635, "learning_rate": 3.0385280778042734e-05, "loss": 0.0335, "step": 41950 }, { "epoch": 1.1771636975732922, "grad_norm": 0.038872722536325455, "learning_rate": 3.038060504044513e-05, "loss": 0.0433, "step": 41960 }, { "epoch": 1.1774442418291486, "grad_norm": 0.0926138386130333, "learning_rate": 3.0375929302847527e-05, "loss": 0.0273, "step": 41970 }, { "epoch": 1.1777247860850049, "grad_norm": 3.7022194862365723, "learning_rate": 3.0371253565249917e-05, "loss": 0.0206, "step": 41980 }, { "epoch": 1.1780053303408613, "grad_norm": 0.2399919629096985, "learning_rate": 3.0366577827652314e-05, "loss": 0.0168, "step": 41990 }, { "epoch": 1.1782858745967175, "grad_norm": 0.046406105160713196, "learning_rate": 3.0361902090054707e-05, "loss": 0.0294, "step": 42000 }, { "epoch": 1.178566418852574, "grad_norm": 0.9945400953292847, "learning_rate": 3.0357226352457103e-05, "loss": 0.0197, "step": 42010 }, { "epoch": 1.1788469631084304, "grad_norm": 0.08297431468963623, "learning_rate": 3.0352550614859493e-05, "loss": 0.0276, "step": 42020 }, { "epoch": 1.1791275073642866, "grad_norm": 0.10470622777938843, "learning_rate": 3.034787487726189e-05, "loss": 0.0217, "step": 42030 }, { "epoch": 1.179408051620143, "grad_norm": 1.2540693283081055, "learning_rate": 3.0343199139664286e-05, "loss": 0.0213, "step": 42040 }, { "epoch": 1.1796885958759995, "grad_norm": 0.0618419423699379, "learning_rate": 3.0338523402066676e-05, "loss": 0.0426, "step": 42050 }, { "epoch": 1.1799691401318557, "grad_norm": 0.17994432151317596, "learning_rate": 3.0333847664469073e-05, "loss": 0.0493, "step": 42060 }, { "epoch": 1.1802496843877122, "grad_norm": 0.07291392236948013, "learning_rate": 3.0329171926871462e-05, "loss": 0.0147, "step": 42070 }, { "epoch": 1.1805302286435686, "grad_norm": 0.35217541456222534, "learning_rate": 3.032449618927386e-05, "loss": 0.0248, "step": 42080 }, { "epoch": 1.1808107728994248, "grad_norm": 3.092586040496826, "learning_rate": 3.0319820451676252e-05, "loss": 0.0261, "step": 42090 }, { "epoch": 1.1810913171552813, "grad_norm": 0.018946906551718712, "learning_rate": 3.031514471407865e-05, "loss": 0.0075, "step": 42100 }, { "epoch": 1.1813718614111377, "grad_norm": 0.04237695783376694, "learning_rate": 3.0310468976481045e-05, "loss": 0.0185, "step": 42110 }, { "epoch": 1.181652405666994, "grad_norm": 0.23391440510749817, "learning_rate": 3.0305793238883435e-05, "loss": 0.0248, "step": 42120 }, { "epoch": 1.1819329499228504, "grad_norm": 0.19945627450942993, "learning_rate": 3.030111750128583e-05, "loss": 0.0189, "step": 42130 }, { "epoch": 1.1822134941787068, "grad_norm": 0.15613983571529388, "learning_rate": 3.029644176368822e-05, "loss": 0.0237, "step": 42140 }, { "epoch": 1.182494038434563, "grad_norm": 1.358143925666809, "learning_rate": 3.0291766026090618e-05, "loss": 0.0314, "step": 42150 }, { "epoch": 1.1827745826904195, "grad_norm": 0.05581112205982208, "learning_rate": 3.0287090288493007e-05, "loss": 0.0222, "step": 42160 }, { "epoch": 1.1830551269462757, "grad_norm": 0.060042645782232285, "learning_rate": 3.0282414550895404e-05, "loss": 0.0571, "step": 42170 }, { "epoch": 1.1833356712021321, "grad_norm": 0.39457467198371887, "learning_rate": 3.02777388132978e-05, "loss": 0.015, "step": 42180 }, { "epoch": 1.1836162154579886, "grad_norm": 2.1693077087402344, "learning_rate": 3.0273063075700194e-05, "loss": 0.0086, "step": 42190 }, { "epoch": 1.1838967597138448, "grad_norm": 0.016612835228443146, "learning_rate": 3.0268387338102587e-05, "loss": 0.0233, "step": 42200 }, { "epoch": 1.1841773039697012, "grad_norm": 0.25638332962989807, "learning_rate": 3.026371160050498e-05, "loss": 0.0119, "step": 42210 }, { "epoch": 1.1844578482255577, "grad_norm": 0.2505991756916046, "learning_rate": 3.0259035862907377e-05, "loss": 0.0635, "step": 42220 }, { "epoch": 1.1847383924814139, "grad_norm": 0.05512344837188721, "learning_rate": 3.0254360125309766e-05, "loss": 0.0155, "step": 42230 }, { "epoch": 1.1850189367372703, "grad_norm": 0.2247704416513443, "learning_rate": 3.0249684387712163e-05, "loss": 0.0078, "step": 42240 }, { "epoch": 1.1852994809931268, "grad_norm": 0.17966359853744507, "learning_rate": 3.024500865011456e-05, "loss": 0.031, "step": 42250 }, { "epoch": 1.185580025248983, "grad_norm": 0.36575931310653687, "learning_rate": 3.024033291251695e-05, "loss": 0.044, "step": 42260 }, { "epoch": 1.1858605695048394, "grad_norm": 0.4106996953487396, "learning_rate": 3.0235657174919346e-05, "loss": 0.0174, "step": 42270 }, { "epoch": 1.1861411137606956, "grad_norm": 0.2665066719055176, "learning_rate": 3.0230981437321736e-05, "loss": 0.0192, "step": 42280 }, { "epoch": 1.186421658016552, "grad_norm": 0.05088217929005623, "learning_rate": 3.0226305699724132e-05, "loss": 0.0421, "step": 42290 }, { "epoch": 1.1867022022724085, "grad_norm": 0.25952962040901184, "learning_rate": 3.0221629962126525e-05, "loss": 0.0342, "step": 42300 }, { "epoch": 1.1869827465282647, "grad_norm": 0.40365123748779297, "learning_rate": 3.0216954224528922e-05, "loss": 0.0471, "step": 42310 }, { "epoch": 1.1872632907841212, "grad_norm": 0.3695685863494873, "learning_rate": 3.021227848693132e-05, "loss": 0.0278, "step": 42320 }, { "epoch": 1.1875438350399776, "grad_norm": 0.0697568878531456, "learning_rate": 3.0207602749333708e-05, "loss": 0.0199, "step": 42330 }, { "epoch": 1.1878243792958338, "grad_norm": 0.44712501764297485, "learning_rate": 3.0202927011736105e-05, "loss": 0.0137, "step": 42340 }, { "epoch": 1.1881049235516903, "grad_norm": 0.040382564067840576, "learning_rate": 3.0198251274138494e-05, "loss": 0.0227, "step": 42350 }, { "epoch": 1.1883854678075467, "grad_norm": 0.2517678737640381, "learning_rate": 3.019357553654089e-05, "loss": 0.0152, "step": 42360 }, { "epoch": 1.188666012063403, "grad_norm": 0.4259761869907379, "learning_rate": 3.018889979894328e-05, "loss": 0.0231, "step": 42370 }, { "epoch": 1.1889465563192594, "grad_norm": 0.26764529943466187, "learning_rate": 3.0184224061345677e-05, "loss": 0.0397, "step": 42380 }, { "epoch": 1.1892271005751156, "grad_norm": 0.03162240982055664, "learning_rate": 3.0179548323748074e-05, "loss": 0.0156, "step": 42390 }, { "epoch": 1.189507644830972, "grad_norm": 0.8321380019187927, "learning_rate": 3.0174872586150467e-05, "loss": 0.0241, "step": 42400 }, { "epoch": 1.1897881890868285, "grad_norm": 0.026208722963929176, "learning_rate": 3.0170196848552863e-05, "loss": 0.0427, "step": 42410 }, { "epoch": 1.190068733342685, "grad_norm": 0.28209859132766724, "learning_rate": 3.0165521110955253e-05, "loss": 0.0089, "step": 42420 }, { "epoch": 1.1903492775985411, "grad_norm": 0.1972639411687851, "learning_rate": 3.016084537335765e-05, "loss": 0.0403, "step": 42430 }, { "epoch": 1.1906298218543976, "grad_norm": 0.11931112408638, "learning_rate": 3.015616963576004e-05, "loss": 0.0407, "step": 42440 }, { "epoch": 1.1909103661102538, "grad_norm": 0.15557372570037842, "learning_rate": 3.0151493898162436e-05, "loss": 0.0132, "step": 42450 }, { "epoch": 1.1911909103661102, "grad_norm": 0.04421950504183769, "learning_rate": 3.0146818160564833e-05, "loss": 0.025, "step": 42460 }, { "epoch": 1.1914714546219667, "grad_norm": 0.09042984992265701, "learning_rate": 3.0142142422967222e-05, "loss": 0.042, "step": 42470 }, { "epoch": 1.191751998877823, "grad_norm": 0.5112776756286621, "learning_rate": 3.013746668536962e-05, "loss": 0.0134, "step": 42480 }, { "epoch": 1.1920325431336793, "grad_norm": 0.060139257460832596, "learning_rate": 3.0132790947772012e-05, "loss": 0.0231, "step": 42490 }, { "epoch": 1.1923130873895358, "grad_norm": 0.901210367679596, "learning_rate": 3.0128115210174405e-05, "loss": 0.0232, "step": 42500 }, { "epoch": 1.192593631645392, "grad_norm": 0.3501088619232178, "learning_rate": 3.01234394725768e-05, "loss": 0.0294, "step": 42510 }, { "epoch": 1.1928741759012484, "grad_norm": 0.16093918681144714, "learning_rate": 3.0118763734979195e-05, "loss": 0.016, "step": 42520 }, { "epoch": 1.1931547201571049, "grad_norm": 0.1376218944787979, "learning_rate": 3.011408799738159e-05, "loss": 0.0277, "step": 42530 }, { "epoch": 1.193435264412961, "grad_norm": 0.20848047733306885, "learning_rate": 3.010941225978398e-05, "loss": 0.0451, "step": 42540 }, { "epoch": 1.1937158086688175, "grad_norm": 0.40911948680877686, "learning_rate": 3.0104736522186378e-05, "loss": 0.034, "step": 42550 }, { "epoch": 1.1939963529246738, "grad_norm": 0.0875030979514122, "learning_rate": 3.0100060784588768e-05, "loss": 0.0071, "step": 42560 }, { "epoch": 1.1942768971805302, "grad_norm": 0.04392361268401146, "learning_rate": 3.0095385046991164e-05, "loss": 0.0254, "step": 42570 }, { "epoch": 1.1945574414363866, "grad_norm": 0.043658867478370667, "learning_rate": 3.009070930939356e-05, "loss": 0.0083, "step": 42580 }, { "epoch": 1.1948379856922429, "grad_norm": 0.13671518862247467, "learning_rate": 3.008603357179595e-05, "loss": 0.018, "step": 42590 }, { "epoch": 1.1951185299480993, "grad_norm": 0.09848207235336304, "learning_rate": 3.0081357834198347e-05, "loss": 0.0344, "step": 42600 }, { "epoch": 1.1953990742039557, "grad_norm": 0.02337554283440113, "learning_rate": 3.007668209660074e-05, "loss": 0.0098, "step": 42610 }, { "epoch": 1.195679618459812, "grad_norm": 0.04612033814191818, "learning_rate": 3.0072006359003137e-05, "loss": 0.0425, "step": 42620 }, { "epoch": 1.1959601627156684, "grad_norm": 0.09140872210264206, "learning_rate": 3.0067330621405526e-05, "loss": 0.035, "step": 42630 }, { "epoch": 1.1962407069715248, "grad_norm": 0.10331512987613678, "learning_rate": 3.0062654883807923e-05, "loss": 0.0139, "step": 42640 }, { "epoch": 1.196521251227381, "grad_norm": 0.12173160910606384, "learning_rate": 3.005797914621032e-05, "loss": 0.0261, "step": 42650 }, { "epoch": 1.1968017954832375, "grad_norm": 1.407546877861023, "learning_rate": 3.005330340861271e-05, "loss": 0.0494, "step": 42660 }, { "epoch": 1.1970823397390937, "grad_norm": 0.2784632742404938, "learning_rate": 3.0048627671015106e-05, "loss": 0.0154, "step": 42670 }, { "epoch": 1.1973628839949502, "grad_norm": 0.575778067111969, "learning_rate": 3.0043951933417496e-05, "loss": 0.0217, "step": 42680 }, { "epoch": 1.1976434282508066, "grad_norm": 0.8770255446434021, "learning_rate": 3.0039276195819892e-05, "loss": 0.057, "step": 42690 }, { "epoch": 1.197923972506663, "grad_norm": 0.4698627293109894, "learning_rate": 3.0034600458222285e-05, "loss": 0.0138, "step": 42700 }, { "epoch": 1.1982045167625193, "grad_norm": 0.7711870074272156, "learning_rate": 3.0029924720624682e-05, "loss": 0.0331, "step": 42710 }, { "epoch": 1.1984850610183757, "grad_norm": 0.07893531024456024, "learning_rate": 3.0025248983027075e-05, "loss": 0.0125, "step": 42720 }, { "epoch": 1.198765605274232, "grad_norm": 0.025596950203180313, "learning_rate": 3.0020573245429468e-05, "loss": 0.0485, "step": 42730 }, { "epoch": 1.1990461495300884, "grad_norm": 1.3737410306930542, "learning_rate": 3.0015897507831865e-05, "loss": 0.0451, "step": 42740 }, { "epoch": 1.1993266937859448, "grad_norm": 0.7851595282554626, "learning_rate": 3.0011221770234254e-05, "loss": 0.0466, "step": 42750 }, { "epoch": 1.199607238041801, "grad_norm": 0.11001642048358917, "learning_rate": 3.000654603263665e-05, "loss": 0.0161, "step": 42760 }, { "epoch": 1.1998877822976575, "grad_norm": 0.05789351835846901, "learning_rate": 3.000187029503904e-05, "loss": 0.015, "step": 42770 }, { "epoch": 1.200168326553514, "grad_norm": 0.6678832769393921, "learning_rate": 2.9997194557441437e-05, "loss": 0.0184, "step": 42780 }, { "epoch": 1.2004488708093701, "grad_norm": 0.07983417063951492, "learning_rate": 2.9992518819843834e-05, "loss": 0.021, "step": 42790 }, { "epoch": 1.2007294150652266, "grad_norm": 0.061342716217041016, "learning_rate": 2.9987843082246224e-05, "loss": 0.0068, "step": 42800 }, { "epoch": 1.201009959321083, "grad_norm": 0.033365074545145035, "learning_rate": 2.998316734464862e-05, "loss": 0.016, "step": 42810 }, { "epoch": 1.2012905035769392, "grad_norm": 0.3051092028617859, "learning_rate": 2.9978491607051013e-05, "loss": 0.0199, "step": 42820 }, { "epoch": 1.2015710478327957, "grad_norm": 0.02822508104145527, "learning_rate": 2.997381586945341e-05, "loss": 0.0226, "step": 42830 }, { "epoch": 1.2018515920886519, "grad_norm": 6.068301677703857, "learning_rate": 2.99691401318558e-05, "loss": 0.0347, "step": 42840 }, { "epoch": 1.2021321363445083, "grad_norm": 0.8748597502708435, "learning_rate": 2.9964464394258196e-05, "loss": 0.021, "step": 42850 }, { "epoch": 1.2024126806003648, "grad_norm": 0.16733448207378387, "learning_rate": 2.9959788656660593e-05, "loss": 0.0211, "step": 42860 }, { "epoch": 1.202693224856221, "grad_norm": 0.04950186237692833, "learning_rate": 2.9955112919062983e-05, "loss": 0.0123, "step": 42870 }, { "epoch": 1.2029737691120774, "grad_norm": 0.02346375398337841, "learning_rate": 2.995043718146538e-05, "loss": 0.0144, "step": 42880 }, { "epoch": 1.2032543133679339, "grad_norm": 0.06742963939905167, "learning_rate": 2.994576144386777e-05, "loss": 0.0195, "step": 42890 }, { "epoch": 1.20353485762379, "grad_norm": 0.8746142983436584, "learning_rate": 2.9941085706270165e-05, "loss": 0.0239, "step": 42900 }, { "epoch": 1.2038154018796465, "grad_norm": 0.03387539088726044, "learning_rate": 2.993640996867256e-05, "loss": 0.0343, "step": 42910 }, { "epoch": 1.204095946135503, "grad_norm": 18.47667121887207, "learning_rate": 2.9931734231074955e-05, "loss": 0.032, "step": 42920 }, { "epoch": 1.2043764903913592, "grad_norm": 0.24430418014526367, "learning_rate": 2.992705849347735e-05, "loss": 0.0393, "step": 42930 }, { "epoch": 1.2046570346472156, "grad_norm": 0.39394888281822205, "learning_rate": 2.992238275587974e-05, "loss": 0.0198, "step": 42940 }, { "epoch": 1.2049375789030718, "grad_norm": 0.5546062588691711, "learning_rate": 2.9917707018282138e-05, "loss": 0.0368, "step": 42950 }, { "epoch": 1.2052181231589283, "grad_norm": 0.13981223106384277, "learning_rate": 2.9913031280684528e-05, "loss": 0.0358, "step": 42960 }, { "epoch": 1.2054986674147847, "grad_norm": 0.3785648047924042, "learning_rate": 2.9908355543086924e-05, "loss": 0.0195, "step": 42970 }, { "epoch": 1.205779211670641, "grad_norm": 0.821530282497406, "learning_rate": 2.9903679805489314e-05, "loss": 0.0446, "step": 42980 }, { "epoch": 1.2060597559264974, "grad_norm": 0.1689644306898117, "learning_rate": 2.989900406789171e-05, "loss": 0.0109, "step": 42990 }, { "epoch": 1.2063403001823538, "grad_norm": 1.7835201025009155, "learning_rate": 2.9894328330294107e-05, "loss": 0.0373, "step": 43000 }, { "epoch": 1.20662084443821, "grad_norm": 0.7765435576438904, "learning_rate": 2.98896525926965e-05, "loss": 0.0306, "step": 43010 }, { "epoch": 1.2069013886940665, "grad_norm": 0.14506864547729492, "learning_rate": 2.9884976855098897e-05, "loss": 0.0276, "step": 43020 }, { "epoch": 1.207181932949923, "grad_norm": 0.7441787719726562, "learning_rate": 2.9880301117501287e-05, "loss": 0.0208, "step": 43030 }, { "epoch": 1.2074624772057791, "grad_norm": 1.0795761346817017, "learning_rate": 2.9875625379903683e-05, "loss": 0.027, "step": 43040 }, { "epoch": 1.2077430214616356, "grad_norm": 0.09289150685071945, "learning_rate": 2.9870949642306073e-05, "loss": 0.0355, "step": 43050 }, { "epoch": 1.2080235657174918, "grad_norm": 0.035836536437273026, "learning_rate": 2.986627390470847e-05, "loss": 0.0357, "step": 43060 }, { "epoch": 1.2083041099733483, "grad_norm": 0.08217921108007431, "learning_rate": 2.9861598167110866e-05, "loss": 0.0326, "step": 43070 }, { "epoch": 1.2085846542292047, "grad_norm": 0.12522569298744202, "learning_rate": 2.9856922429513256e-05, "loss": 0.0095, "step": 43080 }, { "epoch": 1.2088651984850611, "grad_norm": 2.8070261478424072, "learning_rate": 2.9852246691915652e-05, "loss": 0.0233, "step": 43090 }, { "epoch": 1.2091457427409174, "grad_norm": 0.8397427797317505, "learning_rate": 2.9847570954318045e-05, "loss": 0.0455, "step": 43100 }, { "epoch": 1.2094262869967738, "grad_norm": 0.36111676692962646, "learning_rate": 2.984289521672044e-05, "loss": 0.0314, "step": 43110 }, { "epoch": 1.20970683125263, "grad_norm": 0.06789600849151611, "learning_rate": 2.9838219479122832e-05, "loss": 0.0215, "step": 43120 }, { "epoch": 1.2099873755084865, "grad_norm": 0.11304499208927155, "learning_rate": 2.9833543741525228e-05, "loss": 0.0494, "step": 43130 }, { "epoch": 1.210267919764343, "grad_norm": 0.09642117470502853, "learning_rate": 2.9828868003927625e-05, "loss": 0.014, "step": 43140 }, { "epoch": 1.210548464020199, "grad_norm": 0.784947395324707, "learning_rate": 2.9824192266330015e-05, "loss": 0.0343, "step": 43150 }, { "epoch": 1.2108290082760556, "grad_norm": 0.7923051118850708, "learning_rate": 2.981951652873241e-05, "loss": 0.0201, "step": 43160 }, { "epoch": 1.211109552531912, "grad_norm": 0.05436089262366295, "learning_rate": 2.98148407911348e-05, "loss": 0.0307, "step": 43170 }, { "epoch": 1.2113900967877682, "grad_norm": 0.1545373499393463, "learning_rate": 2.9810165053537197e-05, "loss": 0.0279, "step": 43180 }, { "epoch": 1.2116706410436247, "grad_norm": 4.961238384246826, "learning_rate": 2.9805489315939587e-05, "loss": 0.056, "step": 43190 }, { "epoch": 1.211951185299481, "grad_norm": 0.8851295113563538, "learning_rate": 2.9800813578341984e-05, "loss": 0.0178, "step": 43200 }, { "epoch": 1.2122317295553373, "grad_norm": 0.4468470513820648, "learning_rate": 2.979613784074438e-05, "loss": 0.0525, "step": 43210 }, { "epoch": 1.2125122738111938, "grad_norm": 0.06529238820075989, "learning_rate": 2.9791462103146773e-05, "loss": 0.0142, "step": 43220 }, { "epoch": 1.21279281806705, "grad_norm": 0.18999028205871582, "learning_rate": 2.978678636554917e-05, "loss": 0.0265, "step": 43230 }, { "epoch": 1.2130733623229064, "grad_norm": 0.2919694483280182, "learning_rate": 2.978211062795156e-05, "loss": 0.0317, "step": 43240 }, { "epoch": 1.2133539065787629, "grad_norm": 0.08008774369955063, "learning_rate": 2.9777434890353956e-05, "loss": 0.0092, "step": 43250 }, { "epoch": 1.213634450834619, "grad_norm": 0.2066672146320343, "learning_rate": 2.9772759152756346e-05, "loss": 0.0169, "step": 43260 }, { "epoch": 1.2139149950904755, "grad_norm": 0.13572168350219727, "learning_rate": 2.9768083415158743e-05, "loss": 0.0337, "step": 43270 }, { "epoch": 1.214195539346332, "grad_norm": 0.2334788739681244, "learning_rate": 2.976340767756114e-05, "loss": 0.0475, "step": 43280 }, { "epoch": 1.2144760836021882, "grad_norm": 3.1477248668670654, "learning_rate": 2.975873193996353e-05, "loss": 0.0313, "step": 43290 }, { "epoch": 1.2147566278580446, "grad_norm": 0.06712280958890915, "learning_rate": 2.9754056202365925e-05, "loss": 0.0312, "step": 43300 }, { "epoch": 1.215037172113901, "grad_norm": 1.5525426864624023, "learning_rate": 2.974938046476832e-05, "loss": 0.0284, "step": 43310 }, { "epoch": 1.2153177163697573, "grad_norm": 0.11244435608386993, "learning_rate": 2.9744704727170715e-05, "loss": 0.0406, "step": 43320 }, { "epoch": 1.2155982606256137, "grad_norm": 0.797090470790863, "learning_rate": 2.9740028989573105e-05, "loss": 0.0305, "step": 43330 }, { "epoch": 1.21587880488147, "grad_norm": 0.3774526119232178, "learning_rate": 2.97353532519755e-05, "loss": 0.0152, "step": 43340 }, { "epoch": 1.2161593491373264, "grad_norm": 0.04061295837163925, "learning_rate": 2.9730677514377898e-05, "loss": 0.0121, "step": 43350 }, { "epoch": 1.2164398933931828, "grad_norm": 0.07309827953577042, "learning_rate": 2.9726001776780288e-05, "loss": 0.0162, "step": 43360 }, { "epoch": 1.2167204376490393, "grad_norm": 0.05222149193286896, "learning_rate": 2.9721326039182684e-05, "loss": 0.0276, "step": 43370 }, { "epoch": 1.2170009819048955, "grad_norm": 0.06941571831703186, "learning_rate": 2.9716650301585074e-05, "loss": 0.0228, "step": 43380 }, { "epoch": 1.217281526160752, "grad_norm": 3.5156617164611816, "learning_rate": 2.971197456398747e-05, "loss": 0.0292, "step": 43390 }, { "epoch": 1.2175620704166081, "grad_norm": 0.2523202896118164, "learning_rate": 2.9707298826389864e-05, "loss": 0.0324, "step": 43400 }, { "epoch": 1.2178426146724646, "grad_norm": 0.25735607743263245, "learning_rate": 2.9702623088792257e-05, "loss": 0.0073, "step": 43410 }, { "epoch": 1.218123158928321, "grad_norm": 0.024135824292898178, "learning_rate": 2.9697947351194653e-05, "loss": 0.006, "step": 43420 }, { "epoch": 1.2184037031841772, "grad_norm": 0.14613255858421326, "learning_rate": 2.9693271613597047e-05, "loss": 0.0139, "step": 43430 }, { "epoch": 1.2186842474400337, "grad_norm": 0.008729949593544006, "learning_rate": 2.9688595875999443e-05, "loss": 0.0016, "step": 43440 }, { "epoch": 1.2189647916958901, "grad_norm": 0.05292141065001488, "learning_rate": 2.9683920138401833e-05, "loss": 0.0351, "step": 43450 }, { "epoch": 1.2192453359517463, "grad_norm": 0.8771424889564514, "learning_rate": 2.967924440080423e-05, "loss": 0.043, "step": 43460 }, { "epoch": 1.2195258802076028, "grad_norm": 0.1460193246603012, "learning_rate": 2.967456866320662e-05, "loss": 0.007, "step": 43470 }, { "epoch": 1.2198064244634592, "grad_norm": 0.35580259561538696, "learning_rate": 2.9669892925609016e-05, "loss": 0.0111, "step": 43480 }, { "epoch": 1.2200869687193154, "grad_norm": 0.17154592275619507, "learning_rate": 2.9665217188011412e-05, "loss": 0.0452, "step": 43490 }, { "epoch": 1.2203675129751719, "grad_norm": 0.008639222010970116, "learning_rate": 2.9660541450413802e-05, "loss": 0.0498, "step": 43500 }, { "epoch": 1.220648057231028, "grad_norm": 0.022281363606452942, "learning_rate": 2.96558657128162e-05, "loss": 0.0267, "step": 43510 }, { "epoch": 1.2209286014868845, "grad_norm": 0.025451919063925743, "learning_rate": 2.9651189975218592e-05, "loss": 0.0075, "step": 43520 }, { "epoch": 1.221209145742741, "grad_norm": 0.007619775831699371, "learning_rate": 2.964651423762099e-05, "loss": 0.0181, "step": 43530 }, { "epoch": 1.2214896899985972, "grad_norm": 0.03735121712088585, "learning_rate": 2.9641838500023378e-05, "loss": 0.0121, "step": 43540 }, { "epoch": 1.2217702342544536, "grad_norm": 0.027439292520284653, "learning_rate": 2.9637162762425775e-05, "loss": 0.0042, "step": 43550 }, { "epoch": 1.22205077851031, "grad_norm": 0.01808382011950016, "learning_rate": 2.963248702482817e-05, "loss": 0.0096, "step": 43560 }, { "epoch": 1.2223313227661663, "grad_norm": 0.3787338435649872, "learning_rate": 2.962781128723056e-05, "loss": 0.0186, "step": 43570 }, { "epoch": 1.2226118670220227, "grad_norm": 0.030323658138513565, "learning_rate": 2.9623135549632958e-05, "loss": 0.0305, "step": 43580 }, { "epoch": 1.2228924112778792, "grad_norm": 0.05766588822007179, "learning_rate": 2.9618459812035347e-05, "loss": 0.0387, "step": 43590 }, { "epoch": 1.2231729555337354, "grad_norm": 0.05404314398765564, "learning_rate": 2.9613784074437744e-05, "loss": 0.0175, "step": 43600 }, { "epoch": 1.2234534997895918, "grad_norm": 1.192564606666565, "learning_rate": 2.9609108336840137e-05, "loss": 0.05, "step": 43610 }, { "epoch": 1.223734044045448, "grad_norm": 0.050942011177539825, "learning_rate": 2.9604432599242534e-05, "loss": 0.0208, "step": 43620 }, { "epoch": 1.2240145883013045, "grad_norm": 0.3609331548213959, "learning_rate": 2.9599756861644927e-05, "loss": 0.0344, "step": 43630 }, { "epoch": 1.224295132557161, "grad_norm": 0.15028506517410278, "learning_rate": 2.959508112404732e-05, "loss": 0.0199, "step": 43640 }, { "epoch": 1.2245756768130174, "grad_norm": 0.3114820122718811, "learning_rate": 2.9590405386449716e-05, "loss": 0.0326, "step": 43650 }, { "epoch": 1.2248562210688736, "grad_norm": 0.05470491573214531, "learning_rate": 2.9585729648852106e-05, "loss": 0.011, "step": 43660 }, { "epoch": 1.22513676532473, "grad_norm": 0.09619534760713577, "learning_rate": 2.9581053911254503e-05, "loss": 0.0038, "step": 43670 }, { "epoch": 1.2254173095805863, "grad_norm": 1.375853180885315, "learning_rate": 2.9576378173656892e-05, "loss": 0.0415, "step": 43680 }, { "epoch": 1.2256978538364427, "grad_norm": 0.32357001304626465, "learning_rate": 2.957170243605929e-05, "loss": 0.0233, "step": 43690 }, { "epoch": 1.2259783980922991, "grad_norm": 0.2298029214143753, "learning_rate": 2.9567026698461686e-05, "loss": 0.0173, "step": 43700 }, { "epoch": 1.2262589423481554, "grad_norm": 0.31022176146507263, "learning_rate": 2.9562350960864075e-05, "loss": 0.052, "step": 43710 }, { "epoch": 1.2265394866040118, "grad_norm": 0.05421556159853935, "learning_rate": 2.9557675223266472e-05, "loss": 0.018, "step": 43720 }, { "epoch": 1.2268200308598682, "grad_norm": 0.13739612698554993, "learning_rate": 2.9552999485668865e-05, "loss": 0.0273, "step": 43730 }, { "epoch": 1.2271005751157245, "grad_norm": 0.07816062867641449, "learning_rate": 2.954832374807126e-05, "loss": 0.0103, "step": 43740 }, { "epoch": 1.227381119371581, "grad_norm": 0.2775731384754181, "learning_rate": 2.954364801047365e-05, "loss": 0.0235, "step": 43750 }, { "epoch": 1.2276616636274373, "grad_norm": 0.2751673758029938, "learning_rate": 2.9538972272876048e-05, "loss": 0.0161, "step": 43760 }, { "epoch": 1.2279422078832936, "grad_norm": 0.017117910087108612, "learning_rate": 2.9534296535278444e-05, "loss": 0.0145, "step": 43770 }, { "epoch": 1.22822275213915, "grad_norm": 0.8549779057502747, "learning_rate": 2.9529620797680834e-05, "loss": 0.0263, "step": 43780 }, { "epoch": 1.2285032963950062, "grad_norm": 0.1175074353814125, "learning_rate": 2.952494506008323e-05, "loss": 0.0081, "step": 43790 }, { "epoch": 1.2287838406508627, "grad_norm": 0.17827025055885315, "learning_rate": 2.952026932248562e-05, "loss": 0.0341, "step": 43800 }, { "epoch": 1.229064384906719, "grad_norm": 1.3143672943115234, "learning_rate": 2.9515593584888017e-05, "loss": 0.0209, "step": 43810 }, { "epoch": 1.2293449291625753, "grad_norm": 0.4713425636291504, "learning_rate": 2.951091784729041e-05, "loss": 0.0354, "step": 43820 }, { "epoch": 1.2296254734184318, "grad_norm": 0.41844043135643005, "learning_rate": 2.9506242109692807e-05, "loss": 0.0148, "step": 43830 }, { "epoch": 1.2299060176742882, "grad_norm": 0.0771062970161438, "learning_rate": 2.9501566372095203e-05, "loss": 0.013, "step": 43840 }, { "epoch": 1.2301865619301444, "grad_norm": 0.3798847496509552, "learning_rate": 2.9496890634497593e-05, "loss": 0.0174, "step": 43850 }, { "epoch": 1.2304671061860009, "grad_norm": 0.04540138319134712, "learning_rate": 2.949221489689999e-05, "loss": 0.0383, "step": 43860 }, { "epoch": 1.2307476504418573, "grad_norm": 0.7709023356437683, "learning_rate": 2.948753915930238e-05, "loss": 0.0058, "step": 43870 }, { "epoch": 1.2310281946977135, "grad_norm": 0.020538566634058952, "learning_rate": 2.9482863421704776e-05, "loss": 0.0103, "step": 43880 }, { "epoch": 1.23130873895357, "grad_norm": 3.619263172149658, "learning_rate": 2.9478187684107166e-05, "loss": 0.0179, "step": 43890 }, { "epoch": 1.2315892832094262, "grad_norm": 0.032621853053569794, "learning_rate": 2.9473511946509562e-05, "loss": 0.0302, "step": 43900 }, { "epoch": 1.2318698274652826, "grad_norm": 6.283544063568115, "learning_rate": 2.946883620891196e-05, "loss": 0.0336, "step": 43910 }, { "epoch": 1.232150371721139, "grad_norm": 0.6920201182365417, "learning_rate": 2.9464160471314352e-05, "loss": 0.0198, "step": 43920 }, { "epoch": 1.2324309159769953, "grad_norm": 0.13118979334831238, "learning_rate": 2.945948473371675e-05, "loss": 0.03, "step": 43930 }, { "epoch": 1.2327114602328517, "grad_norm": 0.6906374096870422, "learning_rate": 2.9454808996119138e-05, "loss": 0.03, "step": 43940 }, { "epoch": 1.2329920044887082, "grad_norm": 0.04250851646065712, "learning_rate": 2.9450133258521535e-05, "loss": 0.0215, "step": 43950 }, { "epoch": 1.2332725487445644, "grad_norm": 0.4214615523815155, "learning_rate": 2.9445457520923925e-05, "loss": 0.0141, "step": 43960 }, { "epoch": 1.2335530930004208, "grad_norm": 0.40317219495773315, "learning_rate": 2.944078178332632e-05, "loss": 0.0315, "step": 43970 }, { "epoch": 1.2338336372562773, "grad_norm": 0.04568106681108475, "learning_rate": 2.9436106045728718e-05, "loss": 0.012, "step": 43980 }, { "epoch": 1.2341141815121335, "grad_norm": 0.05805359408259392, "learning_rate": 2.9431430308131107e-05, "loss": 0.0684, "step": 43990 }, { "epoch": 1.23439472576799, "grad_norm": 0.712623119354248, "learning_rate": 2.9426754570533504e-05, "loss": 0.0345, "step": 44000 }, { "epoch": 1.2346752700238461, "grad_norm": 0.039714548736810684, "learning_rate": 2.9422078832935897e-05, "loss": 0.024, "step": 44010 }, { "epoch": 1.2349558142797026, "grad_norm": 0.24133840203285217, "learning_rate": 2.941740309533829e-05, "loss": 0.0316, "step": 44020 }, { "epoch": 1.235236358535559, "grad_norm": 0.7467179894447327, "learning_rate": 2.9412727357740683e-05, "loss": 0.0326, "step": 44030 }, { "epoch": 1.2355169027914155, "grad_norm": 0.07947421073913574, "learning_rate": 2.940805162014308e-05, "loss": 0.0184, "step": 44040 }, { "epoch": 1.2357974470472717, "grad_norm": 0.0743587464094162, "learning_rate": 2.9403375882545476e-05, "loss": 0.012, "step": 44050 }, { "epoch": 1.2360779913031281, "grad_norm": 0.023768093436956406, "learning_rate": 2.9398700144947866e-05, "loss": 0.0306, "step": 44060 }, { "epoch": 1.2363585355589843, "grad_norm": 0.07238543778657913, "learning_rate": 2.9394024407350263e-05, "loss": 0.018, "step": 44070 }, { "epoch": 1.2366390798148408, "grad_norm": 0.02137974463403225, "learning_rate": 2.9389348669752653e-05, "loss": 0.0281, "step": 44080 }, { "epoch": 1.2369196240706972, "grad_norm": 0.18337330222129822, "learning_rate": 2.938467293215505e-05, "loss": 0.0123, "step": 44090 }, { "epoch": 1.2372001683265534, "grad_norm": 0.6154810786247253, "learning_rate": 2.937999719455744e-05, "loss": 0.0169, "step": 44100 }, { "epoch": 1.2374807125824099, "grad_norm": 0.600256085395813, "learning_rate": 2.9375321456959835e-05, "loss": 0.0189, "step": 44110 }, { "epoch": 1.2377612568382663, "grad_norm": 0.15110063552856445, "learning_rate": 2.9370645719362232e-05, "loss": 0.0311, "step": 44120 }, { "epoch": 1.2380418010941225, "grad_norm": 0.37264785170555115, "learning_rate": 2.9365969981764625e-05, "loss": 0.026, "step": 44130 }, { "epoch": 1.238322345349979, "grad_norm": 0.1525745838880539, "learning_rate": 2.936129424416702e-05, "loss": 0.0116, "step": 44140 }, { "epoch": 1.2386028896058354, "grad_norm": 0.038075175136327744, "learning_rate": 2.935661850656941e-05, "loss": 0.0093, "step": 44150 }, { "epoch": 1.2388834338616916, "grad_norm": 0.019556893035769463, "learning_rate": 2.9351942768971808e-05, "loss": 0.0317, "step": 44160 }, { "epoch": 1.239163978117548, "grad_norm": 0.1384054571390152, "learning_rate": 2.9347267031374198e-05, "loss": 0.0385, "step": 44170 }, { "epoch": 1.2394445223734043, "grad_norm": 2.2017171382904053, "learning_rate": 2.9342591293776594e-05, "loss": 0.0257, "step": 44180 }, { "epoch": 1.2397250666292607, "grad_norm": 0.5322200655937195, "learning_rate": 2.933791555617899e-05, "loss": 0.0186, "step": 44190 }, { "epoch": 1.2400056108851172, "grad_norm": 0.7022390365600586, "learning_rate": 2.933323981858138e-05, "loss": 0.0285, "step": 44200 }, { "epoch": 1.2402861551409734, "grad_norm": 0.0704619437456131, "learning_rate": 2.9328564080983777e-05, "loss": 0.0233, "step": 44210 }, { "epoch": 1.2405666993968298, "grad_norm": 0.05906716734170914, "learning_rate": 2.932388834338617e-05, "loss": 0.0327, "step": 44220 }, { "epoch": 1.2408472436526863, "grad_norm": 0.05294421687722206, "learning_rate": 2.9319212605788567e-05, "loss": 0.0125, "step": 44230 }, { "epoch": 1.2411277879085425, "grad_norm": 0.7799749970436096, "learning_rate": 2.9314536868190957e-05, "loss": 0.0286, "step": 44240 }, { "epoch": 1.241408332164399, "grad_norm": 0.06448271125555038, "learning_rate": 2.9309861130593353e-05, "loss": 0.0236, "step": 44250 }, { "epoch": 1.2416888764202554, "grad_norm": 0.08057023584842682, "learning_rate": 2.930518539299575e-05, "loss": 0.0087, "step": 44260 }, { "epoch": 1.2419694206761116, "grad_norm": 1.0569701194763184, "learning_rate": 2.930050965539814e-05, "loss": 0.0589, "step": 44270 }, { "epoch": 1.242249964931968, "grad_norm": 0.05066192150115967, "learning_rate": 2.9295833917800536e-05, "loss": 0.037, "step": 44280 }, { "epoch": 1.2425305091878243, "grad_norm": 0.559085488319397, "learning_rate": 2.9291158180202926e-05, "loss": 0.0323, "step": 44290 }, { "epoch": 1.2428110534436807, "grad_norm": 0.22210834920406342, "learning_rate": 2.9286482442605322e-05, "loss": 0.0274, "step": 44300 }, { "epoch": 1.2430915976995371, "grad_norm": 0.3584130108356476, "learning_rate": 2.9281806705007715e-05, "loss": 0.0515, "step": 44310 }, { "epoch": 1.2433721419553936, "grad_norm": 0.4133647382259369, "learning_rate": 2.927713096741011e-05, "loss": 0.0312, "step": 44320 }, { "epoch": 1.2436526862112498, "grad_norm": 0.25052863359451294, "learning_rate": 2.9272455229812505e-05, "loss": 0.0464, "step": 44330 }, { "epoch": 1.2439332304671062, "grad_norm": 1.2778481245040894, "learning_rate": 2.9267779492214898e-05, "loss": 0.0357, "step": 44340 }, { "epoch": 1.2442137747229625, "grad_norm": 1.0281789302825928, "learning_rate": 2.9263103754617295e-05, "loss": 0.0395, "step": 44350 }, { "epoch": 1.244494318978819, "grad_norm": 0.3409658670425415, "learning_rate": 2.9258428017019685e-05, "loss": 0.0602, "step": 44360 }, { "epoch": 1.2447748632346753, "grad_norm": 0.07285292446613312, "learning_rate": 2.925375227942208e-05, "loss": 0.026, "step": 44370 }, { "epoch": 1.2450554074905316, "grad_norm": 0.3867993652820587, "learning_rate": 2.924907654182447e-05, "loss": 0.0178, "step": 44380 }, { "epoch": 1.245335951746388, "grad_norm": 0.4797610938549042, "learning_rate": 2.9244400804226867e-05, "loss": 0.0323, "step": 44390 }, { "epoch": 1.2456164960022444, "grad_norm": 0.03424260765314102, "learning_rate": 2.9239725066629264e-05, "loss": 0.0433, "step": 44400 }, { "epoch": 1.2458970402581007, "grad_norm": 0.334293931722641, "learning_rate": 2.9235049329031654e-05, "loss": 0.0088, "step": 44410 }, { "epoch": 1.246177584513957, "grad_norm": 1.871780276298523, "learning_rate": 2.923037359143405e-05, "loss": 0.0158, "step": 44420 }, { "epoch": 1.2464581287698135, "grad_norm": 0.02437811717391014, "learning_rate": 2.9225697853836443e-05, "loss": 0.0383, "step": 44430 }, { "epoch": 1.2467386730256698, "grad_norm": 1.2793666124343872, "learning_rate": 2.922102211623884e-05, "loss": 0.0228, "step": 44440 }, { "epoch": 1.2470192172815262, "grad_norm": 0.21204526722431183, "learning_rate": 2.921634637864123e-05, "loss": 0.014, "step": 44450 }, { "epoch": 1.2472997615373824, "grad_norm": 0.4465603232383728, "learning_rate": 2.9211670641043626e-05, "loss": 0.0117, "step": 44460 }, { "epoch": 1.2475803057932389, "grad_norm": 0.014600957743823528, "learning_rate": 2.9206994903446023e-05, "loss": 0.0241, "step": 44470 }, { "epoch": 1.2478608500490953, "grad_norm": 0.17557811737060547, "learning_rate": 2.9202319165848413e-05, "loss": 0.0452, "step": 44480 }, { "epoch": 1.2481413943049515, "grad_norm": 0.044387608766555786, "learning_rate": 2.919764342825081e-05, "loss": 0.0092, "step": 44490 }, { "epoch": 1.248421938560808, "grad_norm": 0.05674290657043457, "learning_rate": 2.91929676906532e-05, "loss": 0.0171, "step": 44500 }, { "epoch": 1.2487024828166644, "grad_norm": 0.014418020844459534, "learning_rate": 2.9188291953055595e-05, "loss": 0.0233, "step": 44510 }, { "epoch": 1.2489830270725206, "grad_norm": 0.38438668847084045, "learning_rate": 2.918361621545799e-05, "loss": 0.0129, "step": 44520 }, { "epoch": 1.249263571328377, "grad_norm": 2.117709159851074, "learning_rate": 2.9178940477860385e-05, "loss": 0.032, "step": 44530 }, { "epoch": 1.2495441155842335, "grad_norm": 0.018088996410369873, "learning_rate": 2.917426474026278e-05, "loss": 0.0041, "step": 44540 }, { "epoch": 1.2498246598400897, "grad_norm": 0.03684306889772415, "learning_rate": 2.916958900266517e-05, "loss": 0.0219, "step": 44550 }, { "epoch": 1.2501052040959462, "grad_norm": 0.017656970769166946, "learning_rate": 2.9164913265067568e-05, "loss": 0.0176, "step": 44560 }, { "epoch": 1.2503857483518024, "grad_norm": 2.7547688484191895, "learning_rate": 2.9160237527469958e-05, "loss": 0.0191, "step": 44570 }, { "epoch": 1.2506662926076588, "grad_norm": 1.667304515838623, "learning_rate": 2.9155561789872354e-05, "loss": 0.02, "step": 44580 }, { "epoch": 1.2509468368635153, "grad_norm": 0.022899625822901726, "learning_rate": 2.9150886052274744e-05, "loss": 0.0264, "step": 44590 }, { "epoch": 1.2512273811193717, "grad_norm": 0.019255490973591805, "learning_rate": 2.914621031467714e-05, "loss": 0.0244, "step": 44600 }, { "epoch": 1.251507925375228, "grad_norm": 0.019543640315532684, "learning_rate": 2.9141534577079537e-05, "loss": 0.0325, "step": 44610 }, { "epoch": 1.2517884696310844, "grad_norm": 0.07526720315217972, "learning_rate": 2.9136858839481927e-05, "loss": 0.0087, "step": 44620 }, { "epoch": 1.2520690138869406, "grad_norm": 1.1004719734191895, "learning_rate": 2.9132183101884324e-05, "loss": 0.059, "step": 44630 }, { "epoch": 1.252349558142797, "grad_norm": 0.2957356870174408, "learning_rate": 2.9127507364286717e-05, "loss": 0.0097, "step": 44640 }, { "epoch": 1.2526301023986535, "grad_norm": 0.24848447740077972, "learning_rate": 2.9122831626689113e-05, "loss": 0.0337, "step": 44650 }, { "epoch": 1.2529106466545097, "grad_norm": 0.1472294181585312, "learning_rate": 2.9118155889091503e-05, "loss": 0.0075, "step": 44660 }, { "epoch": 1.2531911909103661, "grad_norm": 0.2123950570821762, "learning_rate": 2.91134801514939e-05, "loss": 0.0174, "step": 44670 }, { "epoch": 1.2534717351662223, "grad_norm": 1.3278917074203491, "learning_rate": 2.9108804413896296e-05, "loss": 0.0366, "step": 44680 }, { "epoch": 1.2537522794220788, "grad_norm": 0.12388315796852112, "learning_rate": 2.9104128676298686e-05, "loss": 0.0242, "step": 44690 }, { "epoch": 1.2540328236779352, "grad_norm": 0.6617727875709534, "learning_rate": 2.9099452938701082e-05, "loss": 0.0213, "step": 44700 }, { "epoch": 1.2543133679337917, "grad_norm": 0.20646344125270844, "learning_rate": 2.9094777201103472e-05, "loss": 0.0525, "step": 44710 }, { "epoch": 1.2545939121896479, "grad_norm": 2.463327169418335, "learning_rate": 2.909010146350587e-05, "loss": 0.0125, "step": 44720 }, { "epoch": 1.2548744564455043, "grad_norm": 2.1070499420166016, "learning_rate": 2.9085425725908262e-05, "loss": 0.008, "step": 44730 }, { "epoch": 1.2551550007013605, "grad_norm": 0.40127184987068176, "learning_rate": 2.908074998831066e-05, "loss": 0.0391, "step": 44740 }, { "epoch": 1.255435544957217, "grad_norm": 0.054797008633613586, "learning_rate": 2.9076074250713055e-05, "loss": 0.0579, "step": 44750 }, { "epoch": 1.2557160892130734, "grad_norm": 0.7029977440834045, "learning_rate": 2.9071398513115445e-05, "loss": 0.0589, "step": 44760 }, { "epoch": 1.2559966334689296, "grad_norm": 0.30638664960861206, "learning_rate": 2.906672277551784e-05, "loss": 0.0365, "step": 44770 }, { "epoch": 1.256277177724786, "grad_norm": 0.4372613728046417, "learning_rate": 2.906204703792023e-05, "loss": 0.0328, "step": 44780 }, { "epoch": 1.2565577219806425, "grad_norm": 0.20013290643692017, "learning_rate": 2.9057371300322628e-05, "loss": 0.0178, "step": 44790 }, { "epoch": 1.2568382662364987, "grad_norm": 0.2659102976322174, "learning_rate": 2.9052695562725017e-05, "loss": 0.0104, "step": 44800 }, { "epoch": 1.2571188104923552, "grad_norm": 2.426635265350342, "learning_rate": 2.9048019825127414e-05, "loss": 0.0243, "step": 44810 }, { "epoch": 1.2573993547482116, "grad_norm": 0.3476186692714691, "learning_rate": 2.904334408752981e-05, "loss": 0.0364, "step": 44820 }, { "epoch": 1.2576798990040678, "grad_norm": 0.601733386516571, "learning_rate": 2.9038668349932204e-05, "loss": 0.0338, "step": 44830 }, { "epoch": 1.2579604432599243, "grad_norm": 0.3927712142467499, "learning_rate": 2.90339926123346e-05, "loss": 0.0255, "step": 44840 }, { "epoch": 1.2582409875157805, "grad_norm": 0.21017222106456757, "learning_rate": 2.902931687473699e-05, "loss": 0.0119, "step": 44850 }, { "epoch": 1.258521531771637, "grad_norm": 0.6689198613166809, "learning_rate": 2.9024641137139386e-05, "loss": 0.0042, "step": 44860 }, { "epoch": 1.2588020760274934, "grad_norm": 0.3381228446960449, "learning_rate": 2.9019965399541776e-05, "loss": 0.0447, "step": 44870 }, { "epoch": 1.2590826202833498, "grad_norm": 0.2447052299976349, "learning_rate": 2.9015289661944173e-05, "loss": 0.0318, "step": 44880 }, { "epoch": 1.259363164539206, "grad_norm": 0.16087213158607483, "learning_rate": 2.901061392434657e-05, "loss": 0.021, "step": 44890 }, { "epoch": 1.2596437087950625, "grad_norm": 2.352918863296509, "learning_rate": 2.900593818674896e-05, "loss": 0.0177, "step": 44900 }, { "epoch": 1.2599242530509187, "grad_norm": 0.6709467172622681, "learning_rate": 2.9001262449151356e-05, "loss": 0.0192, "step": 44910 }, { "epoch": 1.2602047973067751, "grad_norm": 0.04200898855924606, "learning_rate": 2.899658671155375e-05, "loss": 0.0157, "step": 44920 }, { "epoch": 1.2604853415626316, "grad_norm": 3.954286813735962, "learning_rate": 2.8991910973956142e-05, "loss": 0.0551, "step": 44930 }, { "epoch": 1.2607658858184878, "grad_norm": 0.5086839199066162, "learning_rate": 2.8987235236358535e-05, "loss": 0.0259, "step": 44940 }, { "epoch": 1.2610464300743442, "grad_norm": 0.890838623046875, "learning_rate": 2.898255949876093e-05, "loss": 0.0226, "step": 44950 }, { "epoch": 1.2613269743302005, "grad_norm": 1.1469035148620605, "learning_rate": 2.8977883761163328e-05, "loss": 0.0415, "step": 44960 }, { "epoch": 1.261607518586057, "grad_norm": 3.5707826614379883, "learning_rate": 2.8973208023565718e-05, "loss": 0.0455, "step": 44970 }, { "epoch": 1.2618880628419133, "grad_norm": 0.04676514118909836, "learning_rate": 2.8968532285968114e-05, "loss": 0.0292, "step": 44980 }, { "epoch": 1.2621686070977698, "grad_norm": 1.906927227973938, "learning_rate": 2.8963856548370504e-05, "loss": 0.0288, "step": 44990 }, { "epoch": 1.262449151353626, "grad_norm": 0.04147890582680702, "learning_rate": 2.89591808107729e-05, "loss": 0.0428, "step": 45000 }, { "epoch": 1.2627296956094824, "grad_norm": 0.213868647813797, "learning_rate": 2.895450507317529e-05, "loss": 0.0087, "step": 45010 }, { "epoch": 1.2630102398653387, "grad_norm": 0.05743299424648285, "learning_rate": 2.8949829335577687e-05, "loss": 0.0274, "step": 45020 }, { "epoch": 1.263290784121195, "grad_norm": 1.0145937204360962, "learning_rate": 2.8945153597980084e-05, "loss": 0.0347, "step": 45030 }, { "epoch": 1.2635713283770516, "grad_norm": 0.01924419216811657, "learning_rate": 2.8940477860382477e-05, "loss": 0.0214, "step": 45040 }, { "epoch": 1.2638518726329078, "grad_norm": 11.26647663116455, "learning_rate": 2.8935802122784873e-05, "loss": 0.0496, "step": 45050 }, { "epoch": 1.2641324168887642, "grad_norm": 0.18105418980121613, "learning_rate": 2.8931126385187263e-05, "loss": 0.0302, "step": 45060 }, { "epoch": 1.2644129611446204, "grad_norm": 0.5069558024406433, "learning_rate": 2.892645064758966e-05, "loss": 0.0099, "step": 45070 }, { "epoch": 1.2646935054004769, "grad_norm": 0.5506256818771362, "learning_rate": 2.892177490999205e-05, "loss": 0.0534, "step": 45080 }, { "epoch": 1.2649740496563333, "grad_norm": 1.5798786878585815, "learning_rate": 2.8917099172394446e-05, "loss": 0.0142, "step": 45090 }, { "epoch": 1.2652545939121898, "grad_norm": 0.4116341471672058, "learning_rate": 2.8912423434796842e-05, "loss": 0.0347, "step": 45100 }, { "epoch": 1.265535138168046, "grad_norm": 0.1614706665277481, "learning_rate": 2.8907747697199232e-05, "loss": 0.0116, "step": 45110 }, { "epoch": 1.2658156824239024, "grad_norm": 0.32292667031288147, "learning_rate": 2.890307195960163e-05, "loss": 0.0517, "step": 45120 }, { "epoch": 1.2660962266797586, "grad_norm": 1.0312501192092896, "learning_rate": 2.8898396222004022e-05, "loss": 0.0492, "step": 45130 }, { "epoch": 1.266376770935615, "grad_norm": 0.13350659608840942, "learning_rate": 2.889372048440642e-05, "loss": 0.0209, "step": 45140 }, { "epoch": 1.2666573151914715, "grad_norm": 0.32558685541152954, "learning_rate": 2.888904474680881e-05, "loss": 0.0297, "step": 45150 }, { "epoch": 1.266937859447328, "grad_norm": 0.40824174880981445, "learning_rate": 2.8884369009211205e-05, "loss": 0.0127, "step": 45160 }, { "epoch": 1.2672184037031842, "grad_norm": 0.03448120877146721, "learning_rate": 2.88796932716136e-05, "loss": 0.0139, "step": 45170 }, { "epoch": 1.2674989479590406, "grad_norm": 0.426807701587677, "learning_rate": 2.887501753401599e-05, "loss": 0.0557, "step": 45180 }, { "epoch": 1.2677794922148968, "grad_norm": 0.1868773102760315, "learning_rate": 2.8870341796418388e-05, "loss": 0.0064, "step": 45190 }, { "epoch": 1.2680600364707533, "grad_norm": 0.0537264309823513, "learning_rate": 2.8865666058820777e-05, "loss": 0.0707, "step": 45200 }, { "epoch": 1.2683405807266097, "grad_norm": 0.2739847004413605, "learning_rate": 2.8860990321223174e-05, "loss": 0.0148, "step": 45210 }, { "epoch": 1.268621124982466, "grad_norm": 0.4356657862663269, "learning_rate": 2.885631458362557e-05, "loss": 0.0309, "step": 45220 }, { "epoch": 1.2689016692383224, "grad_norm": 0.07076182961463928, "learning_rate": 2.885163884602796e-05, "loss": 0.037, "step": 45230 }, { "epoch": 1.2691822134941786, "grad_norm": 0.3490826189517975, "learning_rate": 2.8846963108430357e-05, "loss": 0.0316, "step": 45240 }, { "epoch": 1.269462757750035, "grad_norm": 0.38742247223854065, "learning_rate": 2.884228737083275e-05, "loss": 0.0535, "step": 45250 }, { "epoch": 1.2697433020058915, "grad_norm": 0.3162526488304138, "learning_rate": 2.8837611633235147e-05, "loss": 0.0512, "step": 45260 }, { "epoch": 1.270023846261748, "grad_norm": 0.04589644819498062, "learning_rate": 2.8832935895637536e-05, "loss": 0.0078, "step": 45270 }, { "epoch": 1.2703043905176041, "grad_norm": 0.16576197743415833, "learning_rate": 2.8828260158039933e-05, "loss": 0.0239, "step": 45280 }, { "epoch": 1.2705849347734606, "grad_norm": 2.7081849575042725, "learning_rate": 2.882358442044233e-05, "loss": 0.0255, "step": 45290 }, { "epoch": 1.2708654790293168, "grad_norm": 0.05295514687895775, "learning_rate": 2.881890868284472e-05, "loss": 0.013, "step": 45300 }, { "epoch": 1.2711460232851732, "grad_norm": 0.03920577093958855, "learning_rate": 2.8814232945247116e-05, "loss": 0.0168, "step": 45310 }, { "epoch": 1.2714265675410297, "grad_norm": 0.02049187757074833, "learning_rate": 2.8809557207649505e-05, "loss": 0.0107, "step": 45320 }, { "epoch": 1.271707111796886, "grad_norm": 0.016405615955591202, "learning_rate": 2.8804881470051902e-05, "loss": 0.0156, "step": 45330 }, { "epoch": 1.2719876560527423, "grad_norm": 0.03038255125284195, "learning_rate": 2.8800205732454295e-05, "loss": 0.0116, "step": 45340 }, { "epoch": 1.2722682003085986, "grad_norm": 1.1832619905471802, "learning_rate": 2.879552999485669e-05, "loss": 0.0129, "step": 45350 }, { "epoch": 1.272548744564455, "grad_norm": 0.05738355964422226, "learning_rate": 2.8790854257259088e-05, "loss": 0.0497, "step": 45360 }, { "epoch": 1.2728292888203114, "grad_norm": 0.03821968287229538, "learning_rate": 2.8786178519661478e-05, "loss": 0.0112, "step": 45370 }, { "epoch": 1.2731098330761679, "grad_norm": 0.03117888793349266, "learning_rate": 2.8781502782063875e-05, "loss": 0.0097, "step": 45380 }, { "epoch": 1.273390377332024, "grad_norm": 0.043595947325229645, "learning_rate": 2.8776827044466264e-05, "loss": 0.0552, "step": 45390 }, { "epoch": 1.2736709215878805, "grad_norm": 0.2667463719844818, "learning_rate": 2.877215130686866e-05, "loss": 0.0507, "step": 45400 }, { "epoch": 1.2739514658437368, "grad_norm": 0.4495064318180084, "learning_rate": 2.876747556927105e-05, "loss": 0.0374, "step": 45410 }, { "epoch": 1.2742320100995932, "grad_norm": 0.15082037448883057, "learning_rate": 2.8762799831673447e-05, "loss": 0.0066, "step": 45420 }, { "epoch": 1.2745125543554496, "grad_norm": 0.04753878712654114, "learning_rate": 2.8758124094075844e-05, "loss": 0.0414, "step": 45430 }, { "epoch": 1.2747930986113059, "grad_norm": 0.03398888185620308, "learning_rate": 2.8753448356478237e-05, "loss": 0.0204, "step": 45440 }, { "epoch": 1.2750736428671623, "grad_norm": 0.2728842496871948, "learning_rate": 2.874877261888063e-05, "loss": 0.0345, "step": 45450 }, { "epoch": 1.2753541871230187, "grad_norm": 0.0807432234287262, "learning_rate": 2.8744096881283023e-05, "loss": 0.0473, "step": 45460 }, { "epoch": 1.275634731378875, "grad_norm": 3.3210508823394775, "learning_rate": 2.873942114368542e-05, "loss": 0.0154, "step": 45470 }, { "epoch": 1.2759152756347314, "grad_norm": 0.29243695735931396, "learning_rate": 2.873474540608781e-05, "loss": 0.0266, "step": 45480 }, { "epoch": 1.2761958198905878, "grad_norm": 0.6542925238609314, "learning_rate": 2.8730069668490206e-05, "loss": 0.0267, "step": 45490 }, { "epoch": 1.276476364146444, "grad_norm": 0.6942747235298157, "learning_rate": 2.8725393930892603e-05, "loss": 0.064, "step": 45500 }, { "epoch": 1.2767569084023005, "grad_norm": 0.9814334511756897, "learning_rate": 2.8720718193294992e-05, "loss": 0.0278, "step": 45510 }, { "epoch": 1.2770374526581567, "grad_norm": 0.17915277183055878, "learning_rate": 2.871604245569739e-05, "loss": 0.0224, "step": 45520 }, { "epoch": 1.2773179969140132, "grad_norm": 0.13169196248054504, "learning_rate": 2.871136671809978e-05, "loss": 0.0259, "step": 45530 }, { "epoch": 1.2775985411698696, "grad_norm": 0.11272983253002167, "learning_rate": 2.8706690980502175e-05, "loss": 0.0098, "step": 45540 }, { "epoch": 1.277879085425726, "grad_norm": 1.2739797830581665, "learning_rate": 2.870201524290457e-05, "loss": 0.0273, "step": 45550 }, { "epoch": 1.2781596296815823, "grad_norm": 0.05971289053559303, "learning_rate": 2.8697339505306965e-05, "loss": 0.0128, "step": 45560 }, { "epoch": 1.2784401739374387, "grad_norm": 0.10536950081586838, "learning_rate": 2.869266376770936e-05, "loss": 0.0249, "step": 45570 }, { "epoch": 1.278720718193295, "grad_norm": 0.0817098617553711, "learning_rate": 2.868798803011175e-05, "loss": 0.0209, "step": 45580 }, { "epoch": 1.2790012624491514, "grad_norm": 1.0190472602844238, "learning_rate": 2.8683312292514148e-05, "loss": 0.0407, "step": 45590 }, { "epoch": 1.2792818067050078, "grad_norm": 0.30848342180252075, "learning_rate": 2.8678636554916537e-05, "loss": 0.0105, "step": 45600 }, { "epoch": 1.279562350960864, "grad_norm": 0.015224620699882507, "learning_rate": 2.8673960817318934e-05, "loss": 0.012, "step": 45610 }, { "epoch": 1.2798428952167205, "grad_norm": 0.023583075031638145, "learning_rate": 2.8669285079721324e-05, "loss": 0.0062, "step": 45620 }, { "epoch": 1.2801234394725767, "grad_norm": 0.01594860479235649, "learning_rate": 2.866460934212372e-05, "loss": 0.0056, "step": 45630 }, { "epoch": 1.2804039837284331, "grad_norm": 0.02212408371269703, "learning_rate": 2.8659933604526117e-05, "loss": 0.0246, "step": 45640 }, { "epoch": 1.2806845279842896, "grad_norm": 5.1358418464660645, "learning_rate": 2.865525786692851e-05, "loss": 0.0325, "step": 45650 }, { "epoch": 1.280965072240146, "grad_norm": 1.0965781211853027, "learning_rate": 2.8650582129330907e-05, "loss": 0.0344, "step": 45660 }, { "epoch": 1.2812456164960022, "grad_norm": 0.4422471523284912, "learning_rate": 2.8645906391733296e-05, "loss": 0.0094, "step": 45670 }, { "epoch": 1.2815261607518587, "grad_norm": 0.1297796368598938, "learning_rate": 2.8641230654135693e-05, "loss": 0.0362, "step": 45680 }, { "epoch": 1.2818067050077149, "grad_norm": 0.011504077352583408, "learning_rate": 2.8636554916538083e-05, "loss": 0.0083, "step": 45690 }, { "epoch": 1.2820872492635713, "grad_norm": 0.10813495516777039, "learning_rate": 2.863187917894048e-05, "loss": 0.0557, "step": 45700 }, { "epoch": 1.2823677935194278, "grad_norm": 0.0952569991350174, "learning_rate": 2.8627203441342876e-05, "loss": 0.0615, "step": 45710 }, { "epoch": 1.282648337775284, "grad_norm": 1.1823806762695312, "learning_rate": 2.8622527703745266e-05, "loss": 0.0372, "step": 45720 }, { "epoch": 1.2829288820311404, "grad_norm": 0.15605078637599945, "learning_rate": 2.8617851966147662e-05, "loss": 0.0337, "step": 45730 }, { "epoch": 1.2832094262869966, "grad_norm": 1.3499336242675781, "learning_rate": 2.8613176228550055e-05, "loss": 0.0276, "step": 45740 }, { "epoch": 1.283489970542853, "grad_norm": 0.4734554588794708, "learning_rate": 2.8608500490952452e-05, "loss": 0.0464, "step": 45750 }, { "epoch": 1.2837705147987095, "grad_norm": 0.18891894817352295, "learning_rate": 2.860382475335484e-05, "loss": 0.0146, "step": 45760 }, { "epoch": 1.284051059054566, "grad_norm": 0.57065349817276, "learning_rate": 2.8599149015757238e-05, "loss": 0.0265, "step": 45770 }, { "epoch": 1.2843316033104222, "grad_norm": 0.28053998947143555, "learning_rate": 2.8594473278159635e-05, "loss": 0.0228, "step": 45780 }, { "epoch": 1.2846121475662786, "grad_norm": 0.022059038281440735, "learning_rate": 2.8589797540562024e-05, "loss": 0.0598, "step": 45790 }, { "epoch": 1.2848926918221348, "grad_norm": 0.24698852002620697, "learning_rate": 2.858512180296442e-05, "loss": 0.0223, "step": 45800 }, { "epoch": 1.2851732360779913, "grad_norm": 0.27085092663764954, "learning_rate": 2.858044606536681e-05, "loss": 0.0205, "step": 45810 }, { "epoch": 1.2854537803338477, "grad_norm": 0.3753688931465149, "learning_rate": 2.8575770327769207e-05, "loss": 0.006, "step": 45820 }, { "epoch": 1.2857343245897042, "grad_norm": 0.31165677309036255, "learning_rate": 2.85710945901716e-05, "loss": 0.0377, "step": 45830 }, { "epoch": 1.2860148688455604, "grad_norm": 0.1936056911945343, "learning_rate": 2.8566418852573994e-05, "loss": 0.0306, "step": 45840 }, { "epoch": 1.2862954131014168, "grad_norm": 0.07773367315530777, "learning_rate": 2.856174311497639e-05, "loss": 0.0097, "step": 45850 }, { "epoch": 1.286575957357273, "grad_norm": 0.04357993230223656, "learning_rate": 2.8557067377378783e-05, "loss": 0.0446, "step": 45860 }, { "epoch": 1.2868565016131295, "grad_norm": 0.13718023896217346, "learning_rate": 2.855239163978118e-05, "loss": 0.0196, "step": 45870 }, { "epoch": 1.287137045868986, "grad_norm": 0.03833355754613876, "learning_rate": 2.854771590218357e-05, "loss": 0.0133, "step": 45880 }, { "epoch": 1.2874175901248421, "grad_norm": 0.0121604660525918, "learning_rate": 2.8543040164585966e-05, "loss": 0.0098, "step": 45890 }, { "epoch": 1.2876981343806986, "grad_norm": 0.018703097477555275, "learning_rate": 2.8538364426988356e-05, "loss": 0.0039, "step": 45900 }, { "epoch": 1.2879786786365548, "grad_norm": 4.861170768737793, "learning_rate": 2.8533688689390752e-05, "loss": 0.0491, "step": 45910 }, { "epoch": 1.2882592228924112, "grad_norm": 0.020043672993779182, "learning_rate": 2.852901295179315e-05, "loss": 0.04, "step": 45920 }, { "epoch": 1.2885397671482677, "grad_norm": 1.572436809539795, "learning_rate": 2.852433721419554e-05, "loss": 0.0216, "step": 45930 }, { "epoch": 1.2888203114041241, "grad_norm": 0.045233823359012604, "learning_rate": 2.8519661476597935e-05, "loss": 0.0309, "step": 45940 }, { "epoch": 1.2891008556599803, "grad_norm": 0.1512812077999115, "learning_rate": 2.851498573900033e-05, "loss": 0.0126, "step": 45950 }, { "epoch": 1.2893813999158368, "grad_norm": 0.04127606749534607, "learning_rate": 2.8510310001402725e-05, "loss": 0.0156, "step": 45960 }, { "epoch": 1.289661944171693, "grad_norm": 0.5070045590400696, "learning_rate": 2.8505634263805115e-05, "loss": 0.0244, "step": 45970 }, { "epoch": 1.2899424884275494, "grad_norm": 0.03197487071156502, "learning_rate": 2.850095852620751e-05, "loss": 0.035, "step": 45980 }, { "epoch": 1.2902230326834059, "grad_norm": 0.3503648638725281, "learning_rate": 2.8496282788609908e-05, "loss": 0.0186, "step": 45990 }, { "epoch": 1.290503576939262, "grad_norm": 0.07654712349176407, "learning_rate": 2.8491607051012298e-05, "loss": 0.032, "step": 46000 }, { "epoch": 1.2907841211951185, "grad_norm": 0.38633981347084045, "learning_rate": 2.8486931313414694e-05, "loss": 0.0428, "step": 46010 }, { "epoch": 1.2910646654509748, "grad_norm": 0.23698757588863373, "learning_rate": 2.8482255575817084e-05, "loss": 0.0203, "step": 46020 }, { "epoch": 1.2913452097068312, "grad_norm": 14.146244049072266, "learning_rate": 2.847757983821948e-05, "loss": 0.0122, "step": 46030 }, { "epoch": 1.2916257539626876, "grad_norm": 0.8980849981307983, "learning_rate": 2.8472904100621874e-05, "loss": 0.0411, "step": 46040 }, { "epoch": 1.291906298218544, "grad_norm": 0.11468005180358887, "learning_rate": 2.846822836302427e-05, "loss": 0.0154, "step": 46050 }, { "epoch": 1.2921868424744003, "grad_norm": 0.6236281991004944, "learning_rate": 2.8463552625426663e-05, "loss": 0.0151, "step": 46060 }, { "epoch": 1.2924673867302567, "grad_norm": 0.10580503940582275, "learning_rate": 2.8458876887829056e-05, "loss": 0.0165, "step": 46070 }, { "epoch": 1.292747930986113, "grad_norm": 0.3278772532939911, "learning_rate": 2.8454201150231453e-05, "loss": 0.0302, "step": 46080 }, { "epoch": 1.2930284752419694, "grad_norm": 0.3010091781616211, "learning_rate": 2.8449525412633843e-05, "loss": 0.016, "step": 46090 }, { "epoch": 1.2933090194978258, "grad_norm": 0.4428342282772064, "learning_rate": 2.844484967503624e-05, "loss": 0.0206, "step": 46100 }, { "epoch": 1.2935895637536823, "grad_norm": 0.020195595920085907, "learning_rate": 2.844017393743863e-05, "loss": 0.0286, "step": 46110 }, { "epoch": 1.2938701080095385, "grad_norm": 0.7732749581336975, "learning_rate": 2.8435498199841026e-05, "loss": 0.0469, "step": 46120 }, { "epoch": 1.294150652265395, "grad_norm": 0.03304283320903778, "learning_rate": 2.8430822462243422e-05, "loss": 0.0387, "step": 46130 }, { "epoch": 1.2944311965212512, "grad_norm": 1.5050208568572998, "learning_rate": 2.8426146724645812e-05, "loss": 0.0486, "step": 46140 }, { "epoch": 1.2947117407771076, "grad_norm": 0.17233578860759735, "learning_rate": 2.842147098704821e-05, "loss": 0.029, "step": 46150 }, { "epoch": 1.294992285032964, "grad_norm": 1.522118091583252, "learning_rate": 2.84167952494506e-05, "loss": 0.0481, "step": 46160 }, { "epoch": 1.2952728292888203, "grad_norm": 0.08656130731105804, "learning_rate": 2.8412119511852998e-05, "loss": 0.0541, "step": 46170 }, { "epoch": 1.2955533735446767, "grad_norm": 0.2849079370498657, "learning_rate": 2.8407443774255388e-05, "loss": 0.0318, "step": 46180 }, { "epoch": 1.295833917800533, "grad_norm": 0.2412468045949936, "learning_rate": 2.8402768036657784e-05, "loss": 0.0166, "step": 46190 }, { "epoch": 1.2961144620563894, "grad_norm": 0.36465975642204285, "learning_rate": 2.839809229906018e-05, "loss": 0.0163, "step": 46200 }, { "epoch": 1.2963950063122458, "grad_norm": 0.9331910014152527, "learning_rate": 2.839341656146257e-05, "loss": 0.046, "step": 46210 }, { "epoch": 1.2966755505681022, "grad_norm": 2.336860179901123, "learning_rate": 2.8388740823864967e-05, "loss": 0.0192, "step": 46220 }, { "epoch": 1.2969560948239585, "grad_norm": 0.17960380017757416, "learning_rate": 2.8384065086267357e-05, "loss": 0.0262, "step": 46230 }, { "epoch": 1.297236639079815, "grad_norm": 1.6616206169128418, "learning_rate": 2.8379389348669754e-05, "loss": 0.0701, "step": 46240 }, { "epoch": 1.2975171833356711, "grad_norm": 0.07994398474693298, "learning_rate": 2.8374713611072147e-05, "loss": 0.0373, "step": 46250 }, { "epoch": 1.2977977275915276, "grad_norm": 0.10136464238166809, "learning_rate": 2.8370037873474543e-05, "loss": 0.0168, "step": 46260 }, { "epoch": 1.298078271847384, "grad_norm": 0.054792456328868866, "learning_rate": 2.836536213587694e-05, "loss": 0.0317, "step": 46270 }, { "epoch": 1.2983588161032402, "grad_norm": 0.23358173668384552, "learning_rate": 2.836068639827933e-05, "loss": 0.011, "step": 46280 }, { "epoch": 1.2986393603590967, "grad_norm": 0.027192743495106697, "learning_rate": 2.8356010660681726e-05, "loss": 0.0083, "step": 46290 }, { "epoch": 1.2989199046149529, "grad_norm": 0.7836595177650452, "learning_rate": 2.8351334923084116e-05, "loss": 0.0123, "step": 46300 }, { "epoch": 1.2992004488708093, "grad_norm": 0.023824887350201607, "learning_rate": 2.8346659185486513e-05, "loss": 0.0283, "step": 46310 }, { "epoch": 1.2994809931266658, "grad_norm": 0.06468398123979568, "learning_rate": 2.8341983447888902e-05, "loss": 0.0129, "step": 46320 }, { "epoch": 1.2997615373825222, "grad_norm": 0.15393120050430298, "learning_rate": 2.83373077102913e-05, "loss": 0.0534, "step": 46330 }, { "epoch": 1.3000420816383784, "grad_norm": 0.06767649203538895, "learning_rate": 2.8332631972693695e-05, "loss": 0.0057, "step": 46340 }, { "epoch": 1.3003226258942349, "grad_norm": 0.47961559891700745, "learning_rate": 2.832795623509609e-05, "loss": 0.0139, "step": 46350 }, { "epoch": 1.300603170150091, "grad_norm": 0.5090774297714233, "learning_rate": 2.832328049749848e-05, "loss": 0.0199, "step": 46360 }, { "epoch": 1.3008837144059475, "grad_norm": 0.05096210166811943, "learning_rate": 2.8318604759900875e-05, "loss": 0.0317, "step": 46370 }, { "epoch": 1.301164258661804, "grad_norm": 0.1863255500793457, "learning_rate": 2.831392902230327e-05, "loss": 0.0472, "step": 46380 }, { "epoch": 1.3014448029176602, "grad_norm": 0.6182000041007996, "learning_rate": 2.830925328470566e-05, "loss": 0.0184, "step": 46390 }, { "epoch": 1.3017253471735166, "grad_norm": 1.0372710227966309, "learning_rate": 2.8304577547108058e-05, "loss": 0.0193, "step": 46400 }, { "epoch": 1.302005891429373, "grad_norm": 0.26424258947372437, "learning_rate": 2.8299901809510454e-05, "loss": 0.0525, "step": 46410 }, { "epoch": 1.3022864356852293, "grad_norm": 0.20331887900829315, "learning_rate": 2.8295226071912844e-05, "loss": 0.0171, "step": 46420 }, { "epoch": 1.3025669799410857, "grad_norm": 0.019600559026002884, "learning_rate": 2.829055033431524e-05, "loss": 0.0253, "step": 46430 }, { "epoch": 1.3028475241969422, "grad_norm": 1.035272479057312, "learning_rate": 2.828587459671763e-05, "loss": 0.0185, "step": 46440 }, { "epoch": 1.3031280684527984, "grad_norm": 0.5141303539276123, "learning_rate": 2.8281198859120027e-05, "loss": 0.0176, "step": 46450 }, { "epoch": 1.3034086127086548, "grad_norm": 0.01545674167573452, "learning_rate": 2.827652312152242e-05, "loss": 0.0379, "step": 46460 }, { "epoch": 1.303689156964511, "grad_norm": 0.1832205057144165, "learning_rate": 2.8271847383924817e-05, "loss": 0.0088, "step": 46470 }, { "epoch": 1.3039697012203675, "grad_norm": 0.1305547058582306, "learning_rate": 2.8267171646327213e-05, "loss": 0.0151, "step": 46480 }, { "epoch": 1.304250245476224, "grad_norm": 0.0683896616101265, "learning_rate": 2.8262495908729603e-05, "loss": 0.0623, "step": 46490 }, { "epoch": 1.3045307897320804, "grad_norm": 0.030194271355867386, "learning_rate": 2.8257820171132e-05, "loss": 0.0188, "step": 46500 }, { "epoch": 1.3048113339879366, "grad_norm": 0.5682063698768616, "learning_rate": 2.825314443353439e-05, "loss": 0.0025, "step": 46510 }, { "epoch": 1.305091878243793, "grad_norm": 0.040222376585006714, "learning_rate": 2.8248468695936786e-05, "loss": 0.033, "step": 46520 }, { "epoch": 1.3053724224996492, "grad_norm": 0.4564909338951111, "learning_rate": 2.8243792958339175e-05, "loss": 0.0092, "step": 46530 }, { "epoch": 1.3056529667555057, "grad_norm": 0.018502501770853996, "learning_rate": 2.8239117220741572e-05, "loss": 0.0202, "step": 46540 }, { "epoch": 1.3059335110113621, "grad_norm": 0.03443314880132675, "learning_rate": 2.823444148314397e-05, "loss": 0.0033, "step": 46550 }, { "epoch": 1.3062140552672183, "grad_norm": 5.822821140289307, "learning_rate": 2.8229765745546362e-05, "loss": 0.0302, "step": 46560 }, { "epoch": 1.3064945995230748, "grad_norm": 1.6192352771759033, "learning_rate": 2.8225090007948758e-05, "loss": 0.0301, "step": 46570 }, { "epoch": 1.306775143778931, "grad_norm": 0.08631166070699692, "learning_rate": 2.8220414270351148e-05, "loss": 0.007, "step": 46580 }, { "epoch": 1.3070556880347874, "grad_norm": 0.09764288365840912, "learning_rate": 2.8215738532753545e-05, "loss": 0.0098, "step": 46590 }, { "epoch": 1.3073362322906439, "grad_norm": 0.06430576741695404, "learning_rate": 2.8211062795155934e-05, "loss": 0.0425, "step": 46600 }, { "epoch": 1.3076167765465003, "grad_norm": 2.007176160812378, "learning_rate": 2.820638705755833e-05, "loss": 0.0155, "step": 46610 }, { "epoch": 1.3078973208023565, "grad_norm": 0.04233159124851227, "learning_rate": 2.8201711319960727e-05, "loss": 0.0197, "step": 46620 }, { "epoch": 1.308177865058213, "grad_norm": 0.21983540058135986, "learning_rate": 2.8197035582363117e-05, "loss": 0.009, "step": 46630 }, { "epoch": 1.3084584093140692, "grad_norm": 0.6272803544998169, "learning_rate": 2.8192359844765514e-05, "loss": 0.0292, "step": 46640 }, { "epoch": 1.3087389535699256, "grad_norm": 0.06261756271123886, "learning_rate": 2.8187684107167907e-05, "loss": 0.0078, "step": 46650 }, { "epoch": 1.309019497825782, "grad_norm": 0.01050383411347866, "learning_rate": 2.8183008369570303e-05, "loss": 0.0045, "step": 46660 }, { "epoch": 1.3093000420816383, "grad_norm": 0.0851958841085434, "learning_rate": 2.8178332631972693e-05, "loss": 0.0559, "step": 46670 }, { "epoch": 1.3095805863374947, "grad_norm": 0.028751855716109276, "learning_rate": 2.817365689437509e-05, "loss": 0.0286, "step": 46680 }, { "epoch": 1.309861130593351, "grad_norm": 0.08945401012897491, "learning_rate": 2.8168981156777486e-05, "loss": 0.0435, "step": 46690 }, { "epoch": 1.3101416748492074, "grad_norm": 2.297555685043335, "learning_rate": 2.8164305419179876e-05, "loss": 0.0266, "step": 46700 }, { "epoch": 1.3104222191050638, "grad_norm": 0.01354247611016035, "learning_rate": 2.8159629681582273e-05, "loss": 0.0076, "step": 46710 }, { "epoch": 1.3107027633609203, "grad_norm": 3.700949192047119, "learning_rate": 2.8154953943984662e-05, "loss": 0.0392, "step": 46720 }, { "epoch": 1.3109833076167765, "grad_norm": 0.039632268249988556, "learning_rate": 2.815027820638706e-05, "loss": 0.0228, "step": 46730 }, { "epoch": 1.311263851872633, "grad_norm": 0.2500867545604706, "learning_rate": 2.8145602468789452e-05, "loss": 0.0449, "step": 46740 }, { "epoch": 1.3115443961284892, "grad_norm": 0.5885361433029175, "learning_rate": 2.8140926731191845e-05, "loss": 0.0622, "step": 46750 }, { "epoch": 1.3118249403843456, "grad_norm": 0.12325531244277954, "learning_rate": 2.8136250993594242e-05, "loss": 0.022, "step": 46760 }, { "epoch": 1.312105484640202, "grad_norm": 0.1400998830795288, "learning_rate": 2.8131575255996635e-05, "loss": 0.025, "step": 46770 }, { "epoch": 1.3123860288960585, "grad_norm": 0.2030801624059677, "learning_rate": 2.812689951839903e-05, "loss": 0.0121, "step": 46780 }, { "epoch": 1.3126665731519147, "grad_norm": 0.24484366178512573, "learning_rate": 2.812222378080142e-05, "loss": 0.0377, "step": 46790 }, { "epoch": 1.3129471174077711, "grad_norm": 0.25251317024230957, "learning_rate": 2.8117548043203818e-05, "loss": 0.0319, "step": 46800 }, { "epoch": 1.3132276616636274, "grad_norm": 0.5118898153305054, "learning_rate": 2.8112872305606208e-05, "loss": 0.0208, "step": 46810 }, { "epoch": 1.3135082059194838, "grad_norm": 0.038684867322444916, "learning_rate": 2.8108196568008604e-05, "loss": 0.0103, "step": 46820 }, { "epoch": 1.3137887501753402, "grad_norm": 1.2356188297271729, "learning_rate": 2.8103520830411e-05, "loss": 0.0257, "step": 46830 }, { "epoch": 1.3140692944311965, "grad_norm": 0.6436450481414795, "learning_rate": 2.809884509281339e-05, "loss": 0.0354, "step": 46840 }, { "epoch": 1.314349838687053, "grad_norm": 0.014781120233237743, "learning_rate": 2.8094169355215787e-05, "loss": 0.0143, "step": 46850 }, { "epoch": 1.3146303829429091, "grad_norm": 2.8775501251220703, "learning_rate": 2.808949361761818e-05, "loss": 0.0229, "step": 46860 }, { "epoch": 1.3149109271987656, "grad_norm": 0.13073542714118958, "learning_rate": 2.8084817880020577e-05, "loss": 0.0288, "step": 46870 }, { "epoch": 1.315191471454622, "grad_norm": 0.0766410306096077, "learning_rate": 2.8080142142422966e-05, "loss": 0.0157, "step": 46880 }, { "epoch": 1.3154720157104784, "grad_norm": 0.12096800655126572, "learning_rate": 2.8075466404825363e-05, "loss": 0.0227, "step": 46890 }, { "epoch": 1.3157525599663347, "grad_norm": 0.043053027242422104, "learning_rate": 2.807079066722776e-05, "loss": 0.0206, "step": 46900 }, { "epoch": 1.316033104222191, "grad_norm": 0.36174848675727844, "learning_rate": 2.806611492963015e-05, "loss": 0.0217, "step": 46910 }, { "epoch": 1.3163136484780473, "grad_norm": 0.06297751516103745, "learning_rate": 2.8061439192032546e-05, "loss": 0.0198, "step": 46920 }, { "epoch": 1.3165941927339038, "grad_norm": 0.060304250568151474, "learning_rate": 2.8056763454434936e-05, "loss": 0.0296, "step": 46930 }, { "epoch": 1.3168747369897602, "grad_norm": 0.5197926759719849, "learning_rate": 2.8052087716837332e-05, "loss": 0.023, "step": 46940 }, { "epoch": 1.3171552812456164, "grad_norm": 0.8956174850463867, "learning_rate": 2.8047411979239725e-05, "loss": 0.011, "step": 46950 }, { "epoch": 1.3174358255014729, "grad_norm": 2.6350159645080566, "learning_rate": 2.8042736241642122e-05, "loss": 0.0362, "step": 46960 }, { "epoch": 1.317716369757329, "grad_norm": 0.16670070588588715, "learning_rate": 2.8038060504044515e-05, "loss": 0.0111, "step": 46970 }, { "epoch": 1.3179969140131855, "grad_norm": 0.13588932156562805, "learning_rate": 2.8033384766446908e-05, "loss": 0.0175, "step": 46980 }, { "epoch": 1.318277458269042, "grad_norm": 0.01741638034582138, "learning_rate": 2.8028709028849305e-05, "loss": 0.0215, "step": 46990 }, { "epoch": 1.3185580025248984, "grad_norm": 0.48721304535865784, "learning_rate": 2.8024033291251694e-05, "loss": 0.0138, "step": 47000 }, { "epoch": 1.3188385467807546, "grad_norm": 0.40345561504364014, "learning_rate": 2.801935755365409e-05, "loss": 0.0272, "step": 47010 }, { "epoch": 1.319119091036611, "grad_norm": 0.5329194664955139, "learning_rate": 2.801468181605648e-05, "loss": 0.039, "step": 47020 }, { "epoch": 1.3193996352924673, "grad_norm": 0.011006190441548824, "learning_rate": 2.8010006078458877e-05, "loss": 0.0357, "step": 47030 }, { "epoch": 1.3196801795483237, "grad_norm": 0.02933824062347412, "learning_rate": 2.8005330340861274e-05, "loss": 0.0126, "step": 47040 }, { "epoch": 1.3199607238041802, "grad_norm": 7.325852870941162, "learning_rate": 2.8000654603263664e-05, "loss": 0.0329, "step": 47050 }, { "epoch": 1.3202412680600364, "grad_norm": 0.04473373293876648, "learning_rate": 2.799597886566606e-05, "loss": 0.0215, "step": 47060 }, { "epoch": 1.3205218123158928, "grad_norm": 0.07445589452981949, "learning_rate": 2.7991303128068453e-05, "loss": 0.0174, "step": 47070 }, { "epoch": 1.3208023565717493, "grad_norm": 0.03204337880015373, "learning_rate": 2.798662739047085e-05, "loss": 0.0223, "step": 47080 }, { "epoch": 1.3210829008276055, "grad_norm": 0.0771259292960167, "learning_rate": 2.798195165287324e-05, "loss": 0.0196, "step": 47090 }, { "epoch": 1.321363445083462, "grad_norm": 1.604807734489441, "learning_rate": 2.7977275915275636e-05, "loss": 0.0302, "step": 47100 }, { "epoch": 1.3216439893393184, "grad_norm": 0.7908846735954285, "learning_rate": 2.7972600177678033e-05, "loss": 0.0532, "step": 47110 }, { "epoch": 1.3219245335951746, "grad_norm": 0.3958434462547302, "learning_rate": 2.7967924440080422e-05, "loss": 0.0284, "step": 47120 }, { "epoch": 1.322205077851031, "grad_norm": 0.15981954336166382, "learning_rate": 2.796324870248282e-05, "loss": 0.0172, "step": 47130 }, { "epoch": 1.3224856221068872, "grad_norm": 0.02430890128016472, "learning_rate": 2.795857296488521e-05, "loss": 0.0125, "step": 47140 }, { "epoch": 1.3227661663627437, "grad_norm": 0.09609247744083405, "learning_rate": 2.7953897227287605e-05, "loss": 0.025, "step": 47150 }, { "epoch": 1.3230467106186001, "grad_norm": 1.406921625137329, "learning_rate": 2.794922148969e-05, "loss": 0.0336, "step": 47160 }, { "epoch": 1.3233272548744566, "grad_norm": 0.18160369992256165, "learning_rate": 2.7944545752092395e-05, "loss": 0.0241, "step": 47170 }, { "epoch": 1.3236077991303128, "grad_norm": 0.5910586714744568, "learning_rate": 2.793987001449479e-05, "loss": 0.0103, "step": 47180 }, { "epoch": 1.3238883433861692, "grad_norm": 3.9393715858459473, "learning_rate": 2.793519427689718e-05, "loss": 0.0156, "step": 47190 }, { "epoch": 1.3241688876420254, "grad_norm": 0.5724700093269348, "learning_rate": 2.7930518539299578e-05, "loss": 0.0209, "step": 47200 }, { "epoch": 1.324449431897882, "grad_norm": 1.322705864906311, "learning_rate": 2.7925842801701968e-05, "loss": 0.0164, "step": 47210 }, { "epoch": 1.3247299761537383, "grad_norm": 0.27009403705596924, "learning_rate": 2.7921167064104364e-05, "loss": 0.0228, "step": 47220 }, { "epoch": 1.3250105204095945, "grad_norm": 0.027762269601225853, "learning_rate": 2.7916491326506754e-05, "loss": 0.0287, "step": 47230 }, { "epoch": 1.325291064665451, "grad_norm": 0.1968117654323578, "learning_rate": 2.791181558890915e-05, "loss": 0.0049, "step": 47240 }, { "epoch": 1.3255716089213072, "grad_norm": 0.1671702116727829, "learning_rate": 2.7907139851311547e-05, "loss": 0.0254, "step": 47250 }, { "epoch": 1.3258521531771637, "grad_norm": 0.2386711984872818, "learning_rate": 2.790246411371394e-05, "loss": 0.0188, "step": 47260 }, { "epoch": 1.32613269743302, "grad_norm": 0.09132695198059082, "learning_rate": 2.7897788376116333e-05, "loss": 0.0223, "step": 47270 }, { "epoch": 1.3264132416888765, "grad_norm": 2.64005184173584, "learning_rate": 2.7893112638518726e-05, "loss": 0.0284, "step": 47280 }, { "epoch": 1.3266937859447328, "grad_norm": 0.48944342136383057, "learning_rate": 2.7888436900921123e-05, "loss": 0.0165, "step": 47290 }, { "epoch": 1.3269743302005892, "grad_norm": 0.10389170795679092, "learning_rate": 2.7883761163323513e-05, "loss": 0.0346, "step": 47300 }, { "epoch": 1.3272548744564454, "grad_norm": 0.04288553446531296, "learning_rate": 2.787908542572591e-05, "loss": 0.0053, "step": 47310 }, { "epoch": 1.3275354187123019, "grad_norm": 0.6972091794013977, "learning_rate": 2.7874409688128306e-05, "loss": 0.0318, "step": 47320 }, { "epoch": 1.3278159629681583, "grad_norm": 0.01853141002357006, "learning_rate": 2.7869733950530696e-05, "loss": 0.02, "step": 47330 }, { "epoch": 1.3280965072240145, "grad_norm": 0.5843558311462402, "learning_rate": 2.7865058212933092e-05, "loss": 0.0283, "step": 47340 }, { "epoch": 1.328377051479871, "grad_norm": 0.01650955341756344, "learning_rate": 2.7860382475335482e-05, "loss": 0.0182, "step": 47350 }, { "epoch": 1.3286575957357274, "grad_norm": 0.008518857881426811, "learning_rate": 2.785570673773788e-05, "loss": 0.0404, "step": 47360 }, { "epoch": 1.3289381399915836, "grad_norm": 0.19747315347194672, "learning_rate": 2.785103100014027e-05, "loss": 0.0218, "step": 47370 }, { "epoch": 1.32921868424744, "grad_norm": 0.033811457455158234, "learning_rate": 2.7846355262542668e-05, "loss": 0.0079, "step": 47380 }, { "epoch": 1.3294992285032965, "grad_norm": 0.47358936071395874, "learning_rate": 2.7841679524945065e-05, "loss": 0.0582, "step": 47390 }, { "epoch": 1.3297797727591527, "grad_norm": 1.0740910768508911, "learning_rate": 2.7837003787347455e-05, "loss": 0.0308, "step": 47400 }, { "epoch": 1.3300603170150092, "grad_norm": 0.6095095276832581, "learning_rate": 2.783232804974985e-05, "loss": 0.0256, "step": 47410 }, { "epoch": 1.3303408612708654, "grad_norm": 0.23433594405651093, "learning_rate": 2.782765231215224e-05, "loss": 0.0651, "step": 47420 }, { "epoch": 1.3306214055267218, "grad_norm": 0.8222702145576477, "learning_rate": 2.7822976574554637e-05, "loss": 0.0384, "step": 47430 }, { "epoch": 1.3309019497825783, "grad_norm": 0.4395143389701843, "learning_rate": 2.7818300836957027e-05, "loss": 0.0208, "step": 47440 }, { "epoch": 1.3311824940384347, "grad_norm": 0.5559719204902649, "learning_rate": 2.7813625099359424e-05, "loss": 0.0219, "step": 47450 }, { "epoch": 1.331463038294291, "grad_norm": 0.08758668601512909, "learning_rate": 2.780894936176182e-05, "loss": 0.026, "step": 47460 }, { "epoch": 1.3317435825501474, "grad_norm": 0.24594330787658691, "learning_rate": 2.7804273624164213e-05, "loss": 0.0453, "step": 47470 }, { "epoch": 1.3320241268060036, "grad_norm": 0.26925426721572876, "learning_rate": 2.779959788656661e-05, "loss": 0.0342, "step": 47480 }, { "epoch": 1.33230467106186, "grad_norm": 0.06292334944009781, "learning_rate": 2.7794922148969e-05, "loss": 0.0213, "step": 47490 }, { "epoch": 1.3325852153177165, "grad_norm": 0.1819867491722107, "learning_rate": 2.7790246411371396e-05, "loss": 0.0259, "step": 47500 }, { "epoch": 1.3328657595735727, "grad_norm": 0.1711725890636444, "learning_rate": 2.7785570673773786e-05, "loss": 0.0452, "step": 47510 }, { "epoch": 1.3331463038294291, "grad_norm": 0.13287316262722015, "learning_rate": 2.7780894936176183e-05, "loss": 0.0253, "step": 47520 }, { "epoch": 1.3334268480852853, "grad_norm": 0.41421231627464294, "learning_rate": 2.777621919857858e-05, "loss": 0.0212, "step": 47530 }, { "epoch": 1.3337073923411418, "grad_norm": 0.06350927799940109, "learning_rate": 2.777154346098097e-05, "loss": 0.0267, "step": 47540 }, { "epoch": 1.3339879365969982, "grad_norm": 0.32192355394363403, "learning_rate": 2.7766867723383365e-05, "loss": 0.007, "step": 47550 }, { "epoch": 1.3342684808528547, "grad_norm": 0.23719419538974762, "learning_rate": 2.776219198578576e-05, "loss": 0.0172, "step": 47560 }, { "epoch": 1.3345490251087109, "grad_norm": 0.6909984946250916, "learning_rate": 2.7757516248188155e-05, "loss": 0.0317, "step": 47570 }, { "epoch": 1.3348295693645673, "grad_norm": 1.3428336381912231, "learning_rate": 2.7752840510590545e-05, "loss": 0.0532, "step": 47580 }, { "epoch": 1.3351101136204235, "grad_norm": 0.41074883937835693, "learning_rate": 2.774816477299294e-05, "loss": 0.0437, "step": 47590 }, { "epoch": 1.33539065787628, "grad_norm": 0.25657185912132263, "learning_rate": 2.7743489035395338e-05, "loss": 0.0523, "step": 47600 }, { "epoch": 1.3356712021321364, "grad_norm": 0.16306395828723907, "learning_rate": 2.7738813297797728e-05, "loss": 0.0111, "step": 47610 }, { "epoch": 1.3359517463879926, "grad_norm": 2.207017183303833, "learning_rate": 2.7734137560200124e-05, "loss": 0.0392, "step": 47620 }, { "epoch": 1.336232290643849, "grad_norm": 0.09783905744552612, "learning_rate": 2.7729461822602514e-05, "loss": 0.0138, "step": 47630 }, { "epoch": 1.3365128348997053, "grad_norm": 0.08459838479757309, "learning_rate": 2.772478608500491e-05, "loss": 0.0419, "step": 47640 }, { "epoch": 1.3367933791555617, "grad_norm": 0.06813335418701172, "learning_rate": 2.7720110347407304e-05, "loss": 0.0094, "step": 47650 }, { "epoch": 1.3370739234114182, "grad_norm": 0.05171770602464676, "learning_rate": 2.7715434609809697e-05, "loss": 0.0131, "step": 47660 }, { "epoch": 1.3373544676672746, "grad_norm": 0.031240815296769142, "learning_rate": 2.7710758872212093e-05, "loss": 0.0365, "step": 47670 }, { "epoch": 1.3376350119231308, "grad_norm": 2.471647024154663, "learning_rate": 2.7706083134614487e-05, "loss": 0.0134, "step": 47680 }, { "epoch": 1.3379155561789873, "grad_norm": 0.04029529169201851, "learning_rate": 2.7701407397016883e-05, "loss": 0.0177, "step": 47690 }, { "epoch": 1.3381961004348435, "grad_norm": 1.3440693616867065, "learning_rate": 2.7696731659419273e-05, "loss": 0.0292, "step": 47700 }, { "epoch": 1.3384766446907, "grad_norm": 0.03721412643790245, "learning_rate": 2.769205592182167e-05, "loss": 0.0228, "step": 47710 }, { "epoch": 1.3387571889465564, "grad_norm": 0.036645255982875824, "learning_rate": 2.7687380184224066e-05, "loss": 0.0147, "step": 47720 }, { "epoch": 1.3390377332024128, "grad_norm": 0.02427177131175995, "learning_rate": 2.7682704446626456e-05, "loss": 0.0301, "step": 47730 }, { "epoch": 1.339318277458269, "grad_norm": 0.02397093176841736, "learning_rate": 2.7678028709028852e-05, "loss": 0.0268, "step": 47740 }, { "epoch": 1.3395988217141255, "grad_norm": 1.1504154205322266, "learning_rate": 2.7673352971431242e-05, "loss": 0.0366, "step": 47750 }, { "epoch": 1.3398793659699817, "grad_norm": 0.022609278559684753, "learning_rate": 2.766867723383364e-05, "loss": 0.0037, "step": 47760 }, { "epoch": 1.3401599102258381, "grad_norm": 0.6571581363677979, "learning_rate": 2.7664001496236032e-05, "loss": 0.0341, "step": 47770 }, { "epoch": 1.3404404544816946, "grad_norm": 3.7678518295288086, "learning_rate": 2.7659325758638428e-05, "loss": 0.0581, "step": 47780 }, { "epoch": 1.3407209987375508, "grad_norm": 2.087810516357422, "learning_rate": 2.7654650021040825e-05, "loss": 0.0324, "step": 47790 }, { "epoch": 1.3410015429934072, "grad_norm": 0.27449628710746765, "learning_rate": 2.7649974283443215e-05, "loss": 0.0186, "step": 47800 }, { "epoch": 1.3412820872492635, "grad_norm": 0.057430416345596313, "learning_rate": 2.764529854584561e-05, "loss": 0.0229, "step": 47810 }, { "epoch": 1.34156263150512, "grad_norm": 0.38599109649658203, "learning_rate": 2.7640622808248e-05, "loss": 0.0309, "step": 47820 }, { "epoch": 1.3418431757609763, "grad_norm": 0.11395157128572464, "learning_rate": 2.7635947070650397e-05, "loss": 0.0233, "step": 47830 }, { "epoch": 1.3421237200168328, "grad_norm": 0.05932430550456047, "learning_rate": 2.7631271333052787e-05, "loss": 0.0419, "step": 47840 }, { "epoch": 1.342404264272689, "grad_norm": 0.1731729805469513, "learning_rate": 2.7626595595455184e-05, "loss": 0.0344, "step": 47850 }, { "epoch": 1.3426848085285454, "grad_norm": 0.6204673051834106, "learning_rate": 2.762191985785758e-05, "loss": 0.0177, "step": 47860 }, { "epoch": 1.3429653527844017, "grad_norm": 0.1608671396970749, "learning_rate": 2.7617244120259973e-05, "loss": 0.0117, "step": 47870 }, { "epoch": 1.343245897040258, "grad_norm": 0.2517675459384918, "learning_rate": 2.7612568382662367e-05, "loss": 0.0216, "step": 47880 }, { "epoch": 1.3435264412961145, "grad_norm": 1.0989398956298828, "learning_rate": 2.760789264506476e-05, "loss": 0.0303, "step": 47890 }, { "epoch": 1.3438069855519708, "grad_norm": 0.013108900748193264, "learning_rate": 2.7603216907467156e-05, "loss": 0.0053, "step": 47900 }, { "epoch": 1.3440875298078272, "grad_norm": 0.149906724691391, "learning_rate": 2.7598541169869546e-05, "loss": 0.0045, "step": 47910 }, { "epoch": 1.3443680740636834, "grad_norm": 0.1058618575334549, "learning_rate": 2.7593865432271943e-05, "loss": 0.0048, "step": 47920 }, { "epoch": 1.3446486183195399, "grad_norm": 0.033786822110414505, "learning_rate": 2.758918969467434e-05, "loss": 0.0674, "step": 47930 }, { "epoch": 1.3449291625753963, "grad_norm": 0.29184386134147644, "learning_rate": 2.758451395707673e-05, "loss": 0.0221, "step": 47940 }, { "epoch": 1.3452097068312527, "grad_norm": 0.01712607406079769, "learning_rate": 2.7579838219479125e-05, "loss": 0.0208, "step": 47950 }, { "epoch": 1.345490251087109, "grad_norm": 0.03176151588559151, "learning_rate": 2.7575162481881515e-05, "loss": 0.0153, "step": 47960 }, { "epoch": 1.3457707953429654, "grad_norm": 0.19253267347812653, "learning_rate": 2.7570486744283912e-05, "loss": 0.0079, "step": 47970 }, { "epoch": 1.3460513395988216, "grad_norm": 0.9465163350105286, "learning_rate": 2.7565811006686305e-05, "loss": 0.0135, "step": 47980 }, { "epoch": 1.346331883854678, "grad_norm": 1.2164132595062256, "learning_rate": 2.75611352690887e-05, "loss": 0.0272, "step": 47990 }, { "epoch": 1.3466124281105345, "grad_norm": 1.68341064453125, "learning_rate": 2.7556459531491098e-05, "loss": 0.017, "step": 48000 }, { "epoch": 1.3468929723663907, "grad_norm": 0.05157823860645294, "learning_rate": 2.7551783793893488e-05, "loss": 0.0274, "step": 48010 }, { "epoch": 1.3471735166222472, "grad_norm": 0.07938018441200256, "learning_rate": 2.7547108056295884e-05, "loss": 0.0137, "step": 48020 }, { "epoch": 1.3474540608781036, "grad_norm": 0.038767412304878235, "learning_rate": 2.7542432318698274e-05, "loss": 0.028, "step": 48030 }, { "epoch": 1.3477346051339598, "grad_norm": 0.18165118992328644, "learning_rate": 2.753775658110067e-05, "loss": 0.0256, "step": 48040 }, { "epoch": 1.3480151493898163, "grad_norm": 0.182061567902565, "learning_rate": 2.753308084350306e-05, "loss": 0.0084, "step": 48050 }, { "epoch": 1.3482956936456727, "grad_norm": 0.8956032991409302, "learning_rate": 2.7528405105905457e-05, "loss": 0.0495, "step": 48060 }, { "epoch": 1.348576237901529, "grad_norm": 0.05125115066766739, "learning_rate": 2.7523729368307854e-05, "loss": 0.0499, "step": 48070 }, { "epoch": 1.3488567821573854, "grad_norm": 0.18916964530944824, "learning_rate": 2.7519053630710247e-05, "loss": 0.0501, "step": 48080 }, { "epoch": 1.3491373264132416, "grad_norm": 0.06553048640489578, "learning_rate": 2.7514377893112643e-05, "loss": 0.0388, "step": 48090 }, { "epoch": 1.349417870669098, "grad_norm": 0.858971118927002, "learning_rate": 2.7509702155515033e-05, "loss": 0.0259, "step": 48100 }, { "epoch": 1.3496984149249545, "grad_norm": 0.08955861628055573, "learning_rate": 2.750502641791743e-05, "loss": 0.008, "step": 48110 }, { "epoch": 1.349978959180811, "grad_norm": 0.051787957549095154, "learning_rate": 2.750035068031982e-05, "loss": 0.0119, "step": 48120 }, { "epoch": 1.3502595034366671, "grad_norm": 0.4794917106628418, "learning_rate": 2.7495674942722216e-05, "loss": 0.0485, "step": 48130 }, { "epoch": 1.3505400476925236, "grad_norm": 0.8167123198509216, "learning_rate": 2.7490999205124612e-05, "loss": 0.018, "step": 48140 }, { "epoch": 1.3508205919483798, "grad_norm": 0.3585563004016876, "learning_rate": 2.7486323467527002e-05, "loss": 0.0226, "step": 48150 }, { "epoch": 1.3511011362042362, "grad_norm": 1.1519522666931152, "learning_rate": 2.74816477299294e-05, "loss": 0.0138, "step": 48160 }, { "epoch": 1.3513816804600927, "grad_norm": 0.9539960622787476, "learning_rate": 2.7476971992331792e-05, "loss": 0.0247, "step": 48170 }, { "epoch": 1.3516622247159489, "grad_norm": 0.061478931456804276, "learning_rate": 2.7472296254734185e-05, "loss": 0.0258, "step": 48180 }, { "epoch": 1.3519427689718053, "grad_norm": 0.13016098737716675, "learning_rate": 2.7467620517136578e-05, "loss": 0.0179, "step": 48190 }, { "epoch": 1.3522233132276615, "grad_norm": 0.018185697495937347, "learning_rate": 2.7462944779538975e-05, "loss": 0.0332, "step": 48200 }, { "epoch": 1.352503857483518, "grad_norm": 0.08491340279579163, "learning_rate": 2.745826904194137e-05, "loss": 0.0309, "step": 48210 }, { "epoch": 1.3527844017393744, "grad_norm": 0.157069131731987, "learning_rate": 2.745359330434376e-05, "loss": 0.0049, "step": 48220 }, { "epoch": 1.3530649459952309, "grad_norm": 0.035589005798101425, "learning_rate": 2.7448917566746158e-05, "loss": 0.0114, "step": 48230 }, { "epoch": 1.353345490251087, "grad_norm": 2.3384411334991455, "learning_rate": 2.7444241829148547e-05, "loss": 0.0143, "step": 48240 }, { "epoch": 1.3536260345069435, "grad_norm": 1.162889838218689, "learning_rate": 2.7439566091550944e-05, "loss": 0.0382, "step": 48250 }, { "epoch": 1.3539065787627997, "grad_norm": 1.0750290155410767, "learning_rate": 2.7434890353953334e-05, "loss": 0.0107, "step": 48260 }, { "epoch": 1.3541871230186562, "grad_norm": 0.17844291031360626, "learning_rate": 2.743021461635573e-05, "loss": 0.0142, "step": 48270 }, { "epoch": 1.3544676672745126, "grad_norm": 0.18633495271205902, "learning_rate": 2.7425538878758127e-05, "loss": 0.0302, "step": 48280 }, { "epoch": 1.3547482115303688, "grad_norm": 1.1641464233398438, "learning_rate": 2.742086314116052e-05, "loss": 0.0603, "step": 48290 }, { "epoch": 1.3550287557862253, "grad_norm": 0.07303762435913086, "learning_rate": 2.7416187403562916e-05, "loss": 0.0225, "step": 48300 }, { "epoch": 1.3553093000420817, "grad_norm": 0.9741249084472656, "learning_rate": 2.7411511665965306e-05, "loss": 0.0375, "step": 48310 }, { "epoch": 1.355589844297938, "grad_norm": 0.16693975031375885, "learning_rate": 2.7406835928367703e-05, "loss": 0.0119, "step": 48320 }, { "epoch": 1.3558703885537944, "grad_norm": 0.09677904844284058, "learning_rate": 2.7402160190770092e-05, "loss": 0.0226, "step": 48330 }, { "epoch": 1.3561509328096508, "grad_norm": 0.3235374987125397, "learning_rate": 2.739748445317249e-05, "loss": 0.0197, "step": 48340 }, { "epoch": 1.356431477065507, "grad_norm": 0.5085169672966003, "learning_rate": 2.7392808715574886e-05, "loss": 0.024, "step": 48350 }, { "epoch": 1.3567120213213635, "grad_norm": 0.1868014633655548, "learning_rate": 2.7388132977977275e-05, "loss": 0.0312, "step": 48360 }, { "epoch": 1.3569925655772197, "grad_norm": 0.15986749529838562, "learning_rate": 2.7383457240379672e-05, "loss": 0.0287, "step": 48370 }, { "epoch": 1.3572731098330761, "grad_norm": 0.5840250849723816, "learning_rate": 2.7378781502782065e-05, "loss": 0.0309, "step": 48380 }, { "epoch": 1.3575536540889326, "grad_norm": 0.1932104527950287, "learning_rate": 2.737410576518446e-05, "loss": 0.0217, "step": 48390 }, { "epoch": 1.357834198344789, "grad_norm": 0.30845752358436584, "learning_rate": 2.736943002758685e-05, "loss": 0.0433, "step": 48400 }, { "epoch": 1.3581147426006452, "grad_norm": 0.16411857306957245, "learning_rate": 2.7364754289989248e-05, "loss": 0.0517, "step": 48410 }, { "epoch": 1.3583952868565017, "grad_norm": 1.1003472805023193, "learning_rate": 2.7360078552391644e-05, "loss": 0.0286, "step": 48420 }, { "epoch": 1.358675831112358, "grad_norm": 0.02866501361131668, "learning_rate": 2.7355402814794034e-05, "loss": 0.0214, "step": 48430 }, { "epoch": 1.3589563753682143, "grad_norm": 0.08949778228998184, "learning_rate": 2.735072707719643e-05, "loss": 0.0262, "step": 48440 }, { "epoch": 1.3592369196240708, "grad_norm": 0.838287353515625, "learning_rate": 2.734605133959882e-05, "loss": 0.0464, "step": 48450 }, { "epoch": 1.359517463879927, "grad_norm": 0.43291202187538147, "learning_rate": 2.7341375602001217e-05, "loss": 0.0136, "step": 48460 }, { "epoch": 1.3597980081357834, "grad_norm": 0.1088818833231926, "learning_rate": 2.733669986440361e-05, "loss": 0.0159, "step": 48470 }, { "epoch": 1.3600785523916397, "grad_norm": 0.3663778007030487, "learning_rate": 2.7332024126806007e-05, "loss": 0.0138, "step": 48480 }, { "epoch": 1.360359096647496, "grad_norm": 0.03683999925851822, "learning_rate": 2.73273483892084e-05, "loss": 0.0261, "step": 48490 }, { "epoch": 1.3606396409033525, "grad_norm": 0.52592933177948, "learning_rate": 2.7322672651610793e-05, "loss": 0.0126, "step": 48500 }, { "epoch": 1.360920185159209, "grad_norm": 0.02077941596508026, "learning_rate": 2.731799691401319e-05, "loss": 0.0194, "step": 48510 }, { "epoch": 1.3612007294150652, "grad_norm": 0.3041991591453552, "learning_rate": 2.731332117641558e-05, "loss": 0.0328, "step": 48520 }, { "epoch": 1.3614812736709216, "grad_norm": 0.34277084469795227, "learning_rate": 2.7308645438817976e-05, "loss": 0.0432, "step": 48530 }, { "epoch": 1.3617618179267779, "grad_norm": 0.28364261984825134, "learning_rate": 2.7303969701220366e-05, "loss": 0.0275, "step": 48540 }, { "epoch": 1.3620423621826343, "grad_norm": 0.2325262725353241, "learning_rate": 2.7299293963622762e-05, "loss": 0.013, "step": 48550 }, { "epoch": 1.3623229064384907, "grad_norm": 0.039403416216373444, "learning_rate": 2.729461822602516e-05, "loss": 0.0077, "step": 48560 }, { "epoch": 1.362603450694347, "grad_norm": 0.6987770199775696, "learning_rate": 2.728994248842755e-05, "loss": 0.0372, "step": 48570 }, { "epoch": 1.3628839949502034, "grad_norm": 0.6653364300727844, "learning_rate": 2.7285266750829945e-05, "loss": 0.0171, "step": 48580 }, { "epoch": 1.3631645392060596, "grad_norm": 0.8390184044837952, "learning_rate": 2.7280591013232338e-05, "loss": 0.0227, "step": 48590 }, { "epoch": 1.363445083461916, "grad_norm": 0.02456027828156948, "learning_rate": 2.7275915275634735e-05, "loss": 0.0083, "step": 48600 }, { "epoch": 1.3637256277177725, "grad_norm": 0.5325160622596741, "learning_rate": 2.7271239538037125e-05, "loss": 0.0364, "step": 48610 }, { "epoch": 1.364006171973629, "grad_norm": 0.1712132692337036, "learning_rate": 2.726656380043952e-05, "loss": 0.0133, "step": 48620 }, { "epoch": 1.3642867162294852, "grad_norm": 1.9586206674575806, "learning_rate": 2.7261888062841918e-05, "loss": 0.0514, "step": 48630 }, { "epoch": 1.3645672604853416, "grad_norm": 0.20366930961608887, "learning_rate": 2.7257212325244307e-05, "loss": 0.0269, "step": 48640 }, { "epoch": 1.3648478047411978, "grad_norm": 0.08576969802379608, "learning_rate": 2.7252536587646704e-05, "loss": 0.0265, "step": 48650 }, { "epoch": 1.3651283489970543, "grad_norm": 0.8846347332000732, "learning_rate": 2.7247860850049094e-05, "loss": 0.0359, "step": 48660 }, { "epoch": 1.3654088932529107, "grad_norm": 1.4110909700393677, "learning_rate": 2.724318511245149e-05, "loss": 0.0209, "step": 48670 }, { "epoch": 1.3656894375087671, "grad_norm": 1.185761570930481, "learning_rate": 2.7238509374853883e-05, "loss": 0.0336, "step": 48680 }, { "epoch": 1.3659699817646234, "grad_norm": 0.14208540320396423, "learning_rate": 2.723383363725628e-05, "loss": 0.0209, "step": 48690 }, { "epoch": 1.3662505260204798, "grad_norm": 0.06494259834289551, "learning_rate": 2.7229157899658677e-05, "loss": 0.0498, "step": 48700 }, { "epoch": 1.366531070276336, "grad_norm": 0.17516398429870605, "learning_rate": 2.7224482162061066e-05, "loss": 0.0239, "step": 48710 }, { "epoch": 1.3668116145321925, "grad_norm": 0.05600280687212944, "learning_rate": 2.7219806424463463e-05, "loss": 0.0199, "step": 48720 }, { "epoch": 1.367092158788049, "grad_norm": 0.1645234078168869, "learning_rate": 2.7215130686865853e-05, "loss": 0.0153, "step": 48730 }, { "epoch": 1.3673727030439051, "grad_norm": 0.16271436214447021, "learning_rate": 2.721045494926825e-05, "loss": 0.0245, "step": 48740 }, { "epoch": 1.3676532472997616, "grad_norm": 0.03486526012420654, "learning_rate": 2.720577921167064e-05, "loss": 0.0179, "step": 48750 }, { "epoch": 1.3679337915556178, "grad_norm": 0.6166093945503235, "learning_rate": 2.7201103474073035e-05, "loss": 0.0481, "step": 48760 }, { "epoch": 1.3682143358114742, "grad_norm": 0.05118228495121002, "learning_rate": 2.7196427736475432e-05, "loss": 0.0027, "step": 48770 }, { "epoch": 1.3684948800673307, "grad_norm": 0.31795603036880493, "learning_rate": 2.7191751998877825e-05, "loss": 0.0287, "step": 48780 }, { "epoch": 1.368775424323187, "grad_norm": 0.10731039196252823, "learning_rate": 2.7187076261280218e-05, "loss": 0.0299, "step": 48790 }, { "epoch": 1.3690559685790433, "grad_norm": 0.6083626747131348, "learning_rate": 2.718240052368261e-05, "loss": 0.04, "step": 48800 }, { "epoch": 1.3693365128348998, "grad_norm": 0.11196576803922653, "learning_rate": 2.7177724786085008e-05, "loss": 0.0129, "step": 48810 }, { "epoch": 1.369617057090756, "grad_norm": 0.17671416699886322, "learning_rate": 2.7173049048487398e-05, "loss": 0.0094, "step": 48820 }, { "epoch": 1.3698976013466124, "grad_norm": 0.31551456451416016, "learning_rate": 2.7168373310889794e-05, "loss": 0.0486, "step": 48830 }, { "epoch": 1.3701781456024689, "grad_norm": 0.04758315905928612, "learning_rate": 2.716369757329219e-05, "loss": 0.0149, "step": 48840 }, { "epoch": 1.370458689858325, "grad_norm": 0.45107603073120117, "learning_rate": 2.715902183569458e-05, "loss": 0.0094, "step": 48850 }, { "epoch": 1.3707392341141815, "grad_norm": 0.034316547214984894, "learning_rate": 2.7154346098096977e-05, "loss": 0.0182, "step": 48860 }, { "epoch": 1.3710197783700377, "grad_norm": 0.10052264481782913, "learning_rate": 2.7149670360499367e-05, "loss": 0.0639, "step": 48870 }, { "epoch": 1.3713003226258942, "grad_norm": 0.1527678519487381, "learning_rate": 2.7144994622901763e-05, "loss": 0.0251, "step": 48880 }, { "epoch": 1.3715808668817506, "grad_norm": 0.11112553626298904, "learning_rate": 2.7140318885304157e-05, "loss": 0.014, "step": 48890 }, { "epoch": 1.371861411137607, "grad_norm": 0.264417439699173, "learning_rate": 2.7135643147706553e-05, "loss": 0.0322, "step": 48900 }, { "epoch": 1.3721419553934633, "grad_norm": 0.05568407475948334, "learning_rate": 2.713096741010895e-05, "loss": 0.0102, "step": 48910 }, { "epoch": 1.3724224996493197, "grad_norm": 0.04458872601389885, "learning_rate": 2.712629167251134e-05, "loss": 0.0121, "step": 48920 }, { "epoch": 1.372703043905176, "grad_norm": 0.5330876111984253, "learning_rate": 2.7121615934913736e-05, "loss": 0.0161, "step": 48930 }, { "epoch": 1.3729835881610324, "grad_norm": 0.5268858671188354, "learning_rate": 2.7116940197316126e-05, "loss": 0.023, "step": 48940 }, { "epoch": 1.3732641324168888, "grad_norm": 0.595011293888092, "learning_rate": 2.7112264459718522e-05, "loss": 0.0156, "step": 48950 }, { "epoch": 1.373544676672745, "grad_norm": 0.02665679156780243, "learning_rate": 2.7107588722120912e-05, "loss": 0.0342, "step": 48960 }, { "epoch": 1.3738252209286015, "grad_norm": 0.6137765645980835, "learning_rate": 2.710291298452331e-05, "loss": 0.0229, "step": 48970 }, { "epoch": 1.374105765184458, "grad_norm": 0.4164828062057495, "learning_rate": 2.7098237246925705e-05, "loss": 0.028, "step": 48980 }, { "epoch": 1.3743863094403141, "grad_norm": 0.45995503664016724, "learning_rate": 2.70935615093281e-05, "loss": 0.0338, "step": 48990 }, { "epoch": 1.3746668536961706, "grad_norm": 0.17296819388866425, "learning_rate": 2.7088885771730495e-05, "loss": 0.023, "step": 49000 }, { "epoch": 1.374947397952027, "grad_norm": 0.1751699149608612, "learning_rate": 2.7084210034132885e-05, "loss": 0.0432, "step": 49010 }, { "epoch": 1.3752279422078832, "grad_norm": 0.08534793555736542, "learning_rate": 2.707953429653528e-05, "loss": 0.0308, "step": 49020 }, { "epoch": 1.3755084864637397, "grad_norm": 1.2995353937149048, "learning_rate": 2.707485855893767e-05, "loss": 0.0212, "step": 49030 }, { "epoch": 1.375789030719596, "grad_norm": 0.11190236359834671, "learning_rate": 2.7070182821340067e-05, "loss": 0.0297, "step": 49040 }, { "epoch": 1.3760695749754523, "grad_norm": 0.03986916318535805, "learning_rate": 2.7065507083742464e-05, "loss": 0.0086, "step": 49050 }, { "epoch": 1.3763501192313088, "grad_norm": 0.34684446454048157, "learning_rate": 2.7060831346144854e-05, "loss": 0.04, "step": 49060 }, { "epoch": 1.3766306634871652, "grad_norm": 0.4253654181957245, "learning_rate": 2.705615560854725e-05, "loss": 0.0396, "step": 49070 }, { "epoch": 1.3769112077430214, "grad_norm": 0.37916815280914307, "learning_rate": 2.7051479870949644e-05, "loss": 0.0138, "step": 49080 }, { "epoch": 1.3771917519988779, "grad_norm": 0.7115842700004578, "learning_rate": 2.7046804133352037e-05, "loss": 0.0211, "step": 49090 }, { "epoch": 1.377472296254734, "grad_norm": 0.027738846838474274, "learning_rate": 2.704212839575443e-05, "loss": 0.0308, "step": 49100 }, { "epoch": 1.3777528405105905, "grad_norm": 0.3621184229850769, "learning_rate": 2.7037452658156826e-05, "loss": 0.0236, "step": 49110 }, { "epoch": 1.378033384766447, "grad_norm": 0.6063050627708435, "learning_rate": 2.7032776920559223e-05, "loss": 0.0105, "step": 49120 }, { "epoch": 1.3783139290223032, "grad_norm": 0.5464107394218445, "learning_rate": 2.7028101182961613e-05, "loss": 0.0083, "step": 49130 }, { "epoch": 1.3785944732781596, "grad_norm": 0.017409568652510643, "learning_rate": 2.702342544536401e-05, "loss": 0.0432, "step": 49140 }, { "epoch": 1.3788750175340159, "grad_norm": 0.022932324558496475, "learning_rate": 2.70187497077664e-05, "loss": 0.0049, "step": 49150 }, { "epoch": 1.3791555617898723, "grad_norm": 0.1116257831454277, "learning_rate": 2.7014073970168796e-05, "loss": 0.0111, "step": 49160 }, { "epoch": 1.3794361060457287, "grad_norm": 1.6291676759719849, "learning_rate": 2.7009398232571185e-05, "loss": 0.0261, "step": 49170 }, { "epoch": 1.3797166503015852, "grad_norm": 0.5271481871604919, "learning_rate": 2.7004722494973582e-05, "loss": 0.0409, "step": 49180 }, { "epoch": 1.3799971945574414, "grad_norm": 0.7079965472221375, "learning_rate": 2.700004675737598e-05, "loss": 0.0325, "step": 49190 }, { "epoch": 1.3802777388132978, "grad_norm": 2.5254595279693604, "learning_rate": 2.699537101977837e-05, "loss": 0.0744, "step": 49200 }, { "epoch": 1.380558283069154, "grad_norm": 0.11448118835687637, "learning_rate": 2.6990695282180768e-05, "loss": 0.0134, "step": 49210 }, { "epoch": 1.3808388273250105, "grad_norm": 0.3135840594768524, "learning_rate": 2.6986019544583158e-05, "loss": 0.0179, "step": 49220 }, { "epoch": 1.381119371580867, "grad_norm": 0.541144609451294, "learning_rate": 2.6981343806985554e-05, "loss": 0.0255, "step": 49230 }, { "epoch": 1.3813999158367232, "grad_norm": 0.07669491320848465, "learning_rate": 2.6976668069387944e-05, "loss": 0.0283, "step": 49240 }, { "epoch": 1.3816804600925796, "grad_norm": 0.17535974085330963, "learning_rate": 2.697199233179034e-05, "loss": 0.0194, "step": 49250 }, { "epoch": 1.3819610043484358, "grad_norm": 0.23365344107151031, "learning_rate": 2.6967316594192737e-05, "loss": 0.0114, "step": 49260 }, { "epoch": 1.3822415486042923, "grad_norm": 0.8045592308044434, "learning_rate": 2.6962640856595127e-05, "loss": 0.0375, "step": 49270 }, { "epoch": 1.3825220928601487, "grad_norm": 1.8073906898498535, "learning_rate": 2.6957965118997524e-05, "loss": 0.0317, "step": 49280 }, { "epoch": 1.3828026371160052, "grad_norm": 0.054956305772066116, "learning_rate": 2.6953289381399917e-05, "loss": 0.0299, "step": 49290 }, { "epoch": 1.3830831813718614, "grad_norm": 0.41100406646728516, "learning_rate": 2.6948613643802313e-05, "loss": 0.0214, "step": 49300 }, { "epoch": 1.3833637256277178, "grad_norm": 0.8066640496253967, "learning_rate": 2.6943937906204703e-05, "loss": 0.0219, "step": 49310 }, { "epoch": 1.383644269883574, "grad_norm": 0.9466854333877563, "learning_rate": 2.69392621686071e-05, "loss": 0.0155, "step": 49320 }, { "epoch": 1.3839248141394305, "grad_norm": 0.2417161911725998, "learning_rate": 2.6934586431009496e-05, "loss": 0.0085, "step": 49330 }, { "epoch": 1.384205358395287, "grad_norm": 0.4385662376880646, "learning_rate": 2.6929910693411886e-05, "loss": 0.0266, "step": 49340 }, { "epoch": 1.3844859026511434, "grad_norm": 2.936051607131958, "learning_rate": 2.6925234955814282e-05, "loss": 0.0454, "step": 49350 }, { "epoch": 1.3847664469069996, "grad_norm": 0.03127685934305191, "learning_rate": 2.6920559218216672e-05, "loss": 0.0278, "step": 49360 }, { "epoch": 1.385046991162856, "grad_norm": 0.24358081817626953, "learning_rate": 2.691588348061907e-05, "loss": 0.0281, "step": 49370 }, { "epoch": 1.3853275354187122, "grad_norm": 0.7396218180656433, "learning_rate": 2.6911207743021462e-05, "loss": 0.0148, "step": 49380 }, { "epoch": 1.3856080796745687, "grad_norm": 0.3579739034175873, "learning_rate": 2.690653200542386e-05, "loss": 0.0582, "step": 49390 }, { "epoch": 1.3858886239304251, "grad_norm": 0.27813974022865295, "learning_rate": 2.690185626782625e-05, "loss": 0.0222, "step": 49400 }, { "epoch": 1.3861691681862813, "grad_norm": 0.11116593331098557, "learning_rate": 2.6897180530228645e-05, "loss": 0.0302, "step": 49410 }, { "epoch": 1.3864497124421378, "grad_norm": 0.048539917916059494, "learning_rate": 2.689250479263104e-05, "loss": 0.0168, "step": 49420 }, { "epoch": 1.386730256697994, "grad_norm": 0.047034237533807755, "learning_rate": 2.688782905503343e-05, "loss": 0.0113, "step": 49430 }, { "epoch": 1.3870108009538504, "grad_norm": 0.4156951904296875, "learning_rate": 2.6883153317435828e-05, "loss": 0.0409, "step": 49440 }, { "epoch": 1.3872913452097069, "grad_norm": 0.5101163983345032, "learning_rate": 2.6878477579838217e-05, "loss": 0.0235, "step": 49450 }, { "epoch": 1.3875718894655633, "grad_norm": 1.3001493215560913, "learning_rate": 2.6873801842240614e-05, "loss": 0.0333, "step": 49460 }, { "epoch": 1.3878524337214195, "grad_norm": 0.028355872258543968, "learning_rate": 2.686912610464301e-05, "loss": 0.0082, "step": 49470 }, { "epoch": 1.388132977977276, "grad_norm": 3.7123234272003174, "learning_rate": 2.68644503670454e-05, "loss": 0.0293, "step": 49480 }, { "epoch": 1.3884135222331322, "grad_norm": 1.1283458471298218, "learning_rate": 2.6859774629447797e-05, "loss": 0.0268, "step": 49490 }, { "epoch": 1.3886940664889886, "grad_norm": 0.09546975791454315, "learning_rate": 2.685509889185019e-05, "loss": 0.0195, "step": 49500 }, { "epoch": 1.388974610744845, "grad_norm": 0.17363639175891876, "learning_rate": 2.6850423154252586e-05, "loss": 0.0147, "step": 49510 }, { "epoch": 1.3892551550007013, "grad_norm": 0.016054196283221245, "learning_rate": 2.6845747416654976e-05, "loss": 0.017, "step": 49520 }, { "epoch": 1.3895356992565577, "grad_norm": 5.512981414794922, "learning_rate": 2.6841071679057373e-05, "loss": 0.0206, "step": 49530 }, { "epoch": 1.389816243512414, "grad_norm": 0.02711617574095726, "learning_rate": 2.683639594145977e-05, "loss": 0.0193, "step": 49540 }, { "epoch": 1.3900967877682704, "grad_norm": 0.3942486345767975, "learning_rate": 2.683172020386216e-05, "loss": 0.0388, "step": 49550 }, { "epoch": 1.3903773320241268, "grad_norm": 0.10201511532068253, "learning_rate": 2.6827044466264556e-05, "loss": 0.0108, "step": 49560 }, { "epoch": 1.3906578762799833, "grad_norm": 0.39942723512649536, "learning_rate": 2.6822368728666945e-05, "loss": 0.0522, "step": 49570 }, { "epoch": 1.3909384205358395, "grad_norm": 0.25986534357070923, "learning_rate": 2.6817692991069342e-05, "loss": 0.0163, "step": 49580 }, { "epoch": 1.391218964791696, "grad_norm": 0.5817072987556458, "learning_rate": 2.6813017253471735e-05, "loss": 0.0448, "step": 49590 }, { "epoch": 1.3914995090475522, "grad_norm": 0.0774780884385109, "learning_rate": 2.680834151587413e-05, "loss": 0.0353, "step": 49600 }, { "epoch": 1.3917800533034086, "grad_norm": 0.15589335560798645, "learning_rate": 2.6803665778276528e-05, "loss": 0.056, "step": 49610 }, { "epoch": 1.392060597559265, "grad_norm": 1.6147576570510864, "learning_rate": 2.6798990040678918e-05, "loss": 0.0159, "step": 49620 }, { "epoch": 1.3923411418151213, "grad_norm": 0.3630450367927551, "learning_rate": 2.6794314303081314e-05, "loss": 0.0361, "step": 49630 }, { "epoch": 1.3926216860709777, "grad_norm": 1.5628905296325684, "learning_rate": 2.6789638565483704e-05, "loss": 0.026, "step": 49640 }, { "epoch": 1.3929022303268341, "grad_norm": 0.05441616475582123, "learning_rate": 2.67849628278861e-05, "loss": 0.029, "step": 49650 }, { "epoch": 1.3931827745826904, "grad_norm": 0.020267190411686897, "learning_rate": 2.678028709028849e-05, "loss": 0.0619, "step": 49660 }, { "epoch": 1.3934633188385468, "grad_norm": 0.57863849401474, "learning_rate": 2.6775611352690887e-05, "loss": 0.0147, "step": 49670 }, { "epoch": 1.3937438630944032, "grad_norm": 0.19167013466358185, "learning_rate": 2.6770935615093284e-05, "loss": 0.0272, "step": 49680 }, { "epoch": 1.3940244073502595, "grad_norm": 0.996508777141571, "learning_rate": 2.6766259877495677e-05, "loss": 0.0512, "step": 49690 }, { "epoch": 1.394304951606116, "grad_norm": 0.04718349501490593, "learning_rate": 2.676158413989807e-05, "loss": 0.0185, "step": 49700 }, { "epoch": 1.3945854958619721, "grad_norm": 0.550603985786438, "learning_rate": 2.6756908402300463e-05, "loss": 0.0194, "step": 49710 }, { "epoch": 1.3948660401178286, "grad_norm": 0.6553761959075928, "learning_rate": 2.675223266470286e-05, "loss": 0.04, "step": 49720 }, { "epoch": 1.395146584373685, "grad_norm": 0.06671661883592606, "learning_rate": 2.674755692710525e-05, "loss": 0.0138, "step": 49730 }, { "epoch": 1.3954271286295414, "grad_norm": 0.16225586831569672, "learning_rate": 2.6742881189507646e-05, "loss": 0.0341, "step": 49740 }, { "epoch": 1.3957076728853977, "grad_norm": 0.10636740922927856, "learning_rate": 2.6738205451910043e-05, "loss": 0.0202, "step": 49750 }, { "epoch": 1.395988217141254, "grad_norm": 0.7812747955322266, "learning_rate": 2.6733529714312432e-05, "loss": 0.0186, "step": 49760 }, { "epoch": 1.3962687613971103, "grad_norm": 0.07339810580015182, "learning_rate": 2.672885397671483e-05, "loss": 0.0243, "step": 49770 }, { "epoch": 1.3965493056529668, "grad_norm": 0.01595054566860199, "learning_rate": 2.672417823911722e-05, "loss": 0.0179, "step": 49780 }, { "epoch": 1.3968298499088232, "grad_norm": 1.4015262126922607, "learning_rate": 2.6719502501519615e-05, "loss": 0.0137, "step": 49790 }, { "epoch": 1.3971103941646794, "grad_norm": 0.04164804145693779, "learning_rate": 2.6714826763922008e-05, "loss": 0.0295, "step": 49800 }, { "epoch": 1.3973909384205359, "grad_norm": 0.03559322655200958, "learning_rate": 2.6710151026324405e-05, "loss": 0.0552, "step": 49810 }, { "epoch": 1.397671482676392, "grad_norm": 1.0384835004806519, "learning_rate": 2.67054752887268e-05, "loss": 0.0398, "step": 49820 }, { "epoch": 1.3979520269322485, "grad_norm": 0.17221976816654205, "learning_rate": 2.670079955112919e-05, "loss": 0.0307, "step": 49830 }, { "epoch": 1.398232571188105, "grad_norm": 0.10609009861946106, "learning_rate": 2.6696123813531588e-05, "loss": 0.0114, "step": 49840 }, { "epoch": 1.3985131154439614, "grad_norm": 0.09908809512853622, "learning_rate": 2.6691448075933977e-05, "loss": 0.0339, "step": 49850 }, { "epoch": 1.3987936596998176, "grad_norm": 2.698340654373169, "learning_rate": 2.6686772338336374e-05, "loss": 0.0533, "step": 49860 }, { "epoch": 1.399074203955674, "grad_norm": 11.437633514404297, "learning_rate": 2.6682096600738764e-05, "loss": 0.032, "step": 49870 }, { "epoch": 1.3993547482115303, "grad_norm": 0.38035207986831665, "learning_rate": 2.667742086314116e-05, "loss": 0.0232, "step": 49880 }, { "epoch": 1.3996352924673867, "grad_norm": 0.1194073036313057, "learning_rate": 2.6672745125543557e-05, "loss": 0.0132, "step": 49890 }, { "epoch": 1.3999158367232432, "grad_norm": 0.088436558842659, "learning_rate": 2.666806938794595e-05, "loss": 0.049, "step": 49900 }, { "epoch": 1.4001963809790994, "grad_norm": 0.38637715578079224, "learning_rate": 2.6663393650348347e-05, "loss": 0.0369, "step": 49910 }, { "epoch": 1.4004769252349558, "grad_norm": 0.5508727431297302, "learning_rate": 2.6658717912750736e-05, "loss": 0.0362, "step": 49920 }, { "epoch": 1.4007574694908123, "grad_norm": 0.5408927798271179, "learning_rate": 2.6654042175153133e-05, "loss": 0.0465, "step": 49930 }, { "epoch": 1.4010380137466685, "grad_norm": 0.7062782645225525, "learning_rate": 2.6649366437555523e-05, "loss": 0.038, "step": 49940 }, { "epoch": 1.401318558002525, "grad_norm": 0.41962069272994995, "learning_rate": 2.664469069995792e-05, "loss": 0.0192, "step": 49950 }, { "epoch": 1.4015991022583814, "grad_norm": 0.3941808342933655, "learning_rate": 2.6640014962360316e-05, "loss": 0.036, "step": 49960 }, { "epoch": 1.4018796465142376, "grad_norm": 0.053463079035282135, "learning_rate": 2.6635339224762705e-05, "loss": 0.0532, "step": 49970 }, { "epoch": 1.402160190770094, "grad_norm": 0.23770776391029358, "learning_rate": 2.6630663487165102e-05, "loss": 0.019, "step": 49980 }, { "epoch": 1.4024407350259502, "grad_norm": 0.23996026813983917, "learning_rate": 2.6625987749567495e-05, "loss": 0.0241, "step": 49990 }, { "epoch": 1.4027212792818067, "grad_norm": 2.727687358856201, "learning_rate": 2.662131201196989e-05, "loss": 0.0395, "step": 50000 }, { "epoch": 1.4030018235376631, "grad_norm": 0.2146313637495041, "learning_rate": 2.661663627437228e-05, "loss": 0.0173, "step": 50010 }, { "epoch": 1.4032823677935196, "grad_norm": 0.4274258613586426, "learning_rate": 2.6611960536774678e-05, "loss": 0.038, "step": 50020 }, { "epoch": 1.4035629120493758, "grad_norm": 0.16470183432102203, "learning_rate": 2.6607284799177075e-05, "loss": 0.0285, "step": 50030 }, { "epoch": 1.4038434563052322, "grad_norm": 0.1542780101299286, "learning_rate": 2.6602609061579464e-05, "loss": 0.0167, "step": 50040 }, { "epoch": 1.4041240005610884, "grad_norm": 0.03662445768713951, "learning_rate": 2.659793332398186e-05, "loss": 0.0196, "step": 50050 }, { "epoch": 1.4044045448169449, "grad_norm": 0.9722762107849121, "learning_rate": 2.659325758638425e-05, "loss": 0.0199, "step": 50060 }, { "epoch": 1.4046850890728013, "grad_norm": 0.0851346105337143, "learning_rate": 2.6588581848786647e-05, "loss": 0.0142, "step": 50070 }, { "epoch": 1.4049656333286575, "grad_norm": 0.024009142071008682, "learning_rate": 2.6583906111189037e-05, "loss": 0.0261, "step": 50080 }, { "epoch": 1.405246177584514, "grad_norm": 2.4625980854034424, "learning_rate": 2.6579230373591433e-05, "loss": 0.0265, "step": 50090 }, { "epoch": 1.4055267218403702, "grad_norm": 0.4950384497642517, "learning_rate": 2.657455463599383e-05, "loss": 0.0138, "step": 50100 }, { "epoch": 1.4058072660962266, "grad_norm": 0.036531563848257065, "learning_rate": 2.6569878898396223e-05, "loss": 0.0197, "step": 50110 }, { "epoch": 1.406087810352083, "grad_norm": 0.019838107749819756, "learning_rate": 2.656520316079862e-05, "loss": 0.0372, "step": 50120 }, { "epoch": 1.4063683546079395, "grad_norm": 3.081902503967285, "learning_rate": 2.656052742320101e-05, "loss": 0.0364, "step": 50130 }, { "epoch": 1.4066488988637957, "grad_norm": 0.0769173726439476, "learning_rate": 2.6555851685603406e-05, "loss": 0.0103, "step": 50140 }, { "epoch": 1.4069294431196522, "grad_norm": 0.23346151411533356, "learning_rate": 2.6551175948005796e-05, "loss": 0.0135, "step": 50150 }, { "epoch": 1.4072099873755084, "grad_norm": 0.4432452917098999, "learning_rate": 2.6546500210408192e-05, "loss": 0.0253, "step": 50160 }, { "epoch": 1.4074905316313648, "grad_norm": 0.10562209784984589, "learning_rate": 2.654182447281059e-05, "loss": 0.0182, "step": 50170 }, { "epoch": 1.4077710758872213, "grad_norm": 1.4745137691497803, "learning_rate": 2.653714873521298e-05, "loss": 0.0567, "step": 50180 }, { "epoch": 1.4080516201430775, "grad_norm": 0.08232466876506805, "learning_rate": 2.6532472997615375e-05, "loss": 0.0126, "step": 50190 }, { "epoch": 1.408332164398934, "grad_norm": 0.6084491610527039, "learning_rate": 2.652779726001777e-05, "loss": 0.0683, "step": 50200 }, { "epoch": 1.4086127086547902, "grad_norm": 0.7466539144515991, "learning_rate": 2.6523121522420165e-05, "loss": 0.0156, "step": 50210 }, { "epoch": 1.4088932529106466, "grad_norm": 0.06971678137779236, "learning_rate": 2.6518445784822555e-05, "loss": 0.026, "step": 50220 }, { "epoch": 1.409173797166503, "grad_norm": 0.07089842855930328, "learning_rate": 2.651377004722495e-05, "loss": 0.0186, "step": 50230 }, { "epoch": 1.4094543414223595, "grad_norm": 0.026526808738708496, "learning_rate": 2.6509094309627348e-05, "loss": 0.0097, "step": 50240 }, { "epoch": 1.4097348856782157, "grad_norm": 1.1943228244781494, "learning_rate": 2.6504418572029738e-05, "loss": 0.0293, "step": 50250 }, { "epoch": 1.4100154299340721, "grad_norm": 0.028830695897340775, "learning_rate": 2.6499742834432134e-05, "loss": 0.0353, "step": 50260 }, { "epoch": 1.4102959741899284, "grad_norm": 3.046213150024414, "learning_rate": 2.6495067096834524e-05, "loss": 0.0361, "step": 50270 }, { "epoch": 1.4105765184457848, "grad_norm": 0.059025902301073074, "learning_rate": 2.649039135923692e-05, "loss": 0.0093, "step": 50280 }, { "epoch": 1.4108570627016412, "grad_norm": 0.05533954128623009, "learning_rate": 2.6485715621639317e-05, "loss": 0.0272, "step": 50290 }, { "epoch": 1.4111376069574977, "grad_norm": 0.033633530139923096, "learning_rate": 2.648103988404171e-05, "loss": 0.0102, "step": 50300 }, { "epoch": 1.411418151213354, "grad_norm": 2.029003381729126, "learning_rate": 2.6476364146444103e-05, "loss": 0.0197, "step": 50310 }, { "epoch": 1.4116986954692103, "grad_norm": 0.06713908910751343, "learning_rate": 2.6471688408846496e-05, "loss": 0.0045, "step": 50320 }, { "epoch": 1.4119792397250666, "grad_norm": 0.026585064828395844, "learning_rate": 2.6467012671248893e-05, "loss": 0.0221, "step": 50330 }, { "epoch": 1.412259783980923, "grad_norm": 0.04917012155056, "learning_rate": 2.6462336933651283e-05, "loss": 0.0078, "step": 50340 }, { "epoch": 1.4125403282367794, "grad_norm": 0.85372394323349, "learning_rate": 2.645766119605368e-05, "loss": 0.0338, "step": 50350 }, { "epoch": 1.4128208724926357, "grad_norm": 0.19965626299381256, "learning_rate": 2.6452985458456076e-05, "loss": 0.0177, "step": 50360 }, { "epoch": 1.413101416748492, "grad_norm": 0.2403455674648285, "learning_rate": 2.6448309720858466e-05, "loss": 0.0227, "step": 50370 }, { "epoch": 1.4133819610043483, "grad_norm": 1.009139060974121, "learning_rate": 2.6443633983260862e-05, "loss": 0.0248, "step": 50380 }, { "epoch": 1.4136625052602048, "grad_norm": 0.05477530509233475, "learning_rate": 2.6438958245663252e-05, "loss": 0.0318, "step": 50390 }, { "epoch": 1.4139430495160612, "grad_norm": 0.0035304792691022158, "learning_rate": 2.643428250806565e-05, "loss": 0.023, "step": 50400 }, { "epoch": 1.4142235937719176, "grad_norm": 0.040720950812101364, "learning_rate": 2.642960677046804e-05, "loss": 0.0236, "step": 50410 }, { "epoch": 1.4145041380277739, "grad_norm": 10.836292266845703, "learning_rate": 2.6424931032870438e-05, "loss": 0.0271, "step": 50420 }, { "epoch": 1.4147846822836303, "grad_norm": 0.014839479699730873, "learning_rate": 2.6420255295272835e-05, "loss": 0.0273, "step": 50430 }, { "epoch": 1.4150652265394865, "grad_norm": 0.21522848308086395, "learning_rate": 2.6415579557675224e-05, "loss": 0.019, "step": 50440 }, { "epoch": 1.415345770795343, "grad_norm": 0.0723131000995636, "learning_rate": 2.641090382007762e-05, "loss": 0.0294, "step": 50450 }, { "epoch": 1.4156263150511994, "grad_norm": 0.041551217436790466, "learning_rate": 2.640622808248001e-05, "loss": 0.0253, "step": 50460 }, { "epoch": 1.4159068593070556, "grad_norm": 3.2474260330200195, "learning_rate": 2.6401552344882407e-05, "loss": 0.0409, "step": 50470 }, { "epoch": 1.416187403562912, "grad_norm": 0.027175676077604294, "learning_rate": 2.6396876607284797e-05, "loss": 0.0085, "step": 50480 }, { "epoch": 1.4164679478187683, "grad_norm": 0.2887386977672577, "learning_rate": 2.6392200869687194e-05, "loss": 0.0493, "step": 50490 }, { "epoch": 1.4167484920746247, "grad_norm": 0.1208145022392273, "learning_rate": 2.638752513208959e-05, "loss": 0.0259, "step": 50500 }, { "epoch": 1.4170290363304812, "grad_norm": 0.1024802029132843, "learning_rate": 2.6382849394491983e-05, "loss": 0.0098, "step": 50510 }, { "epoch": 1.4173095805863376, "grad_norm": 0.2791426181793213, "learning_rate": 2.637817365689438e-05, "loss": 0.0117, "step": 50520 }, { "epoch": 1.4175901248421938, "grad_norm": 0.20417147874832153, "learning_rate": 2.637349791929677e-05, "loss": 0.0193, "step": 50530 }, { "epoch": 1.4178706690980503, "grad_norm": 0.016047542914748192, "learning_rate": 2.6368822181699166e-05, "loss": 0.0116, "step": 50540 }, { "epoch": 1.4181512133539065, "grad_norm": 0.01891869492828846, "learning_rate": 2.6364146444101556e-05, "loss": 0.0063, "step": 50550 }, { "epoch": 1.418431757609763, "grad_norm": 0.01782085932791233, "learning_rate": 2.6359470706503952e-05, "loss": 0.0444, "step": 50560 }, { "epoch": 1.4187123018656194, "grad_norm": 0.9296126365661621, "learning_rate": 2.635479496890635e-05, "loss": 0.026, "step": 50570 }, { "epoch": 1.4189928461214756, "grad_norm": 0.46102845668792725, "learning_rate": 2.635011923130874e-05, "loss": 0.012, "step": 50580 }, { "epoch": 1.419273390377332, "grad_norm": 0.5316612124443054, "learning_rate": 2.6345443493711135e-05, "loss": 0.0531, "step": 50590 }, { "epoch": 1.4195539346331885, "grad_norm": 0.20826004445552826, "learning_rate": 2.634076775611353e-05, "loss": 0.0115, "step": 50600 }, { "epoch": 1.4198344788890447, "grad_norm": 0.02275259979069233, "learning_rate": 2.633609201851592e-05, "loss": 0.0179, "step": 50610 }, { "epoch": 1.4201150231449011, "grad_norm": 0.024719052016735077, "learning_rate": 2.6331416280918315e-05, "loss": 0.0114, "step": 50620 }, { "epoch": 1.4203955674007576, "grad_norm": 0.6217374205589294, "learning_rate": 2.632674054332071e-05, "loss": 0.0323, "step": 50630 }, { "epoch": 1.4206761116566138, "grad_norm": 0.01753092184662819, "learning_rate": 2.6322064805723108e-05, "loss": 0.0367, "step": 50640 }, { "epoch": 1.4209566559124702, "grad_norm": 0.029606515541672707, "learning_rate": 2.6317389068125498e-05, "loss": 0.023, "step": 50650 }, { "epoch": 1.4212372001683264, "grad_norm": 0.2649177610874176, "learning_rate": 2.6312713330527894e-05, "loss": 0.0099, "step": 50660 }, { "epoch": 1.4215177444241829, "grad_norm": 0.06289686262607574, "learning_rate": 2.6308037592930284e-05, "loss": 0.0048, "step": 50670 }, { "epoch": 1.4217982886800393, "grad_norm": 0.5304654836654663, "learning_rate": 2.630336185533268e-05, "loss": 0.0223, "step": 50680 }, { "epoch": 1.4220788329358958, "grad_norm": 0.589387059211731, "learning_rate": 2.629868611773507e-05, "loss": 0.025, "step": 50690 }, { "epoch": 1.422359377191752, "grad_norm": 0.30437156558036804, "learning_rate": 2.6294010380137467e-05, "loss": 0.032, "step": 50700 }, { "epoch": 1.4226399214476084, "grad_norm": 5.432443141937256, "learning_rate": 2.6289334642539863e-05, "loss": 0.0136, "step": 50710 }, { "epoch": 1.4229204657034646, "grad_norm": 0.01750400848686695, "learning_rate": 2.6284658904942256e-05, "loss": 0.009, "step": 50720 }, { "epoch": 1.423201009959321, "grad_norm": 0.4226152300834656, "learning_rate": 2.6279983167344653e-05, "loss": 0.0151, "step": 50730 }, { "epoch": 1.4234815542151775, "grad_norm": 1.7435098886489868, "learning_rate": 2.6275307429747043e-05, "loss": 0.0578, "step": 50740 }, { "epoch": 1.4237620984710337, "grad_norm": 0.14696498215198517, "learning_rate": 2.627063169214944e-05, "loss": 0.0369, "step": 50750 }, { "epoch": 1.4240426427268902, "grad_norm": 0.23606052994728088, "learning_rate": 2.626595595455183e-05, "loss": 0.0347, "step": 50760 }, { "epoch": 1.4243231869827464, "grad_norm": 0.6838308572769165, "learning_rate": 2.6261280216954226e-05, "loss": 0.0453, "step": 50770 }, { "epoch": 1.4246037312386028, "grad_norm": 0.3087862432003021, "learning_rate": 2.6256604479356622e-05, "loss": 0.0261, "step": 50780 }, { "epoch": 1.4248842754944593, "grad_norm": 1.6797515153884888, "learning_rate": 2.6251928741759012e-05, "loss": 0.0306, "step": 50790 }, { "epoch": 1.4251648197503157, "grad_norm": 0.3979516625404358, "learning_rate": 2.624725300416141e-05, "loss": 0.0161, "step": 50800 }, { "epoch": 1.425445364006172, "grad_norm": 0.17802831530570984, "learning_rate": 2.62425772665638e-05, "loss": 0.0117, "step": 50810 }, { "epoch": 1.4257259082620284, "grad_norm": 0.38653188943862915, "learning_rate": 2.6237901528966198e-05, "loss": 0.019, "step": 50820 }, { "epoch": 1.4260064525178846, "grad_norm": 0.03387337177991867, "learning_rate": 2.6233225791368588e-05, "loss": 0.0106, "step": 50830 }, { "epoch": 1.426286996773741, "grad_norm": 0.5029979944229126, "learning_rate": 2.6228550053770985e-05, "loss": 0.0147, "step": 50840 }, { "epoch": 1.4265675410295975, "grad_norm": 0.015370570123195648, "learning_rate": 2.622387431617338e-05, "loss": 0.0272, "step": 50850 }, { "epoch": 1.4268480852854537, "grad_norm": 3.3997631072998047, "learning_rate": 2.621919857857577e-05, "loss": 0.0275, "step": 50860 }, { "epoch": 1.4271286295413101, "grad_norm": 0.062341101467609406, "learning_rate": 2.6214522840978167e-05, "loss": 0.0417, "step": 50870 }, { "epoch": 1.4274091737971666, "grad_norm": 0.0897769182920456, "learning_rate": 2.6209847103380557e-05, "loss": 0.021, "step": 50880 }, { "epoch": 1.4276897180530228, "grad_norm": 0.4508386552333832, "learning_rate": 2.6205171365782954e-05, "loss": 0.0452, "step": 50890 }, { "epoch": 1.4279702623088792, "grad_norm": 0.059779614210128784, "learning_rate": 2.6200495628185347e-05, "loss": 0.042, "step": 50900 }, { "epoch": 1.4282508065647357, "grad_norm": 0.5120693445205688, "learning_rate": 2.619581989058774e-05, "loss": 0.0167, "step": 50910 }, { "epoch": 1.428531350820592, "grad_norm": 0.0889950692653656, "learning_rate": 2.6191144152990137e-05, "loss": 0.0199, "step": 50920 }, { "epoch": 1.4288118950764483, "grad_norm": 0.06727559864521027, "learning_rate": 2.618646841539253e-05, "loss": 0.0272, "step": 50930 }, { "epoch": 1.4290924393323046, "grad_norm": 0.13797542452812195, "learning_rate": 2.6181792677794926e-05, "loss": 0.066, "step": 50940 }, { "epoch": 1.429372983588161, "grad_norm": 0.9598727822303772, "learning_rate": 2.6177116940197316e-05, "loss": 0.0232, "step": 50950 }, { "epoch": 1.4296535278440174, "grad_norm": 0.9880724549293518, "learning_rate": 2.6172441202599713e-05, "loss": 0.0209, "step": 50960 }, { "epoch": 1.4299340720998739, "grad_norm": 0.1821417659521103, "learning_rate": 2.6167765465002102e-05, "loss": 0.0257, "step": 50970 }, { "epoch": 1.43021461635573, "grad_norm": 3.304236888885498, "learning_rate": 2.61630897274045e-05, "loss": 0.0108, "step": 50980 }, { "epoch": 1.4304951606115865, "grad_norm": 0.026944328099489212, "learning_rate": 2.6158413989806895e-05, "loss": 0.0136, "step": 50990 }, { "epoch": 1.4307757048674428, "grad_norm": 0.07142742723226547, "learning_rate": 2.6153738252209285e-05, "loss": 0.0353, "step": 51000 }, { "epoch": 1.4310562491232992, "grad_norm": 0.04272855445742607, "learning_rate": 2.6149062514611682e-05, "loss": 0.0284, "step": 51010 }, { "epoch": 1.4313367933791556, "grad_norm": 2.710966110229492, "learning_rate": 2.6144386777014075e-05, "loss": 0.0311, "step": 51020 }, { "epoch": 1.4316173376350119, "grad_norm": 0.15668204426765442, "learning_rate": 2.613971103941647e-05, "loss": 0.026, "step": 51030 }, { "epoch": 1.4318978818908683, "grad_norm": 0.9852071404457092, "learning_rate": 2.613503530181886e-05, "loss": 0.0398, "step": 51040 }, { "epoch": 1.4321784261467245, "grad_norm": 0.07005651295185089, "learning_rate": 2.6130359564221258e-05, "loss": 0.0174, "step": 51050 }, { "epoch": 1.432458970402581, "grad_norm": 0.10823917388916016, "learning_rate": 2.6125683826623654e-05, "loss": 0.0192, "step": 51060 }, { "epoch": 1.4327395146584374, "grad_norm": 0.6202117204666138, "learning_rate": 2.6121008089026044e-05, "loss": 0.0333, "step": 51070 }, { "epoch": 1.4330200589142938, "grad_norm": 0.13954098522663116, "learning_rate": 2.611633235142844e-05, "loss": 0.0091, "step": 51080 }, { "epoch": 1.43330060317015, "grad_norm": 0.4075353443622589, "learning_rate": 2.611165661383083e-05, "loss": 0.0321, "step": 51090 }, { "epoch": 1.4335811474260065, "grad_norm": 0.06590376049280167, "learning_rate": 2.6106980876233227e-05, "loss": 0.0076, "step": 51100 }, { "epoch": 1.4338616916818627, "grad_norm": 0.019411476328969002, "learning_rate": 2.610230513863562e-05, "loss": 0.0224, "step": 51110 }, { "epoch": 1.4341422359377192, "grad_norm": 0.06710942834615707, "learning_rate": 2.6097629401038017e-05, "loss": 0.018, "step": 51120 }, { "epoch": 1.4344227801935756, "grad_norm": 0.27346619963645935, "learning_rate": 2.6092953663440413e-05, "loss": 0.0058, "step": 51130 }, { "epoch": 1.4347033244494318, "grad_norm": 0.025585142895579338, "learning_rate": 2.6088277925842803e-05, "loss": 0.057, "step": 51140 }, { "epoch": 1.4349838687052883, "grad_norm": 0.336890310049057, "learning_rate": 2.60836021882452e-05, "loss": 0.0448, "step": 51150 }, { "epoch": 1.4352644129611445, "grad_norm": 0.008812511339783669, "learning_rate": 2.607892645064759e-05, "loss": 0.0188, "step": 51160 }, { "epoch": 1.435544957217001, "grad_norm": 0.07885196805000305, "learning_rate": 2.6074250713049986e-05, "loss": 0.007, "step": 51170 }, { "epoch": 1.4358255014728574, "grad_norm": 0.31331247091293335, "learning_rate": 2.6069574975452376e-05, "loss": 0.0181, "step": 51180 }, { "epoch": 1.4361060457287138, "grad_norm": 0.25504270195961, "learning_rate": 2.6064899237854772e-05, "loss": 0.0189, "step": 51190 }, { "epoch": 1.43638658998457, "grad_norm": 0.01913280598819256, "learning_rate": 2.606022350025717e-05, "loss": 0.0283, "step": 51200 }, { "epoch": 1.4366671342404265, "grad_norm": 1.186085820198059, "learning_rate": 2.6055547762659562e-05, "loss": 0.0382, "step": 51210 }, { "epoch": 1.4369476784962827, "grad_norm": 0.033071424812078476, "learning_rate": 2.6050872025061955e-05, "loss": 0.0118, "step": 51220 }, { "epoch": 1.4372282227521391, "grad_norm": 0.15201596915721893, "learning_rate": 2.6046196287464348e-05, "loss": 0.0124, "step": 51230 }, { "epoch": 1.4375087670079956, "grad_norm": 0.4471926987171173, "learning_rate": 2.6041520549866745e-05, "loss": 0.0105, "step": 51240 }, { "epoch": 1.437789311263852, "grad_norm": 0.2262798696756363, "learning_rate": 2.6036844812269134e-05, "loss": 0.029, "step": 51250 }, { "epoch": 1.4380698555197082, "grad_norm": 1.2679485082626343, "learning_rate": 2.603216907467153e-05, "loss": 0.0239, "step": 51260 }, { "epoch": 1.4383503997755647, "grad_norm": 0.12461350858211517, "learning_rate": 2.6027493337073927e-05, "loss": 0.0097, "step": 51270 }, { "epoch": 1.4386309440314209, "grad_norm": 0.32366836071014404, "learning_rate": 2.6022817599476317e-05, "loss": 0.0169, "step": 51280 }, { "epoch": 1.4389114882872773, "grad_norm": 0.22641460597515106, "learning_rate": 2.6018141861878714e-05, "loss": 0.018, "step": 51290 }, { "epoch": 1.4391920325431338, "grad_norm": 0.9724528789520264, "learning_rate": 2.6013466124281104e-05, "loss": 0.0446, "step": 51300 }, { "epoch": 1.43947257679899, "grad_norm": 0.36239030957221985, "learning_rate": 2.60087903866835e-05, "loss": 0.0197, "step": 51310 }, { "epoch": 1.4397531210548464, "grad_norm": 0.10971923172473907, "learning_rate": 2.6004114649085893e-05, "loss": 0.0197, "step": 51320 }, { "epoch": 1.4400336653107026, "grad_norm": 0.5581293106079102, "learning_rate": 2.599943891148829e-05, "loss": 0.0149, "step": 51330 }, { "epoch": 1.440314209566559, "grad_norm": 0.4333712160587311, "learning_rate": 2.5994763173890686e-05, "loss": 0.0056, "step": 51340 }, { "epoch": 1.4405947538224155, "grad_norm": 0.4260680079460144, "learning_rate": 2.5990087436293076e-05, "loss": 0.0396, "step": 51350 }, { "epoch": 1.440875298078272, "grad_norm": 0.1697925180196762, "learning_rate": 2.5985411698695473e-05, "loss": 0.0154, "step": 51360 }, { "epoch": 1.4411558423341282, "grad_norm": 0.04235608130693436, "learning_rate": 2.5980735961097862e-05, "loss": 0.0103, "step": 51370 }, { "epoch": 1.4414363865899846, "grad_norm": 0.06505939364433289, "learning_rate": 2.597606022350026e-05, "loss": 0.0129, "step": 51380 }, { "epoch": 1.4417169308458408, "grad_norm": 0.028723793104290962, "learning_rate": 2.597138448590265e-05, "loss": 0.0168, "step": 51390 }, { "epoch": 1.4419974751016973, "grad_norm": 0.08224623650312424, "learning_rate": 2.5966708748305045e-05, "loss": 0.0067, "step": 51400 }, { "epoch": 1.4422780193575537, "grad_norm": 0.05512593686580658, "learning_rate": 2.5962033010707442e-05, "loss": 0.0297, "step": 51410 }, { "epoch": 1.44255856361341, "grad_norm": 0.00463439105078578, "learning_rate": 2.5957357273109835e-05, "loss": 0.0303, "step": 51420 }, { "epoch": 1.4428391078692664, "grad_norm": 0.024968618527054787, "learning_rate": 2.595268153551223e-05, "loss": 0.0177, "step": 51430 }, { "epoch": 1.4431196521251226, "grad_norm": 0.02150682546198368, "learning_rate": 2.594800579791462e-05, "loss": 0.0052, "step": 51440 }, { "epoch": 1.443400196380979, "grad_norm": 0.7534814476966858, "learning_rate": 2.5943330060317018e-05, "loss": 0.0187, "step": 51450 }, { "epoch": 1.4436807406368355, "grad_norm": 0.5313477516174316, "learning_rate": 2.5938654322719408e-05, "loss": 0.0078, "step": 51460 }, { "epoch": 1.443961284892692, "grad_norm": 0.01860833540558815, "learning_rate": 2.5933978585121804e-05, "loss": 0.0432, "step": 51470 }, { "epoch": 1.4442418291485482, "grad_norm": 0.4036755859851837, "learning_rate": 2.59293028475242e-05, "loss": 0.0161, "step": 51480 }, { "epoch": 1.4445223734044046, "grad_norm": 0.027762562036514282, "learning_rate": 2.592462710992659e-05, "loss": 0.008, "step": 51490 }, { "epoch": 1.4448029176602608, "grad_norm": 0.17707593739032745, "learning_rate": 2.5919951372328987e-05, "loss": 0.0124, "step": 51500 }, { "epoch": 1.4450834619161173, "grad_norm": 0.021543025970458984, "learning_rate": 2.591527563473138e-05, "loss": 0.019, "step": 51510 }, { "epoch": 1.4453640061719737, "grad_norm": 2.400941848754883, "learning_rate": 2.5910599897133773e-05, "loss": 0.017, "step": 51520 }, { "epoch": 1.44564455042783, "grad_norm": 0.15102721750736237, "learning_rate": 2.5905924159536166e-05, "loss": 0.0125, "step": 51530 }, { "epoch": 1.4459250946836864, "grad_norm": 0.44746503233909607, "learning_rate": 2.5901248421938563e-05, "loss": 0.0355, "step": 51540 }, { "epoch": 1.4462056389395428, "grad_norm": 0.07337530702352524, "learning_rate": 2.589657268434096e-05, "loss": 0.0188, "step": 51550 }, { "epoch": 1.446486183195399, "grad_norm": 0.1615971028804779, "learning_rate": 2.589189694674335e-05, "loss": 0.0302, "step": 51560 }, { "epoch": 1.4467667274512555, "grad_norm": 0.06138620153069496, "learning_rate": 2.5887221209145746e-05, "loss": 0.0105, "step": 51570 }, { "epoch": 1.447047271707112, "grad_norm": 0.2634975016117096, "learning_rate": 2.5882545471548136e-05, "loss": 0.0144, "step": 51580 }, { "epoch": 1.4473278159629681, "grad_norm": 0.12946778535842896, "learning_rate": 2.5877869733950532e-05, "loss": 0.0194, "step": 51590 }, { "epoch": 1.4476083602188246, "grad_norm": 0.4371131956577301, "learning_rate": 2.5873193996352922e-05, "loss": 0.0294, "step": 51600 }, { "epoch": 1.4478889044746808, "grad_norm": 0.5186461806297302, "learning_rate": 2.586851825875532e-05, "loss": 0.0287, "step": 51610 }, { "epoch": 1.4481694487305372, "grad_norm": 0.058187492191791534, "learning_rate": 2.5863842521157715e-05, "loss": 0.0197, "step": 51620 }, { "epoch": 1.4484499929863937, "grad_norm": 0.035316500812768936, "learning_rate": 2.5859166783560108e-05, "loss": 0.0179, "step": 51630 }, { "epoch": 1.44873053724225, "grad_norm": 1.172711730003357, "learning_rate": 2.5854491045962505e-05, "loss": 0.0511, "step": 51640 }, { "epoch": 1.4490110814981063, "grad_norm": 0.8302263617515564, "learning_rate": 2.5849815308364894e-05, "loss": 0.0336, "step": 51650 }, { "epoch": 1.4492916257539628, "grad_norm": 0.1323886513710022, "learning_rate": 2.584513957076729e-05, "loss": 0.0323, "step": 51660 }, { "epoch": 1.449572170009819, "grad_norm": 0.9486472010612488, "learning_rate": 2.584046383316968e-05, "loss": 0.0203, "step": 51670 }, { "epoch": 1.4498527142656754, "grad_norm": 0.04528618976473808, "learning_rate": 2.5835788095572077e-05, "loss": 0.0469, "step": 51680 }, { "epoch": 1.4501332585215319, "grad_norm": 0.06603121012449265, "learning_rate": 2.5831112357974474e-05, "loss": 0.0215, "step": 51690 }, { "epoch": 1.450413802777388, "grad_norm": 0.7690286040306091, "learning_rate": 2.5826436620376864e-05, "loss": 0.0442, "step": 51700 }, { "epoch": 1.4506943470332445, "grad_norm": 0.10695253312587738, "learning_rate": 2.582176088277926e-05, "loss": 0.0182, "step": 51710 }, { "epoch": 1.4509748912891007, "grad_norm": 0.20395426452159882, "learning_rate": 2.5817085145181653e-05, "loss": 0.017, "step": 51720 }, { "epoch": 1.4512554355449572, "grad_norm": 0.19490322470664978, "learning_rate": 2.581240940758405e-05, "loss": 0.0146, "step": 51730 }, { "epoch": 1.4515359798008136, "grad_norm": 6.27593994140625, "learning_rate": 2.580773366998644e-05, "loss": 0.0123, "step": 51740 }, { "epoch": 1.45181652405667, "grad_norm": 1.9361222982406616, "learning_rate": 2.5803057932388836e-05, "loss": 0.0165, "step": 51750 }, { "epoch": 1.4520970683125263, "grad_norm": 1.3589696884155273, "learning_rate": 2.5798382194791233e-05, "loss": 0.0345, "step": 51760 }, { "epoch": 1.4523776125683827, "grad_norm": 1.0005395412445068, "learning_rate": 2.5793706457193622e-05, "loss": 0.0254, "step": 51770 }, { "epoch": 1.452658156824239, "grad_norm": 0.3351835310459137, "learning_rate": 2.578903071959602e-05, "loss": 0.0308, "step": 51780 }, { "epoch": 1.4529387010800954, "grad_norm": 0.6929600834846497, "learning_rate": 2.578435498199841e-05, "loss": 0.0251, "step": 51790 }, { "epoch": 1.4532192453359518, "grad_norm": 0.08223629742860794, "learning_rate": 2.5779679244400805e-05, "loss": 0.0418, "step": 51800 }, { "epoch": 1.453499789591808, "grad_norm": 0.034709520637989044, "learning_rate": 2.57750035068032e-05, "loss": 0.0104, "step": 51810 }, { "epoch": 1.4537803338476645, "grad_norm": 3.7675557136535645, "learning_rate": 2.577032776920559e-05, "loss": 0.0355, "step": 51820 }, { "epoch": 1.4540608781035207, "grad_norm": 0.051600515842437744, "learning_rate": 2.5765652031607988e-05, "loss": 0.0353, "step": 51830 }, { "epoch": 1.4543414223593771, "grad_norm": 0.3466702103614807, "learning_rate": 2.576097629401038e-05, "loss": 0.0203, "step": 51840 }, { "epoch": 1.4546219666152336, "grad_norm": 0.18435032665729523, "learning_rate": 2.5756300556412778e-05, "loss": 0.0381, "step": 51850 }, { "epoch": 1.45490251087109, "grad_norm": 0.46892741322517395, "learning_rate": 2.5751624818815168e-05, "loss": 0.0111, "step": 51860 }, { "epoch": 1.4551830551269462, "grad_norm": 2.111677646636963, "learning_rate": 2.5746949081217564e-05, "loss": 0.0193, "step": 51870 }, { "epoch": 1.4554635993828027, "grad_norm": 0.014678836800158024, "learning_rate": 2.5742273343619954e-05, "loss": 0.03, "step": 51880 }, { "epoch": 1.455744143638659, "grad_norm": 2.3352599143981934, "learning_rate": 2.573759760602235e-05, "loss": 0.0413, "step": 51890 }, { "epoch": 1.4560246878945153, "grad_norm": 0.6759806275367737, "learning_rate": 2.5732921868424747e-05, "loss": 0.0107, "step": 51900 }, { "epoch": 1.4563052321503718, "grad_norm": 0.04478086903691292, "learning_rate": 2.5728246130827137e-05, "loss": 0.0167, "step": 51910 }, { "epoch": 1.4565857764062282, "grad_norm": 0.49369940161705017, "learning_rate": 2.5723570393229533e-05, "loss": 0.0167, "step": 51920 }, { "epoch": 1.4568663206620844, "grad_norm": 0.6170856952667236, "learning_rate": 2.5718894655631927e-05, "loss": 0.0228, "step": 51930 }, { "epoch": 1.4571468649179409, "grad_norm": 0.2014194130897522, "learning_rate": 2.5714218918034323e-05, "loss": 0.0107, "step": 51940 }, { "epoch": 1.457427409173797, "grad_norm": 0.08112978935241699, "learning_rate": 2.5709543180436713e-05, "loss": 0.0375, "step": 51950 }, { "epoch": 1.4577079534296535, "grad_norm": 0.018676239997148514, "learning_rate": 2.570486744283911e-05, "loss": 0.0133, "step": 51960 }, { "epoch": 1.45798849768551, "grad_norm": 0.2264186441898346, "learning_rate": 2.5700191705241506e-05, "loss": 0.0097, "step": 51970 }, { "epoch": 1.4582690419413662, "grad_norm": 0.08803657442331314, "learning_rate": 2.5695515967643896e-05, "loss": 0.0141, "step": 51980 }, { "epoch": 1.4585495861972226, "grad_norm": 0.011564181186258793, "learning_rate": 2.5690840230046292e-05, "loss": 0.0114, "step": 51990 }, { "epoch": 1.4588301304530789, "grad_norm": 0.20520952343940735, "learning_rate": 2.5686164492448682e-05, "loss": 0.0107, "step": 52000 }, { "epoch": 1.4591106747089353, "grad_norm": 0.07577104866504669, "learning_rate": 2.568148875485108e-05, "loss": 0.0141, "step": 52010 }, { "epoch": 1.4593912189647917, "grad_norm": 0.12978121638298035, "learning_rate": 2.5676813017253472e-05, "loss": 0.0108, "step": 52020 }, { "epoch": 1.4596717632206482, "grad_norm": 0.8966216444969177, "learning_rate": 2.5672137279655868e-05, "loss": 0.0326, "step": 52030 }, { "epoch": 1.4599523074765044, "grad_norm": 0.07360166311264038, "learning_rate": 2.5667461542058265e-05, "loss": 0.0265, "step": 52040 }, { "epoch": 1.4602328517323608, "grad_norm": 0.22800996899604797, "learning_rate": 2.5662785804460655e-05, "loss": 0.0264, "step": 52050 }, { "epoch": 1.460513395988217, "grad_norm": 0.004467122722417116, "learning_rate": 2.565811006686305e-05, "loss": 0.017, "step": 52060 }, { "epoch": 1.4607939402440735, "grad_norm": 0.1636831909418106, "learning_rate": 2.565343432926544e-05, "loss": 0.0062, "step": 52070 }, { "epoch": 1.46107448449993, "grad_norm": 1.342974305152893, "learning_rate": 2.5648758591667837e-05, "loss": 0.0554, "step": 52080 }, { "epoch": 1.4613550287557862, "grad_norm": 0.18357905745506287, "learning_rate": 2.5644082854070227e-05, "loss": 0.0069, "step": 52090 }, { "epoch": 1.4616355730116426, "grad_norm": 0.06954538822174072, "learning_rate": 2.5639407116472624e-05, "loss": 0.015, "step": 52100 }, { "epoch": 1.4619161172674988, "grad_norm": 0.03172842040657997, "learning_rate": 2.563473137887502e-05, "loss": 0.0449, "step": 52110 }, { "epoch": 1.4621966615233553, "grad_norm": 1.0964953899383545, "learning_rate": 2.5630055641277413e-05, "loss": 0.0183, "step": 52120 }, { "epoch": 1.4624772057792117, "grad_norm": 0.948522686958313, "learning_rate": 2.5625379903679807e-05, "loss": 0.0314, "step": 52130 }, { "epoch": 1.4627577500350681, "grad_norm": 0.682992696762085, "learning_rate": 2.56207041660822e-05, "loss": 0.0472, "step": 52140 }, { "epoch": 1.4630382942909244, "grad_norm": 0.1411021649837494, "learning_rate": 2.5616028428484596e-05, "loss": 0.042, "step": 52150 }, { "epoch": 1.4633188385467808, "grad_norm": 0.49289247393608093, "learning_rate": 2.5611352690886986e-05, "loss": 0.0305, "step": 52160 }, { "epoch": 1.463599382802637, "grad_norm": 0.2595321238040924, "learning_rate": 2.5606676953289383e-05, "loss": 0.0129, "step": 52170 }, { "epoch": 1.4638799270584935, "grad_norm": 0.06564658135175705, "learning_rate": 2.560200121569178e-05, "loss": 0.0106, "step": 52180 }, { "epoch": 1.46416047131435, "grad_norm": 0.01658061519265175, "learning_rate": 2.559732547809417e-05, "loss": 0.0038, "step": 52190 }, { "epoch": 1.4644410155702061, "grad_norm": 0.05669078975915909, "learning_rate": 2.5592649740496565e-05, "loss": 0.0143, "step": 52200 }, { "epoch": 1.4647215598260626, "grad_norm": 0.2026432752609253, "learning_rate": 2.5587974002898955e-05, "loss": 0.0295, "step": 52210 }, { "epoch": 1.465002104081919, "grad_norm": 0.10466164350509644, "learning_rate": 2.5583298265301352e-05, "loss": 0.0127, "step": 52220 }, { "epoch": 1.4652826483377752, "grad_norm": 1.0376091003417969, "learning_rate": 2.5578622527703745e-05, "loss": 0.043, "step": 52230 }, { "epoch": 1.4655631925936317, "grad_norm": 2.787686586380005, "learning_rate": 2.557394679010614e-05, "loss": 0.036, "step": 52240 }, { "epoch": 1.465843736849488, "grad_norm": 0.022756878286600113, "learning_rate": 2.5569271052508538e-05, "loss": 0.007, "step": 52250 }, { "epoch": 1.4661242811053443, "grad_norm": 0.9440562129020691, "learning_rate": 2.5564595314910928e-05, "loss": 0.0466, "step": 52260 }, { "epoch": 1.4664048253612008, "grad_norm": 0.25275304913520813, "learning_rate": 2.5559919577313324e-05, "loss": 0.0083, "step": 52270 }, { "epoch": 1.466685369617057, "grad_norm": 0.6149752140045166, "learning_rate": 2.5555243839715714e-05, "loss": 0.032, "step": 52280 }, { "epoch": 1.4669659138729134, "grad_norm": 0.12428087741136551, "learning_rate": 2.555056810211811e-05, "loss": 0.0347, "step": 52290 }, { "epoch": 1.4672464581287699, "grad_norm": 0.17866668105125427, "learning_rate": 2.55458923645205e-05, "loss": 0.0044, "step": 52300 }, { "epoch": 1.4675270023846263, "grad_norm": 0.056787218898534775, "learning_rate": 2.5541216626922897e-05, "loss": 0.045, "step": 52310 }, { "epoch": 1.4678075466404825, "grad_norm": 0.15635336935520172, "learning_rate": 2.5536540889325293e-05, "loss": 0.0162, "step": 52320 }, { "epoch": 1.468088090896339, "grad_norm": 0.05932248383760452, "learning_rate": 2.5531865151727687e-05, "loss": 0.0155, "step": 52330 }, { "epoch": 1.4683686351521952, "grad_norm": 2.546820878982544, "learning_rate": 2.5527189414130083e-05, "loss": 0.0541, "step": 52340 }, { "epoch": 1.4686491794080516, "grad_norm": 2.3699567317962646, "learning_rate": 2.5522513676532473e-05, "loss": 0.0159, "step": 52350 }, { "epoch": 1.468929723663908, "grad_norm": 1.2999342679977417, "learning_rate": 2.551783793893487e-05, "loss": 0.0335, "step": 52360 }, { "epoch": 1.4692102679197643, "grad_norm": 0.06278746575117111, "learning_rate": 2.551316220133726e-05, "loss": 0.031, "step": 52370 }, { "epoch": 1.4694908121756207, "grad_norm": 0.42276230454444885, "learning_rate": 2.5508486463739656e-05, "loss": 0.0119, "step": 52380 }, { "epoch": 1.469771356431477, "grad_norm": 0.07679533958435059, "learning_rate": 2.5503810726142052e-05, "loss": 0.046, "step": 52390 }, { "epoch": 1.4700519006873334, "grad_norm": 0.030722441151738167, "learning_rate": 2.5499134988544442e-05, "loss": 0.0251, "step": 52400 }, { "epoch": 1.4703324449431898, "grad_norm": 0.030774788931012154, "learning_rate": 2.549445925094684e-05, "loss": 0.0067, "step": 52410 }, { "epoch": 1.4706129891990463, "grad_norm": 0.9747705459594727, "learning_rate": 2.5489783513349232e-05, "loss": 0.0299, "step": 52420 }, { "epoch": 1.4708935334549025, "grad_norm": 1.7286598682403564, "learning_rate": 2.5485107775751625e-05, "loss": 0.0174, "step": 52430 }, { "epoch": 1.471174077710759, "grad_norm": 0.13912831246852875, "learning_rate": 2.5480432038154018e-05, "loss": 0.0065, "step": 52440 }, { "epoch": 1.4714546219666151, "grad_norm": 0.22086872160434723, "learning_rate": 2.5475756300556415e-05, "loss": 0.0191, "step": 52450 }, { "epoch": 1.4717351662224716, "grad_norm": 0.9976252913475037, "learning_rate": 2.547108056295881e-05, "loss": 0.0179, "step": 52460 }, { "epoch": 1.472015710478328, "grad_norm": 0.4362272620201111, "learning_rate": 2.54664048253612e-05, "loss": 0.0152, "step": 52470 }, { "epoch": 1.4722962547341842, "grad_norm": 0.5263736248016357, "learning_rate": 2.5461729087763597e-05, "loss": 0.0347, "step": 52480 }, { "epoch": 1.4725767989900407, "grad_norm": 0.28230926394462585, "learning_rate": 2.5457053350165987e-05, "loss": 0.0146, "step": 52490 }, { "epoch": 1.4728573432458971, "grad_norm": 0.02749178372323513, "learning_rate": 2.5452377612568384e-05, "loss": 0.01, "step": 52500 }, { "epoch": 1.4731378875017533, "grad_norm": 0.008241027593612671, "learning_rate": 2.5447701874970774e-05, "loss": 0.0151, "step": 52510 }, { "epoch": 1.4734184317576098, "grad_norm": 0.15498395264148712, "learning_rate": 2.544302613737317e-05, "loss": 0.0069, "step": 52520 }, { "epoch": 1.4736989760134662, "grad_norm": 1.3460112810134888, "learning_rate": 2.5438350399775567e-05, "loss": 0.0316, "step": 52530 }, { "epoch": 1.4739795202693224, "grad_norm": 0.05144423618912697, "learning_rate": 2.543367466217796e-05, "loss": 0.007, "step": 52540 }, { "epoch": 1.4742600645251789, "grad_norm": 0.2937069535255432, "learning_rate": 2.5428998924580356e-05, "loss": 0.0172, "step": 52550 }, { "epoch": 1.474540608781035, "grad_norm": 4.390767574310303, "learning_rate": 2.5424323186982746e-05, "loss": 0.0321, "step": 52560 }, { "epoch": 1.4748211530368915, "grad_norm": 0.1546860933303833, "learning_rate": 2.5419647449385143e-05, "loss": 0.0255, "step": 52570 }, { "epoch": 1.475101697292748, "grad_norm": 0.0457792803645134, "learning_rate": 2.5414971711787532e-05, "loss": 0.0373, "step": 52580 }, { "epoch": 1.4753822415486044, "grad_norm": 1.8757297992706299, "learning_rate": 2.541029597418993e-05, "loss": 0.0556, "step": 52590 }, { "epoch": 1.4756627858044606, "grad_norm": 0.511297881603241, "learning_rate": 2.5405620236592326e-05, "loss": 0.0245, "step": 52600 }, { "epoch": 1.475943330060317, "grad_norm": 0.2725287675857544, "learning_rate": 2.5400944498994715e-05, "loss": 0.0126, "step": 52610 }, { "epoch": 1.4762238743161733, "grad_norm": 0.03500046581029892, "learning_rate": 2.5396268761397112e-05, "loss": 0.0166, "step": 52620 }, { "epoch": 1.4765044185720297, "grad_norm": 0.6519321799278259, "learning_rate": 2.5391593023799505e-05, "loss": 0.0433, "step": 52630 }, { "epoch": 1.4767849628278862, "grad_norm": 0.08953879028558731, "learning_rate": 2.53869172862019e-05, "loss": 0.023, "step": 52640 }, { "epoch": 1.4770655070837424, "grad_norm": 0.027801332995295525, "learning_rate": 2.538224154860429e-05, "loss": 0.0243, "step": 52650 }, { "epoch": 1.4773460513395988, "grad_norm": 0.10965706408023834, "learning_rate": 2.5377565811006688e-05, "loss": 0.0133, "step": 52660 }, { "epoch": 1.477626595595455, "grad_norm": 1.225110411643982, "learning_rate": 2.5372890073409084e-05, "loss": 0.0255, "step": 52670 }, { "epoch": 1.4779071398513115, "grad_norm": 0.874717116355896, "learning_rate": 2.5368214335811474e-05, "loss": 0.026, "step": 52680 }, { "epoch": 1.478187684107168, "grad_norm": 0.37715616822242737, "learning_rate": 2.536353859821387e-05, "loss": 0.0106, "step": 52690 }, { "epoch": 1.4784682283630244, "grad_norm": 0.26449599862098694, "learning_rate": 2.535886286061626e-05, "loss": 0.0193, "step": 52700 }, { "epoch": 1.4787487726188806, "grad_norm": 0.17270098626613617, "learning_rate": 2.5354187123018657e-05, "loss": 0.0222, "step": 52710 }, { "epoch": 1.479029316874737, "grad_norm": 0.20435623824596405, "learning_rate": 2.534951138542105e-05, "loss": 0.0153, "step": 52720 }, { "epoch": 1.4793098611305933, "grad_norm": 0.05299729108810425, "learning_rate": 2.5344835647823443e-05, "loss": 0.0143, "step": 52730 }, { "epoch": 1.4795904053864497, "grad_norm": 0.6720919609069824, "learning_rate": 2.534015991022584e-05, "loss": 0.0256, "step": 52740 }, { "epoch": 1.4798709496423061, "grad_norm": 0.05494006723165512, "learning_rate": 2.5335484172628233e-05, "loss": 0.0075, "step": 52750 }, { "epoch": 1.4801514938981624, "grad_norm": 0.36511027812957764, "learning_rate": 2.533080843503063e-05, "loss": 0.0363, "step": 52760 }, { "epoch": 1.4804320381540188, "grad_norm": 0.6832915544509888, "learning_rate": 2.532613269743302e-05, "loss": 0.0498, "step": 52770 }, { "epoch": 1.480712582409875, "grad_norm": 2.0574593544006348, "learning_rate": 2.5321456959835416e-05, "loss": 0.0305, "step": 52780 }, { "epoch": 1.4809931266657315, "grad_norm": 0.10125732421875, "learning_rate": 2.5316781222237806e-05, "loss": 0.0267, "step": 52790 }, { "epoch": 1.481273670921588, "grad_norm": 0.5786702632904053, "learning_rate": 2.5312105484640202e-05, "loss": 0.0312, "step": 52800 }, { "epoch": 1.4815542151774443, "grad_norm": 0.4574950635433197, "learning_rate": 2.53074297470426e-05, "loss": 0.0232, "step": 52810 }, { "epoch": 1.4818347594333006, "grad_norm": 0.3954833447933197, "learning_rate": 2.530275400944499e-05, "loss": 0.0139, "step": 52820 }, { "epoch": 1.482115303689157, "grad_norm": 0.021024638786911964, "learning_rate": 2.5298078271847385e-05, "loss": 0.0158, "step": 52830 }, { "epoch": 1.4823958479450132, "grad_norm": 0.42173266410827637, "learning_rate": 2.5293402534249778e-05, "loss": 0.0408, "step": 52840 }, { "epoch": 1.4826763922008697, "grad_norm": 0.032527387142181396, "learning_rate": 2.5288726796652175e-05, "loss": 0.0149, "step": 52850 }, { "epoch": 1.482956936456726, "grad_norm": 1.054371953010559, "learning_rate": 2.528405105905457e-05, "loss": 0.0258, "step": 52860 }, { "epoch": 1.4832374807125825, "grad_norm": 0.11242713779211044, "learning_rate": 2.527937532145696e-05, "loss": 0.012, "step": 52870 }, { "epoch": 1.4835180249684388, "grad_norm": 0.05455593764781952, "learning_rate": 2.5274699583859358e-05, "loss": 0.0464, "step": 52880 }, { "epoch": 1.4837985692242952, "grad_norm": 0.18455654382705688, "learning_rate": 2.5270023846261747e-05, "loss": 0.0126, "step": 52890 }, { "epoch": 1.4840791134801514, "grad_norm": 0.10377223789691925, "learning_rate": 2.5265348108664144e-05, "loss": 0.0191, "step": 52900 }, { "epoch": 1.4843596577360079, "grad_norm": 22.23583984375, "learning_rate": 2.5260672371066534e-05, "loss": 0.0337, "step": 52910 }, { "epoch": 1.4846402019918643, "grad_norm": 0.6678638458251953, "learning_rate": 2.525599663346893e-05, "loss": 0.0269, "step": 52920 }, { "epoch": 1.4849207462477205, "grad_norm": 0.7563673853874207, "learning_rate": 2.5251320895871327e-05, "loss": 0.0548, "step": 52930 }, { "epoch": 1.485201290503577, "grad_norm": 0.8583924770355225, "learning_rate": 2.524664515827372e-05, "loss": 0.0418, "step": 52940 }, { "epoch": 1.4854818347594332, "grad_norm": 0.05860777571797371, "learning_rate": 2.5241969420676116e-05, "loss": 0.0143, "step": 52950 }, { "epoch": 1.4857623790152896, "grad_norm": 0.19636820256710052, "learning_rate": 2.5237293683078506e-05, "loss": 0.0329, "step": 52960 }, { "epoch": 1.486042923271146, "grad_norm": 0.2408931702375412, "learning_rate": 2.5232617945480903e-05, "loss": 0.0067, "step": 52970 }, { "epoch": 1.4863234675270025, "grad_norm": 2.110475540161133, "learning_rate": 2.5227942207883293e-05, "loss": 0.0473, "step": 52980 }, { "epoch": 1.4866040117828587, "grad_norm": 0.10106148570775986, "learning_rate": 2.522326647028569e-05, "loss": 0.0261, "step": 52990 }, { "epoch": 1.4868845560387152, "grad_norm": 0.11080478131771088, "learning_rate": 2.5218590732688086e-05, "loss": 0.0137, "step": 53000 }, { "epoch": 1.4871651002945714, "grad_norm": 0.7663170099258423, "learning_rate": 2.5213914995090475e-05, "loss": 0.0334, "step": 53010 }, { "epoch": 1.4874456445504278, "grad_norm": 1.8078837394714355, "learning_rate": 2.5209239257492872e-05, "loss": 0.041, "step": 53020 }, { "epoch": 1.4877261888062843, "grad_norm": 0.21564903855323792, "learning_rate": 2.5204563519895265e-05, "loss": 0.0095, "step": 53030 }, { "epoch": 1.4880067330621405, "grad_norm": 0.2535030245780945, "learning_rate": 2.5199887782297658e-05, "loss": 0.0129, "step": 53040 }, { "epoch": 1.488287277317997, "grad_norm": 0.03315699100494385, "learning_rate": 2.519521204470005e-05, "loss": 0.0485, "step": 53050 }, { "epoch": 1.4885678215738531, "grad_norm": 0.017709162086248398, "learning_rate": 2.5190536307102448e-05, "loss": 0.0236, "step": 53060 }, { "epoch": 1.4888483658297096, "grad_norm": 0.21171759068965912, "learning_rate": 2.5185860569504844e-05, "loss": 0.0365, "step": 53070 }, { "epoch": 1.489128910085566, "grad_norm": 0.1195739135146141, "learning_rate": 2.5181184831907234e-05, "loss": 0.0222, "step": 53080 }, { "epoch": 1.4894094543414225, "grad_norm": 0.0323178730905056, "learning_rate": 2.517650909430963e-05, "loss": 0.0133, "step": 53090 }, { "epoch": 1.4896899985972787, "grad_norm": 0.11967800557613373, "learning_rate": 2.517183335671202e-05, "loss": 0.0343, "step": 53100 }, { "epoch": 1.4899705428531351, "grad_norm": 0.0854450985789299, "learning_rate": 2.5167157619114417e-05, "loss": 0.0207, "step": 53110 }, { "epoch": 1.4902510871089913, "grad_norm": 0.13571174442768097, "learning_rate": 2.5162481881516807e-05, "loss": 0.0072, "step": 53120 }, { "epoch": 1.4905316313648478, "grad_norm": 1.6203347444534302, "learning_rate": 2.5157806143919203e-05, "loss": 0.0387, "step": 53130 }, { "epoch": 1.4908121756207042, "grad_norm": 1.1193077564239502, "learning_rate": 2.51531304063216e-05, "loss": 0.0185, "step": 53140 }, { "epoch": 1.4910927198765604, "grad_norm": 1.9264146089553833, "learning_rate": 2.5148454668723993e-05, "loss": 0.0737, "step": 53150 }, { "epoch": 1.4913732641324169, "grad_norm": 0.7657701969146729, "learning_rate": 2.514377893112639e-05, "loss": 0.0193, "step": 53160 }, { "epoch": 1.4916538083882733, "grad_norm": 0.1972614973783493, "learning_rate": 2.513910319352878e-05, "loss": 0.0529, "step": 53170 }, { "epoch": 1.4919343526441295, "grad_norm": 0.3443509042263031, "learning_rate": 2.5134427455931176e-05, "loss": 0.0159, "step": 53180 }, { "epoch": 1.492214896899986, "grad_norm": 0.056211207062006, "learning_rate": 2.5129751718333566e-05, "loss": 0.0241, "step": 53190 }, { "epoch": 1.4924954411558424, "grad_norm": 0.016961069777607918, "learning_rate": 2.5125075980735962e-05, "loss": 0.023, "step": 53200 }, { "epoch": 1.4927759854116986, "grad_norm": 0.21555060148239136, "learning_rate": 2.512040024313836e-05, "loss": 0.0088, "step": 53210 }, { "epoch": 1.493056529667555, "grad_norm": 0.033123310655355453, "learning_rate": 2.511572450554075e-05, "loss": 0.0332, "step": 53220 }, { "epoch": 1.4933370739234113, "grad_norm": 0.1634395867586136, "learning_rate": 2.5111048767943145e-05, "loss": 0.0222, "step": 53230 }, { "epoch": 1.4936176181792677, "grad_norm": 0.8979839086532593, "learning_rate": 2.5106373030345538e-05, "loss": 0.015, "step": 53240 }, { "epoch": 1.4938981624351242, "grad_norm": 1.0753763914108276, "learning_rate": 2.5101697292747935e-05, "loss": 0.0115, "step": 53250 }, { "epoch": 1.4941787066909806, "grad_norm": 0.6670624017715454, "learning_rate": 2.5097021555150325e-05, "loss": 0.0212, "step": 53260 }, { "epoch": 1.4944592509468368, "grad_norm": 0.39014965295791626, "learning_rate": 2.509234581755272e-05, "loss": 0.017, "step": 53270 }, { "epoch": 1.4947397952026933, "grad_norm": 0.3516005277633667, "learning_rate": 2.5087670079955118e-05, "loss": 0.0392, "step": 53280 }, { "epoch": 1.4950203394585495, "grad_norm": 0.2879440188407898, "learning_rate": 2.5082994342357507e-05, "loss": 0.0628, "step": 53290 }, { "epoch": 1.495300883714406, "grad_norm": 0.0335659384727478, "learning_rate": 2.5078318604759904e-05, "loss": 0.017, "step": 53300 }, { "epoch": 1.4955814279702624, "grad_norm": 0.08783608675003052, "learning_rate": 2.5073642867162294e-05, "loss": 0.0347, "step": 53310 }, { "epoch": 1.4958619722261186, "grad_norm": 0.7885558009147644, "learning_rate": 2.506896712956469e-05, "loss": 0.0476, "step": 53320 }, { "epoch": 1.496142516481975, "grad_norm": 0.3226720094680786, "learning_rate": 2.5064291391967083e-05, "loss": 0.014, "step": 53330 }, { "epoch": 1.4964230607378313, "grad_norm": 0.5683501362800598, "learning_rate": 2.5059615654369477e-05, "loss": 0.0316, "step": 53340 }, { "epoch": 1.4967036049936877, "grad_norm": 0.0654006078839302, "learning_rate": 2.5054939916771873e-05, "loss": 0.021, "step": 53350 }, { "epoch": 1.4969841492495441, "grad_norm": 0.15159739553928375, "learning_rate": 2.5050264179174266e-05, "loss": 0.0109, "step": 53360 }, { "epoch": 1.4972646935054006, "grad_norm": 0.1813742071390152, "learning_rate": 2.5045588441576663e-05, "loss": 0.0579, "step": 53370 }, { "epoch": 1.4975452377612568, "grad_norm": 0.24065521359443665, "learning_rate": 2.5040912703979053e-05, "loss": 0.0215, "step": 53380 }, { "epoch": 1.4978257820171132, "grad_norm": 1.0075875520706177, "learning_rate": 2.503623696638145e-05, "loss": 0.0182, "step": 53390 }, { "epoch": 1.4981063262729695, "grad_norm": 0.9444299340248108, "learning_rate": 2.503156122878384e-05, "loss": 0.0344, "step": 53400 }, { "epoch": 1.498386870528826, "grad_norm": 0.28303417563438416, "learning_rate": 2.5026885491186235e-05, "loss": 0.0261, "step": 53410 }, { "epoch": 1.4986674147846824, "grad_norm": 0.9438943266868591, "learning_rate": 2.5022209753588632e-05, "loss": 0.0415, "step": 53420 }, { "epoch": 1.4989479590405386, "grad_norm": 0.08858556300401688, "learning_rate": 2.5017534015991022e-05, "loss": 0.0331, "step": 53430 }, { "epoch": 1.499228503296395, "grad_norm": 0.3486209213733673, "learning_rate": 2.501285827839342e-05, "loss": 0.0158, "step": 53440 }, { "epoch": 1.4995090475522515, "grad_norm": 0.2899521589279175, "learning_rate": 2.500818254079581e-05, "loss": 0.0306, "step": 53450 }, { "epoch": 1.4997895918081077, "grad_norm": 1.4202377796173096, "learning_rate": 2.5003506803198208e-05, "loss": 0.0477, "step": 53460 }, { "epoch": 1.500070136063964, "grad_norm": 10.61298942565918, "learning_rate": 2.49988310656006e-05, "loss": 0.0274, "step": 53470 }, { "epoch": 1.5003506803198206, "grad_norm": 0.04357767477631569, "learning_rate": 2.4994155328002994e-05, "loss": 0.0077, "step": 53480 }, { "epoch": 1.5006312245756768, "grad_norm": 0.07231608033180237, "learning_rate": 2.4989479590405387e-05, "loss": 0.0147, "step": 53490 }, { "epoch": 1.5009117688315332, "grad_norm": 0.07219347357749939, "learning_rate": 2.498480385280778e-05, "loss": 0.0169, "step": 53500 }, { "epoch": 1.5011923130873894, "grad_norm": 0.027930472046136856, "learning_rate": 2.4980128115210174e-05, "loss": 0.0438, "step": 53510 }, { "epoch": 1.5014728573432459, "grad_norm": 0.023994507268071175, "learning_rate": 2.497545237761257e-05, "loss": 0.0292, "step": 53520 }, { "epoch": 1.5017534015991023, "grad_norm": 0.11860918253660202, "learning_rate": 2.4970776640014964e-05, "loss": 0.0268, "step": 53530 }, { "epoch": 1.5020339458549588, "grad_norm": 0.06183318793773651, "learning_rate": 2.4966100902417357e-05, "loss": 0.01, "step": 53540 }, { "epoch": 1.502314490110815, "grad_norm": 0.03869812935590744, "learning_rate": 2.4961425164819753e-05, "loss": 0.0414, "step": 53550 }, { "epoch": 1.5025950343666712, "grad_norm": 0.050050731748342514, "learning_rate": 2.4956749427222146e-05, "loss": 0.0057, "step": 53560 }, { "epoch": 1.5028755786225276, "grad_norm": 0.6935878992080688, "learning_rate": 2.495207368962454e-05, "loss": 0.026, "step": 53570 }, { "epoch": 1.503156122878384, "grad_norm": 0.0455874465405941, "learning_rate": 2.4947397952026933e-05, "loss": 0.0167, "step": 53580 }, { "epoch": 1.5034366671342405, "grad_norm": 0.5611703395843506, "learning_rate": 2.494272221442933e-05, "loss": 0.0322, "step": 53590 }, { "epoch": 1.503717211390097, "grad_norm": 0.26940497756004333, "learning_rate": 2.4938046476831722e-05, "loss": 0.0578, "step": 53600 }, { "epoch": 1.5039977556459532, "grad_norm": 0.5052971839904785, "learning_rate": 2.4933370739234116e-05, "loss": 0.0261, "step": 53610 }, { "epoch": 1.5042782999018094, "grad_norm": 0.0790216252207756, "learning_rate": 2.492869500163651e-05, "loss": 0.022, "step": 53620 }, { "epoch": 1.5045588441576658, "grad_norm": 0.08573313802480698, "learning_rate": 2.4924019264038902e-05, "loss": 0.0287, "step": 53630 }, { "epoch": 1.5048393884135223, "grad_norm": 0.20702695846557617, "learning_rate": 2.4919343526441295e-05, "loss": 0.0413, "step": 53640 }, { "epoch": 1.5051199326693787, "grad_norm": 0.17551201581954956, "learning_rate": 2.491466778884369e-05, "loss": 0.009, "step": 53650 }, { "epoch": 1.505400476925235, "grad_norm": 0.2010928839445114, "learning_rate": 2.4909992051246088e-05, "loss": 0.0243, "step": 53660 }, { "epoch": 1.5056810211810914, "grad_norm": 0.09512556344270706, "learning_rate": 2.490531631364848e-05, "loss": 0.0389, "step": 53670 }, { "epoch": 1.5059615654369476, "grad_norm": 0.24546150863170624, "learning_rate": 2.4900640576050874e-05, "loss": 0.0151, "step": 53680 }, { "epoch": 1.506242109692804, "grad_norm": 0.42681318521499634, "learning_rate": 2.4895964838453268e-05, "loss": 0.0117, "step": 53690 }, { "epoch": 1.5065226539486605, "grad_norm": 0.3720909059047699, "learning_rate": 2.489128910085566e-05, "loss": 0.0266, "step": 53700 }, { "epoch": 1.506803198204517, "grad_norm": 0.7030641436576843, "learning_rate": 2.4886613363258054e-05, "loss": 0.0545, "step": 53710 }, { "epoch": 1.5070837424603731, "grad_norm": 0.4701969623565674, "learning_rate": 2.4881937625660447e-05, "loss": 0.0341, "step": 53720 }, { "epoch": 1.5073642867162294, "grad_norm": 1.2616015672683716, "learning_rate": 2.4877261888062844e-05, "loss": 0.0316, "step": 53730 }, { "epoch": 1.5076448309720858, "grad_norm": 0.11067212373018265, "learning_rate": 2.4872586150465237e-05, "loss": 0.0166, "step": 53740 }, { "epoch": 1.5079253752279422, "grad_norm": 0.6211898326873779, "learning_rate": 2.4867910412867633e-05, "loss": 0.0193, "step": 53750 }, { "epoch": 1.5082059194837987, "grad_norm": 0.4678134024143219, "learning_rate": 2.4863234675270026e-05, "loss": 0.0386, "step": 53760 }, { "epoch": 1.508486463739655, "grad_norm": 0.05322973057627678, "learning_rate": 2.485855893767242e-05, "loss": 0.0323, "step": 53770 }, { "epoch": 1.5087670079955113, "grad_norm": 0.07370851933956146, "learning_rate": 2.4853883200074813e-05, "loss": 0.031, "step": 53780 }, { "epoch": 1.5090475522513676, "grad_norm": 0.63908851146698, "learning_rate": 2.4849207462477206e-05, "loss": 0.0305, "step": 53790 }, { "epoch": 1.509328096507224, "grad_norm": 0.15874114632606506, "learning_rate": 2.4844531724879602e-05, "loss": 0.0354, "step": 53800 }, { "epoch": 1.5096086407630804, "grad_norm": 0.09767623245716095, "learning_rate": 2.4839855987281996e-05, "loss": 0.0129, "step": 53810 }, { "epoch": 1.5098891850189369, "grad_norm": 0.08958626538515091, "learning_rate": 2.483518024968439e-05, "loss": 0.0169, "step": 53820 }, { "epoch": 1.510169729274793, "grad_norm": 0.03780937194824219, "learning_rate": 2.4830504512086782e-05, "loss": 0.0173, "step": 53830 }, { "epoch": 1.5104502735306493, "grad_norm": 0.09438282996416092, "learning_rate": 2.4825828774489175e-05, "loss": 0.0067, "step": 53840 }, { "epoch": 1.5107308177865058, "grad_norm": 0.058107584714889526, "learning_rate": 2.482115303689157e-05, "loss": 0.0185, "step": 53850 }, { "epoch": 1.5110113620423622, "grad_norm": 0.016011342406272888, "learning_rate": 2.4816477299293965e-05, "loss": 0.0332, "step": 53860 }, { "epoch": 1.5112919062982186, "grad_norm": 0.048184070736169815, "learning_rate": 2.481180156169636e-05, "loss": 0.0401, "step": 53870 }, { "epoch": 1.5115724505540749, "grad_norm": 0.1593952476978302, "learning_rate": 2.4807125824098754e-05, "loss": 0.0222, "step": 53880 }, { "epoch": 1.5118529948099313, "grad_norm": 0.5167945027351379, "learning_rate": 2.4802450086501148e-05, "loss": 0.0101, "step": 53890 }, { "epoch": 1.5121335390657875, "grad_norm": 0.022011302411556244, "learning_rate": 2.479777434890354e-05, "loss": 0.0198, "step": 53900 }, { "epoch": 1.512414083321644, "grad_norm": 0.09481081366539001, "learning_rate": 2.4793098611305934e-05, "loss": 0.0899, "step": 53910 }, { "epoch": 1.5126946275775004, "grad_norm": 0.23958614468574524, "learning_rate": 2.4788422873708327e-05, "loss": 0.0181, "step": 53920 }, { "epoch": 1.5129751718333568, "grad_norm": 0.4760279655456543, "learning_rate": 2.478374713611072e-05, "loss": 0.0329, "step": 53930 }, { "epoch": 1.513255716089213, "grad_norm": 0.43492937088012695, "learning_rate": 2.4779071398513117e-05, "loss": 0.0313, "step": 53940 }, { "epoch": 1.5135362603450693, "grad_norm": 1.361647367477417, "learning_rate": 2.477439566091551e-05, "loss": 0.034, "step": 53950 }, { "epoch": 1.5138168046009257, "grad_norm": 0.03885151818394661, "learning_rate": 2.4769719923317906e-05, "loss": 0.0109, "step": 53960 }, { "epoch": 1.5140973488567822, "grad_norm": 0.11690601706504822, "learning_rate": 2.47650441857203e-05, "loss": 0.0189, "step": 53970 }, { "epoch": 1.5143778931126386, "grad_norm": 0.3815596103668213, "learning_rate": 2.4760368448122693e-05, "loss": 0.028, "step": 53980 }, { "epoch": 1.514658437368495, "grad_norm": 0.18079182505607605, "learning_rate": 2.4755692710525086e-05, "loss": 0.018, "step": 53990 }, { "epoch": 1.5149389816243513, "grad_norm": 0.03714657574892044, "learning_rate": 2.475101697292748e-05, "loss": 0.0228, "step": 54000 }, { "epoch": 1.5152195258802075, "grad_norm": 0.9681529402732849, "learning_rate": 2.4746341235329876e-05, "loss": 0.0316, "step": 54010 }, { "epoch": 1.515500070136064, "grad_norm": 0.10708887130022049, "learning_rate": 2.474166549773227e-05, "loss": 0.0054, "step": 54020 }, { "epoch": 1.5157806143919204, "grad_norm": 0.034368738532066345, "learning_rate": 2.4736989760134662e-05, "loss": 0.0189, "step": 54030 }, { "epoch": 1.5160611586477768, "grad_norm": 0.0230941791087389, "learning_rate": 2.4732314022537055e-05, "loss": 0.0138, "step": 54040 }, { "epoch": 1.516341702903633, "grad_norm": 0.1283068060874939, "learning_rate": 2.472763828493945e-05, "loss": 0.0223, "step": 54050 }, { "epoch": 1.5166222471594895, "grad_norm": 1.2030915021896362, "learning_rate": 2.4722962547341845e-05, "loss": 0.0202, "step": 54060 }, { "epoch": 1.5169027914153457, "grad_norm": 0.5247664451599121, "learning_rate": 2.4718286809744238e-05, "loss": 0.0177, "step": 54070 }, { "epoch": 1.5171833356712021, "grad_norm": 0.13040219247341156, "learning_rate": 2.4713611072146634e-05, "loss": 0.0134, "step": 54080 }, { "epoch": 1.5174638799270586, "grad_norm": 0.09534723311662674, "learning_rate": 2.4708935334549028e-05, "loss": 0.0533, "step": 54090 }, { "epoch": 1.517744424182915, "grad_norm": 0.6060191988945007, "learning_rate": 2.470425959695142e-05, "loss": 0.0146, "step": 54100 }, { "epoch": 1.5180249684387712, "grad_norm": 0.7347156405448914, "learning_rate": 2.4699583859353814e-05, "loss": 0.0464, "step": 54110 }, { "epoch": 1.5183055126946274, "grad_norm": 0.13489890098571777, "learning_rate": 2.4694908121756207e-05, "loss": 0.0221, "step": 54120 }, { "epoch": 1.5185860569504839, "grad_norm": 0.024314669892191887, "learning_rate": 2.46902323841586e-05, "loss": 0.0361, "step": 54130 }, { "epoch": 1.5188666012063403, "grad_norm": 0.28817200660705566, "learning_rate": 2.4685556646560993e-05, "loss": 0.017, "step": 54140 }, { "epoch": 1.5191471454621968, "grad_norm": 0.7611408233642578, "learning_rate": 2.468088090896339e-05, "loss": 0.0283, "step": 54150 }, { "epoch": 1.519427689718053, "grad_norm": 0.14821834862232208, "learning_rate": 2.4676205171365786e-05, "loss": 0.0164, "step": 54160 }, { "epoch": 1.5197082339739094, "grad_norm": 2.4593594074249268, "learning_rate": 2.467152943376818e-05, "loss": 0.0407, "step": 54170 }, { "epoch": 1.5199887782297656, "grad_norm": 0.11282052099704742, "learning_rate": 2.4666853696170573e-05, "loss": 0.0372, "step": 54180 }, { "epoch": 1.520269322485622, "grad_norm": 0.10650046914815903, "learning_rate": 2.4662177958572966e-05, "loss": 0.0211, "step": 54190 }, { "epoch": 1.5205498667414785, "grad_norm": 0.31297287344932556, "learning_rate": 2.465750222097536e-05, "loss": 0.0153, "step": 54200 }, { "epoch": 1.520830410997335, "grad_norm": 0.5259931087493896, "learning_rate": 2.4652826483377752e-05, "loss": 0.0353, "step": 54210 }, { "epoch": 1.5211109552531912, "grad_norm": 0.32135045528411865, "learning_rate": 2.464815074578015e-05, "loss": 0.0403, "step": 54220 }, { "epoch": 1.5213914995090474, "grad_norm": 4.470187187194824, "learning_rate": 2.4643475008182542e-05, "loss": 0.019, "step": 54230 }, { "epoch": 1.5216720437649038, "grad_norm": 0.19474004209041595, "learning_rate": 2.4638799270584935e-05, "loss": 0.0199, "step": 54240 }, { "epoch": 1.5219525880207603, "grad_norm": 0.09994249045848846, "learning_rate": 2.4634123532987328e-05, "loss": 0.0141, "step": 54250 }, { "epoch": 1.5222331322766167, "grad_norm": 0.5639594197273254, "learning_rate": 2.4629447795389725e-05, "loss": 0.0257, "step": 54260 }, { "epoch": 1.5225136765324732, "grad_norm": 0.02298949472606182, "learning_rate": 2.4624772057792118e-05, "loss": 0.0051, "step": 54270 }, { "epoch": 1.5227942207883294, "grad_norm": 1.5230698585510254, "learning_rate": 2.462009632019451e-05, "loss": 0.0155, "step": 54280 }, { "epoch": 1.5230747650441856, "grad_norm": 0.24948284029960632, "learning_rate": 2.4615420582596908e-05, "loss": 0.0184, "step": 54290 }, { "epoch": 1.523355309300042, "grad_norm": 0.5280159115791321, "learning_rate": 2.46107448449993e-05, "loss": 0.0091, "step": 54300 }, { "epoch": 1.5236358535558985, "grad_norm": 0.17545217275619507, "learning_rate": 2.4606069107401694e-05, "loss": 0.0487, "step": 54310 }, { "epoch": 1.523916397811755, "grad_norm": 0.05112035572528839, "learning_rate": 2.4601393369804087e-05, "loss": 0.0388, "step": 54320 }, { "epoch": 1.5241969420676111, "grad_norm": 0.45723992586135864, "learning_rate": 2.459671763220648e-05, "loss": 0.0254, "step": 54330 }, { "epoch": 1.5244774863234676, "grad_norm": 0.027881767600774765, "learning_rate": 2.4592041894608873e-05, "loss": 0.0037, "step": 54340 }, { "epoch": 1.5247580305793238, "grad_norm": 1.1520311832427979, "learning_rate": 2.458736615701127e-05, "loss": 0.0159, "step": 54350 }, { "epoch": 1.5250385748351802, "grad_norm": 6.7258620262146, "learning_rate": 2.4582690419413663e-05, "loss": 0.0161, "step": 54360 }, { "epoch": 1.5253191190910367, "grad_norm": 0.06856262683868408, "learning_rate": 2.457801468181606e-05, "loss": 0.053, "step": 54370 }, { "epoch": 1.5255996633468931, "grad_norm": 2.8543167114257812, "learning_rate": 2.4573338944218453e-05, "loss": 0.0326, "step": 54380 }, { "epoch": 1.5258802076027493, "grad_norm": 0.08595672249794006, "learning_rate": 2.4568663206620846e-05, "loss": 0.0094, "step": 54390 }, { "epoch": 1.5261607518586056, "grad_norm": 0.09063933044672012, "learning_rate": 2.456398746902324e-05, "loss": 0.0123, "step": 54400 }, { "epoch": 1.526441296114462, "grad_norm": 0.21625913679599762, "learning_rate": 2.4559311731425632e-05, "loss": 0.0382, "step": 54410 }, { "epoch": 1.5267218403703184, "grad_norm": 0.03093143180012703, "learning_rate": 2.4554635993828025e-05, "loss": 0.0434, "step": 54420 }, { "epoch": 1.5270023846261749, "grad_norm": 0.16014087200164795, "learning_rate": 2.4549960256230422e-05, "loss": 0.0101, "step": 54430 }, { "epoch": 1.527282928882031, "grad_norm": 0.07165955752134323, "learning_rate": 2.4545284518632815e-05, "loss": 0.0237, "step": 54440 }, { "epoch": 1.5275634731378875, "grad_norm": 0.08783011883497238, "learning_rate": 2.454060878103521e-05, "loss": 0.0087, "step": 54450 }, { "epoch": 1.5278440173937438, "grad_norm": 0.840369701385498, "learning_rate": 2.4535933043437605e-05, "loss": 0.0523, "step": 54460 }, { "epoch": 1.5281245616496002, "grad_norm": 0.0812089666724205, "learning_rate": 2.4531257305839998e-05, "loss": 0.0679, "step": 54470 }, { "epoch": 1.5284051059054566, "grad_norm": 0.3938766121864319, "learning_rate": 2.452658156824239e-05, "loss": 0.0284, "step": 54480 }, { "epoch": 1.528685650161313, "grad_norm": 0.12031551450490952, "learning_rate": 2.4521905830644784e-05, "loss": 0.0292, "step": 54490 }, { "epoch": 1.5289661944171693, "grad_norm": 0.22805294394493103, "learning_rate": 2.451723009304718e-05, "loss": 0.0421, "step": 54500 }, { "epoch": 1.5292467386730255, "grad_norm": 0.3135005533695221, "learning_rate": 2.4512554355449574e-05, "loss": 0.0137, "step": 54510 }, { "epoch": 1.529527282928882, "grad_norm": 0.051140956580638885, "learning_rate": 2.4507878617851967e-05, "loss": 0.0276, "step": 54520 }, { "epoch": 1.5298078271847384, "grad_norm": 0.057549551129341125, "learning_rate": 2.450320288025436e-05, "loss": 0.0376, "step": 54530 }, { "epoch": 1.5300883714405948, "grad_norm": 0.9790767431259155, "learning_rate": 2.4498527142656753e-05, "loss": 0.0308, "step": 54540 }, { "epoch": 1.530368915696451, "grad_norm": 0.20665983855724335, "learning_rate": 2.4493851405059147e-05, "loss": 0.0291, "step": 54550 }, { "epoch": 1.5306494599523075, "grad_norm": 2.4634265899658203, "learning_rate": 2.4489175667461543e-05, "loss": 0.061, "step": 54560 }, { "epoch": 1.5309300042081637, "grad_norm": 0.14940884709358215, "learning_rate": 2.448449992986394e-05, "loss": 0.0164, "step": 54570 }, { "epoch": 1.5312105484640202, "grad_norm": 0.09822477400302887, "learning_rate": 2.4479824192266333e-05, "loss": 0.0091, "step": 54580 }, { "epoch": 1.5314910927198766, "grad_norm": 0.056536056101322174, "learning_rate": 2.4475148454668726e-05, "loss": 0.0295, "step": 54590 }, { "epoch": 1.531771636975733, "grad_norm": 0.7927533984184265, "learning_rate": 2.447047271707112e-05, "loss": 0.0325, "step": 54600 }, { "epoch": 1.5320521812315893, "grad_norm": 0.0830639898777008, "learning_rate": 2.4465796979473512e-05, "loss": 0.0171, "step": 54610 }, { "epoch": 1.5323327254874455, "grad_norm": 0.2782026529312134, "learning_rate": 2.4461121241875906e-05, "loss": 0.013, "step": 54620 }, { "epoch": 1.532613269743302, "grad_norm": 0.11085277795791626, "learning_rate": 2.44564455042783e-05, "loss": 0.0489, "step": 54630 }, { "epoch": 1.5328938139991584, "grad_norm": 2.2930872440338135, "learning_rate": 2.4451769766680695e-05, "loss": 0.0167, "step": 54640 }, { "epoch": 1.5331743582550148, "grad_norm": 0.9805535078048706, "learning_rate": 2.444709402908309e-05, "loss": 0.0232, "step": 54650 }, { "epoch": 1.5334549025108712, "grad_norm": 0.15923090279102325, "learning_rate": 2.4442418291485485e-05, "loss": 0.0128, "step": 54660 }, { "epoch": 1.5337354467667275, "grad_norm": 0.44483909010887146, "learning_rate": 2.4437742553887878e-05, "loss": 0.0272, "step": 54670 }, { "epoch": 1.5340159910225837, "grad_norm": 0.018385466188192368, "learning_rate": 2.443306681629027e-05, "loss": 0.0212, "step": 54680 }, { "epoch": 1.5342965352784401, "grad_norm": 0.4278740882873535, "learning_rate": 2.4428391078692664e-05, "loss": 0.0306, "step": 54690 }, { "epoch": 1.5345770795342966, "grad_norm": 0.034209754317998886, "learning_rate": 2.4423715341095058e-05, "loss": 0.0074, "step": 54700 }, { "epoch": 1.534857623790153, "grad_norm": 0.034607719630002975, "learning_rate": 2.4419039603497454e-05, "loss": 0.0107, "step": 54710 }, { "epoch": 1.5351381680460092, "grad_norm": 0.5593804717063904, "learning_rate": 2.4414363865899847e-05, "loss": 0.0198, "step": 54720 }, { "epoch": 1.5354187123018657, "grad_norm": 1.4311578273773193, "learning_rate": 2.440968812830224e-05, "loss": 0.0676, "step": 54730 }, { "epoch": 1.5356992565577219, "grad_norm": 0.4330081045627594, "learning_rate": 2.4405012390704634e-05, "loss": 0.0167, "step": 54740 }, { "epoch": 1.5359798008135783, "grad_norm": 0.2193072885274887, "learning_rate": 2.4400336653107027e-05, "loss": 0.03, "step": 54750 }, { "epoch": 1.5362603450694348, "grad_norm": 2.283728837966919, "learning_rate": 2.4395660915509423e-05, "loss": 0.0148, "step": 54760 }, { "epoch": 1.5365408893252912, "grad_norm": 0.28457221388816833, "learning_rate": 2.439098517791182e-05, "loss": 0.0134, "step": 54770 }, { "epoch": 1.5368214335811474, "grad_norm": 0.05331016331911087, "learning_rate": 2.4386309440314213e-05, "loss": 0.0142, "step": 54780 }, { "epoch": 1.5371019778370036, "grad_norm": 0.03639537841081619, "learning_rate": 2.4381633702716606e-05, "loss": 0.0163, "step": 54790 }, { "epoch": 1.53738252209286, "grad_norm": 0.026515265926718712, "learning_rate": 2.4376957965119e-05, "loss": 0.0121, "step": 54800 }, { "epoch": 1.5376630663487165, "grad_norm": 0.02123432233929634, "learning_rate": 2.4372282227521392e-05, "loss": 0.0121, "step": 54810 }, { "epoch": 1.537943610604573, "grad_norm": 1.0033363103866577, "learning_rate": 2.4367606489923786e-05, "loss": 0.0238, "step": 54820 }, { "epoch": 1.5382241548604292, "grad_norm": 0.02389727532863617, "learning_rate": 2.436293075232618e-05, "loss": 0.0065, "step": 54830 }, { "epoch": 1.5385046991162856, "grad_norm": 0.03028665855526924, "learning_rate": 2.4358255014728575e-05, "loss": 0.0078, "step": 54840 }, { "epoch": 1.5387852433721418, "grad_norm": 0.019094478338956833, "learning_rate": 2.435357927713097e-05, "loss": 0.0234, "step": 54850 }, { "epoch": 1.5390657876279983, "grad_norm": 0.01968345232307911, "learning_rate": 2.434890353953336e-05, "loss": 0.009, "step": 54860 }, { "epoch": 1.5393463318838547, "grad_norm": 0.020473064854741096, "learning_rate": 2.4344227801935758e-05, "loss": 0.0345, "step": 54870 }, { "epoch": 1.5396268761397112, "grad_norm": 0.04178757593035698, "learning_rate": 2.433955206433815e-05, "loss": 0.0242, "step": 54880 }, { "epoch": 1.5399074203955674, "grad_norm": 0.01433651428669691, "learning_rate": 2.4334876326740544e-05, "loss": 0.0148, "step": 54890 }, { "epoch": 1.5401879646514236, "grad_norm": 0.23617039620876312, "learning_rate": 2.4330200589142938e-05, "loss": 0.0088, "step": 54900 }, { "epoch": 1.54046850890728, "grad_norm": 0.023461876437067986, "learning_rate": 2.4325524851545334e-05, "loss": 0.0156, "step": 54910 }, { "epoch": 1.5407490531631365, "grad_norm": 3.718080520629883, "learning_rate": 2.4320849113947727e-05, "loss": 0.0502, "step": 54920 }, { "epoch": 1.541029597418993, "grad_norm": 0.16483210027217865, "learning_rate": 2.431617337635012e-05, "loss": 0.0085, "step": 54930 }, { "epoch": 1.5413101416748494, "grad_norm": 0.04484263435006142, "learning_rate": 2.4311497638752514e-05, "loss": 0.0155, "step": 54940 }, { "epoch": 1.5415906859307056, "grad_norm": 0.022965610027313232, "learning_rate": 2.4306821901154907e-05, "loss": 0.0079, "step": 54950 }, { "epoch": 1.5418712301865618, "grad_norm": 0.03946353867650032, "learning_rate": 2.4302146163557303e-05, "loss": 0.0092, "step": 54960 }, { "epoch": 1.5421517744424182, "grad_norm": 0.0199450496584177, "learning_rate": 2.4297470425959696e-05, "loss": 0.0244, "step": 54970 }, { "epoch": 1.5424323186982747, "grad_norm": 0.11318476498126984, "learning_rate": 2.4292794688362093e-05, "loss": 0.0126, "step": 54980 }, { "epoch": 1.5427128629541311, "grad_norm": 0.15717080235481262, "learning_rate": 2.4288118950764486e-05, "loss": 0.0187, "step": 54990 }, { "epoch": 1.5429934072099873, "grad_norm": 0.06059030443429947, "learning_rate": 2.428344321316688e-05, "loss": 0.0481, "step": 55000 }, { "epoch": 1.5432739514658438, "grad_norm": 2.8948559761047363, "learning_rate": 2.4278767475569272e-05, "loss": 0.0587, "step": 55010 }, { "epoch": 1.5435544957217, "grad_norm": 0.07833924889564514, "learning_rate": 2.4274091737971666e-05, "loss": 0.0086, "step": 55020 }, { "epoch": 1.5438350399775564, "grad_norm": 0.2999820113182068, "learning_rate": 2.426941600037406e-05, "loss": 0.0198, "step": 55030 }, { "epoch": 1.5441155842334129, "grad_norm": 1.4200478792190552, "learning_rate": 2.4264740262776452e-05, "loss": 0.0377, "step": 55040 }, { "epoch": 1.5443961284892693, "grad_norm": 0.04010448232293129, "learning_rate": 2.426006452517885e-05, "loss": 0.0299, "step": 55050 }, { "epoch": 1.5446766727451255, "grad_norm": 2.8021225929260254, "learning_rate": 2.425538878758124e-05, "loss": 0.0205, "step": 55060 }, { "epoch": 1.5449572170009818, "grad_norm": 0.02665749005973339, "learning_rate": 2.4250713049983638e-05, "loss": 0.0348, "step": 55070 }, { "epoch": 1.5452377612568382, "grad_norm": 0.01960575208067894, "learning_rate": 2.424603731238603e-05, "loss": 0.02, "step": 55080 }, { "epoch": 1.5455183055126946, "grad_norm": 1.0740185976028442, "learning_rate": 2.4241361574788424e-05, "loss": 0.0243, "step": 55090 }, { "epoch": 1.545798849768551, "grad_norm": 0.31925711035728455, "learning_rate": 2.4236685837190818e-05, "loss": 0.0221, "step": 55100 }, { "epoch": 1.5460793940244073, "grad_norm": 0.32859835028648376, "learning_rate": 2.423201009959321e-05, "loss": 0.0275, "step": 55110 }, { "epoch": 1.5463599382802637, "grad_norm": 1.0671682357788086, "learning_rate": 2.4227334361995607e-05, "loss": 0.0178, "step": 55120 }, { "epoch": 1.54664048253612, "grad_norm": 0.03268013522028923, "learning_rate": 2.4222658624398e-05, "loss": 0.0324, "step": 55130 }, { "epoch": 1.5469210267919764, "grad_norm": 0.14118607342243195, "learning_rate": 2.4217982886800394e-05, "loss": 0.0166, "step": 55140 }, { "epoch": 1.5472015710478328, "grad_norm": 0.21020211279392242, "learning_rate": 2.4213307149202787e-05, "loss": 0.0154, "step": 55150 }, { "epoch": 1.5474821153036893, "grad_norm": 0.5819916129112244, "learning_rate": 2.420863141160518e-05, "loss": 0.0361, "step": 55160 }, { "epoch": 1.5477626595595455, "grad_norm": 0.5276679992675781, "learning_rate": 2.4203955674007576e-05, "loss": 0.0343, "step": 55170 }, { "epoch": 1.5480432038154017, "grad_norm": 0.1824072301387787, "learning_rate": 2.419927993640997e-05, "loss": 0.0138, "step": 55180 }, { "epoch": 1.5483237480712582, "grad_norm": 0.24934996664524078, "learning_rate": 2.4194604198812366e-05, "loss": 0.0164, "step": 55190 }, { "epoch": 1.5486042923271146, "grad_norm": 0.0560336597263813, "learning_rate": 2.418992846121476e-05, "loss": 0.0067, "step": 55200 }, { "epoch": 1.548884836582971, "grad_norm": 0.6291278600692749, "learning_rate": 2.4185252723617152e-05, "loss": 0.022, "step": 55210 }, { "epoch": 1.5491653808388275, "grad_norm": 1.9277340173721313, "learning_rate": 2.4180576986019546e-05, "loss": 0.0549, "step": 55220 }, { "epoch": 1.5494459250946837, "grad_norm": 0.05026528611779213, "learning_rate": 2.417590124842194e-05, "loss": 0.0534, "step": 55230 }, { "epoch": 1.54972646935054, "grad_norm": 0.07938521355390549, "learning_rate": 2.4171225510824332e-05, "loss": 0.0331, "step": 55240 }, { "epoch": 1.5500070136063964, "grad_norm": 0.1004614531993866, "learning_rate": 2.4166549773226725e-05, "loss": 0.0229, "step": 55250 }, { "epoch": 1.5502875578622528, "grad_norm": 0.2563457489013672, "learning_rate": 2.416187403562912e-05, "loss": 0.0097, "step": 55260 }, { "epoch": 1.5505681021181092, "grad_norm": 0.43620598316192627, "learning_rate": 2.4157198298031515e-05, "loss": 0.0199, "step": 55270 }, { "epoch": 1.5508486463739655, "grad_norm": 2.5525765419006348, "learning_rate": 2.415252256043391e-05, "loss": 0.0201, "step": 55280 }, { "epoch": 1.551129190629822, "grad_norm": 0.017425142228603363, "learning_rate": 2.4147846822836305e-05, "loss": 0.0201, "step": 55290 }, { "epoch": 1.5514097348856781, "grad_norm": 0.1361369490623474, "learning_rate": 2.4143171085238698e-05, "loss": 0.0043, "step": 55300 }, { "epoch": 1.5516902791415346, "grad_norm": 0.013670435175299644, "learning_rate": 2.413849534764109e-05, "loss": 0.0312, "step": 55310 }, { "epoch": 1.551970823397391, "grad_norm": 0.3365156352519989, "learning_rate": 2.4133819610043484e-05, "loss": 0.0205, "step": 55320 }, { "epoch": 1.5522513676532474, "grad_norm": 1.4244581460952759, "learning_rate": 2.412914387244588e-05, "loss": 0.0582, "step": 55330 }, { "epoch": 1.5525319119091037, "grad_norm": 0.39772829413414, "learning_rate": 2.4124468134848274e-05, "loss": 0.0369, "step": 55340 }, { "epoch": 1.5528124561649599, "grad_norm": 9.196061134338379, "learning_rate": 2.4119792397250667e-05, "loss": 0.0521, "step": 55350 }, { "epoch": 1.5530930004208163, "grad_norm": 0.14438967406749725, "learning_rate": 2.411511665965306e-05, "loss": 0.0092, "step": 55360 }, { "epoch": 1.5533735446766728, "grad_norm": 0.048765167593955994, "learning_rate": 2.4110440922055457e-05, "loss": 0.0182, "step": 55370 }, { "epoch": 1.5536540889325292, "grad_norm": 0.02166498824954033, "learning_rate": 2.410576518445785e-05, "loss": 0.0064, "step": 55380 }, { "epoch": 1.5539346331883854, "grad_norm": 0.06038397178053856, "learning_rate": 2.4101089446860243e-05, "loss": 0.0209, "step": 55390 }, { "epoch": 1.5542151774442419, "grad_norm": 1.2022536993026733, "learning_rate": 2.409641370926264e-05, "loss": 0.015, "step": 55400 }, { "epoch": 1.554495721700098, "grad_norm": 0.28802114725112915, "learning_rate": 2.4091737971665033e-05, "loss": 0.0352, "step": 55410 }, { "epoch": 1.5547762659559545, "grad_norm": 1.1421611309051514, "learning_rate": 2.4087062234067426e-05, "loss": 0.0229, "step": 55420 }, { "epoch": 1.555056810211811, "grad_norm": 2.148252487182617, "learning_rate": 2.408238649646982e-05, "loss": 0.0262, "step": 55430 }, { "epoch": 1.5553373544676674, "grad_norm": 1.1838256120681763, "learning_rate": 2.4077710758872212e-05, "loss": 0.0333, "step": 55440 }, { "epoch": 1.5556178987235236, "grad_norm": 0.05545353889465332, "learning_rate": 2.4073035021274605e-05, "loss": 0.0267, "step": 55450 }, { "epoch": 1.5558984429793798, "grad_norm": 0.2579464912414551, "learning_rate": 2.4068359283677e-05, "loss": 0.0249, "step": 55460 }, { "epoch": 1.5561789872352363, "grad_norm": 8.901973724365234, "learning_rate": 2.4063683546079395e-05, "loss": 0.0255, "step": 55470 }, { "epoch": 1.5564595314910927, "grad_norm": 0.3929721415042877, "learning_rate": 2.405900780848179e-05, "loss": 0.0264, "step": 55480 }, { "epoch": 1.5567400757469492, "grad_norm": 0.7729992270469666, "learning_rate": 2.4054332070884185e-05, "loss": 0.0245, "step": 55490 }, { "epoch": 1.5570206200028054, "grad_norm": 1.3560651540756226, "learning_rate": 2.4049656333286578e-05, "loss": 0.0155, "step": 55500 }, { "epoch": 1.5573011642586618, "grad_norm": 0.46578341722488403, "learning_rate": 2.404498059568897e-05, "loss": 0.0131, "step": 55510 }, { "epoch": 1.557581708514518, "grad_norm": 0.4315662384033203, "learning_rate": 2.4040304858091364e-05, "loss": 0.0097, "step": 55520 }, { "epoch": 1.5578622527703745, "grad_norm": 0.11956389993429184, "learning_rate": 2.4035629120493757e-05, "loss": 0.0499, "step": 55530 }, { "epoch": 1.558142797026231, "grad_norm": 0.16552700102329254, "learning_rate": 2.4030953382896154e-05, "loss": 0.0333, "step": 55540 }, { "epoch": 1.5584233412820874, "grad_norm": 0.02753661572933197, "learning_rate": 2.4026277645298547e-05, "loss": 0.052, "step": 55550 }, { "epoch": 1.5587038855379436, "grad_norm": 0.2146671563386917, "learning_rate": 2.402160190770094e-05, "loss": 0.015, "step": 55560 }, { "epoch": 1.5589844297937998, "grad_norm": 0.040997214615345, "learning_rate": 2.4016926170103337e-05, "loss": 0.0323, "step": 55570 }, { "epoch": 1.5592649740496562, "grad_norm": 0.027753842994570732, "learning_rate": 2.401225043250573e-05, "loss": 0.014, "step": 55580 }, { "epoch": 1.5595455183055127, "grad_norm": 0.17293034493923187, "learning_rate": 2.4007574694908123e-05, "loss": 0.0274, "step": 55590 }, { "epoch": 1.5598260625613691, "grad_norm": 0.9502708315849304, "learning_rate": 2.4002898957310516e-05, "loss": 0.029, "step": 55600 }, { "epoch": 1.5601066068172256, "grad_norm": 0.20577789843082428, "learning_rate": 2.3998223219712913e-05, "loss": 0.0216, "step": 55610 }, { "epoch": 1.5603871510730818, "grad_norm": 0.09305811673402786, "learning_rate": 2.3993547482115306e-05, "loss": 0.0205, "step": 55620 }, { "epoch": 1.560667695328938, "grad_norm": 0.48149633407592773, "learning_rate": 2.39888717445177e-05, "loss": 0.0219, "step": 55630 }, { "epoch": 1.5609482395847945, "grad_norm": 0.02559610642492771, "learning_rate": 2.3984196006920092e-05, "loss": 0.0238, "step": 55640 }, { "epoch": 1.561228783840651, "grad_norm": 0.1322421431541443, "learning_rate": 2.3979520269322485e-05, "loss": 0.0323, "step": 55650 }, { "epoch": 1.5615093280965073, "grad_norm": 0.7579728364944458, "learning_rate": 2.397484453172488e-05, "loss": 0.032, "step": 55660 }, { "epoch": 1.5617898723523636, "grad_norm": 0.1804034262895584, "learning_rate": 2.3970168794127275e-05, "loss": 0.0264, "step": 55670 }, { "epoch": 1.56207041660822, "grad_norm": 0.41195738315582275, "learning_rate": 2.396549305652967e-05, "loss": 0.0063, "step": 55680 }, { "epoch": 1.5623509608640762, "grad_norm": 41.375389099121094, "learning_rate": 2.3960817318932065e-05, "loss": 0.0275, "step": 55690 }, { "epoch": 1.5626315051199327, "grad_norm": 0.019550444558262825, "learning_rate": 2.3956141581334458e-05, "loss": 0.0288, "step": 55700 }, { "epoch": 1.562912049375789, "grad_norm": 0.04373643547296524, "learning_rate": 2.395146584373685e-05, "loss": 0.0423, "step": 55710 }, { "epoch": 1.5631925936316455, "grad_norm": 2.616420030593872, "learning_rate": 2.3946790106139244e-05, "loss": 0.0473, "step": 55720 }, { "epoch": 1.5634731378875018, "grad_norm": 0.10464374721050262, "learning_rate": 2.3942114368541637e-05, "loss": 0.0265, "step": 55730 }, { "epoch": 1.563753682143358, "grad_norm": 1.0532795190811157, "learning_rate": 2.393743863094403e-05, "loss": 0.0306, "step": 55740 }, { "epoch": 1.5640342263992144, "grad_norm": 0.6037876009941101, "learning_rate": 2.3932762893346427e-05, "loss": 0.0343, "step": 55750 }, { "epoch": 1.5643147706550709, "grad_norm": 0.2707092761993408, "learning_rate": 2.392808715574882e-05, "loss": 0.0393, "step": 55760 }, { "epoch": 1.5645953149109273, "grad_norm": 0.07003992795944214, "learning_rate": 2.3923411418151213e-05, "loss": 0.0089, "step": 55770 }, { "epoch": 1.5648758591667835, "grad_norm": 1.4598172903060913, "learning_rate": 2.391873568055361e-05, "loss": 0.0455, "step": 55780 }, { "epoch": 1.56515640342264, "grad_norm": 0.656765341758728, "learning_rate": 2.3914059942956003e-05, "loss": 0.0172, "step": 55790 }, { "epoch": 1.5654369476784962, "grad_norm": 0.03420688211917877, "learning_rate": 2.3909384205358396e-05, "loss": 0.014, "step": 55800 }, { "epoch": 1.5657174919343526, "grad_norm": 0.045262549072504044, "learning_rate": 2.390470846776079e-05, "loss": 0.0156, "step": 55810 }, { "epoch": 1.565998036190209, "grad_norm": 0.026905063539743423, "learning_rate": 2.3900032730163186e-05, "loss": 0.0658, "step": 55820 }, { "epoch": 1.5662785804460655, "grad_norm": 0.23112532496452332, "learning_rate": 2.389535699256558e-05, "loss": 0.0171, "step": 55830 }, { "epoch": 1.5665591247019217, "grad_norm": 0.04531925544142723, "learning_rate": 2.3890681254967972e-05, "loss": 0.0245, "step": 55840 }, { "epoch": 1.566839668957778, "grad_norm": 0.21046306192874908, "learning_rate": 2.3886005517370365e-05, "loss": 0.0269, "step": 55850 }, { "epoch": 1.5671202132136344, "grad_norm": 0.44054359197616577, "learning_rate": 2.388132977977276e-05, "loss": 0.0518, "step": 55860 }, { "epoch": 1.5674007574694908, "grad_norm": 0.028244538232684135, "learning_rate": 2.3876654042175155e-05, "loss": 0.0161, "step": 55870 }, { "epoch": 1.5676813017253473, "grad_norm": 0.357248455286026, "learning_rate": 2.3871978304577548e-05, "loss": 0.02, "step": 55880 }, { "epoch": 1.5679618459812037, "grad_norm": 1.6045575141906738, "learning_rate": 2.3867302566979945e-05, "loss": 0.0166, "step": 55890 }, { "epoch": 1.56824239023706, "grad_norm": 0.5810555815696716, "learning_rate": 2.3862626829382338e-05, "loss": 0.0234, "step": 55900 }, { "epoch": 1.5685229344929161, "grad_norm": 0.05305883288383484, "learning_rate": 2.385795109178473e-05, "loss": 0.0171, "step": 55910 }, { "epoch": 1.5688034787487726, "grad_norm": 1.2115193605422974, "learning_rate": 2.3853275354187124e-05, "loss": 0.0297, "step": 55920 }, { "epoch": 1.569084023004629, "grad_norm": 0.09642118215560913, "learning_rate": 2.3848599616589517e-05, "loss": 0.0269, "step": 55930 }, { "epoch": 1.5693645672604855, "grad_norm": 0.2985004782676697, "learning_rate": 2.384392387899191e-05, "loss": 0.0231, "step": 55940 }, { "epoch": 1.5696451115163417, "grad_norm": 0.09547527879476547, "learning_rate": 2.3839248141394304e-05, "loss": 0.0245, "step": 55950 }, { "epoch": 1.5699256557721981, "grad_norm": 0.49979764223098755, "learning_rate": 2.38345724037967e-05, "loss": 0.0266, "step": 55960 }, { "epoch": 1.5702062000280543, "grad_norm": 0.08709584176540375, "learning_rate": 2.3829896666199093e-05, "loss": 0.0347, "step": 55970 }, { "epoch": 1.5704867442839108, "grad_norm": 0.42276278138160706, "learning_rate": 2.382522092860149e-05, "loss": 0.0328, "step": 55980 }, { "epoch": 1.5707672885397672, "grad_norm": 0.04411087930202484, "learning_rate": 2.3820545191003883e-05, "loss": 0.0133, "step": 55990 }, { "epoch": 1.5710478327956237, "grad_norm": 0.5109043121337891, "learning_rate": 2.3815869453406276e-05, "loss": 0.022, "step": 56000 }, { "epoch": 1.5713283770514799, "grad_norm": 0.06875672936439514, "learning_rate": 2.381119371580867e-05, "loss": 0.029, "step": 56010 }, { "epoch": 1.571608921307336, "grad_norm": 0.050021037459373474, "learning_rate": 2.3806517978211062e-05, "loss": 0.0074, "step": 56020 }, { "epoch": 1.5718894655631925, "grad_norm": 0.35527947545051575, "learning_rate": 2.380184224061346e-05, "loss": 0.0363, "step": 56030 }, { "epoch": 1.572170009819049, "grad_norm": 0.06699926406145096, "learning_rate": 2.3797166503015852e-05, "loss": 0.0247, "step": 56040 }, { "epoch": 1.5724505540749054, "grad_norm": 0.1111108809709549, "learning_rate": 2.3792490765418245e-05, "loss": 0.0404, "step": 56050 }, { "epoch": 1.5727310983307616, "grad_norm": 0.22407159209251404, "learning_rate": 2.378781502782064e-05, "loss": 0.0142, "step": 56060 }, { "epoch": 1.573011642586618, "grad_norm": 0.08973833918571472, "learning_rate": 2.378313929022303e-05, "loss": 0.0305, "step": 56070 }, { "epoch": 1.5732921868424743, "grad_norm": 0.05803001672029495, "learning_rate": 2.3778463552625428e-05, "loss": 0.0265, "step": 56080 }, { "epoch": 1.5735727310983307, "grad_norm": 0.03913208097219467, "learning_rate": 2.3773787815027825e-05, "loss": 0.0374, "step": 56090 }, { "epoch": 1.5738532753541872, "grad_norm": 0.07273263484239578, "learning_rate": 2.3769112077430218e-05, "loss": 0.019, "step": 56100 }, { "epoch": 1.5741338196100436, "grad_norm": 0.321290522813797, "learning_rate": 2.376443633983261e-05, "loss": 0.0211, "step": 56110 }, { "epoch": 1.5744143638658998, "grad_norm": 0.0182444229722023, "learning_rate": 2.3759760602235004e-05, "loss": 0.0327, "step": 56120 }, { "epoch": 1.574694908121756, "grad_norm": 0.8580840826034546, "learning_rate": 2.3755084864637397e-05, "loss": 0.0505, "step": 56130 }, { "epoch": 1.5749754523776125, "grad_norm": 0.03818349540233612, "learning_rate": 2.375040912703979e-05, "loss": 0.021, "step": 56140 }, { "epoch": 1.575255996633469, "grad_norm": 0.22200070321559906, "learning_rate": 2.3745733389442184e-05, "loss": 0.0367, "step": 56150 }, { "epoch": 1.5755365408893254, "grad_norm": 0.019805442541837692, "learning_rate": 2.374105765184458e-05, "loss": 0.0108, "step": 56160 }, { "epoch": 1.5758170851451818, "grad_norm": 0.10724752396345139, "learning_rate": 2.3736381914246973e-05, "loss": 0.0741, "step": 56170 }, { "epoch": 1.576097629401038, "grad_norm": 0.6208834052085876, "learning_rate": 2.3731706176649366e-05, "loss": 0.0066, "step": 56180 }, { "epoch": 1.5763781736568943, "grad_norm": 0.4264887869358063, "learning_rate": 2.3727030439051763e-05, "loss": 0.0272, "step": 56190 }, { "epoch": 1.5766587179127507, "grad_norm": 0.5778617262840271, "learning_rate": 2.3722354701454156e-05, "loss": 0.0215, "step": 56200 }, { "epoch": 1.5769392621686071, "grad_norm": 0.04816336929798126, "learning_rate": 2.371767896385655e-05, "loss": 0.0282, "step": 56210 }, { "epoch": 1.5772198064244636, "grad_norm": 0.045576632022857666, "learning_rate": 2.3713003226258942e-05, "loss": 0.0138, "step": 56220 }, { "epoch": 1.5775003506803198, "grad_norm": 0.49030691385269165, "learning_rate": 2.370832748866134e-05, "loss": 0.0409, "step": 56230 }, { "epoch": 1.5777808949361762, "grad_norm": 0.04319191724061966, "learning_rate": 2.3703651751063732e-05, "loss": 0.0061, "step": 56240 }, { "epoch": 1.5780614391920325, "grad_norm": 4.299720287322998, "learning_rate": 2.3698976013466125e-05, "loss": 0.026, "step": 56250 }, { "epoch": 1.578341983447889, "grad_norm": 0.4254834055900574, "learning_rate": 2.369430027586852e-05, "loss": 0.0162, "step": 56260 }, { "epoch": 1.5786225277037453, "grad_norm": 0.04226585105061531, "learning_rate": 2.368962453827091e-05, "loss": 0.0399, "step": 56270 }, { "epoch": 1.5789030719596018, "grad_norm": 0.8973715901374817, "learning_rate": 2.3684948800673308e-05, "loss": 0.0401, "step": 56280 }, { "epoch": 1.579183616215458, "grad_norm": 0.5999881625175476, "learning_rate": 2.36802730630757e-05, "loss": 0.0578, "step": 56290 }, { "epoch": 1.5794641604713142, "grad_norm": 0.18467040359973907, "learning_rate": 2.3675597325478098e-05, "loss": 0.0373, "step": 56300 }, { "epoch": 1.5797447047271707, "grad_norm": 0.15683117508888245, "learning_rate": 2.367092158788049e-05, "loss": 0.0446, "step": 56310 }, { "epoch": 1.580025248983027, "grad_norm": 0.6478664875030518, "learning_rate": 2.3666245850282884e-05, "loss": 0.0145, "step": 56320 }, { "epoch": 1.5803057932388835, "grad_norm": 0.39208710193634033, "learning_rate": 2.3661570112685277e-05, "loss": 0.0394, "step": 56330 }, { "epoch": 1.5805863374947398, "grad_norm": 0.9251658916473389, "learning_rate": 2.365689437508767e-05, "loss": 0.0305, "step": 56340 }, { "epoch": 1.5808668817505962, "grad_norm": 0.03721468895673752, "learning_rate": 2.3652218637490064e-05, "loss": 0.0189, "step": 56350 }, { "epoch": 1.5811474260064524, "grad_norm": 0.18749481439590454, "learning_rate": 2.3647542899892457e-05, "loss": 0.017, "step": 56360 }, { "epoch": 1.5814279702623089, "grad_norm": 0.09077321738004684, "learning_rate": 2.3642867162294853e-05, "loss": 0.0205, "step": 56370 }, { "epoch": 1.5817085145181653, "grad_norm": 0.08423442393541336, "learning_rate": 2.3638191424697247e-05, "loss": 0.0192, "step": 56380 }, { "epoch": 1.5819890587740217, "grad_norm": 0.5980010628700256, "learning_rate": 2.3633515687099643e-05, "loss": 0.0324, "step": 56390 }, { "epoch": 1.582269603029878, "grad_norm": 0.039644643664360046, "learning_rate": 2.3628839949502036e-05, "loss": 0.0114, "step": 56400 }, { "epoch": 1.5825501472857342, "grad_norm": 0.4112703502178192, "learning_rate": 2.362416421190443e-05, "loss": 0.0401, "step": 56410 }, { "epoch": 1.5828306915415906, "grad_norm": 0.05626508966088295, "learning_rate": 2.3619488474306823e-05, "loss": 0.0267, "step": 56420 }, { "epoch": 1.583111235797447, "grad_norm": 0.054027680307626724, "learning_rate": 2.3614812736709216e-05, "loss": 0.0168, "step": 56430 }, { "epoch": 1.5833917800533035, "grad_norm": 0.6199393272399902, "learning_rate": 2.3610136999111612e-05, "loss": 0.0232, "step": 56440 }, { "epoch": 1.5836723243091597, "grad_norm": 0.12415241450071335, "learning_rate": 2.3605461261514005e-05, "loss": 0.0479, "step": 56450 }, { "epoch": 1.5839528685650162, "grad_norm": 0.11393533647060394, "learning_rate": 2.36007855239164e-05, "loss": 0.0213, "step": 56460 }, { "epoch": 1.5842334128208724, "grad_norm": 0.02225232869386673, "learning_rate": 2.359610978631879e-05, "loss": 0.0123, "step": 56470 }, { "epoch": 1.5845139570767288, "grad_norm": 0.3384416699409485, "learning_rate": 2.3591434048721188e-05, "loss": 0.0256, "step": 56480 }, { "epoch": 1.5847945013325853, "grad_norm": 0.4241143465042114, "learning_rate": 2.358675831112358e-05, "loss": 0.0141, "step": 56490 }, { "epoch": 1.5850750455884417, "grad_norm": 0.06534229218959808, "learning_rate": 2.3582082573525975e-05, "loss": 0.0274, "step": 56500 }, { "epoch": 1.585355589844298, "grad_norm": 0.09417881071567535, "learning_rate": 2.357740683592837e-05, "loss": 0.0282, "step": 56510 }, { "epoch": 1.5856361341001541, "grad_norm": 0.24034392833709717, "learning_rate": 2.3572731098330764e-05, "loss": 0.0202, "step": 56520 }, { "epoch": 1.5859166783560106, "grad_norm": 0.3406159579753876, "learning_rate": 2.3568055360733157e-05, "loss": 0.0274, "step": 56530 }, { "epoch": 1.586197222611867, "grad_norm": 0.02883794531226158, "learning_rate": 2.356337962313555e-05, "loss": 0.0166, "step": 56540 }, { "epoch": 1.5864777668677235, "grad_norm": 0.01626587100327015, "learning_rate": 2.3558703885537944e-05, "loss": 0.0189, "step": 56550 }, { "epoch": 1.58675831112358, "grad_norm": 0.2614772617816925, "learning_rate": 2.3554028147940337e-05, "loss": 0.033, "step": 56560 }, { "epoch": 1.5870388553794361, "grad_norm": 0.324950635433197, "learning_rate": 2.354935241034273e-05, "loss": 0.0487, "step": 56570 }, { "epoch": 1.5873193996352923, "grad_norm": 0.05136653408408165, "learning_rate": 2.3544676672745127e-05, "loss": 0.0138, "step": 56580 }, { "epoch": 1.5875999438911488, "grad_norm": 0.45048683881759644, "learning_rate": 2.3540000935147523e-05, "loss": 0.0242, "step": 56590 }, { "epoch": 1.5878804881470052, "grad_norm": 0.13201995193958282, "learning_rate": 2.3535325197549916e-05, "loss": 0.0165, "step": 56600 }, { "epoch": 1.5881610324028617, "grad_norm": 0.6619156002998352, "learning_rate": 2.353064945995231e-05, "loss": 0.0422, "step": 56610 }, { "epoch": 1.5884415766587179, "grad_norm": 1.1701784133911133, "learning_rate": 2.3525973722354703e-05, "loss": 0.0359, "step": 56620 }, { "epoch": 1.5887221209145743, "grad_norm": 0.45241793990135193, "learning_rate": 2.3521297984757096e-05, "loss": 0.028, "step": 56630 }, { "epoch": 1.5890026651704305, "grad_norm": 0.9349461793899536, "learning_rate": 2.351662224715949e-05, "loss": 0.0387, "step": 56640 }, { "epoch": 1.589283209426287, "grad_norm": 0.5039007067680359, "learning_rate": 2.3511946509561885e-05, "loss": 0.0073, "step": 56650 }, { "epoch": 1.5895637536821434, "grad_norm": 0.12743675708770752, "learning_rate": 2.350727077196428e-05, "loss": 0.023, "step": 56660 }, { "epoch": 1.5898442979379999, "grad_norm": 0.3502245843410492, "learning_rate": 2.3502595034366672e-05, "loss": 0.0095, "step": 56670 }, { "epoch": 1.590124842193856, "grad_norm": 0.19765222072601318, "learning_rate": 2.3497919296769065e-05, "loss": 0.0213, "step": 56680 }, { "epoch": 1.5904053864497123, "grad_norm": 0.040735967457294464, "learning_rate": 2.349324355917146e-05, "loss": 0.035, "step": 56690 }, { "epoch": 1.5906859307055687, "grad_norm": 0.01576569490134716, "learning_rate": 2.3488567821573855e-05, "loss": 0.0023, "step": 56700 }, { "epoch": 1.5909664749614252, "grad_norm": 0.02141590602695942, "learning_rate": 2.3483892083976248e-05, "loss": 0.028, "step": 56710 }, { "epoch": 1.5912470192172816, "grad_norm": 0.4007592499256134, "learning_rate": 2.3479216346378644e-05, "loss": 0.0178, "step": 56720 }, { "epoch": 1.5915275634731378, "grad_norm": 0.1738976538181305, "learning_rate": 2.3474540608781037e-05, "loss": 0.0102, "step": 56730 }, { "epoch": 1.5918081077289943, "grad_norm": 1.2640509605407715, "learning_rate": 2.346986487118343e-05, "loss": 0.0395, "step": 56740 }, { "epoch": 1.5920886519848505, "grad_norm": 0.2250203937292099, "learning_rate": 2.3465189133585824e-05, "loss": 0.0223, "step": 56750 }, { "epoch": 1.592369196240707, "grad_norm": 0.2958712875843048, "learning_rate": 2.3460513395988217e-05, "loss": 0.0488, "step": 56760 }, { "epoch": 1.5926497404965634, "grad_norm": 0.0857834666967392, "learning_rate": 2.345583765839061e-05, "loss": 0.021, "step": 56770 }, { "epoch": 1.5929302847524198, "grad_norm": 0.02139771357178688, "learning_rate": 2.3451161920793007e-05, "loss": 0.0138, "step": 56780 }, { "epoch": 1.593210829008276, "grad_norm": 0.03739932179450989, "learning_rate": 2.34464861831954e-05, "loss": 0.007, "step": 56790 }, { "epoch": 1.5934913732641323, "grad_norm": 0.03624466806650162, "learning_rate": 2.3441810445597796e-05, "loss": 0.0188, "step": 56800 }, { "epoch": 1.5937719175199887, "grad_norm": 0.17351962625980377, "learning_rate": 2.343713470800019e-05, "loss": 0.0565, "step": 56810 }, { "epoch": 1.5940524617758451, "grad_norm": 0.695644199848175, "learning_rate": 2.3432458970402583e-05, "loss": 0.0559, "step": 56820 }, { "epoch": 1.5943330060317016, "grad_norm": 0.5327982306480408, "learning_rate": 2.3427783232804976e-05, "loss": 0.0336, "step": 56830 }, { "epoch": 1.594613550287558, "grad_norm": 0.5379593372344971, "learning_rate": 2.342310749520737e-05, "loss": 0.0214, "step": 56840 }, { "epoch": 1.5948940945434142, "grad_norm": 0.03211367502808571, "learning_rate": 2.3418431757609762e-05, "loss": 0.0065, "step": 56850 }, { "epoch": 1.5951746387992705, "grad_norm": 0.15073099732398987, "learning_rate": 2.341375602001216e-05, "loss": 0.041, "step": 56860 }, { "epoch": 1.595455183055127, "grad_norm": 0.05668662488460541, "learning_rate": 2.3409080282414552e-05, "loss": 0.0166, "step": 56870 }, { "epoch": 1.5957357273109833, "grad_norm": 0.8969873189926147, "learning_rate": 2.3404404544816945e-05, "loss": 0.0262, "step": 56880 }, { "epoch": 1.5960162715668398, "grad_norm": 0.27871641516685486, "learning_rate": 2.339972880721934e-05, "loss": 0.0264, "step": 56890 }, { "epoch": 1.596296815822696, "grad_norm": 0.11606337130069733, "learning_rate": 2.3395053069621735e-05, "loss": 0.0099, "step": 56900 }, { "epoch": 1.5965773600785524, "grad_norm": 0.44923555850982666, "learning_rate": 2.3390377332024128e-05, "loss": 0.0366, "step": 56910 }, { "epoch": 1.5968579043344087, "grad_norm": 0.03944730386137962, "learning_rate": 2.338570159442652e-05, "loss": 0.0212, "step": 56920 }, { "epoch": 1.597138448590265, "grad_norm": 0.7497053146362305, "learning_rate": 2.3381025856828917e-05, "loss": 0.038, "step": 56930 }, { "epoch": 1.5974189928461215, "grad_norm": 0.05065479129552841, "learning_rate": 2.337635011923131e-05, "loss": 0.0358, "step": 56940 }, { "epoch": 1.597699537101978, "grad_norm": 0.2786787748336792, "learning_rate": 2.3371674381633704e-05, "loss": 0.0166, "step": 56950 }, { "epoch": 1.5979800813578342, "grad_norm": 0.10218226164579391, "learning_rate": 2.3366998644036097e-05, "loss": 0.0235, "step": 56960 }, { "epoch": 1.5982606256136904, "grad_norm": 0.028772274032235146, "learning_rate": 2.336232290643849e-05, "loss": 0.0133, "step": 56970 }, { "epoch": 1.5985411698695469, "grad_norm": 0.3194539248943329, "learning_rate": 2.3357647168840883e-05, "loss": 0.0432, "step": 56980 }, { "epoch": 1.5988217141254033, "grad_norm": 1.6960409879684448, "learning_rate": 2.335297143124328e-05, "loss": 0.0273, "step": 56990 }, { "epoch": 1.5991022583812597, "grad_norm": 0.25796180963516235, "learning_rate": 2.3348295693645676e-05, "loss": 0.0301, "step": 57000 }, { "epoch": 1.599382802637116, "grad_norm": 0.7084257006645203, "learning_rate": 2.334361995604807e-05, "loss": 0.0129, "step": 57010 }, { "epoch": 1.5996633468929724, "grad_norm": 0.05218948796391487, "learning_rate": 2.3338944218450463e-05, "loss": 0.0137, "step": 57020 }, { "epoch": 1.5999438911488286, "grad_norm": 0.028898587450385094, "learning_rate": 2.3334268480852856e-05, "loss": 0.0385, "step": 57030 }, { "epoch": 1.600224435404685, "grad_norm": 0.36233004927635193, "learning_rate": 2.332959274325525e-05, "loss": 0.0184, "step": 57040 }, { "epoch": 1.6005049796605415, "grad_norm": 0.008957485668361187, "learning_rate": 2.3324917005657642e-05, "loss": 0.0071, "step": 57050 }, { "epoch": 1.600785523916398, "grad_norm": 0.14750021696090698, "learning_rate": 2.3320241268060035e-05, "loss": 0.014, "step": 57060 }, { "epoch": 1.6010660681722542, "grad_norm": 0.05590842664241791, "learning_rate": 2.3315565530462432e-05, "loss": 0.0297, "step": 57070 }, { "epoch": 1.6013466124281104, "grad_norm": 0.937084972858429, "learning_rate": 2.3310889792864825e-05, "loss": 0.0274, "step": 57080 }, { "epoch": 1.6016271566839668, "grad_norm": 0.03786454349756241, "learning_rate": 2.3306214055267218e-05, "loss": 0.0357, "step": 57090 }, { "epoch": 1.6019077009398233, "grad_norm": 0.7916435599327087, "learning_rate": 2.3301538317669615e-05, "loss": 0.0469, "step": 57100 }, { "epoch": 1.6021882451956797, "grad_norm": 0.09732218831777573, "learning_rate": 2.3296862580072008e-05, "loss": 0.0206, "step": 57110 }, { "epoch": 1.602468789451536, "grad_norm": 0.9963157773017883, "learning_rate": 2.32921868424744e-05, "loss": 0.015, "step": 57120 }, { "epoch": 1.6027493337073924, "grad_norm": 0.8732151985168457, "learning_rate": 2.3287511104876794e-05, "loss": 0.0212, "step": 57130 }, { "epoch": 1.6030298779632486, "grad_norm": 16.64512825012207, "learning_rate": 2.328283536727919e-05, "loss": 0.0291, "step": 57140 }, { "epoch": 1.603310422219105, "grad_norm": 0.05173669010400772, "learning_rate": 2.3278159629681584e-05, "loss": 0.024, "step": 57150 }, { "epoch": 1.6035909664749615, "grad_norm": 0.47335219383239746, "learning_rate": 2.3273483892083977e-05, "loss": 0.0399, "step": 57160 }, { "epoch": 1.603871510730818, "grad_norm": 0.08395947515964508, "learning_rate": 2.326880815448637e-05, "loss": 0.0175, "step": 57170 }, { "epoch": 1.6041520549866741, "grad_norm": 0.11476951092481613, "learning_rate": 2.3264132416888763e-05, "loss": 0.0068, "step": 57180 }, { "epoch": 1.6044325992425303, "grad_norm": 0.07609900832176208, "learning_rate": 2.325945667929116e-05, "loss": 0.029, "step": 57190 }, { "epoch": 1.6047131434983868, "grad_norm": 1.6150128841400146, "learning_rate": 2.3254780941693553e-05, "loss": 0.0115, "step": 57200 }, { "epoch": 1.6049936877542432, "grad_norm": 0.014158414676785469, "learning_rate": 2.325010520409595e-05, "loss": 0.0125, "step": 57210 }, { "epoch": 1.6052742320100997, "grad_norm": 0.4345167875289917, "learning_rate": 2.3245429466498343e-05, "loss": 0.0134, "step": 57220 }, { "epoch": 1.605554776265956, "grad_norm": 0.04642181098461151, "learning_rate": 2.3240753728900736e-05, "loss": 0.0162, "step": 57230 }, { "epoch": 1.6058353205218123, "grad_norm": 2.904733657836914, "learning_rate": 2.323607799130313e-05, "loss": 0.0484, "step": 57240 }, { "epoch": 1.6061158647776685, "grad_norm": 0.052003953605890274, "learning_rate": 2.3231402253705522e-05, "loss": 0.0342, "step": 57250 }, { "epoch": 1.606396409033525, "grad_norm": 0.04807988926768303, "learning_rate": 2.3226726516107915e-05, "loss": 0.0046, "step": 57260 }, { "epoch": 1.6066769532893814, "grad_norm": 0.6277223825454712, "learning_rate": 2.322205077851031e-05, "loss": 0.0301, "step": 57270 }, { "epoch": 1.6069574975452379, "grad_norm": 1.006812572479248, "learning_rate": 2.3217375040912705e-05, "loss": 0.0271, "step": 57280 }, { "epoch": 1.607238041801094, "grad_norm": 0.4694596827030182, "learning_rate": 2.3212699303315098e-05, "loss": 0.0285, "step": 57290 }, { "epoch": 1.6075185860569505, "grad_norm": 0.21756625175476074, "learning_rate": 2.3208023565717495e-05, "loss": 0.0196, "step": 57300 }, { "epoch": 1.6077991303128067, "grad_norm": 0.6189727783203125, "learning_rate": 2.3203347828119888e-05, "loss": 0.0117, "step": 57310 }, { "epoch": 1.6080796745686632, "grad_norm": 0.022759636864066124, "learning_rate": 2.319867209052228e-05, "loss": 0.0097, "step": 57320 }, { "epoch": 1.6083602188245196, "grad_norm": 0.04677291586995125, "learning_rate": 2.3193996352924674e-05, "loss": 0.0248, "step": 57330 }, { "epoch": 1.608640763080376, "grad_norm": 0.3866938650608063, "learning_rate": 2.318932061532707e-05, "loss": 0.0143, "step": 57340 }, { "epoch": 1.6089213073362323, "grad_norm": 9.0342378616333, "learning_rate": 2.3184644877729464e-05, "loss": 0.012, "step": 57350 }, { "epoch": 1.6092018515920885, "grad_norm": 0.033762332051992416, "learning_rate": 2.3179969140131857e-05, "loss": 0.029, "step": 57360 }, { "epoch": 1.609482395847945, "grad_norm": 2.702488899230957, "learning_rate": 2.317529340253425e-05, "loss": 0.0523, "step": 57370 }, { "epoch": 1.6097629401038014, "grad_norm": 0.040910910815000534, "learning_rate": 2.3170617664936643e-05, "loss": 0.0067, "step": 57380 }, { "epoch": 1.6100434843596578, "grad_norm": 2.1222314834594727, "learning_rate": 2.316594192733904e-05, "loss": 0.0309, "step": 57390 }, { "epoch": 1.610324028615514, "grad_norm": 0.09909479320049286, "learning_rate": 2.3161266189741433e-05, "loss": 0.0363, "step": 57400 }, { "epoch": 1.6106045728713705, "grad_norm": 0.5748298168182373, "learning_rate": 2.315659045214383e-05, "loss": 0.0463, "step": 57410 }, { "epoch": 1.6108851171272267, "grad_norm": 0.4209153354167938, "learning_rate": 2.3151914714546223e-05, "loss": 0.0378, "step": 57420 }, { "epoch": 1.6111656613830831, "grad_norm": 0.3359379172325134, "learning_rate": 2.3147238976948616e-05, "loss": 0.0176, "step": 57430 }, { "epoch": 1.6114462056389396, "grad_norm": 0.094283327460289, "learning_rate": 2.314256323935101e-05, "loss": 0.053, "step": 57440 }, { "epoch": 1.611726749894796, "grad_norm": 0.22771340608596802, "learning_rate": 2.3137887501753402e-05, "loss": 0.0384, "step": 57450 }, { "epoch": 1.6120072941506522, "grad_norm": 0.07049820572137833, "learning_rate": 2.3133211764155795e-05, "loss": 0.0116, "step": 57460 }, { "epoch": 1.6122878384065085, "grad_norm": 0.23472630977630615, "learning_rate": 2.312853602655819e-05, "loss": 0.0142, "step": 57470 }, { "epoch": 1.612568382662365, "grad_norm": 1.1326524019241333, "learning_rate": 2.3123860288960585e-05, "loss": 0.0308, "step": 57480 }, { "epoch": 1.6128489269182213, "grad_norm": 0.12920117378234863, "learning_rate": 2.3119184551362978e-05, "loss": 0.0207, "step": 57490 }, { "epoch": 1.6131294711740778, "grad_norm": 0.6285235285758972, "learning_rate": 2.3114508813765375e-05, "loss": 0.0242, "step": 57500 }, { "epoch": 1.6134100154299342, "grad_norm": 0.31158050894737244, "learning_rate": 2.3109833076167768e-05, "loss": 0.0197, "step": 57510 }, { "epoch": 1.6136905596857904, "grad_norm": 1.3726670742034912, "learning_rate": 2.310515733857016e-05, "loss": 0.0272, "step": 57520 }, { "epoch": 1.6139711039416467, "grad_norm": 0.016466651111841202, "learning_rate": 2.3100481600972554e-05, "loss": 0.008, "step": 57530 }, { "epoch": 1.614251648197503, "grad_norm": 23.05562973022461, "learning_rate": 2.3095805863374947e-05, "loss": 0.035, "step": 57540 }, { "epoch": 1.6145321924533595, "grad_norm": 0.13589176535606384, "learning_rate": 2.3091130125777344e-05, "loss": 0.016, "step": 57550 }, { "epoch": 1.614812736709216, "grad_norm": 0.6686053276062012, "learning_rate": 2.3086454388179737e-05, "loss": 0.0349, "step": 57560 }, { "epoch": 1.6150932809650722, "grad_norm": 0.032886359840631485, "learning_rate": 2.308177865058213e-05, "loss": 0.0281, "step": 57570 }, { "epoch": 1.6153738252209286, "grad_norm": 0.05275516211986542, "learning_rate": 2.3077102912984523e-05, "loss": 0.0033, "step": 57580 }, { "epoch": 1.6156543694767849, "grad_norm": 1.0990711450576782, "learning_rate": 2.3072427175386917e-05, "loss": 0.0403, "step": 57590 }, { "epoch": 1.6159349137326413, "grad_norm": 0.03908123821020126, "learning_rate": 2.3067751437789313e-05, "loss": 0.0303, "step": 57600 }, { "epoch": 1.6162154579884978, "grad_norm": 0.04672529548406601, "learning_rate": 2.3063075700191706e-05, "loss": 0.008, "step": 57610 }, { "epoch": 1.6164960022443542, "grad_norm": 0.021605949848890305, "learning_rate": 2.3058399962594103e-05, "loss": 0.0063, "step": 57620 }, { "epoch": 1.6167765465002104, "grad_norm": 0.03382399305701256, "learning_rate": 2.3053724224996496e-05, "loss": 0.0385, "step": 57630 }, { "epoch": 1.6170570907560666, "grad_norm": 0.36573195457458496, "learning_rate": 2.304904848739889e-05, "loss": 0.0339, "step": 57640 }, { "epoch": 1.617337635011923, "grad_norm": 0.04054216295480728, "learning_rate": 2.3044372749801282e-05, "loss": 0.0173, "step": 57650 }, { "epoch": 1.6176181792677795, "grad_norm": 0.07029417157173157, "learning_rate": 2.3039697012203675e-05, "loss": 0.0196, "step": 57660 }, { "epoch": 1.617898723523636, "grad_norm": 0.037278443574905396, "learning_rate": 2.303502127460607e-05, "loss": 0.0286, "step": 57670 }, { "epoch": 1.6181792677794922, "grad_norm": 0.08785868436098099, "learning_rate": 2.3030345537008462e-05, "loss": 0.0229, "step": 57680 }, { "epoch": 1.6184598120353486, "grad_norm": 0.8212907910346985, "learning_rate": 2.3025669799410858e-05, "loss": 0.03, "step": 57690 }, { "epoch": 1.6187403562912048, "grad_norm": 0.08244362473487854, "learning_rate": 2.302099406181325e-05, "loss": 0.0265, "step": 57700 }, { "epoch": 1.6190209005470613, "grad_norm": 1.9220287799835205, "learning_rate": 2.3016318324215648e-05, "loss": 0.0474, "step": 57710 }, { "epoch": 1.6193014448029177, "grad_norm": 0.17276832461357117, "learning_rate": 2.301164258661804e-05, "loss": 0.0401, "step": 57720 }, { "epoch": 1.6195819890587742, "grad_norm": 0.06303601711988449, "learning_rate": 2.3006966849020434e-05, "loss": 0.0169, "step": 57730 }, { "epoch": 1.6198625333146304, "grad_norm": 0.14453968405723572, "learning_rate": 2.3002291111422827e-05, "loss": 0.0147, "step": 57740 }, { "epoch": 1.6201430775704866, "grad_norm": 0.03992118686437607, "learning_rate": 2.299761537382522e-05, "loss": 0.0091, "step": 57750 }, { "epoch": 1.620423621826343, "grad_norm": 0.04252268001437187, "learning_rate": 2.2992939636227617e-05, "loss": 0.0095, "step": 57760 }, { "epoch": 1.6207041660821995, "grad_norm": 0.030899589881300926, "learning_rate": 2.298826389863001e-05, "loss": 0.0205, "step": 57770 }, { "epoch": 1.620984710338056, "grad_norm": 0.09237902611494064, "learning_rate": 2.2983588161032403e-05, "loss": 0.0118, "step": 57780 }, { "epoch": 1.6212652545939124, "grad_norm": 0.016132429242134094, "learning_rate": 2.2978912423434797e-05, "loss": 0.0248, "step": 57790 }, { "epoch": 1.6215457988497686, "grad_norm": 0.7902459502220154, "learning_rate": 2.2974236685837193e-05, "loss": 0.0461, "step": 57800 }, { "epoch": 1.6218263431056248, "grad_norm": 0.2115389108657837, "learning_rate": 2.2969560948239586e-05, "loss": 0.032, "step": 57810 }, { "epoch": 1.6221068873614812, "grad_norm": 3.743804693222046, "learning_rate": 2.296488521064198e-05, "loss": 0.0305, "step": 57820 }, { "epoch": 1.6223874316173377, "grad_norm": 0.35722294449806213, "learning_rate": 2.2960209473044376e-05, "loss": 0.0157, "step": 57830 }, { "epoch": 1.6226679758731941, "grad_norm": 0.07607870548963547, "learning_rate": 2.295553373544677e-05, "loss": 0.0261, "step": 57840 }, { "epoch": 1.6229485201290503, "grad_norm": 0.028888603672385216, "learning_rate": 2.2950857997849162e-05, "loss": 0.0342, "step": 57850 }, { "epoch": 1.6232290643849068, "grad_norm": 0.06795207411050797, "learning_rate": 2.2946182260251555e-05, "loss": 0.0105, "step": 57860 }, { "epoch": 1.623509608640763, "grad_norm": 0.060551904141902924, "learning_rate": 2.294150652265395e-05, "loss": 0.0094, "step": 57870 }, { "epoch": 1.6237901528966194, "grad_norm": 0.021350828930735588, "learning_rate": 2.2936830785056342e-05, "loss": 0.0097, "step": 57880 }, { "epoch": 1.6240706971524759, "grad_norm": 0.6741589307785034, "learning_rate": 2.2932155047458735e-05, "loss": 0.0363, "step": 57890 }, { "epoch": 1.6243512414083323, "grad_norm": 0.3769829571247101, "learning_rate": 2.292747930986113e-05, "loss": 0.0568, "step": 57900 }, { "epoch": 1.6246317856641885, "grad_norm": 0.09599489718675613, "learning_rate": 2.2922803572263528e-05, "loss": 0.017, "step": 57910 }, { "epoch": 1.6249123299200448, "grad_norm": 0.3428609073162079, "learning_rate": 2.291812783466592e-05, "loss": 0.0511, "step": 57920 }, { "epoch": 1.6251928741759012, "grad_norm": 0.04775846377015114, "learning_rate": 2.2913452097068314e-05, "loss": 0.0135, "step": 57930 }, { "epoch": 1.6254734184317576, "grad_norm": 0.2909873127937317, "learning_rate": 2.2908776359470707e-05, "loss": 0.0226, "step": 57940 }, { "epoch": 1.625753962687614, "grad_norm": 0.062465377151966095, "learning_rate": 2.29041006218731e-05, "loss": 0.0206, "step": 57950 }, { "epoch": 1.6260345069434703, "grad_norm": 0.07590743154287338, "learning_rate": 2.2899424884275494e-05, "loss": 0.0462, "step": 57960 }, { "epoch": 1.6263150511993267, "grad_norm": 0.06905794888734818, "learning_rate": 2.289474914667789e-05, "loss": 0.0378, "step": 57970 }, { "epoch": 1.626595595455183, "grad_norm": 0.6757624745368958, "learning_rate": 2.2890073409080283e-05, "loss": 0.0093, "step": 57980 }, { "epoch": 1.6268761397110394, "grad_norm": 0.7471351623535156, "learning_rate": 2.2885397671482677e-05, "loss": 0.0251, "step": 57990 }, { "epoch": 1.6271566839668958, "grad_norm": 0.02642267756164074, "learning_rate": 2.288072193388507e-05, "loss": 0.0391, "step": 58000 }, { "epoch": 1.6274372282227523, "grad_norm": 0.6351426839828491, "learning_rate": 2.2876046196287466e-05, "loss": 0.0193, "step": 58010 }, { "epoch": 1.6277177724786085, "grad_norm": 0.26944977045059204, "learning_rate": 2.287137045868986e-05, "loss": 0.0531, "step": 58020 }, { "epoch": 1.6279983167344647, "grad_norm": 0.9771100878715515, "learning_rate": 2.2866694721092253e-05, "loss": 0.0146, "step": 58030 }, { "epoch": 1.6282788609903212, "grad_norm": 0.23736056685447693, "learning_rate": 2.286201898349465e-05, "loss": 0.0223, "step": 58040 }, { "epoch": 1.6285594052461776, "grad_norm": 0.22642682492733002, "learning_rate": 2.2857343245897042e-05, "loss": 0.0177, "step": 58050 }, { "epoch": 1.628839949502034, "grad_norm": 0.02999846264719963, "learning_rate": 2.2852667508299436e-05, "loss": 0.0057, "step": 58060 }, { "epoch": 1.6291204937578903, "grad_norm": 0.0142514668405056, "learning_rate": 2.284799177070183e-05, "loss": 0.0234, "step": 58070 }, { "epoch": 1.6294010380137467, "grad_norm": 0.16622503101825714, "learning_rate": 2.2843316033104222e-05, "loss": 0.0155, "step": 58080 }, { "epoch": 1.629681582269603, "grad_norm": 0.15146000683307648, "learning_rate": 2.2838640295506615e-05, "loss": 0.0107, "step": 58090 }, { "epoch": 1.6299621265254594, "grad_norm": 1.119017481803894, "learning_rate": 2.283396455790901e-05, "loss": 0.0093, "step": 58100 }, { "epoch": 1.6302426707813158, "grad_norm": 0.6155540347099304, "learning_rate": 2.2829288820311405e-05, "loss": 0.0247, "step": 58110 }, { "epoch": 1.6305232150371722, "grad_norm": 0.017426975071430206, "learning_rate": 2.28246130827138e-05, "loss": 0.0046, "step": 58120 }, { "epoch": 1.6308037592930285, "grad_norm": 0.017725076526403427, "learning_rate": 2.2819937345116194e-05, "loss": 0.0132, "step": 58130 }, { "epoch": 1.6310843035488847, "grad_norm": 0.013704631477594376, "learning_rate": 2.2815261607518588e-05, "loss": 0.0097, "step": 58140 }, { "epoch": 1.6313648478047411, "grad_norm": 0.12711532413959503, "learning_rate": 2.281058586992098e-05, "loss": 0.0349, "step": 58150 }, { "epoch": 1.6316453920605976, "grad_norm": 0.03934046998620033, "learning_rate": 2.2805910132323374e-05, "loss": 0.0056, "step": 58160 }, { "epoch": 1.631925936316454, "grad_norm": 0.02070201374590397, "learning_rate": 2.2801234394725767e-05, "loss": 0.0064, "step": 58170 }, { "epoch": 1.6322064805723104, "grad_norm": 2.9660322666168213, "learning_rate": 2.2796558657128164e-05, "loss": 0.0225, "step": 58180 }, { "epoch": 1.6324870248281667, "grad_norm": 0.03455515578389168, "learning_rate": 2.2791882919530557e-05, "loss": 0.0301, "step": 58190 }, { "epoch": 1.6327675690840229, "grad_norm": 0.671302080154419, "learning_rate": 2.278720718193295e-05, "loss": 0.0237, "step": 58200 }, { "epoch": 1.6330481133398793, "grad_norm": 0.39016109704971313, "learning_rate": 2.2782531444335346e-05, "loss": 0.032, "step": 58210 }, { "epoch": 1.6333286575957358, "grad_norm": 0.03686191141605377, "learning_rate": 2.277785570673774e-05, "loss": 0.0174, "step": 58220 }, { "epoch": 1.6336092018515922, "grad_norm": 0.6304395794868469, "learning_rate": 2.2773179969140133e-05, "loss": 0.0238, "step": 58230 }, { "epoch": 1.6338897461074484, "grad_norm": 0.5306923985481262, "learning_rate": 2.2768504231542526e-05, "loss": 0.0546, "step": 58240 }, { "epoch": 1.6341702903633049, "grad_norm": 0.10417146235704422, "learning_rate": 2.2763828493944922e-05, "loss": 0.0301, "step": 58250 }, { "epoch": 1.634450834619161, "grad_norm": 0.025025011971592903, "learning_rate": 2.2759152756347316e-05, "loss": 0.0152, "step": 58260 }, { "epoch": 1.6347313788750175, "grad_norm": 0.03280699998140335, "learning_rate": 2.275447701874971e-05, "loss": 0.0112, "step": 58270 }, { "epoch": 1.635011923130874, "grad_norm": 0.3037371337413788, "learning_rate": 2.2749801281152102e-05, "loss": 0.0123, "step": 58280 }, { "epoch": 1.6352924673867304, "grad_norm": 0.052240099757909775, "learning_rate": 2.2745125543554495e-05, "loss": 0.041, "step": 58290 }, { "epoch": 1.6355730116425866, "grad_norm": 2.199206829071045, "learning_rate": 2.274044980595689e-05, "loss": 0.0353, "step": 58300 }, { "epoch": 1.6358535558984428, "grad_norm": 0.029888954013586044, "learning_rate": 2.2735774068359285e-05, "loss": 0.0145, "step": 58310 }, { "epoch": 1.6361341001542993, "grad_norm": 0.04014824330806732, "learning_rate": 2.273109833076168e-05, "loss": 0.0227, "step": 58320 }, { "epoch": 1.6364146444101557, "grad_norm": 0.09843143075704575, "learning_rate": 2.2726422593164074e-05, "loss": 0.0289, "step": 58330 }, { "epoch": 1.6366951886660122, "grad_norm": 0.17777563631534576, "learning_rate": 2.2721746855566468e-05, "loss": 0.0095, "step": 58340 }, { "epoch": 1.6369757329218684, "grad_norm": 0.10297390818595886, "learning_rate": 2.271707111796886e-05, "loss": 0.0291, "step": 58350 }, { "epoch": 1.6372562771777248, "grad_norm": 0.3830539286136627, "learning_rate": 2.2712395380371254e-05, "loss": 0.0152, "step": 58360 }, { "epoch": 1.637536821433581, "grad_norm": 0.15565642714500427, "learning_rate": 2.2707719642773647e-05, "loss": 0.0228, "step": 58370 }, { "epoch": 1.6378173656894375, "grad_norm": 0.04034395515918732, "learning_rate": 2.270304390517604e-05, "loss": 0.0207, "step": 58380 }, { "epoch": 1.638097909945294, "grad_norm": 0.853941798210144, "learning_rate": 2.2698368167578437e-05, "loss": 0.04, "step": 58390 }, { "epoch": 1.6383784542011504, "grad_norm": 1.1859819889068604, "learning_rate": 2.269369242998083e-05, "loss": 0.0479, "step": 58400 }, { "epoch": 1.6386589984570066, "grad_norm": 0.5414258241653442, "learning_rate": 2.2689016692383226e-05, "loss": 0.0205, "step": 58410 }, { "epoch": 1.6389395427128628, "grad_norm": 0.3925226330757141, "learning_rate": 2.268434095478562e-05, "loss": 0.0221, "step": 58420 }, { "epoch": 1.6392200869687192, "grad_norm": 1.1891884803771973, "learning_rate": 2.2679665217188013e-05, "loss": 0.0239, "step": 58430 }, { "epoch": 1.6395006312245757, "grad_norm": 0.055855341255664825, "learning_rate": 2.2674989479590406e-05, "loss": 0.035, "step": 58440 }, { "epoch": 1.6397811754804321, "grad_norm": 0.6552616357803345, "learning_rate": 2.26703137419928e-05, "loss": 0.0313, "step": 58450 }, { "epoch": 1.6400617197362886, "grad_norm": 0.7145028114318848, "learning_rate": 2.2665638004395196e-05, "loss": 0.0359, "step": 58460 }, { "epoch": 1.6403422639921448, "grad_norm": 0.3288998603820801, "learning_rate": 2.266096226679759e-05, "loss": 0.0306, "step": 58470 }, { "epoch": 1.640622808248001, "grad_norm": 0.30878376960754395, "learning_rate": 2.2656286529199982e-05, "loss": 0.0459, "step": 58480 }, { "epoch": 1.6409033525038574, "grad_norm": 0.20773887634277344, "learning_rate": 2.2651610791602375e-05, "loss": 0.0066, "step": 58490 }, { "epoch": 1.6411838967597139, "grad_norm": 0.05284392461180687, "learning_rate": 2.2646935054004768e-05, "loss": 0.0055, "step": 58500 }, { "epoch": 1.6414644410155703, "grad_norm": 0.036519430577754974, "learning_rate": 2.2642259316407165e-05, "loss": 0.0279, "step": 58510 }, { "epoch": 1.6417449852714265, "grad_norm": 1.4073373079299927, "learning_rate": 2.2637583578809558e-05, "loss": 0.0443, "step": 58520 }, { "epoch": 1.642025529527283, "grad_norm": 0.6400153636932373, "learning_rate": 2.2632907841211954e-05, "loss": 0.036, "step": 58530 }, { "epoch": 1.6423060737831392, "grad_norm": 0.22240068018436432, "learning_rate": 2.2628232103614348e-05, "loss": 0.0328, "step": 58540 }, { "epoch": 1.6425866180389956, "grad_norm": 0.2778480350971222, "learning_rate": 2.262355636601674e-05, "loss": 0.0322, "step": 58550 }, { "epoch": 1.642867162294852, "grad_norm": 0.07616277784109116, "learning_rate": 2.2618880628419134e-05, "loss": 0.0177, "step": 58560 }, { "epoch": 1.6431477065507085, "grad_norm": 0.8727232217788696, "learning_rate": 2.2614204890821527e-05, "loss": 0.0232, "step": 58570 }, { "epoch": 1.6434282508065647, "grad_norm": 0.05970629304647446, "learning_rate": 2.260952915322392e-05, "loss": 0.0483, "step": 58580 }, { "epoch": 1.643708795062421, "grad_norm": 0.3298342227935791, "learning_rate": 2.2604853415626313e-05, "loss": 0.0101, "step": 58590 }, { "epoch": 1.6439893393182774, "grad_norm": 0.08288508653640747, "learning_rate": 2.260017767802871e-05, "loss": 0.0048, "step": 58600 }, { "epoch": 1.6442698835741338, "grad_norm": 0.07883202284574509, "learning_rate": 2.2595501940431103e-05, "loss": 0.024, "step": 58610 }, { "epoch": 1.6445504278299903, "grad_norm": 0.04066922888159752, "learning_rate": 2.25908262028335e-05, "loss": 0.0583, "step": 58620 }, { "epoch": 1.6448309720858465, "grad_norm": 0.20616436004638672, "learning_rate": 2.2586150465235893e-05, "loss": 0.0227, "step": 58630 }, { "epoch": 1.645111516341703, "grad_norm": 0.3944506347179413, "learning_rate": 2.2581474727638286e-05, "loss": 0.0214, "step": 58640 }, { "epoch": 1.6453920605975592, "grad_norm": 1.2894947528839111, "learning_rate": 2.257679899004068e-05, "loss": 0.0255, "step": 58650 }, { "epoch": 1.6456726048534156, "grad_norm": 0.5815241932868958, "learning_rate": 2.2572123252443076e-05, "loss": 0.0249, "step": 58660 }, { "epoch": 1.645953149109272, "grad_norm": 0.058698102831840515, "learning_rate": 2.256744751484547e-05, "loss": 0.0236, "step": 58670 }, { "epoch": 1.6462336933651285, "grad_norm": 0.062112778425216675, "learning_rate": 2.2562771777247862e-05, "loss": 0.0358, "step": 58680 }, { "epoch": 1.6465142376209847, "grad_norm": 0.22064034640789032, "learning_rate": 2.2558096039650255e-05, "loss": 0.0164, "step": 58690 }, { "epoch": 1.646794781876841, "grad_norm": 0.010712635703384876, "learning_rate": 2.2553420302052648e-05, "loss": 0.0124, "step": 58700 }, { "epoch": 1.6470753261326974, "grad_norm": 0.008031493052840233, "learning_rate": 2.2548744564455045e-05, "loss": 0.007, "step": 58710 }, { "epoch": 1.6473558703885538, "grad_norm": 0.19212520122528076, "learning_rate": 2.2544068826857438e-05, "loss": 0.0465, "step": 58720 }, { "epoch": 1.6476364146444102, "grad_norm": 0.9667096138000488, "learning_rate": 2.2539393089259835e-05, "loss": 0.0169, "step": 58730 }, { "epoch": 1.6479169589002667, "grad_norm": 0.11455419659614563, "learning_rate": 2.2534717351662228e-05, "loss": 0.0053, "step": 58740 }, { "epoch": 1.648197503156123, "grad_norm": 0.007618286646902561, "learning_rate": 2.253004161406462e-05, "loss": 0.0121, "step": 58750 }, { "epoch": 1.6484780474119791, "grad_norm": 0.027650559321045876, "learning_rate": 2.2525365876467014e-05, "loss": 0.0064, "step": 58760 }, { "epoch": 1.6487585916678356, "grad_norm": 0.3531343936920166, "learning_rate": 2.2520690138869407e-05, "loss": 0.007, "step": 58770 }, { "epoch": 1.649039135923692, "grad_norm": 0.022146206349134445, "learning_rate": 2.25160144012718e-05, "loss": 0.0132, "step": 58780 }, { "epoch": 1.6493196801795484, "grad_norm": 0.04286836460232735, "learning_rate": 2.2511338663674193e-05, "loss": 0.0161, "step": 58790 }, { "epoch": 1.6496002244354047, "grad_norm": 0.018066611140966415, "learning_rate": 2.250666292607659e-05, "loss": 0.0196, "step": 58800 }, { "epoch": 1.649880768691261, "grad_norm": 0.008241347037255764, "learning_rate": 2.2501987188478983e-05, "loss": 0.0225, "step": 58810 }, { "epoch": 1.6501613129471173, "grad_norm": 0.8715524673461914, "learning_rate": 2.249731145088138e-05, "loss": 0.0574, "step": 58820 }, { "epoch": 1.6504418572029738, "grad_norm": 0.07140038162469864, "learning_rate": 2.2492635713283773e-05, "loss": 0.0227, "step": 58830 }, { "epoch": 1.6507224014588302, "grad_norm": 0.621550977230072, "learning_rate": 2.2487959975686166e-05, "loss": 0.0147, "step": 58840 }, { "epoch": 1.6510029457146866, "grad_norm": 0.4411660134792328, "learning_rate": 2.248328423808856e-05, "loss": 0.0317, "step": 58850 }, { "epoch": 1.6512834899705429, "grad_norm": 0.1653665155172348, "learning_rate": 2.2478608500490952e-05, "loss": 0.0172, "step": 58860 }, { "epoch": 1.651564034226399, "grad_norm": 0.3657958209514618, "learning_rate": 2.247393276289335e-05, "loss": 0.0368, "step": 58870 }, { "epoch": 1.6518445784822555, "grad_norm": 2.955420970916748, "learning_rate": 2.2469257025295742e-05, "loss": 0.0189, "step": 58880 }, { "epoch": 1.652125122738112, "grad_norm": 0.42142942547798157, "learning_rate": 2.2464581287698135e-05, "loss": 0.0487, "step": 58890 }, { "epoch": 1.6524056669939684, "grad_norm": 0.024691561236977577, "learning_rate": 2.245990555010053e-05, "loss": 0.0418, "step": 58900 }, { "epoch": 1.6526862112498246, "grad_norm": 0.06743770837783813, "learning_rate": 2.245522981250292e-05, "loss": 0.0173, "step": 58910 }, { "epoch": 1.652966755505681, "grad_norm": 0.02356809191405773, "learning_rate": 2.2450554074905318e-05, "loss": 0.0068, "step": 58920 }, { "epoch": 1.6532472997615373, "grad_norm": 0.4100382328033447, "learning_rate": 2.244587833730771e-05, "loss": 0.0098, "step": 58930 }, { "epoch": 1.6535278440173937, "grad_norm": 0.3546263873577118, "learning_rate": 2.2441202599710108e-05, "loss": 0.0116, "step": 58940 }, { "epoch": 1.6538083882732502, "grad_norm": 0.591015636920929, "learning_rate": 2.24365268621125e-05, "loss": 0.0329, "step": 58950 }, { "epoch": 1.6540889325291066, "grad_norm": 0.33675095438957214, "learning_rate": 2.2431851124514894e-05, "loss": 0.0629, "step": 58960 }, { "epoch": 1.6543694767849628, "grad_norm": 0.12982919812202454, "learning_rate": 2.2427175386917287e-05, "loss": 0.0216, "step": 58970 }, { "epoch": 1.654650021040819, "grad_norm": 0.15381473302841187, "learning_rate": 2.242249964931968e-05, "loss": 0.0099, "step": 58980 }, { "epoch": 1.6549305652966755, "grad_norm": 0.03435635194182396, "learning_rate": 2.2417823911722073e-05, "loss": 0.0125, "step": 58990 }, { "epoch": 1.655211109552532, "grad_norm": 0.20077116787433624, "learning_rate": 2.2413148174124467e-05, "loss": 0.0155, "step": 59000 }, { "epoch": 1.6554916538083884, "grad_norm": 0.049967069178819656, "learning_rate": 2.2408472436526863e-05, "loss": 0.0102, "step": 59010 }, { "epoch": 1.6557721980642446, "grad_norm": 0.8111933469772339, "learning_rate": 2.2403796698929256e-05, "loss": 0.0187, "step": 59020 }, { "epoch": 1.656052742320101, "grad_norm": 0.03149671480059624, "learning_rate": 2.2399120961331653e-05, "loss": 0.017, "step": 59030 }, { "epoch": 1.6563332865759572, "grad_norm": 0.03756759315729141, "learning_rate": 2.2394445223734046e-05, "loss": 0.0149, "step": 59040 }, { "epoch": 1.6566138308318137, "grad_norm": 12.394115447998047, "learning_rate": 2.238976948613644e-05, "loss": 0.0034, "step": 59050 }, { "epoch": 1.6568943750876701, "grad_norm": 0.17099931836128235, "learning_rate": 2.2385093748538832e-05, "loss": 0.0264, "step": 59060 }, { "epoch": 1.6571749193435266, "grad_norm": 0.032618846744298935, "learning_rate": 2.2380418010941226e-05, "loss": 0.0087, "step": 59070 }, { "epoch": 1.6574554635993828, "grad_norm": 0.3699735403060913, "learning_rate": 2.2375742273343622e-05, "loss": 0.0193, "step": 59080 }, { "epoch": 1.657736007855239, "grad_norm": 0.04226338490843773, "learning_rate": 2.2371066535746015e-05, "loss": 0.0246, "step": 59090 }, { "epoch": 1.6580165521110954, "grad_norm": 0.2917778193950653, "learning_rate": 2.236639079814841e-05, "loss": 0.0143, "step": 59100 }, { "epoch": 1.6582970963669519, "grad_norm": 0.04425767436623573, "learning_rate": 2.23617150605508e-05, "loss": 0.0307, "step": 59110 }, { "epoch": 1.6585776406228083, "grad_norm": 0.08861257880926132, "learning_rate": 2.2357039322953198e-05, "loss": 0.0245, "step": 59120 }, { "epoch": 1.6588581848786648, "grad_norm": 0.06367892771959305, "learning_rate": 2.235236358535559e-05, "loss": 0.0187, "step": 59130 }, { "epoch": 1.659138729134521, "grad_norm": 0.5958796143531799, "learning_rate": 2.2347687847757984e-05, "loss": 0.0496, "step": 59140 }, { "epoch": 1.6594192733903772, "grad_norm": 0.11143620312213898, "learning_rate": 2.234301211016038e-05, "loss": 0.0117, "step": 59150 }, { "epoch": 1.6596998176462336, "grad_norm": 0.22595231235027313, "learning_rate": 2.2338336372562774e-05, "loss": 0.0282, "step": 59160 }, { "epoch": 1.65998036190209, "grad_norm": 0.16322851181030273, "learning_rate": 2.2333660634965167e-05, "loss": 0.0272, "step": 59170 }, { "epoch": 1.6602609061579465, "grad_norm": 0.058967556804418564, "learning_rate": 2.232898489736756e-05, "loss": 0.0136, "step": 59180 }, { "epoch": 1.6605414504138027, "grad_norm": 0.31723880767822266, "learning_rate": 2.2324309159769954e-05, "loss": 0.0234, "step": 59190 }, { "epoch": 1.6608219946696592, "grad_norm": 0.061417922377586365, "learning_rate": 2.2319633422172347e-05, "loss": 0.0393, "step": 59200 }, { "epoch": 1.6611025389255154, "grad_norm": 0.345156192779541, "learning_rate": 2.2314957684574743e-05, "loss": 0.0372, "step": 59210 }, { "epoch": 1.6613830831813718, "grad_norm": 0.48314353823661804, "learning_rate": 2.2310281946977136e-05, "loss": 0.0374, "step": 59220 }, { "epoch": 1.6616636274372283, "grad_norm": 0.2749604880809784, "learning_rate": 2.2305606209379533e-05, "loss": 0.0141, "step": 59230 }, { "epoch": 1.6619441716930847, "grad_norm": 0.013435465283691883, "learning_rate": 2.2300930471781926e-05, "loss": 0.016, "step": 59240 }, { "epoch": 1.662224715948941, "grad_norm": 0.029945315793156624, "learning_rate": 2.229625473418432e-05, "loss": 0.0137, "step": 59250 }, { "epoch": 1.6625052602047972, "grad_norm": 0.8360579013824463, "learning_rate": 2.2291578996586712e-05, "loss": 0.027, "step": 59260 }, { "epoch": 1.6627858044606536, "grad_norm": 0.01993006467819214, "learning_rate": 2.2286903258989106e-05, "loss": 0.0123, "step": 59270 }, { "epoch": 1.66306634871651, "grad_norm": 1.4410655498504639, "learning_rate": 2.22822275213915e-05, "loss": 0.0339, "step": 59280 }, { "epoch": 1.6633468929723665, "grad_norm": 0.46716493368148804, "learning_rate": 2.2277551783793895e-05, "loss": 0.0255, "step": 59290 }, { "epoch": 1.6636274372282227, "grad_norm": 0.15275366604328156, "learning_rate": 2.227287604619629e-05, "loss": 0.0502, "step": 59300 }, { "epoch": 1.6639079814840791, "grad_norm": 0.3130142092704773, "learning_rate": 2.226820030859868e-05, "loss": 0.011, "step": 59310 }, { "epoch": 1.6641885257399354, "grad_norm": 0.09507399797439575, "learning_rate": 2.2263524571001078e-05, "loss": 0.0145, "step": 59320 }, { "epoch": 1.6644690699957918, "grad_norm": 0.5801864862442017, "learning_rate": 2.225884883340347e-05, "loss": 0.0147, "step": 59330 }, { "epoch": 1.6647496142516482, "grad_norm": 0.015405315905809402, "learning_rate": 2.2254173095805864e-05, "loss": 0.0195, "step": 59340 }, { "epoch": 1.6650301585075047, "grad_norm": 0.04691634327173233, "learning_rate": 2.2249497358208258e-05, "loss": 0.0106, "step": 59350 }, { "epoch": 1.665310702763361, "grad_norm": 0.6120195388793945, "learning_rate": 2.2244821620610654e-05, "loss": 0.0113, "step": 59360 }, { "epoch": 1.6655912470192171, "grad_norm": 0.004903679247945547, "learning_rate": 2.2240145883013047e-05, "loss": 0.0156, "step": 59370 }, { "epoch": 1.6658717912750736, "grad_norm": 0.07088898122310638, "learning_rate": 2.223547014541544e-05, "loss": 0.0378, "step": 59380 }, { "epoch": 1.66615233553093, "grad_norm": 1.0645802021026611, "learning_rate": 2.2230794407817834e-05, "loss": 0.0344, "step": 59390 }, { "epoch": 1.6664328797867864, "grad_norm": 0.04452436789870262, "learning_rate": 2.2226118670220227e-05, "loss": 0.0111, "step": 59400 }, { "epoch": 1.6667134240426429, "grad_norm": 0.32557621598243713, "learning_rate": 2.222144293262262e-05, "loss": 0.0168, "step": 59410 }, { "epoch": 1.666993968298499, "grad_norm": 0.04156230762600899, "learning_rate": 2.2216767195025016e-05, "loss": 0.008, "step": 59420 }, { "epoch": 1.6672745125543553, "grad_norm": 2.1304781436920166, "learning_rate": 2.2212091457427413e-05, "loss": 0.0409, "step": 59430 }, { "epoch": 1.6675550568102118, "grad_norm": 1.003193974494934, "learning_rate": 2.2207415719829806e-05, "loss": 0.0376, "step": 59440 }, { "epoch": 1.6678356010660682, "grad_norm": 0.4184893071651459, "learning_rate": 2.22027399822322e-05, "loss": 0.0158, "step": 59450 }, { "epoch": 1.6681161453219246, "grad_norm": 0.009426610544323921, "learning_rate": 2.2198064244634592e-05, "loss": 0.02, "step": 59460 }, { "epoch": 1.6683966895777809, "grad_norm": 2.2326040267944336, "learning_rate": 2.2193388507036986e-05, "loss": 0.0357, "step": 59470 }, { "epoch": 1.6686772338336373, "grad_norm": 0.3573369085788727, "learning_rate": 2.218871276943938e-05, "loss": 0.0128, "step": 59480 }, { "epoch": 1.6689577780894935, "grad_norm": 0.08799172937870026, "learning_rate": 2.2184037031841772e-05, "loss": 0.007, "step": 59490 }, { "epoch": 1.66923832234535, "grad_norm": 0.024739660322666168, "learning_rate": 2.217936129424417e-05, "loss": 0.0155, "step": 59500 }, { "epoch": 1.6695188666012064, "grad_norm": 0.03978164121508598, "learning_rate": 2.217468555664656e-05, "loss": 0.0089, "step": 59510 }, { "epoch": 1.6697994108570628, "grad_norm": 0.644257664680481, "learning_rate": 2.2170009819048955e-05, "loss": 0.0209, "step": 59520 }, { "epoch": 1.670079955112919, "grad_norm": 0.475505530834198, "learning_rate": 2.216533408145135e-05, "loss": 0.0205, "step": 59530 }, { "epoch": 1.6703604993687753, "grad_norm": 1.14993417263031, "learning_rate": 2.2160658343853744e-05, "loss": 0.0331, "step": 59540 }, { "epoch": 1.6706410436246317, "grad_norm": 0.01789381168782711, "learning_rate": 2.2155982606256138e-05, "loss": 0.017, "step": 59550 }, { "epoch": 1.6709215878804882, "grad_norm": 0.04865434020757675, "learning_rate": 2.215130686865853e-05, "loss": 0.023, "step": 59560 }, { "epoch": 1.6712021321363446, "grad_norm": 0.2519691586494446, "learning_rate": 2.2146631131060927e-05, "loss": 0.0156, "step": 59570 }, { "epoch": 1.6714826763922008, "grad_norm": 0.22850386798381805, "learning_rate": 2.214195539346332e-05, "loss": 0.0391, "step": 59580 }, { "epoch": 1.6717632206480573, "grad_norm": 0.6705805063247681, "learning_rate": 2.2137279655865714e-05, "loss": 0.0274, "step": 59590 }, { "epoch": 1.6720437649039135, "grad_norm": 0.15668250620365143, "learning_rate": 2.2132603918268107e-05, "loss": 0.0131, "step": 59600 }, { "epoch": 1.67232430915977, "grad_norm": 0.15390644967556, "learning_rate": 2.21279281806705e-05, "loss": 0.0125, "step": 59610 }, { "epoch": 1.6726048534156264, "grad_norm": 0.08839355409145355, "learning_rate": 2.2123252443072896e-05, "loss": 0.005, "step": 59620 }, { "epoch": 1.6728853976714828, "grad_norm": 0.01028946042060852, "learning_rate": 2.211857670547529e-05, "loss": 0.0086, "step": 59630 }, { "epoch": 1.673165941927339, "grad_norm": 0.4545777142047882, "learning_rate": 2.2113900967877686e-05, "loss": 0.0115, "step": 59640 }, { "epoch": 1.6734464861831952, "grad_norm": 0.007623937912285328, "learning_rate": 2.210922523028008e-05, "loss": 0.0064, "step": 59650 }, { "epoch": 1.6737270304390517, "grad_norm": 0.4917201101779938, "learning_rate": 2.2104549492682472e-05, "loss": 0.0263, "step": 59660 }, { "epoch": 1.6740075746949081, "grad_norm": 0.011809554882347584, "learning_rate": 2.2099873755084866e-05, "loss": 0.0325, "step": 59670 }, { "epoch": 1.6742881189507646, "grad_norm": 0.276980459690094, "learning_rate": 2.209519801748726e-05, "loss": 0.0348, "step": 59680 }, { "epoch": 1.674568663206621, "grad_norm": 0.02095450647175312, "learning_rate": 2.2090522279889652e-05, "loss": 0.043, "step": 59690 }, { "epoch": 1.6748492074624772, "grad_norm": 0.4224720299243927, "learning_rate": 2.2085846542292045e-05, "loss": 0.0058, "step": 59700 }, { "epoch": 1.6751297517183334, "grad_norm": 0.011150947771966457, "learning_rate": 2.208117080469444e-05, "loss": 0.0093, "step": 59710 }, { "epoch": 1.67541029597419, "grad_norm": 0.5552055239677429, "learning_rate": 2.2076495067096835e-05, "loss": 0.0233, "step": 59720 }, { "epoch": 1.6756908402300463, "grad_norm": 0.37376222014427185, "learning_rate": 2.207181932949923e-05, "loss": 0.0183, "step": 59730 }, { "epoch": 1.6759713844859028, "grad_norm": 0.01330542378127575, "learning_rate": 2.2067143591901625e-05, "loss": 0.0085, "step": 59740 }, { "epoch": 1.676251928741759, "grad_norm": 0.4309658408164978, "learning_rate": 2.2062467854304018e-05, "loss": 0.04, "step": 59750 }, { "epoch": 1.6765324729976152, "grad_norm": 0.14593298733234406, "learning_rate": 2.205779211670641e-05, "loss": 0.012, "step": 59760 }, { "epoch": 1.6768130172534716, "grad_norm": 0.2730362117290497, "learning_rate": 2.2053116379108804e-05, "loss": 0.0126, "step": 59770 }, { "epoch": 1.677093561509328, "grad_norm": 1.2221473455429077, "learning_rate": 2.20484406415112e-05, "loss": 0.0526, "step": 59780 }, { "epoch": 1.6773741057651845, "grad_norm": 0.015593188814818859, "learning_rate": 2.2043764903913594e-05, "loss": 0.0064, "step": 59790 }, { "epoch": 1.677654650021041, "grad_norm": 0.3109665811061859, "learning_rate": 2.2039089166315987e-05, "loss": 0.0103, "step": 59800 }, { "epoch": 1.6779351942768972, "grad_norm": 0.6143952012062073, "learning_rate": 2.203441342871838e-05, "loss": 0.0205, "step": 59810 }, { "epoch": 1.6782157385327534, "grad_norm": 1.2636170387268066, "learning_rate": 2.2029737691120773e-05, "loss": 0.0303, "step": 59820 }, { "epoch": 1.6784962827886099, "grad_norm": 1.3786578178405762, "learning_rate": 2.202506195352317e-05, "loss": 0.0329, "step": 59830 }, { "epoch": 1.6787768270444663, "grad_norm": 5.944608211517334, "learning_rate": 2.2020386215925563e-05, "loss": 0.0245, "step": 59840 }, { "epoch": 1.6790573713003227, "grad_norm": 0.020313333719968796, "learning_rate": 2.201571047832796e-05, "loss": 0.0258, "step": 59850 }, { "epoch": 1.679337915556179, "grad_norm": 0.020243704319000244, "learning_rate": 2.2011034740730353e-05, "loss": 0.018, "step": 59860 }, { "epoch": 1.6796184598120354, "grad_norm": 0.014481146819889545, "learning_rate": 2.2006359003132746e-05, "loss": 0.0173, "step": 59870 }, { "epoch": 1.6798990040678916, "grad_norm": 0.014367244206368923, "learning_rate": 2.200168326553514e-05, "loss": 0.0156, "step": 59880 }, { "epoch": 1.680179548323748, "grad_norm": 0.03895892947912216, "learning_rate": 2.1997007527937532e-05, "loss": 0.0323, "step": 59890 }, { "epoch": 1.6804600925796045, "grad_norm": 0.04845211282372475, "learning_rate": 2.1992331790339925e-05, "loss": 0.0157, "step": 59900 }, { "epoch": 1.680740636835461, "grad_norm": 0.01694098487496376, "learning_rate": 2.1987656052742322e-05, "loss": 0.0343, "step": 59910 }, { "epoch": 1.6810211810913172, "grad_norm": 0.2166452258825302, "learning_rate": 2.1982980315144715e-05, "loss": 0.0036, "step": 59920 }, { "epoch": 1.6813017253471734, "grad_norm": 0.15747658908367157, "learning_rate": 2.1978304577547108e-05, "loss": 0.0269, "step": 59930 }, { "epoch": 1.6815822696030298, "grad_norm": 0.039505913853645325, "learning_rate": 2.1973628839949505e-05, "loss": 0.0047, "step": 59940 }, { "epoch": 1.6818628138588863, "grad_norm": 0.5862597227096558, "learning_rate": 2.1968953102351898e-05, "loss": 0.0244, "step": 59950 }, { "epoch": 1.6821433581147427, "grad_norm": 0.363313764333725, "learning_rate": 2.196427736475429e-05, "loss": 0.0644, "step": 59960 }, { "epoch": 1.682423902370599, "grad_norm": 0.2002745121717453, "learning_rate": 2.1959601627156684e-05, "loss": 0.0278, "step": 59970 }, { "epoch": 1.6827044466264554, "grad_norm": 0.1840876042842865, "learning_rate": 2.195492588955908e-05, "loss": 0.057, "step": 59980 }, { "epoch": 1.6829849908823116, "grad_norm": 0.04505879431962967, "learning_rate": 2.1950250151961474e-05, "loss": 0.0077, "step": 59990 }, { "epoch": 1.683265535138168, "grad_norm": 0.03328315541148186, "learning_rate": 2.1945574414363867e-05, "loss": 0.0158, "step": 60000 }, { "epoch": 1.6835460793940245, "grad_norm": 0.4224107265472412, "learning_rate": 2.194089867676626e-05, "loss": 0.0476, "step": 60010 }, { "epoch": 1.683826623649881, "grad_norm": 0.06180864945054054, "learning_rate": 2.1936222939168653e-05, "loss": 0.0335, "step": 60020 }, { "epoch": 1.6841071679057371, "grad_norm": 0.04332800954580307, "learning_rate": 2.193154720157105e-05, "loss": 0.017, "step": 60030 }, { "epoch": 1.6843877121615933, "grad_norm": 0.26892492175102234, "learning_rate": 2.1926871463973443e-05, "loss": 0.0176, "step": 60040 }, { "epoch": 1.6846682564174498, "grad_norm": 0.06374634802341461, "learning_rate": 2.192219572637584e-05, "loss": 0.0265, "step": 60050 }, { "epoch": 1.6849488006733062, "grad_norm": 0.02767600677907467, "learning_rate": 2.1917519988778233e-05, "loss": 0.0151, "step": 60060 }, { "epoch": 1.6852293449291627, "grad_norm": 0.009797041304409504, "learning_rate": 2.1912844251180626e-05, "loss": 0.0219, "step": 60070 }, { "epoch": 1.685509889185019, "grad_norm": 0.06137848272919655, "learning_rate": 2.190816851358302e-05, "loss": 0.0497, "step": 60080 }, { "epoch": 1.6857904334408753, "grad_norm": 0.19813241064548492, "learning_rate": 2.1903492775985412e-05, "loss": 0.0182, "step": 60090 }, { "epoch": 1.6860709776967315, "grad_norm": 1.0799756050109863, "learning_rate": 2.1898817038387805e-05, "loss": 0.0221, "step": 60100 }, { "epoch": 1.686351521952588, "grad_norm": 7.221774578094482, "learning_rate": 2.18941413007902e-05, "loss": 0.0298, "step": 60110 }, { "epoch": 1.6866320662084444, "grad_norm": 0.2273845076560974, "learning_rate": 2.1889465563192595e-05, "loss": 0.0106, "step": 60120 }, { "epoch": 1.6869126104643009, "grad_norm": 0.0504414401948452, "learning_rate": 2.1884789825594988e-05, "loss": 0.0209, "step": 60130 }, { "epoch": 1.687193154720157, "grad_norm": 3.1649765968322754, "learning_rate": 2.1880114087997385e-05, "loss": 0.0169, "step": 60140 }, { "epoch": 1.6874736989760135, "grad_norm": 0.2187337428331375, "learning_rate": 2.1875438350399778e-05, "loss": 0.0205, "step": 60150 }, { "epoch": 1.6877542432318697, "grad_norm": 0.012867861427366734, "learning_rate": 2.187076261280217e-05, "loss": 0.0261, "step": 60160 }, { "epoch": 1.6880347874877262, "grad_norm": 0.02604072354733944, "learning_rate": 2.1866086875204564e-05, "loss": 0.0204, "step": 60170 }, { "epoch": 1.6883153317435826, "grad_norm": 0.11876078695058823, "learning_rate": 2.1861411137606957e-05, "loss": 0.0122, "step": 60180 }, { "epoch": 1.688595875999439, "grad_norm": 0.1804279386997223, "learning_rate": 2.1856735400009354e-05, "loss": 0.0368, "step": 60190 }, { "epoch": 1.6888764202552953, "grad_norm": 0.37643441557884216, "learning_rate": 2.1852059662411747e-05, "loss": 0.0225, "step": 60200 }, { "epoch": 1.6891569645111515, "grad_norm": 0.7400527000427246, "learning_rate": 2.184738392481414e-05, "loss": 0.0117, "step": 60210 }, { "epoch": 1.689437508767008, "grad_norm": 0.46023350954055786, "learning_rate": 2.1842708187216533e-05, "loss": 0.0626, "step": 60220 }, { "epoch": 1.6897180530228644, "grad_norm": 0.03601674363017082, "learning_rate": 2.183803244961893e-05, "loss": 0.0084, "step": 60230 }, { "epoch": 1.6899985972787208, "grad_norm": 0.03162173926830292, "learning_rate": 2.1833356712021323e-05, "loss": 0.0446, "step": 60240 }, { "epoch": 1.690279141534577, "grad_norm": 0.08924917131662369, "learning_rate": 2.1828680974423716e-05, "loss": 0.0156, "step": 60250 }, { "epoch": 1.6905596857904335, "grad_norm": 1.0518732070922852, "learning_rate": 2.1824005236826113e-05, "loss": 0.025, "step": 60260 }, { "epoch": 1.6908402300462897, "grad_norm": 0.20736753940582275, "learning_rate": 2.1819329499228506e-05, "loss": 0.0047, "step": 60270 }, { "epoch": 1.6911207743021461, "grad_norm": 0.1254023164510727, "learning_rate": 2.18146537616309e-05, "loss": 0.0137, "step": 60280 }, { "epoch": 1.6914013185580026, "grad_norm": 2.965538263320923, "learning_rate": 2.1809978024033292e-05, "loss": 0.0277, "step": 60290 }, { "epoch": 1.691681862813859, "grad_norm": 0.045413192361593246, "learning_rate": 2.1805302286435685e-05, "loss": 0.0244, "step": 60300 }, { "epoch": 1.6919624070697152, "grad_norm": 0.407747745513916, "learning_rate": 2.180062654883808e-05, "loss": 0.0271, "step": 60310 }, { "epoch": 1.6922429513255715, "grad_norm": 2.7124764919281006, "learning_rate": 2.179595081124047e-05, "loss": 0.0324, "step": 60320 }, { "epoch": 1.692523495581428, "grad_norm": 0.8239647150039673, "learning_rate": 2.1791275073642868e-05, "loss": 0.032, "step": 60330 }, { "epoch": 1.6928040398372843, "grad_norm": 0.13269475102424622, "learning_rate": 2.1786599336045265e-05, "loss": 0.0428, "step": 60340 }, { "epoch": 1.6930845840931408, "grad_norm": 0.40163132548332214, "learning_rate": 2.1781923598447658e-05, "loss": 0.0123, "step": 60350 }, { "epoch": 1.6933651283489972, "grad_norm": 0.09223155677318573, "learning_rate": 2.177724786085005e-05, "loss": 0.0449, "step": 60360 }, { "epoch": 1.6936456726048534, "grad_norm": 0.21145780384540558, "learning_rate": 2.1772572123252444e-05, "loss": 0.0059, "step": 60370 }, { "epoch": 1.6939262168607097, "grad_norm": 0.02375958487391472, "learning_rate": 2.1767896385654837e-05, "loss": 0.0218, "step": 60380 }, { "epoch": 1.694206761116566, "grad_norm": 0.5279950499534607, "learning_rate": 2.176322064805723e-05, "loss": 0.0352, "step": 60390 }, { "epoch": 1.6944873053724225, "grad_norm": 0.060052502900362015, "learning_rate": 2.1758544910459627e-05, "loss": 0.0083, "step": 60400 }, { "epoch": 1.694767849628279, "grad_norm": 0.05222751945257187, "learning_rate": 2.175386917286202e-05, "loss": 0.0192, "step": 60410 }, { "epoch": 1.6950483938841352, "grad_norm": 0.5712555050849915, "learning_rate": 2.1749193435264413e-05, "loss": 0.0168, "step": 60420 }, { "epoch": 1.6953289381399916, "grad_norm": 0.4478914439678192, "learning_rate": 2.1744517697666806e-05, "loss": 0.0139, "step": 60430 }, { "epoch": 1.6956094823958479, "grad_norm": 0.20297600328922272, "learning_rate": 2.1739841960069203e-05, "loss": 0.0301, "step": 60440 }, { "epoch": 1.6958900266517043, "grad_norm": 0.319149374961853, "learning_rate": 2.1735166222471596e-05, "loss": 0.0191, "step": 60450 }, { "epoch": 1.6961705709075607, "grad_norm": 0.6532734036445618, "learning_rate": 2.173049048487399e-05, "loss": 0.0537, "step": 60460 }, { "epoch": 1.6964511151634172, "grad_norm": 0.04556489735841751, "learning_rate": 2.1725814747276386e-05, "loss": 0.0323, "step": 60470 }, { "epoch": 1.6967316594192734, "grad_norm": 0.5018165111541748, "learning_rate": 2.172113900967878e-05, "loss": 0.0213, "step": 60480 }, { "epoch": 1.6970122036751296, "grad_norm": 0.09050407260656357, "learning_rate": 2.1716463272081172e-05, "loss": 0.0198, "step": 60490 }, { "epoch": 1.697292747930986, "grad_norm": 0.29560697078704834, "learning_rate": 2.1711787534483565e-05, "loss": 0.0105, "step": 60500 }, { "epoch": 1.6975732921868425, "grad_norm": 0.4300421476364136, "learning_rate": 2.170711179688596e-05, "loss": 0.0198, "step": 60510 }, { "epoch": 1.697853836442699, "grad_norm": 0.9534092545509338, "learning_rate": 2.170243605928835e-05, "loss": 0.0459, "step": 60520 }, { "epoch": 1.6981343806985552, "grad_norm": 0.27538734674453735, "learning_rate": 2.1697760321690748e-05, "loss": 0.0319, "step": 60530 }, { "epoch": 1.6984149249544116, "grad_norm": 0.21251408755779266, "learning_rate": 2.169308458409314e-05, "loss": 0.0095, "step": 60540 }, { "epoch": 1.6986954692102678, "grad_norm": 0.08151629567146301, "learning_rate": 2.1688408846495538e-05, "loss": 0.0199, "step": 60550 }, { "epoch": 1.6989760134661243, "grad_norm": 0.12103055417537689, "learning_rate": 2.168373310889793e-05, "loss": 0.0123, "step": 60560 }, { "epoch": 1.6992565577219807, "grad_norm": 0.1513238400220871, "learning_rate": 2.1679057371300324e-05, "loss": 0.0243, "step": 60570 }, { "epoch": 1.6995371019778371, "grad_norm": 0.03142140060663223, "learning_rate": 2.1674381633702717e-05, "loss": 0.0069, "step": 60580 }, { "epoch": 1.6998176462336934, "grad_norm": 0.9181414246559143, "learning_rate": 2.166970589610511e-05, "loss": 0.0123, "step": 60590 }, { "epoch": 1.7000981904895496, "grad_norm": 0.9882793426513672, "learning_rate": 2.1665030158507504e-05, "loss": 0.0647, "step": 60600 }, { "epoch": 1.700378734745406, "grad_norm": 0.3300537168979645, "learning_rate": 2.16603544209099e-05, "loss": 0.0301, "step": 60610 }, { "epoch": 1.7006592790012625, "grad_norm": 0.06759681552648544, "learning_rate": 2.1655678683312293e-05, "loss": 0.0371, "step": 60620 }, { "epoch": 1.700939823257119, "grad_norm": 0.028546493500471115, "learning_rate": 2.1651002945714686e-05, "loss": 0.0088, "step": 60630 }, { "epoch": 1.7012203675129751, "grad_norm": 0.04573538526892662, "learning_rate": 2.1646327208117083e-05, "loss": 0.0271, "step": 60640 }, { "epoch": 1.7015009117688316, "grad_norm": 0.26414480805397034, "learning_rate": 2.1641651470519476e-05, "loss": 0.0095, "step": 60650 }, { "epoch": 1.7017814560246878, "grad_norm": 1.151863694190979, "learning_rate": 2.163697573292187e-05, "loss": 0.0338, "step": 60660 }, { "epoch": 1.7020620002805442, "grad_norm": 0.02045712247490883, "learning_rate": 2.1632299995324262e-05, "loss": 0.0064, "step": 60670 }, { "epoch": 1.7023425445364007, "grad_norm": 0.4700659215450287, "learning_rate": 2.162762425772666e-05, "loss": 0.0148, "step": 60680 }, { "epoch": 1.702623088792257, "grad_norm": 0.5029246807098389, "learning_rate": 2.1622948520129052e-05, "loss": 0.0103, "step": 60690 }, { "epoch": 1.7029036330481133, "grad_norm": 7.756129264831543, "learning_rate": 2.1618272782531445e-05, "loss": 0.0374, "step": 60700 }, { "epoch": 1.7031841773039695, "grad_norm": 0.2197883278131485, "learning_rate": 2.161359704493384e-05, "loss": 0.0169, "step": 60710 }, { "epoch": 1.703464721559826, "grad_norm": 0.5621201992034912, "learning_rate": 2.160892130733623e-05, "loss": 0.0305, "step": 60720 }, { "epoch": 1.7037452658156824, "grad_norm": 0.040642108768224716, "learning_rate": 2.1604245569738625e-05, "loss": 0.0254, "step": 60730 }, { "epoch": 1.7040258100715389, "grad_norm": 0.03446929529309273, "learning_rate": 2.159956983214102e-05, "loss": 0.0327, "step": 60740 }, { "epoch": 1.7043063543273953, "grad_norm": 0.09414242208003998, "learning_rate": 2.1594894094543418e-05, "loss": 0.0135, "step": 60750 }, { "epoch": 1.7045868985832515, "grad_norm": 0.07663853466510773, "learning_rate": 2.159021835694581e-05, "loss": 0.0168, "step": 60760 }, { "epoch": 1.7048674428391077, "grad_norm": 0.03591591864824295, "learning_rate": 2.1585542619348204e-05, "loss": 0.0052, "step": 60770 }, { "epoch": 1.7051479870949642, "grad_norm": 3.3961026668548584, "learning_rate": 2.1580866881750597e-05, "loss": 0.0262, "step": 60780 }, { "epoch": 1.7054285313508206, "grad_norm": 0.11108322441577911, "learning_rate": 2.157619114415299e-05, "loss": 0.006, "step": 60790 }, { "epoch": 1.705709075606677, "grad_norm": 0.07276004552841187, "learning_rate": 2.1571515406555384e-05, "loss": 0.0241, "step": 60800 }, { "epoch": 1.7059896198625333, "grad_norm": 0.04713203385472298, "learning_rate": 2.1566839668957777e-05, "loss": 0.0133, "step": 60810 }, { "epoch": 1.7062701641183897, "grad_norm": 0.3273816406726837, "learning_rate": 2.1562163931360173e-05, "loss": 0.0035, "step": 60820 }, { "epoch": 1.706550708374246, "grad_norm": 0.029990028589963913, "learning_rate": 2.1557488193762567e-05, "loss": 0.0157, "step": 60830 }, { "epoch": 1.7068312526301024, "grad_norm": 0.03919167444109917, "learning_rate": 2.1552812456164963e-05, "loss": 0.039, "step": 60840 }, { "epoch": 1.7071117968859588, "grad_norm": 0.3518509268760681, "learning_rate": 2.1548136718567356e-05, "loss": 0.0625, "step": 60850 }, { "epoch": 1.7073923411418153, "grad_norm": 0.1458728015422821, "learning_rate": 2.154346098096975e-05, "loss": 0.0112, "step": 60860 }, { "epoch": 1.7076728853976715, "grad_norm": 0.20258164405822754, "learning_rate": 2.1538785243372143e-05, "loss": 0.0274, "step": 60870 }, { "epoch": 1.7079534296535277, "grad_norm": 0.17512963712215424, "learning_rate": 2.1534109505774536e-05, "loss": 0.0239, "step": 60880 }, { "epoch": 1.7082339739093841, "grad_norm": 0.4611028730869293, "learning_rate": 2.1529433768176932e-05, "loss": 0.0179, "step": 60890 }, { "epoch": 1.7085145181652406, "grad_norm": 0.027092142030596733, "learning_rate": 2.1524758030579325e-05, "loss": 0.0407, "step": 60900 }, { "epoch": 1.708795062421097, "grad_norm": 0.23832397162914276, "learning_rate": 2.152008229298172e-05, "loss": 0.0636, "step": 60910 }, { "epoch": 1.7090756066769532, "grad_norm": 0.09765736758708954, "learning_rate": 2.151540655538411e-05, "loss": 0.0285, "step": 60920 }, { "epoch": 1.7093561509328097, "grad_norm": 0.33523353934288025, "learning_rate": 2.1510730817786505e-05, "loss": 0.0059, "step": 60930 }, { "epoch": 1.709636695188666, "grad_norm": 0.43087372183799744, "learning_rate": 2.15060550801889e-05, "loss": 0.0121, "step": 60940 }, { "epoch": 1.7099172394445223, "grad_norm": 0.045915842056274414, "learning_rate": 2.1501379342591295e-05, "loss": 0.0413, "step": 60950 }, { "epoch": 1.7101977837003788, "grad_norm": 1.3062642812728882, "learning_rate": 2.149670360499369e-05, "loss": 0.0497, "step": 60960 }, { "epoch": 1.7104783279562352, "grad_norm": 0.35158446431159973, "learning_rate": 2.1492027867396084e-05, "loss": 0.0264, "step": 60970 }, { "epoch": 1.7107588722120914, "grad_norm": 2.444014072418213, "learning_rate": 2.1487352129798477e-05, "loss": 0.0573, "step": 60980 }, { "epoch": 1.7110394164679477, "grad_norm": 0.2738769054412842, "learning_rate": 2.148267639220087e-05, "loss": 0.0361, "step": 60990 }, { "epoch": 1.711319960723804, "grad_norm": 0.1010410264134407, "learning_rate": 2.1478000654603264e-05, "loss": 0.014, "step": 61000 }, { "epoch": 1.7116005049796605, "grad_norm": 0.2117917686700821, "learning_rate": 2.1473324917005657e-05, "loss": 0.0288, "step": 61010 }, { "epoch": 1.711881049235517, "grad_norm": 0.03502585366368294, "learning_rate": 2.146864917940805e-05, "loss": 0.0161, "step": 61020 }, { "epoch": 1.7121615934913734, "grad_norm": 0.5340592861175537, "learning_rate": 2.1463973441810447e-05, "loss": 0.0246, "step": 61030 }, { "epoch": 1.7124421377472296, "grad_norm": 0.06106528267264366, "learning_rate": 2.145929770421284e-05, "loss": 0.0123, "step": 61040 }, { "epoch": 1.7127226820030859, "grad_norm": 0.02638203278183937, "learning_rate": 2.1454621966615236e-05, "loss": 0.0113, "step": 61050 }, { "epoch": 1.7130032262589423, "grad_norm": 0.11909956485033035, "learning_rate": 2.144994622901763e-05, "loss": 0.0178, "step": 61060 }, { "epoch": 1.7132837705147987, "grad_norm": 0.33157145977020264, "learning_rate": 2.1445270491420023e-05, "loss": 0.0269, "step": 61070 }, { "epoch": 1.7135643147706552, "grad_norm": 0.03760359808802605, "learning_rate": 2.1440594753822416e-05, "loss": 0.0365, "step": 61080 }, { "epoch": 1.7138448590265114, "grad_norm": 0.2624523937702179, "learning_rate": 2.143591901622481e-05, "loss": 0.0175, "step": 61090 }, { "epoch": 1.7141254032823678, "grad_norm": 0.1001739501953125, "learning_rate": 2.1431243278627205e-05, "loss": 0.013, "step": 61100 }, { "epoch": 1.714405947538224, "grad_norm": 0.989505410194397, "learning_rate": 2.14265675410296e-05, "loss": 0.0182, "step": 61110 }, { "epoch": 1.7146864917940805, "grad_norm": 0.12231738865375519, "learning_rate": 2.1421891803431992e-05, "loss": 0.0425, "step": 61120 }, { "epoch": 1.714967036049937, "grad_norm": 0.10398749262094498, "learning_rate": 2.1417216065834385e-05, "loss": 0.015, "step": 61130 }, { "epoch": 1.7152475803057934, "grad_norm": 0.10307607799768448, "learning_rate": 2.141254032823678e-05, "loss": 0.0221, "step": 61140 }, { "epoch": 1.7155281245616496, "grad_norm": 0.2899338901042938, "learning_rate": 2.1407864590639175e-05, "loss": 0.0116, "step": 61150 }, { "epoch": 1.7158086688175058, "grad_norm": 4.046570301055908, "learning_rate": 2.1403188853041568e-05, "loss": 0.0086, "step": 61160 }, { "epoch": 1.7160892130733623, "grad_norm": 0.6794763803482056, "learning_rate": 2.1398513115443964e-05, "loss": 0.0423, "step": 61170 }, { "epoch": 1.7163697573292187, "grad_norm": 0.021214546635746956, "learning_rate": 2.1393837377846357e-05, "loss": 0.0131, "step": 61180 }, { "epoch": 1.7166503015850751, "grad_norm": 0.03592479228973389, "learning_rate": 2.138916164024875e-05, "loss": 0.0449, "step": 61190 }, { "epoch": 1.7169308458409314, "grad_norm": 0.0393037348985672, "learning_rate": 2.1384485902651144e-05, "loss": 0.0167, "step": 61200 }, { "epoch": 1.7172113900967878, "grad_norm": 0.03764428570866585, "learning_rate": 2.1379810165053537e-05, "loss": 0.0145, "step": 61210 }, { "epoch": 1.717491934352644, "grad_norm": 0.4574325978755951, "learning_rate": 2.137513442745593e-05, "loss": 0.0272, "step": 61220 }, { "epoch": 1.7177724786085005, "grad_norm": 0.35943248867988586, "learning_rate": 2.1370458689858327e-05, "loss": 0.0115, "step": 61230 }, { "epoch": 1.718053022864357, "grad_norm": 0.15459758043289185, "learning_rate": 2.136578295226072e-05, "loss": 0.0086, "step": 61240 }, { "epoch": 1.7183335671202133, "grad_norm": 0.7590016722679138, "learning_rate": 2.1361107214663116e-05, "loss": 0.0149, "step": 61250 }, { "epoch": 1.7186141113760696, "grad_norm": 0.2261372059583664, "learning_rate": 2.135643147706551e-05, "loss": 0.0215, "step": 61260 }, { "epoch": 1.7188946556319258, "grad_norm": 0.023551149293780327, "learning_rate": 2.1351755739467903e-05, "loss": 0.061, "step": 61270 }, { "epoch": 1.7191751998877822, "grad_norm": 0.33368057012557983, "learning_rate": 2.1347080001870296e-05, "loss": 0.0303, "step": 61280 }, { "epoch": 1.7194557441436387, "grad_norm": 0.026938866823911667, "learning_rate": 2.134240426427269e-05, "loss": 0.0299, "step": 61290 }, { "epoch": 1.719736288399495, "grad_norm": 0.8846641182899475, "learning_rate": 2.1337728526675085e-05, "loss": 0.0095, "step": 61300 }, { "epoch": 1.7200168326553515, "grad_norm": 0.7174367308616638, "learning_rate": 2.133305278907748e-05, "loss": 0.0115, "step": 61310 }, { "epoch": 1.7202973769112078, "grad_norm": 1.0419459342956543, "learning_rate": 2.1328377051479872e-05, "loss": 0.0107, "step": 61320 }, { "epoch": 1.720577921167064, "grad_norm": 0.06141507253050804, "learning_rate": 2.1323701313882265e-05, "loss": 0.0102, "step": 61330 }, { "epoch": 1.7208584654229204, "grad_norm": 0.08553704619407654, "learning_rate": 2.1319025576284658e-05, "loss": 0.0181, "step": 61340 }, { "epoch": 1.7211390096787769, "grad_norm": 0.019878657534718513, "learning_rate": 2.1314349838687055e-05, "loss": 0.0105, "step": 61350 }, { "epoch": 1.7214195539346333, "grad_norm": 0.08546658605337143, "learning_rate": 2.1309674101089448e-05, "loss": 0.0453, "step": 61360 }, { "epoch": 1.7217000981904895, "grad_norm": 0.011311491951346397, "learning_rate": 2.1304998363491844e-05, "loss": 0.0158, "step": 61370 }, { "epoch": 1.721980642446346, "grad_norm": 0.4698261022567749, "learning_rate": 2.1300322625894237e-05, "loss": 0.011, "step": 61380 }, { "epoch": 1.7222611867022022, "grad_norm": 0.027980109676718712, "learning_rate": 2.129564688829663e-05, "loss": 0.0176, "step": 61390 }, { "epoch": 1.7225417309580586, "grad_norm": 0.025955859571695328, "learning_rate": 2.1290971150699024e-05, "loss": 0.004, "step": 61400 }, { "epoch": 1.722822275213915, "grad_norm": 0.9663868546485901, "learning_rate": 2.1286295413101417e-05, "loss": 0.0388, "step": 61410 }, { "epoch": 1.7231028194697715, "grad_norm": 0.44808560609817505, "learning_rate": 2.128161967550381e-05, "loss": 0.039, "step": 61420 }, { "epoch": 1.7233833637256277, "grad_norm": 0.15479516983032227, "learning_rate": 2.1276943937906203e-05, "loss": 0.018, "step": 61430 }, { "epoch": 1.723663907981484, "grad_norm": 0.2361219972372055, "learning_rate": 2.12722682003086e-05, "loss": 0.0116, "step": 61440 }, { "epoch": 1.7239444522373404, "grad_norm": 0.9355350136756897, "learning_rate": 2.1267592462710993e-05, "loss": 0.0389, "step": 61450 }, { "epoch": 1.7242249964931968, "grad_norm": 0.4454866647720337, "learning_rate": 2.126291672511339e-05, "loss": 0.0298, "step": 61460 }, { "epoch": 1.7245055407490533, "grad_norm": 0.046649836003780365, "learning_rate": 2.1258240987515783e-05, "loss": 0.0296, "step": 61470 }, { "epoch": 1.7247860850049095, "grad_norm": 0.053194351494312286, "learning_rate": 2.1253565249918176e-05, "loss": 0.0241, "step": 61480 }, { "epoch": 1.725066629260766, "grad_norm": 0.1836208701133728, "learning_rate": 2.124888951232057e-05, "loss": 0.0089, "step": 61490 }, { "epoch": 1.7253471735166221, "grad_norm": 0.13423629105091095, "learning_rate": 2.1244213774722962e-05, "loss": 0.0291, "step": 61500 }, { "epoch": 1.7256277177724786, "grad_norm": 0.009016171097755432, "learning_rate": 2.123953803712536e-05, "loss": 0.0068, "step": 61510 }, { "epoch": 1.725908262028335, "grad_norm": 0.015451760031282902, "learning_rate": 2.1234862299527752e-05, "loss": 0.0292, "step": 61520 }, { "epoch": 1.7261888062841915, "grad_norm": 1.2241437435150146, "learning_rate": 2.1230186561930145e-05, "loss": 0.043, "step": 61530 }, { "epoch": 1.7264693505400477, "grad_norm": 0.12632079422473907, "learning_rate": 2.1225510824332538e-05, "loss": 0.0241, "step": 61540 }, { "epoch": 1.726749894795904, "grad_norm": 1.8776705265045166, "learning_rate": 2.1220835086734935e-05, "loss": 0.0569, "step": 61550 }, { "epoch": 1.7270304390517603, "grad_norm": 0.4751476049423218, "learning_rate": 2.1216159349137328e-05, "loss": 0.0127, "step": 61560 }, { "epoch": 1.7273109833076168, "grad_norm": 0.08832667022943497, "learning_rate": 2.121148361153972e-05, "loss": 0.0232, "step": 61570 }, { "epoch": 1.7275915275634732, "grad_norm": 0.08610422164201736, "learning_rate": 2.1206807873942118e-05, "loss": 0.0447, "step": 61580 }, { "epoch": 1.7278720718193294, "grad_norm": 0.05910763517022133, "learning_rate": 2.120213213634451e-05, "loss": 0.017, "step": 61590 }, { "epoch": 1.7281526160751859, "grad_norm": 0.1523619443178177, "learning_rate": 2.1197456398746904e-05, "loss": 0.0396, "step": 61600 }, { "epoch": 1.728433160331042, "grad_norm": 0.2388589233160019, "learning_rate": 2.1192780661149297e-05, "loss": 0.0264, "step": 61610 }, { "epoch": 1.7287137045868985, "grad_norm": 0.7753962874412537, "learning_rate": 2.118810492355169e-05, "loss": 0.0291, "step": 61620 }, { "epoch": 1.728994248842755, "grad_norm": 0.04431034252047539, "learning_rate": 2.1183429185954083e-05, "loss": 0.0288, "step": 61630 }, { "epoch": 1.7292747930986114, "grad_norm": 4.1835856437683105, "learning_rate": 2.1178753448356476e-05, "loss": 0.0362, "step": 61640 }, { "epoch": 1.7295553373544676, "grad_norm": 0.017987050116062164, "learning_rate": 2.1174077710758873e-05, "loss": 0.014, "step": 61650 }, { "epoch": 1.7298358816103239, "grad_norm": 0.03855755552649498, "learning_rate": 2.116940197316127e-05, "loss": 0.0338, "step": 61660 }, { "epoch": 1.7301164258661803, "grad_norm": 0.40515193343162537, "learning_rate": 2.1164726235563663e-05, "loss": 0.014, "step": 61670 }, { "epoch": 1.7303969701220367, "grad_norm": 0.4489935338497162, "learning_rate": 2.1160050497966056e-05, "loss": 0.0192, "step": 61680 }, { "epoch": 1.7306775143778932, "grad_norm": 0.035008080303668976, "learning_rate": 2.115537476036845e-05, "loss": 0.036, "step": 61690 }, { "epoch": 1.7309580586337496, "grad_norm": 1.3263074159622192, "learning_rate": 2.1150699022770842e-05, "loss": 0.0322, "step": 61700 }, { "epoch": 1.7312386028896058, "grad_norm": 0.09183444827795029, "learning_rate": 2.1146023285173235e-05, "loss": 0.0552, "step": 61710 }, { "epoch": 1.731519147145462, "grad_norm": 0.30246374011039734, "learning_rate": 2.1141347547575632e-05, "loss": 0.031, "step": 61720 }, { "epoch": 1.7317996914013185, "grad_norm": 0.11815338581800461, "learning_rate": 2.1136671809978025e-05, "loss": 0.0165, "step": 61730 }, { "epoch": 1.732080235657175, "grad_norm": 0.6408233046531677, "learning_rate": 2.1131996072380418e-05, "loss": 0.0175, "step": 61740 }, { "epoch": 1.7323607799130314, "grad_norm": 0.3189634084701538, "learning_rate": 2.1127320334782815e-05, "loss": 0.023, "step": 61750 }, { "epoch": 1.7326413241688876, "grad_norm": 0.04525298625230789, "learning_rate": 2.1122644597185208e-05, "loss": 0.0143, "step": 61760 }, { "epoch": 1.732921868424744, "grad_norm": 0.13429182767868042, "learning_rate": 2.11179688595876e-05, "loss": 0.0198, "step": 61770 }, { "epoch": 1.7332024126806003, "grad_norm": 0.08172321319580078, "learning_rate": 2.1113293121989994e-05, "loss": 0.009, "step": 61780 }, { "epoch": 1.7334829569364567, "grad_norm": 0.020038815215229988, "learning_rate": 2.110861738439239e-05, "loss": 0.0579, "step": 61790 }, { "epoch": 1.7337635011923132, "grad_norm": 0.16630680859088898, "learning_rate": 2.1103941646794784e-05, "loss": 0.0116, "step": 61800 }, { "epoch": 1.7340440454481696, "grad_norm": 0.046496011316776276, "learning_rate": 2.1099265909197177e-05, "loss": 0.0255, "step": 61810 }, { "epoch": 1.7343245897040258, "grad_norm": 0.26186323165893555, "learning_rate": 2.109459017159957e-05, "loss": 0.0258, "step": 61820 }, { "epoch": 1.734605133959882, "grad_norm": 1.7513597011566162, "learning_rate": 2.1089914434001963e-05, "loss": 0.0467, "step": 61830 }, { "epoch": 1.7348856782157385, "grad_norm": 1.4241721630096436, "learning_rate": 2.1085238696404357e-05, "loss": 0.0304, "step": 61840 }, { "epoch": 1.735166222471595, "grad_norm": 0.23368436098098755, "learning_rate": 2.1080562958806753e-05, "loss": 0.0074, "step": 61850 }, { "epoch": 1.7354467667274514, "grad_norm": 0.039380405098199844, "learning_rate": 2.107588722120915e-05, "loss": 0.0228, "step": 61860 }, { "epoch": 1.7357273109833076, "grad_norm": 0.10071995109319687, "learning_rate": 2.1071211483611543e-05, "loss": 0.0349, "step": 61870 }, { "epoch": 1.736007855239164, "grad_norm": 0.02456364966928959, "learning_rate": 2.1066535746013936e-05, "loss": 0.0094, "step": 61880 }, { "epoch": 1.7362883994950202, "grad_norm": 0.5502063035964966, "learning_rate": 2.106186000841633e-05, "loss": 0.0112, "step": 61890 }, { "epoch": 1.7365689437508767, "grad_norm": 0.1456429660320282, "learning_rate": 2.1057184270818722e-05, "loss": 0.0317, "step": 61900 }, { "epoch": 1.736849488006733, "grad_norm": 0.09533638507127762, "learning_rate": 2.1052508533221115e-05, "loss": 0.0039, "step": 61910 }, { "epoch": 1.7371300322625896, "grad_norm": 0.17614366114139557, "learning_rate": 2.104783279562351e-05, "loss": 0.044, "step": 61920 }, { "epoch": 1.7374105765184458, "grad_norm": 0.024860180914402008, "learning_rate": 2.1043157058025905e-05, "loss": 0.0213, "step": 61930 }, { "epoch": 1.737691120774302, "grad_norm": 0.16444416344165802, "learning_rate": 2.1038481320428298e-05, "loss": 0.0524, "step": 61940 }, { "epoch": 1.7379716650301584, "grad_norm": 0.60987389087677, "learning_rate": 2.103380558283069e-05, "loss": 0.0218, "step": 61950 }, { "epoch": 1.7382522092860149, "grad_norm": 0.18005037307739258, "learning_rate": 2.1029129845233088e-05, "loss": 0.0291, "step": 61960 }, { "epoch": 1.7385327535418713, "grad_norm": 0.009965915232896805, "learning_rate": 2.102445410763548e-05, "loss": 0.0119, "step": 61970 }, { "epoch": 1.7388132977977278, "grad_norm": 0.007682936731725931, "learning_rate": 2.1019778370037874e-05, "loss": 0.0122, "step": 61980 }, { "epoch": 1.739093842053584, "grad_norm": 0.014194854535162449, "learning_rate": 2.1015102632440267e-05, "loss": 0.0137, "step": 61990 }, { "epoch": 1.7393743863094402, "grad_norm": 0.536167562007904, "learning_rate": 2.1010426894842664e-05, "loss": 0.0212, "step": 62000 }, { "epoch": 1.7396549305652966, "grad_norm": 0.04384208843111992, "learning_rate": 2.1005751157245057e-05, "loss": 0.0206, "step": 62010 }, { "epoch": 1.739935474821153, "grad_norm": 0.35131311416625977, "learning_rate": 2.100107541964745e-05, "loss": 0.023, "step": 62020 }, { "epoch": 1.7402160190770095, "grad_norm": 1.0834112167358398, "learning_rate": 2.0996399682049843e-05, "loss": 0.022, "step": 62030 }, { "epoch": 1.7404965633328657, "grad_norm": 0.23824892938137054, "learning_rate": 2.0991723944452237e-05, "loss": 0.0085, "step": 62040 }, { "epoch": 1.7407771075887222, "grad_norm": 0.029848922044038773, "learning_rate": 2.0987048206854633e-05, "loss": 0.0137, "step": 62050 }, { "epoch": 1.7410576518445784, "grad_norm": 3.44775128364563, "learning_rate": 2.0982372469257026e-05, "loss": 0.0446, "step": 62060 }, { "epoch": 1.7413381961004348, "grad_norm": 0.4009413421154022, "learning_rate": 2.0977696731659423e-05, "loss": 0.0073, "step": 62070 }, { "epoch": 1.7416187403562913, "grad_norm": 2.1347873210906982, "learning_rate": 2.0973020994061816e-05, "loss": 0.0403, "step": 62080 }, { "epoch": 1.7418992846121477, "grad_norm": 0.28660693764686584, "learning_rate": 2.096834525646421e-05, "loss": 0.0322, "step": 62090 }, { "epoch": 1.742179828868004, "grad_norm": 0.055892378091812134, "learning_rate": 2.0963669518866602e-05, "loss": 0.0216, "step": 62100 }, { "epoch": 1.7424603731238602, "grad_norm": 0.8697624206542969, "learning_rate": 2.0958993781268995e-05, "loss": 0.0301, "step": 62110 }, { "epoch": 1.7427409173797166, "grad_norm": 0.035325005650520325, "learning_rate": 2.095431804367139e-05, "loss": 0.0065, "step": 62120 }, { "epoch": 1.743021461635573, "grad_norm": 0.03161554038524628, "learning_rate": 2.0949642306073782e-05, "loss": 0.0457, "step": 62130 }, { "epoch": 1.7433020058914295, "grad_norm": 0.030251242220401764, "learning_rate": 2.0944966568476178e-05, "loss": 0.0111, "step": 62140 }, { "epoch": 1.7435825501472857, "grad_norm": 0.48964551091194153, "learning_rate": 2.094029083087857e-05, "loss": 0.0143, "step": 62150 }, { "epoch": 1.7438630944031421, "grad_norm": 0.009099315851926804, "learning_rate": 2.0935615093280968e-05, "loss": 0.0106, "step": 62160 }, { "epoch": 1.7441436386589984, "grad_norm": 0.13147924840450287, "learning_rate": 2.093093935568336e-05, "loss": 0.0373, "step": 62170 }, { "epoch": 1.7444241829148548, "grad_norm": 0.158694326877594, "learning_rate": 2.0926263618085754e-05, "loss": 0.0231, "step": 62180 }, { "epoch": 1.7447047271707112, "grad_norm": 0.010010765865445137, "learning_rate": 2.0921587880488147e-05, "loss": 0.026, "step": 62190 }, { "epoch": 1.7449852714265677, "grad_norm": 1.5923045873641968, "learning_rate": 2.091691214289054e-05, "loss": 0.0586, "step": 62200 }, { "epoch": 1.745265815682424, "grad_norm": 0.0351567380130291, "learning_rate": 2.0912236405292937e-05, "loss": 0.0196, "step": 62210 }, { "epoch": 1.7455463599382801, "grad_norm": 0.20274406671524048, "learning_rate": 2.090756066769533e-05, "loss": 0.0111, "step": 62220 }, { "epoch": 1.7458269041941366, "grad_norm": 0.5155380964279175, "learning_rate": 2.0902884930097723e-05, "loss": 0.044, "step": 62230 }, { "epoch": 1.746107448449993, "grad_norm": 0.02207012288272381, "learning_rate": 2.0898209192500117e-05, "loss": 0.0056, "step": 62240 }, { "epoch": 1.7463879927058494, "grad_norm": 0.017054539173841476, "learning_rate": 2.089353345490251e-05, "loss": 0.027, "step": 62250 }, { "epoch": 1.7466685369617059, "grad_norm": 0.18944568932056427, "learning_rate": 2.0888857717304906e-05, "loss": 0.0068, "step": 62260 }, { "epoch": 1.746949081217562, "grad_norm": 0.05810971185564995, "learning_rate": 2.08841819797073e-05, "loss": 0.0351, "step": 62270 }, { "epoch": 1.7472296254734183, "grad_norm": 0.2373962551355362, "learning_rate": 2.0879506242109696e-05, "loss": 0.0498, "step": 62280 }, { "epoch": 1.7475101697292748, "grad_norm": 0.18249210715293884, "learning_rate": 2.087483050451209e-05, "loss": 0.0167, "step": 62290 }, { "epoch": 1.7477907139851312, "grad_norm": 0.8161043524742126, "learning_rate": 2.0870154766914482e-05, "loss": 0.0083, "step": 62300 }, { "epoch": 1.7480712582409876, "grad_norm": 1.0113803148269653, "learning_rate": 2.0865479029316875e-05, "loss": 0.055, "step": 62310 }, { "epoch": 1.7483518024968439, "grad_norm": 0.08765299618244171, "learning_rate": 2.086080329171927e-05, "loss": 0.0053, "step": 62320 }, { "epoch": 1.7486323467527, "grad_norm": 0.07207608968019485, "learning_rate": 2.0856127554121662e-05, "loss": 0.0099, "step": 62330 }, { "epoch": 1.7489128910085565, "grad_norm": 0.558362603187561, "learning_rate": 2.0851451816524055e-05, "loss": 0.0159, "step": 62340 }, { "epoch": 1.749193435264413, "grad_norm": 0.086622454226017, "learning_rate": 2.084677607892645e-05, "loss": 0.0349, "step": 62350 }, { "epoch": 1.7494739795202694, "grad_norm": 0.8273517489433289, "learning_rate": 2.0842100341328845e-05, "loss": 0.0448, "step": 62360 }, { "epoch": 1.7497545237761258, "grad_norm": 0.11827481538057327, "learning_rate": 2.083742460373124e-05, "loss": 0.0125, "step": 62370 }, { "epoch": 1.750035068031982, "grad_norm": 0.6860670447349548, "learning_rate": 2.0832748866133634e-05, "loss": 0.0466, "step": 62380 }, { "epoch": 1.7503156122878383, "grad_norm": 0.4548760950565338, "learning_rate": 2.0828073128536027e-05, "loss": 0.0226, "step": 62390 }, { "epoch": 1.7505961565436947, "grad_norm": 0.05780536308884621, "learning_rate": 2.082339739093842e-05, "loss": 0.0183, "step": 62400 }, { "epoch": 1.7508767007995512, "grad_norm": 0.23306675255298615, "learning_rate": 2.0818721653340814e-05, "loss": 0.0165, "step": 62410 }, { "epoch": 1.7511572450554076, "grad_norm": 0.4834936559200287, "learning_rate": 2.081404591574321e-05, "loss": 0.045, "step": 62420 }, { "epoch": 1.7514377893112638, "grad_norm": 0.058908287435770035, "learning_rate": 2.0809370178145603e-05, "loss": 0.0204, "step": 62430 }, { "epoch": 1.7517183335671203, "grad_norm": 0.6138299107551575, "learning_rate": 2.0804694440547997e-05, "loss": 0.0103, "step": 62440 }, { "epoch": 1.7519988778229765, "grad_norm": 0.8966478705406189, "learning_rate": 2.080001870295039e-05, "loss": 0.0379, "step": 62450 }, { "epoch": 1.752279422078833, "grad_norm": 0.04753046855330467, "learning_rate": 2.0795342965352786e-05, "loss": 0.0203, "step": 62460 }, { "epoch": 1.7525599663346894, "grad_norm": 0.42936971783638, "learning_rate": 2.079066722775518e-05, "loss": 0.0273, "step": 62470 }, { "epoch": 1.7528405105905458, "grad_norm": 3.7456717491149902, "learning_rate": 2.0785991490157573e-05, "loss": 0.063, "step": 62480 }, { "epoch": 1.753121054846402, "grad_norm": 0.13236360251903534, "learning_rate": 2.078131575255997e-05, "loss": 0.0219, "step": 62490 }, { "epoch": 1.7534015991022582, "grad_norm": 0.28881117701530457, "learning_rate": 2.0776640014962362e-05, "loss": 0.0065, "step": 62500 }, { "epoch": 1.7536821433581147, "grad_norm": 0.03097657300531864, "learning_rate": 2.0771964277364756e-05, "loss": 0.0229, "step": 62510 }, { "epoch": 1.7539626876139711, "grad_norm": 0.5357884764671326, "learning_rate": 2.076728853976715e-05, "loss": 0.0378, "step": 62520 }, { "epoch": 1.7542432318698276, "grad_norm": 0.09396424889564514, "learning_rate": 2.0762612802169542e-05, "loss": 0.0171, "step": 62530 }, { "epoch": 1.7545237761256838, "grad_norm": 0.054157670587301254, "learning_rate": 2.0757937064571935e-05, "loss": 0.0307, "step": 62540 }, { "epoch": 1.7548043203815402, "grad_norm": 0.053510576486587524, "learning_rate": 2.075326132697433e-05, "loss": 0.0324, "step": 62550 }, { "epoch": 1.7550848646373964, "grad_norm": 0.03740597516298294, "learning_rate": 2.0748585589376725e-05, "loss": 0.0115, "step": 62560 }, { "epoch": 1.7553654088932529, "grad_norm": 0.4807584285736084, "learning_rate": 2.074390985177912e-05, "loss": 0.0244, "step": 62570 }, { "epoch": 1.7556459531491093, "grad_norm": 0.7071739435195923, "learning_rate": 2.0739234114181514e-05, "loss": 0.0435, "step": 62580 }, { "epoch": 1.7559264974049658, "grad_norm": 1.8057669401168823, "learning_rate": 2.0734558376583908e-05, "loss": 0.0199, "step": 62590 }, { "epoch": 1.756207041660822, "grad_norm": 0.3569677770137787, "learning_rate": 2.07298826389863e-05, "loss": 0.0297, "step": 62600 }, { "epoch": 1.7564875859166782, "grad_norm": 0.5786514282226562, "learning_rate": 2.0725206901388694e-05, "loss": 0.0314, "step": 62610 }, { "epoch": 1.7567681301725346, "grad_norm": 0.24598190188407898, "learning_rate": 2.072053116379109e-05, "loss": 0.0265, "step": 62620 }, { "epoch": 1.757048674428391, "grad_norm": 0.7658158540725708, "learning_rate": 2.0715855426193484e-05, "loss": 0.0417, "step": 62630 }, { "epoch": 1.7573292186842475, "grad_norm": 0.029285451397299767, "learning_rate": 2.0711179688595877e-05, "loss": 0.006, "step": 62640 }, { "epoch": 1.757609762940104, "grad_norm": 0.2726006507873535, "learning_rate": 2.070650395099827e-05, "loss": 0.0109, "step": 62650 }, { "epoch": 1.7578903071959602, "grad_norm": 0.16253788769245148, "learning_rate": 2.0701828213400666e-05, "loss": 0.0263, "step": 62660 }, { "epoch": 1.7581708514518164, "grad_norm": 0.02589113637804985, "learning_rate": 2.069715247580306e-05, "loss": 0.0199, "step": 62670 }, { "epoch": 1.7584513957076728, "grad_norm": 0.08134538680315018, "learning_rate": 2.0692476738205453e-05, "loss": 0.0153, "step": 62680 }, { "epoch": 1.7587319399635293, "grad_norm": 0.3297345042228699, "learning_rate": 2.068780100060785e-05, "loss": 0.0312, "step": 62690 }, { "epoch": 1.7590124842193857, "grad_norm": 0.7575457096099854, "learning_rate": 2.0683125263010242e-05, "loss": 0.0594, "step": 62700 }, { "epoch": 1.759293028475242, "grad_norm": 0.7405019998550415, "learning_rate": 2.0678449525412636e-05, "loss": 0.0292, "step": 62710 }, { "epoch": 1.7595735727310984, "grad_norm": 0.1265239119529724, "learning_rate": 2.067377378781503e-05, "loss": 0.0215, "step": 62720 }, { "epoch": 1.7598541169869546, "grad_norm": 0.02153785154223442, "learning_rate": 2.0669098050217422e-05, "loss": 0.0102, "step": 62730 }, { "epoch": 1.760134661242811, "grad_norm": 0.05936139076948166, "learning_rate": 2.0664422312619815e-05, "loss": 0.0271, "step": 62740 }, { "epoch": 1.7604152054986675, "grad_norm": 0.05952540785074234, "learning_rate": 2.0659746575022208e-05, "loss": 0.0134, "step": 62750 }, { "epoch": 1.760695749754524, "grad_norm": 0.5385165214538574, "learning_rate": 2.0655070837424605e-05, "loss": 0.0292, "step": 62760 }, { "epoch": 1.7609762940103801, "grad_norm": 0.9397744536399841, "learning_rate": 2.0650395099827e-05, "loss": 0.01, "step": 62770 }, { "epoch": 1.7612568382662364, "grad_norm": 0.07238224148750305, "learning_rate": 2.0645719362229394e-05, "loss": 0.0298, "step": 62780 }, { "epoch": 1.7615373825220928, "grad_norm": 0.07076327502727509, "learning_rate": 2.0641043624631788e-05, "loss": 0.0108, "step": 62790 }, { "epoch": 1.7618179267779492, "grad_norm": 0.08149081468582153, "learning_rate": 2.063636788703418e-05, "loss": 0.0076, "step": 62800 }, { "epoch": 1.7620984710338057, "grad_norm": 0.11347737908363342, "learning_rate": 2.0631692149436574e-05, "loss": 0.0072, "step": 62810 }, { "epoch": 1.762379015289662, "grad_norm": 0.2768470346927643, "learning_rate": 2.0627016411838967e-05, "loss": 0.0397, "step": 62820 }, { "epoch": 1.7626595595455183, "grad_norm": 0.04208563268184662, "learning_rate": 2.0622340674241364e-05, "loss": 0.0254, "step": 62830 }, { "epoch": 1.7629401038013746, "grad_norm": 0.07107364386320114, "learning_rate": 2.0617664936643757e-05, "loss": 0.0606, "step": 62840 }, { "epoch": 1.763220648057231, "grad_norm": 1.1015721559524536, "learning_rate": 2.061298919904615e-05, "loss": 0.0271, "step": 62850 }, { "epoch": 1.7635011923130874, "grad_norm": 0.3390704095363617, "learning_rate": 2.0608313461448543e-05, "loss": 0.0132, "step": 62860 }, { "epoch": 1.7637817365689439, "grad_norm": 0.14375253021717072, "learning_rate": 2.060363772385094e-05, "loss": 0.023, "step": 62870 }, { "epoch": 1.7640622808248, "grad_norm": 0.10266657918691635, "learning_rate": 2.0598961986253333e-05, "loss": 0.0246, "step": 62880 }, { "epoch": 1.7643428250806563, "grad_norm": 0.03217071294784546, "learning_rate": 2.0594286248655726e-05, "loss": 0.0229, "step": 62890 }, { "epoch": 1.7646233693365128, "grad_norm": 0.2504962980747223, "learning_rate": 2.0589610511058122e-05, "loss": 0.0141, "step": 62900 }, { "epoch": 1.7649039135923692, "grad_norm": 0.04760671406984329, "learning_rate": 2.0584934773460516e-05, "loss": 0.0187, "step": 62910 }, { "epoch": 1.7651844578482256, "grad_norm": 0.053726524114608765, "learning_rate": 2.058025903586291e-05, "loss": 0.0168, "step": 62920 }, { "epoch": 1.765465002104082, "grad_norm": 0.30002787709236145, "learning_rate": 2.0575583298265302e-05, "loss": 0.0194, "step": 62930 }, { "epoch": 1.7657455463599383, "grad_norm": 0.049695711582899094, "learning_rate": 2.0570907560667695e-05, "loss": 0.0509, "step": 62940 }, { "epoch": 1.7660260906157945, "grad_norm": 0.3495732545852661, "learning_rate": 2.0566231823070088e-05, "loss": 0.0076, "step": 62950 }, { "epoch": 1.766306634871651, "grad_norm": 0.4407733082771301, "learning_rate": 2.0561556085472485e-05, "loss": 0.0512, "step": 62960 }, { "epoch": 1.7665871791275074, "grad_norm": 0.05083378404378891, "learning_rate": 2.0556880347874878e-05, "loss": 0.0202, "step": 62970 }, { "epoch": 1.7668677233833638, "grad_norm": 0.17345838248729706, "learning_rate": 2.0552204610277274e-05, "loss": 0.0099, "step": 62980 }, { "epoch": 1.76714826763922, "grad_norm": 0.05054445192217827, "learning_rate": 2.0547528872679668e-05, "loss": 0.017, "step": 62990 }, { "epoch": 1.7674288118950765, "grad_norm": 0.40646278858184814, "learning_rate": 2.054285313508206e-05, "loss": 0.0241, "step": 63000 }, { "epoch": 1.7677093561509327, "grad_norm": 1.1768182516098022, "learning_rate": 2.0538177397484454e-05, "loss": 0.0436, "step": 63010 }, { "epoch": 1.7679899004067892, "grad_norm": 0.07021812349557877, "learning_rate": 2.0533501659886847e-05, "loss": 0.0214, "step": 63020 }, { "epoch": 1.7682704446626456, "grad_norm": 0.05390329658985138, "learning_rate": 2.052882592228924e-05, "loss": 0.0186, "step": 63030 }, { "epoch": 1.768550988918502, "grad_norm": 0.03947312757372856, "learning_rate": 2.0524150184691637e-05, "loss": 0.0311, "step": 63040 }, { "epoch": 1.7688315331743583, "grad_norm": 0.9904593229293823, "learning_rate": 2.051947444709403e-05, "loss": 0.0235, "step": 63050 }, { "epoch": 1.7691120774302145, "grad_norm": 0.06164921820163727, "learning_rate": 2.0514798709496423e-05, "loss": 0.0128, "step": 63060 }, { "epoch": 1.769392621686071, "grad_norm": 0.3282768726348877, "learning_rate": 2.051012297189882e-05, "loss": 0.0211, "step": 63070 }, { "epoch": 1.7696731659419274, "grad_norm": 0.021114513278007507, "learning_rate": 2.0505447234301213e-05, "loss": 0.0192, "step": 63080 }, { "epoch": 1.7699537101977838, "grad_norm": 0.503709614276886, "learning_rate": 2.0500771496703606e-05, "loss": 0.02, "step": 63090 }, { "epoch": 1.77023425445364, "grad_norm": 0.5519452095031738, "learning_rate": 2.0496095759106e-05, "loss": 0.0112, "step": 63100 }, { "epoch": 1.7705147987094965, "grad_norm": 0.014594352804124355, "learning_rate": 2.0491420021508396e-05, "loss": 0.0441, "step": 63110 }, { "epoch": 1.7707953429653527, "grad_norm": 0.9463403820991516, "learning_rate": 2.048674428391079e-05, "loss": 0.0208, "step": 63120 }, { "epoch": 1.7710758872212091, "grad_norm": 0.035520732402801514, "learning_rate": 2.0482068546313182e-05, "loss": 0.0064, "step": 63130 }, { "epoch": 1.7713564314770656, "grad_norm": 0.6217635273933411, "learning_rate": 2.0477392808715575e-05, "loss": 0.0292, "step": 63140 }, { "epoch": 1.771636975732922, "grad_norm": 0.05404195189476013, "learning_rate": 2.0472717071117968e-05, "loss": 0.0173, "step": 63150 }, { "epoch": 1.7719175199887782, "grad_norm": 0.3406876027584076, "learning_rate": 2.046804133352036e-05, "loss": 0.0092, "step": 63160 }, { "epoch": 1.7721980642446344, "grad_norm": 0.10455042868852615, "learning_rate": 2.0463365595922758e-05, "loss": 0.0158, "step": 63170 }, { "epoch": 1.7724786085004909, "grad_norm": 0.4741954505443573, "learning_rate": 2.0458689858325155e-05, "loss": 0.0167, "step": 63180 }, { "epoch": 1.7727591527563473, "grad_norm": 0.2606368064880371, "learning_rate": 2.0454014120727548e-05, "loss": 0.0072, "step": 63190 }, { "epoch": 1.7730396970122038, "grad_norm": 1.4718691110610962, "learning_rate": 2.044933838312994e-05, "loss": 0.026, "step": 63200 }, { "epoch": 1.77332024126806, "grad_norm": 0.06439553946256638, "learning_rate": 2.0444662645532334e-05, "loss": 0.0154, "step": 63210 }, { "epoch": 1.7736007855239164, "grad_norm": 0.26105761528015137, "learning_rate": 2.0439986907934727e-05, "loss": 0.0259, "step": 63220 }, { "epoch": 1.7738813297797726, "grad_norm": 0.3248659074306488, "learning_rate": 2.043531117033712e-05, "loss": 0.0331, "step": 63230 }, { "epoch": 1.774161874035629, "grad_norm": 0.20960097014904022, "learning_rate": 2.0430635432739513e-05, "loss": 0.027, "step": 63240 }, { "epoch": 1.7744424182914855, "grad_norm": 0.027675675228238106, "learning_rate": 2.042595969514191e-05, "loss": 0.0327, "step": 63250 }, { "epoch": 1.774722962547342, "grad_norm": 0.3789786994457245, "learning_rate": 2.0421283957544303e-05, "loss": 0.0453, "step": 63260 }, { "epoch": 1.7750035068031982, "grad_norm": 0.2431030124425888, "learning_rate": 2.0416608219946696e-05, "loss": 0.0301, "step": 63270 }, { "epoch": 1.7752840510590544, "grad_norm": 1.8528525829315186, "learning_rate": 2.0411932482349093e-05, "loss": 0.0508, "step": 63280 }, { "epoch": 1.7755645953149108, "grad_norm": 0.10427393019199371, "learning_rate": 2.0407256744751486e-05, "loss": 0.0177, "step": 63290 }, { "epoch": 1.7758451395707673, "grad_norm": 0.07382652163505554, "learning_rate": 2.040258100715388e-05, "loss": 0.024, "step": 63300 }, { "epoch": 1.7761256838266237, "grad_norm": 0.18809962272644043, "learning_rate": 2.0397905269556272e-05, "loss": 0.0248, "step": 63310 }, { "epoch": 1.7764062280824802, "grad_norm": 0.23918575048446655, "learning_rate": 2.039322953195867e-05, "loss": 0.0176, "step": 63320 }, { "epoch": 1.7766867723383364, "grad_norm": 0.43528082966804504, "learning_rate": 2.0388553794361062e-05, "loss": 0.0275, "step": 63330 }, { "epoch": 1.7769673165941926, "grad_norm": 2.218987226486206, "learning_rate": 2.0383878056763455e-05, "loss": 0.0546, "step": 63340 }, { "epoch": 1.777247860850049, "grad_norm": 9.431686401367188, "learning_rate": 2.037920231916585e-05, "loss": 0.0599, "step": 63350 }, { "epoch": 1.7775284051059055, "grad_norm": 0.3711656928062439, "learning_rate": 2.037452658156824e-05, "loss": 0.0191, "step": 63360 }, { "epoch": 1.777808949361762, "grad_norm": 0.6345756649971008, "learning_rate": 2.0369850843970638e-05, "loss": 0.0308, "step": 63370 }, { "epoch": 1.7780894936176181, "grad_norm": 0.18688727915287018, "learning_rate": 2.036517510637303e-05, "loss": 0.0208, "step": 63380 }, { "epoch": 1.7783700378734746, "grad_norm": 0.5633224248886108, "learning_rate": 2.0360499368775428e-05, "loss": 0.0294, "step": 63390 }, { "epoch": 1.7786505821293308, "grad_norm": 0.06226220354437828, "learning_rate": 2.035582363117782e-05, "loss": 0.0167, "step": 63400 }, { "epoch": 1.7789311263851872, "grad_norm": 0.3053464889526367, "learning_rate": 2.0351147893580214e-05, "loss": 0.0518, "step": 63410 }, { "epoch": 1.7792116706410437, "grad_norm": 0.2912084460258484, "learning_rate": 2.0346472155982607e-05, "loss": 0.0194, "step": 63420 }, { "epoch": 1.7794922148969001, "grad_norm": 0.03959375247359276, "learning_rate": 2.0341796418385e-05, "loss": 0.0157, "step": 63430 }, { "epoch": 1.7797727591527563, "grad_norm": 0.029400954023003578, "learning_rate": 2.0337120680787393e-05, "loss": 0.024, "step": 63440 }, { "epoch": 1.7800533034086126, "grad_norm": 0.044840943068265915, "learning_rate": 2.0332444943189787e-05, "loss": 0.0142, "step": 63450 }, { "epoch": 1.780333847664469, "grad_norm": 0.7354563474655151, "learning_rate": 2.0327769205592183e-05, "loss": 0.0346, "step": 63460 }, { "epoch": 1.7806143919203254, "grad_norm": 0.09273529052734375, "learning_rate": 2.0323093467994576e-05, "loss": 0.0273, "step": 63470 }, { "epoch": 1.7808949361761819, "grad_norm": 0.024449322372674942, "learning_rate": 2.0318417730396973e-05, "loss": 0.0073, "step": 63480 }, { "epoch": 1.781175480432038, "grad_norm": 0.0317954383790493, "learning_rate": 2.0313741992799366e-05, "loss": 0.0139, "step": 63490 }, { "epoch": 1.7814560246878945, "grad_norm": 1.7979190349578857, "learning_rate": 2.030906625520176e-05, "loss": 0.0288, "step": 63500 }, { "epoch": 1.7817365689437508, "grad_norm": 0.7175925970077515, "learning_rate": 2.0304390517604152e-05, "loss": 0.0472, "step": 63510 }, { "epoch": 1.7820171131996072, "grad_norm": 0.020432641729712486, "learning_rate": 2.0299714780006545e-05, "loss": 0.0056, "step": 63520 }, { "epoch": 1.7822976574554636, "grad_norm": 0.17685066163539886, "learning_rate": 2.0295039042408942e-05, "loss": 0.0284, "step": 63530 }, { "epoch": 1.78257820171132, "grad_norm": 0.32209867238998413, "learning_rate": 2.0290363304811335e-05, "loss": 0.0197, "step": 63540 }, { "epoch": 1.7828587459671763, "grad_norm": 1.5943222045898438, "learning_rate": 2.028568756721373e-05, "loss": 0.0173, "step": 63550 }, { "epoch": 1.7831392902230325, "grad_norm": 0.05469394847750664, "learning_rate": 2.028101182961612e-05, "loss": 0.0324, "step": 63560 }, { "epoch": 1.783419834478889, "grad_norm": 1.2626404762268066, "learning_rate": 2.0276336092018518e-05, "loss": 0.0117, "step": 63570 }, { "epoch": 1.7837003787347454, "grad_norm": 0.03783418610692024, "learning_rate": 2.027166035442091e-05, "loss": 0.0163, "step": 63580 }, { "epoch": 1.7839809229906018, "grad_norm": 0.021704206243157387, "learning_rate": 2.0266984616823304e-05, "loss": 0.0291, "step": 63590 }, { "epoch": 1.7842614672464583, "grad_norm": 0.028474340215325356, "learning_rate": 2.02623088792257e-05, "loss": 0.025, "step": 63600 }, { "epoch": 1.7845420115023145, "grad_norm": 0.037459615617990494, "learning_rate": 2.0257633141628094e-05, "loss": 0.048, "step": 63610 }, { "epoch": 1.7848225557581707, "grad_norm": 0.10316760092973709, "learning_rate": 2.0252957404030487e-05, "loss": 0.0338, "step": 63620 }, { "epoch": 1.7851031000140272, "grad_norm": 0.49979740381240845, "learning_rate": 2.024828166643288e-05, "loss": 0.0143, "step": 63630 }, { "epoch": 1.7853836442698836, "grad_norm": 0.04308052733540535, "learning_rate": 2.0243605928835274e-05, "loss": 0.0484, "step": 63640 }, { "epoch": 1.78566418852574, "grad_norm": 0.10119663178920746, "learning_rate": 2.0238930191237667e-05, "loss": 0.0136, "step": 63650 }, { "epoch": 1.7859447327815963, "grad_norm": 0.08478458225727081, "learning_rate": 2.023425445364006e-05, "loss": 0.0339, "step": 63660 }, { "epoch": 1.7862252770374527, "grad_norm": 0.22376307845115662, "learning_rate": 2.0229578716042456e-05, "loss": 0.0117, "step": 63670 }, { "epoch": 1.786505821293309, "grad_norm": 1.707788348197937, "learning_rate": 2.0224902978444853e-05, "loss": 0.0199, "step": 63680 }, { "epoch": 1.7867863655491654, "grad_norm": 0.05831719562411308, "learning_rate": 2.0220227240847246e-05, "loss": 0.0169, "step": 63690 }, { "epoch": 1.7870669098050218, "grad_norm": 0.14866963028907776, "learning_rate": 2.021555150324964e-05, "loss": 0.0288, "step": 63700 }, { "epoch": 1.7873474540608782, "grad_norm": 0.3145166039466858, "learning_rate": 2.0210875765652032e-05, "loss": 0.0188, "step": 63710 }, { "epoch": 1.7876279983167345, "grad_norm": 0.33581095933914185, "learning_rate": 2.0206200028054426e-05, "loss": 0.0301, "step": 63720 }, { "epoch": 1.7879085425725907, "grad_norm": 1.4112753868103027, "learning_rate": 2.020152429045682e-05, "loss": 0.0188, "step": 63730 }, { "epoch": 1.7881890868284471, "grad_norm": 3.5957531929016113, "learning_rate": 2.0196848552859215e-05, "loss": 0.0196, "step": 63740 }, { "epoch": 1.7884696310843036, "grad_norm": 0.06733686476945877, "learning_rate": 2.019217281526161e-05, "loss": 0.0239, "step": 63750 }, { "epoch": 1.78875017534016, "grad_norm": 0.031566813588142395, "learning_rate": 2.0187497077664e-05, "loss": 0.043, "step": 63760 }, { "epoch": 1.7890307195960162, "grad_norm": 0.4259088933467865, "learning_rate": 2.0182821340066395e-05, "loss": 0.0098, "step": 63770 }, { "epoch": 1.7893112638518727, "grad_norm": 0.3356515169143677, "learning_rate": 2.017814560246879e-05, "loss": 0.0126, "step": 63780 }, { "epoch": 1.7895918081077289, "grad_norm": 0.3189668357372284, "learning_rate": 2.0173469864871184e-05, "loss": 0.0085, "step": 63790 }, { "epoch": 1.7898723523635853, "grad_norm": 0.48163729906082153, "learning_rate": 2.016879412727358e-05, "loss": 0.024, "step": 63800 }, { "epoch": 1.7901528966194418, "grad_norm": 0.26771095395088196, "learning_rate": 2.0164118389675974e-05, "loss": 0.0422, "step": 63810 }, { "epoch": 1.7904334408752982, "grad_norm": 0.7293832898139954, "learning_rate": 2.0159442652078367e-05, "loss": 0.0236, "step": 63820 }, { "epoch": 1.7907139851311544, "grad_norm": 0.6235517263412476, "learning_rate": 2.015476691448076e-05, "loss": 0.0062, "step": 63830 }, { "epoch": 1.7909945293870106, "grad_norm": 0.04922550544142723, "learning_rate": 2.0150091176883154e-05, "loss": 0.0101, "step": 63840 }, { "epoch": 1.791275073642867, "grad_norm": 0.060096628963947296, "learning_rate": 2.0145415439285547e-05, "loss": 0.0145, "step": 63850 }, { "epoch": 1.7915556178987235, "grad_norm": 0.025821184739470482, "learning_rate": 2.014073970168794e-05, "loss": 0.0113, "step": 63860 }, { "epoch": 1.79183616215458, "grad_norm": 0.7088441252708435, "learning_rate": 2.0136063964090336e-05, "loss": 0.0537, "step": 63870 }, { "epoch": 1.7921167064104364, "grad_norm": 0.6098968982696533, "learning_rate": 2.013138822649273e-05, "loss": 0.0108, "step": 63880 }, { "epoch": 1.7923972506662926, "grad_norm": 0.3269282579421997, "learning_rate": 2.0126712488895126e-05, "loss": 0.0408, "step": 63890 }, { "epoch": 1.7926777949221488, "grad_norm": 0.29646551609039307, "learning_rate": 2.012203675129752e-05, "loss": 0.0346, "step": 63900 }, { "epoch": 1.7929583391780053, "grad_norm": 0.2088315337896347, "learning_rate": 2.0117361013699912e-05, "loss": 0.028, "step": 63910 }, { "epoch": 1.7932388834338617, "grad_norm": 0.3836175799369812, "learning_rate": 2.0112685276102306e-05, "loss": 0.0144, "step": 63920 }, { "epoch": 1.7935194276897182, "grad_norm": 0.5335462093353271, "learning_rate": 2.01080095385047e-05, "loss": 0.0197, "step": 63930 }, { "epoch": 1.7937999719455744, "grad_norm": 0.1670827567577362, "learning_rate": 2.0103333800907095e-05, "loss": 0.0291, "step": 63940 }, { "epoch": 1.7940805162014308, "grad_norm": 0.2432379275560379, "learning_rate": 2.009865806330949e-05, "loss": 0.0228, "step": 63950 }, { "epoch": 1.794361060457287, "grad_norm": 0.36310794949531555, "learning_rate": 2.009398232571188e-05, "loss": 0.044, "step": 63960 }, { "epoch": 1.7946416047131435, "grad_norm": 0.19074593484401703, "learning_rate": 2.0089306588114275e-05, "loss": 0.0167, "step": 63970 }, { "epoch": 1.794922148969, "grad_norm": 0.05568910390138626, "learning_rate": 2.008463085051667e-05, "loss": 0.0071, "step": 63980 }, { "epoch": 1.7952026932248564, "grad_norm": 0.09101913869380951, "learning_rate": 2.0079955112919064e-05, "loss": 0.0058, "step": 63990 }, { "epoch": 1.7954832374807126, "grad_norm": 0.3520022928714752, "learning_rate": 2.0075279375321458e-05, "loss": 0.036, "step": 64000 }, { "epoch": 1.7957637817365688, "grad_norm": 0.09741708636283875, "learning_rate": 2.0070603637723854e-05, "loss": 0.0049, "step": 64010 }, { "epoch": 1.7960443259924253, "grad_norm": 0.02669193409383297, "learning_rate": 2.0065927900126247e-05, "loss": 0.0298, "step": 64020 }, { "epoch": 1.7963248702482817, "grad_norm": 0.05642002075910568, "learning_rate": 2.006125216252864e-05, "loss": 0.0117, "step": 64030 }, { "epoch": 1.7966054145041381, "grad_norm": 0.1221073791384697, "learning_rate": 2.0056576424931034e-05, "loss": 0.0343, "step": 64040 }, { "epoch": 1.7968859587599944, "grad_norm": 0.08084694296121597, "learning_rate": 2.0051900687333427e-05, "loss": 0.0113, "step": 64050 }, { "epoch": 1.7971665030158508, "grad_norm": 0.4958467185497284, "learning_rate": 2.004722494973582e-05, "loss": 0.0206, "step": 64060 }, { "epoch": 1.797447047271707, "grad_norm": 0.29658764600753784, "learning_rate": 2.0042549212138213e-05, "loss": 0.0242, "step": 64070 }, { "epoch": 1.7977275915275635, "grad_norm": 0.4898415207862854, "learning_rate": 2.003787347454061e-05, "loss": 0.0297, "step": 64080 }, { "epoch": 1.79800813578342, "grad_norm": 0.51373291015625, "learning_rate": 2.0033197736943006e-05, "loss": 0.0136, "step": 64090 }, { "epoch": 1.7982886800392763, "grad_norm": 1.4030920267105103, "learning_rate": 2.00285219993454e-05, "loss": 0.0418, "step": 64100 }, { "epoch": 1.7985692242951326, "grad_norm": 0.2206910401582718, "learning_rate": 2.0023846261747792e-05, "loss": 0.0183, "step": 64110 }, { "epoch": 1.7988497685509888, "grad_norm": 0.06758087128400803, "learning_rate": 2.0019170524150186e-05, "loss": 0.0299, "step": 64120 }, { "epoch": 1.7991303128068452, "grad_norm": 1.2349377870559692, "learning_rate": 2.001449478655258e-05, "loss": 0.0177, "step": 64130 }, { "epoch": 1.7994108570627017, "grad_norm": 0.4313546121120453, "learning_rate": 2.0009819048954972e-05, "loss": 0.07, "step": 64140 }, { "epoch": 1.799691401318558, "grad_norm": 0.07205051183700562, "learning_rate": 2.000514331135737e-05, "loss": 0.0092, "step": 64150 }, { "epoch": 1.7999719455744143, "grad_norm": 0.1504490226507187, "learning_rate": 2.000046757375976e-05, "loss": 0.0674, "step": 64160 }, { "epoch": 1.8002524898302708, "grad_norm": 0.048745568841695786, "learning_rate": 1.9995791836162155e-05, "loss": 0.0227, "step": 64170 }, { "epoch": 1.800533034086127, "grad_norm": 0.69569993019104, "learning_rate": 1.9991116098564548e-05, "loss": 0.0311, "step": 64180 }, { "epoch": 1.8008135783419834, "grad_norm": 0.27452215552330017, "learning_rate": 1.9986440360966944e-05, "loss": 0.0473, "step": 64190 }, { "epoch": 1.8010941225978399, "grad_norm": 0.20930635929107666, "learning_rate": 1.9981764623369338e-05, "loss": 0.0183, "step": 64200 }, { "epoch": 1.8013746668536963, "grad_norm": 0.7765156626701355, "learning_rate": 1.997708888577173e-05, "loss": 0.0362, "step": 64210 }, { "epoch": 1.8016552111095525, "grad_norm": 0.027774790301918983, "learning_rate": 1.9972413148174127e-05, "loss": 0.0324, "step": 64220 }, { "epoch": 1.8019357553654087, "grad_norm": 0.747905969619751, "learning_rate": 1.996773741057652e-05, "loss": 0.0519, "step": 64230 }, { "epoch": 1.8022162996212652, "grad_norm": 0.0922529399394989, "learning_rate": 1.9963061672978914e-05, "loss": 0.0279, "step": 64240 }, { "epoch": 1.8024968438771216, "grad_norm": 0.22815091907978058, "learning_rate": 1.9958385935381307e-05, "loss": 0.0376, "step": 64250 }, { "epoch": 1.802777388132978, "grad_norm": 0.4032367169857025, "learning_rate": 1.99537101977837e-05, "loss": 0.0195, "step": 64260 }, { "epoch": 1.8030579323888345, "grad_norm": 0.15844281017780304, "learning_rate": 1.9949034460186093e-05, "loss": 0.0227, "step": 64270 }, { "epoch": 1.8033384766446907, "grad_norm": 0.20421777665615082, "learning_rate": 1.994435872258849e-05, "loss": 0.0232, "step": 64280 }, { "epoch": 1.803619020900547, "grad_norm": 0.03290301933884621, "learning_rate": 1.9939682984990883e-05, "loss": 0.0153, "step": 64290 }, { "epoch": 1.8038995651564034, "grad_norm": 0.9783036708831787, "learning_rate": 1.993500724739328e-05, "loss": 0.0244, "step": 64300 }, { "epoch": 1.8041801094122598, "grad_norm": 0.1882416307926178, "learning_rate": 1.9930331509795673e-05, "loss": 0.028, "step": 64310 }, { "epoch": 1.8044606536681163, "grad_norm": 0.06502830982208252, "learning_rate": 1.9925655772198066e-05, "loss": 0.0077, "step": 64320 }, { "epoch": 1.8047411979239725, "grad_norm": 0.017894720658659935, "learning_rate": 1.992098003460046e-05, "loss": 0.0112, "step": 64330 }, { "epoch": 1.805021742179829, "grad_norm": 0.47084447741508484, "learning_rate": 1.9916304297002852e-05, "loss": 0.0381, "step": 64340 }, { "epoch": 1.8053022864356851, "grad_norm": 0.22615495324134827, "learning_rate": 1.9911628559405245e-05, "loss": 0.0216, "step": 64350 }, { "epoch": 1.8055828306915416, "grad_norm": 0.03710629791021347, "learning_rate": 1.990695282180764e-05, "loss": 0.0326, "step": 64360 }, { "epoch": 1.805863374947398, "grad_norm": 0.5762761235237122, "learning_rate": 1.9902277084210035e-05, "loss": 0.0229, "step": 64370 }, { "epoch": 1.8061439192032545, "grad_norm": 0.07036314159631729, "learning_rate": 1.9897601346612428e-05, "loss": 0.0099, "step": 64380 }, { "epoch": 1.8064244634591107, "grad_norm": 0.6617462635040283, "learning_rate": 1.9892925609014825e-05, "loss": 0.0474, "step": 64390 }, { "epoch": 1.806705007714967, "grad_norm": 0.06752441823482513, "learning_rate": 1.9888249871417218e-05, "loss": 0.0231, "step": 64400 }, { "epoch": 1.8069855519708233, "grad_norm": 0.5993126034736633, "learning_rate": 1.988357413381961e-05, "loss": 0.033, "step": 64410 }, { "epoch": 1.8072660962266798, "grad_norm": 0.6988139748573303, "learning_rate": 1.9878898396222004e-05, "loss": 0.0582, "step": 64420 }, { "epoch": 1.8075466404825362, "grad_norm": 0.09892693161964417, "learning_rate": 1.98742226586244e-05, "loss": 0.0267, "step": 64430 }, { "epoch": 1.8078271847383924, "grad_norm": 0.2740328311920166, "learning_rate": 1.9869546921026794e-05, "loss": 0.0172, "step": 64440 }, { "epoch": 1.8081077289942489, "grad_norm": 0.8906832933425903, "learning_rate": 1.9864871183429187e-05, "loss": 0.0415, "step": 64450 }, { "epoch": 1.808388273250105, "grad_norm": 0.06395286321640015, "learning_rate": 1.986019544583158e-05, "loss": 0.0303, "step": 64460 }, { "epoch": 1.8086688175059615, "grad_norm": 0.07619311660528183, "learning_rate": 1.9855519708233973e-05, "loss": 0.0317, "step": 64470 }, { "epoch": 1.808949361761818, "grad_norm": 0.48566263914108276, "learning_rate": 1.985084397063637e-05, "loss": 0.0189, "step": 64480 }, { "epoch": 1.8092299060176744, "grad_norm": 1.1721422672271729, "learning_rate": 1.9846168233038763e-05, "loss": 0.0289, "step": 64490 }, { "epoch": 1.8095104502735306, "grad_norm": 0.1729506105184555, "learning_rate": 1.984149249544116e-05, "loss": 0.0077, "step": 64500 }, { "epoch": 1.8097909945293869, "grad_norm": 0.13759519159793854, "learning_rate": 1.9836816757843553e-05, "loss": 0.0166, "step": 64510 }, { "epoch": 1.8100715387852433, "grad_norm": 0.5558524131774902, "learning_rate": 1.9832141020245946e-05, "loss": 0.0191, "step": 64520 }, { "epoch": 1.8103520830410997, "grad_norm": 0.36855974793434143, "learning_rate": 1.982746528264834e-05, "loss": 0.0301, "step": 64530 }, { "epoch": 1.8106326272969562, "grad_norm": 0.01536989863961935, "learning_rate": 1.9822789545050732e-05, "loss": 0.0105, "step": 64540 }, { "epoch": 1.8109131715528126, "grad_norm": 0.009644770063459873, "learning_rate": 1.9818113807453125e-05, "loss": 0.0168, "step": 64550 }, { "epoch": 1.8111937158086688, "grad_norm": 1.0159082412719727, "learning_rate": 1.981343806985552e-05, "loss": 0.0398, "step": 64560 }, { "epoch": 1.811474260064525, "grad_norm": 0.020166944712400436, "learning_rate": 1.9808762332257915e-05, "loss": 0.0269, "step": 64570 }, { "epoch": 1.8117548043203815, "grad_norm": 0.1519084870815277, "learning_rate": 1.9804086594660308e-05, "loss": 0.0178, "step": 64580 }, { "epoch": 1.812035348576238, "grad_norm": 0.15717348456382751, "learning_rate": 1.9799410857062705e-05, "loss": 0.0066, "step": 64590 }, { "epoch": 1.8123158928320944, "grad_norm": 0.04729950428009033, "learning_rate": 1.9794735119465098e-05, "loss": 0.0122, "step": 64600 }, { "epoch": 1.8125964370879506, "grad_norm": 0.0546366348862648, "learning_rate": 1.979005938186749e-05, "loss": 0.005, "step": 64610 }, { "epoch": 1.812876981343807, "grad_norm": 0.04795940965414047, "learning_rate": 1.9785383644269884e-05, "loss": 0.0297, "step": 64620 }, { "epoch": 1.8131575255996633, "grad_norm": 0.29159048199653625, "learning_rate": 1.9780707906672277e-05, "loss": 0.0104, "step": 64630 }, { "epoch": 1.8134380698555197, "grad_norm": 0.6276484131813049, "learning_rate": 1.9776032169074674e-05, "loss": 0.038, "step": 64640 }, { "epoch": 1.8137186141113761, "grad_norm": 0.029345287010073662, "learning_rate": 1.9771356431477067e-05, "loss": 0.0161, "step": 64650 }, { "epoch": 1.8139991583672326, "grad_norm": 0.050641980022192, "learning_rate": 1.976668069387946e-05, "loss": 0.0578, "step": 64660 }, { "epoch": 1.8142797026230888, "grad_norm": 0.15768833458423615, "learning_rate": 1.9762004956281853e-05, "loss": 0.0162, "step": 64670 }, { "epoch": 1.814560246878945, "grad_norm": 1.0044217109680176, "learning_rate": 1.9757329218684246e-05, "loss": 0.0329, "step": 64680 }, { "epoch": 1.8148407911348015, "grad_norm": 0.04504761844873428, "learning_rate": 1.9752653481086643e-05, "loss": 0.0456, "step": 64690 }, { "epoch": 1.815121335390658, "grad_norm": 0.04260660707950592, "learning_rate": 1.9747977743489036e-05, "loss": 0.0102, "step": 64700 }, { "epoch": 1.8154018796465143, "grad_norm": 0.027287306264042854, "learning_rate": 1.9743302005891433e-05, "loss": 0.0183, "step": 64710 }, { "epoch": 1.8156824239023706, "grad_norm": 0.017651716247200966, "learning_rate": 1.9738626268293826e-05, "loss": 0.0198, "step": 64720 }, { "epoch": 1.815962968158227, "grad_norm": 0.385208398103714, "learning_rate": 1.973395053069622e-05, "loss": 0.0443, "step": 64730 }, { "epoch": 1.8162435124140832, "grad_norm": 0.08179864287376404, "learning_rate": 1.9729274793098612e-05, "loss": 0.0265, "step": 64740 }, { "epoch": 1.8165240566699397, "grad_norm": 0.5570908188819885, "learning_rate": 1.9724599055501005e-05, "loss": 0.0181, "step": 64750 }, { "epoch": 1.816804600925796, "grad_norm": 0.1594029664993286, "learning_rate": 1.97199233179034e-05, "loss": 0.0194, "step": 64760 }, { "epoch": 1.8170851451816525, "grad_norm": 0.9654345512390137, "learning_rate": 1.971524758030579e-05, "loss": 0.0235, "step": 64770 }, { "epoch": 1.8173656894375088, "grad_norm": 0.15210914611816406, "learning_rate": 1.9710571842708188e-05, "loss": 0.0067, "step": 64780 }, { "epoch": 1.817646233693365, "grad_norm": 0.043191179633140564, "learning_rate": 1.970589610511058e-05, "loss": 0.0115, "step": 64790 }, { "epoch": 1.8179267779492214, "grad_norm": 0.21964170038700104, "learning_rate": 1.9701220367512978e-05, "loss": 0.0522, "step": 64800 }, { "epoch": 1.8182073222050779, "grad_norm": 0.04853306710720062, "learning_rate": 1.969654462991537e-05, "loss": 0.0188, "step": 64810 }, { "epoch": 1.8184878664609343, "grad_norm": 0.5367407202720642, "learning_rate": 1.9691868892317764e-05, "loss": 0.0204, "step": 64820 }, { "epoch": 1.8187684107167907, "grad_norm": 0.41936805844306946, "learning_rate": 1.9687193154720157e-05, "loss": 0.0128, "step": 64830 }, { "epoch": 1.819048954972647, "grad_norm": 0.03311591222882271, "learning_rate": 1.968251741712255e-05, "loss": 0.0508, "step": 64840 }, { "epoch": 1.8193294992285032, "grad_norm": 0.5116895437240601, "learning_rate": 1.9677841679524947e-05, "loss": 0.0185, "step": 64850 }, { "epoch": 1.8196100434843596, "grad_norm": 0.01805984601378441, "learning_rate": 1.967316594192734e-05, "loss": 0.0056, "step": 64860 }, { "epoch": 1.819890587740216, "grad_norm": 0.04194682836532593, "learning_rate": 1.9668490204329733e-05, "loss": 0.0459, "step": 64870 }, { "epoch": 1.8201711319960725, "grad_norm": 0.39719581604003906, "learning_rate": 1.9663814466732126e-05, "loss": 0.0073, "step": 64880 }, { "epoch": 1.8204516762519287, "grad_norm": 0.35474467277526855, "learning_rate": 1.9659138729134523e-05, "loss": 0.0285, "step": 64890 }, { "epoch": 1.820732220507785, "grad_norm": 0.6293283700942993, "learning_rate": 1.9654462991536916e-05, "loss": 0.0129, "step": 64900 }, { "epoch": 1.8210127647636414, "grad_norm": 0.21566633880138397, "learning_rate": 1.964978725393931e-05, "loss": 0.0144, "step": 64910 }, { "epoch": 1.8212933090194978, "grad_norm": 0.2668549120426178, "learning_rate": 1.9645111516341706e-05, "loss": 0.0383, "step": 64920 }, { "epoch": 1.8215738532753543, "grad_norm": 0.835565984249115, "learning_rate": 1.96404357787441e-05, "loss": 0.0148, "step": 64930 }, { "epoch": 1.8218543975312107, "grad_norm": 1.06385338306427, "learning_rate": 1.9635760041146492e-05, "loss": 0.023, "step": 64940 }, { "epoch": 1.822134941787067, "grad_norm": 0.1001051515340805, "learning_rate": 1.9631084303548885e-05, "loss": 0.0279, "step": 64950 }, { "epoch": 1.8224154860429231, "grad_norm": 9.988231658935547, "learning_rate": 1.962640856595128e-05, "loss": 0.0296, "step": 64960 }, { "epoch": 1.8226960302987796, "grad_norm": 0.017901957035064697, "learning_rate": 1.962173282835367e-05, "loss": 0.0071, "step": 64970 }, { "epoch": 1.822976574554636, "grad_norm": 0.033238161355257034, "learning_rate": 1.9617057090756065e-05, "loss": 0.0267, "step": 64980 }, { "epoch": 1.8232571188104925, "grad_norm": 1.0472253561019897, "learning_rate": 1.961238135315846e-05, "loss": 0.032, "step": 64990 }, { "epoch": 1.8235376630663487, "grad_norm": 0.07382168620824814, "learning_rate": 1.9607705615560858e-05, "loss": 0.0348, "step": 65000 }, { "epoch": 1.8238182073222051, "grad_norm": 0.48134124279022217, "learning_rate": 1.960302987796325e-05, "loss": 0.0302, "step": 65010 }, { "epoch": 1.8240987515780613, "grad_norm": 2.531198024749756, "learning_rate": 1.9598354140365644e-05, "loss": 0.0327, "step": 65020 }, { "epoch": 1.8243792958339178, "grad_norm": 0.05857495591044426, "learning_rate": 1.9593678402768037e-05, "loss": 0.0074, "step": 65030 }, { "epoch": 1.8246598400897742, "grad_norm": 0.7102998495101929, "learning_rate": 1.958900266517043e-05, "loss": 0.0096, "step": 65040 }, { "epoch": 1.8249403843456307, "grad_norm": 0.02007921412587166, "learning_rate": 1.9584326927572824e-05, "loss": 0.0044, "step": 65050 }, { "epoch": 1.8252209286014869, "grad_norm": 0.17469848692417145, "learning_rate": 1.957965118997522e-05, "loss": 0.027, "step": 65060 }, { "epoch": 1.825501472857343, "grad_norm": 0.33487460017204285, "learning_rate": 1.9574975452377613e-05, "loss": 0.0213, "step": 65070 }, { "epoch": 1.8257820171131995, "grad_norm": 0.020651323720812798, "learning_rate": 1.9570299714780006e-05, "loss": 0.038, "step": 65080 }, { "epoch": 1.826062561369056, "grad_norm": 0.4183170199394226, "learning_rate": 1.95656239771824e-05, "loss": 0.0313, "step": 65090 }, { "epoch": 1.8263431056249124, "grad_norm": 1.1867015361785889, "learning_rate": 1.9560948239584796e-05, "loss": 0.011, "step": 65100 }, { "epoch": 1.8266236498807686, "grad_norm": 0.04987623915076256, "learning_rate": 1.955627250198719e-05, "loss": 0.0098, "step": 65110 }, { "epoch": 1.826904194136625, "grad_norm": 0.024456709623336792, "learning_rate": 1.9551596764389586e-05, "loss": 0.0444, "step": 65120 }, { "epoch": 1.8271847383924813, "grad_norm": 0.2845584750175476, "learning_rate": 1.954692102679198e-05, "loss": 0.0189, "step": 65130 }, { "epoch": 1.8274652826483377, "grad_norm": 1.0030314922332764, "learning_rate": 1.9542245289194372e-05, "loss": 0.0267, "step": 65140 }, { "epoch": 1.8277458269041942, "grad_norm": 0.07212797552347183, "learning_rate": 1.9537569551596765e-05, "loss": 0.0181, "step": 65150 }, { "epoch": 1.8280263711600506, "grad_norm": 0.2742769122123718, "learning_rate": 1.953289381399916e-05, "loss": 0.0355, "step": 65160 }, { "epoch": 1.8283069154159068, "grad_norm": 0.6817632913589478, "learning_rate": 1.952821807640155e-05, "loss": 0.0241, "step": 65170 }, { "epoch": 1.828587459671763, "grad_norm": 0.24569863080978394, "learning_rate": 1.9523542338803945e-05, "loss": 0.0106, "step": 65180 }, { "epoch": 1.8288680039276195, "grad_norm": 6.580717086791992, "learning_rate": 1.951886660120634e-05, "loss": 0.0262, "step": 65190 }, { "epoch": 1.829148548183476, "grad_norm": 0.3710302710533142, "learning_rate": 1.9514190863608734e-05, "loss": 0.0164, "step": 65200 }, { "epoch": 1.8294290924393324, "grad_norm": 0.1866511106491089, "learning_rate": 1.950951512601113e-05, "loss": 0.0095, "step": 65210 }, { "epoch": 1.8297096366951888, "grad_norm": 0.059145282953977585, "learning_rate": 1.9504839388413524e-05, "loss": 0.0165, "step": 65220 }, { "epoch": 1.829990180951045, "grad_norm": 0.2270382046699524, "learning_rate": 1.9500163650815917e-05, "loss": 0.0241, "step": 65230 }, { "epoch": 1.8302707252069013, "grad_norm": 0.5299392938613892, "learning_rate": 1.949548791321831e-05, "loss": 0.0215, "step": 65240 }, { "epoch": 1.8305512694627577, "grad_norm": 0.010589420795440674, "learning_rate": 1.9490812175620704e-05, "loss": 0.0397, "step": 65250 }, { "epoch": 1.8308318137186141, "grad_norm": 0.11097654700279236, "learning_rate": 1.94861364380231e-05, "loss": 0.0128, "step": 65260 }, { "epoch": 1.8311123579744706, "grad_norm": 0.16985177993774414, "learning_rate": 1.9481460700425493e-05, "loss": 0.0232, "step": 65270 }, { "epoch": 1.8313929022303268, "grad_norm": 1.0437191724777222, "learning_rate": 1.9476784962827887e-05, "loss": 0.0192, "step": 65280 }, { "epoch": 1.8316734464861832, "grad_norm": 0.46707725524902344, "learning_rate": 1.947210922523028e-05, "loss": 0.0107, "step": 65290 }, { "epoch": 1.8319539907420395, "grad_norm": 0.03609205037355423, "learning_rate": 1.9467433487632676e-05, "loss": 0.0281, "step": 65300 }, { "epoch": 1.832234534997896, "grad_norm": 0.03311429172754288, "learning_rate": 1.946275775003507e-05, "loss": 0.0122, "step": 65310 }, { "epoch": 1.8325150792537523, "grad_norm": 0.018606822937726974, "learning_rate": 1.9458082012437463e-05, "loss": 0.0187, "step": 65320 }, { "epoch": 1.8327956235096088, "grad_norm": 0.9141477942466736, "learning_rate": 1.945340627483986e-05, "loss": 0.0215, "step": 65330 }, { "epoch": 1.833076167765465, "grad_norm": 0.3875596523284912, "learning_rate": 1.9448730537242252e-05, "loss": 0.0248, "step": 65340 }, { "epoch": 1.8333567120213212, "grad_norm": 0.0523802787065506, "learning_rate": 1.9444054799644645e-05, "loss": 0.0279, "step": 65350 }, { "epoch": 1.8336372562771777, "grad_norm": 0.0558890663087368, "learning_rate": 1.943937906204704e-05, "loss": 0.0453, "step": 65360 }, { "epoch": 1.833917800533034, "grad_norm": 0.8290716409683228, "learning_rate": 1.943470332444943e-05, "loss": 0.0229, "step": 65370 }, { "epoch": 1.8341983447888905, "grad_norm": 0.6570255756378174, "learning_rate": 1.9430027586851825e-05, "loss": 0.0123, "step": 65380 }, { "epoch": 1.8344788890447468, "grad_norm": 0.20751947164535522, "learning_rate": 1.942535184925422e-05, "loss": 0.0089, "step": 65390 }, { "epoch": 1.8347594333006032, "grad_norm": 1.1537041664123535, "learning_rate": 1.9420676111656615e-05, "loss": 0.0094, "step": 65400 }, { "epoch": 1.8350399775564594, "grad_norm": 0.03571438416838646, "learning_rate": 1.941600037405901e-05, "loss": 0.0102, "step": 65410 }, { "epoch": 1.8353205218123159, "grad_norm": 0.7425981163978577, "learning_rate": 1.9411324636461404e-05, "loss": 0.028, "step": 65420 }, { "epoch": 1.8356010660681723, "grad_norm": 1.5747085809707642, "learning_rate": 1.9406648898863797e-05, "loss": 0.0504, "step": 65430 }, { "epoch": 1.8358816103240287, "grad_norm": 2.009110927581787, "learning_rate": 1.940197316126619e-05, "loss": 0.0443, "step": 65440 }, { "epoch": 1.836162154579885, "grad_norm": 0.25947946310043335, "learning_rate": 1.9397297423668584e-05, "loss": 0.0177, "step": 65450 }, { "epoch": 1.8364426988357412, "grad_norm": 0.4087194800376892, "learning_rate": 1.9392621686070977e-05, "loss": 0.0056, "step": 65460 }, { "epoch": 1.8367232430915976, "grad_norm": 0.4585563540458679, "learning_rate": 1.9387945948473373e-05, "loss": 0.0184, "step": 65470 }, { "epoch": 1.837003787347454, "grad_norm": 1.425635576248169, "learning_rate": 1.9383270210875767e-05, "loss": 0.0525, "step": 65480 }, { "epoch": 1.8372843316033105, "grad_norm": 0.052558496594429016, "learning_rate": 1.937859447327816e-05, "loss": 0.0208, "step": 65490 }, { "epoch": 1.837564875859167, "grad_norm": 0.3090972900390625, "learning_rate": 1.9373918735680556e-05, "loss": 0.0379, "step": 65500 }, { "epoch": 1.8378454201150232, "grad_norm": 0.01572524756193161, "learning_rate": 1.936924299808295e-05, "loss": 0.0196, "step": 65510 }, { "epoch": 1.8381259643708794, "grad_norm": 0.8845873475074768, "learning_rate": 1.9364567260485343e-05, "loss": 0.0356, "step": 65520 }, { "epoch": 1.8384065086267358, "grad_norm": 0.9496434926986694, "learning_rate": 1.9359891522887736e-05, "loss": 0.026, "step": 65530 }, { "epoch": 1.8386870528825923, "grad_norm": 0.2442426085472107, "learning_rate": 1.9355215785290132e-05, "loss": 0.0376, "step": 65540 }, { "epoch": 1.8389675971384487, "grad_norm": 0.1368706077337265, "learning_rate": 1.9350540047692525e-05, "loss": 0.0044, "step": 65550 }, { "epoch": 1.839248141394305, "grad_norm": 0.47212544083595276, "learning_rate": 1.934586431009492e-05, "loss": 0.0213, "step": 65560 }, { "epoch": 1.8395286856501614, "grad_norm": 0.38215675950050354, "learning_rate": 1.9341188572497312e-05, "loss": 0.0365, "step": 65570 }, { "epoch": 1.8398092299060176, "grad_norm": 0.18641263246536255, "learning_rate": 1.9336512834899705e-05, "loss": 0.0425, "step": 65580 }, { "epoch": 1.840089774161874, "grad_norm": 0.055689986795186996, "learning_rate": 1.9331837097302098e-05, "loss": 0.0054, "step": 65590 }, { "epoch": 1.8403703184177305, "grad_norm": 0.07024640589952469, "learning_rate": 1.9327161359704495e-05, "loss": 0.0466, "step": 65600 }, { "epoch": 1.840650862673587, "grad_norm": 0.29792338609695435, "learning_rate": 1.932248562210689e-05, "loss": 0.0125, "step": 65610 }, { "epoch": 1.8409314069294431, "grad_norm": 0.1560596078634262, "learning_rate": 1.9317809884509284e-05, "loss": 0.0187, "step": 65620 }, { "epoch": 1.8412119511852993, "grad_norm": 0.024808084592223167, "learning_rate": 1.9313134146911677e-05, "loss": 0.0055, "step": 65630 }, { "epoch": 1.8414924954411558, "grad_norm": 0.7830901145935059, "learning_rate": 1.930845840931407e-05, "loss": 0.0218, "step": 65640 }, { "epoch": 1.8417730396970122, "grad_norm": 0.03476232290267944, "learning_rate": 1.9303782671716464e-05, "loss": 0.0178, "step": 65650 }, { "epoch": 1.8420535839528687, "grad_norm": 0.09351269900798798, "learning_rate": 1.9299106934118857e-05, "loss": 0.0348, "step": 65660 }, { "epoch": 1.8423341282087249, "grad_norm": 0.21929942071437836, "learning_rate": 1.929443119652125e-05, "loss": 0.0185, "step": 65670 }, { "epoch": 1.8426146724645813, "grad_norm": 0.04625517874956131, "learning_rate": 1.9289755458923647e-05, "loss": 0.0062, "step": 65680 }, { "epoch": 1.8428952167204375, "grad_norm": 0.042419012635946274, "learning_rate": 1.928507972132604e-05, "loss": 0.0151, "step": 65690 }, { "epoch": 1.843175760976294, "grad_norm": 0.41632935404777527, "learning_rate": 1.9280403983728433e-05, "loss": 0.0214, "step": 65700 }, { "epoch": 1.8434563052321504, "grad_norm": 0.08615466952323914, "learning_rate": 1.927572824613083e-05, "loss": 0.0475, "step": 65710 }, { "epoch": 1.8437368494880069, "grad_norm": 2.783561944961548, "learning_rate": 1.9271052508533223e-05, "loss": 0.0796, "step": 65720 }, { "epoch": 1.844017393743863, "grad_norm": 0.17630907893180847, "learning_rate": 1.9266376770935616e-05, "loss": 0.0168, "step": 65730 }, { "epoch": 1.8442979379997193, "grad_norm": 0.5907214283943176, "learning_rate": 1.926170103333801e-05, "loss": 0.0283, "step": 65740 }, { "epoch": 1.8445784822555757, "grad_norm": 0.4137907326221466, "learning_rate": 1.9257025295740405e-05, "loss": 0.0143, "step": 65750 }, { "epoch": 1.8448590265114322, "grad_norm": 0.5111982226371765, "learning_rate": 1.92523495581428e-05, "loss": 0.0238, "step": 65760 }, { "epoch": 1.8451395707672886, "grad_norm": 0.05361522361636162, "learning_rate": 1.9247673820545192e-05, "loss": 0.0212, "step": 65770 }, { "epoch": 1.8454201150231448, "grad_norm": 0.05676241219043732, "learning_rate": 1.9242998082947585e-05, "loss": 0.0163, "step": 65780 }, { "epoch": 1.8457006592790013, "grad_norm": 0.04656972736120224, "learning_rate": 1.9238322345349978e-05, "loss": 0.0319, "step": 65790 }, { "epoch": 1.8459812035348575, "grad_norm": 0.7167812585830688, "learning_rate": 1.9233646607752375e-05, "loss": 0.0294, "step": 65800 }, { "epoch": 1.846261747790714, "grad_norm": 1.0420194864273071, "learning_rate": 1.9228970870154768e-05, "loss": 0.0357, "step": 65810 }, { "epoch": 1.8465422920465704, "grad_norm": 0.09874187409877777, "learning_rate": 1.9224295132557164e-05, "loss": 0.0462, "step": 65820 }, { "epoch": 1.8468228363024268, "grad_norm": 0.054059479385614395, "learning_rate": 1.9219619394959557e-05, "loss": 0.0296, "step": 65830 }, { "epoch": 1.847103380558283, "grad_norm": 0.6828358173370361, "learning_rate": 1.921494365736195e-05, "loss": 0.0289, "step": 65840 }, { "epoch": 1.8473839248141393, "grad_norm": 0.043702103197574615, "learning_rate": 1.9210267919764344e-05, "loss": 0.0099, "step": 65850 }, { "epoch": 1.8476644690699957, "grad_norm": 0.0998198539018631, "learning_rate": 1.9205592182166737e-05, "loss": 0.0123, "step": 65860 }, { "epoch": 1.8479450133258521, "grad_norm": 0.053563617169857025, "learning_rate": 1.920091644456913e-05, "loss": 0.0068, "step": 65870 }, { "epoch": 1.8482255575817086, "grad_norm": 0.023701488971710205, "learning_rate": 1.9196240706971523e-05, "loss": 0.0109, "step": 65880 }, { "epoch": 1.848506101837565, "grad_norm": 0.02473239041864872, "learning_rate": 1.919156496937392e-05, "loss": 0.0163, "step": 65890 }, { "epoch": 1.8487866460934212, "grad_norm": 2.02982497215271, "learning_rate": 1.9186889231776313e-05, "loss": 0.0331, "step": 65900 }, { "epoch": 1.8490671903492775, "grad_norm": 0.10660052299499512, "learning_rate": 1.918221349417871e-05, "loss": 0.0295, "step": 65910 }, { "epoch": 1.849347734605134, "grad_norm": 0.012316946871578693, "learning_rate": 1.9177537756581103e-05, "loss": 0.0209, "step": 65920 }, { "epoch": 1.8496282788609903, "grad_norm": 0.16151586174964905, "learning_rate": 1.9172862018983496e-05, "loss": 0.0283, "step": 65930 }, { "epoch": 1.8499088231168468, "grad_norm": 0.04487130790948868, "learning_rate": 1.916818628138589e-05, "loss": 0.0558, "step": 65940 }, { "epoch": 1.850189367372703, "grad_norm": 1.3269792795181274, "learning_rate": 1.9163510543788282e-05, "loss": 0.064, "step": 65950 }, { "epoch": 1.8504699116285594, "grad_norm": 0.6323515176773071, "learning_rate": 1.915883480619068e-05, "loss": 0.0229, "step": 65960 }, { "epoch": 1.8507504558844157, "grad_norm": 0.9173246622085571, "learning_rate": 1.9154159068593072e-05, "loss": 0.0216, "step": 65970 }, { "epoch": 1.851031000140272, "grad_norm": 0.09008462727069855, "learning_rate": 1.9149483330995465e-05, "loss": 0.0215, "step": 65980 }, { "epoch": 1.8513115443961286, "grad_norm": 0.6633594632148743, "learning_rate": 1.9144807593397858e-05, "loss": 0.053, "step": 65990 }, { "epoch": 1.851592088651985, "grad_norm": 0.14576931297779083, "learning_rate": 1.914013185580025e-05, "loss": 0.0177, "step": 66000 }, { "epoch": 1.8518726329078412, "grad_norm": 0.04059145227074623, "learning_rate": 1.9135456118202648e-05, "loss": 0.0092, "step": 66010 }, { "epoch": 1.8521531771636974, "grad_norm": 0.036197222769260406, "learning_rate": 1.913078038060504e-05, "loss": 0.0113, "step": 66020 }, { "epoch": 1.8524337214195539, "grad_norm": 0.584598958492279, "learning_rate": 1.9126104643007438e-05, "loss": 0.0205, "step": 66030 }, { "epoch": 1.8527142656754103, "grad_norm": 0.3085622191429138, "learning_rate": 1.912142890540983e-05, "loss": 0.0155, "step": 66040 }, { "epoch": 1.8529948099312668, "grad_norm": 1.1180064678192139, "learning_rate": 1.9116753167812224e-05, "loss": 0.0252, "step": 66050 }, { "epoch": 1.853275354187123, "grad_norm": 0.3691863417625427, "learning_rate": 1.9112077430214617e-05, "loss": 0.0128, "step": 66060 }, { "epoch": 1.8535558984429794, "grad_norm": 0.0476066991686821, "learning_rate": 1.910740169261701e-05, "loss": 0.016, "step": 66070 }, { "epoch": 1.8538364426988356, "grad_norm": 0.051421742886304855, "learning_rate": 1.9102725955019403e-05, "loss": 0.0101, "step": 66080 }, { "epoch": 1.854116986954692, "grad_norm": 1.057364821434021, "learning_rate": 1.9098050217421796e-05, "loss": 0.0211, "step": 66090 }, { "epoch": 1.8543975312105485, "grad_norm": 0.045675501227378845, "learning_rate": 1.9093374479824193e-05, "loss": 0.0276, "step": 66100 }, { "epoch": 1.854678075466405, "grad_norm": 0.033820416778326035, "learning_rate": 1.9088698742226586e-05, "loss": 0.0529, "step": 66110 }, { "epoch": 1.8549586197222612, "grad_norm": 0.4139741361141205, "learning_rate": 1.9084023004628983e-05, "loss": 0.0445, "step": 66120 }, { "epoch": 1.8552391639781174, "grad_norm": 3.350478172302246, "learning_rate": 1.9079347267031376e-05, "loss": 0.0245, "step": 66130 }, { "epoch": 1.8555197082339738, "grad_norm": 0.03223452344536781, "learning_rate": 1.907467152943377e-05, "loss": 0.0196, "step": 66140 }, { "epoch": 1.8558002524898303, "grad_norm": 0.04550066590309143, "learning_rate": 1.9069995791836162e-05, "loss": 0.0198, "step": 66150 }, { "epoch": 1.8560807967456867, "grad_norm": 0.24578765034675598, "learning_rate": 1.9065320054238555e-05, "loss": 0.026, "step": 66160 }, { "epoch": 1.8563613410015432, "grad_norm": 2.807722330093384, "learning_rate": 1.9060644316640952e-05, "loss": 0.0393, "step": 66170 }, { "epoch": 1.8566418852573994, "grad_norm": 1.0397701263427734, "learning_rate": 1.9055968579043345e-05, "loss": 0.0189, "step": 66180 }, { "epoch": 1.8569224295132556, "grad_norm": 0.8220722079277039, "learning_rate": 1.9051292841445738e-05, "loss": 0.0235, "step": 66190 }, { "epoch": 1.857202973769112, "grad_norm": 0.13047447800636292, "learning_rate": 1.904661710384813e-05, "loss": 0.0188, "step": 66200 }, { "epoch": 1.8574835180249685, "grad_norm": 1.523612380027771, "learning_rate": 1.9041941366250528e-05, "loss": 0.0153, "step": 66210 }, { "epoch": 1.857764062280825, "grad_norm": 0.41223642230033875, "learning_rate": 1.903726562865292e-05, "loss": 0.0091, "step": 66220 }, { "epoch": 1.8580446065366811, "grad_norm": 0.0382796935737133, "learning_rate": 1.9032589891055314e-05, "loss": 0.0241, "step": 66230 }, { "epoch": 1.8583251507925376, "grad_norm": 0.20255108177661896, "learning_rate": 1.902791415345771e-05, "loss": 0.0204, "step": 66240 }, { "epoch": 1.8586056950483938, "grad_norm": 0.016442328691482544, "learning_rate": 1.9023238415860104e-05, "loss": 0.0285, "step": 66250 }, { "epoch": 1.8588862393042502, "grad_norm": 0.3316412568092346, "learning_rate": 1.9018562678262497e-05, "loss": 0.0442, "step": 66260 }, { "epoch": 1.8591667835601067, "grad_norm": 0.5637828707695007, "learning_rate": 1.901388694066489e-05, "loss": 0.0165, "step": 66270 }, { "epoch": 1.8594473278159631, "grad_norm": 0.5683085918426514, "learning_rate": 1.9009211203067283e-05, "loss": 0.0153, "step": 66280 }, { "epoch": 1.8597278720718193, "grad_norm": 0.4753537178039551, "learning_rate": 1.9004535465469677e-05, "loss": 0.0158, "step": 66290 }, { "epoch": 1.8600084163276756, "grad_norm": 0.665078341960907, "learning_rate": 1.8999859727872073e-05, "loss": 0.0267, "step": 66300 }, { "epoch": 1.860288960583532, "grad_norm": 0.031251151114702225, "learning_rate": 1.8995183990274466e-05, "loss": 0.0129, "step": 66310 }, { "epoch": 1.8605695048393884, "grad_norm": 0.28277966380119324, "learning_rate": 1.8990508252676863e-05, "loss": 0.0334, "step": 66320 }, { "epoch": 1.8608500490952449, "grad_norm": 0.026366982609033585, "learning_rate": 1.8985832515079256e-05, "loss": 0.0075, "step": 66330 }, { "epoch": 1.861130593351101, "grad_norm": 0.11826633661985397, "learning_rate": 1.898115677748165e-05, "loss": 0.0127, "step": 66340 }, { "epoch": 1.8614111376069575, "grad_norm": 0.2986895442008972, "learning_rate": 1.8976481039884042e-05, "loss": 0.0114, "step": 66350 }, { "epoch": 1.8616916818628138, "grad_norm": 0.016172630712389946, "learning_rate": 1.8971805302286435e-05, "loss": 0.0213, "step": 66360 }, { "epoch": 1.8619722261186702, "grad_norm": 2.9894261360168457, "learning_rate": 1.8967129564688832e-05, "loss": 0.0168, "step": 66370 }, { "epoch": 1.8622527703745266, "grad_norm": 0.04045763984322548, "learning_rate": 1.8962453827091225e-05, "loss": 0.0193, "step": 66380 }, { "epoch": 1.862533314630383, "grad_norm": 0.06061594560742378, "learning_rate": 1.8957778089493618e-05, "loss": 0.03, "step": 66390 }, { "epoch": 1.8628138588862393, "grad_norm": 0.020371928811073303, "learning_rate": 1.895310235189601e-05, "loss": 0.0085, "step": 66400 }, { "epoch": 1.8630944031420955, "grad_norm": 0.46184200048446655, "learning_rate": 1.8948426614298408e-05, "loss": 0.0079, "step": 66410 }, { "epoch": 1.863374947397952, "grad_norm": 0.3927205204963684, "learning_rate": 1.89437508767008e-05, "loss": 0.0331, "step": 66420 }, { "epoch": 1.8636554916538084, "grad_norm": 0.03873498737812042, "learning_rate": 1.8939075139103194e-05, "loss": 0.0351, "step": 66430 }, { "epoch": 1.8639360359096648, "grad_norm": 0.014168631285429, "learning_rate": 1.893439940150559e-05, "loss": 0.0067, "step": 66440 }, { "epoch": 1.8642165801655213, "grad_norm": 7.257327556610107, "learning_rate": 1.8929723663907984e-05, "loss": 0.017, "step": 66450 }, { "epoch": 1.8644971244213775, "grad_norm": 0.04563784971833229, "learning_rate": 1.8925047926310377e-05, "loss": 0.008, "step": 66460 }, { "epoch": 1.8647776686772337, "grad_norm": 0.2507133185863495, "learning_rate": 1.892037218871277e-05, "loss": 0.0078, "step": 66470 }, { "epoch": 1.8650582129330902, "grad_norm": 0.02555912733078003, "learning_rate": 1.8915696451115163e-05, "loss": 0.0282, "step": 66480 }, { "epoch": 1.8653387571889466, "grad_norm": 0.280422180891037, "learning_rate": 1.8911020713517557e-05, "loss": 0.0082, "step": 66490 }, { "epoch": 1.865619301444803, "grad_norm": 0.2168225795030594, "learning_rate": 1.890634497591995e-05, "loss": 0.0219, "step": 66500 }, { "epoch": 1.8658998457006593, "grad_norm": 0.31345435976982117, "learning_rate": 1.8901669238322346e-05, "loss": 0.0393, "step": 66510 }, { "epoch": 1.8661803899565157, "grad_norm": 0.9321370124816895, "learning_rate": 1.8896993500724743e-05, "loss": 0.0416, "step": 66520 }, { "epoch": 1.866460934212372, "grad_norm": 0.5159482955932617, "learning_rate": 1.8892317763127136e-05, "loss": 0.0239, "step": 66530 }, { "epoch": 1.8667414784682284, "grad_norm": 0.031627021729946136, "learning_rate": 1.888764202552953e-05, "loss": 0.0067, "step": 66540 }, { "epoch": 1.8670220227240848, "grad_norm": 0.03509160503745079, "learning_rate": 1.8882966287931922e-05, "loss": 0.0199, "step": 66550 }, { "epoch": 1.8673025669799412, "grad_norm": 0.05354198068380356, "learning_rate": 1.8878290550334315e-05, "loss": 0.0202, "step": 66560 }, { "epoch": 1.8675831112357975, "grad_norm": 0.03364976495504379, "learning_rate": 1.887361481273671e-05, "loss": 0.0348, "step": 66570 }, { "epoch": 1.8678636554916537, "grad_norm": 0.0585617758333683, "learning_rate": 1.8868939075139105e-05, "loss": 0.0242, "step": 66580 }, { "epoch": 1.8681441997475101, "grad_norm": 0.07411576062440872, "learning_rate": 1.8864263337541498e-05, "loss": 0.0266, "step": 66590 }, { "epoch": 1.8684247440033666, "grad_norm": 0.9479708075523376, "learning_rate": 1.885958759994389e-05, "loss": 0.0206, "step": 66600 }, { "epoch": 1.868705288259223, "grad_norm": 0.10491281747817993, "learning_rate": 1.8854911862346285e-05, "loss": 0.0337, "step": 66610 }, { "epoch": 1.8689858325150792, "grad_norm": 0.040323179215192795, "learning_rate": 1.885023612474868e-05, "loss": 0.0062, "step": 66620 }, { "epoch": 1.8692663767709357, "grad_norm": 0.10287593305110931, "learning_rate": 1.8845560387151074e-05, "loss": 0.0146, "step": 66630 }, { "epoch": 1.8695469210267919, "grad_norm": 0.07287994027137756, "learning_rate": 1.8840884649553467e-05, "loss": 0.0737, "step": 66640 }, { "epoch": 1.8698274652826483, "grad_norm": 0.04290309548377991, "learning_rate": 1.8836208911955864e-05, "loss": 0.026, "step": 66650 }, { "epoch": 1.8701080095385048, "grad_norm": 0.30679240822792053, "learning_rate": 1.8831533174358257e-05, "loss": 0.0114, "step": 66660 }, { "epoch": 1.8703885537943612, "grad_norm": 0.013782022520899773, "learning_rate": 1.882685743676065e-05, "loss": 0.0131, "step": 66670 }, { "epoch": 1.8706690980502174, "grad_norm": 0.2084171175956726, "learning_rate": 1.8822181699163043e-05, "loss": 0.0111, "step": 66680 }, { "epoch": 1.8709496423060736, "grad_norm": 1.4814683198928833, "learning_rate": 1.8817505961565437e-05, "loss": 0.0495, "step": 66690 }, { "epoch": 1.87123018656193, "grad_norm": 2.072187662124634, "learning_rate": 1.881283022396783e-05, "loss": 0.0056, "step": 66700 }, { "epoch": 1.8715107308177865, "grad_norm": 0.042022936046123505, "learning_rate": 1.8808154486370226e-05, "loss": 0.0058, "step": 66710 }, { "epoch": 1.871791275073643, "grad_norm": 0.6974712610244751, "learning_rate": 1.880347874877262e-05, "loss": 0.0126, "step": 66720 }, { "epoch": 1.8720718193294992, "grad_norm": 0.05730225890874863, "learning_rate": 1.8798803011175016e-05, "loss": 0.0168, "step": 66730 }, { "epoch": 1.8723523635853556, "grad_norm": 0.05713118985295296, "learning_rate": 1.879412727357741e-05, "loss": 0.0167, "step": 66740 }, { "epoch": 1.8726329078412118, "grad_norm": 0.0501830168068409, "learning_rate": 1.8789451535979802e-05, "loss": 0.0353, "step": 66750 }, { "epoch": 1.8729134520970683, "grad_norm": 0.4929333031177521, "learning_rate": 1.8784775798382195e-05, "loss": 0.0144, "step": 66760 }, { "epoch": 1.8731939963529247, "grad_norm": 0.7568842172622681, "learning_rate": 1.878010006078459e-05, "loss": 0.0286, "step": 66770 }, { "epoch": 1.8734745406087812, "grad_norm": 0.04778061807155609, "learning_rate": 1.8775424323186982e-05, "loss": 0.0024, "step": 66780 }, { "epoch": 1.8737550848646374, "grad_norm": 0.6212310791015625, "learning_rate": 1.877074858558938e-05, "loss": 0.0332, "step": 66790 }, { "epoch": 1.8740356291204936, "grad_norm": 0.026261702179908752, "learning_rate": 1.876607284799177e-05, "loss": 0.0307, "step": 66800 }, { "epoch": 1.87431617337635, "grad_norm": 0.0749158039689064, "learning_rate": 1.8761397110394165e-05, "loss": 0.0292, "step": 66810 }, { "epoch": 1.8745967176322065, "grad_norm": 0.22461198270320892, "learning_rate": 1.875672137279656e-05, "loss": 0.0433, "step": 66820 }, { "epoch": 1.874877261888063, "grad_norm": 0.17878887057304382, "learning_rate": 1.8752045635198954e-05, "loss": 0.0148, "step": 66830 }, { "epoch": 1.8751578061439194, "grad_norm": 0.12005989998579025, "learning_rate": 1.8747369897601347e-05, "loss": 0.002, "step": 66840 }, { "epoch": 1.8754383503997756, "grad_norm": 0.23731212317943573, "learning_rate": 1.874269416000374e-05, "loss": 0.0331, "step": 66850 }, { "epoch": 1.8757188946556318, "grad_norm": 0.05714042857289314, "learning_rate": 1.8738018422406137e-05, "loss": 0.0251, "step": 66860 }, { "epoch": 1.8759994389114882, "grad_norm": 0.10545618087053299, "learning_rate": 1.873334268480853e-05, "loss": 0.0428, "step": 66870 }, { "epoch": 1.8762799831673447, "grad_norm": 0.04819081351161003, "learning_rate": 1.8728666947210923e-05, "loss": 0.0207, "step": 66880 }, { "epoch": 1.8765605274232011, "grad_norm": 1.2044411897659302, "learning_rate": 1.8723991209613317e-05, "loss": 0.0274, "step": 66890 }, { "epoch": 1.8768410716790573, "grad_norm": 0.1617783159017563, "learning_rate": 1.871931547201571e-05, "loss": 0.0127, "step": 66900 }, { "epoch": 1.8771216159349138, "grad_norm": 0.28727447986602783, "learning_rate": 1.8714639734418103e-05, "loss": 0.0089, "step": 66910 }, { "epoch": 1.87740216019077, "grad_norm": 0.7754648923873901, "learning_rate": 1.87099639968205e-05, "loss": 0.0345, "step": 66920 }, { "epoch": 1.8776827044466264, "grad_norm": 0.03332659974694252, "learning_rate": 1.8705288259222896e-05, "loss": 0.0111, "step": 66930 }, { "epoch": 1.8779632487024829, "grad_norm": 0.04090399667620659, "learning_rate": 1.870061252162529e-05, "loss": 0.0436, "step": 66940 }, { "epoch": 1.8782437929583393, "grad_norm": 0.057308562099933624, "learning_rate": 1.8695936784027682e-05, "loss": 0.0256, "step": 66950 }, { "epoch": 1.8785243372141955, "grad_norm": 0.36687910556793213, "learning_rate": 1.8691261046430075e-05, "loss": 0.0208, "step": 66960 }, { "epoch": 1.8788048814700518, "grad_norm": 0.423846960067749, "learning_rate": 1.868658530883247e-05, "loss": 0.0599, "step": 66970 }, { "epoch": 1.8790854257259082, "grad_norm": 0.7990120053291321, "learning_rate": 1.8681909571234862e-05, "loss": 0.0254, "step": 66980 }, { "epoch": 1.8793659699817646, "grad_norm": 0.09787289798259735, "learning_rate": 1.8677233833637255e-05, "loss": 0.0162, "step": 66990 }, { "epoch": 1.879646514237621, "grad_norm": 0.19291256368160248, "learning_rate": 1.867255809603965e-05, "loss": 0.0455, "step": 67000 }, { "epoch": 1.8799270584934773, "grad_norm": 0.13204289972782135, "learning_rate": 1.8667882358442045e-05, "loss": 0.0481, "step": 67010 }, { "epoch": 1.8802076027493337, "grad_norm": 0.3609159290790558, "learning_rate": 1.8663206620844438e-05, "loss": 0.0332, "step": 67020 }, { "epoch": 1.88048814700519, "grad_norm": 0.10041210055351257, "learning_rate": 1.8658530883246834e-05, "loss": 0.0091, "step": 67030 }, { "epoch": 1.8807686912610464, "grad_norm": 0.15860222280025482, "learning_rate": 1.8653855145649228e-05, "loss": 0.0234, "step": 67040 }, { "epoch": 1.8810492355169028, "grad_norm": 2.83966064453125, "learning_rate": 1.864917940805162e-05, "loss": 0.02, "step": 67050 }, { "epoch": 1.8813297797727593, "grad_norm": 0.36053743958473206, "learning_rate": 1.8644503670454014e-05, "loss": 0.0364, "step": 67060 }, { "epoch": 1.8816103240286155, "grad_norm": 0.4368121922016144, "learning_rate": 1.863982793285641e-05, "loss": 0.0136, "step": 67070 }, { "epoch": 1.8818908682844717, "grad_norm": 0.30765727162361145, "learning_rate": 1.8635152195258804e-05, "loss": 0.0117, "step": 67080 }, { "epoch": 1.8821714125403282, "grad_norm": 0.01660696417093277, "learning_rate": 1.8630476457661197e-05, "loss": 0.0364, "step": 67090 }, { "epoch": 1.8824519567961846, "grad_norm": 0.058753788471221924, "learning_rate": 1.862580072006359e-05, "loss": 0.0106, "step": 67100 }, { "epoch": 1.882732501052041, "grad_norm": 7.18419885635376, "learning_rate": 1.8621124982465983e-05, "loss": 0.0082, "step": 67110 }, { "epoch": 1.8830130453078975, "grad_norm": 0.3063187599182129, "learning_rate": 1.861644924486838e-05, "loss": 0.0249, "step": 67120 }, { "epoch": 1.8832935895637537, "grad_norm": 0.7046062350273132, "learning_rate": 1.8611773507270773e-05, "loss": 0.0654, "step": 67130 }, { "epoch": 1.88357413381961, "grad_norm": 0.4165627062320709, "learning_rate": 1.860709776967317e-05, "loss": 0.0264, "step": 67140 }, { "epoch": 1.8838546780754664, "grad_norm": 0.7181987166404724, "learning_rate": 1.8602422032075562e-05, "loss": 0.0412, "step": 67150 }, { "epoch": 1.8841352223313228, "grad_norm": 0.06883709877729416, "learning_rate": 1.8597746294477956e-05, "loss": 0.0316, "step": 67160 }, { "epoch": 1.8844157665871792, "grad_norm": 0.1809278130531311, "learning_rate": 1.859307055688035e-05, "loss": 0.0114, "step": 67170 }, { "epoch": 1.8846963108430355, "grad_norm": 0.019806096330285072, "learning_rate": 1.8588394819282742e-05, "loss": 0.0112, "step": 67180 }, { "epoch": 1.884976855098892, "grad_norm": 0.05020029470324516, "learning_rate": 1.8583719081685135e-05, "loss": 0.0037, "step": 67190 }, { "epoch": 1.8852573993547481, "grad_norm": 1.437233328819275, "learning_rate": 1.8579043344087528e-05, "loss": 0.0502, "step": 67200 }, { "epoch": 1.8855379436106046, "grad_norm": 0.10605430603027344, "learning_rate": 1.8574367606489925e-05, "loss": 0.0141, "step": 67210 }, { "epoch": 1.885818487866461, "grad_norm": 0.0648907870054245, "learning_rate": 1.8569691868892318e-05, "loss": 0.0417, "step": 67220 }, { "epoch": 1.8860990321223174, "grad_norm": 0.20245783030986786, "learning_rate": 1.8565016131294714e-05, "loss": 0.0146, "step": 67230 }, { "epoch": 1.8863795763781737, "grad_norm": 0.2871764898300171, "learning_rate": 1.8560340393697108e-05, "loss": 0.0256, "step": 67240 }, { "epoch": 1.8866601206340299, "grad_norm": 0.6292780041694641, "learning_rate": 1.85556646560995e-05, "loss": 0.0233, "step": 67250 }, { "epoch": 1.8869406648898863, "grad_norm": 0.2372797429561615, "learning_rate": 1.8550988918501894e-05, "loss": 0.0307, "step": 67260 }, { "epoch": 1.8872212091457428, "grad_norm": 0.23543056845664978, "learning_rate": 1.8546313180904287e-05, "loss": 0.0565, "step": 67270 }, { "epoch": 1.8875017534015992, "grad_norm": 0.061850205063819885, "learning_rate": 1.8541637443306684e-05, "loss": 0.031, "step": 67280 }, { "epoch": 1.8877822976574554, "grad_norm": 0.1367514282464981, "learning_rate": 1.8536961705709077e-05, "loss": 0.0237, "step": 67290 }, { "epoch": 1.8880628419133119, "grad_norm": 0.047377828508615494, "learning_rate": 1.853228596811147e-05, "loss": 0.0103, "step": 67300 }, { "epoch": 1.888343386169168, "grad_norm": 0.5550222992897034, "learning_rate": 1.8527610230513863e-05, "loss": 0.0279, "step": 67310 }, { "epoch": 1.8886239304250245, "grad_norm": 0.04057091102004051, "learning_rate": 1.852293449291626e-05, "loss": 0.0257, "step": 67320 }, { "epoch": 1.888904474680881, "grad_norm": 1.7472158670425415, "learning_rate": 1.8518258755318653e-05, "loss": 0.0337, "step": 67330 }, { "epoch": 1.8891850189367374, "grad_norm": 0.030855568125844002, "learning_rate": 1.8513583017721046e-05, "loss": 0.0272, "step": 67340 }, { "epoch": 1.8894655631925936, "grad_norm": 0.5233986377716064, "learning_rate": 1.8508907280123442e-05, "loss": 0.0152, "step": 67350 }, { "epoch": 1.8897461074484498, "grad_norm": 0.024112023413181305, "learning_rate": 1.8504231542525836e-05, "loss": 0.0417, "step": 67360 }, { "epoch": 1.8900266517043063, "grad_norm": 0.06005192548036575, "learning_rate": 1.849955580492823e-05, "loss": 0.0132, "step": 67370 }, { "epoch": 1.8903071959601627, "grad_norm": 0.5631330609321594, "learning_rate": 1.8494880067330622e-05, "loss": 0.0197, "step": 67380 }, { "epoch": 1.8905877402160192, "grad_norm": 1.4650875329971313, "learning_rate": 1.8490204329733015e-05, "loss": 0.0348, "step": 67390 }, { "epoch": 1.8908682844718756, "grad_norm": 0.04806872457265854, "learning_rate": 1.8485528592135408e-05, "loss": 0.0176, "step": 67400 }, { "epoch": 1.8911488287277318, "grad_norm": 0.3025963008403778, "learning_rate": 1.84808528545378e-05, "loss": 0.0212, "step": 67410 }, { "epoch": 1.891429372983588, "grad_norm": 0.05289151147007942, "learning_rate": 1.8476177116940198e-05, "loss": 0.0104, "step": 67420 }, { "epoch": 1.8917099172394445, "grad_norm": 0.6139508485794067, "learning_rate": 1.8471501379342594e-05, "loss": 0.0412, "step": 67430 }, { "epoch": 1.891990461495301, "grad_norm": 1.012760877609253, "learning_rate": 1.8466825641744988e-05, "loss": 0.0185, "step": 67440 }, { "epoch": 1.8922710057511574, "grad_norm": 0.04715510085225105, "learning_rate": 1.846214990414738e-05, "loss": 0.005, "step": 67450 }, { "epoch": 1.8925515500070136, "grad_norm": 0.055024296045303345, "learning_rate": 1.8457474166549774e-05, "loss": 0.023, "step": 67460 }, { "epoch": 1.8928320942628698, "grad_norm": 0.511457622051239, "learning_rate": 1.8452798428952167e-05, "loss": 0.0516, "step": 67470 }, { "epoch": 1.8931126385187262, "grad_norm": 0.6154016852378845, "learning_rate": 1.844812269135456e-05, "loss": 0.0189, "step": 67480 }, { "epoch": 1.8933931827745827, "grad_norm": 0.4005107879638672, "learning_rate": 1.8443446953756957e-05, "loss": 0.0298, "step": 67490 }, { "epoch": 1.8936737270304391, "grad_norm": 0.25106289982795715, "learning_rate": 1.843877121615935e-05, "loss": 0.0167, "step": 67500 }, { "epoch": 1.8939542712862956, "grad_norm": 0.14490008354187012, "learning_rate": 1.8434095478561743e-05, "loss": 0.0234, "step": 67510 }, { "epoch": 1.8942348155421518, "grad_norm": 0.2116868793964386, "learning_rate": 1.8429419740964136e-05, "loss": 0.0238, "step": 67520 }, { "epoch": 1.894515359798008, "grad_norm": 0.016940131783485413, "learning_rate": 1.8424744003366533e-05, "loss": 0.0112, "step": 67530 }, { "epoch": 1.8947959040538644, "grad_norm": 0.48991671204566956, "learning_rate": 1.8420068265768926e-05, "loss": 0.0193, "step": 67540 }, { "epoch": 1.8950764483097209, "grad_norm": 0.15572503209114075, "learning_rate": 1.841539252817132e-05, "loss": 0.0293, "step": 67550 }, { "epoch": 1.8953569925655773, "grad_norm": 0.04420420154929161, "learning_rate": 1.8410716790573716e-05, "loss": 0.0465, "step": 67560 }, { "epoch": 1.8956375368214335, "grad_norm": 0.15096110105514526, "learning_rate": 1.840604105297611e-05, "loss": 0.0305, "step": 67570 }, { "epoch": 1.89591808107729, "grad_norm": 0.0910489410161972, "learning_rate": 1.8401365315378502e-05, "loss": 0.039, "step": 67580 }, { "epoch": 1.8961986253331462, "grad_norm": 0.052686579525470734, "learning_rate": 1.8396689577780895e-05, "loss": 0.0161, "step": 67590 }, { "epoch": 1.8964791695890026, "grad_norm": 0.22920013964176178, "learning_rate": 1.8392013840183288e-05, "loss": 0.0236, "step": 67600 }, { "epoch": 1.896759713844859, "grad_norm": 0.16414350271224976, "learning_rate": 1.838733810258568e-05, "loss": 0.0177, "step": 67610 }, { "epoch": 1.8970402581007155, "grad_norm": 0.10180466622114182, "learning_rate": 1.8382662364988078e-05, "loss": 0.0274, "step": 67620 }, { "epoch": 1.8973208023565717, "grad_norm": 0.27223989367485046, "learning_rate": 1.837798662739047e-05, "loss": 0.0182, "step": 67630 }, { "epoch": 1.897601346612428, "grad_norm": 0.02244557812809944, "learning_rate": 1.8373310889792868e-05, "loss": 0.0218, "step": 67640 }, { "epoch": 1.8978818908682844, "grad_norm": 0.5079793930053711, "learning_rate": 1.836863515219526e-05, "loss": 0.037, "step": 67650 }, { "epoch": 1.8981624351241408, "grad_norm": 0.0475359782576561, "learning_rate": 1.8363959414597654e-05, "loss": 0.0209, "step": 67660 }, { "epoch": 1.8984429793799973, "grad_norm": 0.1888798624277115, "learning_rate": 1.8359283677000047e-05, "loss": 0.0157, "step": 67670 }, { "epoch": 1.8987235236358535, "grad_norm": 3.8171324729919434, "learning_rate": 1.835460793940244e-05, "loss": 0.0345, "step": 67680 }, { "epoch": 1.89900406789171, "grad_norm": 0.02033955045044422, "learning_rate": 1.8349932201804837e-05, "loss": 0.0138, "step": 67690 }, { "epoch": 1.8992846121475662, "grad_norm": 0.09378324449062347, "learning_rate": 1.834525646420723e-05, "loss": 0.0851, "step": 67700 }, { "epoch": 1.8995651564034226, "grad_norm": 0.032201528549194336, "learning_rate": 1.8340580726609623e-05, "loss": 0.0349, "step": 67710 }, { "epoch": 1.899845700659279, "grad_norm": 0.14414072036743164, "learning_rate": 1.8335904989012016e-05, "loss": 0.0361, "step": 67720 }, { "epoch": 1.9001262449151355, "grad_norm": 1.3158156871795654, "learning_rate": 1.8331229251414413e-05, "loss": 0.0424, "step": 67730 }, { "epoch": 1.9004067891709917, "grad_norm": 0.35618171095848083, "learning_rate": 1.8326553513816806e-05, "loss": 0.0541, "step": 67740 }, { "epoch": 1.900687333426848, "grad_norm": 0.21564151346683502, "learning_rate": 1.83218777762192e-05, "loss": 0.0263, "step": 67750 }, { "epoch": 1.9009678776827044, "grad_norm": 0.9087278842926025, "learning_rate": 1.8317202038621596e-05, "loss": 0.026, "step": 67760 }, { "epoch": 1.9012484219385608, "grad_norm": 0.0918438732624054, "learning_rate": 1.831252630102399e-05, "loss": 0.0144, "step": 67770 }, { "epoch": 1.9015289661944172, "grad_norm": 0.16215871274471283, "learning_rate": 1.8307850563426382e-05, "loss": 0.0221, "step": 67780 }, { "epoch": 1.9018095104502737, "grad_norm": 0.10194717347621918, "learning_rate": 1.8303174825828775e-05, "loss": 0.0119, "step": 67790 }, { "epoch": 1.90209005470613, "grad_norm": 0.14704066514968872, "learning_rate": 1.8298499088231168e-05, "loss": 0.0107, "step": 67800 }, { "epoch": 1.9023705989619861, "grad_norm": 0.33533430099487305, "learning_rate": 1.829382335063356e-05, "loss": 0.0222, "step": 67810 }, { "epoch": 1.9026511432178426, "grad_norm": 0.05694631487131119, "learning_rate": 1.8289147613035955e-05, "loss": 0.0302, "step": 67820 }, { "epoch": 1.902931687473699, "grad_norm": 0.25848469138145447, "learning_rate": 1.828447187543835e-05, "loss": 0.0188, "step": 67830 }, { "epoch": 1.9032122317295554, "grad_norm": 0.058452337980270386, "learning_rate": 1.8279796137840748e-05, "loss": 0.0343, "step": 67840 }, { "epoch": 1.9034927759854117, "grad_norm": 0.19691918790340424, "learning_rate": 1.827512040024314e-05, "loss": 0.0257, "step": 67850 }, { "epoch": 1.903773320241268, "grad_norm": 0.4619278013706207, "learning_rate": 1.8270444662645534e-05, "loss": 0.0257, "step": 67860 }, { "epoch": 1.9040538644971243, "grad_norm": 0.22541822493076324, "learning_rate": 1.8265768925047927e-05, "loss": 0.0127, "step": 67870 }, { "epoch": 1.9043344087529808, "grad_norm": 0.04323374107480049, "learning_rate": 1.826109318745032e-05, "loss": 0.0097, "step": 67880 }, { "epoch": 1.9046149530088372, "grad_norm": 0.02096470631659031, "learning_rate": 1.8256417449852713e-05, "loss": 0.0215, "step": 67890 }, { "epoch": 1.9048954972646936, "grad_norm": 0.023010941222310066, "learning_rate": 1.825174171225511e-05, "loss": 0.0082, "step": 67900 }, { "epoch": 1.9051760415205499, "grad_norm": 0.11214902997016907, "learning_rate": 1.8247065974657503e-05, "loss": 0.0236, "step": 67910 }, { "epoch": 1.905456585776406, "grad_norm": 0.3959220051765442, "learning_rate": 1.8242390237059896e-05, "loss": 0.0176, "step": 67920 }, { "epoch": 1.9057371300322625, "grad_norm": 0.7208514213562012, "learning_rate": 1.823771449946229e-05, "loss": 0.0255, "step": 67930 }, { "epoch": 1.906017674288119, "grad_norm": 0.3059827387332916, "learning_rate": 1.8233038761864686e-05, "loss": 0.0215, "step": 67940 }, { "epoch": 1.9062982185439754, "grad_norm": 0.28773730993270874, "learning_rate": 1.822836302426708e-05, "loss": 0.0237, "step": 67950 }, { "epoch": 1.9065787627998316, "grad_norm": 0.365166574716568, "learning_rate": 1.8223687286669472e-05, "loss": 0.0305, "step": 67960 }, { "epoch": 1.906859307055688, "grad_norm": 0.017270047217607498, "learning_rate": 1.821901154907187e-05, "loss": 0.0092, "step": 67970 }, { "epoch": 1.9071398513115443, "grad_norm": 0.007890022359788418, "learning_rate": 1.8214335811474262e-05, "loss": 0.0136, "step": 67980 }, { "epoch": 1.9074203955674007, "grad_norm": 0.03782472014427185, "learning_rate": 1.8209660073876655e-05, "loss": 0.022, "step": 67990 }, { "epoch": 1.9077009398232572, "grad_norm": 0.4562471807003021, "learning_rate": 1.820498433627905e-05, "loss": 0.0311, "step": 68000 }, { "epoch": 1.9079814840791136, "grad_norm": 0.05321994051337242, "learning_rate": 1.820030859868144e-05, "loss": 0.0204, "step": 68010 }, { "epoch": 1.9082620283349698, "grad_norm": 0.01898195594549179, "learning_rate": 1.8195632861083835e-05, "loss": 0.0065, "step": 68020 }, { "epoch": 1.908542572590826, "grad_norm": 0.15095046162605286, "learning_rate": 1.819095712348623e-05, "loss": 0.0195, "step": 68030 }, { "epoch": 1.9088231168466825, "grad_norm": 0.14915774762630463, "learning_rate": 1.8186281385888628e-05, "loss": 0.0225, "step": 68040 }, { "epoch": 1.909103661102539, "grad_norm": 0.05480426549911499, "learning_rate": 1.818160564829102e-05, "loss": 0.0074, "step": 68050 }, { "epoch": 1.9093842053583954, "grad_norm": 0.01713491417467594, "learning_rate": 1.8176929910693414e-05, "loss": 0.0074, "step": 68060 }, { "epoch": 1.9096647496142518, "grad_norm": 0.4341367781162262, "learning_rate": 1.8172254173095807e-05, "loss": 0.0093, "step": 68070 }, { "epoch": 1.909945293870108, "grad_norm": 0.2600352168083191, "learning_rate": 1.81675784354982e-05, "loss": 0.0222, "step": 68080 }, { "epoch": 1.9102258381259642, "grad_norm": 0.5201166272163391, "learning_rate": 1.8162902697900594e-05, "loss": 0.0159, "step": 68090 }, { "epoch": 1.9105063823818207, "grad_norm": 0.22866956889629364, "learning_rate": 1.8158226960302987e-05, "loss": 0.0171, "step": 68100 }, { "epoch": 1.9107869266376771, "grad_norm": 0.16388647258281708, "learning_rate": 1.8153551222705383e-05, "loss": 0.0101, "step": 68110 }, { "epoch": 1.9110674708935336, "grad_norm": 0.03251909837126732, "learning_rate": 1.8148875485107776e-05, "loss": 0.0312, "step": 68120 }, { "epoch": 1.9113480151493898, "grad_norm": 0.09247944504022598, "learning_rate": 1.814419974751017e-05, "loss": 0.0345, "step": 68130 }, { "epoch": 1.9116285594052462, "grad_norm": 1.4064674377441406, "learning_rate": 1.8139524009912566e-05, "loss": 0.0134, "step": 68140 }, { "epoch": 1.9119091036611024, "grad_norm": 0.03613925352692604, "learning_rate": 1.813484827231496e-05, "loss": 0.0093, "step": 68150 }, { "epoch": 1.912189647916959, "grad_norm": 0.00973998848348856, "learning_rate": 1.8130172534717352e-05, "loss": 0.0124, "step": 68160 }, { "epoch": 1.9124701921728153, "grad_norm": 0.012450972571969032, "learning_rate": 1.8125496797119746e-05, "loss": 0.0278, "step": 68170 }, { "epoch": 1.9127507364286718, "grad_norm": 0.43718570470809937, "learning_rate": 1.8120821059522142e-05, "loss": 0.0273, "step": 68180 }, { "epoch": 1.913031280684528, "grad_norm": 0.08011370897293091, "learning_rate": 1.8116145321924535e-05, "loss": 0.0248, "step": 68190 }, { "epoch": 1.9133118249403842, "grad_norm": 0.028871456161141396, "learning_rate": 1.811146958432693e-05, "loss": 0.0294, "step": 68200 }, { "epoch": 1.9135923691962407, "grad_norm": 0.12465938925743103, "learning_rate": 1.810679384672932e-05, "loss": 0.0262, "step": 68210 }, { "epoch": 1.913872913452097, "grad_norm": 0.04917721822857857, "learning_rate": 1.8102118109131715e-05, "loss": 0.0079, "step": 68220 }, { "epoch": 1.9141534577079535, "grad_norm": 0.9026377201080322, "learning_rate": 1.809744237153411e-05, "loss": 0.0271, "step": 68230 }, { "epoch": 1.9144340019638098, "grad_norm": 0.6129225492477417, "learning_rate": 1.8092766633936504e-05, "loss": 0.02, "step": 68240 }, { "epoch": 1.9147145462196662, "grad_norm": 0.026544027030467987, "learning_rate": 1.80880908963389e-05, "loss": 0.0285, "step": 68250 }, { "epoch": 1.9149950904755224, "grad_norm": 0.00936639029532671, "learning_rate": 1.8083415158741294e-05, "loss": 0.0467, "step": 68260 }, { "epoch": 1.9152756347313789, "grad_norm": 0.19702228903770447, "learning_rate": 1.8078739421143687e-05, "loss": 0.0542, "step": 68270 }, { "epoch": 1.9155561789872353, "grad_norm": 0.16937965154647827, "learning_rate": 1.807406368354608e-05, "loss": 0.0253, "step": 68280 }, { "epoch": 1.9158367232430917, "grad_norm": 1.1248420476913452, "learning_rate": 1.8069387945948474e-05, "loss": 0.0338, "step": 68290 }, { "epoch": 1.916117267498948, "grad_norm": 0.23268768191337585, "learning_rate": 1.8064712208350867e-05, "loss": 0.0404, "step": 68300 }, { "epoch": 1.9163978117548042, "grad_norm": 0.09197933971881866, "learning_rate": 1.806003647075326e-05, "loss": 0.0146, "step": 68310 }, { "epoch": 1.9166783560106606, "grad_norm": 4.909285068511963, "learning_rate": 1.8055360733155656e-05, "loss": 0.0322, "step": 68320 }, { "epoch": 1.916958900266517, "grad_norm": 0.14103849232196808, "learning_rate": 1.805068499555805e-05, "loss": 0.031, "step": 68330 }, { "epoch": 1.9172394445223735, "grad_norm": 0.23594816029071808, "learning_rate": 1.8046009257960446e-05, "loss": 0.023, "step": 68340 }, { "epoch": 1.9175199887782297, "grad_norm": 0.06320907920598984, "learning_rate": 1.804133352036284e-05, "loss": 0.0265, "step": 68350 }, { "epoch": 1.9178005330340862, "grad_norm": 0.12938274443149567, "learning_rate": 1.8036657782765232e-05, "loss": 0.0215, "step": 68360 }, { "epoch": 1.9180810772899424, "grad_norm": 0.11076421290636063, "learning_rate": 1.8031982045167626e-05, "loss": 0.0158, "step": 68370 }, { "epoch": 1.9183616215457988, "grad_norm": 0.32968831062316895, "learning_rate": 1.802730630757002e-05, "loss": 0.0374, "step": 68380 }, { "epoch": 1.9186421658016553, "grad_norm": 0.21505548059940338, "learning_rate": 1.8022630569972415e-05, "loss": 0.0154, "step": 68390 }, { "epoch": 1.9189227100575117, "grad_norm": 0.047874707728624344, "learning_rate": 1.801795483237481e-05, "loss": 0.0184, "step": 68400 }, { "epoch": 1.919203254313368, "grad_norm": 0.23364415764808655, "learning_rate": 1.80132790947772e-05, "loss": 0.009, "step": 68410 }, { "epoch": 1.9194837985692241, "grad_norm": 1.956883430480957, "learning_rate": 1.8008603357179595e-05, "loss": 0.0432, "step": 68420 }, { "epoch": 1.9197643428250806, "grad_norm": 0.21457327902317047, "learning_rate": 1.8003927619581988e-05, "loss": 0.023, "step": 68430 }, { "epoch": 1.920044887080937, "grad_norm": 0.8582339882850647, "learning_rate": 1.7999251881984384e-05, "loss": 0.0526, "step": 68440 }, { "epoch": 1.9203254313367935, "grad_norm": 0.532575249671936, "learning_rate": 1.7994576144386778e-05, "loss": 0.0319, "step": 68450 }, { "epoch": 1.92060597559265, "grad_norm": 0.17051547765731812, "learning_rate": 1.7989900406789174e-05, "loss": 0.0219, "step": 68460 }, { "epoch": 1.9208865198485061, "grad_norm": 0.047616127878427505, "learning_rate": 1.7985224669191567e-05, "loss": 0.0116, "step": 68470 }, { "epoch": 1.9211670641043623, "grad_norm": 0.9666389226913452, "learning_rate": 1.798054893159396e-05, "loss": 0.0393, "step": 68480 }, { "epoch": 1.9214476083602188, "grad_norm": 0.051143351942300797, "learning_rate": 1.7975873193996354e-05, "loss": 0.0075, "step": 68490 }, { "epoch": 1.9217281526160752, "grad_norm": 0.058327168226242065, "learning_rate": 1.7971197456398747e-05, "loss": 0.0087, "step": 68500 }, { "epoch": 1.9220086968719317, "grad_norm": 0.6070728302001953, "learning_rate": 1.796652171880114e-05, "loss": 0.0141, "step": 68510 }, { "epoch": 1.9222892411277879, "grad_norm": 0.6399669051170349, "learning_rate": 1.7961845981203533e-05, "loss": 0.0559, "step": 68520 }, { "epoch": 1.9225697853836443, "grad_norm": 0.31986212730407715, "learning_rate": 1.795717024360593e-05, "loss": 0.0179, "step": 68530 }, { "epoch": 1.9228503296395005, "grad_norm": 0.025453822687268257, "learning_rate": 1.7952494506008323e-05, "loss": 0.0227, "step": 68540 }, { "epoch": 1.923130873895357, "grad_norm": 0.8205231428146362, "learning_rate": 1.794781876841072e-05, "loss": 0.0085, "step": 68550 }, { "epoch": 1.9234114181512134, "grad_norm": 0.17213384807109833, "learning_rate": 1.7943143030813112e-05, "loss": 0.015, "step": 68560 }, { "epoch": 1.9236919624070699, "grad_norm": 0.025179455056786537, "learning_rate": 1.7938467293215506e-05, "loss": 0.0467, "step": 68570 }, { "epoch": 1.923972506662926, "grad_norm": 0.09432859718799591, "learning_rate": 1.79337915556179e-05, "loss": 0.0101, "step": 68580 }, { "epoch": 1.9242530509187823, "grad_norm": 0.02841949090361595, "learning_rate": 1.7929115818020292e-05, "loss": 0.0221, "step": 68590 }, { "epoch": 1.9245335951746387, "grad_norm": 0.05061596259474754, "learning_rate": 1.792444008042269e-05, "loss": 0.0543, "step": 68600 }, { "epoch": 1.9248141394304952, "grad_norm": 0.3831184208393097, "learning_rate": 1.791976434282508e-05, "loss": 0.0189, "step": 68610 }, { "epoch": 1.9250946836863516, "grad_norm": 0.08087338507175446, "learning_rate": 1.7915088605227475e-05, "loss": 0.0103, "step": 68620 }, { "epoch": 1.9253752279422078, "grad_norm": 0.14402617514133453, "learning_rate": 1.7910412867629868e-05, "loss": 0.0098, "step": 68630 }, { "epoch": 1.9256557721980643, "grad_norm": 0.7458906173706055, "learning_rate": 1.7905737130032264e-05, "loss": 0.0384, "step": 68640 }, { "epoch": 1.9259363164539205, "grad_norm": 0.06204479932785034, "learning_rate": 1.7901061392434658e-05, "loss": 0.0319, "step": 68650 }, { "epoch": 1.926216860709777, "grad_norm": 0.306590735912323, "learning_rate": 1.789638565483705e-05, "loss": 0.0074, "step": 68660 }, { "epoch": 1.9264974049656334, "grad_norm": 1.2457209825515747, "learning_rate": 1.7891709917239447e-05, "loss": 0.0387, "step": 68670 }, { "epoch": 1.9267779492214898, "grad_norm": 0.16977277398109436, "learning_rate": 1.788703417964184e-05, "loss": 0.0308, "step": 68680 }, { "epoch": 1.927058493477346, "grad_norm": 0.3242484927177429, "learning_rate": 1.7882358442044234e-05, "loss": 0.0195, "step": 68690 }, { "epoch": 1.9273390377332023, "grad_norm": 0.06228184327483177, "learning_rate": 1.7877682704446627e-05, "loss": 0.0107, "step": 68700 }, { "epoch": 1.9276195819890587, "grad_norm": 0.050460249185562134, "learning_rate": 1.787300696684902e-05, "loss": 0.0255, "step": 68710 }, { "epoch": 1.9279001262449151, "grad_norm": 2.2465715408325195, "learning_rate": 1.7868331229251413e-05, "loss": 0.0444, "step": 68720 }, { "epoch": 1.9281806705007716, "grad_norm": 0.43111997842788696, "learning_rate": 1.7863655491653806e-05, "loss": 0.0209, "step": 68730 }, { "epoch": 1.928461214756628, "grad_norm": 0.09205399453639984, "learning_rate": 1.7858979754056203e-05, "loss": 0.0255, "step": 68740 }, { "epoch": 1.9287417590124842, "grad_norm": 0.2674558460712433, "learning_rate": 1.78543040164586e-05, "loss": 0.0167, "step": 68750 }, { "epoch": 1.9290223032683405, "grad_norm": 0.1863476037979126, "learning_rate": 1.7849628278860993e-05, "loss": 0.0084, "step": 68760 }, { "epoch": 1.929302847524197, "grad_norm": 0.26648199558258057, "learning_rate": 1.7844952541263386e-05, "loss": 0.012, "step": 68770 }, { "epoch": 1.9295833917800533, "grad_norm": 0.03713773563504219, "learning_rate": 1.784027680366578e-05, "loss": 0.0397, "step": 68780 }, { "epoch": 1.9298639360359098, "grad_norm": 0.9454395174980164, "learning_rate": 1.7835601066068172e-05, "loss": 0.0282, "step": 68790 }, { "epoch": 1.930144480291766, "grad_norm": 0.07272239029407501, "learning_rate": 1.7830925328470565e-05, "loss": 0.018, "step": 68800 }, { "epoch": 1.9304250245476224, "grad_norm": 0.06860581785440445, "learning_rate": 1.782624959087296e-05, "loss": 0.0123, "step": 68810 }, { "epoch": 1.9307055688034787, "grad_norm": 0.6022542119026184, "learning_rate": 1.7821573853275355e-05, "loss": 0.02, "step": 68820 }, { "epoch": 1.930986113059335, "grad_norm": 0.7365586161613464, "learning_rate": 1.7816898115677748e-05, "loss": 0.0125, "step": 68830 }, { "epoch": 1.9312666573151915, "grad_norm": 0.024189556017518044, "learning_rate": 1.781222237808014e-05, "loss": 0.0097, "step": 68840 }, { "epoch": 1.931547201571048, "grad_norm": 1.1018022298812866, "learning_rate": 1.7807546640482538e-05, "loss": 0.0357, "step": 68850 }, { "epoch": 1.9318277458269042, "grad_norm": 0.008555002510547638, "learning_rate": 1.780287090288493e-05, "loss": 0.0196, "step": 68860 }, { "epoch": 1.9321082900827604, "grad_norm": 0.08853679895401001, "learning_rate": 1.7798195165287324e-05, "loss": 0.0159, "step": 68870 }, { "epoch": 1.9323888343386169, "grad_norm": 0.013829195871949196, "learning_rate": 1.779351942768972e-05, "loss": 0.0119, "step": 68880 }, { "epoch": 1.9326693785944733, "grad_norm": 0.17225281894207, "learning_rate": 1.7788843690092114e-05, "loss": 0.0283, "step": 68890 }, { "epoch": 1.9329499228503297, "grad_norm": 0.05245345085859299, "learning_rate": 1.7784167952494507e-05, "loss": 0.0117, "step": 68900 }, { "epoch": 1.933230467106186, "grad_norm": 0.5146588087081909, "learning_rate": 1.77794922148969e-05, "loss": 0.0349, "step": 68910 }, { "epoch": 1.9335110113620424, "grad_norm": 0.9936693906784058, "learning_rate": 1.7774816477299293e-05, "loss": 0.0418, "step": 68920 }, { "epoch": 1.9337915556178986, "grad_norm": 0.06762772053480148, "learning_rate": 1.7770140739701686e-05, "loss": 0.0101, "step": 68930 }, { "epoch": 1.934072099873755, "grad_norm": 0.9471529126167297, "learning_rate": 1.7765465002104083e-05, "loss": 0.0369, "step": 68940 }, { "epoch": 1.9343526441296115, "grad_norm": 0.2170613408088684, "learning_rate": 1.776078926450648e-05, "loss": 0.0236, "step": 68950 }, { "epoch": 1.934633188385468, "grad_norm": 0.18089163303375244, "learning_rate": 1.7756113526908873e-05, "loss": 0.012, "step": 68960 }, { "epoch": 1.9349137326413242, "grad_norm": 0.19327519834041595, "learning_rate": 1.7751437789311266e-05, "loss": 0.0149, "step": 68970 }, { "epoch": 1.9351942768971804, "grad_norm": 0.01513429544866085, "learning_rate": 1.774676205171366e-05, "loss": 0.0061, "step": 68980 }, { "epoch": 1.9354748211530368, "grad_norm": 0.03238270804286003, "learning_rate": 1.7742086314116052e-05, "loss": 0.0877, "step": 68990 }, { "epoch": 1.9357553654088933, "grad_norm": 0.44138631224632263, "learning_rate": 1.7737410576518445e-05, "loss": 0.0213, "step": 69000 }, { "epoch": 1.9360359096647497, "grad_norm": 1.1583608388900757, "learning_rate": 1.7732734838920842e-05, "loss": 0.0352, "step": 69010 }, { "epoch": 1.9363164539206061, "grad_norm": 0.09899014234542847, "learning_rate": 1.7728059101323235e-05, "loss": 0.0205, "step": 69020 }, { "epoch": 1.9365969981764624, "grad_norm": 0.13866110146045685, "learning_rate": 1.7723383363725628e-05, "loss": 0.0257, "step": 69030 }, { "epoch": 1.9368775424323186, "grad_norm": 0.04452076554298401, "learning_rate": 1.771870762612802e-05, "loss": 0.009, "step": 69040 }, { "epoch": 1.937158086688175, "grad_norm": 0.643479585647583, "learning_rate": 1.7714031888530418e-05, "loss": 0.0162, "step": 69050 }, { "epoch": 1.9374386309440315, "grad_norm": 0.20316937565803528, "learning_rate": 1.770935615093281e-05, "loss": 0.0182, "step": 69060 }, { "epoch": 1.937719175199888, "grad_norm": 0.03733125701546669, "learning_rate": 1.7704680413335204e-05, "loss": 0.0069, "step": 69070 }, { "epoch": 1.9379997194557441, "grad_norm": 0.5240957140922546, "learning_rate": 1.77000046757376e-05, "loss": 0.0388, "step": 69080 }, { "epoch": 1.9382802637116006, "grad_norm": 0.2977186441421509, "learning_rate": 1.7695328938139994e-05, "loss": 0.0091, "step": 69090 }, { "epoch": 1.9385608079674568, "grad_norm": 0.4962044060230255, "learning_rate": 1.7690653200542387e-05, "loss": 0.0069, "step": 69100 }, { "epoch": 1.9388413522233132, "grad_norm": 0.22772009670734406, "learning_rate": 1.768597746294478e-05, "loss": 0.0168, "step": 69110 }, { "epoch": 1.9391218964791697, "grad_norm": 0.04441819712519646, "learning_rate": 1.7681301725347173e-05, "loss": 0.0486, "step": 69120 }, { "epoch": 1.939402440735026, "grad_norm": 0.35634297132492065, "learning_rate": 1.7676625987749566e-05, "loss": 0.0273, "step": 69130 }, { "epoch": 1.9396829849908823, "grad_norm": 0.26690906286239624, "learning_rate": 1.7671950250151963e-05, "loss": 0.0153, "step": 69140 }, { "epoch": 1.9399635292467385, "grad_norm": 0.017634423449635506, "learning_rate": 1.7667274512554356e-05, "loss": 0.0258, "step": 69150 }, { "epoch": 1.940244073502595, "grad_norm": 0.2807807922363281, "learning_rate": 1.7662598774956753e-05, "loss": 0.0234, "step": 69160 }, { "epoch": 1.9405246177584514, "grad_norm": 1.0244877338409424, "learning_rate": 1.7657923037359146e-05, "loss": 0.0286, "step": 69170 }, { "epoch": 1.9408051620143079, "grad_norm": 0.16333018243312836, "learning_rate": 1.765324729976154e-05, "loss": 0.0274, "step": 69180 }, { "epoch": 1.941085706270164, "grad_norm": 0.43799376487731934, "learning_rate": 1.7648571562163932e-05, "loss": 0.0188, "step": 69190 }, { "epoch": 1.9413662505260205, "grad_norm": 1.2382296323776245, "learning_rate": 1.7643895824566325e-05, "loss": 0.0451, "step": 69200 }, { "epoch": 1.9416467947818767, "grad_norm": 0.5825552940368652, "learning_rate": 1.763922008696872e-05, "loss": 0.037, "step": 69210 }, { "epoch": 1.9419273390377332, "grad_norm": 0.4045315086841583, "learning_rate": 1.7634544349371115e-05, "loss": 0.0321, "step": 69220 }, { "epoch": 1.9422078832935896, "grad_norm": 0.02452762797474861, "learning_rate": 1.7629868611773508e-05, "loss": 0.0563, "step": 69230 }, { "epoch": 1.942488427549446, "grad_norm": 0.07888671010732651, "learning_rate": 1.76251928741759e-05, "loss": 0.0448, "step": 69240 }, { "epoch": 1.9427689718053023, "grad_norm": 0.14450402557849884, "learning_rate": 1.7620517136578298e-05, "loss": 0.0191, "step": 69250 }, { "epoch": 1.9430495160611585, "grad_norm": 0.1221027672290802, "learning_rate": 1.761584139898069e-05, "loss": 0.0336, "step": 69260 }, { "epoch": 1.943330060317015, "grad_norm": 0.9242967963218689, "learning_rate": 1.7611165661383084e-05, "loss": 0.0383, "step": 69270 }, { "epoch": 1.9436106045728714, "grad_norm": 0.06791146099567413, "learning_rate": 1.7606489923785477e-05, "loss": 0.0387, "step": 69280 }, { "epoch": 1.9438911488287278, "grad_norm": 0.5956618785858154, "learning_rate": 1.7601814186187874e-05, "loss": 0.0172, "step": 69290 }, { "epoch": 1.944171693084584, "grad_norm": 0.27622145414352417, "learning_rate": 1.7597138448590267e-05, "loss": 0.0356, "step": 69300 }, { "epoch": 1.9444522373404405, "grad_norm": 0.12884937226772308, "learning_rate": 1.759246271099266e-05, "loss": 0.0222, "step": 69310 }, { "epoch": 1.9447327815962967, "grad_norm": 0.04461643472313881, "learning_rate": 1.7587786973395053e-05, "loss": 0.0078, "step": 69320 }, { "epoch": 1.9450133258521531, "grad_norm": 0.5189488530158997, "learning_rate": 1.7583111235797446e-05, "loss": 0.0175, "step": 69330 }, { "epoch": 1.9452938701080096, "grad_norm": 0.06946805864572525, "learning_rate": 1.757843549819984e-05, "loss": 0.0552, "step": 69340 }, { "epoch": 1.945574414363866, "grad_norm": 0.08170931786298752, "learning_rate": 1.7573759760602236e-05, "loss": 0.0292, "step": 69350 }, { "epoch": 1.9458549586197222, "grad_norm": 0.8735846281051636, "learning_rate": 1.7569084023004633e-05, "loss": 0.0332, "step": 69360 }, { "epoch": 1.9461355028755785, "grad_norm": 0.3558134436607361, "learning_rate": 1.7564408285407026e-05, "loss": 0.0339, "step": 69370 }, { "epoch": 1.946416047131435, "grad_norm": 0.282256156206131, "learning_rate": 1.755973254780942e-05, "loss": 0.0129, "step": 69380 }, { "epoch": 1.9466965913872913, "grad_norm": 0.2728719115257263, "learning_rate": 1.7555056810211812e-05, "loss": 0.0212, "step": 69390 }, { "epoch": 1.9469771356431478, "grad_norm": 0.03507731482386589, "learning_rate": 1.7550381072614205e-05, "loss": 0.0159, "step": 69400 }, { "epoch": 1.9472576798990042, "grad_norm": 0.023064415901899338, "learning_rate": 1.75457053350166e-05, "loss": 0.0093, "step": 69410 }, { "epoch": 1.9475382241548604, "grad_norm": 0.039038270711898804, "learning_rate": 1.754102959741899e-05, "loss": 0.0253, "step": 69420 }, { "epoch": 1.9478187684107167, "grad_norm": 0.03545006737112999, "learning_rate": 1.7536353859821388e-05, "loss": 0.0118, "step": 69430 }, { "epoch": 1.948099312666573, "grad_norm": 3.254142999649048, "learning_rate": 1.753167812222378e-05, "loss": 0.0631, "step": 69440 }, { "epoch": 1.9483798569224295, "grad_norm": 0.027601230889558792, "learning_rate": 1.7527002384626174e-05, "loss": 0.0153, "step": 69450 }, { "epoch": 1.948660401178286, "grad_norm": 0.02044757641851902, "learning_rate": 1.752232664702857e-05, "loss": 0.0051, "step": 69460 }, { "epoch": 1.9489409454341422, "grad_norm": 0.0864051878452301, "learning_rate": 1.7517650909430964e-05, "loss": 0.0549, "step": 69470 }, { "epoch": 1.9492214896899986, "grad_norm": 1.2527292966842651, "learning_rate": 1.7512975171833357e-05, "loss": 0.028, "step": 69480 }, { "epoch": 1.9495020339458549, "grad_norm": 0.030931388959288597, "learning_rate": 1.750829943423575e-05, "loss": 0.0631, "step": 69490 }, { "epoch": 1.9497825782017113, "grad_norm": 1.0119686126708984, "learning_rate": 1.7503623696638147e-05, "loss": 0.0143, "step": 69500 }, { "epoch": 1.9500631224575677, "grad_norm": 0.03952919691801071, "learning_rate": 1.749894795904054e-05, "loss": 0.0031, "step": 69510 }, { "epoch": 1.9503436667134242, "grad_norm": 0.6198864579200745, "learning_rate": 1.7494272221442933e-05, "loss": 0.0413, "step": 69520 }, { "epoch": 1.9506242109692804, "grad_norm": 0.10353223979473114, "learning_rate": 1.7489596483845326e-05, "loss": 0.0253, "step": 69530 }, { "epoch": 1.9509047552251366, "grad_norm": 0.34035661816596985, "learning_rate": 1.748492074624772e-05, "loss": 0.0591, "step": 69540 }, { "epoch": 1.951185299480993, "grad_norm": 0.02539021335542202, "learning_rate": 1.7480245008650116e-05, "loss": 0.0239, "step": 69550 }, { "epoch": 1.9514658437368495, "grad_norm": 0.08765698969364166, "learning_rate": 1.747556927105251e-05, "loss": 0.035, "step": 69560 }, { "epoch": 1.951746387992706, "grad_norm": 1.0155982971191406, "learning_rate": 1.7470893533454906e-05, "loss": 0.041, "step": 69570 }, { "epoch": 1.9520269322485622, "grad_norm": 0.07904400676488876, "learning_rate": 1.74662177958573e-05, "loss": 0.0408, "step": 69580 }, { "epoch": 1.9523074765044186, "grad_norm": 0.3955957293510437, "learning_rate": 1.7461542058259692e-05, "loss": 0.029, "step": 69590 }, { "epoch": 1.9525880207602748, "grad_norm": 0.9820129871368408, "learning_rate": 1.7456866320662085e-05, "loss": 0.0436, "step": 69600 }, { "epoch": 1.9528685650161313, "grad_norm": 0.2969701290130615, "learning_rate": 1.745219058306448e-05, "loss": 0.0098, "step": 69610 }, { "epoch": 1.9531491092719877, "grad_norm": 0.11913248151540756, "learning_rate": 1.744751484546687e-05, "loss": 0.0169, "step": 69620 }, { "epoch": 1.9534296535278441, "grad_norm": 0.06728685647249222, "learning_rate": 1.7442839107869265e-05, "loss": 0.0116, "step": 69630 }, { "epoch": 1.9537101977837004, "grad_norm": 0.24274227023124695, "learning_rate": 1.743816337027166e-05, "loss": 0.0192, "step": 69640 }, { "epoch": 1.9539907420395566, "grad_norm": 1.5769590139389038, "learning_rate": 1.7433487632674054e-05, "loss": 0.0162, "step": 69650 }, { "epoch": 1.954271286295413, "grad_norm": 0.10699215531349182, "learning_rate": 1.742881189507645e-05, "loss": 0.021, "step": 69660 }, { "epoch": 1.9545518305512695, "grad_norm": 0.037192802876234055, "learning_rate": 1.7424136157478844e-05, "loss": 0.0193, "step": 69670 }, { "epoch": 1.954832374807126, "grad_norm": 0.4232049286365509, "learning_rate": 1.7419460419881237e-05, "loss": 0.0199, "step": 69680 }, { "epoch": 1.9551129190629823, "grad_norm": 2.3874614238739014, "learning_rate": 1.741478468228363e-05, "loss": 0.0293, "step": 69690 }, { "epoch": 1.9553934633188386, "grad_norm": 0.40448465943336487, "learning_rate": 1.7410108944686024e-05, "loss": 0.0226, "step": 69700 }, { "epoch": 1.9556740075746948, "grad_norm": 0.04828820005059242, "learning_rate": 1.740543320708842e-05, "loss": 0.0427, "step": 69710 }, { "epoch": 1.9559545518305512, "grad_norm": 0.021061338484287262, "learning_rate": 1.7400757469490813e-05, "loss": 0.01, "step": 69720 }, { "epoch": 1.9562350960864077, "grad_norm": 0.3216798007488251, "learning_rate": 1.7396081731893207e-05, "loss": 0.0696, "step": 69730 }, { "epoch": 1.956515640342264, "grad_norm": 0.9593375325202942, "learning_rate": 1.73914059942956e-05, "loss": 0.0311, "step": 69740 }, { "epoch": 1.9567961845981203, "grad_norm": 0.13657735288143158, "learning_rate": 1.7386730256697993e-05, "loss": 0.0079, "step": 69750 }, { "epoch": 1.9570767288539768, "grad_norm": 1.4359171390533447, "learning_rate": 1.738205451910039e-05, "loss": 0.0227, "step": 69760 }, { "epoch": 1.957357273109833, "grad_norm": 0.05167895928025246, "learning_rate": 1.7377378781502783e-05, "loss": 0.0175, "step": 69770 }, { "epoch": 1.9576378173656894, "grad_norm": 0.41055983304977417, "learning_rate": 1.737270304390518e-05, "loss": 0.0259, "step": 69780 }, { "epoch": 1.9579183616215459, "grad_norm": 0.06534931808710098, "learning_rate": 1.7368027306307572e-05, "loss": 0.0172, "step": 69790 }, { "epoch": 1.9581989058774023, "grad_norm": 0.21875208616256714, "learning_rate": 1.7363351568709965e-05, "loss": 0.0156, "step": 69800 }, { "epoch": 1.9584794501332585, "grad_norm": 0.18967260420322418, "learning_rate": 1.735867583111236e-05, "loss": 0.0144, "step": 69810 }, { "epoch": 1.9587599943891147, "grad_norm": 0.041827570647001266, "learning_rate": 1.735400009351475e-05, "loss": 0.0119, "step": 69820 }, { "epoch": 1.9590405386449712, "grad_norm": 0.017384065315127373, "learning_rate": 1.7349324355917145e-05, "loss": 0.0078, "step": 69830 }, { "epoch": 1.9593210829008276, "grad_norm": 0.031357500702142715, "learning_rate": 1.7344648618319538e-05, "loss": 0.0159, "step": 69840 }, { "epoch": 1.959601627156684, "grad_norm": 0.34900185465812683, "learning_rate": 1.7339972880721935e-05, "loss": 0.0091, "step": 69850 }, { "epoch": 1.9598821714125403, "grad_norm": 0.37928369641304016, "learning_rate": 1.733529714312433e-05, "loss": 0.0219, "step": 69860 }, { "epoch": 1.9601627156683967, "grad_norm": 0.008421325124800205, "learning_rate": 1.7330621405526724e-05, "loss": 0.0186, "step": 69870 }, { "epoch": 1.960443259924253, "grad_norm": 0.27237680554389954, "learning_rate": 1.7325945667929117e-05, "loss": 0.015, "step": 69880 }, { "epoch": 1.9607238041801094, "grad_norm": 0.020770519971847534, "learning_rate": 1.732126993033151e-05, "loss": 0.0105, "step": 69890 }, { "epoch": 1.9610043484359658, "grad_norm": 0.3115159571170807, "learning_rate": 1.7316594192733904e-05, "loss": 0.0565, "step": 69900 }, { "epoch": 1.9612848926918223, "grad_norm": 0.009981553070247173, "learning_rate": 1.7311918455136297e-05, "loss": 0.0502, "step": 69910 }, { "epoch": 1.9615654369476785, "grad_norm": 2.775609254837036, "learning_rate": 1.7307242717538693e-05, "loss": 0.0169, "step": 69920 }, { "epoch": 1.9618459812035347, "grad_norm": 0.301119863986969, "learning_rate": 1.7302566979941087e-05, "loss": 0.0264, "step": 69930 }, { "epoch": 1.9621265254593911, "grad_norm": 0.02161199226975441, "learning_rate": 1.729789124234348e-05, "loss": 0.0188, "step": 69940 }, { "epoch": 1.9624070697152476, "grad_norm": 0.7283644676208496, "learning_rate": 1.7293215504745873e-05, "loss": 0.0275, "step": 69950 }, { "epoch": 1.962687613971104, "grad_norm": 0.01860804855823517, "learning_rate": 1.728853976714827e-05, "loss": 0.0152, "step": 69960 }, { "epoch": 1.9629681582269605, "grad_norm": 0.03909334912896156, "learning_rate": 1.7283864029550663e-05, "loss": 0.0054, "step": 69970 }, { "epoch": 1.9632487024828167, "grad_norm": 0.2424464076757431, "learning_rate": 1.7279188291953056e-05, "loss": 0.017, "step": 69980 }, { "epoch": 1.963529246738673, "grad_norm": 0.5969235897064209, "learning_rate": 1.7274512554355452e-05, "loss": 0.0155, "step": 69990 }, { "epoch": 1.9638097909945293, "grad_norm": 0.08777354657649994, "learning_rate": 1.7269836816757845e-05, "loss": 0.0576, "step": 70000 }, { "epoch": 1.9640903352503858, "grad_norm": 0.02284218929708004, "learning_rate": 1.726516107916024e-05, "loss": 0.0112, "step": 70010 }, { "epoch": 1.9643708795062422, "grad_norm": 0.30275607109069824, "learning_rate": 1.7260485341562632e-05, "loss": 0.0435, "step": 70020 }, { "epoch": 1.9646514237620984, "grad_norm": 0.2835124731063843, "learning_rate": 1.7255809603965025e-05, "loss": 0.0186, "step": 70030 }, { "epoch": 1.9649319680179547, "grad_norm": 0.8394071459770203, "learning_rate": 1.7251133866367418e-05, "loss": 0.0295, "step": 70040 }, { "epoch": 1.965212512273811, "grad_norm": 0.038419511169195175, "learning_rate": 1.7246458128769815e-05, "loss": 0.0223, "step": 70050 }, { "epoch": 1.9654930565296675, "grad_norm": 0.6069090962409973, "learning_rate": 1.7241782391172208e-05, "loss": 0.0168, "step": 70060 }, { "epoch": 1.965773600785524, "grad_norm": 0.5010778307914734, "learning_rate": 1.7237106653574604e-05, "loss": 0.0396, "step": 70070 }, { "epoch": 1.9660541450413804, "grad_norm": 0.042400211095809937, "learning_rate": 1.7232430915976997e-05, "loss": 0.0254, "step": 70080 }, { "epoch": 1.9663346892972366, "grad_norm": 0.7611352801322937, "learning_rate": 1.722775517837939e-05, "loss": 0.0414, "step": 70090 }, { "epoch": 1.9666152335530929, "grad_norm": 0.07167204469442368, "learning_rate": 1.7223079440781784e-05, "loss": 0.0301, "step": 70100 }, { "epoch": 1.9668957778089493, "grad_norm": 0.5101668834686279, "learning_rate": 1.7218403703184177e-05, "loss": 0.0212, "step": 70110 }, { "epoch": 1.9671763220648057, "grad_norm": 0.03741975128650665, "learning_rate": 1.721372796558657e-05, "loss": 0.0064, "step": 70120 }, { "epoch": 1.9674568663206622, "grad_norm": 0.04183134436607361, "learning_rate": 1.7209052227988967e-05, "loss": 0.0152, "step": 70130 }, { "epoch": 1.9677374105765184, "grad_norm": 0.7528977990150452, "learning_rate": 1.720437649039136e-05, "loss": 0.0178, "step": 70140 }, { "epoch": 1.9680179548323748, "grad_norm": 0.0811186134815216, "learning_rate": 1.7199700752793753e-05, "loss": 0.0258, "step": 70150 }, { "epoch": 1.968298499088231, "grad_norm": 0.3252735137939453, "learning_rate": 1.719502501519615e-05, "loss": 0.036, "step": 70160 }, { "epoch": 1.9685790433440875, "grad_norm": 0.13916385173797607, "learning_rate": 1.7190349277598543e-05, "loss": 0.0196, "step": 70170 }, { "epoch": 1.968859587599944, "grad_norm": 1.2661948204040527, "learning_rate": 1.7185673540000936e-05, "loss": 0.0212, "step": 70180 }, { "epoch": 1.9691401318558004, "grad_norm": 0.04814941808581352, "learning_rate": 1.718099780240333e-05, "loss": 0.0271, "step": 70190 }, { "epoch": 1.9694206761116566, "grad_norm": 0.09118592739105225, "learning_rate": 1.7176322064805725e-05, "loss": 0.024, "step": 70200 }, { "epoch": 1.9697012203675128, "grad_norm": 0.02686588279902935, "learning_rate": 1.717164632720812e-05, "loss": 0.0131, "step": 70210 }, { "epoch": 1.9699817646233693, "grad_norm": 0.25827333331108093, "learning_rate": 1.7166970589610512e-05, "loss": 0.0247, "step": 70220 }, { "epoch": 1.9702623088792257, "grad_norm": 0.07306474447250366, "learning_rate": 1.7162294852012905e-05, "loss": 0.0425, "step": 70230 }, { "epoch": 1.9705428531350822, "grad_norm": 0.05839666724205017, "learning_rate": 1.7157619114415298e-05, "loss": 0.0141, "step": 70240 }, { "epoch": 1.9708233973909384, "grad_norm": 0.058112647384405136, "learning_rate": 1.715294337681769e-05, "loss": 0.0188, "step": 70250 }, { "epoch": 1.9711039416467948, "grad_norm": 0.07376608997583389, "learning_rate": 1.7148267639220088e-05, "loss": 0.033, "step": 70260 }, { "epoch": 1.971384485902651, "grad_norm": 0.04753982275724411, "learning_rate": 1.7143591901622484e-05, "loss": 0.0401, "step": 70270 }, { "epoch": 1.9716650301585075, "grad_norm": 0.2927514612674713, "learning_rate": 1.7138916164024877e-05, "loss": 0.0175, "step": 70280 }, { "epoch": 1.971945574414364, "grad_norm": 0.7177024483680725, "learning_rate": 1.713424042642727e-05, "loss": 0.0212, "step": 70290 }, { "epoch": 1.9722261186702204, "grad_norm": 0.07212983071804047, "learning_rate": 1.7129564688829664e-05, "loss": 0.0214, "step": 70300 }, { "epoch": 1.9725066629260766, "grad_norm": 0.14228054881095886, "learning_rate": 1.7124888951232057e-05, "loss": 0.016, "step": 70310 }, { "epoch": 1.9727872071819328, "grad_norm": 0.19058053195476532, "learning_rate": 1.712021321363445e-05, "loss": 0.0065, "step": 70320 }, { "epoch": 1.9730677514377892, "grad_norm": 0.018319733440876007, "learning_rate": 1.7115537476036847e-05, "loss": 0.0337, "step": 70330 }, { "epoch": 1.9733482956936457, "grad_norm": 0.036344923079013824, "learning_rate": 1.711086173843924e-05, "loss": 0.0088, "step": 70340 }, { "epoch": 1.9736288399495021, "grad_norm": 0.02753262408077717, "learning_rate": 1.7106186000841633e-05, "loss": 0.0246, "step": 70350 }, { "epoch": 1.9739093842053586, "grad_norm": 1.1410114765167236, "learning_rate": 1.7101510263244026e-05, "loss": 0.0133, "step": 70360 }, { "epoch": 1.9741899284612148, "grad_norm": 0.7822995781898499, "learning_rate": 1.7096834525646423e-05, "loss": 0.038, "step": 70370 }, { "epoch": 1.974470472717071, "grad_norm": 0.9037063121795654, "learning_rate": 1.7092158788048816e-05, "loss": 0.0496, "step": 70380 }, { "epoch": 1.9747510169729274, "grad_norm": 0.07665835320949554, "learning_rate": 1.708748305045121e-05, "loss": 0.0201, "step": 70390 }, { "epoch": 1.9750315612287839, "grad_norm": 0.08416620641946793, "learning_rate": 1.7082807312853605e-05, "loss": 0.0204, "step": 70400 }, { "epoch": 1.9753121054846403, "grad_norm": 0.0924677848815918, "learning_rate": 1.7078131575256e-05, "loss": 0.0074, "step": 70410 }, { "epoch": 1.9755926497404965, "grad_norm": 0.15569958090782166, "learning_rate": 1.7073455837658392e-05, "loss": 0.0242, "step": 70420 }, { "epoch": 1.975873193996353, "grad_norm": 0.2464146465063095, "learning_rate": 1.7068780100060785e-05, "loss": 0.0058, "step": 70430 }, { "epoch": 1.9761537382522092, "grad_norm": 0.06819787621498108, "learning_rate": 1.7064104362463178e-05, "loss": 0.016, "step": 70440 }, { "epoch": 1.9764342825080656, "grad_norm": 0.0185720082372427, "learning_rate": 1.705942862486557e-05, "loss": 0.0106, "step": 70450 }, { "epoch": 1.976714826763922, "grad_norm": 0.04870761185884476, "learning_rate": 1.7054752887267968e-05, "loss": 0.01, "step": 70460 }, { "epoch": 1.9769953710197785, "grad_norm": 0.03410814329981804, "learning_rate": 1.705007714967036e-05, "loss": 0.0105, "step": 70470 }, { "epoch": 1.9772759152756347, "grad_norm": 1.2659506797790527, "learning_rate": 1.7045401412072758e-05, "loss": 0.0238, "step": 70480 }, { "epoch": 1.977556459531491, "grad_norm": 0.07550358772277832, "learning_rate": 1.704072567447515e-05, "loss": 0.0219, "step": 70490 }, { "epoch": 1.9778370037873474, "grad_norm": 0.49037933349609375, "learning_rate": 1.7036049936877544e-05, "loss": 0.0277, "step": 70500 }, { "epoch": 1.9781175480432038, "grad_norm": 0.018508853390812874, "learning_rate": 1.7031374199279937e-05, "loss": 0.0261, "step": 70510 }, { "epoch": 1.9783980922990603, "grad_norm": 0.2167162299156189, "learning_rate": 1.702669846168233e-05, "loss": 0.0095, "step": 70520 }, { "epoch": 1.9786786365549165, "grad_norm": 0.08030346781015396, "learning_rate": 1.7022022724084723e-05, "loss": 0.0319, "step": 70530 }, { "epoch": 1.978959180810773, "grad_norm": 0.02314762957394123, "learning_rate": 1.701734698648712e-05, "loss": 0.0077, "step": 70540 }, { "epoch": 1.9792397250666292, "grad_norm": 0.695681631565094, "learning_rate": 1.7012671248889513e-05, "loss": 0.0198, "step": 70550 }, { "epoch": 1.9795202693224856, "grad_norm": 0.03461457043886185, "learning_rate": 1.7007995511291906e-05, "loss": 0.0196, "step": 70560 }, { "epoch": 1.979800813578342, "grad_norm": 0.4883127808570862, "learning_rate": 1.7003319773694303e-05, "loss": 0.0374, "step": 70570 }, { "epoch": 1.9800813578341985, "grad_norm": 0.09380502998828888, "learning_rate": 1.6998644036096696e-05, "loss": 0.0287, "step": 70580 }, { "epoch": 1.9803619020900547, "grad_norm": 0.7653844356536865, "learning_rate": 1.699396829849909e-05, "loss": 0.018, "step": 70590 }, { "epoch": 1.980642446345911, "grad_norm": 1.0865801572799683, "learning_rate": 1.6989292560901482e-05, "loss": 0.0225, "step": 70600 }, { "epoch": 1.9809229906017674, "grad_norm": 0.5356186628341675, "learning_rate": 1.698461682330388e-05, "loss": 0.0493, "step": 70610 }, { "epoch": 1.9812035348576238, "grad_norm": 0.11619211733341217, "learning_rate": 1.6979941085706272e-05, "loss": 0.0188, "step": 70620 }, { "epoch": 1.9814840791134802, "grad_norm": 0.011964292265474796, "learning_rate": 1.6975265348108665e-05, "loss": 0.0142, "step": 70630 }, { "epoch": 1.9817646233693367, "grad_norm": 0.1499318778514862, "learning_rate": 1.6970589610511058e-05, "loss": 0.0077, "step": 70640 }, { "epoch": 1.982045167625193, "grad_norm": 0.4628300666809082, "learning_rate": 1.696591387291345e-05, "loss": 0.0226, "step": 70650 }, { "epoch": 1.9823257118810491, "grad_norm": 0.02267632633447647, "learning_rate": 1.6961238135315844e-05, "loss": 0.0248, "step": 70660 }, { "epoch": 1.9826062561369056, "grad_norm": 0.015285334549844265, "learning_rate": 1.695656239771824e-05, "loss": 0.0328, "step": 70670 }, { "epoch": 1.982886800392762, "grad_norm": 0.43436750769615173, "learning_rate": 1.6951886660120638e-05, "loss": 0.0082, "step": 70680 }, { "epoch": 1.9831673446486184, "grad_norm": 0.21248646080493927, "learning_rate": 1.694721092252303e-05, "loss": 0.0095, "step": 70690 }, { "epoch": 1.9834478889044747, "grad_norm": 0.4394901394844055, "learning_rate": 1.6942535184925424e-05, "loss": 0.0346, "step": 70700 }, { "epoch": 1.983728433160331, "grad_norm": 0.12942124903202057, "learning_rate": 1.6937859447327817e-05, "loss": 0.0184, "step": 70710 }, { "epoch": 1.9840089774161873, "grad_norm": 1.397826910018921, "learning_rate": 1.693318370973021e-05, "loss": 0.0211, "step": 70720 }, { "epoch": 1.9842895216720438, "grad_norm": 0.31497320532798767, "learning_rate": 1.6928507972132603e-05, "loss": 0.0179, "step": 70730 }, { "epoch": 1.9845700659279002, "grad_norm": 0.10481763631105423, "learning_rate": 1.6923832234534996e-05, "loss": 0.0235, "step": 70740 }, { "epoch": 1.9848506101837566, "grad_norm": 0.03619399294257164, "learning_rate": 1.6919156496937393e-05, "loss": 0.0129, "step": 70750 }, { "epoch": 1.9851311544396129, "grad_norm": 0.19131968915462494, "learning_rate": 1.6914480759339786e-05, "loss": 0.0374, "step": 70760 }, { "epoch": 1.985411698695469, "grad_norm": 0.27205199003219604, "learning_rate": 1.6909805021742183e-05, "loss": 0.0133, "step": 70770 }, { "epoch": 1.9856922429513255, "grad_norm": 0.04888029024004936, "learning_rate": 1.6905129284144576e-05, "loss": 0.009, "step": 70780 }, { "epoch": 1.985972787207182, "grad_norm": 0.2756154537200928, "learning_rate": 1.690045354654697e-05, "loss": 0.0071, "step": 70790 }, { "epoch": 1.9862533314630384, "grad_norm": 0.2134757936000824, "learning_rate": 1.6895777808949362e-05, "loss": 0.0061, "step": 70800 }, { "epoch": 1.9865338757188946, "grad_norm": 0.4020897448062897, "learning_rate": 1.6891102071351755e-05, "loss": 0.019, "step": 70810 }, { "epoch": 1.986814419974751, "grad_norm": 0.04440581798553467, "learning_rate": 1.6886426333754152e-05, "loss": 0.0042, "step": 70820 }, { "epoch": 1.9870949642306073, "grad_norm": 0.2938137352466583, "learning_rate": 1.6881750596156545e-05, "loss": 0.0308, "step": 70830 }, { "epoch": 1.9873755084864637, "grad_norm": 0.4905410408973694, "learning_rate": 1.6877074858558938e-05, "loss": 0.0435, "step": 70840 }, { "epoch": 1.9876560527423202, "grad_norm": 0.021562878042459488, "learning_rate": 1.687239912096133e-05, "loss": 0.0287, "step": 70850 }, { "epoch": 1.9879365969981766, "grad_norm": 0.05013788118958473, "learning_rate": 1.6867723383363725e-05, "loss": 0.0441, "step": 70860 }, { "epoch": 1.9882171412540328, "grad_norm": 0.17849422991275787, "learning_rate": 1.686304764576612e-05, "loss": 0.0382, "step": 70870 }, { "epoch": 1.988497685509889, "grad_norm": 0.20100511610507965, "learning_rate": 1.6858371908168514e-05, "loss": 0.0322, "step": 70880 }, { "epoch": 1.9887782297657455, "grad_norm": 0.2524515390396118, "learning_rate": 1.685369617057091e-05, "loss": 0.0061, "step": 70890 }, { "epoch": 1.989058774021602, "grad_norm": 0.0880739688873291, "learning_rate": 1.6849020432973304e-05, "loss": 0.0085, "step": 70900 }, { "epoch": 1.9893393182774584, "grad_norm": 0.1664600670337677, "learning_rate": 1.6844344695375697e-05, "loss": 0.0118, "step": 70910 }, { "epoch": 1.9896198625333146, "grad_norm": 0.5168462991714478, "learning_rate": 1.683966895777809e-05, "loss": 0.0163, "step": 70920 }, { "epoch": 1.989900406789171, "grad_norm": 0.040639277547597885, "learning_rate": 1.6834993220180483e-05, "loss": 0.0277, "step": 70930 }, { "epoch": 1.9901809510450272, "grad_norm": 1.808427333831787, "learning_rate": 1.6830317482582877e-05, "loss": 0.0299, "step": 70940 }, { "epoch": 1.9904614953008837, "grad_norm": 0.20158804953098297, "learning_rate": 1.682564174498527e-05, "loss": 0.0388, "step": 70950 }, { "epoch": 1.9907420395567401, "grad_norm": 0.23999084532260895, "learning_rate": 1.6820966007387666e-05, "loss": 0.0214, "step": 70960 }, { "epoch": 1.9910225838125966, "grad_norm": 0.030862964689731598, "learning_rate": 1.681629026979006e-05, "loss": 0.0212, "step": 70970 }, { "epoch": 1.9913031280684528, "grad_norm": 0.04350055754184723, "learning_rate": 1.6811614532192456e-05, "loss": 0.0109, "step": 70980 }, { "epoch": 1.991583672324309, "grad_norm": 0.1965039223432541, "learning_rate": 1.680693879459485e-05, "loss": 0.04, "step": 70990 }, { "epoch": 1.9918642165801654, "grad_norm": 0.015212048776447773, "learning_rate": 1.6802263056997242e-05, "loss": 0.0237, "step": 71000 }, { "epoch": 1.9921447608360219, "grad_norm": 0.08824288100004196, "learning_rate": 1.6797587319399635e-05, "loss": 0.0796, "step": 71010 }, { "epoch": 1.9924253050918783, "grad_norm": 0.3205777704715729, "learning_rate": 1.679291158180203e-05, "loss": 0.0078, "step": 71020 }, { "epoch": 1.9927058493477348, "grad_norm": 0.027394255623221397, "learning_rate": 1.6788235844204425e-05, "loss": 0.0153, "step": 71030 }, { "epoch": 1.992986393603591, "grad_norm": 0.7498533725738525, "learning_rate": 1.6783560106606818e-05, "loss": 0.0391, "step": 71040 }, { "epoch": 1.9932669378594472, "grad_norm": 0.09231271594762802, "learning_rate": 1.677888436900921e-05, "loss": 0.0268, "step": 71050 }, { "epoch": 1.9935474821153036, "grad_norm": 0.03089962713420391, "learning_rate": 1.6774208631411605e-05, "loss": 0.0192, "step": 71060 }, { "epoch": 1.99382802637116, "grad_norm": 0.04733678326010704, "learning_rate": 1.6769532893814e-05, "loss": 0.0147, "step": 71070 }, { "epoch": 1.9941085706270165, "grad_norm": 0.4882553219795227, "learning_rate": 1.6764857156216394e-05, "loss": 0.0508, "step": 71080 }, { "epoch": 1.9943891148828727, "grad_norm": 0.08843670785427094, "learning_rate": 1.6760181418618787e-05, "loss": 0.0241, "step": 71090 }, { "epoch": 1.9946696591387292, "grad_norm": 0.04003377631306648, "learning_rate": 1.6755505681021184e-05, "loss": 0.0247, "step": 71100 }, { "epoch": 1.9949502033945854, "grad_norm": 0.3815317451953888, "learning_rate": 1.6750829943423577e-05, "loss": 0.0248, "step": 71110 }, { "epoch": 1.9952307476504418, "grad_norm": 0.6594914197921753, "learning_rate": 1.674615420582597e-05, "loss": 0.0426, "step": 71120 }, { "epoch": 1.9955112919062983, "grad_norm": 0.04384031519293785, "learning_rate": 1.6741478468228363e-05, "loss": 0.0289, "step": 71130 }, { "epoch": 1.9957918361621547, "grad_norm": 0.09598950296640396, "learning_rate": 1.6736802730630757e-05, "loss": 0.005, "step": 71140 }, { "epoch": 1.996072380418011, "grad_norm": 0.0839935690164566, "learning_rate": 1.673212699303315e-05, "loss": 0.022, "step": 71150 }, { "epoch": 1.9963529246738672, "grad_norm": 0.6723725199699402, "learning_rate": 1.6727451255435543e-05, "loss": 0.0358, "step": 71160 }, { "epoch": 1.9966334689297236, "grad_norm": 0.021238110959529877, "learning_rate": 1.672277551783794e-05, "loss": 0.0258, "step": 71170 }, { "epoch": 1.99691401318558, "grad_norm": 0.13433803617954254, "learning_rate": 1.6718099780240336e-05, "loss": 0.0308, "step": 71180 }, { "epoch": 1.9971945574414365, "grad_norm": 0.1881173551082611, "learning_rate": 1.671342404264273e-05, "loss": 0.0086, "step": 71190 }, { "epoch": 1.9974751016972927, "grad_norm": 0.1280713826417923, "learning_rate": 1.6708748305045122e-05, "loss": 0.0158, "step": 71200 }, { "epoch": 1.9977556459531491, "grad_norm": 0.061874233186244965, "learning_rate": 1.6704072567447515e-05, "loss": 0.0237, "step": 71210 }, { "epoch": 1.9980361902090054, "grad_norm": 0.39929258823394775, "learning_rate": 1.669939682984991e-05, "loss": 0.0293, "step": 71220 }, { "epoch": 1.9983167344648618, "grad_norm": 0.16508154571056366, "learning_rate": 1.6694721092252302e-05, "loss": 0.0385, "step": 71230 }, { "epoch": 1.9985972787207182, "grad_norm": 0.5339992642402649, "learning_rate": 1.6690045354654698e-05, "loss": 0.0224, "step": 71240 }, { "epoch": 1.9988778229765747, "grad_norm": 0.053985849022865295, "learning_rate": 1.668536961705709e-05, "loss": 0.0078, "step": 71250 }, { "epoch": 1.999158367232431, "grad_norm": 0.6919560432434082, "learning_rate": 1.6680693879459485e-05, "loss": 0.0089, "step": 71260 }, { "epoch": 1.9994389114882871, "grad_norm": 0.027976742014288902, "learning_rate": 1.6676018141861878e-05, "loss": 0.0095, "step": 71270 }, { "epoch": 1.9997194557441436, "grad_norm": 0.36048755049705505, "learning_rate": 1.6671342404264274e-05, "loss": 0.0276, "step": 71280 }, { "epoch": 2.0, "grad_norm": 0.45349109172821045, "learning_rate": 1.6666666666666667e-05, "loss": 0.0376, "step": 71290 }, { "epoch": 2.0, "eval_f1": 0.9937535126142422, "eval_loss": 0.025136707350611687, "eval_precision": 0.9933829256893597, "eval_recall": 0.9941243761412644, "eval_runtime": 362.4855, "eval_samples_per_second": 674.292, "eval_steps_per_second": 42.145, "step": 71290 }, { "epoch": 2.0002805442558564, "grad_norm": 0.933976948261261, "learning_rate": 1.666199092906906e-05, "loss": 0.0281, "step": 71300 }, { "epoch": 2.000561088511713, "grad_norm": 0.06742829084396362, "learning_rate": 1.6657315191471457e-05, "loss": 0.0176, "step": 71310 }, { "epoch": 2.000841632767569, "grad_norm": 0.15085215866565704, "learning_rate": 1.665263945387385e-05, "loss": 0.0139, "step": 71320 }, { "epoch": 2.0011221770234253, "grad_norm": 1.8524972200393677, "learning_rate": 1.6647963716276243e-05, "loss": 0.0126, "step": 71330 }, { "epoch": 2.0014027212792818, "grad_norm": 0.02236943505704403, "learning_rate": 1.6643287978678637e-05, "loss": 0.0086, "step": 71340 }, { "epoch": 2.001683265535138, "grad_norm": 0.761205792427063, "learning_rate": 1.663861224108103e-05, "loss": 0.0193, "step": 71350 }, { "epoch": 2.0019638097909946, "grad_norm": 0.5346905589103699, "learning_rate": 1.6633936503483423e-05, "loss": 0.0269, "step": 71360 }, { "epoch": 2.002244354046851, "grad_norm": 0.17853540182113647, "learning_rate": 1.662926076588582e-05, "loss": 0.0311, "step": 71370 }, { "epoch": 2.002524898302707, "grad_norm": 0.9231346845626831, "learning_rate": 1.6624585028288213e-05, "loss": 0.0173, "step": 71380 }, { "epoch": 2.0028054425585635, "grad_norm": 3.44793963432312, "learning_rate": 1.661990929069061e-05, "loss": 0.0156, "step": 71390 }, { "epoch": 2.00308598681442, "grad_norm": 0.18318620324134827, "learning_rate": 1.6615233553093002e-05, "loss": 0.0204, "step": 71400 }, { "epoch": 2.0033665310702764, "grad_norm": 0.019093506038188934, "learning_rate": 1.6610557815495395e-05, "loss": 0.0088, "step": 71410 }, { "epoch": 2.003647075326133, "grad_norm": 0.8373134136199951, "learning_rate": 1.660588207789779e-05, "loss": 0.0184, "step": 71420 }, { "epoch": 2.003927619581989, "grad_norm": 2.37864351272583, "learning_rate": 1.6601206340300182e-05, "loss": 0.039, "step": 71430 }, { "epoch": 2.0042081638378453, "grad_norm": 0.04729698225855827, "learning_rate": 1.6596530602702575e-05, "loss": 0.0146, "step": 71440 }, { "epoch": 2.0044887080937017, "grad_norm": 1.6953603029251099, "learning_rate": 1.659185486510497e-05, "loss": 0.0156, "step": 71450 }, { "epoch": 2.004769252349558, "grad_norm": 0.016254745423793793, "learning_rate": 1.6587179127507365e-05, "loss": 0.0236, "step": 71460 }, { "epoch": 2.0050497966054146, "grad_norm": 0.04776562377810478, "learning_rate": 1.6582503389909758e-05, "loss": 0.0209, "step": 71470 }, { "epoch": 2.005330340861271, "grad_norm": 0.3364274501800537, "learning_rate": 1.6577827652312154e-05, "loss": 0.0197, "step": 71480 }, { "epoch": 2.005610885117127, "grad_norm": 0.6838787794113159, "learning_rate": 1.6573151914714548e-05, "loss": 0.0211, "step": 71490 }, { "epoch": 2.0058914293729835, "grad_norm": 0.30980971455574036, "learning_rate": 1.656847617711694e-05, "loss": 0.025, "step": 71500 }, { "epoch": 2.00617197362884, "grad_norm": 0.021638277918100357, "learning_rate": 1.6563800439519337e-05, "loss": 0.0235, "step": 71510 }, { "epoch": 2.0064525178846964, "grad_norm": 0.5540278553962708, "learning_rate": 1.655912470192173e-05, "loss": 0.0166, "step": 71520 }, { "epoch": 2.006733062140553, "grad_norm": 0.059641946107149124, "learning_rate": 1.6554448964324124e-05, "loss": 0.0225, "step": 71530 }, { "epoch": 2.0070136063964092, "grad_norm": 0.0631667822599411, "learning_rate": 1.6549773226726517e-05, "loss": 0.0091, "step": 71540 }, { "epoch": 2.0072941506522652, "grad_norm": 0.027644330635666847, "learning_rate": 1.654509748912891e-05, "loss": 0.0089, "step": 71550 }, { "epoch": 2.0075746949081217, "grad_norm": 0.17836298048496246, "learning_rate": 1.6540421751531303e-05, "loss": 0.0143, "step": 71560 }, { "epoch": 2.007855239163978, "grad_norm": 0.19041170179843903, "learning_rate": 1.6535746013933696e-05, "loss": 0.0074, "step": 71570 }, { "epoch": 2.0081357834198346, "grad_norm": 0.08736293017864227, "learning_rate": 1.6531070276336093e-05, "loss": 0.0147, "step": 71580 }, { "epoch": 2.008416327675691, "grad_norm": 1.7865056991577148, "learning_rate": 1.652639453873849e-05, "loss": 0.005, "step": 71590 }, { "epoch": 2.008696871931547, "grad_norm": 0.1490139365196228, "learning_rate": 1.6521718801140882e-05, "loss": 0.005, "step": 71600 }, { "epoch": 2.0089774161874034, "grad_norm": 0.007792017888277769, "learning_rate": 1.6517043063543276e-05, "loss": 0.0105, "step": 71610 }, { "epoch": 2.00925796044326, "grad_norm": 0.01703350432217121, "learning_rate": 1.651236732594567e-05, "loss": 0.0057, "step": 71620 }, { "epoch": 2.0095385046991163, "grad_norm": 0.22330403327941895, "learning_rate": 1.6507691588348062e-05, "loss": 0.0034, "step": 71630 }, { "epoch": 2.0098190489549728, "grad_norm": 0.07328762859106064, "learning_rate": 1.6503015850750455e-05, "loss": 0.0522, "step": 71640 }, { "epoch": 2.010099593210829, "grad_norm": 1.6262972354888916, "learning_rate": 1.649834011315285e-05, "loss": 0.0207, "step": 71650 }, { "epoch": 2.010380137466685, "grad_norm": 0.44909122586250305, "learning_rate": 1.6493664375555245e-05, "loss": 0.0146, "step": 71660 }, { "epoch": 2.0106606817225416, "grad_norm": 0.6742340922355652, "learning_rate": 1.6488988637957638e-05, "loss": 0.02, "step": 71670 }, { "epoch": 2.010941225978398, "grad_norm": 0.019549241289496422, "learning_rate": 1.6484312900360034e-05, "loss": 0.0197, "step": 71680 }, { "epoch": 2.0112217702342545, "grad_norm": 0.0463680773973465, "learning_rate": 1.6479637162762428e-05, "loss": 0.0063, "step": 71690 }, { "epoch": 2.011502314490111, "grad_norm": 0.9276918768882751, "learning_rate": 1.647496142516482e-05, "loss": 0.0351, "step": 71700 }, { "epoch": 2.011782858745967, "grad_norm": 0.127868190407753, "learning_rate": 1.6470285687567214e-05, "loss": 0.0188, "step": 71710 }, { "epoch": 2.0120634030018234, "grad_norm": 0.2145690619945526, "learning_rate": 1.646560994996961e-05, "loss": 0.0078, "step": 71720 }, { "epoch": 2.01234394725768, "grad_norm": 0.11814775317907333, "learning_rate": 1.6460934212372004e-05, "loss": 0.0062, "step": 71730 }, { "epoch": 2.0126244915135363, "grad_norm": 0.010430979542434216, "learning_rate": 1.6456258474774397e-05, "loss": 0.0574, "step": 71740 }, { "epoch": 2.0129050357693927, "grad_norm": 0.4355725049972534, "learning_rate": 1.645158273717679e-05, "loss": 0.0195, "step": 71750 }, { "epoch": 2.013185580025249, "grad_norm": 0.1383686363697052, "learning_rate": 1.6446906999579183e-05, "loss": 0.0197, "step": 71760 }, { "epoch": 2.013466124281105, "grad_norm": 0.39449745416641235, "learning_rate": 1.6442231261981576e-05, "loss": 0.0173, "step": 71770 }, { "epoch": 2.0137466685369616, "grad_norm": 0.24256841838359833, "learning_rate": 1.6437555524383973e-05, "loss": 0.0243, "step": 71780 }, { "epoch": 2.014027212792818, "grad_norm": 0.7997081875801086, "learning_rate": 1.643287978678637e-05, "loss": 0.0213, "step": 71790 }, { "epoch": 2.0143077570486745, "grad_norm": 0.20998474955558777, "learning_rate": 1.6428204049188762e-05, "loss": 0.0151, "step": 71800 }, { "epoch": 2.014588301304531, "grad_norm": 0.027210863307118416, "learning_rate": 1.6423528311591156e-05, "loss": 0.0436, "step": 71810 }, { "epoch": 2.0148688455603874, "grad_norm": 0.48308882117271423, "learning_rate": 1.641885257399355e-05, "loss": 0.0435, "step": 71820 }, { "epoch": 2.0151493898162434, "grad_norm": 0.03665667772293091, "learning_rate": 1.6414176836395942e-05, "loss": 0.0086, "step": 71830 }, { "epoch": 2.0154299340721, "grad_norm": 0.038408663123846054, "learning_rate": 1.6409501098798335e-05, "loss": 0.0275, "step": 71840 }, { "epoch": 2.0157104783279562, "grad_norm": 0.04825301095843315, "learning_rate": 1.6404825361200728e-05, "loss": 0.0122, "step": 71850 }, { "epoch": 2.0159910225838127, "grad_norm": 0.026875590905547142, "learning_rate": 1.6400149623603125e-05, "loss": 0.0346, "step": 71860 }, { "epoch": 2.016271566839669, "grad_norm": 0.07478857785463333, "learning_rate": 1.6395473886005518e-05, "loss": 0.0217, "step": 71870 }, { "epoch": 2.016552111095525, "grad_norm": 0.04408552497625351, "learning_rate": 1.639079814840791e-05, "loss": 0.0169, "step": 71880 }, { "epoch": 2.0168326553513816, "grad_norm": 0.4004431664943695, "learning_rate": 1.6386122410810308e-05, "loss": 0.0331, "step": 71890 }, { "epoch": 2.017113199607238, "grad_norm": 0.09496836364269257, "learning_rate": 1.63814466732127e-05, "loss": 0.0305, "step": 71900 }, { "epoch": 2.0173937438630944, "grad_norm": 0.20668037235736847, "learning_rate": 1.6376770935615094e-05, "loss": 0.0194, "step": 71910 }, { "epoch": 2.017674288118951, "grad_norm": 0.05450243502855301, "learning_rate": 1.6372095198017487e-05, "loss": 0.0033, "step": 71920 }, { "epoch": 2.0179548323748073, "grad_norm": 0.013717263005673885, "learning_rate": 1.6367419460419884e-05, "loss": 0.0125, "step": 71930 }, { "epoch": 2.0182353766306633, "grad_norm": 0.01895555853843689, "learning_rate": 1.6362743722822277e-05, "loss": 0.0268, "step": 71940 }, { "epoch": 2.0185159208865198, "grad_norm": 0.6058212518692017, "learning_rate": 1.635806798522467e-05, "loss": 0.0233, "step": 71950 }, { "epoch": 2.018796465142376, "grad_norm": 0.34140947461128235, "learning_rate": 1.6353392247627063e-05, "loss": 0.0093, "step": 71960 }, { "epoch": 2.0190770093982326, "grad_norm": 0.024594653397798538, "learning_rate": 1.6348716510029456e-05, "loss": 0.0148, "step": 71970 }, { "epoch": 2.019357553654089, "grad_norm": 0.09739704430103302, "learning_rate": 1.6344040772431853e-05, "loss": 0.0106, "step": 71980 }, { "epoch": 2.019638097909945, "grad_norm": 0.559372067451477, "learning_rate": 1.6339365034834246e-05, "loss": 0.0141, "step": 71990 }, { "epoch": 2.0199186421658015, "grad_norm": 0.01362406462430954, "learning_rate": 1.6334689297236642e-05, "loss": 0.0174, "step": 72000 }, { "epoch": 2.020199186421658, "grad_norm": 0.04175664857029915, "learning_rate": 1.6330013559639036e-05, "loss": 0.071, "step": 72010 }, { "epoch": 2.0204797306775144, "grad_norm": 0.04118923842906952, "learning_rate": 1.632533782204143e-05, "loss": 0.0262, "step": 72020 }, { "epoch": 2.020760274933371, "grad_norm": 8.126935005187988, "learning_rate": 1.6320662084443822e-05, "loss": 0.024, "step": 72030 }, { "epoch": 2.0210408191892273, "grad_norm": 1.0245654582977295, "learning_rate": 1.6315986346846215e-05, "loss": 0.0266, "step": 72040 }, { "epoch": 2.0213213634450833, "grad_norm": 0.06957541406154633, "learning_rate": 1.6311310609248608e-05, "loss": 0.0062, "step": 72050 }, { "epoch": 2.0216019077009397, "grad_norm": 0.35947301983833313, "learning_rate": 1.6306634871651e-05, "loss": 0.0202, "step": 72060 }, { "epoch": 2.021882451956796, "grad_norm": 0.07016007602214813, "learning_rate": 1.6301959134053398e-05, "loss": 0.0201, "step": 72070 }, { "epoch": 2.0221629962126526, "grad_norm": 0.035789087414741516, "learning_rate": 1.629728339645579e-05, "loss": 0.0442, "step": 72080 }, { "epoch": 2.022443540468509, "grad_norm": 0.06241618096828461, "learning_rate": 1.6292607658858188e-05, "loss": 0.0136, "step": 72090 }, { "epoch": 2.0227240847243655, "grad_norm": 0.23708213865756989, "learning_rate": 1.628793192126058e-05, "loss": 0.0296, "step": 72100 }, { "epoch": 2.0230046289802215, "grad_norm": 0.11899220943450928, "learning_rate": 1.6283256183662974e-05, "loss": 0.0217, "step": 72110 }, { "epoch": 2.023285173236078, "grad_norm": 0.3214717209339142, "learning_rate": 1.6278580446065367e-05, "loss": 0.016, "step": 72120 }, { "epoch": 2.0235657174919344, "grad_norm": 0.22742260992527008, "learning_rate": 1.627390470846776e-05, "loss": 0.0379, "step": 72130 }, { "epoch": 2.023846261747791, "grad_norm": 0.08486006408929825, "learning_rate": 1.6269228970870157e-05, "loss": 0.0074, "step": 72140 }, { "epoch": 2.0241268060036472, "grad_norm": 1.1859726905822754, "learning_rate": 1.626455323327255e-05, "loss": 0.0291, "step": 72150 }, { "epoch": 2.0244073502595032, "grad_norm": 0.014561960473656654, "learning_rate": 1.6259877495674943e-05, "loss": 0.0168, "step": 72160 }, { "epoch": 2.0246878945153597, "grad_norm": 0.11773920804262161, "learning_rate": 1.6255201758077336e-05, "loss": 0.052, "step": 72170 }, { "epoch": 2.024968438771216, "grad_norm": 0.025142796337604523, "learning_rate": 1.625052602047973e-05, "loss": 0.0038, "step": 72180 }, { "epoch": 2.0252489830270726, "grad_norm": 1.7393271923065186, "learning_rate": 1.6245850282882126e-05, "loss": 0.0414, "step": 72190 }, { "epoch": 2.025529527282929, "grad_norm": 0.03027450479567051, "learning_rate": 1.624117454528452e-05, "loss": 0.019, "step": 72200 }, { "epoch": 2.0258100715387855, "grad_norm": 0.5581474900245667, "learning_rate": 1.6236498807686916e-05, "loss": 0.0337, "step": 72210 }, { "epoch": 2.0260906157946414, "grad_norm": 0.21367976069450378, "learning_rate": 1.623182307008931e-05, "loss": 0.0383, "step": 72220 }, { "epoch": 2.026371160050498, "grad_norm": 0.25059351325035095, "learning_rate": 1.6227147332491702e-05, "loss": 0.0196, "step": 72230 }, { "epoch": 2.0266517043063543, "grad_norm": 0.04530385881662369, "learning_rate": 1.6222471594894095e-05, "loss": 0.0099, "step": 72240 }, { "epoch": 2.0269322485622108, "grad_norm": 0.7653769850730896, "learning_rate": 1.6217795857296488e-05, "loss": 0.0071, "step": 72250 }, { "epoch": 2.027212792818067, "grad_norm": 0.06321442127227783, "learning_rate": 1.621312011969888e-05, "loss": 0.0074, "step": 72260 }, { "epoch": 2.027493337073923, "grad_norm": 0.040291957557201385, "learning_rate": 1.6208444382101275e-05, "loss": 0.0294, "step": 72270 }, { "epoch": 2.0277738813297796, "grad_norm": 0.050171103328466415, "learning_rate": 1.620376864450367e-05, "loss": 0.004, "step": 72280 }, { "epoch": 2.028054425585636, "grad_norm": 0.022917207330465317, "learning_rate": 1.6199092906906064e-05, "loss": 0.0074, "step": 72290 }, { "epoch": 2.0283349698414925, "grad_norm": 0.38102880120277405, "learning_rate": 1.619441716930846e-05, "loss": 0.017, "step": 72300 }, { "epoch": 2.028615514097349, "grad_norm": 0.13568425178527832, "learning_rate": 1.6189741431710854e-05, "loss": 0.0092, "step": 72310 }, { "epoch": 2.0288960583532054, "grad_norm": 0.4257335662841797, "learning_rate": 1.6185065694113247e-05, "loss": 0.0342, "step": 72320 }, { "epoch": 2.0291766026090614, "grad_norm": 0.20895814895629883, "learning_rate": 1.618038995651564e-05, "loss": 0.0089, "step": 72330 }, { "epoch": 2.029457146864918, "grad_norm": 0.07662832736968994, "learning_rate": 1.6175714218918033e-05, "loss": 0.0215, "step": 72340 }, { "epoch": 2.0297376911207743, "grad_norm": 0.00823772232979536, "learning_rate": 1.617103848132043e-05, "loss": 0.0165, "step": 72350 }, { "epoch": 2.0300182353766307, "grad_norm": 0.06814062595367432, "learning_rate": 1.6166362743722823e-05, "loss": 0.0191, "step": 72360 }, { "epoch": 2.030298779632487, "grad_norm": 0.42352691292762756, "learning_rate": 1.6161687006125216e-05, "loss": 0.0395, "step": 72370 }, { "epoch": 2.030579323888343, "grad_norm": 1.5072805881500244, "learning_rate": 1.615701126852761e-05, "loss": 0.0475, "step": 72380 }, { "epoch": 2.0308598681441996, "grad_norm": 0.09487274289131165, "learning_rate": 1.6152335530930006e-05, "loss": 0.0098, "step": 72390 }, { "epoch": 2.031140412400056, "grad_norm": 0.09247401356697083, "learning_rate": 1.61476597933324e-05, "loss": 0.0257, "step": 72400 }, { "epoch": 2.0314209566559125, "grad_norm": 0.23334316909313202, "learning_rate": 1.6142984055734792e-05, "loss": 0.0392, "step": 72410 }, { "epoch": 2.031701500911769, "grad_norm": 0.061501774936914444, "learning_rate": 1.613830831813719e-05, "loss": 0.026, "step": 72420 }, { "epoch": 2.0319820451676254, "grad_norm": 0.3927210569381714, "learning_rate": 1.6133632580539582e-05, "loss": 0.0087, "step": 72430 }, { "epoch": 2.0322625894234814, "grad_norm": 0.14393210411071777, "learning_rate": 1.6128956842941975e-05, "loss": 0.0105, "step": 72440 }, { "epoch": 2.032543133679338, "grad_norm": 0.5302417874336243, "learning_rate": 1.612428110534437e-05, "loss": 0.0164, "step": 72450 }, { "epoch": 2.0328236779351943, "grad_norm": 2.6029298305511475, "learning_rate": 1.611960536774676e-05, "loss": 0.0242, "step": 72460 }, { "epoch": 2.0331042221910507, "grad_norm": 0.17657896876335144, "learning_rate": 1.6114929630149155e-05, "loss": 0.0073, "step": 72470 }, { "epoch": 2.033384766446907, "grad_norm": 0.21153761446475983, "learning_rate": 1.6110253892551548e-05, "loss": 0.0257, "step": 72480 }, { "epoch": 2.0336653107027636, "grad_norm": 0.23764550685882568, "learning_rate": 1.6105578154953944e-05, "loss": 0.0258, "step": 72490 }, { "epoch": 2.0339458549586196, "grad_norm": 0.22274816036224365, "learning_rate": 1.610090241735634e-05, "loss": 0.012, "step": 72500 }, { "epoch": 2.034226399214476, "grad_norm": 0.1575186401605606, "learning_rate": 1.6096226679758734e-05, "loss": 0.0095, "step": 72510 }, { "epoch": 2.0345069434703325, "grad_norm": 1.9136196374893188, "learning_rate": 1.6091550942161127e-05, "loss": 0.0455, "step": 72520 }, { "epoch": 2.034787487726189, "grad_norm": 0.08654935657978058, "learning_rate": 1.608687520456352e-05, "loss": 0.0099, "step": 72530 }, { "epoch": 2.0350680319820453, "grad_norm": 1.6222087144851685, "learning_rate": 1.6082199466965914e-05, "loss": 0.0215, "step": 72540 }, { "epoch": 2.0353485762379013, "grad_norm": 0.12690222263336182, "learning_rate": 1.6077523729368307e-05, "loss": 0.0127, "step": 72550 }, { "epoch": 2.0356291204937578, "grad_norm": 0.40155744552612305, "learning_rate": 1.6072847991770703e-05, "loss": 0.0215, "step": 72560 }, { "epoch": 2.035909664749614, "grad_norm": 0.05972588062286377, "learning_rate": 1.6068172254173096e-05, "loss": 0.0152, "step": 72570 }, { "epoch": 2.0361902090054707, "grad_norm": 0.5732808709144592, "learning_rate": 1.606349651657549e-05, "loss": 0.036, "step": 72580 }, { "epoch": 2.036470753261327, "grad_norm": 0.28897255659103394, "learning_rate": 1.6058820778977886e-05, "loss": 0.0101, "step": 72590 }, { "epoch": 2.0367512975171835, "grad_norm": 1.98434317111969, "learning_rate": 1.605414504138028e-05, "loss": 0.0295, "step": 72600 }, { "epoch": 2.0370318417730395, "grad_norm": 0.27878350019454956, "learning_rate": 1.6049469303782672e-05, "loss": 0.0229, "step": 72610 }, { "epoch": 2.037312386028896, "grad_norm": 0.15344011783599854, "learning_rate": 1.6044793566185066e-05, "loss": 0.012, "step": 72620 }, { "epoch": 2.0375929302847524, "grad_norm": 0.04949883371591568, "learning_rate": 1.6040117828587462e-05, "loss": 0.0348, "step": 72630 }, { "epoch": 2.037873474540609, "grad_norm": 2.5544285774230957, "learning_rate": 1.6035442090989855e-05, "loss": 0.0459, "step": 72640 }, { "epoch": 2.0381540187964653, "grad_norm": 0.4050377607345581, "learning_rate": 1.603076635339225e-05, "loss": 0.0334, "step": 72650 }, { "epoch": 2.0384345630523213, "grad_norm": 0.0327560231089592, "learning_rate": 1.602609061579464e-05, "loss": 0.0146, "step": 72660 }, { "epoch": 2.0387151073081777, "grad_norm": 0.013071205466985703, "learning_rate": 1.6021414878197035e-05, "loss": 0.0196, "step": 72670 }, { "epoch": 2.038995651564034, "grad_norm": 0.696874737739563, "learning_rate": 1.6016739140599428e-05, "loss": 0.0122, "step": 72680 }, { "epoch": 2.0392761958198906, "grad_norm": 0.011602158658206463, "learning_rate": 1.6012063403001824e-05, "loss": 0.0267, "step": 72690 }, { "epoch": 2.039556740075747, "grad_norm": 0.030497515574097633, "learning_rate": 1.600738766540422e-05, "loss": 0.0392, "step": 72700 }, { "epoch": 2.0398372843316035, "grad_norm": 0.10243765264749527, "learning_rate": 1.6002711927806614e-05, "loss": 0.0289, "step": 72710 }, { "epoch": 2.0401178285874595, "grad_norm": 0.2680674195289612, "learning_rate": 1.5998036190209007e-05, "loss": 0.0247, "step": 72720 }, { "epoch": 2.040398372843316, "grad_norm": 0.07420157641172409, "learning_rate": 1.59933604526114e-05, "loss": 0.03, "step": 72730 }, { "epoch": 2.0406789170991724, "grad_norm": 0.035855475813150406, "learning_rate": 1.5988684715013794e-05, "loss": 0.0098, "step": 72740 }, { "epoch": 2.040959461355029, "grad_norm": 0.034979235380887985, "learning_rate": 1.5984008977416187e-05, "loss": 0.0365, "step": 72750 }, { "epoch": 2.0412400056108853, "grad_norm": 0.03611329197883606, "learning_rate": 1.597933323981858e-05, "loss": 0.0254, "step": 72760 }, { "epoch": 2.0415205498667417, "grad_norm": 1.0640449523925781, "learning_rate": 1.5974657502220976e-05, "loss": 0.0213, "step": 72770 }, { "epoch": 2.0418010941225977, "grad_norm": 0.671966552734375, "learning_rate": 1.596998176462337e-05, "loss": 0.0206, "step": 72780 }, { "epoch": 2.042081638378454, "grad_norm": 0.5295448899269104, "learning_rate": 1.5965306027025763e-05, "loss": 0.0494, "step": 72790 }, { "epoch": 2.0423621826343106, "grad_norm": 0.08078943938016891, "learning_rate": 1.596063028942816e-05, "loss": 0.0236, "step": 72800 }, { "epoch": 2.042642726890167, "grad_norm": 0.03678404167294502, "learning_rate": 1.5955954551830552e-05, "loss": 0.0198, "step": 72810 }, { "epoch": 2.0429232711460235, "grad_norm": 1.3194830417633057, "learning_rate": 1.5951278814232946e-05, "loss": 0.0314, "step": 72820 }, { "epoch": 2.0432038154018795, "grad_norm": 0.014838623814284801, "learning_rate": 1.5946603076635342e-05, "loss": 0.0058, "step": 72830 }, { "epoch": 2.043484359657736, "grad_norm": 0.05478069186210632, "learning_rate": 1.5941927339037735e-05, "loss": 0.0125, "step": 72840 }, { "epoch": 2.0437649039135923, "grad_norm": 0.3715822398662567, "learning_rate": 1.593725160144013e-05, "loss": 0.0327, "step": 72850 }, { "epoch": 2.0440454481694488, "grad_norm": 0.09893971681594849, "learning_rate": 1.593257586384252e-05, "loss": 0.0188, "step": 72860 }, { "epoch": 2.044325992425305, "grad_norm": 0.08933393657207489, "learning_rate": 1.5927900126244915e-05, "loss": 0.0145, "step": 72870 }, { "epoch": 2.0446065366811617, "grad_norm": 0.035891685634851456, "learning_rate": 1.5923224388647308e-05, "loss": 0.0065, "step": 72880 }, { "epoch": 2.0448870809370177, "grad_norm": 1.0127373933792114, "learning_rate": 1.5918548651049704e-05, "loss": 0.0163, "step": 72890 }, { "epoch": 2.045167625192874, "grad_norm": 0.08064226061105728, "learning_rate": 1.5913872913452098e-05, "loss": 0.0081, "step": 72900 }, { "epoch": 2.0454481694487305, "grad_norm": 2.868117094039917, "learning_rate": 1.5909197175854494e-05, "loss": 0.0354, "step": 72910 }, { "epoch": 2.045728713704587, "grad_norm": 0.01778976060450077, "learning_rate": 1.5904521438256887e-05, "loss": 0.0208, "step": 72920 }, { "epoch": 2.0460092579604434, "grad_norm": 0.018092602491378784, "learning_rate": 1.589984570065928e-05, "loss": 0.0182, "step": 72930 }, { "epoch": 2.0462898022162994, "grad_norm": 0.13737034797668457, "learning_rate": 1.5895169963061674e-05, "loss": 0.0132, "step": 72940 }, { "epoch": 2.046570346472156, "grad_norm": 0.1970902979373932, "learning_rate": 1.5890494225464067e-05, "loss": 0.01, "step": 72950 }, { "epoch": 2.0468508907280123, "grad_norm": 0.06877575814723969, "learning_rate": 1.588581848786646e-05, "loss": 0.0071, "step": 72960 }, { "epoch": 2.0471314349838687, "grad_norm": 0.016137177124619484, "learning_rate": 1.5881142750268856e-05, "loss": 0.0202, "step": 72970 }, { "epoch": 2.047411979239725, "grad_norm": 0.016843413934111595, "learning_rate": 1.587646701267125e-05, "loss": 0.0062, "step": 72980 }, { "epoch": 2.0476925234955816, "grad_norm": 0.29452651739120483, "learning_rate": 1.5871791275073643e-05, "loss": 0.0083, "step": 72990 }, { "epoch": 2.0479730677514376, "grad_norm": 0.06911551207304001, "learning_rate": 1.586711553747604e-05, "loss": 0.0205, "step": 73000 }, { "epoch": 2.048253612007294, "grad_norm": 0.3763725161552429, "learning_rate": 1.5862439799878432e-05, "loss": 0.0352, "step": 73010 }, { "epoch": 2.0485341562631505, "grad_norm": 0.023964934051036835, "learning_rate": 1.5857764062280826e-05, "loss": 0.0123, "step": 73020 }, { "epoch": 2.048814700519007, "grad_norm": 0.1117859035730362, "learning_rate": 1.585308832468322e-05, "loss": 0.0041, "step": 73030 }, { "epoch": 2.0490952447748634, "grad_norm": 1.0572822093963623, "learning_rate": 1.5848412587085615e-05, "loss": 0.0206, "step": 73040 }, { "epoch": 2.0493757890307194, "grad_norm": 0.12855343520641327, "learning_rate": 1.584373684948801e-05, "loss": 0.0071, "step": 73050 }, { "epoch": 2.049656333286576, "grad_norm": 1.0655170679092407, "learning_rate": 1.58390611118904e-05, "loss": 0.0091, "step": 73060 }, { "epoch": 2.0499368775424323, "grad_norm": 0.2785130441188812, "learning_rate": 1.5834385374292795e-05, "loss": 0.0201, "step": 73070 }, { "epoch": 2.0502174217982887, "grad_norm": 0.01515648327767849, "learning_rate": 1.5829709636695188e-05, "loss": 0.0252, "step": 73080 }, { "epoch": 2.050497966054145, "grad_norm": 0.3194746971130371, "learning_rate": 1.582503389909758e-05, "loss": 0.0115, "step": 73090 }, { "epoch": 2.0507785103100016, "grad_norm": 1.7115198373794556, "learning_rate": 1.5820358161499978e-05, "loss": 0.0334, "step": 73100 }, { "epoch": 2.0510590545658576, "grad_norm": 0.15479643642902374, "learning_rate": 1.5815682423902374e-05, "loss": 0.0156, "step": 73110 }, { "epoch": 2.051339598821714, "grad_norm": 1.4053646326065063, "learning_rate": 1.5811006686304767e-05, "loss": 0.0179, "step": 73120 }, { "epoch": 2.0516201430775705, "grad_norm": 0.21216151118278503, "learning_rate": 1.580633094870716e-05, "loss": 0.0059, "step": 73130 }, { "epoch": 2.051900687333427, "grad_norm": 0.020031681284308434, "learning_rate": 1.5801655211109554e-05, "loss": 0.024, "step": 73140 }, { "epoch": 2.0521812315892833, "grad_norm": 0.043692320585250854, "learning_rate": 1.5796979473511947e-05, "loss": 0.0046, "step": 73150 }, { "epoch": 2.05246177584514, "grad_norm": 0.18195310235023499, "learning_rate": 1.579230373591434e-05, "loss": 0.0229, "step": 73160 }, { "epoch": 2.0527423201009958, "grad_norm": 0.19734011590480804, "learning_rate": 1.5787627998316733e-05, "loss": 0.0464, "step": 73170 }, { "epoch": 2.053022864356852, "grad_norm": 0.022074328735470772, "learning_rate": 1.578295226071913e-05, "loss": 0.0271, "step": 73180 }, { "epoch": 2.0533034086127087, "grad_norm": 0.06421401351690292, "learning_rate": 1.5778276523121523e-05, "loss": 0.0232, "step": 73190 }, { "epoch": 2.053583952868565, "grad_norm": 0.254896879196167, "learning_rate": 1.5773600785523916e-05, "loss": 0.0376, "step": 73200 }, { "epoch": 2.0538644971244215, "grad_norm": 0.19541023671627045, "learning_rate": 1.5768925047926313e-05, "loss": 0.0283, "step": 73210 }, { "epoch": 2.0541450413802775, "grad_norm": 2.6309690475463867, "learning_rate": 1.5764249310328706e-05, "loss": 0.0111, "step": 73220 }, { "epoch": 2.054425585636134, "grad_norm": 0.1078190878033638, "learning_rate": 1.57595735727311e-05, "loss": 0.0277, "step": 73230 }, { "epoch": 2.0547061298919904, "grad_norm": 0.15940247476100922, "learning_rate": 1.5754897835133492e-05, "loss": 0.0175, "step": 73240 }, { "epoch": 2.054986674147847, "grad_norm": 0.0941513404250145, "learning_rate": 1.575022209753589e-05, "loss": 0.0185, "step": 73250 }, { "epoch": 2.0552672184037033, "grad_norm": 0.627319872379303, "learning_rate": 1.574554635993828e-05, "loss": 0.0132, "step": 73260 }, { "epoch": 2.0555477626595597, "grad_norm": 0.24051673710346222, "learning_rate": 1.5740870622340675e-05, "loss": 0.0158, "step": 73270 }, { "epoch": 2.0558283069154157, "grad_norm": 0.20418709516525269, "learning_rate": 1.5736194884743068e-05, "loss": 0.0104, "step": 73280 }, { "epoch": 2.056108851171272, "grad_norm": 0.0807727575302124, "learning_rate": 1.573151914714546e-05, "loss": 0.0281, "step": 73290 }, { "epoch": 2.0563893954271286, "grad_norm": 0.11308534443378448, "learning_rate": 1.5726843409547858e-05, "loss": 0.0185, "step": 73300 }, { "epoch": 2.056669939682985, "grad_norm": 0.07919807732105255, "learning_rate": 1.572216767195025e-05, "loss": 0.0307, "step": 73310 }, { "epoch": 2.0569504839388415, "grad_norm": 0.05403148755431175, "learning_rate": 1.5717491934352647e-05, "loss": 0.009, "step": 73320 }, { "epoch": 2.0572310281946975, "grad_norm": 0.07728945463895798, "learning_rate": 1.571281619675504e-05, "loss": 0.0303, "step": 73330 }, { "epoch": 2.057511572450554, "grad_norm": 0.05018117278814316, "learning_rate": 1.5708140459157434e-05, "loss": 0.0207, "step": 73340 }, { "epoch": 2.0577921167064104, "grad_norm": 0.5292794704437256, "learning_rate": 1.5703464721559827e-05, "loss": 0.0161, "step": 73350 }, { "epoch": 2.058072660962267, "grad_norm": 1.375868320465088, "learning_rate": 1.569878898396222e-05, "loss": 0.0263, "step": 73360 }, { "epoch": 2.0583532052181233, "grad_norm": 0.01185943465679884, "learning_rate": 1.5694113246364613e-05, "loss": 0.009, "step": 73370 }, { "epoch": 2.0586337494739797, "grad_norm": 0.24242866039276123, "learning_rate": 1.5689437508767006e-05, "loss": 0.0049, "step": 73380 }, { "epoch": 2.0589142937298357, "grad_norm": 0.04053737223148346, "learning_rate": 1.5684761771169403e-05, "loss": 0.0374, "step": 73390 }, { "epoch": 2.059194837985692, "grad_norm": 0.011427545920014381, "learning_rate": 1.5680086033571796e-05, "loss": 0.0113, "step": 73400 }, { "epoch": 2.0594753822415486, "grad_norm": 0.017174772918224335, "learning_rate": 1.5675410295974193e-05, "loss": 0.0185, "step": 73410 }, { "epoch": 2.059755926497405, "grad_norm": 0.9589441418647766, "learning_rate": 1.5670734558376586e-05, "loss": 0.0147, "step": 73420 }, { "epoch": 2.0600364707532615, "grad_norm": 0.10771728307008743, "learning_rate": 1.566605882077898e-05, "loss": 0.0075, "step": 73430 }, { "epoch": 2.060317015009118, "grad_norm": 0.015630576759576797, "learning_rate": 1.5661383083181372e-05, "loss": 0.0138, "step": 73440 }, { "epoch": 2.060597559264974, "grad_norm": 0.24644696712493896, "learning_rate": 1.5656707345583765e-05, "loss": 0.0467, "step": 73450 }, { "epoch": 2.0608781035208303, "grad_norm": 0.9226123094558716, "learning_rate": 1.5652031607986162e-05, "loss": 0.0629, "step": 73460 }, { "epoch": 2.061158647776687, "grad_norm": 0.07851668447256088, "learning_rate": 1.5647355870388555e-05, "loss": 0.0083, "step": 73470 }, { "epoch": 2.061439192032543, "grad_norm": 0.06522516906261444, "learning_rate": 1.5642680132790948e-05, "loss": 0.0077, "step": 73480 }, { "epoch": 2.0617197362883997, "grad_norm": 0.021879002451896667, "learning_rate": 1.563800439519334e-05, "loss": 0.0201, "step": 73490 }, { "epoch": 2.0620002805442557, "grad_norm": 0.22826221585273743, "learning_rate": 1.5633328657595738e-05, "loss": 0.0581, "step": 73500 }, { "epoch": 2.062280824800112, "grad_norm": 0.058766674250364304, "learning_rate": 1.562865291999813e-05, "loss": 0.0307, "step": 73510 }, { "epoch": 2.0625613690559685, "grad_norm": 0.19087713956832886, "learning_rate": 1.5623977182400524e-05, "loss": 0.0211, "step": 73520 }, { "epoch": 2.062841913311825, "grad_norm": 0.6448677778244019, "learning_rate": 1.561930144480292e-05, "loss": 0.0141, "step": 73530 }, { "epoch": 2.0631224575676814, "grad_norm": 0.6236250996589661, "learning_rate": 1.5614625707205314e-05, "loss": 0.0261, "step": 73540 }, { "epoch": 2.063403001823538, "grad_norm": 0.8513860106468201, "learning_rate": 1.5609949969607707e-05, "loss": 0.019, "step": 73550 }, { "epoch": 2.063683546079394, "grad_norm": 0.06218241527676582, "learning_rate": 1.56052742320101e-05, "loss": 0.0321, "step": 73560 }, { "epoch": 2.0639640903352503, "grad_norm": 1.0883586406707764, "learning_rate": 1.5600598494412493e-05, "loss": 0.0321, "step": 73570 }, { "epoch": 2.0642446345911067, "grad_norm": 0.8826009035110474, "learning_rate": 1.5595922756814886e-05, "loss": 0.0422, "step": 73580 }, { "epoch": 2.064525178846963, "grad_norm": 1.3377262353897095, "learning_rate": 1.559124701921728e-05, "loss": 0.0445, "step": 73590 }, { "epoch": 2.0648057231028196, "grad_norm": 0.3061227798461914, "learning_rate": 1.5586571281619676e-05, "loss": 0.0405, "step": 73600 }, { "epoch": 2.0650862673586756, "grad_norm": 1.1391140222549438, "learning_rate": 1.5581895544022073e-05, "loss": 0.0592, "step": 73610 }, { "epoch": 2.065366811614532, "grad_norm": 0.25042369961738586, "learning_rate": 1.5577219806424466e-05, "loss": 0.0402, "step": 73620 }, { "epoch": 2.0656473558703885, "grad_norm": 0.09072747826576233, "learning_rate": 1.557254406882686e-05, "loss": 0.0118, "step": 73630 }, { "epoch": 2.065927900126245, "grad_norm": 0.07338384538888931, "learning_rate": 1.5567868331229252e-05, "loss": 0.0324, "step": 73640 }, { "epoch": 2.0662084443821014, "grad_norm": 0.16646264493465424, "learning_rate": 1.5563192593631645e-05, "loss": 0.0126, "step": 73650 }, { "epoch": 2.066488988637958, "grad_norm": 0.10964058339595795, "learning_rate": 1.555851685603404e-05, "loss": 0.0229, "step": 73660 }, { "epoch": 2.066769532893814, "grad_norm": 0.07177871465682983, "learning_rate": 1.5553841118436435e-05, "loss": 0.0139, "step": 73670 }, { "epoch": 2.0670500771496703, "grad_norm": 0.5581681132316589, "learning_rate": 1.5549165380838828e-05, "loss": 0.0229, "step": 73680 }, { "epoch": 2.0673306214055267, "grad_norm": 0.06207374855875969, "learning_rate": 1.554448964324122e-05, "loss": 0.0181, "step": 73690 }, { "epoch": 2.067611165661383, "grad_norm": 0.28022676706314087, "learning_rate": 1.5539813905643614e-05, "loss": 0.0066, "step": 73700 }, { "epoch": 2.0678917099172396, "grad_norm": 0.06736638396978378, "learning_rate": 1.553513816804601e-05, "loss": 0.0483, "step": 73710 }, { "epoch": 2.0681722541730956, "grad_norm": 2.1492409706115723, "learning_rate": 1.5530462430448404e-05, "loss": 0.0453, "step": 73720 }, { "epoch": 2.068452798428952, "grad_norm": 0.13658329844474792, "learning_rate": 1.5525786692850797e-05, "loss": 0.0368, "step": 73730 }, { "epoch": 2.0687333426848085, "grad_norm": 0.4593747556209564, "learning_rate": 1.5521110955253194e-05, "loss": 0.0196, "step": 73740 }, { "epoch": 2.069013886940665, "grad_norm": 0.11515361815690994, "learning_rate": 1.5516435217655587e-05, "loss": 0.0266, "step": 73750 }, { "epoch": 2.0692944311965213, "grad_norm": 0.3085857927799225, "learning_rate": 1.551175948005798e-05, "loss": 0.0166, "step": 73760 }, { "epoch": 2.069574975452378, "grad_norm": 0.24849426746368408, "learning_rate": 1.5507083742460373e-05, "loss": 0.0142, "step": 73770 }, { "epoch": 2.069855519708234, "grad_norm": 0.5345064401626587, "learning_rate": 1.5502408004862766e-05, "loss": 0.0383, "step": 73780 }, { "epoch": 2.0701360639640902, "grad_norm": 5.610030174255371, "learning_rate": 1.549773226726516e-05, "loss": 0.0392, "step": 73790 }, { "epoch": 2.0704166082199467, "grad_norm": 0.47782641649246216, "learning_rate": 1.5493056529667556e-05, "loss": 0.0273, "step": 73800 }, { "epoch": 2.070697152475803, "grad_norm": 0.17249609529972076, "learning_rate": 1.548838079206995e-05, "loss": 0.0232, "step": 73810 }, { "epoch": 2.0709776967316595, "grad_norm": 0.2862936556339264, "learning_rate": 1.5483705054472346e-05, "loss": 0.0052, "step": 73820 }, { "epoch": 2.071258240987516, "grad_norm": 0.21291981637477875, "learning_rate": 1.547902931687474e-05, "loss": 0.0227, "step": 73830 }, { "epoch": 2.071538785243372, "grad_norm": 0.13683821260929108, "learning_rate": 1.5474353579277132e-05, "loss": 0.0122, "step": 73840 }, { "epoch": 2.0718193294992284, "grad_norm": 0.04138621687889099, "learning_rate": 1.5469677841679525e-05, "loss": 0.018, "step": 73850 }, { "epoch": 2.072099873755085, "grad_norm": 0.35944610834121704, "learning_rate": 1.546500210408192e-05, "loss": 0.0201, "step": 73860 }, { "epoch": 2.0723804180109413, "grad_norm": 0.07782509177923203, "learning_rate": 1.546032636648431e-05, "loss": 0.0157, "step": 73870 }, { "epoch": 2.0726609622667977, "grad_norm": 0.3336731195449829, "learning_rate": 1.5455650628886708e-05, "loss": 0.0174, "step": 73880 }, { "epoch": 2.0729415065226537, "grad_norm": 0.042712196707725525, "learning_rate": 1.54509748912891e-05, "loss": 0.0073, "step": 73890 }, { "epoch": 2.07322205077851, "grad_norm": 1.2612532377243042, "learning_rate": 1.5446299153691494e-05, "loss": 0.0091, "step": 73900 }, { "epoch": 2.0735025950343666, "grad_norm": 0.14892645180225372, "learning_rate": 1.544162341609389e-05, "loss": 0.0059, "step": 73910 }, { "epoch": 2.073783139290223, "grad_norm": 0.13734912872314453, "learning_rate": 1.5436947678496284e-05, "loss": 0.0207, "step": 73920 }, { "epoch": 2.0740636835460795, "grad_norm": 0.01667669415473938, "learning_rate": 1.5432271940898677e-05, "loss": 0.0078, "step": 73930 }, { "epoch": 2.074344227801936, "grad_norm": 0.13271930813789368, "learning_rate": 1.542759620330107e-05, "loss": 0.0205, "step": 73940 }, { "epoch": 2.074624772057792, "grad_norm": 0.4229695796966553, "learning_rate": 1.5422920465703467e-05, "loss": 0.0049, "step": 73950 }, { "epoch": 2.0749053163136484, "grad_norm": 0.3315017819404602, "learning_rate": 1.541824472810586e-05, "loss": 0.0189, "step": 73960 }, { "epoch": 2.075185860569505, "grad_norm": 0.040057096630334854, "learning_rate": 1.5413568990508253e-05, "loss": 0.0276, "step": 73970 }, { "epoch": 2.0754664048253613, "grad_norm": 0.20041939616203308, "learning_rate": 1.5408893252910646e-05, "loss": 0.0576, "step": 73980 }, { "epoch": 2.0757469490812177, "grad_norm": 0.024860981851816177, "learning_rate": 1.540421751531304e-05, "loss": 0.0131, "step": 73990 }, { "epoch": 2.076027493337074, "grad_norm": 0.4767327308654785, "learning_rate": 1.5399541777715433e-05, "loss": 0.0164, "step": 74000 }, { "epoch": 2.07630803759293, "grad_norm": 0.4012508690357208, "learning_rate": 1.539486604011783e-05, "loss": 0.044, "step": 74010 }, { "epoch": 2.0765885818487866, "grad_norm": 0.07768352329730988, "learning_rate": 1.5390190302520226e-05, "loss": 0.0113, "step": 74020 }, { "epoch": 2.076869126104643, "grad_norm": 0.42298710346221924, "learning_rate": 1.538551456492262e-05, "loss": 0.0147, "step": 74030 }, { "epoch": 2.0771496703604995, "grad_norm": 0.3065096437931061, "learning_rate": 1.5380838827325012e-05, "loss": 0.0084, "step": 74040 }, { "epoch": 2.077430214616356, "grad_norm": 0.3454776704311371, "learning_rate": 1.5376163089727405e-05, "loss": 0.0327, "step": 74050 }, { "epoch": 2.077710758872212, "grad_norm": 0.9152061343193054, "learning_rate": 1.53714873521298e-05, "loss": 0.0341, "step": 74060 }, { "epoch": 2.0779913031280683, "grad_norm": 0.17819319665431976, "learning_rate": 1.536681161453219e-05, "loss": 0.0078, "step": 74070 }, { "epoch": 2.078271847383925, "grad_norm": 0.20379169285297394, "learning_rate": 1.5362135876934585e-05, "loss": 0.0089, "step": 74080 }, { "epoch": 2.0785523916397812, "grad_norm": 0.20281319320201874, "learning_rate": 1.535746013933698e-05, "loss": 0.0126, "step": 74090 }, { "epoch": 2.0788329358956377, "grad_norm": 0.506401777267456, "learning_rate": 1.5352784401739374e-05, "loss": 0.0246, "step": 74100 }, { "epoch": 2.079113480151494, "grad_norm": 3.0851528644561768, "learning_rate": 1.5348108664141768e-05, "loss": 0.0225, "step": 74110 }, { "epoch": 2.07939402440735, "grad_norm": 0.5257904529571533, "learning_rate": 1.5343432926544164e-05, "loss": 0.0206, "step": 74120 }, { "epoch": 2.0796745686632065, "grad_norm": 0.10736420750617981, "learning_rate": 1.5338757188946557e-05, "loss": 0.0129, "step": 74130 }, { "epoch": 2.079955112919063, "grad_norm": 1.1532667875289917, "learning_rate": 1.533408145134895e-05, "loss": 0.0126, "step": 74140 }, { "epoch": 2.0802356571749194, "grad_norm": 0.03981836885213852, "learning_rate": 1.5329405713751347e-05, "loss": 0.0162, "step": 74150 }, { "epoch": 2.080516201430776, "grad_norm": 0.3222828209400177, "learning_rate": 1.532472997615374e-05, "loss": 0.0435, "step": 74160 }, { "epoch": 2.080796745686632, "grad_norm": 0.2892175018787384, "learning_rate": 1.5320054238556133e-05, "loss": 0.0299, "step": 74170 }, { "epoch": 2.0810772899424883, "grad_norm": 0.062178656458854675, "learning_rate": 1.5315378500958526e-05, "loss": 0.015, "step": 74180 }, { "epoch": 2.0813578341983447, "grad_norm": 0.008513865061104298, "learning_rate": 1.531070276336092e-05, "loss": 0.0273, "step": 74190 }, { "epoch": 2.081638378454201, "grad_norm": 0.0467434860765934, "learning_rate": 1.5306027025763313e-05, "loss": 0.0193, "step": 74200 }, { "epoch": 2.0819189227100576, "grad_norm": 0.28070101141929626, "learning_rate": 1.530135128816571e-05, "loss": 0.0295, "step": 74210 }, { "epoch": 2.082199466965914, "grad_norm": 0.22963251173496246, "learning_rate": 1.5296675550568103e-05, "loss": 0.0095, "step": 74220 }, { "epoch": 2.08248001122177, "grad_norm": 0.03449910506606102, "learning_rate": 1.52919998129705e-05, "loss": 0.0306, "step": 74230 }, { "epoch": 2.0827605554776265, "grad_norm": 0.3411256968975067, "learning_rate": 1.5287324075372892e-05, "loss": 0.0402, "step": 74240 }, { "epoch": 2.083041099733483, "grad_norm": 0.13774406909942627, "learning_rate": 1.5282648337775285e-05, "loss": 0.0292, "step": 74250 }, { "epoch": 2.0833216439893394, "grad_norm": 0.03363718464970589, "learning_rate": 1.527797260017768e-05, "loss": 0.0239, "step": 74260 }, { "epoch": 2.083602188245196, "grad_norm": 0.04285508394241333, "learning_rate": 1.527329686258007e-05, "loss": 0.0092, "step": 74270 }, { "epoch": 2.083882732501052, "grad_norm": 0.14406511187553406, "learning_rate": 1.5268621124982465e-05, "loss": 0.02, "step": 74280 }, { "epoch": 2.0841632767569083, "grad_norm": 0.010899534448981285, "learning_rate": 1.526394538738486e-05, "loss": 0.0317, "step": 74290 }, { "epoch": 2.0844438210127647, "grad_norm": 0.3576441705226898, "learning_rate": 1.5259269649787255e-05, "loss": 0.0308, "step": 74300 }, { "epoch": 2.084724365268621, "grad_norm": 0.15456362068653107, "learning_rate": 1.525459391218965e-05, "loss": 0.0104, "step": 74310 }, { "epoch": 2.0850049095244776, "grad_norm": 0.25208282470703125, "learning_rate": 1.5249918174592043e-05, "loss": 0.0363, "step": 74320 }, { "epoch": 2.085285453780334, "grad_norm": 0.24055905640125275, "learning_rate": 1.5245242436994436e-05, "loss": 0.0274, "step": 74330 }, { "epoch": 2.08556599803619, "grad_norm": 1.5236423015594482, "learning_rate": 1.524056669939683e-05, "loss": 0.017, "step": 74340 }, { "epoch": 2.0858465422920465, "grad_norm": 0.07185245305299759, "learning_rate": 1.5235890961799224e-05, "loss": 0.0179, "step": 74350 }, { "epoch": 2.086127086547903, "grad_norm": 0.032451480627059937, "learning_rate": 1.523121522420162e-05, "loss": 0.0063, "step": 74360 }, { "epoch": 2.0864076308037593, "grad_norm": 0.03854461759328842, "learning_rate": 1.5226539486604013e-05, "loss": 0.0173, "step": 74370 }, { "epoch": 2.086688175059616, "grad_norm": 0.35962244868278503, "learning_rate": 1.5221863749006407e-05, "loss": 0.0116, "step": 74380 }, { "epoch": 2.086968719315472, "grad_norm": 0.3534540832042694, "learning_rate": 1.5217188011408801e-05, "loss": 0.0246, "step": 74390 }, { "epoch": 2.0872492635713282, "grad_norm": 0.5152592658996582, "learning_rate": 1.5212512273811195e-05, "loss": 0.0111, "step": 74400 }, { "epoch": 2.0875298078271847, "grad_norm": 0.005927411839365959, "learning_rate": 1.5207836536213588e-05, "loss": 0.0172, "step": 74410 }, { "epoch": 2.087810352083041, "grad_norm": 0.00497502600774169, "learning_rate": 1.5203160798615981e-05, "loss": 0.0099, "step": 74420 }, { "epoch": 2.0880908963388976, "grad_norm": 0.04509378969669342, "learning_rate": 1.5198485061018377e-05, "loss": 0.0147, "step": 74430 }, { "epoch": 2.088371440594754, "grad_norm": 0.9425305128097534, "learning_rate": 1.519380932342077e-05, "loss": 0.0171, "step": 74440 }, { "epoch": 2.08865198485061, "grad_norm": 1.0205729007720947, "learning_rate": 1.5189133585823165e-05, "loss": 0.0326, "step": 74450 }, { "epoch": 2.0889325291064664, "grad_norm": 0.019398480653762817, "learning_rate": 1.5184457848225559e-05, "loss": 0.0032, "step": 74460 }, { "epoch": 2.089213073362323, "grad_norm": 0.0667976438999176, "learning_rate": 1.5179782110627952e-05, "loss": 0.0149, "step": 74470 }, { "epoch": 2.0894936176181793, "grad_norm": 0.15200097858905792, "learning_rate": 1.5175106373030345e-05, "loss": 0.0073, "step": 74480 }, { "epoch": 2.0897741618740358, "grad_norm": 0.06984066218137741, "learning_rate": 1.517043063543274e-05, "loss": 0.0038, "step": 74490 }, { "epoch": 2.090054706129892, "grad_norm": 0.037111151963472366, "learning_rate": 1.5165754897835136e-05, "loss": 0.0054, "step": 74500 }, { "epoch": 2.090335250385748, "grad_norm": 0.10669298470020294, "learning_rate": 1.516107916023753e-05, "loss": 0.0266, "step": 74510 }, { "epoch": 2.0906157946416046, "grad_norm": 3.368622064590454, "learning_rate": 1.5156403422639923e-05, "loss": 0.032, "step": 74520 }, { "epoch": 2.090896338897461, "grad_norm": 0.015343297272920609, "learning_rate": 1.5151727685042316e-05, "loss": 0.0258, "step": 74530 }, { "epoch": 2.0911768831533175, "grad_norm": 0.028665553778409958, "learning_rate": 1.514705194744471e-05, "loss": 0.0063, "step": 74540 }, { "epoch": 2.091457427409174, "grad_norm": 1.262582778930664, "learning_rate": 1.5142376209847104e-05, "loss": 0.0222, "step": 74550 }, { "epoch": 2.09173797166503, "grad_norm": 0.10570221394300461, "learning_rate": 1.5137700472249497e-05, "loss": 0.0034, "step": 74560 }, { "epoch": 2.0920185159208864, "grad_norm": 1.9636986255645752, "learning_rate": 1.5133024734651893e-05, "loss": 0.0198, "step": 74570 }, { "epoch": 2.092299060176743, "grad_norm": 0.008044307120144367, "learning_rate": 1.5128348997054287e-05, "loss": 0.0299, "step": 74580 }, { "epoch": 2.0925796044325993, "grad_norm": 0.01297242846339941, "learning_rate": 1.512367325945668e-05, "loss": 0.0202, "step": 74590 }, { "epoch": 2.0928601486884557, "grad_norm": 0.05942341685295105, "learning_rate": 1.5118997521859075e-05, "loss": 0.0444, "step": 74600 }, { "epoch": 2.093140692944312, "grad_norm": 0.24111518263816833, "learning_rate": 1.5114321784261468e-05, "loss": 0.0254, "step": 74610 }, { "epoch": 2.093421237200168, "grad_norm": 0.20373490452766418, "learning_rate": 1.5109646046663861e-05, "loss": 0.0102, "step": 74620 }, { "epoch": 2.0937017814560246, "grad_norm": 0.20218592882156372, "learning_rate": 1.5104970309066254e-05, "loss": 0.0232, "step": 74630 }, { "epoch": 2.093982325711881, "grad_norm": 0.007242775056511164, "learning_rate": 1.510029457146865e-05, "loss": 0.0207, "step": 74640 }, { "epoch": 2.0942628699677375, "grad_norm": 0.38159891963005066, "learning_rate": 1.5095618833871045e-05, "loss": 0.0164, "step": 74650 }, { "epoch": 2.094543414223594, "grad_norm": 0.2092028707265854, "learning_rate": 1.5090943096273439e-05, "loss": 0.0484, "step": 74660 }, { "epoch": 2.0948239584794504, "grad_norm": 0.9456270933151245, "learning_rate": 1.5086267358675832e-05, "loss": 0.0137, "step": 74670 }, { "epoch": 2.0951045027353064, "grad_norm": 0.1749448925256729, "learning_rate": 1.5081591621078225e-05, "loss": 0.0085, "step": 74680 }, { "epoch": 2.095385046991163, "grad_norm": 0.4665398597717285, "learning_rate": 1.507691588348062e-05, "loss": 0.0251, "step": 74690 }, { "epoch": 2.0956655912470192, "grad_norm": 0.05289607122540474, "learning_rate": 1.5072240145883013e-05, "loss": 0.0295, "step": 74700 }, { "epoch": 2.0959461355028757, "grad_norm": 0.1704685091972351, "learning_rate": 1.506756440828541e-05, "loss": 0.0164, "step": 74710 }, { "epoch": 2.096226679758732, "grad_norm": 0.05137210711836815, "learning_rate": 1.5062888670687803e-05, "loss": 0.0125, "step": 74720 }, { "epoch": 2.096507224014588, "grad_norm": 0.052150338888168335, "learning_rate": 1.5058212933090196e-05, "loss": 0.0386, "step": 74730 }, { "epoch": 2.0967877682704446, "grad_norm": 0.13915985822677612, "learning_rate": 1.5053537195492589e-05, "loss": 0.0149, "step": 74740 }, { "epoch": 2.097068312526301, "grad_norm": 0.08982928097248077, "learning_rate": 1.5048861457894984e-05, "loss": 0.0052, "step": 74750 }, { "epoch": 2.0973488567821574, "grad_norm": 0.37158194184303284, "learning_rate": 1.5044185720297377e-05, "loss": 0.0189, "step": 74760 }, { "epoch": 2.097629401038014, "grad_norm": 0.29677262902259827, "learning_rate": 1.503950998269977e-05, "loss": 0.0087, "step": 74770 }, { "epoch": 2.0979099452938703, "grad_norm": 0.06097070500254631, "learning_rate": 1.5034834245102167e-05, "loss": 0.0417, "step": 74780 }, { "epoch": 2.0981904895497263, "grad_norm": 1.501252293586731, "learning_rate": 1.503015850750456e-05, "loss": 0.0272, "step": 74790 }, { "epoch": 2.0984710338055828, "grad_norm": 0.048277854919433594, "learning_rate": 1.5025482769906955e-05, "loss": 0.0752, "step": 74800 }, { "epoch": 2.098751578061439, "grad_norm": 0.024468624964356422, "learning_rate": 1.5020807032309348e-05, "loss": 0.016, "step": 74810 }, { "epoch": 2.0990321223172956, "grad_norm": 0.391293466091156, "learning_rate": 1.5016131294711741e-05, "loss": 0.0166, "step": 74820 }, { "epoch": 2.099312666573152, "grad_norm": 0.038136761635541916, "learning_rate": 1.5011455557114134e-05, "loss": 0.0211, "step": 74830 }, { "epoch": 2.099593210829008, "grad_norm": 0.14332973957061768, "learning_rate": 1.5006779819516529e-05, "loss": 0.0339, "step": 74840 }, { "epoch": 2.0998737550848645, "grad_norm": 0.16903717815876007, "learning_rate": 1.5002104081918924e-05, "loss": 0.0177, "step": 74850 }, { "epoch": 2.100154299340721, "grad_norm": 0.13830453157424927, "learning_rate": 1.4997428344321319e-05, "loss": 0.0057, "step": 74860 }, { "epoch": 2.1004348435965774, "grad_norm": 0.23722688853740692, "learning_rate": 1.4992752606723712e-05, "loss": 0.0045, "step": 74870 }, { "epoch": 2.100715387852434, "grad_norm": 0.6058448553085327, "learning_rate": 1.4988076869126105e-05, "loss": 0.0273, "step": 74880 }, { "epoch": 2.1009959321082903, "grad_norm": 0.8823527097702026, "learning_rate": 1.4983401131528498e-05, "loss": 0.0368, "step": 74890 }, { "epoch": 2.1012764763641463, "grad_norm": 0.27000901103019714, "learning_rate": 1.4978725393930893e-05, "loss": 0.0283, "step": 74900 }, { "epoch": 2.1015570206200027, "grad_norm": 0.517725944519043, "learning_rate": 1.4974049656333286e-05, "loss": 0.007, "step": 74910 }, { "epoch": 2.101837564875859, "grad_norm": 0.0411214679479599, "learning_rate": 1.4969373918735683e-05, "loss": 0.0065, "step": 74920 }, { "epoch": 2.1021181091317156, "grad_norm": 0.3572632670402527, "learning_rate": 1.4964698181138076e-05, "loss": 0.0082, "step": 74930 }, { "epoch": 2.102398653387572, "grad_norm": 0.10215938091278076, "learning_rate": 1.4960022443540469e-05, "loss": 0.0116, "step": 74940 }, { "epoch": 2.102679197643428, "grad_norm": 0.015481202863156796, "learning_rate": 1.4955346705942864e-05, "loss": 0.0103, "step": 74950 }, { "epoch": 2.1029597418992845, "grad_norm": 0.6829631328582764, "learning_rate": 1.4950670968345257e-05, "loss": 0.0053, "step": 74960 }, { "epoch": 2.103240286155141, "grad_norm": 0.06656390428543091, "learning_rate": 1.494599523074765e-05, "loss": 0.0077, "step": 74970 }, { "epoch": 2.1035208304109974, "grad_norm": 0.5257898569107056, "learning_rate": 1.4941319493150043e-05, "loss": 0.0073, "step": 74980 }, { "epoch": 2.103801374666854, "grad_norm": 0.1910233497619629, "learning_rate": 1.493664375555244e-05, "loss": 0.0204, "step": 74990 }, { "epoch": 2.1040819189227102, "grad_norm": 0.04113131761550903, "learning_rate": 1.4931968017954833e-05, "loss": 0.0225, "step": 75000 }, { "epoch": 2.1043624631785662, "grad_norm": 0.012880293652415276, "learning_rate": 1.4927292280357228e-05, "loss": 0.0176, "step": 75010 }, { "epoch": 2.1046430074344227, "grad_norm": 0.004219804424792528, "learning_rate": 1.4922616542759621e-05, "loss": 0.0178, "step": 75020 }, { "epoch": 2.104923551690279, "grad_norm": 1.2884629964828491, "learning_rate": 1.4917940805162014e-05, "loss": 0.0405, "step": 75030 }, { "epoch": 2.1052040959461356, "grad_norm": 0.12187236547470093, "learning_rate": 1.4913265067564407e-05, "loss": 0.0285, "step": 75040 }, { "epoch": 2.105484640201992, "grad_norm": 0.043231528252363205, "learning_rate": 1.4908589329966802e-05, "loss": 0.0182, "step": 75050 }, { "epoch": 2.1057651844578484, "grad_norm": 0.34814298152923584, "learning_rate": 1.4903913592369199e-05, "loss": 0.0215, "step": 75060 }, { "epoch": 2.1060457287137044, "grad_norm": 0.04961239919066429, "learning_rate": 1.4899237854771592e-05, "loss": 0.0069, "step": 75070 }, { "epoch": 2.106326272969561, "grad_norm": 0.3267940282821655, "learning_rate": 1.4894562117173985e-05, "loss": 0.0595, "step": 75080 }, { "epoch": 2.1066068172254173, "grad_norm": 0.21714060008525848, "learning_rate": 1.4889886379576378e-05, "loss": 0.0422, "step": 75090 }, { "epoch": 2.1068873614812738, "grad_norm": 0.10042203217744827, "learning_rate": 1.4885210641978773e-05, "loss": 0.0127, "step": 75100 }, { "epoch": 2.10716790573713, "grad_norm": 0.039832837879657745, "learning_rate": 1.4880534904381166e-05, "loss": 0.0103, "step": 75110 }, { "epoch": 2.107448449992986, "grad_norm": 2.912767171859741, "learning_rate": 1.487585916678356e-05, "loss": 0.0598, "step": 75120 }, { "epoch": 2.1077289942488426, "grad_norm": 1.3550115823745728, "learning_rate": 1.4871183429185956e-05, "loss": 0.033, "step": 75130 }, { "epoch": 2.108009538504699, "grad_norm": 0.035447776317596436, "learning_rate": 1.4866507691588349e-05, "loss": 0.0236, "step": 75140 }, { "epoch": 2.1082900827605555, "grad_norm": 0.5597656965255737, "learning_rate": 1.4861831953990742e-05, "loss": 0.0476, "step": 75150 }, { "epoch": 2.108570627016412, "grad_norm": 0.6114969253540039, "learning_rate": 1.4857156216393137e-05, "loss": 0.032, "step": 75160 }, { "epoch": 2.1088511712722684, "grad_norm": 0.3169812858104706, "learning_rate": 1.485248047879553e-05, "loss": 0.0391, "step": 75170 }, { "epoch": 2.1091317155281244, "grad_norm": 0.5841463804244995, "learning_rate": 1.4847804741197923e-05, "loss": 0.0503, "step": 75180 }, { "epoch": 2.109412259783981, "grad_norm": 0.07556121051311493, "learning_rate": 1.4843129003600316e-05, "loss": 0.0314, "step": 75190 }, { "epoch": 2.1096928040398373, "grad_norm": 0.9870970845222473, "learning_rate": 1.4838453266002713e-05, "loss": 0.0237, "step": 75200 }, { "epoch": 2.1099733482956937, "grad_norm": 0.6382038593292236, "learning_rate": 1.4833777528405108e-05, "loss": 0.0296, "step": 75210 }, { "epoch": 2.11025389255155, "grad_norm": 0.24618050456047058, "learning_rate": 1.4829101790807501e-05, "loss": 0.0098, "step": 75220 }, { "epoch": 2.110534436807406, "grad_norm": 0.10169859230518341, "learning_rate": 1.4824426053209894e-05, "loss": 0.0151, "step": 75230 }, { "epoch": 2.1108149810632626, "grad_norm": 0.0791398361325264, "learning_rate": 1.4819750315612287e-05, "loss": 0.0259, "step": 75240 }, { "epoch": 2.111095525319119, "grad_norm": 0.10298183560371399, "learning_rate": 1.4815074578014682e-05, "loss": 0.0093, "step": 75250 }, { "epoch": 2.1113760695749755, "grad_norm": 0.025226036086678505, "learning_rate": 1.4810398840417075e-05, "loss": 0.0084, "step": 75260 }, { "epoch": 2.111656613830832, "grad_norm": 3.372272491455078, "learning_rate": 1.4805723102819472e-05, "loss": 0.0302, "step": 75270 }, { "epoch": 2.1119371580866884, "grad_norm": 0.08001767843961716, "learning_rate": 1.4801047365221865e-05, "loss": 0.0151, "step": 75280 }, { "epoch": 2.1122177023425444, "grad_norm": 0.0677795335650444, "learning_rate": 1.4796371627624258e-05, "loss": 0.0197, "step": 75290 }, { "epoch": 2.112498246598401, "grad_norm": 0.05020041763782501, "learning_rate": 1.4791695890026653e-05, "loss": 0.0314, "step": 75300 }, { "epoch": 2.1127787908542572, "grad_norm": 0.03519677370786667, "learning_rate": 1.4787020152429046e-05, "loss": 0.0069, "step": 75310 }, { "epoch": 2.1130593351101137, "grad_norm": 0.5248537659645081, "learning_rate": 1.478234441483144e-05, "loss": 0.031, "step": 75320 }, { "epoch": 2.11333987936597, "grad_norm": 0.040356773883104324, "learning_rate": 1.4777668677233833e-05, "loss": 0.0165, "step": 75330 }, { "epoch": 2.1136204236218266, "grad_norm": 0.31721407175064087, "learning_rate": 1.4772992939636229e-05, "loss": 0.0693, "step": 75340 }, { "epoch": 2.1139009678776826, "grad_norm": 0.49542108178138733, "learning_rate": 1.4768317202038622e-05, "loss": 0.0421, "step": 75350 }, { "epoch": 2.114181512133539, "grad_norm": 0.3341558575630188, "learning_rate": 1.4763641464441017e-05, "loss": 0.0274, "step": 75360 }, { "epoch": 2.1144620563893954, "grad_norm": 0.1274924874305725, "learning_rate": 1.475896572684341e-05, "loss": 0.0128, "step": 75370 }, { "epoch": 2.114742600645252, "grad_norm": 0.05442224442958832, "learning_rate": 1.4754289989245803e-05, "loss": 0.0079, "step": 75380 }, { "epoch": 2.1150231449011083, "grad_norm": 0.029164910316467285, "learning_rate": 1.4749614251648197e-05, "loss": 0.0547, "step": 75390 }, { "epoch": 2.1153036891569643, "grad_norm": 0.34958750009536743, "learning_rate": 1.4744938514050593e-05, "loss": 0.0107, "step": 75400 }, { "epoch": 2.1155842334128208, "grad_norm": 0.11112283915281296, "learning_rate": 1.4740262776452988e-05, "loss": 0.0059, "step": 75410 }, { "epoch": 2.115864777668677, "grad_norm": 0.06654394418001175, "learning_rate": 1.4735587038855381e-05, "loss": 0.0082, "step": 75420 }, { "epoch": 2.1161453219245336, "grad_norm": 0.06143626198172569, "learning_rate": 1.4730911301257774e-05, "loss": 0.0284, "step": 75430 }, { "epoch": 2.11642586618039, "grad_norm": 0.06493736803531647, "learning_rate": 1.4726235563660167e-05, "loss": 0.0344, "step": 75440 }, { "epoch": 2.1167064104362465, "grad_norm": 0.05292629450559616, "learning_rate": 1.4721559826062562e-05, "loss": 0.0265, "step": 75450 }, { "epoch": 2.1169869546921025, "grad_norm": 0.15413804352283478, "learning_rate": 1.4716884088464955e-05, "loss": 0.0133, "step": 75460 }, { "epoch": 2.117267498947959, "grad_norm": 0.026772160083055496, "learning_rate": 1.4712208350867352e-05, "loss": 0.0357, "step": 75470 }, { "epoch": 2.1175480432038154, "grad_norm": 0.06973902136087418, "learning_rate": 1.4707532613269745e-05, "loss": 0.021, "step": 75480 }, { "epoch": 2.117828587459672, "grad_norm": 0.5638455152511597, "learning_rate": 1.4702856875672138e-05, "loss": 0.0456, "step": 75490 }, { "epoch": 2.1181091317155283, "grad_norm": 0.04954998567700386, "learning_rate": 1.4698181138074531e-05, "loss": 0.0104, "step": 75500 }, { "epoch": 2.1183896759713843, "grad_norm": 0.26585736870765686, "learning_rate": 1.4693505400476926e-05, "loss": 0.0337, "step": 75510 }, { "epoch": 2.1186702202272407, "grad_norm": 0.4995632469654083, "learning_rate": 1.468882966287932e-05, "loss": 0.0097, "step": 75520 }, { "epoch": 2.118950764483097, "grad_norm": 0.18740060925483704, "learning_rate": 1.4684153925281713e-05, "loss": 0.0198, "step": 75530 }, { "epoch": 2.1192313087389536, "grad_norm": 0.3337761163711548, "learning_rate": 1.4679478187684109e-05, "loss": 0.0249, "step": 75540 }, { "epoch": 2.11951185299481, "grad_norm": 0.3685028851032257, "learning_rate": 1.4674802450086502e-05, "loss": 0.015, "step": 75550 }, { "epoch": 2.1197923972506665, "grad_norm": 0.03855695575475693, "learning_rate": 1.4670126712488897e-05, "loss": 0.0058, "step": 75560 }, { "epoch": 2.1200729415065225, "grad_norm": 0.22918501496315002, "learning_rate": 1.466545097489129e-05, "loss": 0.0103, "step": 75570 }, { "epoch": 2.120353485762379, "grad_norm": 0.2117370218038559, "learning_rate": 1.4660775237293683e-05, "loss": 0.0337, "step": 75580 }, { "epoch": 2.1206340300182354, "grad_norm": 0.013422888703644276, "learning_rate": 1.4656099499696077e-05, "loss": 0.014, "step": 75590 }, { "epoch": 2.120914574274092, "grad_norm": 0.02335825189948082, "learning_rate": 1.4651423762098471e-05, "loss": 0.0143, "step": 75600 }, { "epoch": 2.1211951185299482, "grad_norm": 0.34589463472366333, "learning_rate": 1.4646748024500866e-05, "loss": 0.0242, "step": 75610 }, { "epoch": 2.1214756627858042, "grad_norm": 0.03664247691631317, "learning_rate": 1.4642072286903261e-05, "loss": 0.0273, "step": 75620 }, { "epoch": 2.1217562070416607, "grad_norm": 0.06664524227380753, "learning_rate": 1.4637396549305654e-05, "loss": 0.01, "step": 75630 }, { "epoch": 2.122036751297517, "grad_norm": 0.013639991171658039, "learning_rate": 1.4632720811708047e-05, "loss": 0.0075, "step": 75640 }, { "epoch": 2.1223172955533736, "grad_norm": 0.15815916657447815, "learning_rate": 1.462804507411044e-05, "loss": 0.002, "step": 75650 }, { "epoch": 2.12259783980923, "grad_norm": 0.15515649318695068, "learning_rate": 1.4623369336512835e-05, "loss": 0.0067, "step": 75660 }, { "epoch": 2.1228783840650864, "grad_norm": 0.02193688228726387, "learning_rate": 1.4618693598915229e-05, "loss": 0.0242, "step": 75670 }, { "epoch": 2.1231589283209424, "grad_norm": 0.015641991049051285, "learning_rate": 1.4614017861317625e-05, "loss": 0.0489, "step": 75680 }, { "epoch": 2.123439472576799, "grad_norm": 0.20500671863555908, "learning_rate": 1.4609342123720018e-05, "loss": 0.013, "step": 75690 }, { "epoch": 2.1237200168326553, "grad_norm": 0.05628032237291336, "learning_rate": 1.4604666386122411e-05, "loss": 0.0251, "step": 75700 }, { "epoch": 2.1240005610885118, "grad_norm": 0.7142986059188843, "learning_rate": 1.4599990648524806e-05, "loss": 0.0258, "step": 75710 }, { "epoch": 2.124281105344368, "grad_norm": 13.418041229248047, "learning_rate": 1.45953149109272e-05, "loss": 0.0289, "step": 75720 }, { "epoch": 2.1245616496002246, "grad_norm": 0.23739749193191528, "learning_rate": 1.4590639173329593e-05, "loss": 0.012, "step": 75730 }, { "epoch": 2.1248421938560806, "grad_norm": 0.37034204602241516, "learning_rate": 1.4585963435731986e-05, "loss": 0.0358, "step": 75740 }, { "epoch": 2.125122738111937, "grad_norm": 0.01868264190852642, "learning_rate": 1.4581287698134382e-05, "loss": 0.0286, "step": 75750 }, { "epoch": 2.1254032823677935, "grad_norm": 0.5510637760162354, "learning_rate": 1.4576611960536775e-05, "loss": 0.0426, "step": 75760 }, { "epoch": 2.12568382662365, "grad_norm": 0.5452149510383606, "learning_rate": 1.457193622293917e-05, "loss": 0.0092, "step": 75770 }, { "epoch": 2.1259643708795064, "grad_norm": 0.01512143388390541, "learning_rate": 1.4567260485341563e-05, "loss": 0.0038, "step": 75780 }, { "epoch": 2.1262449151353624, "grad_norm": 0.05462921783328056, "learning_rate": 1.4562584747743957e-05, "loss": 0.0125, "step": 75790 }, { "epoch": 2.126525459391219, "grad_norm": 0.018741736188530922, "learning_rate": 1.455790901014635e-05, "loss": 0.0215, "step": 75800 }, { "epoch": 2.1268060036470753, "grad_norm": 0.05697593465447426, "learning_rate": 1.4553233272548745e-05, "loss": 0.0174, "step": 75810 }, { "epoch": 2.1270865479029317, "grad_norm": 0.006795960478484631, "learning_rate": 1.4548557534951141e-05, "loss": 0.0224, "step": 75820 }, { "epoch": 2.127367092158788, "grad_norm": 0.012536952272057533, "learning_rate": 1.4543881797353534e-05, "loss": 0.0162, "step": 75830 }, { "epoch": 2.1276476364146446, "grad_norm": 0.036653291434049606, "learning_rate": 1.4539206059755927e-05, "loss": 0.0198, "step": 75840 }, { "epoch": 2.1279281806705006, "grad_norm": 0.03375743702054024, "learning_rate": 1.453453032215832e-05, "loss": 0.0403, "step": 75850 }, { "epoch": 2.128208724926357, "grad_norm": 0.03296864032745361, "learning_rate": 1.4529854584560715e-05, "loss": 0.0134, "step": 75860 }, { "epoch": 2.1284892691822135, "grad_norm": 0.03129351884126663, "learning_rate": 1.4525178846963109e-05, "loss": 0.0236, "step": 75870 }, { "epoch": 2.12876981343807, "grad_norm": 0.40605252981185913, "learning_rate": 1.4520503109365502e-05, "loss": 0.0302, "step": 75880 }, { "epoch": 2.1290503576939264, "grad_norm": 0.08644959330558777, "learning_rate": 1.4515827371767898e-05, "loss": 0.0246, "step": 75890 }, { "epoch": 2.129330901949783, "grad_norm": 0.22515186667442322, "learning_rate": 1.4511151634170291e-05, "loss": 0.0197, "step": 75900 }, { "epoch": 2.129611446205639, "grad_norm": 0.17417116463184357, "learning_rate": 1.4506475896572685e-05, "loss": 0.021, "step": 75910 }, { "epoch": 2.1298919904614952, "grad_norm": 0.031239699572324753, "learning_rate": 1.450180015897508e-05, "loss": 0.0238, "step": 75920 }, { "epoch": 2.1301725347173517, "grad_norm": 0.01639951951801777, "learning_rate": 1.4497124421377473e-05, "loss": 0.0051, "step": 75930 }, { "epoch": 2.130453078973208, "grad_norm": 0.02344634011387825, "learning_rate": 1.4492448683779866e-05, "loss": 0.0162, "step": 75940 }, { "epoch": 2.1307336232290646, "grad_norm": 0.2103407084941864, "learning_rate": 1.4487772946182259e-05, "loss": 0.0157, "step": 75950 }, { "epoch": 2.1310141674849206, "grad_norm": 0.2047588974237442, "learning_rate": 1.4483097208584656e-05, "loss": 0.0101, "step": 75960 }, { "epoch": 2.131294711740777, "grad_norm": 0.6977762579917908, "learning_rate": 1.447842147098705e-05, "loss": 0.0275, "step": 75970 }, { "epoch": 2.1315752559966334, "grad_norm": 0.012671127915382385, "learning_rate": 1.4473745733389444e-05, "loss": 0.0044, "step": 75980 }, { "epoch": 2.13185580025249, "grad_norm": 0.03135411813855171, "learning_rate": 1.4469069995791837e-05, "loss": 0.003, "step": 75990 }, { "epoch": 2.1321363445083463, "grad_norm": 0.021757982671260834, "learning_rate": 1.446439425819423e-05, "loss": 0.0103, "step": 76000 }, { "epoch": 2.1324168887642028, "grad_norm": 0.012173091992735863, "learning_rate": 1.4459718520596625e-05, "loss": 0.0058, "step": 76010 }, { "epoch": 2.1326974330200588, "grad_norm": 0.03802630677819252, "learning_rate": 1.4455042782999018e-05, "loss": 0.0078, "step": 76020 }, { "epoch": 2.132977977275915, "grad_norm": 0.17839252948760986, "learning_rate": 1.4450367045401414e-05, "loss": 0.0201, "step": 76030 }, { "epoch": 2.1332585215317716, "grad_norm": 0.047945424914360046, "learning_rate": 1.4445691307803808e-05, "loss": 0.0158, "step": 76040 }, { "epoch": 2.133539065787628, "grad_norm": 0.5485140085220337, "learning_rate": 1.44410155702062e-05, "loss": 0.027, "step": 76050 }, { "epoch": 2.1338196100434845, "grad_norm": 0.17683865129947662, "learning_rate": 1.4436339832608594e-05, "loss": 0.019, "step": 76060 }, { "epoch": 2.1341001542993405, "grad_norm": 0.01879556104540825, "learning_rate": 1.4431664095010989e-05, "loss": 0.0149, "step": 76070 }, { "epoch": 2.134380698555197, "grad_norm": 0.016470570117235184, "learning_rate": 1.4426988357413382e-05, "loss": 0.0215, "step": 76080 }, { "epoch": 2.1346612428110534, "grad_norm": 0.028908582404255867, "learning_rate": 1.4422312619815775e-05, "loss": 0.0154, "step": 76090 }, { "epoch": 2.13494178706691, "grad_norm": 0.06788532435894012, "learning_rate": 1.4417636882218172e-05, "loss": 0.0159, "step": 76100 }, { "epoch": 2.1352223313227663, "grad_norm": 0.548032820224762, "learning_rate": 1.4412961144620565e-05, "loss": 0.0107, "step": 76110 }, { "epoch": 2.1355028755786227, "grad_norm": 0.2279454916715622, "learning_rate": 1.440828540702296e-05, "loss": 0.0171, "step": 76120 }, { "epoch": 2.1357834198344787, "grad_norm": 1.434791922569275, "learning_rate": 1.4403609669425353e-05, "loss": 0.0133, "step": 76130 }, { "epoch": 2.136063964090335, "grad_norm": 0.1436769962310791, "learning_rate": 1.4398933931827746e-05, "loss": 0.0261, "step": 76140 }, { "epoch": 2.1363445083461916, "grad_norm": 1.4535490274429321, "learning_rate": 1.4394258194230139e-05, "loss": 0.0108, "step": 76150 }, { "epoch": 2.136625052602048, "grad_norm": 0.08381864428520203, "learning_rate": 1.4389582456632534e-05, "loss": 0.0099, "step": 76160 }, { "epoch": 2.1369055968579045, "grad_norm": 0.0304171871393919, "learning_rate": 1.438490671903493e-05, "loss": 0.0359, "step": 76170 }, { "epoch": 2.1371861411137605, "grad_norm": 1.805122971534729, "learning_rate": 1.4380230981437324e-05, "loss": 0.0348, "step": 76180 }, { "epoch": 2.137466685369617, "grad_norm": 0.11114463210105896, "learning_rate": 1.4375555243839717e-05, "loss": 0.0181, "step": 76190 }, { "epoch": 2.1377472296254734, "grad_norm": 0.22260068356990814, "learning_rate": 1.437087950624211e-05, "loss": 0.0115, "step": 76200 }, { "epoch": 2.13802777388133, "grad_norm": 0.034528233110904694, "learning_rate": 1.4366203768644505e-05, "loss": 0.0127, "step": 76210 }, { "epoch": 2.1383083181371862, "grad_norm": 0.2375493347644806, "learning_rate": 1.4361528031046898e-05, "loss": 0.0122, "step": 76220 }, { "epoch": 2.1385888623930427, "grad_norm": 0.04787379503250122, "learning_rate": 1.4356852293449291e-05, "loss": 0.0203, "step": 76230 }, { "epoch": 2.1388694066488987, "grad_norm": 0.008625268004834652, "learning_rate": 1.4352176555851688e-05, "loss": 0.0178, "step": 76240 }, { "epoch": 2.139149950904755, "grad_norm": 0.20668143033981323, "learning_rate": 1.434750081825408e-05, "loss": 0.0211, "step": 76250 }, { "epoch": 2.1394304951606116, "grad_norm": 0.06789498776197433, "learning_rate": 1.4342825080656474e-05, "loss": 0.0671, "step": 76260 }, { "epoch": 2.139711039416468, "grad_norm": 0.07468756288290024, "learning_rate": 1.4338149343058869e-05, "loss": 0.0159, "step": 76270 }, { "epoch": 2.1399915836723244, "grad_norm": 0.37150368094444275, "learning_rate": 1.4333473605461262e-05, "loss": 0.0161, "step": 76280 }, { "epoch": 2.1402721279281804, "grad_norm": 0.20630669593811035, "learning_rate": 1.4328797867863655e-05, "loss": 0.0086, "step": 76290 }, { "epoch": 2.140552672184037, "grad_norm": 0.18894469738006592, "learning_rate": 1.4324122130266048e-05, "loss": 0.0153, "step": 76300 }, { "epoch": 2.1408332164398933, "grad_norm": 0.02623433619737625, "learning_rate": 1.4319446392668445e-05, "loss": 0.0062, "step": 76310 }, { "epoch": 2.1411137606957498, "grad_norm": 0.031377002596855164, "learning_rate": 1.431477065507084e-05, "loss": 0.0227, "step": 76320 }, { "epoch": 2.141394304951606, "grad_norm": 0.3264651298522949, "learning_rate": 1.4310094917473233e-05, "loss": 0.0224, "step": 76330 }, { "epoch": 2.1416748492074626, "grad_norm": 0.030989423394203186, "learning_rate": 1.4305419179875626e-05, "loss": 0.0343, "step": 76340 }, { "epoch": 2.1419553934633186, "grad_norm": 1.0371063947677612, "learning_rate": 1.4300743442278019e-05, "loss": 0.0223, "step": 76350 }, { "epoch": 2.142235937719175, "grad_norm": 0.8262537121772766, "learning_rate": 1.4296067704680414e-05, "loss": 0.0096, "step": 76360 }, { "epoch": 2.1425164819750315, "grad_norm": 0.13810017704963684, "learning_rate": 1.4291391967082807e-05, "loss": 0.048, "step": 76370 }, { "epoch": 2.142797026230888, "grad_norm": 1.5590848922729492, "learning_rate": 1.4286716229485204e-05, "loss": 0.008, "step": 76380 }, { "epoch": 2.1430775704867444, "grad_norm": 4.329244136810303, "learning_rate": 1.4282040491887597e-05, "loss": 0.0515, "step": 76390 }, { "epoch": 2.143358114742601, "grad_norm": 0.06684580445289612, "learning_rate": 1.427736475428999e-05, "loss": 0.0201, "step": 76400 }, { "epoch": 2.143638658998457, "grad_norm": 0.052887994796037674, "learning_rate": 1.4272689016692383e-05, "loss": 0.0356, "step": 76410 }, { "epoch": 2.1439192032543133, "grad_norm": 3.1220779418945312, "learning_rate": 1.4268013279094778e-05, "loss": 0.0195, "step": 76420 }, { "epoch": 2.1441997475101697, "grad_norm": 0.16765189170837402, "learning_rate": 1.4263337541497171e-05, "loss": 0.0099, "step": 76430 }, { "epoch": 2.144480291766026, "grad_norm": 0.029478855431079865, "learning_rate": 1.4258661803899564e-05, "loss": 0.016, "step": 76440 }, { "epoch": 2.1447608360218826, "grad_norm": 0.3052222430706024, "learning_rate": 1.425398606630196e-05, "loss": 0.0079, "step": 76450 }, { "epoch": 2.1450413802777386, "grad_norm": 0.0813019871711731, "learning_rate": 1.4249310328704354e-05, "loss": 0.0158, "step": 76460 }, { "epoch": 2.145321924533595, "grad_norm": 0.1771785467863083, "learning_rate": 1.4244634591106749e-05, "loss": 0.0197, "step": 76470 }, { "epoch": 2.1456024687894515, "grad_norm": 0.24443010985851288, "learning_rate": 1.4239958853509142e-05, "loss": 0.0477, "step": 76480 }, { "epoch": 2.145883013045308, "grad_norm": 0.020004913210868835, "learning_rate": 1.4235283115911535e-05, "loss": 0.0258, "step": 76490 }, { "epoch": 2.1461635573011644, "grad_norm": 0.04090360924601555, "learning_rate": 1.4230607378313928e-05, "loss": 0.0105, "step": 76500 }, { "epoch": 2.146444101557021, "grad_norm": 0.2822690010070801, "learning_rate": 1.4225931640716323e-05, "loss": 0.0101, "step": 76510 }, { "epoch": 2.146724645812877, "grad_norm": 0.01881779171526432, "learning_rate": 1.4221255903118718e-05, "loss": 0.0382, "step": 76520 }, { "epoch": 2.1470051900687332, "grad_norm": 0.10830901563167572, "learning_rate": 1.4216580165521113e-05, "loss": 0.0201, "step": 76530 }, { "epoch": 2.1472857343245897, "grad_norm": 1.5464686155319214, "learning_rate": 1.4211904427923506e-05, "loss": 0.0314, "step": 76540 }, { "epoch": 2.147566278580446, "grad_norm": 0.972217857837677, "learning_rate": 1.4207228690325899e-05, "loss": 0.031, "step": 76550 }, { "epoch": 2.1478468228363026, "grad_norm": 0.3661109209060669, "learning_rate": 1.4202552952728292e-05, "loss": 0.0103, "step": 76560 }, { "epoch": 2.148127367092159, "grad_norm": 0.030028551816940308, "learning_rate": 1.4197877215130687e-05, "loss": 0.0109, "step": 76570 }, { "epoch": 2.148407911348015, "grad_norm": 0.5358534455299377, "learning_rate": 1.419320147753308e-05, "loss": 0.0207, "step": 76580 }, { "epoch": 2.1486884556038714, "grad_norm": 0.6380683183670044, "learning_rate": 1.4188525739935477e-05, "loss": 0.0195, "step": 76590 }, { "epoch": 2.148968999859728, "grad_norm": 2.011542320251465, "learning_rate": 1.418385000233787e-05, "loss": 0.0194, "step": 76600 }, { "epoch": 2.1492495441155843, "grad_norm": 0.016884248703718185, "learning_rate": 1.4179174264740263e-05, "loss": 0.0049, "step": 76610 }, { "epoch": 2.1495300883714408, "grad_norm": 0.022724486887454987, "learning_rate": 1.4174498527142658e-05, "loss": 0.0258, "step": 76620 }, { "epoch": 2.1498106326272968, "grad_norm": 0.02169327810406685, "learning_rate": 1.4169822789545051e-05, "loss": 0.0289, "step": 76630 }, { "epoch": 2.150091176883153, "grad_norm": 0.34750351309776306, "learning_rate": 1.4165147051947444e-05, "loss": 0.0123, "step": 76640 }, { "epoch": 2.1503717211390097, "grad_norm": 0.5905702114105225, "learning_rate": 1.4160471314349837e-05, "loss": 0.0266, "step": 76650 }, { "epoch": 2.150652265394866, "grad_norm": 0.19776056706905365, "learning_rate": 1.4155795576752234e-05, "loss": 0.0143, "step": 76660 }, { "epoch": 2.1509328096507225, "grad_norm": 0.07454081624746323, "learning_rate": 1.4151119839154627e-05, "loss": 0.0315, "step": 76670 }, { "epoch": 2.151213353906579, "grad_norm": 0.09201308339834213, "learning_rate": 1.4146444101557022e-05, "loss": 0.0239, "step": 76680 }, { "epoch": 2.151493898162435, "grad_norm": 0.5402765274047852, "learning_rate": 1.4141768363959415e-05, "loss": 0.0098, "step": 76690 }, { "epoch": 2.1517744424182914, "grad_norm": 0.04490510746836662, "learning_rate": 1.4137092626361808e-05, "loss": 0.0223, "step": 76700 }, { "epoch": 2.152054986674148, "grad_norm": 1.4119776487350464, "learning_rate": 1.4132416888764201e-05, "loss": 0.0082, "step": 76710 }, { "epoch": 2.1523355309300043, "grad_norm": 0.016898563131690025, "learning_rate": 1.4127741151166598e-05, "loss": 0.017, "step": 76720 }, { "epoch": 2.1526160751858607, "grad_norm": 0.5209808349609375, "learning_rate": 1.4123065413568993e-05, "loss": 0.0597, "step": 76730 }, { "epoch": 2.1528966194417167, "grad_norm": 0.07352891564369202, "learning_rate": 1.4118389675971386e-05, "loss": 0.0265, "step": 76740 }, { "epoch": 2.153177163697573, "grad_norm": 0.1596907526254654, "learning_rate": 1.4113713938373779e-05, "loss": 0.0073, "step": 76750 }, { "epoch": 2.1534577079534296, "grad_norm": 0.1859072595834732, "learning_rate": 1.4109038200776172e-05, "loss": 0.0121, "step": 76760 }, { "epoch": 2.153738252209286, "grad_norm": 0.17191745340824127, "learning_rate": 1.4104362463178567e-05, "loss": 0.0202, "step": 76770 }, { "epoch": 2.1540187964651425, "grad_norm": 0.6646074652671814, "learning_rate": 1.409968672558096e-05, "loss": 0.0355, "step": 76780 }, { "epoch": 2.154299340720999, "grad_norm": 0.0687444657087326, "learning_rate": 1.4095010987983357e-05, "loss": 0.0156, "step": 76790 }, { "epoch": 2.154579884976855, "grad_norm": 1.1297602653503418, "learning_rate": 1.409033525038575e-05, "loss": 0.0293, "step": 76800 }, { "epoch": 2.1548604292327114, "grad_norm": 0.01542514655739069, "learning_rate": 1.4085659512788143e-05, "loss": 0.0142, "step": 76810 }, { "epoch": 2.155140973488568, "grad_norm": 6.157717227935791, "learning_rate": 1.4080983775190536e-05, "loss": 0.0228, "step": 76820 }, { "epoch": 2.1554215177444243, "grad_norm": 0.6916541457176208, "learning_rate": 1.4076308037592931e-05, "loss": 0.0351, "step": 76830 }, { "epoch": 2.1557020620002807, "grad_norm": 0.06430429220199585, "learning_rate": 1.4071632299995324e-05, "loss": 0.0112, "step": 76840 }, { "epoch": 2.1559826062561367, "grad_norm": 0.03530902415513992, "learning_rate": 1.4066956562397717e-05, "loss": 0.0409, "step": 76850 }, { "epoch": 2.156263150511993, "grad_norm": 0.9276949167251587, "learning_rate": 1.4062280824800114e-05, "loss": 0.0317, "step": 76860 }, { "epoch": 2.1565436947678496, "grad_norm": 1.9959636926651, "learning_rate": 1.4057605087202507e-05, "loss": 0.0205, "step": 76870 }, { "epoch": 2.156824239023706, "grad_norm": 0.03185483068227768, "learning_rate": 1.4052929349604902e-05, "loss": 0.0054, "step": 76880 }, { "epoch": 2.1571047832795625, "grad_norm": 1.398026466369629, "learning_rate": 1.4048253612007295e-05, "loss": 0.0207, "step": 76890 }, { "epoch": 2.157385327535419, "grad_norm": 0.01857638731598854, "learning_rate": 1.4043577874409688e-05, "loss": 0.0116, "step": 76900 }, { "epoch": 2.157665871791275, "grad_norm": 0.22586515545845032, "learning_rate": 1.4038902136812081e-05, "loss": 0.0239, "step": 76910 }, { "epoch": 2.1579464160471313, "grad_norm": 0.3012331426143646, "learning_rate": 1.4034226399214476e-05, "loss": 0.0308, "step": 76920 }, { "epoch": 2.1582269603029878, "grad_norm": 0.5979565978050232, "learning_rate": 1.4029550661616873e-05, "loss": 0.0106, "step": 76930 }, { "epoch": 2.158507504558844, "grad_norm": 0.025586692616343498, "learning_rate": 1.4024874924019266e-05, "loss": 0.0113, "step": 76940 }, { "epoch": 2.1587880488147007, "grad_norm": 0.007233227137476206, "learning_rate": 1.402019918642166e-05, "loss": 0.0396, "step": 76950 }, { "epoch": 2.1590685930705567, "grad_norm": 0.04570106789469719, "learning_rate": 1.4015523448824052e-05, "loss": 0.043, "step": 76960 }, { "epoch": 2.159349137326413, "grad_norm": 0.07990343123674393, "learning_rate": 1.4010847711226447e-05, "loss": 0.0155, "step": 76970 }, { "epoch": 2.1596296815822695, "grad_norm": 0.06189311668276787, "learning_rate": 1.400617197362884e-05, "loss": 0.025, "step": 76980 }, { "epoch": 2.159910225838126, "grad_norm": 0.03015826642513275, "learning_rate": 1.4001496236031234e-05, "loss": 0.0044, "step": 76990 }, { "epoch": 2.1601907700939824, "grad_norm": 0.02701481804251671, "learning_rate": 1.399682049843363e-05, "loss": 0.0205, "step": 77000 }, { "epoch": 2.160471314349839, "grad_norm": 0.104880690574646, "learning_rate": 1.3992144760836023e-05, "loss": 0.046, "step": 77010 }, { "epoch": 2.160751858605695, "grad_norm": 0.010763268917798996, "learning_rate": 1.3987469023238416e-05, "loss": 0.0066, "step": 77020 }, { "epoch": 2.1610324028615513, "grad_norm": 0.025672459974884987, "learning_rate": 1.3982793285640811e-05, "loss": 0.0533, "step": 77030 }, { "epoch": 2.1613129471174077, "grad_norm": 0.3108491897583008, "learning_rate": 1.3978117548043204e-05, "loss": 0.0284, "step": 77040 }, { "epoch": 2.161593491373264, "grad_norm": 0.25881317257881165, "learning_rate": 1.3973441810445598e-05, "loss": 0.0405, "step": 77050 }, { "epoch": 2.1618740356291206, "grad_norm": 0.30566704273223877, "learning_rate": 1.396876607284799e-05, "loss": 0.0205, "step": 77060 }, { "epoch": 2.162154579884977, "grad_norm": 0.05884343758225441, "learning_rate": 1.3964090335250387e-05, "loss": 0.012, "step": 77070 }, { "epoch": 2.162435124140833, "grad_norm": 1.7398251295089722, "learning_rate": 1.3959414597652782e-05, "loss": 0.0392, "step": 77080 }, { "epoch": 2.1627156683966895, "grad_norm": 0.2298448383808136, "learning_rate": 1.3954738860055175e-05, "loss": 0.0128, "step": 77090 }, { "epoch": 2.162996212652546, "grad_norm": 0.24226993322372437, "learning_rate": 1.3950063122457568e-05, "loss": 0.017, "step": 77100 }, { "epoch": 2.1632767569084024, "grad_norm": 2.123866081237793, "learning_rate": 1.3945387384859962e-05, "loss": 0.0287, "step": 77110 }, { "epoch": 2.163557301164259, "grad_norm": 0.23682327568531036, "learning_rate": 1.3940711647262356e-05, "loss": 0.014, "step": 77120 }, { "epoch": 2.1638378454201153, "grad_norm": 0.6214775443077087, "learning_rate": 1.393603590966475e-05, "loss": 0.041, "step": 77130 }, { "epoch": 2.1641183896759713, "grad_norm": 0.29764360189437866, "learning_rate": 1.3931360172067146e-05, "loss": 0.0436, "step": 77140 }, { "epoch": 2.1643989339318277, "grad_norm": 0.08602353930473328, "learning_rate": 1.392668443446954e-05, "loss": 0.022, "step": 77150 }, { "epoch": 2.164679478187684, "grad_norm": 0.09174434840679169, "learning_rate": 1.3922008696871932e-05, "loss": 0.0078, "step": 77160 }, { "epoch": 2.1649600224435406, "grad_norm": 0.025741400197148323, "learning_rate": 1.3917332959274326e-05, "loss": 0.0129, "step": 77170 }, { "epoch": 2.165240566699397, "grad_norm": 0.14064998924732208, "learning_rate": 1.391265722167672e-05, "loss": 0.0386, "step": 77180 }, { "epoch": 2.165521110955253, "grad_norm": 0.05261456221342087, "learning_rate": 1.3907981484079114e-05, "loss": 0.034, "step": 77190 }, { "epoch": 2.1658016552111095, "grad_norm": 0.26858964562416077, "learning_rate": 1.3903305746481507e-05, "loss": 0.0168, "step": 77200 }, { "epoch": 2.166082199466966, "grad_norm": 0.569105327129364, "learning_rate": 1.3898630008883903e-05, "loss": 0.0158, "step": 77210 }, { "epoch": 2.1663627437228223, "grad_norm": 0.05395793169736862, "learning_rate": 1.3893954271286296e-05, "loss": 0.0222, "step": 77220 }, { "epoch": 2.166643287978679, "grad_norm": 0.040938813239336014, "learning_rate": 1.3889278533688691e-05, "loss": 0.0174, "step": 77230 }, { "epoch": 2.166923832234535, "grad_norm": 0.015448816120624542, "learning_rate": 1.3884602796091084e-05, "loss": 0.0043, "step": 77240 }, { "epoch": 2.167204376490391, "grad_norm": 0.1643258035182953, "learning_rate": 1.3879927058493478e-05, "loss": 0.0074, "step": 77250 }, { "epoch": 2.1674849207462477, "grad_norm": 0.03886279836297035, "learning_rate": 1.387525132089587e-05, "loss": 0.0065, "step": 77260 }, { "epoch": 2.167765465002104, "grad_norm": 0.02568807452917099, "learning_rate": 1.3870575583298266e-05, "loss": 0.0453, "step": 77270 }, { "epoch": 2.1680460092579605, "grad_norm": 0.10288961231708527, "learning_rate": 1.386589984570066e-05, "loss": 0.029, "step": 77280 }, { "epoch": 2.168326553513817, "grad_norm": 0.4620596766471863, "learning_rate": 1.3861224108103055e-05, "loss": 0.0367, "step": 77290 }, { "epoch": 2.168607097769673, "grad_norm": 0.05479080229997635, "learning_rate": 1.3856548370505448e-05, "loss": 0.0034, "step": 77300 }, { "epoch": 2.1688876420255294, "grad_norm": 0.018112504854798317, "learning_rate": 1.3851872632907842e-05, "loss": 0.0305, "step": 77310 }, { "epoch": 2.169168186281386, "grad_norm": 0.794066309928894, "learning_rate": 1.3847196895310235e-05, "loss": 0.0188, "step": 77320 }, { "epoch": 2.1694487305372423, "grad_norm": 1.1729114055633545, "learning_rate": 1.384252115771263e-05, "loss": 0.0195, "step": 77330 }, { "epoch": 2.1697292747930987, "grad_norm": 1.3505120277404785, "learning_rate": 1.3837845420115023e-05, "loss": 0.026, "step": 77340 }, { "epoch": 2.170009819048955, "grad_norm": 0.8262357115745544, "learning_rate": 1.383316968251742e-05, "loss": 0.0257, "step": 77350 }, { "epoch": 2.170290363304811, "grad_norm": 0.030944984406232834, "learning_rate": 1.3828493944919812e-05, "loss": 0.0151, "step": 77360 }, { "epoch": 2.1705709075606676, "grad_norm": 0.12022232264280319, "learning_rate": 1.3823818207322206e-05, "loss": 0.0349, "step": 77370 }, { "epoch": 2.170851451816524, "grad_norm": 3.2702391147613525, "learning_rate": 1.38191424697246e-05, "loss": 0.0045, "step": 77380 }, { "epoch": 2.1711319960723805, "grad_norm": 1.0661938190460205, "learning_rate": 1.3814466732126994e-05, "loss": 0.0409, "step": 77390 }, { "epoch": 2.171412540328237, "grad_norm": 0.13574114441871643, "learning_rate": 1.3809790994529387e-05, "loss": 0.0247, "step": 77400 }, { "epoch": 2.171693084584093, "grad_norm": 0.3058968186378479, "learning_rate": 1.380511525693178e-05, "loss": 0.0392, "step": 77410 }, { "epoch": 2.1719736288399494, "grad_norm": 0.020802734419703484, "learning_rate": 1.3800439519334176e-05, "loss": 0.0077, "step": 77420 }, { "epoch": 2.172254173095806, "grad_norm": 0.03587988391518593, "learning_rate": 1.379576378173657e-05, "loss": 0.008, "step": 77430 }, { "epoch": 2.1725347173516623, "grad_norm": 0.19412007927894592, "learning_rate": 1.3791088044138964e-05, "loss": 0.0328, "step": 77440 }, { "epoch": 2.1728152616075187, "grad_norm": 0.27377432584762573, "learning_rate": 1.3786412306541358e-05, "loss": 0.0192, "step": 77450 }, { "epoch": 2.173095805863375, "grad_norm": 0.022274751216173172, "learning_rate": 1.378173656894375e-05, "loss": 0.008, "step": 77460 }, { "epoch": 2.173376350119231, "grad_norm": 0.15391366183757782, "learning_rate": 1.3777060831346144e-05, "loss": 0.0139, "step": 77470 }, { "epoch": 2.1736568943750876, "grad_norm": 0.11092265695333481, "learning_rate": 1.3772385093748539e-05, "loss": 0.009, "step": 77480 }, { "epoch": 2.173937438630944, "grad_norm": 0.01759192906320095, "learning_rate": 1.3767709356150935e-05, "loss": 0.0187, "step": 77490 }, { "epoch": 2.1742179828868005, "grad_norm": 0.028326155617833138, "learning_rate": 1.3763033618553328e-05, "loss": 0.0229, "step": 77500 }, { "epoch": 2.174498527142657, "grad_norm": 0.02614622563123703, "learning_rate": 1.3758357880955722e-05, "loss": 0.0088, "step": 77510 }, { "epoch": 2.174779071398513, "grad_norm": 0.6054356098175049, "learning_rate": 1.3753682143358115e-05, "loss": 0.0758, "step": 77520 }, { "epoch": 2.1750596156543693, "grad_norm": 0.16633105278015137, "learning_rate": 1.374900640576051e-05, "loss": 0.0158, "step": 77530 }, { "epoch": 2.175340159910226, "grad_norm": 0.14688535034656525, "learning_rate": 1.3744330668162903e-05, "loss": 0.0233, "step": 77540 }, { "epoch": 2.175620704166082, "grad_norm": 0.5793275237083435, "learning_rate": 1.3739654930565296e-05, "loss": 0.0126, "step": 77550 }, { "epoch": 2.1759012484219387, "grad_norm": 0.04369383305311203, "learning_rate": 1.3734979192967692e-05, "loss": 0.016, "step": 77560 }, { "epoch": 2.176181792677795, "grad_norm": 0.6647480130195618, "learning_rate": 1.3730303455370086e-05, "loss": 0.0117, "step": 77570 }, { "epoch": 2.176462336933651, "grad_norm": 0.9157407879829407, "learning_rate": 1.3725627717772479e-05, "loss": 0.0267, "step": 77580 }, { "epoch": 2.1767428811895075, "grad_norm": 0.6849482655525208, "learning_rate": 1.3720951980174874e-05, "loss": 0.0294, "step": 77590 }, { "epoch": 2.177023425445364, "grad_norm": 0.5760108828544617, "learning_rate": 1.3716276242577267e-05, "loss": 0.0097, "step": 77600 }, { "epoch": 2.1773039697012204, "grad_norm": 0.3217446804046631, "learning_rate": 1.371160050497966e-05, "loss": 0.0086, "step": 77610 }, { "epoch": 2.177584513957077, "grad_norm": 0.5529428720474243, "learning_rate": 1.3706924767382053e-05, "loss": 0.0376, "step": 77620 }, { "epoch": 2.177865058212933, "grad_norm": 0.2400994598865509, "learning_rate": 1.370224902978445e-05, "loss": 0.0391, "step": 77630 }, { "epoch": 2.1781456024687893, "grad_norm": 0.09710412472486496, "learning_rate": 1.3697573292186844e-05, "loss": 0.028, "step": 77640 }, { "epoch": 2.1784261467246457, "grad_norm": 0.07708554714918137, "learning_rate": 1.3692897554589238e-05, "loss": 0.0106, "step": 77650 }, { "epoch": 2.178706690980502, "grad_norm": 0.11746712774038315, "learning_rate": 1.368822181699163e-05, "loss": 0.0125, "step": 77660 }, { "epoch": 2.1789872352363586, "grad_norm": 0.8206024765968323, "learning_rate": 1.3683546079394024e-05, "loss": 0.0189, "step": 77670 }, { "epoch": 2.179267779492215, "grad_norm": 0.07351287454366684, "learning_rate": 1.3678870341796419e-05, "loss": 0.0176, "step": 77680 }, { "epoch": 2.179548323748071, "grad_norm": 0.6011829376220703, "learning_rate": 1.3674194604198812e-05, "loss": 0.0281, "step": 77690 }, { "epoch": 2.1798288680039275, "grad_norm": 0.08120191842317581, "learning_rate": 1.3669518866601209e-05, "loss": 0.0127, "step": 77700 }, { "epoch": 2.180109412259784, "grad_norm": 0.8267459273338318, "learning_rate": 1.3664843129003602e-05, "loss": 0.0468, "step": 77710 }, { "epoch": 2.1803899565156404, "grad_norm": 0.019530832767486572, "learning_rate": 1.3660167391405995e-05, "loss": 0.0291, "step": 77720 }, { "epoch": 2.180670500771497, "grad_norm": 0.1955784559249878, "learning_rate": 1.3655491653808388e-05, "loss": 0.0052, "step": 77730 }, { "epoch": 2.1809510450273533, "grad_norm": 0.7584865093231201, "learning_rate": 1.3650815916210783e-05, "loss": 0.0275, "step": 77740 }, { "epoch": 2.1812315892832093, "grad_norm": 0.19437919557094574, "learning_rate": 1.3646140178613176e-05, "loss": 0.0195, "step": 77750 }, { "epoch": 2.1815121335390657, "grad_norm": 0.06042390316724777, "learning_rate": 1.3641464441015569e-05, "loss": 0.0327, "step": 77760 }, { "epoch": 2.181792677794922, "grad_norm": 0.4155561923980713, "learning_rate": 1.3636788703417966e-05, "loss": 0.0352, "step": 77770 }, { "epoch": 2.1820732220507786, "grad_norm": 0.26016420125961304, "learning_rate": 1.3632112965820359e-05, "loss": 0.0318, "step": 77780 }, { "epoch": 2.182353766306635, "grad_norm": 0.8516184091567993, "learning_rate": 1.3627437228222754e-05, "loss": 0.049, "step": 77790 }, { "epoch": 2.1826343105624915, "grad_norm": 0.14754968881607056, "learning_rate": 1.3622761490625147e-05, "loss": 0.0126, "step": 77800 }, { "epoch": 2.1829148548183475, "grad_norm": 0.14891114830970764, "learning_rate": 1.361808575302754e-05, "loss": 0.0135, "step": 77810 }, { "epoch": 2.183195399074204, "grad_norm": 0.2008647471666336, "learning_rate": 1.3613410015429933e-05, "loss": 0.0399, "step": 77820 }, { "epoch": 2.1834759433300603, "grad_norm": 0.8012875914573669, "learning_rate": 1.3608734277832328e-05, "loss": 0.0331, "step": 77830 }, { "epoch": 2.183756487585917, "grad_norm": 0.04422256350517273, "learning_rate": 1.3604058540234725e-05, "loss": 0.01, "step": 77840 }, { "epoch": 2.1840370318417732, "grad_norm": 0.0521821565926075, "learning_rate": 1.3599382802637118e-05, "loss": 0.0292, "step": 77850 }, { "epoch": 2.184317576097629, "grad_norm": 0.523806631565094, "learning_rate": 1.3594707065039511e-05, "loss": 0.0142, "step": 77860 }, { "epoch": 2.1845981203534857, "grad_norm": 2.181013345718384, "learning_rate": 1.3590031327441904e-05, "loss": 0.0263, "step": 77870 }, { "epoch": 2.184878664609342, "grad_norm": 0.5763520002365112, "learning_rate": 1.3585355589844299e-05, "loss": 0.0391, "step": 77880 }, { "epoch": 2.1851592088651985, "grad_norm": 0.2195492833852768, "learning_rate": 1.3580679852246692e-05, "loss": 0.0107, "step": 77890 }, { "epoch": 2.185439753121055, "grad_norm": 0.31482988595962524, "learning_rate": 1.3576004114649085e-05, "loss": 0.0135, "step": 77900 }, { "epoch": 2.1857202973769114, "grad_norm": 0.6072250008583069, "learning_rate": 1.3571328377051482e-05, "loss": 0.0501, "step": 77910 }, { "epoch": 2.1860008416327674, "grad_norm": 0.05519077554345131, "learning_rate": 1.3566652639453875e-05, "loss": 0.0488, "step": 77920 }, { "epoch": 2.186281385888624, "grad_norm": 0.08833178877830505, "learning_rate": 1.3561976901856268e-05, "loss": 0.0127, "step": 77930 }, { "epoch": 2.1865619301444803, "grad_norm": 1.8019983768463135, "learning_rate": 1.3557301164258663e-05, "loss": 0.0319, "step": 77940 }, { "epoch": 2.1868424744003367, "grad_norm": 0.0774877518415451, "learning_rate": 1.3552625426661056e-05, "loss": 0.0096, "step": 77950 }, { "epoch": 2.187123018656193, "grad_norm": 0.16843494772911072, "learning_rate": 1.354794968906345e-05, "loss": 0.0152, "step": 77960 }, { "epoch": 2.187403562912049, "grad_norm": 7.794422149658203, "learning_rate": 1.3543273951465846e-05, "loss": 0.0242, "step": 77970 }, { "epoch": 2.1876841071679056, "grad_norm": 0.04247409477829933, "learning_rate": 1.3538598213868239e-05, "loss": 0.0261, "step": 77980 }, { "epoch": 2.187964651423762, "grad_norm": 0.08321196585893631, "learning_rate": 1.3533922476270634e-05, "loss": 0.0237, "step": 77990 }, { "epoch": 2.1882451956796185, "grad_norm": 0.03842276707291603, "learning_rate": 1.3529246738673027e-05, "loss": 0.0106, "step": 78000 }, { "epoch": 2.188525739935475, "grad_norm": 0.026708070188760757, "learning_rate": 1.352457100107542e-05, "loss": 0.0202, "step": 78010 }, { "epoch": 2.1888062841913314, "grad_norm": 0.1939706802368164, "learning_rate": 1.3519895263477813e-05, "loss": 0.0147, "step": 78020 }, { "epoch": 2.1890868284471874, "grad_norm": 0.3184756636619568, "learning_rate": 1.3515219525880208e-05, "loss": 0.008, "step": 78030 }, { "epoch": 2.189367372703044, "grad_norm": 0.1326749473810196, "learning_rate": 1.3510543788282603e-05, "loss": 0.0062, "step": 78040 }, { "epoch": 2.1896479169589003, "grad_norm": 0.29792118072509766, "learning_rate": 1.3505868050684998e-05, "loss": 0.003, "step": 78050 }, { "epoch": 2.1899284612147567, "grad_norm": 0.08717750012874603, "learning_rate": 1.3501192313087391e-05, "loss": 0.0117, "step": 78060 }, { "epoch": 2.190209005470613, "grad_norm": 0.4924820065498352, "learning_rate": 1.3496516575489784e-05, "loss": 0.0179, "step": 78070 }, { "epoch": 2.190489549726469, "grad_norm": 0.03206944093108177, "learning_rate": 1.3491840837892177e-05, "loss": 0.0524, "step": 78080 }, { "epoch": 2.1907700939823256, "grad_norm": 0.20994722843170166, "learning_rate": 1.3487165100294572e-05, "loss": 0.004, "step": 78090 }, { "epoch": 2.191050638238182, "grad_norm": 0.5394690036773682, "learning_rate": 1.3482489362696965e-05, "loss": 0.0083, "step": 78100 }, { "epoch": 2.1913311824940385, "grad_norm": 0.1453535556793213, "learning_rate": 1.3477813625099362e-05, "loss": 0.0374, "step": 78110 }, { "epoch": 2.191611726749895, "grad_norm": 0.08061260730028152, "learning_rate": 1.3473137887501755e-05, "loss": 0.0141, "step": 78120 }, { "epoch": 2.1918922710057513, "grad_norm": 0.040178343653678894, "learning_rate": 1.3468462149904148e-05, "loss": 0.0265, "step": 78130 }, { "epoch": 2.1921728152616073, "grad_norm": 0.28972333669662476, "learning_rate": 1.3463786412306543e-05, "loss": 0.0123, "step": 78140 }, { "epoch": 2.192453359517464, "grad_norm": 0.05855460464954376, "learning_rate": 1.3459110674708936e-05, "loss": 0.0303, "step": 78150 }, { "epoch": 2.1927339037733202, "grad_norm": 0.40546929836273193, "learning_rate": 1.345443493711133e-05, "loss": 0.0189, "step": 78160 }, { "epoch": 2.1930144480291767, "grad_norm": 0.03051304630935192, "learning_rate": 1.3449759199513722e-05, "loss": 0.0124, "step": 78170 }, { "epoch": 2.193294992285033, "grad_norm": 0.2546875774860382, "learning_rate": 1.3445083461916119e-05, "loss": 0.0468, "step": 78180 }, { "epoch": 2.193575536540889, "grad_norm": 0.0673549547791481, "learning_rate": 1.3440407724318512e-05, "loss": 0.0066, "step": 78190 }, { "epoch": 2.1938560807967455, "grad_norm": 0.6328393220901489, "learning_rate": 1.3435731986720907e-05, "loss": 0.0172, "step": 78200 }, { "epoch": 2.194136625052602, "grad_norm": 0.029130052775144577, "learning_rate": 1.34310562491233e-05, "loss": 0.0095, "step": 78210 }, { "epoch": 2.1944171693084584, "grad_norm": 0.33667242527008057, "learning_rate": 1.3426380511525693e-05, "loss": 0.0049, "step": 78220 }, { "epoch": 2.194697713564315, "grad_norm": 0.39556318521499634, "learning_rate": 1.3421704773928086e-05, "loss": 0.0793, "step": 78230 }, { "epoch": 2.1949782578201713, "grad_norm": 0.06601662933826447, "learning_rate": 1.3417029036330481e-05, "loss": 0.0181, "step": 78240 }, { "epoch": 2.1952588020760273, "grad_norm": 0.054363053292036057, "learning_rate": 1.3412353298732878e-05, "loss": 0.0022, "step": 78250 }, { "epoch": 2.1955393463318837, "grad_norm": 0.03465747833251953, "learning_rate": 1.3407677561135271e-05, "loss": 0.0196, "step": 78260 }, { "epoch": 2.19581989058774, "grad_norm": 1.0974845886230469, "learning_rate": 1.3403001823537664e-05, "loss": 0.0226, "step": 78270 }, { "epoch": 2.1961004348435966, "grad_norm": 0.016295939683914185, "learning_rate": 1.3398326085940057e-05, "loss": 0.0124, "step": 78280 }, { "epoch": 2.196380979099453, "grad_norm": 0.14580771327018738, "learning_rate": 1.3393650348342452e-05, "loss": 0.0331, "step": 78290 }, { "epoch": 2.196661523355309, "grad_norm": 0.019826488569378853, "learning_rate": 1.3388974610744845e-05, "loss": 0.0087, "step": 78300 }, { "epoch": 2.1969420676111655, "grad_norm": 0.7647343873977661, "learning_rate": 1.3384298873147238e-05, "loss": 0.0285, "step": 78310 }, { "epoch": 2.197222611867022, "grad_norm": 0.1523953527212143, "learning_rate": 1.3379623135549635e-05, "loss": 0.0107, "step": 78320 }, { "epoch": 2.1975031561228784, "grad_norm": 0.023641280829906464, "learning_rate": 1.3374947397952028e-05, "loss": 0.0307, "step": 78330 }, { "epoch": 2.197783700378735, "grad_norm": 0.7895297408103943, "learning_rate": 1.3370271660354421e-05, "loss": 0.0217, "step": 78340 }, { "epoch": 2.1980642446345913, "grad_norm": 0.7366478443145752, "learning_rate": 1.3365595922756816e-05, "loss": 0.025, "step": 78350 }, { "epoch": 2.1983447888904473, "grad_norm": 2.713667154312134, "learning_rate": 1.336092018515921e-05, "loss": 0.013, "step": 78360 }, { "epoch": 2.1986253331463037, "grad_norm": 0.4031772017478943, "learning_rate": 1.3356244447561602e-05, "loss": 0.0102, "step": 78370 }, { "epoch": 2.19890587740216, "grad_norm": 0.875824511051178, "learning_rate": 1.3351568709963996e-05, "loss": 0.0296, "step": 78380 }, { "epoch": 2.1991864216580166, "grad_norm": 1.353295922279358, "learning_rate": 1.3346892972366392e-05, "loss": 0.029, "step": 78390 }, { "epoch": 2.199466965913873, "grad_norm": 0.030220670625567436, "learning_rate": 1.3342217234768787e-05, "loss": 0.0381, "step": 78400 }, { "epoch": 2.1997475101697295, "grad_norm": 0.16274896264076233, "learning_rate": 1.333754149717118e-05, "loss": 0.0221, "step": 78410 }, { "epoch": 2.2000280544255855, "grad_norm": 0.17539837956428528, "learning_rate": 1.3332865759573573e-05, "loss": 0.0313, "step": 78420 }, { "epoch": 2.200308598681442, "grad_norm": 0.19500946998596191, "learning_rate": 1.3328190021975966e-05, "loss": 0.0172, "step": 78430 }, { "epoch": 2.2005891429372983, "grad_norm": 0.40590980648994446, "learning_rate": 1.3323514284378361e-05, "loss": 0.0081, "step": 78440 }, { "epoch": 2.200869687193155, "grad_norm": 0.01958121545612812, "learning_rate": 1.3318838546780754e-05, "loss": 0.0471, "step": 78450 }, { "epoch": 2.2011502314490112, "grad_norm": 0.27821558713912964, "learning_rate": 1.3314162809183151e-05, "loss": 0.0519, "step": 78460 }, { "epoch": 2.2014307757048677, "grad_norm": 0.35667532682418823, "learning_rate": 1.3309487071585544e-05, "loss": 0.0378, "step": 78470 }, { "epoch": 2.2017113199607237, "grad_norm": 0.931420624256134, "learning_rate": 1.3304811333987937e-05, "loss": 0.0487, "step": 78480 }, { "epoch": 2.20199186421658, "grad_norm": 0.1098427101969719, "learning_rate": 1.330013559639033e-05, "loss": 0.0136, "step": 78490 }, { "epoch": 2.2022724084724365, "grad_norm": 0.05194830149412155, "learning_rate": 1.3295459858792725e-05, "loss": 0.0089, "step": 78500 }, { "epoch": 2.202552952728293, "grad_norm": 0.6941965818405151, "learning_rate": 1.3290784121195118e-05, "loss": 0.0286, "step": 78510 }, { "epoch": 2.2028334969841494, "grad_norm": 0.09902817755937576, "learning_rate": 1.3286108383597512e-05, "loss": 0.0072, "step": 78520 }, { "epoch": 2.2031140412400054, "grad_norm": 0.34895792603492737, "learning_rate": 1.3281432645999908e-05, "loss": 0.0058, "step": 78530 }, { "epoch": 2.203394585495862, "grad_norm": 0.0931849554181099, "learning_rate": 1.3276756908402301e-05, "loss": 0.0133, "step": 78540 }, { "epoch": 2.2036751297517183, "grad_norm": 0.046209897845983505, "learning_rate": 1.3272081170804696e-05, "loss": 0.0159, "step": 78550 }, { "epoch": 2.2039556740075747, "grad_norm": 0.08694440126419067, "learning_rate": 1.326740543320709e-05, "loss": 0.0111, "step": 78560 }, { "epoch": 2.204236218263431, "grad_norm": 0.8783290982246399, "learning_rate": 1.3262729695609482e-05, "loss": 0.0199, "step": 78570 }, { "epoch": 2.2045167625192876, "grad_norm": 0.017824998125433922, "learning_rate": 1.3258053958011876e-05, "loss": 0.0157, "step": 78580 }, { "epoch": 2.2047973067751436, "grad_norm": 0.020395107567310333, "learning_rate": 1.325337822041427e-05, "loss": 0.0133, "step": 78590 }, { "epoch": 2.205077851031, "grad_norm": 2.095831871032715, "learning_rate": 1.3248702482816665e-05, "loss": 0.0505, "step": 78600 }, { "epoch": 2.2053583952868565, "grad_norm": 0.04760279133915901, "learning_rate": 1.324402674521906e-05, "loss": 0.0388, "step": 78610 }, { "epoch": 2.205638939542713, "grad_norm": 0.03206987306475639, "learning_rate": 1.3239351007621453e-05, "loss": 0.0224, "step": 78620 }, { "epoch": 2.2059194837985694, "grad_norm": 0.029562830924987793, "learning_rate": 1.3234675270023846e-05, "loss": 0.0178, "step": 78630 }, { "epoch": 2.2062000280544254, "grad_norm": 3.330538749694824, "learning_rate": 1.322999953242624e-05, "loss": 0.0392, "step": 78640 }, { "epoch": 2.206480572310282, "grad_norm": 0.9309597015380859, "learning_rate": 1.3225323794828634e-05, "loss": 0.028, "step": 78650 }, { "epoch": 2.2067611165661383, "grad_norm": 0.1516118347644806, "learning_rate": 1.3220648057231028e-05, "loss": 0.0112, "step": 78660 }, { "epoch": 2.2070416608219947, "grad_norm": 4.605774402618408, "learning_rate": 1.3215972319633424e-05, "loss": 0.0203, "step": 78670 }, { "epoch": 2.207322205077851, "grad_norm": 0.202309250831604, "learning_rate": 1.3211296582035817e-05, "loss": 0.0124, "step": 78680 }, { "epoch": 2.2076027493337076, "grad_norm": 0.34113165736198425, "learning_rate": 1.320662084443821e-05, "loss": 0.0215, "step": 78690 }, { "epoch": 2.2078832935895636, "grad_norm": 0.03248974680900574, "learning_rate": 1.3201945106840605e-05, "loss": 0.0066, "step": 78700 }, { "epoch": 2.20816383784542, "grad_norm": 0.0550791472196579, "learning_rate": 1.3197269369242999e-05, "loss": 0.0132, "step": 78710 }, { "epoch": 2.2084443821012765, "grad_norm": 0.15972986817359924, "learning_rate": 1.3192593631645392e-05, "loss": 0.0474, "step": 78720 }, { "epoch": 2.208724926357133, "grad_norm": 0.253903865814209, "learning_rate": 1.3187917894047785e-05, "loss": 0.0501, "step": 78730 }, { "epoch": 2.2090054706129894, "grad_norm": 0.02879193425178528, "learning_rate": 1.3183242156450181e-05, "loss": 0.0443, "step": 78740 }, { "epoch": 2.2092860148688453, "grad_norm": 0.6194014549255371, "learning_rate": 1.3178566418852576e-05, "loss": 0.0233, "step": 78750 }, { "epoch": 2.209566559124702, "grad_norm": 0.19434037804603577, "learning_rate": 1.317389068125497e-05, "loss": 0.0248, "step": 78760 }, { "epoch": 2.2098471033805582, "grad_norm": 1.2426892518997192, "learning_rate": 1.3169214943657363e-05, "loss": 0.0398, "step": 78770 }, { "epoch": 2.2101276476364147, "grad_norm": 0.025359436869621277, "learning_rate": 1.3164539206059756e-05, "loss": 0.0216, "step": 78780 }, { "epoch": 2.210408191892271, "grad_norm": 0.19906240701675415, "learning_rate": 1.315986346846215e-05, "loss": 0.0307, "step": 78790 }, { "epoch": 2.2106887361481276, "grad_norm": 0.027532009407877922, "learning_rate": 1.3155187730864544e-05, "loss": 0.0087, "step": 78800 }, { "epoch": 2.2109692804039836, "grad_norm": 0.48786821961402893, "learning_rate": 1.315051199326694e-05, "loss": 0.0191, "step": 78810 }, { "epoch": 2.21124982465984, "grad_norm": 0.06681138277053833, "learning_rate": 1.3145836255669333e-05, "loss": 0.0341, "step": 78820 }, { "epoch": 2.2115303689156964, "grad_norm": 0.08807437866926193, "learning_rate": 1.3141160518071727e-05, "loss": 0.0115, "step": 78830 }, { "epoch": 2.211810913171553, "grad_norm": 0.8577641248703003, "learning_rate": 1.313648478047412e-05, "loss": 0.0171, "step": 78840 }, { "epoch": 2.2120914574274093, "grad_norm": 0.019647721201181412, "learning_rate": 1.3131809042876515e-05, "loss": 0.012, "step": 78850 }, { "epoch": 2.2123720016832653, "grad_norm": 0.9172661900520325, "learning_rate": 1.3127133305278908e-05, "loss": 0.0187, "step": 78860 }, { "epoch": 2.2126525459391218, "grad_norm": 0.01585068181157112, "learning_rate": 1.3122457567681301e-05, "loss": 0.0511, "step": 78870 }, { "epoch": 2.212933090194978, "grad_norm": 0.07856842130422592, "learning_rate": 1.3117781830083697e-05, "loss": 0.0448, "step": 78880 }, { "epoch": 2.2132136344508346, "grad_norm": 0.08893420547246933, "learning_rate": 1.311310609248609e-05, "loss": 0.0175, "step": 78890 }, { "epoch": 2.213494178706691, "grad_norm": 0.2017291635274887, "learning_rate": 1.3108430354888485e-05, "loss": 0.0456, "step": 78900 }, { "epoch": 2.2137747229625475, "grad_norm": 0.06176154688000679, "learning_rate": 1.3103754617290879e-05, "loss": 0.0184, "step": 78910 }, { "epoch": 2.2140552672184035, "grad_norm": 0.08822319656610489, "learning_rate": 1.3099078879693272e-05, "loss": 0.0295, "step": 78920 }, { "epoch": 2.21433581147426, "grad_norm": 2.2390494346618652, "learning_rate": 1.3094403142095665e-05, "loss": 0.0376, "step": 78930 }, { "epoch": 2.2146163557301164, "grad_norm": 0.10955370962619781, "learning_rate": 1.308972740449806e-05, "loss": 0.0206, "step": 78940 }, { "epoch": 2.214896899985973, "grad_norm": 0.19428913295269012, "learning_rate": 1.3085051666900455e-05, "loss": 0.0101, "step": 78950 }, { "epoch": 2.2151774442418293, "grad_norm": 0.054730597883462906, "learning_rate": 1.308037592930285e-05, "loss": 0.0114, "step": 78960 }, { "epoch": 2.2154579884976857, "grad_norm": 0.14879736304283142, "learning_rate": 1.3075700191705243e-05, "loss": 0.0141, "step": 78970 }, { "epoch": 2.2157385327535417, "grad_norm": 0.20300501585006714, "learning_rate": 1.3071024454107636e-05, "loss": 0.0185, "step": 78980 }, { "epoch": 2.216019077009398, "grad_norm": 0.2577894926071167, "learning_rate": 1.3066348716510029e-05, "loss": 0.0126, "step": 78990 }, { "epoch": 2.2162996212652546, "grad_norm": 0.06475245952606201, "learning_rate": 1.3061672978912424e-05, "loss": 0.026, "step": 79000 }, { "epoch": 2.216580165521111, "grad_norm": 2.987905502319336, "learning_rate": 1.3056997241314817e-05, "loss": 0.0425, "step": 79010 }, { "epoch": 2.2168607097769675, "grad_norm": 2.7853786945343018, "learning_rate": 1.3052321503717213e-05, "loss": 0.0131, "step": 79020 }, { "epoch": 2.2171412540328235, "grad_norm": 0.3672862946987152, "learning_rate": 1.3047645766119607e-05, "loss": 0.0254, "step": 79030 }, { "epoch": 2.21742179828868, "grad_norm": 0.028248677030205727, "learning_rate": 1.3042970028522e-05, "loss": 0.0088, "step": 79040 }, { "epoch": 2.2177023425445364, "grad_norm": 0.020590052008628845, "learning_rate": 1.3038294290924395e-05, "loss": 0.0204, "step": 79050 }, { "epoch": 2.217982886800393, "grad_norm": 0.03501803055405617, "learning_rate": 1.3033618553326788e-05, "loss": 0.0299, "step": 79060 }, { "epoch": 2.2182634310562492, "grad_norm": 4.037543773651123, "learning_rate": 1.3028942815729181e-05, "loss": 0.0143, "step": 79070 }, { "epoch": 2.2185439753121057, "grad_norm": 0.9146421551704407, "learning_rate": 1.3024267078131574e-05, "loss": 0.0173, "step": 79080 }, { "epoch": 2.2188245195679617, "grad_norm": 0.048959508538246155, "learning_rate": 1.301959134053397e-05, "loss": 0.005, "step": 79090 }, { "epoch": 2.219105063823818, "grad_norm": 0.01710912026464939, "learning_rate": 1.3014915602936364e-05, "loss": 0.0084, "step": 79100 }, { "epoch": 2.2193856080796746, "grad_norm": 0.442144513130188, "learning_rate": 1.3010239865338759e-05, "loss": 0.0107, "step": 79110 }, { "epoch": 2.219666152335531, "grad_norm": 0.9437402486801147, "learning_rate": 1.3005564127741152e-05, "loss": 0.0354, "step": 79120 }, { "epoch": 2.2199466965913874, "grad_norm": 0.5130457878112793, "learning_rate": 1.3000888390143545e-05, "loss": 0.0283, "step": 79130 }, { "epoch": 2.220227240847244, "grad_norm": 0.26769039034843445, "learning_rate": 1.2996212652545938e-05, "loss": 0.0188, "step": 79140 }, { "epoch": 2.2205077851031, "grad_norm": 0.059162724763154984, "learning_rate": 1.2991536914948333e-05, "loss": 0.0275, "step": 79150 }, { "epoch": 2.2207883293589563, "grad_norm": 1.4812264442443848, "learning_rate": 1.298686117735073e-05, "loss": 0.0266, "step": 79160 }, { "epoch": 2.2210688736148128, "grad_norm": 0.013333783484995365, "learning_rate": 1.2982185439753123e-05, "loss": 0.0369, "step": 79170 }, { "epoch": 2.221349417870669, "grad_norm": 0.04717008396983147, "learning_rate": 1.2977509702155516e-05, "loss": 0.0157, "step": 79180 }, { "epoch": 2.2216299621265256, "grad_norm": 0.1967974156141281, "learning_rate": 1.2972833964557909e-05, "loss": 0.0069, "step": 79190 }, { "epoch": 2.2219105063823816, "grad_norm": 0.025068623945116997, "learning_rate": 1.2968158226960304e-05, "loss": 0.0114, "step": 79200 }, { "epoch": 2.222191050638238, "grad_norm": 0.025560220703482628, "learning_rate": 1.2963482489362697e-05, "loss": 0.0408, "step": 79210 }, { "epoch": 2.2224715948940945, "grad_norm": 0.06624428182840347, "learning_rate": 1.295880675176509e-05, "loss": 0.0178, "step": 79220 }, { "epoch": 2.222752139149951, "grad_norm": 1.0144429206848145, "learning_rate": 1.2954131014167487e-05, "loss": 0.022, "step": 79230 }, { "epoch": 2.2230326834058074, "grad_norm": 0.05732322856783867, "learning_rate": 1.294945527656988e-05, "loss": 0.0049, "step": 79240 }, { "epoch": 2.223313227661664, "grad_norm": 0.24514096975326538, "learning_rate": 1.2944779538972273e-05, "loss": 0.0667, "step": 79250 }, { "epoch": 2.22359377191752, "grad_norm": 0.04446076601743698, "learning_rate": 1.2940103801374668e-05, "loss": 0.0083, "step": 79260 }, { "epoch": 2.2238743161733763, "grad_norm": 0.1716500073671341, "learning_rate": 1.2935428063777061e-05, "loss": 0.0494, "step": 79270 }, { "epoch": 2.2241548604292327, "grad_norm": 2.119091272354126, "learning_rate": 1.2930752326179454e-05, "loss": 0.0185, "step": 79280 }, { "epoch": 2.224435404685089, "grad_norm": 0.7865275740623474, "learning_rate": 1.292607658858185e-05, "loss": 0.0164, "step": 79290 }, { "epoch": 2.2247159489409456, "grad_norm": 0.054906465113162994, "learning_rate": 1.2921400850984244e-05, "loss": 0.0509, "step": 79300 }, { "epoch": 2.2249964931968016, "grad_norm": 0.21814468502998352, "learning_rate": 1.2916725113386639e-05, "loss": 0.037, "step": 79310 }, { "epoch": 2.225277037452658, "grad_norm": 0.5773061513900757, "learning_rate": 1.2912049375789032e-05, "loss": 0.0206, "step": 79320 }, { "epoch": 2.2255575817085145, "grad_norm": 0.1556456983089447, "learning_rate": 1.2907373638191425e-05, "loss": 0.0314, "step": 79330 }, { "epoch": 2.225838125964371, "grad_norm": 0.13466480374336243, "learning_rate": 1.2902697900593818e-05, "loss": 0.0272, "step": 79340 }, { "epoch": 2.2261186702202274, "grad_norm": 0.4593693017959595, "learning_rate": 1.2898022162996213e-05, "loss": 0.0279, "step": 79350 }, { "epoch": 2.226399214476084, "grad_norm": 0.3812022805213928, "learning_rate": 1.2893346425398608e-05, "loss": 0.0195, "step": 79360 }, { "epoch": 2.22667975873194, "grad_norm": 0.2498471438884735, "learning_rate": 1.2888670687801003e-05, "loss": 0.0477, "step": 79370 }, { "epoch": 2.2269603029877962, "grad_norm": 0.02068302407860756, "learning_rate": 1.2883994950203396e-05, "loss": 0.0256, "step": 79380 }, { "epoch": 2.2272408472436527, "grad_norm": 0.4062596559524536, "learning_rate": 1.2879319212605789e-05, "loss": 0.0135, "step": 79390 }, { "epoch": 2.227521391499509, "grad_norm": 0.518098771572113, "learning_rate": 1.2874643475008182e-05, "loss": 0.0204, "step": 79400 }, { "epoch": 2.2278019357553656, "grad_norm": 0.5386735200881958, "learning_rate": 1.2869967737410577e-05, "loss": 0.0745, "step": 79410 }, { "epoch": 2.2280824800112216, "grad_norm": 1.3419125080108643, "learning_rate": 1.286529199981297e-05, "loss": 0.0344, "step": 79420 }, { "epoch": 2.228363024267078, "grad_norm": 0.2944817841053009, "learning_rate": 1.2860616262215367e-05, "loss": 0.0196, "step": 79430 }, { "epoch": 2.2286435685229344, "grad_norm": 0.7346105575561523, "learning_rate": 1.285594052461776e-05, "loss": 0.0166, "step": 79440 }, { "epoch": 2.228924112778791, "grad_norm": 0.3127143979072571, "learning_rate": 1.2851264787020153e-05, "loss": 0.0148, "step": 79450 }, { "epoch": 2.2292046570346473, "grad_norm": 0.09452936053276062, "learning_rate": 1.2846589049422548e-05, "loss": 0.0309, "step": 79460 }, { "epoch": 2.2294852012905038, "grad_norm": 0.1892540156841278, "learning_rate": 1.2841913311824941e-05, "loss": 0.0146, "step": 79470 }, { "epoch": 2.2297657455463598, "grad_norm": 0.6811928749084473, "learning_rate": 1.2837237574227334e-05, "loss": 0.005, "step": 79480 }, { "epoch": 2.230046289802216, "grad_norm": 0.04807148873806, "learning_rate": 1.2832561836629727e-05, "loss": 0.0388, "step": 79490 }, { "epoch": 2.2303268340580726, "grad_norm": 0.060951828956604004, "learning_rate": 1.2827886099032124e-05, "loss": 0.0037, "step": 79500 }, { "epoch": 2.230607378313929, "grad_norm": 0.18801133334636688, "learning_rate": 1.2823210361434517e-05, "loss": 0.0082, "step": 79510 }, { "epoch": 2.2308879225697855, "grad_norm": 0.9975072145462036, "learning_rate": 1.2818534623836912e-05, "loss": 0.0488, "step": 79520 }, { "epoch": 2.2311684668256415, "grad_norm": 0.22459502518177032, "learning_rate": 1.2813858886239305e-05, "loss": 0.0164, "step": 79530 }, { "epoch": 2.231449011081498, "grad_norm": 2.257477045059204, "learning_rate": 1.2809183148641698e-05, "loss": 0.0294, "step": 79540 }, { "epoch": 2.2317295553373544, "grad_norm": 0.020096508786082268, "learning_rate": 1.2804507411044091e-05, "loss": 0.0112, "step": 79550 }, { "epoch": 2.232010099593211, "grad_norm": 0.03070659190416336, "learning_rate": 1.2799831673446486e-05, "loss": 0.0399, "step": 79560 }, { "epoch": 2.2322906438490673, "grad_norm": 0.254757821559906, "learning_rate": 1.2795155935848883e-05, "loss": 0.0319, "step": 79570 }, { "epoch": 2.2325711881049237, "grad_norm": 0.27773675322532654, "learning_rate": 1.2790480198251276e-05, "loss": 0.0162, "step": 79580 }, { "epoch": 2.2328517323607797, "grad_norm": 0.13256478309631348, "learning_rate": 1.2785804460653669e-05, "loss": 0.006, "step": 79590 }, { "epoch": 2.233132276616636, "grad_norm": 0.1919250190258026, "learning_rate": 1.2781128723056062e-05, "loss": 0.0198, "step": 79600 }, { "epoch": 2.2334128208724926, "grad_norm": 0.5117636919021606, "learning_rate": 1.2776452985458457e-05, "loss": 0.0257, "step": 79610 }, { "epoch": 2.233693365128349, "grad_norm": 0.11308959871530533, "learning_rate": 1.277177724786085e-05, "loss": 0.0059, "step": 79620 }, { "epoch": 2.2339739093842055, "grad_norm": 0.023081857711076736, "learning_rate": 1.2767101510263243e-05, "loss": 0.0646, "step": 79630 }, { "epoch": 2.234254453640062, "grad_norm": 0.18177498877048492, "learning_rate": 1.276242577266564e-05, "loss": 0.0175, "step": 79640 }, { "epoch": 2.234534997895918, "grad_norm": 0.1286287158727646, "learning_rate": 1.2757750035068033e-05, "loss": 0.0157, "step": 79650 }, { "epoch": 2.2348155421517744, "grad_norm": 0.5469661355018616, "learning_rate": 1.2753074297470428e-05, "loss": 0.0227, "step": 79660 }, { "epoch": 2.235096086407631, "grad_norm": 0.21402284502983093, "learning_rate": 1.2748398559872821e-05, "loss": 0.0096, "step": 79670 }, { "epoch": 2.2353766306634872, "grad_norm": 0.054279692471027374, "learning_rate": 1.2743722822275214e-05, "loss": 0.0087, "step": 79680 }, { "epoch": 2.2356571749193437, "grad_norm": 0.01833854429423809, "learning_rate": 1.2739047084677607e-05, "loss": 0.0047, "step": 79690 }, { "epoch": 2.2359377191752, "grad_norm": 0.26489341259002686, "learning_rate": 1.2734371347080002e-05, "loss": 0.0294, "step": 79700 }, { "epoch": 2.236218263431056, "grad_norm": 0.2384234070777893, "learning_rate": 1.2729695609482397e-05, "loss": 0.0091, "step": 79710 }, { "epoch": 2.2364988076869126, "grad_norm": 0.4587756395339966, "learning_rate": 1.2725019871884792e-05, "loss": 0.0181, "step": 79720 }, { "epoch": 2.236779351942769, "grad_norm": 0.28667163848876953, "learning_rate": 1.2720344134287185e-05, "loss": 0.0077, "step": 79730 }, { "epoch": 2.2370598961986254, "grad_norm": 0.8148247599601746, "learning_rate": 1.2715668396689578e-05, "loss": 0.0145, "step": 79740 }, { "epoch": 2.237340440454482, "grad_norm": 1.3403688669204712, "learning_rate": 1.2710992659091971e-05, "loss": 0.0181, "step": 79750 }, { "epoch": 2.237620984710338, "grad_norm": 0.08842423558235168, "learning_rate": 1.2706316921494366e-05, "loss": 0.0058, "step": 79760 }, { "epoch": 2.2379015289661943, "grad_norm": 0.3784309923648834, "learning_rate": 1.270164118389676e-05, "loss": 0.0381, "step": 79770 }, { "epoch": 2.2381820732220508, "grad_norm": 0.03576464205980301, "learning_rate": 1.2696965446299156e-05, "loss": 0.0281, "step": 79780 }, { "epoch": 2.238462617477907, "grad_norm": 0.032990437000989914, "learning_rate": 1.2692289708701549e-05, "loss": 0.0104, "step": 79790 }, { "epoch": 2.2387431617337636, "grad_norm": 0.13451527059078217, "learning_rate": 1.2687613971103942e-05, "loss": 0.0244, "step": 79800 }, { "epoch": 2.23902370598962, "grad_norm": 0.10790997743606567, "learning_rate": 1.2682938233506337e-05, "loss": 0.044, "step": 79810 }, { "epoch": 2.239304250245476, "grad_norm": 0.33471792936325073, "learning_rate": 1.267826249590873e-05, "loss": 0.0328, "step": 79820 }, { "epoch": 2.2395847945013325, "grad_norm": 0.29245179891586304, "learning_rate": 1.2673586758311123e-05, "loss": 0.0138, "step": 79830 }, { "epoch": 2.239865338757189, "grad_norm": 0.16208569705486298, "learning_rate": 1.2668911020713517e-05, "loss": 0.0144, "step": 79840 }, { "epoch": 2.2401458830130454, "grad_norm": 0.3896842300891876, "learning_rate": 1.2664235283115913e-05, "loss": 0.032, "step": 79850 }, { "epoch": 2.240426427268902, "grad_norm": 0.37781620025634766, "learning_rate": 1.2659559545518306e-05, "loss": 0.0172, "step": 79860 }, { "epoch": 2.240706971524758, "grad_norm": 0.020700616762042046, "learning_rate": 1.2654883807920701e-05, "loss": 0.018, "step": 79870 }, { "epoch": 2.2409875157806143, "grad_norm": 0.013201020658016205, "learning_rate": 1.2650208070323094e-05, "loss": 0.0334, "step": 79880 }, { "epoch": 2.2412680600364707, "grad_norm": 0.23238039016723633, "learning_rate": 1.2645532332725487e-05, "loss": 0.0061, "step": 79890 }, { "epoch": 2.241548604292327, "grad_norm": 0.7019473314285278, "learning_rate": 1.264085659512788e-05, "loss": 0.0471, "step": 79900 }, { "epoch": 2.2418291485481836, "grad_norm": 0.053253330290317535, "learning_rate": 1.2636180857530275e-05, "loss": 0.0162, "step": 79910 }, { "epoch": 2.24210969280404, "grad_norm": 1.7270560264587402, "learning_rate": 1.2631505119932672e-05, "loss": 0.0219, "step": 79920 }, { "epoch": 2.242390237059896, "grad_norm": 0.2368924915790558, "learning_rate": 1.2626829382335065e-05, "loss": 0.0379, "step": 79930 }, { "epoch": 2.2426707813157525, "grad_norm": 0.3622061610221863, "learning_rate": 1.2622153644737458e-05, "loss": 0.0118, "step": 79940 }, { "epoch": 2.242951325571609, "grad_norm": 0.03148871660232544, "learning_rate": 1.2617477907139851e-05, "loss": 0.012, "step": 79950 }, { "epoch": 2.2432318698274654, "grad_norm": 0.06160594895482063, "learning_rate": 1.2612802169542246e-05, "loss": 0.0039, "step": 79960 }, { "epoch": 2.243512414083322, "grad_norm": 0.35297295451164246, "learning_rate": 1.260812643194464e-05, "loss": 0.0299, "step": 79970 }, { "epoch": 2.243792958339178, "grad_norm": 0.010747049935162067, "learning_rate": 1.2603450694347033e-05, "loss": 0.0151, "step": 79980 }, { "epoch": 2.2440735025950342, "grad_norm": 0.26531898975372314, "learning_rate": 1.2598774956749429e-05, "loss": 0.0466, "step": 79990 }, { "epoch": 2.2443540468508907, "grad_norm": 0.3118416368961334, "learning_rate": 1.2594099219151822e-05, "loss": 0.0257, "step": 80000 }, { "epoch": 2.244634591106747, "grad_norm": 1.2189710140228271, "learning_rate": 1.2589423481554215e-05, "loss": 0.0179, "step": 80010 }, { "epoch": 2.2449151353626036, "grad_norm": 0.26102423667907715, "learning_rate": 1.258474774395661e-05, "loss": 0.0216, "step": 80020 }, { "epoch": 2.24519567961846, "grad_norm": 0.18865206837654114, "learning_rate": 1.2580072006359003e-05, "loss": 0.0221, "step": 80030 }, { "epoch": 2.245476223874316, "grad_norm": 0.884271502494812, "learning_rate": 1.2575396268761397e-05, "loss": 0.0105, "step": 80040 }, { "epoch": 2.2457567681301724, "grad_norm": 2.604612112045288, "learning_rate": 1.257072053116379e-05, "loss": 0.0373, "step": 80050 }, { "epoch": 2.246037312386029, "grad_norm": 0.22716441750526428, "learning_rate": 1.2566044793566186e-05, "loss": 0.0047, "step": 80060 }, { "epoch": 2.2463178566418853, "grad_norm": 0.6421786546707153, "learning_rate": 1.2561369055968581e-05, "loss": 0.0333, "step": 80070 }, { "epoch": 2.2465984008977418, "grad_norm": 0.027342695742845535, "learning_rate": 1.2556693318370974e-05, "loss": 0.0095, "step": 80080 }, { "epoch": 2.2468789451535978, "grad_norm": 0.07066542655229568, "learning_rate": 1.2552017580773367e-05, "loss": 0.0078, "step": 80090 }, { "epoch": 2.247159489409454, "grad_norm": 0.4116445481777191, "learning_rate": 1.254734184317576e-05, "loss": 0.0089, "step": 80100 }, { "epoch": 2.2474400336653106, "grad_norm": 0.4931463897228241, "learning_rate": 1.2542666105578155e-05, "loss": 0.0238, "step": 80110 }, { "epoch": 2.247720577921167, "grad_norm": 0.5145263075828552, "learning_rate": 1.2537990367980549e-05, "loss": 0.0167, "step": 80120 }, { "epoch": 2.2480011221770235, "grad_norm": 0.014633768238127232, "learning_rate": 1.2533314630382945e-05, "loss": 0.03, "step": 80130 }, { "epoch": 2.24828166643288, "grad_norm": 2.4057815074920654, "learning_rate": 1.2528638892785338e-05, "loss": 0.038, "step": 80140 }, { "epoch": 2.248562210688736, "grad_norm": 0.0059584518894553185, "learning_rate": 1.2523963155187731e-05, "loss": 0.0049, "step": 80150 }, { "epoch": 2.2488427549445924, "grad_norm": 0.2587454319000244, "learning_rate": 1.2519287417590125e-05, "loss": 0.003, "step": 80160 }, { "epoch": 2.249123299200449, "grad_norm": 0.05980539321899414, "learning_rate": 1.251461167999252e-05, "loss": 0.0071, "step": 80170 }, { "epoch": 2.2494038434563053, "grad_norm": 0.8010545372962952, "learning_rate": 1.2509935942394913e-05, "loss": 0.0195, "step": 80180 }, { "epoch": 2.2496843877121617, "grad_norm": 2.4921255111694336, "learning_rate": 1.2505260204797306e-05, "loss": 0.0238, "step": 80190 }, { "epoch": 2.2499649319680177, "grad_norm": 1.4485281705856323, "learning_rate": 1.2500584467199702e-05, "loss": 0.0239, "step": 80200 }, { "epoch": 2.250245476223874, "grad_norm": 0.021203631535172462, "learning_rate": 1.2495908729602095e-05, "loss": 0.0105, "step": 80210 }, { "epoch": 2.2505260204797306, "grad_norm": 0.6792287826538086, "learning_rate": 1.249123299200449e-05, "loss": 0.0434, "step": 80220 }, { "epoch": 2.250806564735587, "grad_norm": 0.5754197239875793, "learning_rate": 1.2486557254406883e-05, "loss": 0.0277, "step": 80230 }, { "epoch": 2.2510871089914435, "grad_norm": 0.0862416997551918, "learning_rate": 1.2481881516809277e-05, "loss": 0.0179, "step": 80240 }, { "epoch": 2.2513676532473, "grad_norm": 0.20184269547462463, "learning_rate": 1.2477205779211671e-05, "loss": 0.0085, "step": 80250 }, { "epoch": 2.2516481975031564, "grad_norm": 0.037741873413324356, "learning_rate": 1.2472530041614065e-05, "loss": 0.0263, "step": 80260 }, { "epoch": 2.2519287417590124, "grad_norm": 0.15286937355995178, "learning_rate": 1.246785430401646e-05, "loss": 0.0242, "step": 80270 }, { "epoch": 2.252209286014869, "grad_norm": 0.30212119221687317, "learning_rate": 1.2463178566418854e-05, "loss": 0.0223, "step": 80280 }, { "epoch": 2.2524898302707252, "grad_norm": 0.1778760552406311, "learning_rate": 1.2458502828821247e-05, "loss": 0.009, "step": 80290 }, { "epoch": 2.2527703745265817, "grad_norm": 0.02362421713769436, "learning_rate": 1.245382709122364e-05, "loss": 0.0271, "step": 80300 }, { "epoch": 2.2530509187824377, "grad_norm": 0.1392417550086975, "learning_rate": 1.2449151353626034e-05, "loss": 0.0237, "step": 80310 }, { "epoch": 2.253331463038294, "grad_norm": 0.4989601969718933, "learning_rate": 1.244447561602843e-05, "loss": 0.0094, "step": 80320 }, { "epoch": 2.2536120072941506, "grad_norm": 0.0188896544277668, "learning_rate": 1.2439799878430823e-05, "loss": 0.0153, "step": 80330 }, { "epoch": 2.253892551550007, "grad_norm": 0.032765306532382965, "learning_rate": 1.2435124140833217e-05, "loss": 0.016, "step": 80340 }, { "epoch": 2.2541730958058634, "grad_norm": 2.0159647464752197, "learning_rate": 1.2430448403235611e-05, "loss": 0.0357, "step": 80350 }, { "epoch": 2.25445364006172, "grad_norm": 0.1310393363237381, "learning_rate": 1.2425772665638005e-05, "loss": 0.0195, "step": 80360 }, { "epoch": 2.2547341843175763, "grad_norm": 0.2656953036785126, "learning_rate": 1.24210969280404e-05, "loss": 0.0515, "step": 80370 }, { "epoch": 2.2550147285734323, "grad_norm": 1.5806543827056885, "learning_rate": 1.2416421190442793e-05, "loss": 0.0356, "step": 80380 }, { "epoch": 2.2552952728292888, "grad_norm": 1.3060543537139893, "learning_rate": 1.2411745452845187e-05, "loss": 0.0223, "step": 80390 }, { "epoch": 2.255575817085145, "grad_norm": 0.18443430960178375, "learning_rate": 1.240706971524758e-05, "loss": 0.0303, "step": 80400 }, { "epoch": 2.2558563613410016, "grad_norm": 0.03587474673986435, "learning_rate": 1.2402393977649974e-05, "loss": 0.0079, "step": 80410 }, { "epoch": 2.256136905596858, "grad_norm": 0.05836546793580055, "learning_rate": 1.2397718240052369e-05, "loss": 0.0032, "step": 80420 }, { "epoch": 2.256417449852714, "grad_norm": 1.1221330165863037, "learning_rate": 1.2393042502454764e-05, "loss": 0.0428, "step": 80430 }, { "epoch": 2.2566979941085705, "grad_norm": 0.027049731463193893, "learning_rate": 1.2388366764857157e-05, "loss": 0.0207, "step": 80440 }, { "epoch": 2.256978538364427, "grad_norm": 0.06312885135412216, "learning_rate": 1.238369102725955e-05, "loss": 0.0117, "step": 80450 }, { "epoch": 2.2572590826202834, "grad_norm": 0.17987917363643646, "learning_rate": 1.2379015289661945e-05, "loss": 0.0229, "step": 80460 }, { "epoch": 2.25753962687614, "grad_norm": 0.7380489706993103, "learning_rate": 1.237433955206434e-05, "loss": 0.0268, "step": 80470 }, { "epoch": 2.2578201711319963, "grad_norm": 1.413260579109192, "learning_rate": 1.2369663814466733e-05, "loss": 0.0166, "step": 80480 }, { "epoch": 2.2581007153878523, "grad_norm": 0.26892444491386414, "learning_rate": 1.2364988076869128e-05, "loss": 0.0245, "step": 80490 }, { "epoch": 2.2583812596437087, "grad_norm": 0.02262328565120697, "learning_rate": 1.236031233927152e-05, "loss": 0.0383, "step": 80500 }, { "epoch": 2.258661803899565, "grad_norm": 0.03851970657706261, "learning_rate": 1.2355636601673914e-05, "loss": 0.0373, "step": 80510 }, { "epoch": 2.2589423481554216, "grad_norm": 0.04838123917579651, "learning_rate": 1.2350960864076309e-05, "loss": 0.0078, "step": 80520 }, { "epoch": 2.259222892411278, "grad_norm": 1.8112170696258545, "learning_rate": 1.2346285126478704e-05, "loss": 0.0303, "step": 80530 }, { "epoch": 2.259503436667134, "grad_norm": 0.9567426443099976, "learning_rate": 1.2341609388881097e-05, "loss": 0.0396, "step": 80540 }, { "epoch": 2.2597839809229905, "grad_norm": 0.053389985114336014, "learning_rate": 1.233693365128349e-05, "loss": 0.0046, "step": 80550 }, { "epoch": 2.260064525178847, "grad_norm": 0.4326188862323761, "learning_rate": 1.2332257913685885e-05, "loss": 0.0388, "step": 80560 }, { "epoch": 2.2603450694347034, "grad_norm": 0.06620538234710693, "learning_rate": 1.232758217608828e-05, "loss": 0.0164, "step": 80570 }, { "epoch": 2.26062561369056, "grad_norm": 1.051652431488037, "learning_rate": 1.2322906438490673e-05, "loss": 0.0285, "step": 80580 }, { "epoch": 2.2609061579464163, "grad_norm": 0.37658530473709106, "learning_rate": 1.2318230700893066e-05, "loss": 0.0185, "step": 80590 }, { "epoch": 2.2611867022022722, "grad_norm": 0.37710195779800415, "learning_rate": 1.231355496329546e-05, "loss": 0.024, "step": 80600 }, { "epoch": 2.2614672464581287, "grad_norm": 0.05726177990436554, "learning_rate": 1.2308879225697854e-05, "loss": 0.021, "step": 80610 }, { "epoch": 2.261747790713985, "grad_norm": 0.16865432262420654, "learning_rate": 1.2304203488100249e-05, "loss": 0.0129, "step": 80620 }, { "epoch": 2.2620283349698416, "grad_norm": 0.23868955671787262, "learning_rate": 1.2299527750502644e-05, "loss": 0.0184, "step": 80630 }, { "epoch": 2.262308879225698, "grad_norm": 0.04323672130703926, "learning_rate": 1.2294852012905037e-05, "loss": 0.0464, "step": 80640 }, { "epoch": 2.262589423481554, "grad_norm": 0.19457335770130157, "learning_rate": 1.229017627530743e-05, "loss": 0.0123, "step": 80650 }, { "epoch": 2.2628699677374104, "grad_norm": 0.5968988537788391, "learning_rate": 1.2285500537709823e-05, "loss": 0.0225, "step": 80660 }, { "epoch": 2.263150511993267, "grad_norm": 0.06723016500473022, "learning_rate": 1.2280824800112218e-05, "loss": 0.0127, "step": 80670 }, { "epoch": 2.2634310562491233, "grad_norm": 0.6459909677505493, "learning_rate": 1.2276149062514613e-05, "loss": 0.0287, "step": 80680 }, { "epoch": 2.2637116005049798, "grad_norm": 0.2841931879520416, "learning_rate": 1.2271473324917006e-05, "loss": 0.0203, "step": 80690 }, { "epoch": 2.263992144760836, "grad_norm": 0.32009783387184143, "learning_rate": 1.22667975873194e-05, "loss": 0.0149, "step": 80700 }, { "epoch": 2.264272689016692, "grad_norm": 0.32117101550102234, "learning_rate": 1.2262121849721794e-05, "loss": 0.0524, "step": 80710 }, { "epoch": 2.2645532332725486, "grad_norm": 1.0430916547775269, "learning_rate": 1.2257446112124189e-05, "loss": 0.0131, "step": 80720 }, { "epoch": 2.264833777528405, "grad_norm": 0.6220389604568481, "learning_rate": 1.2252770374526582e-05, "loss": 0.025, "step": 80730 }, { "epoch": 2.2651143217842615, "grad_norm": 0.027080891653895378, "learning_rate": 1.2248094636928977e-05, "loss": 0.0152, "step": 80740 }, { "epoch": 2.265394866040118, "grad_norm": 0.011273701675236225, "learning_rate": 1.224341889933137e-05, "loss": 0.0086, "step": 80750 }, { "epoch": 2.265675410295974, "grad_norm": 0.06262043863534927, "learning_rate": 1.2238743161733763e-05, "loss": 0.0158, "step": 80760 }, { "epoch": 2.2659559545518304, "grad_norm": 0.29439613223075867, "learning_rate": 1.2234067424136158e-05, "loss": 0.0074, "step": 80770 }, { "epoch": 2.266236498807687, "grad_norm": 0.04728693142533302, "learning_rate": 1.2229391686538553e-05, "loss": 0.0319, "step": 80780 }, { "epoch": 2.2665170430635433, "grad_norm": 0.07225437462329865, "learning_rate": 1.2224715948940946e-05, "loss": 0.0186, "step": 80790 }, { "epoch": 2.2667975873193997, "grad_norm": 0.025148984044790268, "learning_rate": 1.2220040211343339e-05, "loss": 0.0258, "step": 80800 }, { "epoch": 2.267078131575256, "grad_norm": 0.03057478740811348, "learning_rate": 1.2215364473745734e-05, "loss": 0.0435, "step": 80810 }, { "epoch": 2.267358675831112, "grad_norm": 0.04062555730342865, "learning_rate": 1.2210688736148127e-05, "loss": 0.0205, "step": 80820 }, { "epoch": 2.2676392200869686, "grad_norm": 0.40650278329849243, "learning_rate": 1.2206012998550522e-05, "loss": 0.0109, "step": 80830 }, { "epoch": 2.267919764342825, "grad_norm": 0.013975162990391254, "learning_rate": 1.2201337260952917e-05, "loss": 0.0112, "step": 80840 }, { "epoch": 2.2682003085986815, "grad_norm": 1.4671109914779663, "learning_rate": 1.219666152335531e-05, "loss": 0.0149, "step": 80850 }, { "epoch": 2.268480852854538, "grad_norm": 0.047015439718961716, "learning_rate": 1.2191985785757703e-05, "loss": 0.011, "step": 80860 }, { "epoch": 2.268761397110394, "grad_norm": 0.03924690559506416, "learning_rate": 1.2187310048160098e-05, "loss": 0.0147, "step": 80870 }, { "epoch": 2.2690419413662504, "grad_norm": 0.567528247833252, "learning_rate": 1.2182634310562493e-05, "loss": 0.0284, "step": 80880 }, { "epoch": 2.269322485622107, "grad_norm": 0.0429609976708889, "learning_rate": 1.2177958572964886e-05, "loss": 0.0211, "step": 80890 }, { "epoch": 2.2696030298779633, "grad_norm": 0.05789937078952789, "learning_rate": 1.2173282835367279e-05, "loss": 0.0097, "step": 80900 }, { "epoch": 2.2698835741338197, "grad_norm": 0.05011226609349251, "learning_rate": 1.2168607097769674e-05, "loss": 0.0109, "step": 80910 }, { "epoch": 2.270164118389676, "grad_norm": 0.033475857228040695, "learning_rate": 1.2163931360172067e-05, "loss": 0.0129, "step": 80920 }, { "epoch": 2.2704446626455326, "grad_norm": 0.024625582620501518, "learning_rate": 1.2159255622574462e-05, "loss": 0.0191, "step": 80930 }, { "epoch": 2.2707252069013886, "grad_norm": 0.41599780321121216, "learning_rate": 1.2154579884976857e-05, "loss": 0.029, "step": 80940 }, { "epoch": 2.271005751157245, "grad_norm": 0.052009761333465576, "learning_rate": 1.214990414737925e-05, "loss": 0.0086, "step": 80950 }, { "epoch": 2.2712862954131015, "grad_norm": 6.239321231842041, "learning_rate": 1.2145228409781643e-05, "loss": 0.0387, "step": 80960 }, { "epoch": 2.271566839668958, "grad_norm": 1.35426926612854, "learning_rate": 1.2140552672184036e-05, "loss": 0.0193, "step": 80970 }, { "epoch": 2.271847383924814, "grad_norm": 0.010961869731545448, "learning_rate": 1.2135876934586433e-05, "loss": 0.0187, "step": 80980 }, { "epoch": 2.2721279281806703, "grad_norm": 0.06197577714920044, "learning_rate": 1.2131201196988826e-05, "loss": 0.0126, "step": 80990 }, { "epoch": 2.2724084724365268, "grad_norm": 0.07963493466377258, "learning_rate": 1.2126525459391219e-05, "loss": 0.0252, "step": 81000 }, { "epoch": 2.272689016692383, "grad_norm": 0.1201038807630539, "learning_rate": 1.2121849721793614e-05, "loss": 0.0091, "step": 81010 }, { "epoch": 2.2729695609482397, "grad_norm": 0.03453924506902695, "learning_rate": 1.2117173984196007e-05, "loss": 0.0257, "step": 81020 }, { "epoch": 2.273250105204096, "grad_norm": 0.3676778972148895, "learning_rate": 1.2112498246598402e-05, "loss": 0.013, "step": 81030 }, { "epoch": 2.2735306494599525, "grad_norm": 0.8547825813293457, "learning_rate": 1.2107822509000795e-05, "loss": 0.0234, "step": 81040 }, { "epoch": 2.2738111937158085, "grad_norm": 0.13928759098052979, "learning_rate": 1.210314677140319e-05, "loss": 0.0071, "step": 81050 }, { "epoch": 2.274091737971665, "grad_norm": 0.11768098175525665, "learning_rate": 1.2098471033805583e-05, "loss": 0.0117, "step": 81060 }, { "epoch": 2.2743722822275214, "grad_norm": 0.1289471834897995, "learning_rate": 1.2093795296207976e-05, "loss": 0.0175, "step": 81070 }, { "epoch": 2.274652826483378, "grad_norm": 0.03520585969090462, "learning_rate": 1.2089119558610373e-05, "loss": 0.0131, "step": 81080 }, { "epoch": 2.2749333707392343, "grad_norm": 0.4963972866535187, "learning_rate": 1.2084443821012766e-05, "loss": 0.0162, "step": 81090 }, { "epoch": 2.2752139149950903, "grad_norm": 0.008946969173848629, "learning_rate": 1.2079768083415159e-05, "loss": 0.0107, "step": 81100 }, { "epoch": 2.2754944592509467, "grad_norm": 0.41879433393478394, "learning_rate": 1.2075092345817552e-05, "loss": 0.0338, "step": 81110 }, { "epoch": 2.275775003506803, "grad_norm": 0.20040516555309296, "learning_rate": 1.2070416608219947e-05, "loss": 0.0197, "step": 81120 }, { "epoch": 2.2760555477626596, "grad_norm": 0.010612486861646175, "learning_rate": 1.2065740870622342e-05, "loss": 0.0121, "step": 81130 }, { "epoch": 2.276336092018516, "grad_norm": 0.010047622956335545, "learning_rate": 1.2061065133024735e-05, "loss": 0.0132, "step": 81140 }, { "epoch": 2.2766166362743725, "grad_norm": 0.18166688084602356, "learning_rate": 1.205638939542713e-05, "loss": 0.034, "step": 81150 }, { "epoch": 2.2768971805302285, "grad_norm": 0.01573329232633114, "learning_rate": 1.2051713657829523e-05, "loss": 0.0117, "step": 81160 }, { "epoch": 2.277177724786085, "grad_norm": 0.017333803698420525, "learning_rate": 1.2047037920231916e-05, "loss": 0.0124, "step": 81170 }, { "epoch": 2.2774582690419414, "grad_norm": 0.007872847840189934, "learning_rate": 1.2042362182634311e-05, "loss": 0.0152, "step": 81180 }, { "epoch": 2.277738813297798, "grad_norm": 0.04281049221754074, "learning_rate": 1.2037686445036706e-05, "loss": 0.0142, "step": 81190 }, { "epoch": 2.2780193575536543, "grad_norm": 0.025536231696605682, "learning_rate": 1.2033010707439099e-05, "loss": 0.005, "step": 81200 }, { "epoch": 2.2782999018095103, "grad_norm": 0.9847062826156616, "learning_rate": 1.2028334969841492e-05, "loss": 0.0289, "step": 81210 }, { "epoch": 2.2785804460653667, "grad_norm": 0.11528340727090836, "learning_rate": 1.2023659232243887e-05, "loss": 0.0141, "step": 81220 }, { "epoch": 2.278860990321223, "grad_norm": 0.018420975655317307, "learning_rate": 1.2018983494646282e-05, "loss": 0.0494, "step": 81230 }, { "epoch": 2.2791415345770796, "grad_norm": 0.437954306602478, "learning_rate": 1.2014307757048675e-05, "loss": 0.0195, "step": 81240 }, { "epoch": 2.279422078832936, "grad_norm": 0.02869957685470581, "learning_rate": 1.2009632019451068e-05, "loss": 0.0374, "step": 81250 }, { "epoch": 2.2797026230887925, "grad_norm": 0.6674860715866089, "learning_rate": 1.2004956281853463e-05, "loss": 0.0122, "step": 81260 }, { "epoch": 2.2799831673446485, "grad_norm": 0.022493092343211174, "learning_rate": 1.2000280544255856e-05, "loss": 0.0133, "step": 81270 }, { "epoch": 2.280263711600505, "grad_norm": 0.4179549813270569, "learning_rate": 1.1995604806658251e-05, "loss": 0.0259, "step": 81280 }, { "epoch": 2.2805442558563613, "grad_norm": 1.3346713781356812, "learning_rate": 1.1990929069060646e-05, "loss": 0.0132, "step": 81290 }, { "epoch": 2.2808248001122178, "grad_norm": 0.02580123394727707, "learning_rate": 1.198625333146304e-05, "loss": 0.0396, "step": 81300 }, { "epoch": 2.281105344368074, "grad_norm": 0.058734383434057236, "learning_rate": 1.1981577593865432e-05, "loss": 0.0043, "step": 81310 }, { "epoch": 2.28138588862393, "grad_norm": 1.2974276542663574, "learning_rate": 1.1976901856267825e-05, "loss": 0.0106, "step": 81320 }, { "epoch": 2.2816664328797867, "grad_norm": 1.5424139499664307, "learning_rate": 1.197222611867022e-05, "loss": 0.026, "step": 81330 }, { "epoch": 2.281946977135643, "grad_norm": 1.213597059249878, "learning_rate": 1.1967550381072615e-05, "loss": 0.0168, "step": 81340 }, { "epoch": 2.2822275213914995, "grad_norm": 0.01839766465127468, "learning_rate": 1.1962874643475008e-05, "loss": 0.02, "step": 81350 }, { "epoch": 2.282508065647356, "grad_norm": 0.01971977762877941, "learning_rate": 1.1958198905877403e-05, "loss": 0.0194, "step": 81360 }, { "epoch": 2.2827886099032124, "grad_norm": 0.007863717153668404, "learning_rate": 1.1953523168279796e-05, "loss": 0.0282, "step": 81370 }, { "epoch": 2.2830691541590684, "grad_norm": 0.24039973318576813, "learning_rate": 1.1948847430682191e-05, "loss": 0.0356, "step": 81380 }, { "epoch": 2.283349698414925, "grad_norm": 0.24499230086803436, "learning_rate": 1.1944171693084584e-05, "loss": 0.0412, "step": 81390 }, { "epoch": 2.2836302426707813, "grad_norm": 0.4851168096065521, "learning_rate": 1.193949595548698e-05, "loss": 0.0438, "step": 81400 }, { "epoch": 2.2839107869266377, "grad_norm": 0.09263689070940018, "learning_rate": 1.1934820217889372e-05, "loss": 0.0194, "step": 81410 }, { "epoch": 2.284191331182494, "grad_norm": 0.509106457233429, "learning_rate": 1.1930144480291765e-05, "loss": 0.0224, "step": 81420 }, { "epoch": 2.28447187543835, "grad_norm": 0.4061622619628906, "learning_rate": 1.192546874269416e-05, "loss": 0.0158, "step": 81430 }, { "epoch": 2.2847524196942066, "grad_norm": 0.057871896773576736, "learning_rate": 1.1920793005096555e-05, "loss": 0.0263, "step": 81440 }, { "epoch": 2.285032963950063, "grad_norm": 0.3264864385128021, "learning_rate": 1.1916117267498948e-05, "loss": 0.0199, "step": 81450 }, { "epoch": 2.2853135082059195, "grad_norm": 0.02598392218351364, "learning_rate": 1.1911441529901342e-05, "loss": 0.0122, "step": 81460 }, { "epoch": 2.285594052461776, "grad_norm": 0.052561573684215546, "learning_rate": 1.1906765792303736e-05, "loss": 0.0153, "step": 81470 }, { "epoch": 2.2858745967176324, "grad_norm": 0.062232401221990585, "learning_rate": 1.1902090054706131e-05, "loss": 0.0066, "step": 81480 }, { "epoch": 2.2861551409734884, "grad_norm": 1.1163991689682007, "learning_rate": 1.1897414317108524e-05, "loss": 0.0237, "step": 81490 }, { "epoch": 2.286435685229345, "grad_norm": 0.5656699538230896, "learning_rate": 1.189273857951092e-05, "loss": 0.0094, "step": 81500 }, { "epoch": 2.2867162294852013, "grad_norm": 0.024577710777521133, "learning_rate": 1.1888062841913312e-05, "loss": 0.0406, "step": 81510 }, { "epoch": 2.2869967737410577, "grad_norm": 0.19735054671764374, "learning_rate": 1.1883387104315706e-05, "loss": 0.0157, "step": 81520 }, { "epoch": 2.287277317996914, "grad_norm": 0.37793800234794617, "learning_rate": 1.18787113667181e-05, "loss": 0.0161, "step": 81530 }, { "epoch": 2.28755786225277, "grad_norm": 0.2866263687610626, "learning_rate": 1.1874035629120495e-05, "loss": 0.0039, "step": 81540 }, { "epoch": 2.2878384065086266, "grad_norm": 0.019946888089179993, "learning_rate": 1.1869359891522888e-05, "loss": 0.0125, "step": 81550 }, { "epoch": 2.288118950764483, "grad_norm": 0.07150907069444656, "learning_rate": 1.1864684153925282e-05, "loss": 0.0315, "step": 81560 }, { "epoch": 2.2883994950203395, "grad_norm": 0.037453316152095795, "learning_rate": 1.1860008416327676e-05, "loss": 0.0223, "step": 81570 }, { "epoch": 2.288680039276196, "grad_norm": 0.023099062964320183, "learning_rate": 1.185533267873007e-05, "loss": 0.0216, "step": 81580 }, { "epoch": 2.2889605835320523, "grad_norm": 0.1594531238079071, "learning_rate": 1.1850656941132464e-05, "loss": 0.0139, "step": 81590 }, { "epoch": 2.289241127787909, "grad_norm": 0.7658059597015381, "learning_rate": 1.184598120353486e-05, "loss": 0.0125, "step": 81600 }, { "epoch": 2.2895216720437648, "grad_norm": 0.5045680403709412, "learning_rate": 1.1841305465937252e-05, "loss": 0.0228, "step": 81610 }, { "epoch": 2.289802216299621, "grad_norm": 0.02044943906366825, "learning_rate": 1.1836629728339646e-05, "loss": 0.0145, "step": 81620 }, { "epoch": 2.2900827605554777, "grad_norm": 0.05954071134328842, "learning_rate": 1.183195399074204e-05, "loss": 0.0221, "step": 81630 }, { "epoch": 2.290363304811334, "grad_norm": 0.05618792772293091, "learning_rate": 1.1827278253144435e-05, "loss": 0.0231, "step": 81640 }, { "epoch": 2.2906438490671905, "grad_norm": 0.5965000987052917, "learning_rate": 1.1822602515546828e-05, "loss": 0.0253, "step": 81650 }, { "epoch": 2.2909243933230465, "grad_norm": 0.16189420223236084, "learning_rate": 1.1817926777949222e-05, "loss": 0.038, "step": 81660 }, { "epoch": 2.291204937578903, "grad_norm": 0.45674553513526917, "learning_rate": 1.1813251040351616e-05, "loss": 0.0188, "step": 81670 }, { "epoch": 2.2914854818347594, "grad_norm": 0.35772496461868286, "learning_rate": 1.180857530275401e-05, "loss": 0.0435, "step": 81680 }, { "epoch": 2.291766026090616, "grad_norm": 0.3118321895599365, "learning_rate": 1.1803899565156404e-05, "loss": 0.0216, "step": 81690 }, { "epoch": 2.2920465703464723, "grad_norm": 0.9338226914405823, "learning_rate": 1.1799223827558798e-05, "loss": 0.0234, "step": 81700 }, { "epoch": 2.2923271146023287, "grad_norm": 0.7074741125106812, "learning_rate": 1.1794548089961192e-05, "loss": 0.0416, "step": 81710 }, { "epoch": 2.2926076588581847, "grad_norm": 0.16227483749389648, "learning_rate": 1.1789872352363586e-05, "loss": 0.0066, "step": 81720 }, { "epoch": 2.292888203114041, "grad_norm": 0.04064891114830971, "learning_rate": 1.1785196614765979e-05, "loss": 0.0137, "step": 81730 }, { "epoch": 2.2931687473698976, "grad_norm": 0.8201170563697815, "learning_rate": 1.1780520877168375e-05, "loss": 0.0225, "step": 81740 }, { "epoch": 2.293449291625754, "grad_norm": 1.5811697244644165, "learning_rate": 1.1775845139570768e-05, "loss": 0.057, "step": 81750 }, { "epoch": 2.2937298358816105, "grad_norm": 0.11257956176996231, "learning_rate": 1.1771169401973162e-05, "loss": 0.0321, "step": 81760 }, { "epoch": 2.2940103801374665, "grad_norm": 0.7272412180900574, "learning_rate": 1.1766493664375555e-05, "loss": 0.0392, "step": 81770 }, { "epoch": 2.294290924393323, "grad_norm": 0.05319290980696678, "learning_rate": 1.176181792677795e-05, "loss": 0.032, "step": 81780 }, { "epoch": 2.2945714686491794, "grad_norm": 0.1617778241634369, "learning_rate": 1.1757142189180344e-05, "loss": 0.0331, "step": 81790 }, { "epoch": 2.294852012905036, "grad_norm": 0.22688862681388855, "learning_rate": 1.1752466451582738e-05, "loss": 0.0197, "step": 81800 }, { "epoch": 2.2951325571608923, "grad_norm": 0.09210684895515442, "learning_rate": 1.1747790713985132e-05, "loss": 0.016, "step": 81810 }, { "epoch": 2.2954131014167487, "grad_norm": 0.04309071600437164, "learning_rate": 1.1743114976387526e-05, "loss": 0.0489, "step": 81820 }, { "epoch": 2.2956936456726047, "grad_norm": 0.38965117931365967, "learning_rate": 1.1738439238789919e-05, "loss": 0.0082, "step": 81830 }, { "epoch": 2.295974189928461, "grad_norm": 0.03174401819705963, "learning_rate": 1.1733763501192314e-05, "loss": 0.0245, "step": 81840 }, { "epoch": 2.2962547341843176, "grad_norm": 0.28563210368156433, "learning_rate": 1.1729087763594708e-05, "loss": 0.0221, "step": 81850 }, { "epoch": 2.296535278440174, "grad_norm": 1.45651376247406, "learning_rate": 1.1724412025997102e-05, "loss": 0.0326, "step": 81860 }, { "epoch": 2.2968158226960305, "grad_norm": 1.3925987482070923, "learning_rate": 1.1719736288399495e-05, "loss": 0.0152, "step": 81870 }, { "epoch": 2.2970963669518865, "grad_norm": 2.5364809036254883, "learning_rate": 1.171506055080189e-05, "loss": 0.0354, "step": 81880 }, { "epoch": 2.297376911207743, "grad_norm": 0.009293398819863796, "learning_rate": 1.1710384813204284e-05, "loss": 0.0455, "step": 81890 }, { "epoch": 2.2976574554635993, "grad_norm": 1.716414451599121, "learning_rate": 1.1705709075606678e-05, "loss": 0.0339, "step": 81900 }, { "epoch": 2.297937999719456, "grad_norm": 0.09676366299390793, "learning_rate": 1.170103333800907e-05, "loss": 0.0172, "step": 81910 }, { "epoch": 2.2982185439753122, "grad_norm": 4.332406520843506, "learning_rate": 1.1696357600411466e-05, "loss": 0.0081, "step": 81920 }, { "epoch": 2.2984990882311687, "grad_norm": 1.7048323154449463, "learning_rate": 1.1691681862813859e-05, "loss": 0.0193, "step": 81930 }, { "epoch": 2.2987796324870247, "grad_norm": 0.4591773748397827, "learning_rate": 1.1687006125216254e-05, "loss": 0.0118, "step": 81940 }, { "epoch": 2.299060176742881, "grad_norm": 0.031210515648126602, "learning_rate": 1.1682330387618648e-05, "loss": 0.0047, "step": 81950 }, { "epoch": 2.2993407209987375, "grad_norm": 0.14033374190330505, "learning_rate": 1.1677654650021042e-05, "loss": 0.0261, "step": 81960 }, { "epoch": 2.299621265254594, "grad_norm": 0.07270727306604385, "learning_rate": 1.1672978912423435e-05, "loss": 0.0103, "step": 81970 }, { "epoch": 2.2999018095104504, "grad_norm": 0.02398337982594967, "learning_rate": 1.1668303174825828e-05, "loss": 0.0269, "step": 81980 }, { "epoch": 2.3001823537663064, "grad_norm": 0.09432374686002731, "learning_rate": 1.1663627437228224e-05, "loss": 0.0493, "step": 81990 }, { "epoch": 2.300462898022163, "grad_norm": 0.058255162090063095, "learning_rate": 1.1658951699630618e-05, "loss": 0.0135, "step": 82000 }, { "epoch": 2.3007434422780193, "grad_norm": 0.1302894502878189, "learning_rate": 1.165427596203301e-05, "loss": 0.043, "step": 82010 }, { "epoch": 2.3010239865338757, "grad_norm": 0.09741657972335815, "learning_rate": 1.1649600224435406e-05, "loss": 0.0385, "step": 82020 }, { "epoch": 2.301304530789732, "grad_norm": 0.6221148371696472, "learning_rate": 1.1644924486837799e-05, "loss": 0.0176, "step": 82030 }, { "epoch": 2.3015850750455886, "grad_norm": 0.13769926130771637, "learning_rate": 1.1640248749240194e-05, "loss": 0.013, "step": 82040 }, { "epoch": 2.3018656193014446, "grad_norm": 0.5243629813194275, "learning_rate": 1.1635573011642587e-05, "loss": 0.0125, "step": 82050 }, { "epoch": 2.302146163557301, "grad_norm": 0.29685842990875244, "learning_rate": 1.1630897274044982e-05, "loss": 0.0434, "step": 82060 }, { "epoch": 2.3024267078131575, "grad_norm": 0.8490838408470154, "learning_rate": 1.1626221536447375e-05, "loss": 0.0108, "step": 82070 }, { "epoch": 2.302707252069014, "grad_norm": 0.48372846841812134, "learning_rate": 1.1621545798849768e-05, "loss": 0.0045, "step": 82080 }, { "epoch": 2.3029877963248704, "grad_norm": 0.42080673575401306, "learning_rate": 1.1616870061252163e-05, "loss": 0.036, "step": 82090 }, { "epoch": 2.3032683405807264, "grad_norm": 0.055228643119335175, "learning_rate": 1.1612194323654558e-05, "loss": 0.0137, "step": 82100 }, { "epoch": 2.303548884836583, "grad_norm": 0.1804499328136444, "learning_rate": 1.160751858605695e-05, "loss": 0.0053, "step": 82110 }, { "epoch": 2.3038294290924393, "grad_norm": 0.45453381538391113, "learning_rate": 1.1602842848459344e-05, "loss": 0.0077, "step": 82120 }, { "epoch": 2.3041099733482957, "grad_norm": 0.029700253158807755, "learning_rate": 1.1598167110861739e-05, "loss": 0.0331, "step": 82130 }, { "epoch": 2.304390517604152, "grad_norm": 0.06834365427494049, "learning_rate": 1.1593491373264134e-05, "loss": 0.0144, "step": 82140 }, { "epoch": 2.3046710618600086, "grad_norm": 1.7978129386901855, "learning_rate": 1.1588815635666527e-05, "loss": 0.0417, "step": 82150 }, { "epoch": 2.304951606115865, "grad_norm": 0.010790509171783924, "learning_rate": 1.1584139898068922e-05, "loss": 0.0072, "step": 82160 }, { "epoch": 2.305232150371721, "grad_norm": 0.35429704189300537, "learning_rate": 1.1579464160471315e-05, "loss": 0.0091, "step": 82170 }, { "epoch": 2.3055126946275775, "grad_norm": 1.1764599084854126, "learning_rate": 1.1574788422873708e-05, "loss": 0.0179, "step": 82180 }, { "epoch": 2.305793238883434, "grad_norm": 0.07628223299980164, "learning_rate": 1.1570112685276103e-05, "loss": 0.0019, "step": 82190 }, { "epoch": 2.3060737831392903, "grad_norm": 0.016834806650877, "learning_rate": 1.1565436947678498e-05, "loss": 0.0155, "step": 82200 }, { "epoch": 2.3063543273951463, "grad_norm": 1.966045618057251, "learning_rate": 1.156076121008089e-05, "loss": 0.0318, "step": 82210 }, { "epoch": 2.306634871651003, "grad_norm": 0.13950979709625244, "learning_rate": 1.1556085472483284e-05, "loss": 0.0108, "step": 82220 }, { "epoch": 2.3069154159068592, "grad_norm": 0.07579728215932846, "learning_rate": 1.1551409734885679e-05, "loss": 0.011, "step": 82230 }, { "epoch": 2.3071959601627157, "grad_norm": 0.03775336965918541, "learning_rate": 1.1546733997288072e-05, "loss": 0.0503, "step": 82240 }, { "epoch": 2.307476504418572, "grad_norm": 0.42038416862487793, "learning_rate": 1.1542058259690467e-05, "loss": 0.0103, "step": 82250 }, { "epoch": 2.3077570486744285, "grad_norm": 0.014450052753090858, "learning_rate": 1.1537382522092862e-05, "loss": 0.0125, "step": 82260 }, { "epoch": 2.308037592930285, "grad_norm": 0.3507364094257355, "learning_rate": 1.1532706784495255e-05, "loss": 0.0392, "step": 82270 }, { "epoch": 2.308318137186141, "grad_norm": 0.03886644169688225, "learning_rate": 1.1528031046897648e-05, "loss": 0.0113, "step": 82280 }, { "epoch": 2.3085986814419974, "grad_norm": 0.03673188015818596, "learning_rate": 1.1523355309300043e-05, "loss": 0.0132, "step": 82290 }, { "epoch": 2.308879225697854, "grad_norm": 0.03218456730246544, "learning_rate": 1.1518679571702438e-05, "loss": 0.0188, "step": 82300 }, { "epoch": 2.3091597699537103, "grad_norm": 0.23253898322582245, "learning_rate": 1.1514003834104831e-05, "loss": 0.0101, "step": 82310 }, { "epoch": 2.3094403142095667, "grad_norm": 0.017541592940688133, "learning_rate": 1.1509328096507224e-05, "loss": 0.0268, "step": 82320 }, { "epoch": 2.3097208584654227, "grad_norm": 0.12058064341545105, "learning_rate": 1.1504652358909619e-05, "loss": 0.0095, "step": 82330 }, { "epoch": 2.310001402721279, "grad_norm": 2.9913227558135986, "learning_rate": 1.1499976621312012e-05, "loss": 0.0358, "step": 82340 }, { "epoch": 2.3102819469771356, "grad_norm": 0.016143133863806725, "learning_rate": 1.1495300883714407e-05, "loss": 0.0068, "step": 82350 }, { "epoch": 2.310562491232992, "grad_norm": 0.033249303698539734, "learning_rate": 1.14906251461168e-05, "loss": 0.0046, "step": 82360 }, { "epoch": 2.3108430354888485, "grad_norm": 0.5238046646118164, "learning_rate": 1.1485949408519195e-05, "loss": 0.0041, "step": 82370 }, { "epoch": 2.311123579744705, "grad_norm": 0.36453402042388916, "learning_rate": 1.1481273670921588e-05, "loss": 0.0317, "step": 82380 }, { "epoch": 2.311404124000561, "grad_norm": 0.030247334390878677, "learning_rate": 1.1476597933323983e-05, "loss": 0.0051, "step": 82390 }, { "epoch": 2.3116846682564174, "grad_norm": 0.0442630909383297, "learning_rate": 1.1471922195726378e-05, "loss": 0.0267, "step": 82400 }, { "epoch": 2.311965212512274, "grad_norm": 0.16636516153812408, "learning_rate": 1.1467246458128771e-05, "loss": 0.024, "step": 82410 }, { "epoch": 2.3122457567681303, "grad_norm": 0.01183326356112957, "learning_rate": 1.1462570720531164e-05, "loss": 0.0352, "step": 82420 }, { "epoch": 2.3125263010239867, "grad_norm": 0.018447142094373703, "learning_rate": 1.1457894982933557e-05, "loss": 0.0193, "step": 82430 }, { "epoch": 2.3128068452798427, "grad_norm": 0.04090195521712303, "learning_rate": 1.1453219245335952e-05, "loss": 0.0142, "step": 82440 }, { "epoch": 2.313087389535699, "grad_norm": 1.511473298072815, "learning_rate": 1.1448543507738347e-05, "loss": 0.0249, "step": 82450 }, { "epoch": 2.3133679337915556, "grad_norm": 0.04598066955804825, "learning_rate": 1.144386777014074e-05, "loss": 0.0047, "step": 82460 }, { "epoch": 2.313648478047412, "grad_norm": 0.008359997533261776, "learning_rate": 1.1439192032543135e-05, "loss": 0.0117, "step": 82470 }, { "epoch": 2.3139290223032685, "grad_norm": 0.009613803587853909, "learning_rate": 1.1434516294945528e-05, "loss": 0.0078, "step": 82480 }, { "epoch": 2.314209566559125, "grad_norm": 0.13102929294109344, "learning_rate": 1.1429840557347921e-05, "loss": 0.0472, "step": 82490 }, { "epoch": 2.314490110814981, "grad_norm": 0.2711295187473297, "learning_rate": 1.1425164819750316e-05, "loss": 0.0324, "step": 82500 }, { "epoch": 2.3147706550708373, "grad_norm": 0.5906400084495544, "learning_rate": 1.1420489082152711e-05, "loss": 0.0107, "step": 82510 }, { "epoch": 2.315051199326694, "grad_norm": 0.2450297474861145, "learning_rate": 1.1415813344555104e-05, "loss": 0.0102, "step": 82520 }, { "epoch": 2.3153317435825502, "grad_norm": 0.008583576418459415, "learning_rate": 1.1411137606957497e-05, "loss": 0.0144, "step": 82530 }, { "epoch": 2.3156122878384067, "grad_norm": 0.0996742770075798, "learning_rate": 1.1406461869359892e-05, "loss": 0.022, "step": 82540 }, { "epoch": 2.3158928320942627, "grad_norm": 0.6489540338516235, "learning_rate": 1.1401786131762287e-05, "loss": 0.0086, "step": 82550 }, { "epoch": 2.316173376350119, "grad_norm": 0.07041305303573608, "learning_rate": 1.139711039416468e-05, "loss": 0.0067, "step": 82560 }, { "epoch": 2.3164539206059755, "grad_norm": 0.04080390930175781, "learning_rate": 1.1392434656567073e-05, "loss": 0.0219, "step": 82570 }, { "epoch": 2.316734464861832, "grad_norm": 0.311444491147995, "learning_rate": 1.1387758918969468e-05, "loss": 0.0163, "step": 82580 }, { "epoch": 2.3170150091176884, "grad_norm": 0.014447126537561417, "learning_rate": 1.1383083181371861e-05, "loss": 0.0127, "step": 82590 }, { "epoch": 2.317295553373545, "grad_norm": 0.47957223653793335, "learning_rate": 1.1378407443774256e-05, "loss": 0.0286, "step": 82600 }, { "epoch": 2.317576097629401, "grad_norm": 0.03772151470184326, "learning_rate": 1.1373731706176651e-05, "loss": 0.0192, "step": 82610 }, { "epoch": 2.3178566418852573, "grad_norm": 0.2867538630962372, "learning_rate": 1.1369055968579044e-05, "loss": 0.0447, "step": 82620 }, { "epoch": 2.3181371861411137, "grad_norm": 0.05644835904240608, "learning_rate": 1.1364380230981437e-05, "loss": 0.0169, "step": 82630 }, { "epoch": 2.31841773039697, "grad_norm": 0.7604866623878479, "learning_rate": 1.135970449338383e-05, "loss": 0.0113, "step": 82640 }, { "epoch": 2.3186982746528266, "grad_norm": 0.09413447231054306, "learning_rate": 1.1355028755786227e-05, "loss": 0.0421, "step": 82650 }, { "epoch": 2.3189788189086826, "grad_norm": 0.03962623327970505, "learning_rate": 1.135035301818862e-05, "loss": 0.0276, "step": 82660 }, { "epoch": 2.319259363164539, "grad_norm": 0.9626347422599792, "learning_rate": 1.1345677280591013e-05, "loss": 0.0292, "step": 82670 }, { "epoch": 2.3195399074203955, "grad_norm": 0.07391827553510666, "learning_rate": 1.1341001542993408e-05, "loss": 0.0158, "step": 82680 }, { "epoch": 2.319820451676252, "grad_norm": 0.4220893681049347, "learning_rate": 1.1336325805395801e-05, "loss": 0.011, "step": 82690 }, { "epoch": 2.3201009959321084, "grad_norm": 0.07054764032363892, "learning_rate": 1.1331650067798196e-05, "loss": 0.01, "step": 82700 }, { "epoch": 2.320381540187965, "grad_norm": 0.051972705870866776, "learning_rate": 1.132697433020059e-05, "loss": 0.0172, "step": 82710 }, { "epoch": 2.320662084443821, "grad_norm": 0.09045009315013885, "learning_rate": 1.1322298592602984e-05, "loss": 0.025, "step": 82720 }, { "epoch": 2.3209426286996773, "grad_norm": 0.7578979730606079, "learning_rate": 1.1317622855005377e-05, "loss": 0.0285, "step": 82730 }, { "epoch": 2.3212231729555337, "grad_norm": 0.09433993697166443, "learning_rate": 1.131294711740777e-05, "loss": 0.0063, "step": 82740 }, { "epoch": 2.32150371721139, "grad_norm": 0.03362589329481125, "learning_rate": 1.1308271379810165e-05, "loss": 0.034, "step": 82750 }, { "epoch": 2.3217842614672466, "grad_norm": 1.9237596988677979, "learning_rate": 1.130359564221256e-05, "loss": 0.0137, "step": 82760 }, { "epoch": 2.3220648057231026, "grad_norm": 0.19902430474758148, "learning_rate": 1.1298919904614953e-05, "loss": 0.0156, "step": 82770 }, { "epoch": 2.322345349978959, "grad_norm": 0.024665459990501404, "learning_rate": 1.1294244167017348e-05, "loss": 0.0442, "step": 82780 }, { "epoch": 2.3226258942348155, "grad_norm": 0.12805651128292084, "learning_rate": 1.1289568429419741e-05, "loss": 0.0286, "step": 82790 }, { "epoch": 2.322906438490672, "grad_norm": 0.29484596848487854, "learning_rate": 1.1284892691822136e-05, "loss": 0.0078, "step": 82800 }, { "epoch": 2.3231869827465284, "grad_norm": 0.05981813743710518, "learning_rate": 1.128021695422453e-05, "loss": 0.0151, "step": 82810 }, { "epoch": 2.323467527002385, "grad_norm": 0.043616779148578644, "learning_rate": 1.1275541216626924e-05, "loss": 0.0076, "step": 82820 }, { "epoch": 2.3237480712582412, "grad_norm": 0.12232573330402374, "learning_rate": 1.1270865479029317e-05, "loss": 0.0202, "step": 82830 }, { "epoch": 2.3240286155140972, "grad_norm": 6.6029253005981445, "learning_rate": 1.126618974143171e-05, "loss": 0.0245, "step": 82840 }, { "epoch": 2.3243091597699537, "grad_norm": 0.014813835732638836, "learning_rate": 1.1261514003834105e-05, "loss": 0.0086, "step": 82850 }, { "epoch": 2.32458970402581, "grad_norm": 0.17609398066997528, "learning_rate": 1.12568382662365e-05, "loss": 0.0192, "step": 82860 }, { "epoch": 2.3248702482816666, "grad_norm": 0.058189671486616135, "learning_rate": 1.1252162528638893e-05, "loss": 0.0532, "step": 82870 }, { "epoch": 2.3251507925375225, "grad_norm": 0.2556256651878357, "learning_rate": 1.1247486791041286e-05, "loss": 0.0143, "step": 82880 }, { "epoch": 2.325431336793379, "grad_norm": 0.030394893139600754, "learning_rate": 1.1242811053443681e-05, "loss": 0.0347, "step": 82890 }, { "epoch": 2.3257118810492354, "grad_norm": 0.4059065580368042, "learning_rate": 1.1238135315846076e-05, "loss": 0.0229, "step": 82900 }, { "epoch": 2.325992425305092, "grad_norm": 1.7352875471115112, "learning_rate": 1.123345957824847e-05, "loss": 0.0417, "step": 82910 }, { "epoch": 2.3262729695609483, "grad_norm": 0.9000109434127808, "learning_rate": 1.1228783840650864e-05, "loss": 0.0141, "step": 82920 }, { "epoch": 2.3265535138168048, "grad_norm": 0.3676943778991699, "learning_rate": 1.1224108103053257e-05, "loss": 0.0398, "step": 82930 }, { "epoch": 2.326834058072661, "grad_norm": 0.3159681260585785, "learning_rate": 1.121943236545565e-05, "loss": 0.0147, "step": 82940 }, { "epoch": 2.327114602328517, "grad_norm": 0.1187191754579544, "learning_rate": 1.1214756627858045e-05, "loss": 0.0116, "step": 82950 }, { "epoch": 2.3273951465843736, "grad_norm": 0.025891883298754692, "learning_rate": 1.121008089026044e-05, "loss": 0.0228, "step": 82960 }, { "epoch": 2.32767569084023, "grad_norm": 0.03476980701088905, "learning_rate": 1.1205405152662833e-05, "loss": 0.0087, "step": 82970 }, { "epoch": 2.3279562350960865, "grad_norm": 0.02956514246761799, "learning_rate": 1.1200729415065226e-05, "loss": 0.047, "step": 82980 }, { "epoch": 2.328236779351943, "grad_norm": 4.578246116638184, "learning_rate": 1.1196053677467621e-05, "loss": 0.0202, "step": 82990 }, { "epoch": 2.328517323607799, "grad_norm": 0.07222352921962738, "learning_rate": 1.1191377939870014e-05, "loss": 0.0105, "step": 83000 }, { "epoch": 2.3287978678636554, "grad_norm": 0.045625239610672, "learning_rate": 1.118670220227241e-05, "loss": 0.0175, "step": 83010 }, { "epoch": 2.329078412119512, "grad_norm": 0.08928296715021133, "learning_rate": 1.1182026464674802e-05, "loss": 0.0371, "step": 83020 }, { "epoch": 2.3293589563753683, "grad_norm": 0.02177230268716812, "learning_rate": 1.1177350727077197e-05, "loss": 0.0264, "step": 83030 }, { "epoch": 2.3296395006312247, "grad_norm": 0.23806409537792206, "learning_rate": 1.117267498947959e-05, "loss": 0.0159, "step": 83040 }, { "epoch": 2.329920044887081, "grad_norm": 0.48890966176986694, "learning_rate": 1.1167999251881985e-05, "loss": 0.0121, "step": 83050 }, { "epoch": 2.330200589142937, "grad_norm": 0.061765991151332855, "learning_rate": 1.116332351428438e-05, "loss": 0.029, "step": 83060 }, { "epoch": 2.3304811333987936, "grad_norm": 0.062021806836128235, "learning_rate": 1.1158647776686773e-05, "loss": 0.0483, "step": 83070 }, { "epoch": 2.33076167765465, "grad_norm": 0.45482686161994934, "learning_rate": 1.1153972039089166e-05, "loss": 0.015, "step": 83080 }, { "epoch": 2.3310422219105065, "grad_norm": 0.17996706068515778, "learning_rate": 1.114929630149156e-05, "loss": 0.0335, "step": 83090 }, { "epoch": 2.331322766166363, "grad_norm": 0.4276353120803833, "learning_rate": 1.1144620563893954e-05, "loss": 0.0146, "step": 83100 }, { "epoch": 2.331603310422219, "grad_norm": 0.12432095408439636, "learning_rate": 1.113994482629635e-05, "loss": 0.0407, "step": 83110 }, { "epoch": 2.3318838546780754, "grad_norm": 0.2608640491962433, "learning_rate": 1.1135269088698742e-05, "loss": 0.0143, "step": 83120 }, { "epoch": 2.332164398933932, "grad_norm": 0.0443442165851593, "learning_rate": 1.1130593351101137e-05, "loss": 0.006, "step": 83130 }, { "epoch": 2.3324449431897882, "grad_norm": 0.8262009620666504, "learning_rate": 1.112591761350353e-05, "loss": 0.0244, "step": 83140 }, { "epoch": 2.3327254874456447, "grad_norm": 0.5497874617576599, "learning_rate": 1.1121241875905924e-05, "loss": 0.0415, "step": 83150 }, { "epoch": 2.333006031701501, "grad_norm": 0.049232397228479385, "learning_rate": 1.1116566138308318e-05, "loss": 0.018, "step": 83160 }, { "epoch": 2.333286575957357, "grad_norm": 0.2847977876663208, "learning_rate": 1.1111890400710713e-05, "loss": 0.0206, "step": 83170 }, { "epoch": 2.3335671202132136, "grad_norm": 0.12418463826179504, "learning_rate": 1.1107214663113107e-05, "loss": 0.0102, "step": 83180 }, { "epoch": 2.33384766446907, "grad_norm": 12.532691955566406, "learning_rate": 1.11025389255155e-05, "loss": 0.0063, "step": 83190 }, { "epoch": 2.3341282087249264, "grad_norm": 0.09260722249746323, "learning_rate": 1.1097863187917895e-05, "loss": 0.0575, "step": 83200 }, { "epoch": 2.334408752980783, "grad_norm": 0.7179601192474365, "learning_rate": 1.109318745032029e-05, "loss": 0.043, "step": 83210 }, { "epoch": 2.334689297236639, "grad_norm": 0.4782039523124695, "learning_rate": 1.1088511712722683e-05, "loss": 0.033, "step": 83220 }, { "epoch": 2.3349698414924953, "grad_norm": 0.14157399535179138, "learning_rate": 1.1083835975125076e-05, "loss": 0.0278, "step": 83230 }, { "epoch": 2.3352503857483518, "grad_norm": 0.4478939473628998, "learning_rate": 1.107916023752747e-05, "loss": 0.0176, "step": 83240 }, { "epoch": 2.335530930004208, "grad_norm": 0.07295316457748413, "learning_rate": 1.1074484499929864e-05, "loss": 0.0118, "step": 83250 }, { "epoch": 2.3358114742600646, "grad_norm": 0.9164329171180725, "learning_rate": 1.1069808762332259e-05, "loss": 0.0401, "step": 83260 }, { "epoch": 2.336092018515921, "grad_norm": 0.015648365020751953, "learning_rate": 1.1065133024734653e-05, "loss": 0.0075, "step": 83270 }, { "epoch": 2.336372562771777, "grad_norm": 0.13395731151103973, "learning_rate": 1.1060457287137047e-05, "loss": 0.0203, "step": 83280 }, { "epoch": 2.3366531070276335, "grad_norm": 0.03084995597600937, "learning_rate": 1.105578154953944e-05, "loss": 0.0205, "step": 83290 }, { "epoch": 2.33693365128349, "grad_norm": 0.4753853678703308, "learning_rate": 1.1051105811941835e-05, "loss": 0.024, "step": 83300 }, { "epoch": 2.3372141955393464, "grad_norm": 0.11143915355205536, "learning_rate": 1.104643007434423e-05, "loss": 0.0556, "step": 83310 }, { "epoch": 2.337494739795203, "grad_norm": 0.2631615400314331, "learning_rate": 1.1041754336746623e-05, "loss": 0.0133, "step": 83320 }, { "epoch": 2.337775284051059, "grad_norm": 0.10730031132698059, "learning_rate": 1.1037078599149016e-05, "loss": 0.0247, "step": 83330 }, { "epoch": 2.3380558283069153, "grad_norm": 0.455714613199234, "learning_rate": 1.103240286155141e-05, "loss": 0.0409, "step": 83340 }, { "epoch": 2.3383363725627717, "grad_norm": 0.027476582676172256, "learning_rate": 1.1027727123953804e-05, "loss": 0.0118, "step": 83350 }, { "epoch": 2.338616916818628, "grad_norm": 0.045806385576725006, "learning_rate": 1.1023051386356199e-05, "loss": 0.0133, "step": 83360 }, { "epoch": 2.3388974610744846, "grad_norm": 0.02804051712155342, "learning_rate": 1.1018375648758592e-05, "loss": 0.0017, "step": 83370 }, { "epoch": 2.339178005330341, "grad_norm": 0.762017548084259, "learning_rate": 1.1013699911160987e-05, "loss": 0.0341, "step": 83380 }, { "epoch": 2.339458549586197, "grad_norm": 0.017093902453780174, "learning_rate": 1.100902417356338e-05, "loss": 0.0151, "step": 83390 }, { "epoch": 2.3397390938420535, "grad_norm": 1.031122088432312, "learning_rate": 1.1004348435965773e-05, "loss": 0.0124, "step": 83400 }, { "epoch": 2.34001963809791, "grad_norm": 0.15132932364940643, "learning_rate": 1.099967269836817e-05, "loss": 0.027, "step": 83410 }, { "epoch": 2.3403001823537664, "grad_norm": 0.023692531511187553, "learning_rate": 1.0994996960770563e-05, "loss": 0.025, "step": 83420 }, { "epoch": 2.340580726609623, "grad_norm": 0.1906384378671646, "learning_rate": 1.0990321223172956e-05, "loss": 0.0254, "step": 83430 }, { "epoch": 2.340861270865479, "grad_norm": 0.34445926547050476, "learning_rate": 1.098564548557535e-05, "loss": 0.0087, "step": 83440 }, { "epoch": 2.3411418151213352, "grad_norm": 0.04624612629413605, "learning_rate": 1.0980969747977744e-05, "loss": 0.015, "step": 83450 }, { "epoch": 2.3414223593771917, "grad_norm": 4.834519863128662, "learning_rate": 1.0976294010380139e-05, "loss": 0.0425, "step": 83460 }, { "epoch": 2.341702903633048, "grad_norm": 0.006304553709924221, "learning_rate": 1.0971618272782532e-05, "loss": 0.0123, "step": 83470 }, { "epoch": 2.3419834478889046, "grad_norm": 0.033263761550188065, "learning_rate": 1.0966942535184927e-05, "loss": 0.0288, "step": 83480 }, { "epoch": 2.342263992144761, "grad_norm": 0.02841164357960224, "learning_rate": 1.096226679758732e-05, "loss": 0.0142, "step": 83490 }, { "epoch": 2.3425445364006174, "grad_norm": 0.02384631149470806, "learning_rate": 1.0957591059989713e-05, "loss": 0.0187, "step": 83500 }, { "epoch": 2.3428250806564734, "grad_norm": 0.34377115964889526, "learning_rate": 1.0952915322392108e-05, "loss": 0.0552, "step": 83510 }, { "epoch": 2.34310562491233, "grad_norm": 0.12289454787969589, "learning_rate": 1.0948239584794503e-05, "loss": 0.036, "step": 83520 }, { "epoch": 2.3433861691681863, "grad_norm": 0.4758418798446655, "learning_rate": 1.0943563847196896e-05, "loss": 0.0087, "step": 83530 }, { "epoch": 2.3436667134240428, "grad_norm": 0.047454770654439926, "learning_rate": 1.0938888109599289e-05, "loss": 0.0193, "step": 83540 }, { "epoch": 2.3439472576798988, "grad_norm": 0.15768443048000336, "learning_rate": 1.0934212372001684e-05, "loss": 0.0704, "step": 83550 }, { "epoch": 2.344227801935755, "grad_norm": 0.5171309113502502, "learning_rate": 1.0929536634404079e-05, "loss": 0.0148, "step": 83560 }, { "epoch": 2.3445083461916116, "grad_norm": 0.05420640856027603, "learning_rate": 1.0924860896806472e-05, "loss": 0.0182, "step": 83570 }, { "epoch": 2.344788890447468, "grad_norm": 0.12337226420640945, "learning_rate": 1.0920185159208867e-05, "loss": 0.0146, "step": 83580 }, { "epoch": 2.3450694347033245, "grad_norm": 1.0481727123260498, "learning_rate": 1.091550942161126e-05, "loss": 0.0114, "step": 83590 }, { "epoch": 2.345349978959181, "grad_norm": 0.14920319616794586, "learning_rate": 1.0910833684013653e-05, "loss": 0.0103, "step": 83600 }, { "epoch": 2.3456305232150374, "grad_norm": 0.04183092340826988, "learning_rate": 1.0906157946416048e-05, "loss": 0.0058, "step": 83610 }, { "epoch": 2.3459110674708934, "grad_norm": 1.735148310661316, "learning_rate": 1.0901482208818443e-05, "loss": 0.0326, "step": 83620 }, { "epoch": 2.34619161172675, "grad_norm": 0.05585320293903351, "learning_rate": 1.0896806471220836e-05, "loss": 0.0139, "step": 83630 }, { "epoch": 2.3464721559826063, "grad_norm": 0.018948398530483246, "learning_rate": 1.0892130733623229e-05, "loss": 0.0149, "step": 83640 }, { "epoch": 2.3467527002384627, "grad_norm": 0.02208118513226509, "learning_rate": 1.0887454996025624e-05, "loss": 0.0155, "step": 83650 }, { "epoch": 2.347033244494319, "grad_norm": 0.028438393026590347, "learning_rate": 1.0882779258428017e-05, "loss": 0.0224, "step": 83660 }, { "epoch": 2.347313788750175, "grad_norm": 0.23480625450611115, "learning_rate": 1.0878103520830412e-05, "loss": 0.0155, "step": 83670 }, { "epoch": 2.3475943330060316, "grad_norm": 0.03346562758088112, "learning_rate": 1.0873427783232805e-05, "loss": 0.0055, "step": 83680 }, { "epoch": 2.347874877261888, "grad_norm": 0.49233660101890564, "learning_rate": 1.08687520456352e-05, "loss": 0.0217, "step": 83690 }, { "epoch": 2.3481554215177445, "grad_norm": 1.5261151790618896, "learning_rate": 1.0864076308037593e-05, "loss": 0.0335, "step": 83700 }, { "epoch": 2.348435965773601, "grad_norm": 1.3921960592269897, "learning_rate": 1.0859400570439988e-05, "loss": 0.0367, "step": 83710 }, { "epoch": 2.3487165100294574, "grad_norm": 0.5599359273910522, "learning_rate": 1.0854724832842383e-05, "loss": 0.0243, "step": 83720 }, { "epoch": 2.3489970542853134, "grad_norm": 0.2580375075340271, "learning_rate": 1.0850049095244776e-05, "loss": 0.0059, "step": 83730 }, { "epoch": 2.34927759854117, "grad_norm": 0.003319192910566926, "learning_rate": 1.0845373357647169e-05, "loss": 0.0259, "step": 83740 }, { "epoch": 2.3495581427970262, "grad_norm": 0.1932658702135086, "learning_rate": 1.0840697620049562e-05, "loss": 0.0155, "step": 83750 }, { "epoch": 2.3498386870528827, "grad_norm": 0.1605282872915268, "learning_rate": 1.0836021882451957e-05, "loss": 0.0274, "step": 83760 }, { "epoch": 2.350119231308739, "grad_norm": 0.13848066329956055, "learning_rate": 1.0831346144854352e-05, "loss": 0.0054, "step": 83770 }, { "epoch": 2.350399775564595, "grad_norm": 0.02202356792986393, "learning_rate": 1.0826670407256745e-05, "loss": 0.0175, "step": 83780 }, { "epoch": 2.3506803198204516, "grad_norm": 0.02808110974729061, "learning_rate": 1.082199466965914e-05, "loss": 0.0108, "step": 83790 }, { "epoch": 2.350960864076308, "grad_norm": 0.03212682157754898, "learning_rate": 1.0817318932061533e-05, "loss": 0.0031, "step": 83800 }, { "epoch": 2.3512414083321644, "grad_norm": 0.04500434920191765, "learning_rate": 1.0812643194463928e-05, "loss": 0.0297, "step": 83810 }, { "epoch": 2.351521952588021, "grad_norm": 2.9547102451324463, "learning_rate": 1.0807967456866321e-05, "loss": 0.0116, "step": 83820 }, { "epoch": 2.3518024968438773, "grad_norm": 0.045129820704460144, "learning_rate": 1.0803291719268716e-05, "loss": 0.0209, "step": 83830 }, { "epoch": 2.3520830410997333, "grad_norm": 1.1804131269454956, "learning_rate": 1.0798615981671109e-05, "loss": 0.0219, "step": 83840 }, { "epoch": 2.3523635853555898, "grad_norm": 1.2024644613265991, "learning_rate": 1.0793940244073502e-05, "loss": 0.0284, "step": 83850 }, { "epoch": 2.352644129611446, "grad_norm": 0.08290518075227737, "learning_rate": 1.0789264506475897e-05, "loss": 0.0316, "step": 83860 }, { "epoch": 2.3529246738673026, "grad_norm": 0.3157699704170227, "learning_rate": 1.0784588768878292e-05, "loss": 0.0162, "step": 83870 }, { "epoch": 2.353205218123159, "grad_norm": 0.18786758184432983, "learning_rate": 1.0779913031280685e-05, "loss": 0.0182, "step": 83880 }, { "epoch": 2.353485762379015, "grad_norm": 0.3069426715373993, "learning_rate": 1.0775237293683078e-05, "loss": 0.0409, "step": 83890 }, { "epoch": 2.3537663066348715, "grad_norm": 0.43383586406707764, "learning_rate": 1.0770561556085473e-05, "loss": 0.0144, "step": 83900 }, { "epoch": 2.354046850890728, "grad_norm": 0.2712002992630005, "learning_rate": 1.0765885818487866e-05, "loss": 0.0062, "step": 83910 }, { "epoch": 2.3543273951465844, "grad_norm": 0.2427663952112198, "learning_rate": 1.0761210080890261e-05, "loss": 0.0163, "step": 83920 }, { "epoch": 2.354607939402441, "grad_norm": 0.23220032453536987, "learning_rate": 1.0756534343292656e-05, "loss": 0.0211, "step": 83930 }, { "epoch": 2.3548884836582973, "grad_norm": 0.6950727105140686, "learning_rate": 1.0751858605695049e-05, "loss": 0.0481, "step": 83940 }, { "epoch": 2.3551690279141533, "grad_norm": 1.0526394844055176, "learning_rate": 1.0747182868097442e-05, "loss": 0.0253, "step": 83950 }, { "epoch": 2.3554495721700097, "grad_norm": 0.04710886627435684, "learning_rate": 1.0742507130499837e-05, "loss": 0.0285, "step": 83960 }, { "epoch": 2.355730116425866, "grad_norm": 0.0545293428003788, "learning_rate": 1.0737831392902232e-05, "loss": 0.0096, "step": 83970 }, { "epoch": 2.3560106606817226, "grad_norm": 0.15856321156024933, "learning_rate": 1.0733155655304625e-05, "loss": 0.0124, "step": 83980 }, { "epoch": 2.356291204937579, "grad_norm": 0.18538245558738708, "learning_rate": 1.0728479917707018e-05, "loss": 0.029, "step": 83990 }, { "epoch": 2.356571749193435, "grad_norm": 0.016272040084004402, "learning_rate": 1.0723804180109413e-05, "loss": 0.0388, "step": 84000 }, { "epoch": 2.3568522934492915, "grad_norm": 0.11173515766859055, "learning_rate": 1.0719128442511806e-05, "loss": 0.0293, "step": 84010 }, { "epoch": 2.357132837705148, "grad_norm": 0.2054961621761322, "learning_rate": 1.0714452704914201e-05, "loss": 0.049, "step": 84020 }, { "epoch": 2.3574133819610044, "grad_norm": 0.08391133695840836, "learning_rate": 1.0709776967316594e-05, "loss": 0.0233, "step": 84030 }, { "epoch": 2.357693926216861, "grad_norm": 1.3806267976760864, "learning_rate": 1.0705101229718989e-05, "loss": 0.0374, "step": 84040 }, { "epoch": 2.3579744704727172, "grad_norm": 1.2990367412567139, "learning_rate": 1.0700425492121382e-05, "loss": 0.0315, "step": 84050 }, { "epoch": 2.3582550147285732, "grad_norm": 0.15574109554290771, "learning_rate": 1.0695749754523775e-05, "loss": 0.0143, "step": 84060 }, { "epoch": 2.3585355589844297, "grad_norm": 0.7044504284858704, "learning_rate": 1.0691074016926172e-05, "loss": 0.017, "step": 84070 }, { "epoch": 2.358816103240286, "grad_norm": 0.014255614019930363, "learning_rate": 1.0686398279328565e-05, "loss": 0.0094, "step": 84080 }, { "epoch": 2.3590966474961426, "grad_norm": 0.023380529135465622, "learning_rate": 1.0681722541730958e-05, "loss": 0.0183, "step": 84090 }, { "epoch": 2.359377191751999, "grad_norm": 0.036680832505226135, "learning_rate": 1.0677046804133353e-05, "loss": 0.0027, "step": 84100 }, { "epoch": 2.359657736007855, "grad_norm": 0.16619347035884857, "learning_rate": 1.0672371066535746e-05, "loss": 0.0077, "step": 84110 }, { "epoch": 2.3599382802637114, "grad_norm": 3.0997207164764404, "learning_rate": 1.0667695328938141e-05, "loss": 0.0296, "step": 84120 }, { "epoch": 2.360218824519568, "grad_norm": 0.01342201977968216, "learning_rate": 1.0663019591340534e-05, "loss": 0.0174, "step": 84130 }, { "epoch": 2.3604993687754243, "grad_norm": 0.23465096950531006, "learning_rate": 1.0658343853742929e-05, "loss": 0.0272, "step": 84140 }, { "epoch": 2.3607799130312808, "grad_norm": 0.07603687793016434, "learning_rate": 1.0653668116145322e-05, "loss": 0.0085, "step": 84150 }, { "epoch": 2.361060457287137, "grad_norm": 0.39992159605026245, "learning_rate": 1.0648992378547715e-05, "loss": 0.0307, "step": 84160 }, { "epoch": 2.3613410015429936, "grad_norm": 0.1449318528175354, "learning_rate": 1.0644316640950112e-05, "loss": 0.0082, "step": 84170 }, { "epoch": 2.3616215457988496, "grad_norm": 0.030112622305750847, "learning_rate": 1.0639640903352505e-05, "loss": 0.0043, "step": 84180 }, { "epoch": 2.361902090054706, "grad_norm": 0.3997752368450165, "learning_rate": 1.0634965165754898e-05, "loss": 0.0148, "step": 84190 }, { "epoch": 2.3621826343105625, "grad_norm": 0.014317753724753857, "learning_rate": 1.0630289428157291e-05, "loss": 0.0165, "step": 84200 }, { "epoch": 2.362463178566419, "grad_norm": 1.298511266708374, "learning_rate": 1.0625613690559686e-05, "loss": 0.0059, "step": 84210 }, { "epoch": 2.3627437228222754, "grad_norm": 0.5130361914634705, "learning_rate": 1.0620937952962081e-05, "loss": 0.013, "step": 84220 }, { "epoch": 2.3630242670781314, "grad_norm": 0.7268282771110535, "learning_rate": 1.0616262215364474e-05, "loss": 0.0185, "step": 84230 }, { "epoch": 2.363304811333988, "grad_norm": 1.2328864336013794, "learning_rate": 1.0611586477766869e-05, "loss": 0.0079, "step": 84240 }, { "epoch": 2.3635853555898443, "grad_norm": 0.0659802034497261, "learning_rate": 1.0606910740169262e-05, "loss": 0.0125, "step": 84250 }, { "epoch": 2.3638658998457007, "grad_norm": 0.00755242770537734, "learning_rate": 1.0602235002571655e-05, "loss": 0.0066, "step": 84260 }, { "epoch": 2.364146444101557, "grad_norm": 0.02483517676591873, "learning_rate": 1.059755926497405e-05, "loss": 0.0329, "step": 84270 }, { "epoch": 2.3644269883574136, "grad_norm": 0.5742102861404419, "learning_rate": 1.0592883527376445e-05, "loss": 0.025, "step": 84280 }, { "epoch": 2.3647075326132696, "grad_norm": 0.05212009325623512, "learning_rate": 1.0588207789778838e-05, "loss": 0.0359, "step": 84290 }, { "epoch": 2.364988076869126, "grad_norm": 0.010916545987129211, "learning_rate": 1.0583532052181231e-05, "loss": 0.039, "step": 84300 }, { "epoch": 2.3652686211249825, "grad_norm": 0.029814518988132477, "learning_rate": 1.0578856314583626e-05, "loss": 0.0409, "step": 84310 }, { "epoch": 2.365549165380839, "grad_norm": 0.46127450466156006, "learning_rate": 1.0574180576986021e-05, "loss": 0.0208, "step": 84320 }, { "epoch": 2.3658297096366954, "grad_norm": 1.6326442956924438, "learning_rate": 1.0569504839388414e-05, "loss": 0.016, "step": 84330 }, { "epoch": 2.3661102538925514, "grad_norm": 0.09032358229160309, "learning_rate": 1.0564829101790807e-05, "loss": 0.0159, "step": 84340 }, { "epoch": 2.366390798148408, "grad_norm": 0.029876040294766426, "learning_rate": 1.0560153364193202e-05, "loss": 0.0165, "step": 84350 }, { "epoch": 2.3666713424042642, "grad_norm": 0.7394179701805115, "learning_rate": 1.0555477626595595e-05, "loss": 0.0198, "step": 84360 }, { "epoch": 2.3669518866601207, "grad_norm": 0.02850668877363205, "learning_rate": 1.055080188899799e-05, "loss": 0.0112, "step": 84370 }, { "epoch": 2.367232430915977, "grad_norm": 1.7475658655166626, "learning_rate": 1.0546126151400385e-05, "loss": 0.0504, "step": 84380 }, { "epoch": 2.3675129751718336, "grad_norm": 0.12924666702747345, "learning_rate": 1.0541450413802778e-05, "loss": 0.0065, "step": 84390 }, { "epoch": 2.3677935194276896, "grad_norm": 0.06548202782869339, "learning_rate": 1.0536774676205171e-05, "loss": 0.0095, "step": 84400 }, { "epoch": 2.368074063683546, "grad_norm": 0.1419127732515335, "learning_rate": 1.0532098938607565e-05, "loss": 0.0153, "step": 84410 }, { "epoch": 2.3683546079394024, "grad_norm": 1.1603715419769287, "learning_rate": 1.052742320100996e-05, "loss": 0.0274, "step": 84420 }, { "epoch": 2.368635152195259, "grad_norm": 0.229782834649086, "learning_rate": 1.0522747463412354e-05, "loss": 0.0163, "step": 84430 }, { "epoch": 2.3689156964511153, "grad_norm": 0.2519042193889618, "learning_rate": 1.0518071725814747e-05, "loss": 0.0185, "step": 84440 }, { "epoch": 2.3691962407069713, "grad_norm": 0.9082971215248108, "learning_rate": 1.0513395988217142e-05, "loss": 0.0211, "step": 84450 }, { "epoch": 2.3694767849628278, "grad_norm": 0.3104676306247711, "learning_rate": 1.0508720250619535e-05, "loss": 0.055, "step": 84460 }, { "epoch": 2.369757329218684, "grad_norm": 1.9366803169250488, "learning_rate": 1.050404451302193e-05, "loss": 0.0179, "step": 84470 }, { "epoch": 2.3700378734745406, "grad_norm": 0.20909322798252106, "learning_rate": 1.0499368775424323e-05, "loss": 0.0287, "step": 84480 }, { "epoch": 2.370318417730397, "grad_norm": 0.8959699869155884, "learning_rate": 1.0494693037826718e-05, "loss": 0.0225, "step": 84490 }, { "epoch": 2.3705989619862535, "grad_norm": 0.023179175332188606, "learning_rate": 1.0490017300229111e-05, "loss": 0.0108, "step": 84500 }, { "epoch": 2.3708795062421095, "grad_norm": 0.08570155501365662, "learning_rate": 1.0485341562631505e-05, "loss": 0.0033, "step": 84510 }, { "epoch": 2.371160050497966, "grad_norm": 0.24332736432552338, "learning_rate": 1.04806658250339e-05, "loss": 0.0091, "step": 84520 }, { "epoch": 2.3714405947538224, "grad_norm": 0.30631089210510254, "learning_rate": 1.0475990087436294e-05, "loss": 0.0402, "step": 84530 }, { "epoch": 2.371721139009679, "grad_norm": 0.7204312086105347, "learning_rate": 1.0471314349838687e-05, "loss": 0.0271, "step": 84540 }, { "epoch": 2.3720016832655353, "grad_norm": 0.05886990949511528, "learning_rate": 1.046663861224108e-05, "loss": 0.0185, "step": 84550 }, { "epoch": 2.3722822275213913, "grad_norm": 0.02849404513835907, "learning_rate": 1.0461962874643475e-05, "loss": 0.0235, "step": 84560 }, { "epoch": 2.3725627717772477, "grad_norm": 0.7743640542030334, "learning_rate": 1.045728713704587e-05, "loss": 0.0316, "step": 84570 }, { "epoch": 2.372843316033104, "grad_norm": 0.41091805696487427, "learning_rate": 1.0452611399448263e-05, "loss": 0.0079, "step": 84580 }, { "epoch": 2.3731238602889606, "grad_norm": 0.02970374934375286, "learning_rate": 1.0447935661850658e-05, "loss": 0.0051, "step": 84590 }, { "epoch": 2.373404404544817, "grad_norm": 1.526391863822937, "learning_rate": 1.0443259924253051e-05, "loss": 0.0212, "step": 84600 }, { "epoch": 2.3736849488006735, "grad_norm": 0.48794302344322205, "learning_rate": 1.0438584186655445e-05, "loss": 0.0171, "step": 84610 }, { "epoch": 2.3739654930565295, "grad_norm": 0.30915218591690063, "learning_rate": 1.043390844905784e-05, "loss": 0.0046, "step": 84620 }, { "epoch": 2.374246037312386, "grad_norm": 0.01766522414982319, "learning_rate": 1.0429232711460234e-05, "loss": 0.0279, "step": 84630 }, { "epoch": 2.3745265815682424, "grad_norm": 4.444282531738281, "learning_rate": 1.0424556973862627e-05, "loss": 0.0198, "step": 84640 }, { "epoch": 2.374807125824099, "grad_norm": 0.010219341143965721, "learning_rate": 1.041988123626502e-05, "loss": 0.0279, "step": 84650 }, { "epoch": 2.3750876700799552, "grad_norm": 0.008972954005002975, "learning_rate": 1.0415205498667415e-05, "loss": 0.0106, "step": 84660 }, { "epoch": 2.3753682143358112, "grad_norm": 0.07343357801437378, "learning_rate": 1.0410529761069809e-05, "loss": 0.0053, "step": 84670 }, { "epoch": 2.3756487585916677, "grad_norm": 1.1072713136672974, "learning_rate": 1.0405854023472203e-05, "loss": 0.0158, "step": 84680 }, { "epoch": 2.375929302847524, "grad_norm": 1.1527020931243896, "learning_rate": 1.0401178285874597e-05, "loss": 0.012, "step": 84690 }, { "epoch": 2.3762098471033806, "grad_norm": 0.5995298027992249, "learning_rate": 1.0396502548276991e-05, "loss": 0.0275, "step": 84700 }, { "epoch": 2.376490391359237, "grad_norm": 0.06557951122522354, "learning_rate": 1.0391826810679385e-05, "loss": 0.0199, "step": 84710 }, { "epoch": 2.3767709356150934, "grad_norm": 0.11027813702821732, "learning_rate": 1.038715107308178e-05, "loss": 0.0098, "step": 84720 }, { "epoch": 2.37705147987095, "grad_norm": 0.024973884224891663, "learning_rate": 1.0382475335484174e-05, "loss": 0.0061, "step": 84730 }, { "epoch": 2.377332024126806, "grad_norm": 0.09040547907352448, "learning_rate": 1.0377799597886567e-05, "loss": 0.0185, "step": 84740 }, { "epoch": 2.3776125683826623, "grad_norm": 0.27161160111427307, "learning_rate": 1.037312386028896e-05, "loss": 0.0252, "step": 84750 }, { "epoch": 2.3778931126385188, "grad_norm": 0.021161114796996117, "learning_rate": 1.0368448122691355e-05, "loss": 0.0407, "step": 84760 }, { "epoch": 2.378173656894375, "grad_norm": 0.013689755462110043, "learning_rate": 1.0363772385093749e-05, "loss": 0.0094, "step": 84770 }, { "epoch": 2.378454201150231, "grad_norm": 0.0828651562333107, "learning_rate": 1.0359096647496143e-05, "loss": 0.0284, "step": 84780 }, { "epoch": 2.3787347454060876, "grad_norm": 0.5756077766418457, "learning_rate": 1.0354420909898537e-05, "loss": 0.0269, "step": 84790 }, { "epoch": 2.379015289661944, "grad_norm": 0.15637093782424927, "learning_rate": 1.0349745172300931e-05, "loss": 0.0253, "step": 84800 }, { "epoch": 2.3792958339178005, "grad_norm": 0.05449352040886879, "learning_rate": 1.0345069434703325e-05, "loss": 0.0372, "step": 84810 }, { "epoch": 2.379576378173657, "grad_norm": 0.6175902485847473, "learning_rate": 1.0340393697105718e-05, "loss": 0.0359, "step": 84820 }, { "epoch": 2.3798569224295134, "grad_norm": 0.046015746891498566, "learning_rate": 1.0335717959508114e-05, "loss": 0.0315, "step": 84830 }, { "epoch": 2.38013746668537, "grad_norm": 0.06202748045325279, "learning_rate": 1.0331042221910507e-05, "loss": 0.0081, "step": 84840 }, { "epoch": 2.380418010941226, "grad_norm": 0.36612468957901, "learning_rate": 1.03263664843129e-05, "loss": 0.0194, "step": 84850 }, { "epoch": 2.3806985551970823, "grad_norm": 0.5794095396995544, "learning_rate": 1.0321690746715294e-05, "loss": 0.0167, "step": 84860 }, { "epoch": 2.3809790994529387, "grad_norm": 0.020817426964640617, "learning_rate": 1.0317015009117689e-05, "loss": 0.0313, "step": 84870 }, { "epoch": 2.381259643708795, "grad_norm": 0.02638367936015129, "learning_rate": 1.0312339271520084e-05, "loss": 0.012, "step": 84880 }, { "epoch": 2.3815401879646516, "grad_norm": 0.01840001344680786, "learning_rate": 1.0307663533922477e-05, "loss": 0.022, "step": 84890 }, { "epoch": 2.3818207322205076, "grad_norm": 0.05365162342786789, "learning_rate": 1.0302987796324872e-05, "loss": 0.0179, "step": 84900 }, { "epoch": 2.382101276476364, "grad_norm": 0.7021721005439758, "learning_rate": 1.0298312058727265e-05, "loss": 0.0328, "step": 84910 }, { "epoch": 2.3823818207322205, "grad_norm": 0.026534078642725945, "learning_rate": 1.0293636321129658e-05, "loss": 0.0091, "step": 84920 }, { "epoch": 2.382662364988077, "grad_norm": 0.018366606906056404, "learning_rate": 1.0288960583532053e-05, "loss": 0.0146, "step": 84930 }, { "epoch": 2.3829429092439334, "grad_norm": 0.03755393624305725, "learning_rate": 1.0284284845934448e-05, "loss": 0.0078, "step": 84940 }, { "epoch": 2.38322345349979, "grad_norm": 0.026884671300649643, "learning_rate": 1.027960910833684e-05, "loss": 0.0383, "step": 84950 }, { "epoch": 2.383503997755646, "grad_norm": 0.024715906009078026, "learning_rate": 1.0274933370739234e-05, "loss": 0.009, "step": 84960 }, { "epoch": 2.3837845420115022, "grad_norm": 0.8826810121536255, "learning_rate": 1.0270257633141629e-05, "loss": 0.0217, "step": 84970 }, { "epoch": 2.3840650862673587, "grad_norm": 0.3793911039829254, "learning_rate": 1.0265581895544024e-05, "loss": 0.0239, "step": 84980 }, { "epoch": 2.384345630523215, "grad_norm": 1.5575666427612305, "learning_rate": 1.0260906157946417e-05, "loss": 0.0367, "step": 84990 }, { "epoch": 2.3846261747790716, "grad_norm": 0.028809931129217148, "learning_rate": 1.025623042034881e-05, "loss": 0.0115, "step": 85000 }, { "epoch": 2.3849067190349276, "grad_norm": 0.08020825684070587, "learning_rate": 1.0251554682751205e-05, "loss": 0.0345, "step": 85010 }, { "epoch": 2.385187263290784, "grad_norm": 0.1016102135181427, "learning_rate": 1.0246878945153598e-05, "loss": 0.0168, "step": 85020 }, { "epoch": 2.3854678075466405, "grad_norm": 0.07986491173505783, "learning_rate": 1.0242203207555993e-05, "loss": 0.0313, "step": 85030 }, { "epoch": 2.385748351802497, "grad_norm": 0.053672511130571365, "learning_rate": 1.0237527469958388e-05, "loss": 0.0149, "step": 85040 }, { "epoch": 2.3860288960583533, "grad_norm": 0.28088322281837463, "learning_rate": 1.023285173236078e-05, "loss": 0.0324, "step": 85050 }, { "epoch": 2.3863094403142098, "grad_norm": 0.05046026036143303, "learning_rate": 1.0228175994763174e-05, "loss": 0.0309, "step": 85060 }, { "epoch": 2.3865899845700658, "grad_norm": 0.37766221165657043, "learning_rate": 1.0223500257165567e-05, "loss": 0.0136, "step": 85070 }, { "epoch": 2.386870528825922, "grad_norm": 0.6161224842071533, "learning_rate": 1.0218824519567964e-05, "loss": 0.0067, "step": 85080 }, { "epoch": 2.3871510730817787, "grad_norm": 0.05260002985596657, "learning_rate": 1.0214148781970357e-05, "loss": 0.0033, "step": 85090 }, { "epoch": 2.387431617337635, "grad_norm": 0.022790033370256424, "learning_rate": 1.020947304437275e-05, "loss": 0.0041, "step": 85100 }, { "epoch": 2.3877121615934915, "grad_norm": 0.22785843908786774, "learning_rate": 1.0204797306775145e-05, "loss": 0.0309, "step": 85110 }, { "epoch": 2.3879927058493475, "grad_norm": 0.050995033234357834, "learning_rate": 1.0200121569177538e-05, "loss": 0.0124, "step": 85120 }, { "epoch": 2.388273250105204, "grad_norm": 1.9621437788009644, "learning_rate": 1.0195445831579933e-05, "loss": 0.0173, "step": 85130 }, { "epoch": 2.3885537943610604, "grad_norm": 0.8134580254554749, "learning_rate": 1.0190770093982326e-05, "loss": 0.0085, "step": 85140 }, { "epoch": 2.388834338616917, "grad_norm": 0.24531783163547516, "learning_rate": 1.018609435638472e-05, "loss": 0.009, "step": 85150 }, { "epoch": 2.3891148828727733, "grad_norm": 0.35986563563346863, "learning_rate": 1.0181418618787114e-05, "loss": 0.0063, "step": 85160 }, { "epoch": 2.3893954271286297, "grad_norm": 0.17623627185821533, "learning_rate": 1.0176742881189507e-05, "loss": 0.0282, "step": 85170 }, { "epoch": 2.3896759713844857, "grad_norm": 0.15078428387641907, "learning_rate": 1.0172067143591902e-05, "loss": 0.01, "step": 85180 }, { "epoch": 2.389956515640342, "grad_norm": 0.01344001479446888, "learning_rate": 1.0167391405994297e-05, "loss": 0.0058, "step": 85190 }, { "epoch": 2.3902370598961986, "grad_norm": 0.3621460199356079, "learning_rate": 1.016271566839669e-05, "loss": 0.0126, "step": 85200 }, { "epoch": 2.390517604152055, "grad_norm": 0.36631497740745544, "learning_rate": 1.0158039930799083e-05, "loss": 0.0061, "step": 85210 }, { "epoch": 2.3907981484079115, "grad_norm": 0.02471322752535343, "learning_rate": 1.0153364193201478e-05, "loss": 0.0179, "step": 85220 }, { "epoch": 2.3910786926637675, "grad_norm": 1.4311823844909668, "learning_rate": 1.0148688455603873e-05, "loss": 0.0156, "step": 85230 }, { "epoch": 2.391359236919624, "grad_norm": 0.054542820900678635, "learning_rate": 1.0144012718006266e-05, "loss": 0.0087, "step": 85240 }, { "epoch": 2.3916397811754804, "grad_norm": 0.06287364661693573, "learning_rate": 1.013933698040866e-05, "loss": 0.031, "step": 85250 }, { "epoch": 2.391920325431337, "grad_norm": 0.24527814984321594, "learning_rate": 1.0134661242811054e-05, "loss": 0.0258, "step": 85260 }, { "epoch": 2.3922008696871933, "grad_norm": 0.8386794328689575, "learning_rate": 1.0129985505213447e-05, "loss": 0.0201, "step": 85270 }, { "epoch": 2.3924814139430497, "grad_norm": 0.606113612651825, "learning_rate": 1.0125309767615842e-05, "loss": 0.0171, "step": 85280 }, { "epoch": 2.3927619581989057, "grad_norm": 0.0063665760681033134, "learning_rate": 1.0120634030018237e-05, "loss": 0.0189, "step": 85290 }, { "epoch": 2.393042502454762, "grad_norm": 0.0581330731511116, "learning_rate": 1.011595829242063e-05, "loss": 0.0536, "step": 85300 }, { "epoch": 2.3933230467106186, "grad_norm": 0.0485004261136055, "learning_rate": 1.0111282554823023e-05, "loss": 0.0052, "step": 85310 }, { "epoch": 2.393603590966475, "grad_norm": 0.0009071453823707998, "learning_rate": 1.0106606817225418e-05, "loss": 0.0169, "step": 85320 }, { "epoch": 2.3938841352223315, "grad_norm": 0.2353067845106125, "learning_rate": 1.0101931079627811e-05, "loss": 0.0104, "step": 85330 }, { "epoch": 2.3941646794781875, "grad_norm": 1.1234487295150757, "learning_rate": 1.0097255342030206e-05, "loss": 0.0335, "step": 85340 }, { "epoch": 2.394445223734044, "grad_norm": 0.5385030508041382, "learning_rate": 1.0092579604432599e-05, "loss": 0.0052, "step": 85350 }, { "epoch": 2.3947257679899003, "grad_norm": 2.170496940612793, "learning_rate": 1.0087903866834994e-05, "loss": 0.0151, "step": 85360 }, { "epoch": 2.3950063122457568, "grad_norm": 0.21742312610149384, "learning_rate": 1.0083228129237387e-05, "loss": 0.0359, "step": 85370 }, { "epoch": 2.395286856501613, "grad_norm": 1.1140804290771484, "learning_rate": 1.0078552391639782e-05, "loss": 0.039, "step": 85380 }, { "epoch": 2.3955674007574697, "grad_norm": 0.6700522303581238, "learning_rate": 1.0073876654042177e-05, "loss": 0.017, "step": 85390 }, { "epoch": 2.395847945013326, "grad_norm": 0.16417931020259857, "learning_rate": 1.006920091644457e-05, "loss": 0.0042, "step": 85400 }, { "epoch": 2.396128489269182, "grad_norm": 0.48567822575569153, "learning_rate": 1.0064525178846963e-05, "loss": 0.0189, "step": 85410 }, { "epoch": 2.3964090335250385, "grad_norm": 0.5533686280250549, "learning_rate": 1.0059849441249358e-05, "loss": 0.0297, "step": 85420 }, { "epoch": 2.396689577780895, "grad_norm": 0.017394008114933968, "learning_rate": 1.0055173703651751e-05, "loss": 0.0181, "step": 85430 }, { "epoch": 2.3969701220367514, "grad_norm": 0.35543304681777954, "learning_rate": 1.0050497966054146e-05, "loss": 0.0178, "step": 85440 }, { "epoch": 2.3972506662926074, "grad_norm": 0.18689677119255066, "learning_rate": 1.0045822228456539e-05, "loss": 0.0317, "step": 85450 }, { "epoch": 2.397531210548464, "grad_norm": 0.03820843622088432, "learning_rate": 1.0041146490858934e-05, "loss": 0.0177, "step": 85460 }, { "epoch": 2.3978117548043203, "grad_norm": 1.135392189025879, "learning_rate": 1.0036470753261327e-05, "loss": 0.0371, "step": 85470 }, { "epoch": 2.3980922990601767, "grad_norm": 1.8354429006576538, "learning_rate": 1.0031795015663722e-05, "loss": 0.0132, "step": 85480 }, { "epoch": 2.398372843316033, "grad_norm": 0.02098570205271244, "learning_rate": 1.0027119278066117e-05, "loss": 0.016, "step": 85490 }, { "epoch": 2.3986533875718896, "grad_norm": 2.0788702964782715, "learning_rate": 1.002244354046851e-05, "loss": 0.0358, "step": 85500 }, { "epoch": 2.398933931827746, "grad_norm": 0.3483201563358307, "learning_rate": 1.0017767802870903e-05, "loss": 0.0075, "step": 85510 }, { "epoch": 2.399214476083602, "grad_norm": 0.5870460867881775, "learning_rate": 1.0013092065273296e-05, "loss": 0.0153, "step": 85520 }, { "epoch": 2.3994950203394585, "grad_norm": 0.06390520930290222, "learning_rate": 1.0008416327675691e-05, "loss": 0.0529, "step": 85530 }, { "epoch": 2.399775564595315, "grad_norm": 0.030427681282162666, "learning_rate": 1.0003740590078086e-05, "loss": 0.0082, "step": 85540 }, { "epoch": 2.4000561088511714, "grad_norm": 0.005414614919573069, "learning_rate": 9.999064852480479e-06, "loss": 0.0176, "step": 85550 }, { "epoch": 2.400336653107028, "grad_norm": 0.007586594205349684, "learning_rate": 9.994389114882874e-06, "loss": 0.0054, "step": 85560 }, { "epoch": 2.400617197362884, "grad_norm": 0.0558638796210289, "learning_rate": 9.989713377285267e-06, "loss": 0.0232, "step": 85570 }, { "epoch": 2.4008977416187403, "grad_norm": 0.882469117641449, "learning_rate": 9.98503763968766e-06, "loss": 0.0506, "step": 85580 }, { "epoch": 2.4011782858745967, "grad_norm": 0.20406043529510498, "learning_rate": 9.980361902090055e-06, "loss": 0.0323, "step": 85590 }, { "epoch": 2.401458830130453, "grad_norm": 0.04580473154783249, "learning_rate": 9.97568616449245e-06, "loss": 0.0217, "step": 85600 }, { "epoch": 2.4017393743863096, "grad_norm": 0.2161276787519455, "learning_rate": 9.971010426894843e-06, "loss": 0.0102, "step": 85610 }, { "epoch": 2.402019918642166, "grad_norm": 0.04320709779858589, "learning_rate": 9.966334689297236e-06, "loss": 0.0547, "step": 85620 }, { "epoch": 2.402300462898022, "grad_norm": 0.2101965695619583, "learning_rate": 9.961658951699631e-06, "loss": 0.0185, "step": 85630 }, { "epoch": 2.4025810071538785, "grad_norm": 0.43767252564430237, "learning_rate": 9.956983214102026e-06, "loss": 0.0097, "step": 85640 }, { "epoch": 2.402861551409735, "grad_norm": 0.6150362491607666, "learning_rate": 9.952307476504419e-06, "loss": 0.0177, "step": 85650 }, { "epoch": 2.4031420956655913, "grad_norm": 0.8878085017204285, "learning_rate": 9.947631738906812e-06, "loss": 0.0109, "step": 85660 }, { "epoch": 2.403422639921448, "grad_norm": 2.1393911838531494, "learning_rate": 9.942956001309207e-06, "loss": 0.04, "step": 85670 }, { "epoch": 2.4037031841773038, "grad_norm": 0.6029723286628723, "learning_rate": 9.9382802637116e-06, "loss": 0.009, "step": 85680 }, { "epoch": 2.40398372843316, "grad_norm": 1.0331884622573853, "learning_rate": 9.933604526113995e-06, "loss": 0.0221, "step": 85690 }, { "epoch": 2.4042642726890167, "grad_norm": 0.16886967420578003, "learning_rate": 9.92892878851639e-06, "loss": 0.0163, "step": 85700 }, { "epoch": 2.404544816944873, "grad_norm": 0.01164252683520317, "learning_rate": 9.924253050918783e-06, "loss": 0.0335, "step": 85710 }, { "epoch": 2.4048253612007295, "grad_norm": 0.19225721061229706, "learning_rate": 9.919577313321176e-06, "loss": 0.019, "step": 85720 }, { "epoch": 2.405105905456586, "grad_norm": 0.3591521382331848, "learning_rate": 9.91490157572357e-06, "loss": 0.0081, "step": 85730 }, { "epoch": 2.405386449712442, "grad_norm": 0.8640777468681335, "learning_rate": 9.910225838125966e-06, "loss": 0.0092, "step": 85740 }, { "epoch": 2.4056669939682984, "grad_norm": 0.0551493875682354, "learning_rate": 9.90555010052836e-06, "loss": 0.0417, "step": 85750 }, { "epoch": 2.405947538224155, "grad_norm": 0.005810615140944719, "learning_rate": 9.900874362930752e-06, "loss": 0.0063, "step": 85760 }, { "epoch": 2.4062280824800113, "grad_norm": 0.6430267095565796, "learning_rate": 9.896198625333147e-06, "loss": 0.0226, "step": 85770 }, { "epoch": 2.4065086267358677, "grad_norm": 1.3538652658462524, "learning_rate": 9.89152288773554e-06, "loss": 0.0357, "step": 85780 }, { "epoch": 2.4067891709917237, "grad_norm": 0.01957341656088829, "learning_rate": 9.886847150137935e-06, "loss": 0.0306, "step": 85790 }, { "epoch": 2.40706971524758, "grad_norm": 0.0415971465408802, "learning_rate": 9.882171412540328e-06, "loss": 0.0173, "step": 85800 }, { "epoch": 2.4073502595034366, "grad_norm": 0.1545393168926239, "learning_rate": 9.877495674942723e-06, "loss": 0.0315, "step": 85810 }, { "epoch": 2.407630803759293, "grad_norm": 0.7335155010223389, "learning_rate": 9.872819937345116e-06, "loss": 0.0138, "step": 85820 }, { "epoch": 2.4079113480151495, "grad_norm": 1.7563532590866089, "learning_rate": 9.86814419974751e-06, "loss": 0.0209, "step": 85830 }, { "epoch": 2.408191892271006, "grad_norm": 0.03928777948021889, "learning_rate": 9.863468462149904e-06, "loss": 0.0254, "step": 85840 }, { "epoch": 2.408472436526862, "grad_norm": 0.024452250450849533, "learning_rate": 9.8587927245523e-06, "loss": 0.0199, "step": 85850 }, { "epoch": 2.4087529807827184, "grad_norm": 0.0498296394944191, "learning_rate": 9.854116986954692e-06, "loss": 0.0121, "step": 85860 }, { "epoch": 2.409033525038575, "grad_norm": 0.038799434900283813, "learning_rate": 9.849441249357085e-06, "loss": 0.0331, "step": 85870 }, { "epoch": 2.4093140692944313, "grad_norm": 0.02981516532599926, "learning_rate": 9.84476551175948e-06, "loss": 0.0326, "step": 85880 }, { "epoch": 2.4095946135502877, "grad_norm": 0.06545285135507584, "learning_rate": 9.840089774161875e-06, "loss": 0.0074, "step": 85890 }, { "epoch": 2.4098751578061437, "grad_norm": 0.2781265377998352, "learning_rate": 9.835414036564268e-06, "loss": 0.009, "step": 85900 }, { "epoch": 2.410155702062, "grad_norm": 0.02942269667983055, "learning_rate": 9.830738298966663e-06, "loss": 0.008, "step": 85910 }, { "epoch": 2.4104362463178566, "grad_norm": 0.03411516547203064, "learning_rate": 9.826062561369056e-06, "loss": 0.0209, "step": 85920 }, { "epoch": 2.410716790573713, "grad_norm": 0.01914118602871895, "learning_rate": 9.82138682377145e-06, "loss": 0.011, "step": 85930 }, { "epoch": 2.4109973348295695, "grad_norm": 0.017799168825149536, "learning_rate": 9.816711086173844e-06, "loss": 0.0303, "step": 85940 }, { "epoch": 2.411277879085426, "grad_norm": 0.03346576541662216, "learning_rate": 9.81203534857624e-06, "loss": 0.0201, "step": 85950 }, { "epoch": 2.411558423341282, "grad_norm": 0.01840551383793354, "learning_rate": 9.807359610978632e-06, "loss": 0.0196, "step": 85960 }, { "epoch": 2.4118389675971383, "grad_norm": 0.02039843425154686, "learning_rate": 9.802683873381026e-06, "loss": 0.0117, "step": 85970 }, { "epoch": 2.412119511852995, "grad_norm": 0.03813879191875458, "learning_rate": 9.79800813578342e-06, "loss": 0.0034, "step": 85980 }, { "epoch": 2.412400056108851, "grad_norm": 0.05112684890627861, "learning_rate": 9.793332398185815e-06, "loss": 0.0145, "step": 85990 }, { "epoch": 2.4126806003647077, "grad_norm": 0.31171727180480957, "learning_rate": 9.788656660588208e-06, "loss": 0.0127, "step": 86000 }, { "epoch": 2.4129611446205637, "grad_norm": 0.07109776139259338, "learning_rate": 9.783980922990603e-06, "loss": 0.0317, "step": 86010 }, { "epoch": 2.41324168887642, "grad_norm": 0.5647955536842346, "learning_rate": 9.779305185392996e-06, "loss": 0.0282, "step": 86020 }, { "epoch": 2.4135222331322765, "grad_norm": 0.05372249707579613, "learning_rate": 9.77462944779539e-06, "loss": 0.0178, "step": 86030 }, { "epoch": 2.413802777388133, "grad_norm": 0.5738825798034668, "learning_rate": 9.769953710197784e-06, "loss": 0.0592, "step": 86040 }, { "epoch": 2.4140833216439894, "grad_norm": 0.010876404121518135, "learning_rate": 9.76527797260018e-06, "loss": 0.0082, "step": 86050 }, { "epoch": 2.414363865899846, "grad_norm": 0.03338019922375679, "learning_rate": 9.760602235002572e-06, "loss": 0.0225, "step": 86060 }, { "epoch": 2.4146444101557023, "grad_norm": 1.3990628719329834, "learning_rate": 9.755926497404966e-06, "loss": 0.0196, "step": 86070 }, { "epoch": 2.4149249544115583, "grad_norm": 0.00809060875326395, "learning_rate": 9.75125075980736e-06, "loss": 0.0245, "step": 86080 }, { "epoch": 2.4152054986674147, "grad_norm": 0.02244131825864315, "learning_rate": 9.746575022209754e-06, "loss": 0.0029, "step": 86090 }, { "epoch": 2.415486042923271, "grad_norm": 0.701717734336853, "learning_rate": 9.741899284612148e-06, "loss": 0.0204, "step": 86100 }, { "epoch": 2.4157665871791276, "grad_norm": 0.6499178409576416, "learning_rate": 9.737223547014542e-06, "loss": 0.019, "step": 86110 }, { "epoch": 2.4160471314349836, "grad_norm": 0.17955294251441956, "learning_rate": 9.732547809416936e-06, "loss": 0.0514, "step": 86120 }, { "epoch": 2.41632767569084, "grad_norm": 0.00954608153551817, "learning_rate": 9.72787207181933e-06, "loss": 0.0189, "step": 86130 }, { "epoch": 2.4166082199466965, "grad_norm": 0.03198684751987457, "learning_rate": 9.723196334221724e-06, "loss": 0.022, "step": 86140 }, { "epoch": 2.416888764202553, "grad_norm": 0.03452327102422714, "learning_rate": 9.71852059662412e-06, "loss": 0.0114, "step": 86150 }, { "epoch": 2.4171693084584094, "grad_norm": 0.006999065168201923, "learning_rate": 9.713844859026512e-06, "loss": 0.008, "step": 86160 }, { "epoch": 2.417449852714266, "grad_norm": 0.7488166689872742, "learning_rate": 9.709169121428906e-06, "loss": 0.0314, "step": 86170 }, { "epoch": 2.4177303969701223, "grad_norm": 0.5144951343536377, "learning_rate": 9.704493383831299e-06, "loss": 0.0151, "step": 86180 }, { "epoch": 2.4180109412259783, "grad_norm": 1.3441747426986694, "learning_rate": 9.699817646233694e-06, "loss": 0.0337, "step": 86190 }, { "epoch": 2.4182914854818347, "grad_norm": 0.06890946626663208, "learning_rate": 9.695141908636088e-06, "loss": 0.0189, "step": 86200 }, { "epoch": 2.418572029737691, "grad_norm": 0.051742035895586014, "learning_rate": 9.690466171038482e-06, "loss": 0.01, "step": 86210 }, { "epoch": 2.4188525739935476, "grad_norm": 1.2032957077026367, "learning_rate": 9.685790433440876e-06, "loss": 0.0209, "step": 86220 }, { "epoch": 2.419133118249404, "grad_norm": 0.081904336810112, "learning_rate": 9.68111469584327e-06, "loss": 0.0077, "step": 86230 }, { "epoch": 2.41941366250526, "grad_norm": 0.20126348733901978, "learning_rate": 9.676438958245663e-06, "loss": 0.0155, "step": 86240 }, { "epoch": 2.4196942067611165, "grad_norm": 0.5133017897605896, "learning_rate": 9.671763220648058e-06, "loss": 0.0562, "step": 86250 }, { "epoch": 2.419974751016973, "grad_norm": 0.08244045823812485, "learning_rate": 9.667087483050452e-06, "loss": 0.0192, "step": 86260 }, { "epoch": 2.4202552952728293, "grad_norm": 0.015795907005667686, "learning_rate": 9.662411745452846e-06, "loss": 0.0263, "step": 86270 }, { "epoch": 2.420535839528686, "grad_norm": 0.0077953333966434, "learning_rate": 9.657736007855239e-06, "loss": 0.0351, "step": 86280 }, { "epoch": 2.4208163837845422, "grad_norm": 3.3992671966552734, "learning_rate": 9.653060270257634e-06, "loss": 0.0203, "step": 86290 }, { "epoch": 2.421096928040398, "grad_norm": 0.04075919836759567, "learning_rate": 9.648384532660028e-06, "loss": 0.0423, "step": 86300 }, { "epoch": 2.4213774722962547, "grad_norm": 0.13766995072364807, "learning_rate": 9.643708795062422e-06, "loss": 0.0455, "step": 86310 }, { "epoch": 2.421658016552111, "grad_norm": 0.11458313465118408, "learning_rate": 9.639033057464815e-06, "loss": 0.0224, "step": 86320 }, { "epoch": 2.4219385608079675, "grad_norm": 0.13029123842716217, "learning_rate": 9.63435731986721e-06, "loss": 0.0239, "step": 86330 }, { "epoch": 2.422219105063824, "grad_norm": 0.07197681069374084, "learning_rate": 9.629681582269603e-06, "loss": 0.0104, "step": 86340 }, { "epoch": 2.42249964931968, "grad_norm": 0.31362611055374146, "learning_rate": 9.625005844671998e-06, "loss": 0.0345, "step": 86350 }, { "epoch": 2.4227801935755364, "grad_norm": 0.03849014639854431, "learning_rate": 9.620330107074392e-06, "loss": 0.012, "step": 86360 }, { "epoch": 2.423060737831393, "grad_norm": 2.8464443683624268, "learning_rate": 9.615654369476786e-06, "loss": 0.0231, "step": 86370 }, { "epoch": 2.4233412820872493, "grad_norm": 0.40228062868118286, "learning_rate": 9.610978631879179e-06, "loss": 0.0055, "step": 86380 }, { "epoch": 2.4236218263431057, "grad_norm": 0.8732019066810608, "learning_rate": 9.606302894281574e-06, "loss": 0.0294, "step": 86390 }, { "epoch": 2.423902370598962, "grad_norm": 0.6531897783279419, "learning_rate": 9.601627156683968e-06, "loss": 0.0427, "step": 86400 }, { "epoch": 2.424182914854818, "grad_norm": 0.8209453225135803, "learning_rate": 9.596951419086362e-06, "loss": 0.0223, "step": 86410 }, { "epoch": 2.4244634591106746, "grad_norm": 0.7237081527709961, "learning_rate": 9.592275681488755e-06, "loss": 0.0388, "step": 86420 }, { "epoch": 2.424744003366531, "grad_norm": 0.1112702488899231, "learning_rate": 9.58759994389115e-06, "loss": 0.0316, "step": 86430 }, { "epoch": 2.4250245476223875, "grad_norm": 2.280547618865967, "learning_rate": 9.582924206293543e-06, "loss": 0.0147, "step": 86440 }, { "epoch": 2.425305091878244, "grad_norm": 0.16047970950603485, "learning_rate": 9.578248468695938e-06, "loss": 0.0108, "step": 86450 }, { "epoch": 2.4255856361341, "grad_norm": 0.15981319546699524, "learning_rate": 9.57357273109833e-06, "loss": 0.0105, "step": 86460 }, { "epoch": 2.4258661803899564, "grad_norm": 0.25097736716270447, "learning_rate": 9.568896993500726e-06, "loss": 0.0227, "step": 86470 }, { "epoch": 2.426146724645813, "grad_norm": 4.025575160980225, "learning_rate": 9.564221255903119e-06, "loss": 0.0077, "step": 86480 }, { "epoch": 2.4264272689016693, "grad_norm": 3.4895389080047607, "learning_rate": 9.559545518305512e-06, "loss": 0.0185, "step": 86490 }, { "epoch": 2.4267078131575257, "grad_norm": 0.5613903403282166, "learning_rate": 9.554869780707908e-06, "loss": 0.0361, "step": 86500 }, { "epoch": 2.426988357413382, "grad_norm": 0.045953910797834396, "learning_rate": 9.550194043110302e-06, "loss": 0.0151, "step": 86510 }, { "epoch": 2.427268901669238, "grad_norm": 0.12629668414592743, "learning_rate": 9.545518305512695e-06, "loss": 0.0211, "step": 86520 }, { "epoch": 2.4275494459250946, "grad_norm": 0.34371230006217957, "learning_rate": 9.540842567915088e-06, "loss": 0.0125, "step": 86530 }, { "epoch": 2.427829990180951, "grad_norm": 1.2027947902679443, "learning_rate": 9.536166830317483e-06, "loss": 0.0117, "step": 86540 }, { "epoch": 2.4281105344368075, "grad_norm": 0.07592156529426575, "learning_rate": 9.531491092719878e-06, "loss": 0.0072, "step": 86550 }, { "epoch": 2.428391078692664, "grad_norm": 0.09596215188503265, "learning_rate": 9.52681535512227e-06, "loss": 0.0181, "step": 86560 }, { "epoch": 2.42867162294852, "grad_norm": 0.6138241291046143, "learning_rate": 9.522139617524666e-06, "loss": 0.0183, "step": 86570 }, { "epoch": 2.4289521672043763, "grad_norm": 0.15246087312698364, "learning_rate": 9.517463879927059e-06, "loss": 0.0167, "step": 86580 }, { "epoch": 2.429232711460233, "grad_norm": 0.547247052192688, "learning_rate": 9.512788142329452e-06, "loss": 0.0096, "step": 86590 }, { "epoch": 2.4295132557160892, "grad_norm": 0.0629989355802536, "learning_rate": 9.508112404731847e-06, "loss": 0.0092, "step": 86600 }, { "epoch": 2.4297937999719457, "grad_norm": 0.40277618169784546, "learning_rate": 9.503436667134242e-06, "loss": 0.0333, "step": 86610 }, { "epoch": 2.430074344227802, "grad_norm": 0.5613875985145569, "learning_rate": 9.498760929536635e-06, "loss": 0.0323, "step": 86620 }, { "epoch": 2.430354888483658, "grad_norm": 0.17877434194087982, "learning_rate": 9.494085191939028e-06, "loss": 0.0122, "step": 86630 }, { "epoch": 2.4306354327395145, "grad_norm": 0.3215593993663788, "learning_rate": 9.489409454341423e-06, "loss": 0.005, "step": 86640 }, { "epoch": 2.430915976995371, "grad_norm": 0.018960820510983467, "learning_rate": 9.484733716743818e-06, "loss": 0.0025, "step": 86650 }, { "epoch": 2.4311965212512274, "grad_norm": 0.2897903323173523, "learning_rate": 9.48005797914621e-06, "loss": 0.0079, "step": 86660 }, { "epoch": 2.431477065507084, "grad_norm": 0.2288236767053604, "learning_rate": 9.475382241548606e-06, "loss": 0.0044, "step": 86670 }, { "epoch": 2.43175760976294, "grad_norm": 0.022589636966586113, "learning_rate": 9.470706503950999e-06, "loss": 0.007, "step": 86680 }, { "epoch": 2.4320381540187963, "grad_norm": 0.09524193406105042, "learning_rate": 9.466030766353392e-06, "loss": 0.012, "step": 86690 }, { "epoch": 2.4323186982746527, "grad_norm": 0.26530611515045166, "learning_rate": 9.461355028755787e-06, "loss": 0.0236, "step": 86700 }, { "epoch": 2.432599242530509, "grad_norm": 0.9616710543632507, "learning_rate": 9.456679291158182e-06, "loss": 0.0479, "step": 86710 }, { "epoch": 2.4328797867863656, "grad_norm": 0.05091339349746704, "learning_rate": 9.452003553560575e-06, "loss": 0.0192, "step": 86720 }, { "epoch": 2.433160331042222, "grad_norm": 0.23649421334266663, "learning_rate": 9.447327815962968e-06, "loss": 0.0182, "step": 86730 }, { "epoch": 2.4334408752980785, "grad_norm": 0.036457307636737823, "learning_rate": 9.442652078365363e-06, "loss": 0.0058, "step": 86740 }, { "epoch": 2.4337214195539345, "grad_norm": 0.4083825349807739, "learning_rate": 9.437976340767756e-06, "loss": 0.02, "step": 86750 }, { "epoch": 2.434001963809791, "grad_norm": 0.020697372034192085, "learning_rate": 9.43330060317015e-06, "loss": 0.012, "step": 86760 }, { "epoch": 2.4342825080656474, "grad_norm": 0.03719125688076019, "learning_rate": 9.428624865572544e-06, "loss": 0.0265, "step": 86770 }, { "epoch": 2.434563052321504, "grad_norm": 0.17709746956825256, "learning_rate": 9.423949127974939e-06, "loss": 0.0235, "step": 86780 }, { "epoch": 2.4348435965773603, "grad_norm": 0.05876928195357323, "learning_rate": 9.419273390377332e-06, "loss": 0.045, "step": 86790 }, { "epoch": 2.4351241408332163, "grad_norm": 0.491630494594574, "learning_rate": 9.414597652779727e-06, "loss": 0.0335, "step": 86800 }, { "epoch": 2.4354046850890727, "grad_norm": 0.41173526644706726, "learning_rate": 9.409921915182122e-06, "loss": 0.028, "step": 86810 }, { "epoch": 2.435685229344929, "grad_norm": 0.0947706550359726, "learning_rate": 9.405246177584515e-06, "loss": 0.0322, "step": 86820 }, { "epoch": 2.4359657736007856, "grad_norm": 0.6139553189277649, "learning_rate": 9.400570439986908e-06, "loss": 0.0304, "step": 86830 }, { "epoch": 2.436246317856642, "grad_norm": 0.5682013034820557, "learning_rate": 9.395894702389301e-06, "loss": 0.029, "step": 86840 }, { "epoch": 2.4365268621124985, "grad_norm": 0.09507535398006439, "learning_rate": 9.391218964791696e-06, "loss": 0.0168, "step": 86850 }, { "epoch": 2.4368074063683545, "grad_norm": 0.42830201983451843, "learning_rate": 9.386543227194091e-06, "loss": 0.0263, "step": 86860 }, { "epoch": 2.437087950624211, "grad_norm": 0.10861025005578995, "learning_rate": 9.381867489596484e-06, "loss": 0.0164, "step": 86870 }, { "epoch": 2.4373684948800673, "grad_norm": 0.6371592283248901, "learning_rate": 9.377191751998879e-06, "loss": 0.0269, "step": 86880 }, { "epoch": 2.437649039135924, "grad_norm": 0.8395395874977112, "learning_rate": 9.372516014401272e-06, "loss": 0.018, "step": 86890 }, { "epoch": 2.4379295833917802, "grad_norm": 0.13668644428253174, "learning_rate": 9.367840276803667e-06, "loss": 0.0319, "step": 86900 }, { "epoch": 2.4382101276476362, "grad_norm": 0.24343867599964142, "learning_rate": 9.36316453920606e-06, "loss": 0.015, "step": 86910 }, { "epoch": 2.4384906719034927, "grad_norm": 1.5189979076385498, "learning_rate": 9.358488801608455e-06, "loss": 0.0354, "step": 86920 }, { "epoch": 2.438771216159349, "grad_norm": 0.1781463772058487, "learning_rate": 9.353813064010848e-06, "loss": 0.031, "step": 86930 }, { "epoch": 2.4390517604152055, "grad_norm": 0.3883577883243561, "learning_rate": 9.349137326413241e-06, "loss": 0.0223, "step": 86940 }, { "epoch": 2.439332304671062, "grad_norm": 0.2082740217447281, "learning_rate": 9.344461588815636e-06, "loss": 0.0169, "step": 86950 }, { "epoch": 2.4396128489269184, "grad_norm": 0.031877368688583374, "learning_rate": 9.339785851218031e-06, "loss": 0.0166, "step": 86960 }, { "epoch": 2.4398933931827744, "grad_norm": 0.01724730245769024, "learning_rate": 9.335110113620424e-06, "loss": 0.0134, "step": 86970 }, { "epoch": 2.440173937438631, "grad_norm": 0.43519508838653564, "learning_rate": 9.330434376022817e-06, "loss": 0.0181, "step": 86980 }, { "epoch": 2.4404544816944873, "grad_norm": 0.05970432236790657, "learning_rate": 9.325758638425212e-06, "loss": 0.0267, "step": 86990 }, { "epoch": 2.4407350259503438, "grad_norm": 0.286756694316864, "learning_rate": 9.321082900827605e-06, "loss": 0.0181, "step": 87000 }, { "epoch": 2.4410155702062, "grad_norm": 2.20930552482605, "learning_rate": 9.31640716323e-06, "loss": 0.0376, "step": 87010 }, { "epoch": 2.441296114462056, "grad_norm": 0.16030855476856232, "learning_rate": 9.311731425632395e-06, "loss": 0.0128, "step": 87020 }, { "epoch": 2.4415766587179126, "grad_norm": 0.34880781173706055, "learning_rate": 9.307055688034788e-06, "loss": 0.0135, "step": 87030 }, { "epoch": 2.441857202973769, "grad_norm": 0.029300477355718613, "learning_rate": 9.302379950437181e-06, "loss": 0.019, "step": 87040 }, { "epoch": 2.4421377472296255, "grad_norm": 0.6118443608283997, "learning_rate": 9.297704212839576e-06, "loss": 0.0076, "step": 87050 }, { "epoch": 2.442418291485482, "grad_norm": 0.013780736364424229, "learning_rate": 9.293028475241971e-06, "loss": 0.0121, "step": 87060 }, { "epoch": 2.4426988357413384, "grad_norm": 1.378686785697937, "learning_rate": 9.288352737644364e-06, "loss": 0.0138, "step": 87070 }, { "epoch": 2.4429793799971944, "grad_norm": 0.022443166002631187, "learning_rate": 9.283677000046757e-06, "loss": 0.0202, "step": 87080 }, { "epoch": 2.443259924253051, "grad_norm": 0.20480534434318542, "learning_rate": 9.279001262449152e-06, "loss": 0.0364, "step": 87090 }, { "epoch": 2.4435404685089073, "grad_norm": 0.018191559240221977, "learning_rate": 9.274325524851545e-06, "loss": 0.0126, "step": 87100 }, { "epoch": 2.4438210127647637, "grad_norm": 0.08153241127729416, "learning_rate": 9.26964978725394e-06, "loss": 0.0132, "step": 87110 }, { "epoch": 2.44410155702062, "grad_norm": 0.1763550341129303, "learning_rate": 9.264974049656333e-06, "loss": 0.0105, "step": 87120 }, { "epoch": 2.444382101276476, "grad_norm": 0.6383938193321228, "learning_rate": 9.260298312058728e-06, "loss": 0.0346, "step": 87130 }, { "epoch": 2.4446626455323326, "grad_norm": 0.008900340646505356, "learning_rate": 9.255622574461121e-06, "loss": 0.015, "step": 87140 }, { "epoch": 2.444943189788189, "grad_norm": 0.018644938245415688, "learning_rate": 9.250946836863514e-06, "loss": 0.0099, "step": 87150 }, { "epoch": 2.4452237340440455, "grad_norm": 0.5204829573631287, "learning_rate": 9.246271099265911e-06, "loss": 0.0193, "step": 87160 }, { "epoch": 2.445504278299902, "grad_norm": 0.052433911710977554, "learning_rate": 9.241595361668304e-06, "loss": 0.0554, "step": 87170 }, { "epoch": 2.4457848225557584, "grad_norm": 1.9993891716003418, "learning_rate": 9.236919624070697e-06, "loss": 0.0349, "step": 87180 }, { "epoch": 2.4460653668116143, "grad_norm": 0.0922887921333313, "learning_rate": 9.23224388647309e-06, "loss": 0.0176, "step": 87190 }, { "epoch": 2.446345911067471, "grad_norm": 0.4879247844219208, "learning_rate": 9.227568148875485e-06, "loss": 0.05, "step": 87200 }, { "epoch": 2.4466264553233272, "grad_norm": 0.35596349835395813, "learning_rate": 9.22289241127788e-06, "loss": 0.0157, "step": 87210 }, { "epoch": 2.4469069995791837, "grad_norm": 0.05992888659238815, "learning_rate": 9.218216673680273e-06, "loss": 0.0123, "step": 87220 }, { "epoch": 2.44718754383504, "grad_norm": 0.8994684815406799, "learning_rate": 9.213540936082668e-06, "loss": 0.0239, "step": 87230 }, { "epoch": 2.447468088090896, "grad_norm": 0.011478164233267307, "learning_rate": 9.208865198485061e-06, "loss": 0.0625, "step": 87240 }, { "epoch": 2.4477486323467526, "grad_norm": 0.07015382498502731, "learning_rate": 9.204189460887454e-06, "loss": 0.0238, "step": 87250 }, { "epoch": 2.448029176602609, "grad_norm": 0.3687628209590912, "learning_rate": 9.19951372328985e-06, "loss": 0.0158, "step": 87260 }, { "epoch": 2.4483097208584654, "grad_norm": 0.38569843769073486, "learning_rate": 9.194837985692244e-06, "loss": 0.0196, "step": 87270 }, { "epoch": 2.448590265114322, "grad_norm": 0.030263420194387436, "learning_rate": 9.190162248094637e-06, "loss": 0.0101, "step": 87280 }, { "epoch": 2.4488708093701783, "grad_norm": 0.020298298448324203, "learning_rate": 9.18548651049703e-06, "loss": 0.0066, "step": 87290 }, { "epoch": 2.4491513536260348, "grad_norm": 0.014968584291636944, "learning_rate": 9.180810772899425e-06, "loss": 0.0069, "step": 87300 }, { "epoch": 2.4494318978818908, "grad_norm": 1.289152979850769, "learning_rate": 9.17613503530182e-06, "loss": 0.0355, "step": 87310 }, { "epoch": 2.449712442137747, "grad_norm": 0.15684306621551514, "learning_rate": 9.171459297704213e-06, "loss": 0.0139, "step": 87320 }, { "epoch": 2.4499929863936036, "grad_norm": 0.20296049118041992, "learning_rate": 9.166783560106608e-06, "loss": 0.0063, "step": 87330 }, { "epoch": 2.45027353064946, "grad_norm": 0.7741448879241943, "learning_rate": 9.162107822509001e-06, "loss": 0.0089, "step": 87340 }, { "epoch": 2.450554074905316, "grad_norm": 0.034234095364809036, "learning_rate": 9.157432084911394e-06, "loss": 0.0085, "step": 87350 }, { "epoch": 2.4508346191611725, "grad_norm": 0.03150439262390137, "learning_rate": 9.15275634731379e-06, "loss": 0.016, "step": 87360 }, { "epoch": 2.451115163417029, "grad_norm": 0.025574272498488426, "learning_rate": 9.148080609716184e-06, "loss": 0.0371, "step": 87370 }, { "epoch": 2.4513957076728854, "grad_norm": 0.02964158169925213, "learning_rate": 9.143404872118577e-06, "loss": 0.0116, "step": 87380 }, { "epoch": 2.451676251928742, "grad_norm": 0.028071047738194466, "learning_rate": 9.13872913452097e-06, "loss": 0.0161, "step": 87390 }, { "epoch": 2.4519567961845983, "grad_norm": 0.3683927655220032, "learning_rate": 9.134053396923365e-06, "loss": 0.0196, "step": 87400 }, { "epoch": 2.4522373404404547, "grad_norm": 2.4315249919891357, "learning_rate": 9.12937765932576e-06, "loss": 0.0336, "step": 87410 }, { "epoch": 2.4525178846963107, "grad_norm": 0.33693021535873413, "learning_rate": 9.124701921728153e-06, "loss": 0.0109, "step": 87420 }, { "epoch": 2.452798428952167, "grad_norm": 0.050208088010549545, "learning_rate": 9.120026184130546e-06, "loss": 0.0138, "step": 87430 }, { "epoch": 2.4530789732080236, "grad_norm": 0.15210743248462677, "learning_rate": 9.115350446532941e-06, "loss": 0.0177, "step": 87440 }, { "epoch": 2.45335951746388, "grad_norm": 0.20869655907154083, "learning_rate": 9.110674708935334e-06, "loss": 0.0216, "step": 87450 }, { "epoch": 2.4536400617197365, "grad_norm": 0.5049686431884766, "learning_rate": 9.10599897133773e-06, "loss": 0.0139, "step": 87460 }, { "epoch": 2.4539206059755925, "grad_norm": 0.05382803454995155, "learning_rate": 9.101323233740124e-06, "loss": 0.0107, "step": 87470 }, { "epoch": 2.454201150231449, "grad_norm": 0.917726993560791, "learning_rate": 9.096647496142517e-06, "loss": 0.0231, "step": 87480 }, { "epoch": 2.4544816944873054, "grad_norm": 0.08622442930936813, "learning_rate": 9.09197175854491e-06, "loss": 0.0298, "step": 87490 }, { "epoch": 2.454762238743162, "grad_norm": 0.010733361355960369, "learning_rate": 9.087296020947304e-06, "loss": 0.0202, "step": 87500 }, { "epoch": 2.4550427829990182, "grad_norm": 0.022833576425909996, "learning_rate": 9.082620283349698e-06, "loss": 0.0202, "step": 87510 }, { "epoch": 2.4553233272548747, "grad_norm": 0.06241931766271591, "learning_rate": 9.077944545752093e-06, "loss": 0.03, "step": 87520 }, { "epoch": 2.4556038715107307, "grad_norm": 0.1315467804670334, "learning_rate": 9.073268808154486e-06, "loss": 0.0161, "step": 87530 }, { "epoch": 2.455884415766587, "grad_norm": 0.6637319326400757, "learning_rate": 9.068593070556881e-06, "loss": 0.0169, "step": 87540 }, { "epoch": 2.4561649600224436, "grad_norm": 0.37059932947158813, "learning_rate": 9.063917332959274e-06, "loss": 0.0115, "step": 87550 }, { "epoch": 2.4564455042783, "grad_norm": 1.3419133424758911, "learning_rate": 9.05924159536167e-06, "loss": 0.0366, "step": 87560 }, { "epoch": 2.4567260485341564, "grad_norm": 0.07416535168886185, "learning_rate": 9.054565857764062e-06, "loss": 0.0218, "step": 87570 }, { "epoch": 2.4570065927900124, "grad_norm": 0.026732563972473145, "learning_rate": 9.049890120166457e-06, "loss": 0.0316, "step": 87580 }, { "epoch": 2.457287137045869, "grad_norm": 0.02323947846889496, "learning_rate": 9.04521438256885e-06, "loss": 0.0126, "step": 87590 }, { "epoch": 2.4575676813017253, "grad_norm": 0.04719046503305435, "learning_rate": 9.040538644971244e-06, "loss": 0.0166, "step": 87600 }, { "epoch": 2.4578482255575818, "grad_norm": 0.7662807703018188, "learning_rate": 9.035862907373638e-06, "loss": 0.0342, "step": 87610 }, { "epoch": 2.458128769813438, "grad_norm": 0.3969917297363281, "learning_rate": 9.031187169776033e-06, "loss": 0.0295, "step": 87620 }, { "epoch": 2.4584093140692946, "grad_norm": 0.06057966127991676, "learning_rate": 9.026511432178426e-06, "loss": 0.0536, "step": 87630 }, { "epoch": 2.4586898583251506, "grad_norm": 1.0530768632888794, "learning_rate": 9.02183569458082e-06, "loss": 0.0402, "step": 87640 }, { "epoch": 2.458970402581007, "grad_norm": 1.107952356338501, "learning_rate": 9.017159956983215e-06, "loss": 0.0547, "step": 87650 }, { "epoch": 2.4592509468368635, "grad_norm": 0.3104539215564728, "learning_rate": 9.012484219385608e-06, "loss": 0.0206, "step": 87660 }, { "epoch": 2.45953149109272, "grad_norm": 0.030827876180410385, "learning_rate": 9.007808481788003e-06, "loss": 0.0172, "step": 87670 }, { "epoch": 2.4598120353485764, "grad_norm": 0.08003845065832138, "learning_rate": 9.003132744190397e-06, "loss": 0.0144, "step": 87680 }, { "epoch": 2.4600925796044324, "grad_norm": 0.2704516053199768, "learning_rate": 8.99845700659279e-06, "loss": 0.0196, "step": 87690 }, { "epoch": 2.460373123860289, "grad_norm": 0.05502695590257645, "learning_rate": 8.993781268995184e-06, "loss": 0.0122, "step": 87700 }, { "epoch": 2.4606536681161453, "grad_norm": 0.5459295511245728, "learning_rate": 8.989105531397579e-06, "loss": 0.0075, "step": 87710 }, { "epoch": 2.4609342123720017, "grad_norm": 0.02647779881954193, "learning_rate": 8.984429793799973e-06, "loss": 0.0117, "step": 87720 }, { "epoch": 2.461214756627858, "grad_norm": 0.5486642122268677, "learning_rate": 8.979754056202367e-06, "loss": 0.0174, "step": 87730 }, { "epoch": 2.4614953008837146, "grad_norm": 0.02296297997236252, "learning_rate": 8.97507831860476e-06, "loss": 0.0076, "step": 87740 }, { "epoch": 2.4617758451395706, "grad_norm": 0.013808303512632847, "learning_rate": 8.970402581007155e-06, "loss": 0.0255, "step": 87750 }, { "epoch": 2.462056389395427, "grad_norm": 0.42123064398765564, "learning_rate": 8.965726843409548e-06, "loss": 0.0753, "step": 87760 }, { "epoch": 2.4623369336512835, "grad_norm": 0.05104494467377663, "learning_rate": 8.961051105811943e-06, "loss": 0.0355, "step": 87770 }, { "epoch": 2.46261747790714, "grad_norm": 0.03889531269669533, "learning_rate": 8.956375368214336e-06, "loss": 0.0171, "step": 87780 }, { "epoch": 2.4628980221629964, "grad_norm": 0.7652602791786194, "learning_rate": 8.95169963061673e-06, "loss": 0.034, "step": 87790 }, { "epoch": 2.4631785664188524, "grad_norm": 0.05767492204904556, "learning_rate": 8.947023893019124e-06, "loss": 0.0288, "step": 87800 }, { "epoch": 2.463459110674709, "grad_norm": 0.10471946001052856, "learning_rate": 8.942348155421519e-06, "loss": 0.0086, "step": 87810 }, { "epoch": 2.4637396549305652, "grad_norm": 1.152774691581726, "learning_rate": 8.937672417823913e-06, "loss": 0.0284, "step": 87820 }, { "epoch": 2.4640201991864217, "grad_norm": 0.03899148106575012, "learning_rate": 8.932996680226307e-06, "loss": 0.024, "step": 87830 }, { "epoch": 2.464300743442278, "grad_norm": 0.5465425848960876, "learning_rate": 8.9283209426287e-06, "loss": 0.0308, "step": 87840 }, { "epoch": 2.4645812876981346, "grad_norm": 0.09014111757278442, "learning_rate": 8.923645205031093e-06, "loss": 0.0157, "step": 87850 }, { "epoch": 2.4648618319539906, "grad_norm": 0.4553931951522827, "learning_rate": 8.918969467433488e-06, "loss": 0.0087, "step": 87860 }, { "epoch": 2.465142376209847, "grad_norm": 0.06641990691423416, "learning_rate": 8.914293729835883e-06, "loss": 0.0142, "step": 87870 }, { "epoch": 2.4654229204657034, "grad_norm": 0.05349056422710419, "learning_rate": 8.909617992238276e-06, "loss": 0.0132, "step": 87880 }, { "epoch": 2.46570346472156, "grad_norm": 1.0479185581207275, "learning_rate": 8.90494225464067e-06, "loss": 0.0132, "step": 87890 }, { "epoch": 2.4659840089774163, "grad_norm": 0.045106302946805954, "learning_rate": 8.900266517043064e-06, "loss": 0.023, "step": 87900 }, { "epoch": 2.4662645532332723, "grad_norm": 0.28355833888053894, "learning_rate": 8.895590779445457e-06, "loss": 0.0238, "step": 87910 }, { "epoch": 2.4665450974891288, "grad_norm": 0.023891223594546318, "learning_rate": 8.890915041847852e-06, "loss": 0.0434, "step": 87920 }, { "epoch": 2.466825641744985, "grad_norm": 0.4961808919906616, "learning_rate": 8.886239304250247e-06, "loss": 0.02, "step": 87930 }, { "epoch": 2.4671061860008416, "grad_norm": 0.18539530038833618, "learning_rate": 8.88156356665264e-06, "loss": 0.0264, "step": 87940 }, { "epoch": 2.467386730256698, "grad_norm": 0.0400301069021225, "learning_rate": 8.876887829055033e-06, "loss": 0.0176, "step": 87950 }, { "epoch": 2.4676672745125545, "grad_norm": 0.16889381408691406, "learning_rate": 8.872212091457428e-06, "loss": 0.006, "step": 87960 }, { "epoch": 2.467947818768411, "grad_norm": 0.8508630394935608, "learning_rate": 8.867536353859823e-06, "loss": 0.0333, "step": 87970 }, { "epoch": 2.468228363024267, "grad_norm": 0.17493921518325806, "learning_rate": 8.862860616262216e-06, "loss": 0.0264, "step": 87980 }, { "epoch": 2.4685089072801234, "grad_norm": 0.015370494686067104, "learning_rate": 8.85818487866461e-06, "loss": 0.0308, "step": 87990 }, { "epoch": 2.46878945153598, "grad_norm": 0.024213308468461037, "learning_rate": 8.853509141067004e-06, "loss": 0.0132, "step": 88000 }, { "epoch": 2.4690699957918363, "grad_norm": 0.09366588294506073, "learning_rate": 8.848833403469397e-06, "loss": 0.0055, "step": 88010 }, { "epoch": 2.4693505400476923, "grad_norm": 2.254929780960083, "learning_rate": 8.844157665871792e-06, "loss": 0.0047, "step": 88020 }, { "epoch": 2.4696310843035487, "grad_norm": 0.12063746154308319, "learning_rate": 8.839481928274187e-06, "loss": 0.0532, "step": 88030 }, { "epoch": 2.469911628559405, "grad_norm": 0.1051315888762474, "learning_rate": 8.83480619067658e-06, "loss": 0.0337, "step": 88040 }, { "epoch": 2.4701921728152616, "grad_norm": 2.879102945327759, "learning_rate": 8.830130453078973e-06, "loss": 0.0374, "step": 88050 }, { "epoch": 2.470472717071118, "grad_norm": 0.11627084761857986, "learning_rate": 8.825454715481368e-06, "loss": 0.0104, "step": 88060 }, { "epoch": 2.4707532613269745, "grad_norm": 0.03466423973441124, "learning_rate": 8.820778977883763e-06, "loss": 0.0183, "step": 88070 }, { "epoch": 2.471033805582831, "grad_norm": 0.6048835515975952, "learning_rate": 8.816103240286156e-06, "loss": 0.0597, "step": 88080 }, { "epoch": 2.471314349838687, "grad_norm": 0.1368972361087799, "learning_rate": 8.811427502688549e-06, "loss": 0.0184, "step": 88090 }, { "epoch": 2.4715948940945434, "grad_norm": 0.05571135878562927, "learning_rate": 8.806751765090944e-06, "loss": 0.0301, "step": 88100 }, { "epoch": 2.4718754383504, "grad_norm": 2.8100006580352783, "learning_rate": 8.802076027493337e-06, "loss": 0.0207, "step": 88110 }, { "epoch": 2.4721559826062562, "grad_norm": 0.7494339346885681, "learning_rate": 8.797400289895732e-06, "loss": 0.0074, "step": 88120 }, { "epoch": 2.4724365268621127, "grad_norm": 0.4615256190299988, "learning_rate": 8.792724552298127e-06, "loss": 0.0133, "step": 88130 }, { "epoch": 2.4727170711179687, "grad_norm": 0.4680261015892029, "learning_rate": 8.78804881470052e-06, "loss": 0.0056, "step": 88140 }, { "epoch": 2.472997615373825, "grad_norm": 1.2121576070785522, "learning_rate": 8.783373077102913e-06, "loss": 0.0421, "step": 88150 }, { "epoch": 2.4732781596296816, "grad_norm": 0.11315897852182388, "learning_rate": 8.778697339505306e-06, "loss": 0.0151, "step": 88160 }, { "epoch": 2.473558703885538, "grad_norm": 0.038585200905799866, "learning_rate": 8.774021601907703e-06, "loss": 0.0025, "step": 88170 }, { "epoch": 2.4738392481413944, "grad_norm": 0.030089307576417923, "learning_rate": 8.769345864310096e-06, "loss": 0.0069, "step": 88180 }, { "epoch": 2.474119792397251, "grad_norm": 0.014638660475611687, "learning_rate": 8.764670126712489e-06, "loss": 0.0133, "step": 88190 }, { "epoch": 2.474400336653107, "grad_norm": 0.32021889090538025, "learning_rate": 8.759994389114884e-06, "loss": 0.0198, "step": 88200 }, { "epoch": 2.4746808809089633, "grad_norm": 0.9852771162986755, "learning_rate": 8.755318651517277e-06, "loss": 0.0514, "step": 88210 }, { "epoch": 2.4749614251648198, "grad_norm": 0.056371621787548065, "learning_rate": 8.750642913919672e-06, "loss": 0.026, "step": 88220 }, { "epoch": 2.475241969420676, "grad_norm": 0.09456542879343033, "learning_rate": 8.745967176322065e-06, "loss": 0.0234, "step": 88230 }, { "epoch": 2.4755225136765326, "grad_norm": 0.07709793746471405, "learning_rate": 8.74129143872446e-06, "loss": 0.0178, "step": 88240 }, { "epoch": 2.4758030579323886, "grad_norm": 0.468144953250885, "learning_rate": 8.736615701126853e-06, "loss": 0.0258, "step": 88250 }, { "epoch": 2.476083602188245, "grad_norm": 0.13742509484291077, "learning_rate": 8.731939963529246e-06, "loss": 0.0137, "step": 88260 }, { "epoch": 2.4763641464441015, "grad_norm": 0.06706058979034424, "learning_rate": 8.727264225931641e-06, "loss": 0.0121, "step": 88270 }, { "epoch": 2.476644690699958, "grad_norm": 0.08517295867204666, "learning_rate": 8.722588488334036e-06, "loss": 0.0347, "step": 88280 }, { "epoch": 2.4769252349558144, "grad_norm": 0.3121560513973236, "learning_rate": 8.717912750736429e-06, "loss": 0.0089, "step": 88290 }, { "epoch": 2.477205779211671, "grad_norm": 0.5248374938964844, "learning_rate": 8.713237013138822e-06, "loss": 0.007, "step": 88300 }, { "epoch": 2.477486323467527, "grad_norm": 0.030920909717679024, "learning_rate": 8.708561275541217e-06, "loss": 0.0037, "step": 88310 }, { "epoch": 2.4777668677233833, "grad_norm": 0.022809291258454323, "learning_rate": 8.703885537943612e-06, "loss": 0.0077, "step": 88320 }, { "epoch": 2.4780474119792397, "grad_norm": 0.14928747713565826, "learning_rate": 8.699209800346005e-06, "loss": 0.0253, "step": 88330 }, { "epoch": 2.478327956235096, "grad_norm": 0.8534107208251953, "learning_rate": 8.6945340627484e-06, "loss": 0.009, "step": 88340 }, { "epoch": 2.4786085004909526, "grad_norm": 0.008628501556813717, "learning_rate": 8.689858325150793e-06, "loss": 0.0098, "step": 88350 }, { "epoch": 2.4788890447468086, "grad_norm": 0.8472086191177368, "learning_rate": 8.685182587553186e-06, "loss": 0.019, "step": 88360 }, { "epoch": 2.479169589002665, "grad_norm": 0.13654163479804993, "learning_rate": 8.680506849955581e-06, "loss": 0.0121, "step": 88370 }, { "epoch": 2.4794501332585215, "grad_norm": 0.42872726917266846, "learning_rate": 8.675831112357976e-06, "loss": 0.0071, "step": 88380 }, { "epoch": 2.479730677514378, "grad_norm": 0.06480253487825394, "learning_rate": 8.671155374760369e-06, "loss": 0.0265, "step": 88390 }, { "epoch": 2.4800112217702344, "grad_norm": 0.03139558434486389, "learning_rate": 8.666479637162762e-06, "loss": 0.0309, "step": 88400 }, { "epoch": 2.480291766026091, "grad_norm": 0.0232707429677248, "learning_rate": 8.661803899565157e-06, "loss": 0.0021, "step": 88410 }, { "epoch": 2.480572310281947, "grad_norm": 0.36668142676353455, "learning_rate": 8.65712816196755e-06, "loss": 0.0115, "step": 88420 }, { "epoch": 2.4808528545378032, "grad_norm": 0.2452232837677002, "learning_rate": 8.652452424369945e-06, "loss": 0.0302, "step": 88430 }, { "epoch": 2.4811333987936597, "grad_norm": 0.019407091662287712, "learning_rate": 8.647776686772338e-06, "loss": 0.01, "step": 88440 }, { "epoch": 2.481413943049516, "grad_norm": 0.029038280248641968, "learning_rate": 8.643100949174733e-06, "loss": 0.0119, "step": 88450 }, { "epoch": 2.4816944873053726, "grad_norm": 0.12372875213623047, "learning_rate": 8.638425211577126e-06, "loss": 0.0422, "step": 88460 }, { "epoch": 2.4819750315612286, "grad_norm": 0.04508515074849129, "learning_rate": 8.633749473979521e-06, "loss": 0.0048, "step": 88470 }, { "epoch": 2.482255575817085, "grad_norm": 0.020461909472942352, "learning_rate": 8.629073736381916e-06, "loss": 0.008, "step": 88480 }, { "epoch": 2.4825361200729414, "grad_norm": 0.17418573796749115, "learning_rate": 8.624397998784309e-06, "loss": 0.0253, "step": 88490 }, { "epoch": 2.482816664328798, "grad_norm": 0.4850899577140808, "learning_rate": 8.619722261186702e-06, "loss": 0.0311, "step": 88500 }, { "epoch": 2.4830972085846543, "grad_norm": 0.6084581017494202, "learning_rate": 8.615046523589095e-06, "loss": 0.0108, "step": 88510 }, { "epoch": 2.4833777528405108, "grad_norm": 0.12177059054374695, "learning_rate": 8.61037078599149e-06, "loss": 0.0236, "step": 88520 }, { "epoch": 2.4836582970963668, "grad_norm": 0.034553833305835724, "learning_rate": 8.605695048393885e-06, "loss": 0.0158, "step": 88530 }, { "epoch": 2.483938841352223, "grad_norm": 0.0346485860645771, "learning_rate": 8.601019310796278e-06, "loss": 0.0095, "step": 88540 }, { "epoch": 2.4842193856080796, "grad_norm": 0.0370076559484005, "learning_rate": 8.596343573198673e-06, "loss": 0.0141, "step": 88550 }, { "epoch": 2.484499929863936, "grad_norm": 0.026760073378682137, "learning_rate": 8.591667835601066e-06, "loss": 0.0087, "step": 88560 }, { "epoch": 2.4847804741197925, "grad_norm": 0.09966111928224564, "learning_rate": 8.58699209800346e-06, "loss": 0.0148, "step": 88570 }, { "epoch": 2.4850610183756485, "grad_norm": 0.054741282016038895, "learning_rate": 8.582316360405856e-06, "loss": 0.025, "step": 88580 }, { "epoch": 2.485341562631505, "grad_norm": 1.941402554512024, "learning_rate": 8.577640622808249e-06, "loss": 0.0272, "step": 88590 }, { "epoch": 2.4856221068873614, "grad_norm": 0.08496920764446259, "learning_rate": 8.572964885210642e-06, "loss": 0.0098, "step": 88600 }, { "epoch": 2.485902651143218, "grad_norm": 0.42087486386299133, "learning_rate": 8.568289147613035e-06, "loss": 0.0142, "step": 88610 }, { "epoch": 2.4861831953990743, "grad_norm": 0.8275519609451294, "learning_rate": 8.56361341001543e-06, "loss": 0.0509, "step": 88620 }, { "epoch": 2.4864637396549307, "grad_norm": 0.031529851257801056, "learning_rate": 8.558937672417825e-06, "loss": 0.0136, "step": 88630 }, { "epoch": 2.486744283910787, "grad_norm": 0.06132154166698456, "learning_rate": 8.554261934820218e-06, "loss": 0.0053, "step": 88640 }, { "epoch": 2.487024828166643, "grad_norm": 0.6367399096488953, "learning_rate": 8.549586197222613e-06, "loss": 0.0079, "step": 88650 }, { "epoch": 2.4873053724224996, "grad_norm": 0.06485574692487717, "learning_rate": 8.544910459625006e-06, "loss": 0.0149, "step": 88660 }, { "epoch": 2.487585916678356, "grad_norm": 1.1475279331207275, "learning_rate": 8.5402347220274e-06, "loss": 0.0201, "step": 88670 }, { "epoch": 2.4878664609342125, "grad_norm": 0.5313805937767029, "learning_rate": 8.535558984429794e-06, "loss": 0.0306, "step": 88680 }, { "epoch": 2.4881470051900685, "grad_norm": 0.08918764442205429, "learning_rate": 8.530883246832189e-06, "loss": 0.0107, "step": 88690 }, { "epoch": 2.488427549445925, "grad_norm": 0.20943722128868103, "learning_rate": 8.526207509234582e-06, "loss": 0.0061, "step": 88700 }, { "epoch": 2.4887080937017814, "grad_norm": 0.03615710511803627, "learning_rate": 8.521531771636975e-06, "loss": 0.0102, "step": 88710 }, { "epoch": 2.488988637957638, "grad_norm": 0.15493427217006683, "learning_rate": 8.51685603403937e-06, "loss": 0.0123, "step": 88720 }, { "epoch": 2.4892691822134942, "grad_norm": 0.19572186470031738, "learning_rate": 8.512180296441765e-06, "loss": 0.0208, "step": 88730 }, { "epoch": 2.4895497264693507, "grad_norm": 0.8683943748474121, "learning_rate": 8.507504558844158e-06, "loss": 0.0147, "step": 88740 }, { "epoch": 2.489830270725207, "grad_norm": 0.5998719930648804, "learning_rate": 8.502828821246551e-06, "loss": 0.0195, "step": 88750 }, { "epoch": 2.490110814981063, "grad_norm": 0.37968000769615173, "learning_rate": 8.498153083648946e-06, "loss": 0.0132, "step": 88760 }, { "epoch": 2.4903913592369196, "grad_norm": 1.6516950130462646, "learning_rate": 8.49347734605134e-06, "loss": 0.0077, "step": 88770 }, { "epoch": 2.490671903492776, "grad_norm": 0.02712882123887539, "learning_rate": 8.488801608453734e-06, "loss": 0.0274, "step": 88780 }, { "epoch": 2.4909524477486324, "grad_norm": 0.04755774512887001, "learning_rate": 8.484125870856129e-06, "loss": 0.0125, "step": 88790 }, { "epoch": 2.491232992004489, "grad_norm": 0.014138491824269295, "learning_rate": 8.479450133258522e-06, "loss": 0.0084, "step": 88800 }, { "epoch": 2.491513536260345, "grad_norm": 0.4471966326236725, "learning_rate": 8.474774395660915e-06, "loss": 0.012, "step": 88810 }, { "epoch": 2.4917940805162013, "grad_norm": 0.33626729249954224, "learning_rate": 8.470098658063309e-06, "loss": 0.0061, "step": 88820 }, { "epoch": 2.4920746247720578, "grad_norm": 0.022470029070973396, "learning_rate": 8.465422920465705e-06, "loss": 0.0084, "step": 88830 }, { "epoch": 2.492355169027914, "grad_norm": 0.35904034972190857, "learning_rate": 8.460747182868098e-06, "loss": 0.0038, "step": 88840 }, { "epoch": 2.4926357132837706, "grad_norm": 1.9218969345092773, "learning_rate": 8.456071445270491e-06, "loss": 0.0207, "step": 88850 }, { "epoch": 2.492916257539627, "grad_norm": 0.3708198070526123, "learning_rate": 8.451395707672886e-06, "loss": 0.0242, "step": 88860 }, { "epoch": 2.493196801795483, "grad_norm": 0.5604428648948669, "learning_rate": 8.44671997007528e-06, "loss": 0.0194, "step": 88870 }, { "epoch": 2.4934773460513395, "grad_norm": 0.11665631085634232, "learning_rate": 8.442044232477674e-06, "loss": 0.0248, "step": 88880 }, { "epoch": 2.493757890307196, "grad_norm": 0.25894472002983093, "learning_rate": 8.437368494880067e-06, "loss": 0.0256, "step": 88890 }, { "epoch": 2.4940384345630524, "grad_norm": 0.3168156147003174, "learning_rate": 8.432692757282462e-06, "loss": 0.0134, "step": 88900 }, { "epoch": 2.494318978818909, "grad_norm": 0.2066260725259781, "learning_rate": 8.428017019684855e-06, "loss": 0.0387, "step": 88910 }, { "epoch": 2.494599523074765, "grad_norm": 0.03666940703988075, "learning_rate": 8.423341282087249e-06, "loss": 0.0235, "step": 88920 }, { "epoch": 2.4948800673306213, "grad_norm": 1.7967848777770996, "learning_rate": 8.418665544489643e-06, "loss": 0.0135, "step": 88930 }, { "epoch": 2.4951606115864777, "grad_norm": 0.28669846057891846, "learning_rate": 8.413989806892038e-06, "loss": 0.0173, "step": 88940 }, { "epoch": 2.495441155842334, "grad_norm": 0.06463077664375305, "learning_rate": 8.409314069294431e-06, "loss": 0.0396, "step": 88950 }, { "epoch": 2.4957217000981906, "grad_norm": 1.4070771932601929, "learning_rate": 8.404638331696825e-06, "loss": 0.0292, "step": 88960 }, { "epoch": 2.496002244354047, "grad_norm": 0.030656594783067703, "learning_rate": 8.39996259409922e-06, "loss": 0.0312, "step": 88970 }, { "epoch": 2.496282788609903, "grad_norm": 0.05261926352977753, "learning_rate": 8.395286856501614e-06, "loss": 0.0214, "step": 88980 }, { "epoch": 2.4965633328657595, "grad_norm": 1.1728874444961548, "learning_rate": 8.390611118904007e-06, "loss": 0.0287, "step": 88990 }, { "epoch": 2.496843877121616, "grad_norm": 0.31944888830184937, "learning_rate": 8.385935381306402e-06, "loss": 0.0277, "step": 89000 }, { "epoch": 2.4971244213774724, "grad_norm": 0.0425071120262146, "learning_rate": 8.381259643708795e-06, "loss": 0.0373, "step": 89010 }, { "epoch": 2.497404965633329, "grad_norm": 0.5616599917411804, "learning_rate": 8.376583906111189e-06, "loss": 0.005, "step": 89020 }, { "epoch": 2.497685509889185, "grad_norm": 0.22318650782108307, "learning_rate": 8.371908168513583e-06, "loss": 0.0084, "step": 89030 }, { "epoch": 2.4979660541450412, "grad_norm": 0.7167997360229492, "learning_rate": 8.367232430915978e-06, "loss": 0.0168, "step": 89040 }, { "epoch": 2.4982465984008977, "grad_norm": 0.5147384405136108, "learning_rate": 8.362556693318371e-06, "loss": 0.013, "step": 89050 }, { "epoch": 2.498527142656754, "grad_norm": 0.45978766679763794, "learning_rate": 8.357880955720765e-06, "loss": 0.0154, "step": 89060 }, { "epoch": 2.4988076869126106, "grad_norm": 3.633481025695801, "learning_rate": 8.35320521812316e-06, "loss": 0.0486, "step": 89070 }, { "epoch": 2.499088231168467, "grad_norm": 0.021742800250649452, "learning_rate": 8.348529480525554e-06, "loss": 0.0172, "step": 89080 }, { "epoch": 2.499368775424323, "grad_norm": 0.02538764663040638, "learning_rate": 8.343853742927947e-06, "loss": 0.0072, "step": 89090 }, { "epoch": 2.4996493196801794, "grad_norm": 0.004033722914755344, "learning_rate": 8.33917800533034e-06, "loss": 0.0064, "step": 89100 }, { "epoch": 2.499929863936036, "grad_norm": 0.039858393371105194, "learning_rate": 8.334502267732735e-06, "loss": 0.0169, "step": 89110 }, { "epoch": 2.5002104081918923, "grad_norm": 0.5837299227714539, "learning_rate": 8.329826530135129e-06, "loss": 0.0126, "step": 89120 }, { "epoch": 2.5004909524477488, "grad_norm": 0.21390609443187714, "learning_rate": 8.325150792537523e-06, "loss": 0.0211, "step": 89130 }, { "epoch": 2.5007714967036048, "grad_norm": 0.1890326887369156, "learning_rate": 8.320475054939918e-06, "loss": 0.0063, "step": 89140 }, { "epoch": 2.501052040959461, "grad_norm": 0.22516100108623505, "learning_rate": 8.315799317342311e-06, "loss": 0.0211, "step": 89150 }, { "epoch": 2.5013325852153176, "grad_norm": 1.2356516122817993, "learning_rate": 8.311123579744705e-06, "loss": 0.0321, "step": 89160 }, { "epoch": 2.501613129471174, "grad_norm": 0.25181862711906433, "learning_rate": 8.306447842147098e-06, "loss": 0.0055, "step": 89170 }, { "epoch": 2.5018936737270305, "grad_norm": 0.059064581990242004, "learning_rate": 8.301772104549493e-06, "loss": 0.0199, "step": 89180 }, { "epoch": 2.502174217982887, "grad_norm": 0.7155648469924927, "learning_rate": 8.297096366951887e-06, "loss": 0.0441, "step": 89190 }, { "epoch": 2.5024547622387434, "grad_norm": 0.1070915162563324, "learning_rate": 8.29242062935428e-06, "loss": 0.0052, "step": 89200 }, { "epoch": 2.5027353064945994, "grad_norm": 0.0707872062921524, "learning_rate": 8.287744891756675e-06, "loss": 0.028, "step": 89210 }, { "epoch": 2.503015850750456, "grad_norm": 0.0409788079559803, "learning_rate": 8.283069154159069e-06, "loss": 0.019, "step": 89220 }, { "epoch": 2.5032963950063123, "grad_norm": 0.06206517666578293, "learning_rate": 8.278393416561463e-06, "loss": 0.0064, "step": 89230 }, { "epoch": 2.5035769392621687, "grad_norm": 0.0486799031496048, "learning_rate": 8.273717678963858e-06, "loss": 0.0403, "step": 89240 }, { "epoch": 2.5038574835180247, "grad_norm": 0.9844965934753418, "learning_rate": 8.269041941366251e-06, "loss": 0.0387, "step": 89250 }, { "epoch": 2.504138027773881, "grad_norm": 0.04844869300723076, "learning_rate": 8.264366203768645e-06, "loss": 0.008, "step": 89260 }, { "epoch": 2.5044185720297376, "grad_norm": 0.1041373461484909, "learning_rate": 8.259690466171038e-06, "loss": 0.0199, "step": 89270 }, { "epoch": 2.504699116285594, "grad_norm": 1.7267972230911255, "learning_rate": 8.255014728573433e-06, "loss": 0.0464, "step": 89280 }, { "epoch": 2.5049796605414505, "grad_norm": 0.421815425157547, "learning_rate": 8.250338990975827e-06, "loss": 0.0174, "step": 89290 }, { "epoch": 2.505260204797307, "grad_norm": 0.23398910462856293, "learning_rate": 8.24566325337822e-06, "loss": 0.0099, "step": 89300 }, { "epoch": 2.5055407490531634, "grad_norm": 0.1586201936006546, "learning_rate": 8.240987515780615e-06, "loss": 0.0129, "step": 89310 }, { "epoch": 2.5058212933090194, "grad_norm": 0.8839371800422668, "learning_rate": 8.236311778183009e-06, "loss": 0.0225, "step": 89320 }, { "epoch": 2.506101837564876, "grad_norm": 0.05314404144883156, "learning_rate": 8.231636040585402e-06, "loss": 0.0157, "step": 89330 }, { "epoch": 2.5063823818207323, "grad_norm": 0.5173172354698181, "learning_rate": 8.226960302987797e-06, "loss": 0.0219, "step": 89340 }, { "epoch": 2.5066629260765887, "grad_norm": 0.016336599364876747, "learning_rate": 8.222284565390191e-06, "loss": 0.0114, "step": 89350 }, { "epoch": 2.5069434703324447, "grad_norm": 0.5322468876838684, "learning_rate": 8.217608827792585e-06, "loss": 0.0267, "step": 89360 }, { "epoch": 2.507224014588301, "grad_norm": 0.7175092101097107, "learning_rate": 8.212933090194978e-06, "loss": 0.0512, "step": 89370 }, { "epoch": 2.5075045588441576, "grad_norm": 0.06691243499517441, "learning_rate": 8.208257352597373e-06, "loss": 0.0093, "step": 89380 }, { "epoch": 2.507785103100014, "grad_norm": 1.1853187084197998, "learning_rate": 8.203581614999768e-06, "loss": 0.0127, "step": 89390 }, { "epoch": 2.5080656473558705, "grad_norm": 0.06792433559894562, "learning_rate": 8.19890587740216e-06, "loss": 0.0184, "step": 89400 }, { "epoch": 2.508346191611727, "grad_norm": 0.8197088241577148, "learning_rate": 8.194230139804554e-06, "loss": 0.0286, "step": 89410 }, { "epoch": 2.5086267358675833, "grad_norm": 0.4199291467666626, "learning_rate": 8.189554402206949e-06, "loss": 0.0098, "step": 89420 }, { "epoch": 2.5089072801234393, "grad_norm": 0.5031282901763916, "learning_rate": 8.184878664609342e-06, "loss": 0.0288, "step": 89430 }, { "epoch": 2.5091878243792958, "grad_norm": 0.8250030279159546, "learning_rate": 8.180202927011737e-06, "loss": 0.0202, "step": 89440 }, { "epoch": 2.509468368635152, "grad_norm": 0.21366697549819946, "learning_rate": 8.175527189414132e-06, "loss": 0.0148, "step": 89450 }, { "epoch": 2.5097489128910087, "grad_norm": 0.5033921599388123, "learning_rate": 8.170851451816525e-06, "loss": 0.0179, "step": 89460 }, { "epoch": 2.5100294571468647, "grad_norm": 1.4644172191619873, "learning_rate": 8.166175714218918e-06, "loss": 0.0405, "step": 89470 }, { "epoch": 2.510310001402721, "grad_norm": 0.7738339900970459, "learning_rate": 8.161499976621311e-06, "loss": 0.0253, "step": 89480 }, { "epoch": 2.5105905456585775, "grad_norm": 0.3764796257019043, "learning_rate": 8.156824239023708e-06, "loss": 0.0058, "step": 89490 }, { "epoch": 2.510871089914434, "grad_norm": 3.2267861366271973, "learning_rate": 8.1521485014261e-06, "loss": 0.0359, "step": 89500 }, { "epoch": 2.5111516341702904, "grad_norm": 0.27825653553009033, "learning_rate": 8.147472763828494e-06, "loss": 0.0205, "step": 89510 }, { "epoch": 2.511432178426147, "grad_norm": 0.396670937538147, "learning_rate": 8.142797026230889e-06, "loss": 0.0117, "step": 89520 }, { "epoch": 2.5117127226820033, "grad_norm": 0.2856099307537079, "learning_rate": 8.138121288633282e-06, "loss": 0.0138, "step": 89530 }, { "epoch": 2.5119932669378593, "grad_norm": 1.243607521057129, "learning_rate": 8.133445551035677e-06, "loss": 0.0354, "step": 89540 }, { "epoch": 2.5122738111937157, "grad_norm": 0.12029553949832916, "learning_rate": 8.12876981343807e-06, "loss": 0.0198, "step": 89550 }, { "epoch": 2.512554355449572, "grad_norm": 0.09452743083238602, "learning_rate": 8.124094075840465e-06, "loss": 0.0078, "step": 89560 }, { "epoch": 2.5128348997054286, "grad_norm": 1.377403974533081, "learning_rate": 8.119418338242858e-06, "loss": 0.0384, "step": 89570 }, { "epoch": 2.513115443961285, "grad_norm": 0.41609644889831543, "learning_rate": 8.114742600645251e-06, "loss": 0.0133, "step": 89580 }, { "epoch": 2.513395988217141, "grad_norm": 0.6125099062919617, "learning_rate": 8.110066863047648e-06, "loss": 0.0474, "step": 89590 }, { "epoch": 2.5136765324729975, "grad_norm": 0.08483798801898956, "learning_rate": 8.10539112545004e-06, "loss": 0.025, "step": 89600 }, { "epoch": 2.513957076728854, "grad_norm": 0.1890646368265152, "learning_rate": 8.100715387852434e-06, "loss": 0.03, "step": 89610 }, { "epoch": 2.5142376209847104, "grad_norm": 0.09745412319898605, "learning_rate": 8.096039650254827e-06, "loss": 0.026, "step": 89620 }, { "epoch": 2.514518165240567, "grad_norm": 0.41128233075141907, "learning_rate": 8.091363912657222e-06, "loss": 0.0178, "step": 89630 }, { "epoch": 2.5147987094964233, "grad_norm": 0.06928049772977829, "learning_rate": 8.086688175059617e-06, "loss": 0.0175, "step": 89640 }, { "epoch": 2.5150792537522797, "grad_norm": 0.5709208846092224, "learning_rate": 8.08201243746201e-06, "loss": 0.0268, "step": 89650 }, { "epoch": 2.5153597980081357, "grad_norm": 0.30261003971099854, "learning_rate": 8.077336699864405e-06, "loss": 0.02, "step": 89660 }, { "epoch": 2.515640342263992, "grad_norm": 0.3700981140136719, "learning_rate": 8.072660962266798e-06, "loss": 0.0525, "step": 89670 }, { "epoch": 2.5159208865198486, "grad_norm": 0.13887181878089905, "learning_rate": 8.067985224669191e-06, "loss": 0.0563, "step": 89680 }, { "epoch": 2.516201430775705, "grad_norm": 0.017001451924443245, "learning_rate": 8.063309487071586e-06, "loss": 0.0247, "step": 89690 }, { "epoch": 2.516481975031561, "grad_norm": 0.23883044719696045, "learning_rate": 8.05863374947398e-06, "loss": 0.028, "step": 89700 }, { "epoch": 2.5167625192874175, "grad_norm": 0.03205422684550285, "learning_rate": 8.053958011876374e-06, "loss": 0.0231, "step": 89710 }, { "epoch": 2.517043063543274, "grad_norm": 0.4505016505718231, "learning_rate": 8.049282274278767e-06, "loss": 0.0124, "step": 89720 }, { "epoch": 2.5173236077991303, "grad_norm": 2.3432106971740723, "learning_rate": 8.044606536681162e-06, "loss": 0.0354, "step": 89730 }, { "epoch": 2.5176041520549868, "grad_norm": 0.4486733376979828, "learning_rate": 8.039930799083557e-06, "loss": 0.0187, "step": 89740 }, { "epoch": 2.517884696310843, "grad_norm": 0.09560195356607437, "learning_rate": 8.03525506148595e-06, "loss": 0.0171, "step": 89750 }, { "epoch": 2.5181652405666997, "grad_norm": 0.3251282572746277, "learning_rate": 8.030579323888343e-06, "loss": 0.016, "step": 89760 }, { "epoch": 2.5184457848225557, "grad_norm": 1.1694809198379517, "learning_rate": 8.025903586290738e-06, "loss": 0.0188, "step": 89770 }, { "epoch": 2.518726329078412, "grad_norm": 0.33304867148399353, "learning_rate": 8.021227848693131e-06, "loss": 0.0474, "step": 89780 }, { "epoch": 2.5190068733342685, "grad_norm": 0.5864934325218201, "learning_rate": 8.016552111095526e-06, "loss": 0.0341, "step": 89790 }, { "epoch": 2.519287417590125, "grad_norm": 0.035159531980752945, "learning_rate": 8.01187637349792e-06, "loss": 0.0183, "step": 89800 }, { "epoch": 2.519567961845981, "grad_norm": 0.6108817458152771, "learning_rate": 8.007200635900314e-06, "loss": 0.0199, "step": 89810 }, { "epoch": 2.5198485061018374, "grad_norm": 0.0670720636844635, "learning_rate": 8.002524898302707e-06, "loss": 0.0116, "step": 89820 }, { "epoch": 2.520129050357694, "grad_norm": 0.03253411129117012, "learning_rate": 7.9978491607051e-06, "loss": 0.0088, "step": 89830 }, { "epoch": 2.5204095946135503, "grad_norm": 0.6460120677947998, "learning_rate": 7.993173423107495e-06, "loss": 0.0239, "step": 89840 }, { "epoch": 2.5206901388694067, "grad_norm": 0.2333250641822815, "learning_rate": 7.98849768550989e-06, "loss": 0.0175, "step": 89850 }, { "epoch": 2.520970683125263, "grad_norm": 0.0578199177980423, "learning_rate": 7.983821947912283e-06, "loss": 0.0119, "step": 89860 }, { "epoch": 2.5212512273811196, "grad_norm": 0.044152624905109406, "learning_rate": 7.979146210314678e-06, "loss": 0.014, "step": 89870 }, { "epoch": 2.5215317716369756, "grad_norm": 0.020997576415538788, "learning_rate": 7.974470472717071e-06, "loss": 0.0069, "step": 89880 }, { "epoch": 2.521812315892832, "grad_norm": 1.0448836088180542, "learning_rate": 7.969794735119466e-06, "loss": 0.0172, "step": 89890 }, { "epoch": 2.5220928601486885, "grad_norm": 0.060196612030267715, "learning_rate": 7.96511899752186e-06, "loss": 0.0423, "step": 89900 }, { "epoch": 2.522373404404545, "grad_norm": 0.7252294421195984, "learning_rate": 7.960443259924254e-06, "loss": 0.02, "step": 89910 }, { "epoch": 2.522653948660401, "grad_norm": 0.1995084136724472, "learning_rate": 7.955767522326647e-06, "loss": 0.0096, "step": 89920 }, { "epoch": 2.5229344929162574, "grad_norm": 0.04504738375544548, "learning_rate": 7.95109178472904e-06, "loss": 0.0303, "step": 89930 }, { "epoch": 2.523215037172114, "grad_norm": 0.2871875762939453, "learning_rate": 7.946416047131435e-06, "loss": 0.0391, "step": 89940 }, { "epoch": 2.5234955814279703, "grad_norm": 0.2172039896249771, "learning_rate": 7.94174030953383e-06, "loss": 0.0364, "step": 89950 }, { "epoch": 2.5237761256838267, "grad_norm": 0.30435308814048767, "learning_rate": 7.937064571936223e-06, "loss": 0.0212, "step": 89960 }, { "epoch": 2.524056669939683, "grad_norm": 0.2844463288784027, "learning_rate": 7.932388834338618e-06, "loss": 0.0425, "step": 89970 }, { "epoch": 2.5243372141955396, "grad_norm": 0.6162502765655518, "learning_rate": 7.927713096741011e-06, "loss": 0.0124, "step": 89980 }, { "epoch": 2.5246177584513956, "grad_norm": 0.03368868678808212, "learning_rate": 7.923037359143406e-06, "loss": 0.0079, "step": 89990 }, { "epoch": 2.524898302707252, "grad_norm": 0.08775654435157776, "learning_rate": 7.918361621545799e-06, "loss": 0.0312, "step": 90000 }, { "epoch": 2.5251788469631085, "grad_norm": 0.026540953665971756, "learning_rate": 7.913685883948194e-06, "loss": 0.0089, "step": 90010 }, { "epoch": 2.525459391218965, "grad_norm": 0.055078186094760895, "learning_rate": 7.909010146350587e-06, "loss": 0.0139, "step": 90020 }, { "epoch": 2.525739935474821, "grad_norm": 0.030700301751494408, "learning_rate": 7.90433440875298e-06, "loss": 0.0418, "step": 90030 }, { "epoch": 2.5260204797306773, "grad_norm": 0.06565235555171967, "learning_rate": 7.899658671155375e-06, "loss": 0.0073, "step": 90040 }, { "epoch": 2.526301023986534, "grad_norm": 0.018180910497903824, "learning_rate": 7.89498293355777e-06, "loss": 0.0188, "step": 90050 }, { "epoch": 2.52658156824239, "grad_norm": 1.459062099456787, "learning_rate": 7.890307195960163e-06, "loss": 0.0555, "step": 90060 }, { "epoch": 2.5268621124982467, "grad_norm": 0.022934896871447563, "learning_rate": 7.885631458362556e-06, "loss": 0.007, "step": 90070 }, { "epoch": 2.527142656754103, "grad_norm": 0.09247560054063797, "learning_rate": 7.880955720764951e-06, "loss": 0.032, "step": 90080 }, { "epoch": 2.5274232010099595, "grad_norm": 1.1226277351379395, "learning_rate": 7.876279983167344e-06, "loss": 0.0403, "step": 90090 }, { "epoch": 2.5277037452658155, "grad_norm": 0.026818817481398582, "learning_rate": 7.871604245569739e-06, "loss": 0.0244, "step": 90100 }, { "epoch": 2.527984289521672, "grad_norm": 0.26424506306648254, "learning_rate": 7.866928507972134e-06, "loss": 0.0378, "step": 90110 }, { "epoch": 2.5282648337775284, "grad_norm": 0.10461731255054474, "learning_rate": 7.862252770374527e-06, "loss": 0.0066, "step": 90120 }, { "epoch": 2.528545378033385, "grad_norm": 0.08157049119472504, "learning_rate": 7.85757703277692e-06, "loss": 0.0358, "step": 90130 }, { "epoch": 2.528825922289241, "grad_norm": 0.1836748719215393, "learning_rate": 7.852901295179315e-06, "loss": 0.0342, "step": 90140 }, { "epoch": 2.5291064665450973, "grad_norm": 0.0956205353140831, "learning_rate": 7.84822555758171e-06, "loss": 0.0094, "step": 90150 }, { "epoch": 2.5293870108009537, "grad_norm": 0.05715286359190941, "learning_rate": 7.843549819984103e-06, "loss": 0.0324, "step": 90160 }, { "epoch": 2.52966755505681, "grad_norm": 0.025383900851011276, "learning_rate": 7.838874082386496e-06, "loss": 0.0243, "step": 90170 }, { "epoch": 2.5299480993126666, "grad_norm": 0.028159234672784805, "learning_rate": 7.834198344788891e-06, "loss": 0.042, "step": 90180 }, { "epoch": 2.530228643568523, "grad_norm": 0.048867933452129364, "learning_rate": 7.829522607191284e-06, "loss": 0.0116, "step": 90190 }, { "epoch": 2.5305091878243795, "grad_norm": 1.3842402696609497, "learning_rate": 7.824846869593679e-06, "loss": 0.0206, "step": 90200 }, { "epoch": 2.5307897320802355, "grad_norm": 0.8462294936180115, "learning_rate": 7.820171131996072e-06, "loss": 0.021, "step": 90210 }, { "epoch": 2.531070276336092, "grad_norm": 0.30769652128219604, "learning_rate": 7.815495394398467e-06, "loss": 0.0242, "step": 90220 }, { "epoch": 2.5313508205919484, "grad_norm": 0.18569415807724, "learning_rate": 7.81081965680086e-06, "loss": 0.0086, "step": 90230 }, { "epoch": 2.531631364847805, "grad_norm": 0.023875156417489052, "learning_rate": 7.806143919203253e-06, "loss": 0.0053, "step": 90240 }, { "epoch": 2.5319119091036613, "grad_norm": 0.1683337390422821, "learning_rate": 7.80146818160565e-06, "loss": 0.0076, "step": 90250 }, { "epoch": 2.5321924533595173, "grad_norm": 0.042125504463911057, "learning_rate": 7.796792444008043e-06, "loss": 0.0087, "step": 90260 }, { "epoch": 2.5324729976153737, "grad_norm": 2.434014081954956, "learning_rate": 7.792116706410436e-06, "loss": 0.0268, "step": 90270 }, { "epoch": 2.53275354187123, "grad_norm": 0.006165077909827232, "learning_rate": 7.78744096881283e-06, "loss": 0.0158, "step": 90280 }, { "epoch": 2.5330340861270866, "grad_norm": 0.17016123235225677, "learning_rate": 7.782765231215224e-06, "loss": 0.025, "step": 90290 }, { "epoch": 2.533314630382943, "grad_norm": 0.0203006099909544, "learning_rate": 7.77808949361762e-06, "loss": 0.0551, "step": 90300 }, { "epoch": 2.5335951746387995, "grad_norm": 6.325356483459473, "learning_rate": 7.773413756020012e-06, "loss": 0.0439, "step": 90310 }, { "epoch": 2.533875718894656, "grad_norm": 0.05867696553468704, "learning_rate": 7.768738018422407e-06, "loss": 0.0037, "step": 90320 }, { "epoch": 2.534156263150512, "grad_norm": 3.481571674346924, "learning_rate": 7.7640622808248e-06, "loss": 0.0284, "step": 90330 }, { "epoch": 2.5344368074063683, "grad_norm": 0.30524688959121704, "learning_rate": 7.759386543227193e-06, "loss": 0.0143, "step": 90340 }, { "epoch": 2.534717351662225, "grad_norm": 1.7755398750305176, "learning_rate": 7.754710805629588e-06, "loss": 0.0591, "step": 90350 }, { "epoch": 2.5349978959180812, "grad_norm": 0.007563039194792509, "learning_rate": 7.750035068031983e-06, "loss": 0.0487, "step": 90360 }, { "epoch": 2.535278440173937, "grad_norm": 0.18633607029914856, "learning_rate": 7.745359330434376e-06, "loss": 0.0425, "step": 90370 }, { "epoch": 2.5355589844297937, "grad_norm": 0.03377581015229225, "learning_rate": 7.74068359283677e-06, "loss": 0.0207, "step": 90380 }, { "epoch": 2.53583952868565, "grad_norm": 0.026208361610770226, "learning_rate": 7.736007855239164e-06, "loss": 0.0151, "step": 90390 }, { "epoch": 2.5361200729415065, "grad_norm": 1.3650119304656982, "learning_rate": 7.73133211764156e-06, "loss": 0.0395, "step": 90400 }, { "epoch": 2.536400617197363, "grad_norm": 3.606630563735962, "learning_rate": 7.726656380043952e-06, "loss": 0.0491, "step": 90410 }, { "epoch": 2.5366811614532194, "grad_norm": 1.0845141410827637, "learning_rate": 7.721980642446346e-06, "loss": 0.0186, "step": 90420 }, { "epoch": 2.536961705709076, "grad_norm": 0.22409868240356445, "learning_rate": 7.71730490484874e-06, "loss": 0.0177, "step": 90430 }, { "epoch": 2.537242249964932, "grad_norm": 1.559032678604126, "learning_rate": 7.712629167251134e-06, "loss": 0.0225, "step": 90440 }, { "epoch": 2.5375227942207883, "grad_norm": 0.030037080869078636, "learning_rate": 7.707953429653528e-06, "loss": 0.0099, "step": 90450 }, { "epoch": 2.5378033384766447, "grad_norm": 0.32337450981140137, "learning_rate": 7.703277692055923e-06, "loss": 0.0186, "step": 90460 }, { "epoch": 2.538083882732501, "grad_norm": 0.03972422704100609, "learning_rate": 7.698601954458316e-06, "loss": 0.0127, "step": 90470 }, { "epoch": 2.538364426988357, "grad_norm": 0.21100327372550964, "learning_rate": 7.69392621686071e-06, "loss": 0.0071, "step": 90480 }, { "epoch": 2.5386449712442136, "grad_norm": 0.07270756363868713, "learning_rate": 7.689250479263103e-06, "loss": 0.0123, "step": 90490 }, { "epoch": 2.53892551550007, "grad_norm": 0.021547269076108932, "learning_rate": 7.6845747416655e-06, "loss": 0.0288, "step": 90500 }, { "epoch": 2.5392060597559265, "grad_norm": 0.06655029952526093, "learning_rate": 7.679899004067892e-06, "loss": 0.0166, "step": 90510 }, { "epoch": 2.539486604011783, "grad_norm": 0.038840558379888535, "learning_rate": 7.675223266470286e-06, "loss": 0.0309, "step": 90520 }, { "epoch": 2.5397671482676394, "grad_norm": 0.04341644048690796, "learning_rate": 7.67054752887268e-06, "loss": 0.0174, "step": 90530 }, { "epoch": 2.540047692523496, "grad_norm": 0.18906264007091522, "learning_rate": 7.665871791275074e-06, "loss": 0.0303, "step": 90540 }, { "epoch": 2.540328236779352, "grad_norm": 0.23476730287075043, "learning_rate": 7.661196053677468e-06, "loss": 0.0142, "step": 90550 }, { "epoch": 2.5406087810352083, "grad_norm": 0.12569200992584229, "learning_rate": 7.656520316079863e-06, "loss": 0.0187, "step": 90560 }, { "epoch": 2.5408893252910647, "grad_norm": 1.303977608680725, "learning_rate": 7.651844578482256e-06, "loss": 0.0207, "step": 90570 }, { "epoch": 2.541169869546921, "grad_norm": 0.03774323686957359, "learning_rate": 7.64716884088465e-06, "loss": 0.0272, "step": 90580 }, { "epoch": 2.541450413802777, "grad_norm": 0.1172279417514801, "learning_rate": 7.642493103287043e-06, "loss": 0.02, "step": 90590 }, { "epoch": 2.5417309580586336, "grad_norm": 0.5558121204376221, "learning_rate": 7.637817365689438e-06, "loss": 0.013, "step": 90600 }, { "epoch": 2.54201150231449, "grad_norm": 0.4384537935256958, "learning_rate": 7.633141628091832e-06, "loss": 0.0109, "step": 90610 }, { "epoch": 2.5422920465703465, "grad_norm": 0.039106786251068115, "learning_rate": 7.6284658904942255e-06, "loss": 0.009, "step": 90620 }, { "epoch": 2.542572590826203, "grad_norm": 0.008945013396441936, "learning_rate": 7.62379015289662e-06, "loss": 0.0089, "step": 90630 }, { "epoch": 2.5428531350820593, "grad_norm": 0.02191852033138275, "learning_rate": 7.619114415299014e-06, "loss": 0.0206, "step": 90640 }, { "epoch": 2.543133679337916, "grad_norm": 1.2704260349273682, "learning_rate": 7.6144386777014076e-06, "loss": 0.0135, "step": 90650 }, { "epoch": 2.543414223593772, "grad_norm": 0.05870084464550018, "learning_rate": 7.6097629401038016e-06, "loss": 0.0207, "step": 90660 }, { "epoch": 2.5436947678496282, "grad_norm": 0.01551748439669609, "learning_rate": 7.605087202506196e-06, "loss": 0.0098, "step": 90670 }, { "epoch": 2.5439753121054847, "grad_norm": 0.03108491376042366, "learning_rate": 7.6004114649085896e-06, "loss": 0.0261, "step": 90680 }, { "epoch": 2.544255856361341, "grad_norm": 0.027804501354694366, "learning_rate": 7.5957357273109836e-06, "loss": 0.0112, "step": 90690 }, { "epoch": 2.544536400617197, "grad_norm": 1.5932960510253906, "learning_rate": 7.591059989713378e-06, "loss": 0.0419, "step": 90700 }, { "epoch": 2.5448169448730535, "grad_norm": 0.008101840503513813, "learning_rate": 7.5863842521157716e-06, "loss": 0.005, "step": 90710 }, { "epoch": 2.54509748912891, "grad_norm": 0.23371067643165588, "learning_rate": 7.5817085145181656e-06, "loss": 0.0105, "step": 90720 }, { "epoch": 2.5453780333847664, "grad_norm": 0.5012916922569275, "learning_rate": 7.577032776920559e-06, "loss": 0.0114, "step": 90730 }, { "epoch": 2.545658577640623, "grad_norm": 1.5424295663833618, "learning_rate": 7.572357039322954e-06, "loss": 0.0431, "step": 90740 }, { "epoch": 2.5459391218964793, "grad_norm": 0.008795247413218021, "learning_rate": 7.567681301725348e-06, "loss": 0.0283, "step": 90750 }, { "epoch": 2.5462196661523357, "grad_norm": 0.3658035099506378, "learning_rate": 7.563005564127741e-06, "loss": 0.0119, "step": 90760 }, { "epoch": 2.5465002104081917, "grad_norm": 0.2873697280883789, "learning_rate": 7.5583298265301364e-06, "loss": 0.016, "step": 90770 }, { "epoch": 2.546780754664048, "grad_norm": 0.01832905039191246, "learning_rate": 7.55365408893253e-06, "loss": 0.0135, "step": 90780 }, { "epoch": 2.5470612989199046, "grad_norm": 0.3534705638885498, "learning_rate": 7.548978351334924e-06, "loss": 0.0066, "step": 90790 }, { "epoch": 2.547341843175761, "grad_norm": 0.0036031140480190516, "learning_rate": 7.544302613737317e-06, "loss": 0.0375, "step": 90800 }, { "epoch": 2.547622387431617, "grad_norm": 0.029842160642147064, "learning_rate": 7.539626876139712e-06, "loss": 0.037, "step": 90810 }, { "epoch": 2.5479029316874735, "grad_norm": 0.2825281023979187, "learning_rate": 7.534951138542106e-06, "loss": 0.0171, "step": 90820 }, { "epoch": 2.54818347594333, "grad_norm": 0.544715404510498, "learning_rate": 7.530275400944499e-06, "loss": 0.0421, "step": 90830 }, { "epoch": 2.5484640201991864, "grad_norm": 0.0036164249759167433, "learning_rate": 7.525599663346894e-06, "loss": 0.0084, "step": 90840 }, { "epoch": 2.548744564455043, "grad_norm": 0.02520246058702469, "learning_rate": 7.520923925749288e-06, "loss": 0.0173, "step": 90850 }, { "epoch": 2.5490251087108993, "grad_norm": 0.5603312253952026, "learning_rate": 7.516248188151681e-06, "loss": 0.0081, "step": 90860 }, { "epoch": 2.5493056529667557, "grad_norm": 0.13856282830238342, "learning_rate": 7.511572450554075e-06, "loss": 0.0102, "step": 90870 }, { "epoch": 2.5495861972226117, "grad_norm": 0.026365842670202255, "learning_rate": 7.50689671295647e-06, "loss": 0.0055, "step": 90880 }, { "epoch": 2.549866741478468, "grad_norm": 0.27178865671157837, "learning_rate": 7.502220975358864e-06, "loss": 0.0157, "step": 90890 }, { "epoch": 2.5501472857343246, "grad_norm": 0.5921579599380493, "learning_rate": 7.497545237761257e-06, "loss": 0.0297, "step": 90900 }, { "epoch": 2.550427829990181, "grad_norm": 0.023676633834838867, "learning_rate": 7.492869500163652e-06, "loss": 0.0093, "step": 90910 }, { "epoch": 2.5507083742460375, "grad_norm": 0.09488396346569061, "learning_rate": 7.488193762566046e-06, "loss": 0.006, "step": 90920 }, { "epoch": 2.5509889185018935, "grad_norm": 1.8554344177246094, "learning_rate": 7.483518024968439e-06, "loss": 0.0371, "step": 90930 }, { "epoch": 2.55126946275775, "grad_norm": 0.031621500849723816, "learning_rate": 7.478842287370833e-06, "loss": 0.0155, "step": 90940 }, { "epoch": 2.5515500070136063, "grad_norm": 0.22523099184036255, "learning_rate": 7.474166549773228e-06, "loss": 0.0105, "step": 90950 }, { "epoch": 2.551830551269463, "grad_norm": 0.7694568634033203, "learning_rate": 7.469490812175621e-06, "loss": 0.0509, "step": 90960 }, { "epoch": 2.5521110955253192, "grad_norm": 1.3297327756881714, "learning_rate": 7.464815074578015e-06, "loss": 0.0475, "step": 90970 }, { "epoch": 2.5523916397811757, "grad_norm": 0.36183857917785645, "learning_rate": 7.46013933698041e-06, "loss": 0.0367, "step": 90980 }, { "epoch": 2.552672184037032, "grad_norm": 0.06383152306079865, "learning_rate": 7.455463599382803e-06, "loss": 0.0096, "step": 90990 }, { "epoch": 2.552952728292888, "grad_norm": 0.423723965883255, "learning_rate": 7.450787861785197e-06, "loss": 0.0381, "step": 91000 }, { "epoch": 2.5532332725487445, "grad_norm": 0.055496104061603546, "learning_rate": 7.44611212418759e-06, "loss": 0.0194, "step": 91010 }, { "epoch": 2.553513816804601, "grad_norm": 0.03087725304067135, "learning_rate": 7.441436386589986e-06, "loss": 0.0119, "step": 91020 }, { "epoch": 2.5537943610604574, "grad_norm": 0.017683790996670723, "learning_rate": 7.436760648992379e-06, "loss": 0.0405, "step": 91030 }, { "epoch": 2.5540749053163134, "grad_norm": 0.7847134470939636, "learning_rate": 7.432084911394773e-06, "loss": 0.052, "step": 91040 }, { "epoch": 2.55435544957217, "grad_norm": 0.2990989089012146, "learning_rate": 7.427409173797168e-06, "loss": 0.0059, "step": 91050 }, { "epoch": 2.5546359938280263, "grad_norm": 0.4282538890838623, "learning_rate": 7.422733436199561e-06, "loss": 0.0178, "step": 91060 }, { "epoch": 2.5549165380838827, "grad_norm": 0.14953601360321045, "learning_rate": 7.418057698601955e-06, "loss": 0.021, "step": 91070 }, { "epoch": 2.555197082339739, "grad_norm": 0.14008039236068726, "learning_rate": 7.413381961004348e-06, "loss": 0.0213, "step": 91080 }, { "epoch": 2.5554776265955956, "grad_norm": 0.02551398053765297, "learning_rate": 7.408706223406743e-06, "loss": 0.0198, "step": 91090 }, { "epoch": 2.555758170851452, "grad_norm": 0.04539487510919571, "learning_rate": 7.404030485809137e-06, "loss": 0.0089, "step": 91100 }, { "epoch": 2.556038715107308, "grad_norm": 1.7567734718322754, "learning_rate": 7.39935474821153e-06, "loss": 0.0395, "step": 91110 }, { "epoch": 2.5563192593631645, "grad_norm": 0.4000195860862732, "learning_rate": 7.394679010613925e-06, "loss": 0.0071, "step": 91120 }, { "epoch": 2.556599803619021, "grad_norm": 0.07262744754552841, "learning_rate": 7.390003273016319e-06, "loss": 0.0324, "step": 91130 }, { "epoch": 2.5568803478748774, "grad_norm": 0.03935937583446503, "learning_rate": 7.385327535418712e-06, "loss": 0.0358, "step": 91140 }, { "epoch": 2.5571608921307334, "grad_norm": 0.034721601754426956, "learning_rate": 7.380651797821106e-06, "loss": 0.011, "step": 91150 }, { "epoch": 2.55744143638659, "grad_norm": 0.03224663808941841, "learning_rate": 7.375976060223501e-06, "loss": 0.029, "step": 91160 }, { "epoch": 2.5577219806424463, "grad_norm": 5.519642353057861, "learning_rate": 7.371300322625895e-06, "loss": 0.0337, "step": 91170 }, { "epoch": 2.5580025248983027, "grad_norm": 0.06799189001321793, "learning_rate": 7.366624585028288e-06, "loss": 0.0142, "step": 91180 }, { "epoch": 2.558283069154159, "grad_norm": 2.5478451251983643, "learning_rate": 7.361948847430683e-06, "loss": 0.0483, "step": 91190 }, { "epoch": 2.5585636134100156, "grad_norm": 0.07927072048187256, "learning_rate": 7.357273109833077e-06, "loss": 0.0108, "step": 91200 }, { "epoch": 2.558844157665872, "grad_norm": 0.47289520502090454, "learning_rate": 7.35259737223547e-06, "loss": 0.0163, "step": 91210 }, { "epoch": 2.559124701921728, "grad_norm": 0.0500507578253746, "learning_rate": 7.347921634637865e-06, "loss": 0.0141, "step": 91220 }, { "epoch": 2.5594052461775845, "grad_norm": 0.017193365842103958, "learning_rate": 7.343245897040259e-06, "loss": 0.0025, "step": 91230 }, { "epoch": 2.559685790433441, "grad_norm": 0.0349729061126709, "learning_rate": 7.338570159442652e-06, "loss": 0.0066, "step": 91240 }, { "epoch": 2.5599663346892974, "grad_norm": 0.18490315973758698, "learning_rate": 7.333894421845046e-06, "loss": 0.0219, "step": 91250 }, { "epoch": 2.5602468789451533, "grad_norm": 0.031394049525260925, "learning_rate": 7.329218684247441e-06, "loss": 0.0363, "step": 91260 }, { "epoch": 2.56052742320101, "grad_norm": 0.1596321016550064, "learning_rate": 7.324542946649834e-06, "loss": 0.0055, "step": 91270 }, { "epoch": 2.5608079674568662, "grad_norm": 0.052633512765169144, "learning_rate": 7.319867209052228e-06, "loss": 0.0074, "step": 91280 }, { "epoch": 2.5610885117127227, "grad_norm": 1.0833107233047485, "learning_rate": 7.315191471454623e-06, "loss": 0.021, "step": 91290 }, { "epoch": 2.561369055968579, "grad_norm": 0.050654273480176926, "learning_rate": 7.310515733857017e-06, "loss": 0.0353, "step": 91300 }, { "epoch": 2.5616496002244356, "grad_norm": 0.41671425104141235, "learning_rate": 7.30583999625941e-06, "loss": 0.0066, "step": 91310 }, { "epoch": 2.561930144480292, "grad_norm": 0.061563003808259964, "learning_rate": 7.301164258661804e-06, "loss": 0.0062, "step": 91320 }, { "epoch": 2.562210688736148, "grad_norm": 0.09840133041143417, "learning_rate": 7.296488521064199e-06, "loss": 0.0097, "step": 91330 }, { "epoch": 2.5624912329920044, "grad_norm": 0.014250985346734524, "learning_rate": 7.291812783466592e-06, "loss": 0.0149, "step": 91340 }, { "epoch": 2.562771777247861, "grad_norm": 0.007159409113228321, "learning_rate": 7.287137045868986e-06, "loss": 0.0077, "step": 91350 }, { "epoch": 2.5630523215037173, "grad_norm": 0.386238157749176, "learning_rate": 7.282461308271381e-06, "loss": 0.0058, "step": 91360 }, { "epoch": 2.5633328657595733, "grad_norm": 0.01682254858314991, "learning_rate": 7.277785570673774e-06, "loss": 0.0313, "step": 91370 }, { "epoch": 2.5636134100154297, "grad_norm": 0.016386374831199646, "learning_rate": 7.273109833076168e-06, "loss": 0.0085, "step": 91380 }, { "epoch": 2.563893954271286, "grad_norm": 0.019947784021496773, "learning_rate": 7.268434095478561e-06, "loss": 0.0197, "step": 91390 }, { "epoch": 2.5641744985271426, "grad_norm": 0.02036820724606514, "learning_rate": 7.263758357880957e-06, "loss": 0.0138, "step": 91400 }, { "epoch": 2.564455042782999, "grad_norm": 0.5190779566764832, "learning_rate": 7.25908262028335e-06, "loss": 0.0167, "step": 91410 }, { "epoch": 2.5647355870388555, "grad_norm": 0.30279162526130676, "learning_rate": 7.254406882685744e-06, "loss": 0.0062, "step": 91420 }, { "epoch": 2.565016131294712, "grad_norm": 0.2671058475971222, "learning_rate": 7.249731145088139e-06, "loss": 0.0225, "step": 91430 }, { "epoch": 2.565296675550568, "grad_norm": 0.2756575345993042, "learning_rate": 7.245055407490532e-06, "loss": 0.0147, "step": 91440 }, { "epoch": 2.5655772198064244, "grad_norm": 0.025848325341939926, "learning_rate": 7.240379669892926e-06, "loss": 0.008, "step": 91450 }, { "epoch": 2.565857764062281, "grad_norm": 0.4975709617137909, "learning_rate": 7.235703932295319e-06, "loss": 0.0597, "step": 91460 }, { "epoch": 2.5661383083181373, "grad_norm": 0.011314318515360355, "learning_rate": 7.231028194697714e-06, "loss": 0.0356, "step": 91470 }, { "epoch": 2.5664188525739933, "grad_norm": 0.44547805190086365, "learning_rate": 7.226352457100108e-06, "loss": 0.0157, "step": 91480 }, { "epoch": 2.5666993968298497, "grad_norm": 0.3991676867008209, "learning_rate": 7.221676719502501e-06, "loss": 0.0138, "step": 91490 }, { "epoch": 2.566979941085706, "grad_norm": 0.03860683739185333, "learning_rate": 7.217000981904896e-06, "loss": 0.0262, "step": 91500 }, { "epoch": 2.5672604853415626, "grad_norm": 0.07473082095384598, "learning_rate": 7.21232524430729e-06, "loss": 0.0308, "step": 91510 }, { "epoch": 2.567541029597419, "grad_norm": 0.09896460920572281, "learning_rate": 7.207649506709683e-06, "loss": 0.01, "step": 91520 }, { "epoch": 2.5678215738532755, "grad_norm": 0.5550875067710876, "learning_rate": 7.202973769112077e-06, "loss": 0.0179, "step": 91530 }, { "epoch": 2.568102118109132, "grad_norm": 0.03526690974831581, "learning_rate": 7.198298031514472e-06, "loss": 0.0074, "step": 91540 }, { "epoch": 2.568382662364988, "grad_norm": 0.527099609375, "learning_rate": 7.193622293916866e-06, "loss": 0.0144, "step": 91550 }, { "epoch": 2.5686632066208444, "grad_norm": 0.02250625379383564, "learning_rate": 7.188946556319259e-06, "loss": 0.0071, "step": 91560 }, { "epoch": 2.568943750876701, "grad_norm": 0.9722059369087219, "learning_rate": 7.184270818721654e-06, "loss": 0.0288, "step": 91570 }, { "epoch": 2.5692242951325572, "grad_norm": 0.06598526239395142, "learning_rate": 7.179595081124048e-06, "loss": 0.0177, "step": 91580 }, { "epoch": 2.5695048393884137, "grad_norm": 0.04996892437338829, "learning_rate": 7.174919343526441e-06, "loss": 0.0147, "step": 91590 }, { "epoch": 2.5697853836442697, "grad_norm": 0.07077943533658981, "learning_rate": 7.170243605928835e-06, "loss": 0.0036, "step": 91600 }, { "epoch": 2.570065927900126, "grad_norm": 0.04770023003220558, "learning_rate": 7.16556786833123e-06, "loss": 0.002, "step": 91610 }, { "epoch": 2.5703464721559826, "grad_norm": 0.042240776121616364, "learning_rate": 7.160892130733623e-06, "loss": 0.0125, "step": 91620 }, { "epoch": 2.570627016411839, "grad_norm": 0.17587722837924957, "learning_rate": 7.156216393136017e-06, "loss": 0.0244, "step": 91630 }, { "epoch": 2.5709075606676954, "grad_norm": 1.450438141822815, "learning_rate": 7.151540655538412e-06, "loss": 0.0145, "step": 91640 }, { "epoch": 2.571188104923552, "grad_norm": 3.88283371925354, "learning_rate": 7.146864917940805e-06, "loss": 0.0314, "step": 91650 }, { "epoch": 2.5714686491794083, "grad_norm": 1.440117359161377, "learning_rate": 7.142189180343199e-06, "loss": 0.0182, "step": 91660 }, { "epoch": 2.5717491934352643, "grad_norm": 1.2835887670516968, "learning_rate": 7.137513442745592e-06, "loss": 0.0233, "step": 91670 }, { "epoch": 2.5720297376911208, "grad_norm": 0.21120700240135193, "learning_rate": 7.132837705147988e-06, "loss": 0.0412, "step": 91680 }, { "epoch": 2.572310281946977, "grad_norm": 0.12339644134044647, "learning_rate": 7.128161967550381e-06, "loss": 0.0338, "step": 91690 }, { "epoch": 2.5725908262028336, "grad_norm": 0.027355771511793137, "learning_rate": 7.123486229952775e-06, "loss": 0.0337, "step": 91700 }, { "epoch": 2.5728713704586896, "grad_norm": 0.08186887949705124, "learning_rate": 7.11881049235517e-06, "loss": 0.0297, "step": 91710 }, { "epoch": 2.573151914714546, "grad_norm": 0.5969387888908386, "learning_rate": 7.114134754757563e-06, "loss": 0.0075, "step": 91720 }, { "epoch": 2.5734324589704025, "grad_norm": 0.3970440924167633, "learning_rate": 7.109459017159957e-06, "loss": 0.0144, "step": 91730 }, { "epoch": 2.573713003226259, "grad_norm": 0.18485122919082642, "learning_rate": 7.10478327956235e-06, "loss": 0.0113, "step": 91740 }, { "epoch": 2.5739935474821154, "grad_norm": 1.8485407829284668, "learning_rate": 7.100107541964745e-06, "loss": 0.0054, "step": 91750 }, { "epoch": 2.574274091737972, "grad_norm": 0.24217048287391663, "learning_rate": 7.095431804367139e-06, "loss": 0.0173, "step": 91760 }, { "epoch": 2.5745546359938283, "grad_norm": 0.036953944712877274, "learning_rate": 7.090756066769532e-06, "loss": 0.0148, "step": 91770 }, { "epoch": 2.5748351802496843, "grad_norm": 0.06446743756532669, "learning_rate": 7.086080329171928e-06, "loss": 0.029, "step": 91780 }, { "epoch": 2.5751157245055407, "grad_norm": 0.072475366294384, "learning_rate": 7.081404591574321e-06, "loss": 0.0138, "step": 91790 }, { "epoch": 2.575396268761397, "grad_norm": 0.034223753958940506, "learning_rate": 7.076728853976715e-06, "loss": 0.0069, "step": 91800 }, { "epoch": 2.5756768130172536, "grad_norm": 0.2316376268863678, "learning_rate": 7.07205311637911e-06, "loss": 0.0243, "step": 91810 }, { "epoch": 2.5759573572731096, "grad_norm": 0.013834308832883835, "learning_rate": 7.067377378781503e-06, "loss": 0.0311, "step": 91820 }, { "epoch": 2.576237901528966, "grad_norm": 0.20349355041980743, "learning_rate": 7.062701641183897e-06, "loss": 0.0193, "step": 91830 }, { "epoch": 2.5765184457848225, "grad_norm": 2.674940347671509, "learning_rate": 7.0580259035862904e-06, "loss": 0.0084, "step": 91840 }, { "epoch": 2.576798990040679, "grad_norm": 0.004753796383738518, "learning_rate": 7.053350165988685e-06, "loss": 0.0152, "step": 91850 }, { "epoch": 2.5770795342965354, "grad_norm": 0.023151393979787827, "learning_rate": 7.048674428391079e-06, "loss": 0.0039, "step": 91860 }, { "epoch": 2.577360078552392, "grad_norm": 0.02179545722901821, "learning_rate": 7.0439986907934724e-06, "loss": 0.0161, "step": 91870 }, { "epoch": 2.5776406228082482, "grad_norm": 0.02100895531475544, "learning_rate": 7.039322953195867e-06, "loss": 0.0211, "step": 91880 }, { "epoch": 2.5779211670641042, "grad_norm": 0.627221941947937, "learning_rate": 7.034647215598261e-06, "loss": 0.0084, "step": 91890 }, { "epoch": 2.5782017113199607, "grad_norm": 0.016375329345464706, "learning_rate": 7.0299714780006545e-06, "loss": 0.0449, "step": 91900 }, { "epoch": 2.578482255575817, "grad_norm": 0.2872392535209656, "learning_rate": 7.0252957404030485e-06, "loss": 0.0086, "step": 91910 }, { "epoch": 2.5787627998316736, "grad_norm": 0.0449407696723938, "learning_rate": 7.020620002805443e-06, "loss": 0.0371, "step": 91920 }, { "epoch": 2.5790433440875296, "grad_norm": 0.18886272609233856, "learning_rate": 7.015944265207837e-06, "loss": 0.0239, "step": 91930 }, { "epoch": 2.579323888343386, "grad_norm": 1.4860315322875977, "learning_rate": 7.0112685276102305e-06, "loss": 0.0035, "step": 91940 }, { "epoch": 2.5796044325992424, "grad_norm": 0.03391376510262489, "learning_rate": 7.006592790012625e-06, "loss": 0.017, "step": 91950 }, { "epoch": 2.579884976855099, "grad_norm": 0.01959797739982605, "learning_rate": 7.001917052415019e-06, "loss": 0.0099, "step": 91960 }, { "epoch": 2.5801655211109553, "grad_norm": 0.24931219220161438, "learning_rate": 6.9972413148174125e-06, "loss": 0.0228, "step": 91970 }, { "epoch": 2.5804460653668118, "grad_norm": 0.034310080111026764, "learning_rate": 6.9925655772198065e-06, "loss": 0.0247, "step": 91980 }, { "epoch": 2.580726609622668, "grad_norm": 0.3105641305446625, "learning_rate": 6.987889839622201e-06, "loss": 0.0103, "step": 91990 }, { "epoch": 2.581007153878524, "grad_norm": 1.9607118368148804, "learning_rate": 6.9832141020245945e-06, "loss": 0.02, "step": 92000 }, { "epoch": 2.5812876981343806, "grad_norm": 2.1159069538116455, "learning_rate": 6.9785383644269885e-06, "loss": 0.0283, "step": 92010 }, { "epoch": 2.581568242390237, "grad_norm": 0.023425934836268425, "learning_rate": 6.973862626829383e-06, "loss": 0.0108, "step": 92020 }, { "epoch": 2.5818487866460935, "grad_norm": 0.02295934408903122, "learning_rate": 6.9691868892317765e-06, "loss": 0.0387, "step": 92030 }, { "epoch": 2.5821293309019495, "grad_norm": 0.022962333634495735, "learning_rate": 6.9645111516341705e-06, "loss": 0.045, "step": 92040 }, { "epoch": 2.582409875157806, "grad_norm": 0.03598075360059738, "learning_rate": 6.959835414036564e-06, "loss": 0.0387, "step": 92050 }, { "epoch": 2.5826904194136624, "grad_norm": 0.06092502549290657, "learning_rate": 6.955159676438959e-06, "loss": 0.0149, "step": 92060 }, { "epoch": 2.582970963669519, "grad_norm": 0.14982521533966064, "learning_rate": 6.9504839388413525e-06, "loss": 0.022, "step": 92070 }, { "epoch": 2.5832515079253753, "grad_norm": 0.056602321565151215, "learning_rate": 6.9458082012437465e-06, "loss": 0.005, "step": 92080 }, { "epoch": 2.5835320521812317, "grad_norm": 0.04990231990814209, "learning_rate": 6.941132463646141e-06, "loss": 0.0103, "step": 92090 }, { "epoch": 2.583812596437088, "grad_norm": 0.028395792469382286, "learning_rate": 6.9364567260485345e-06, "loss": 0.0299, "step": 92100 }, { "epoch": 2.584093140692944, "grad_norm": 0.4288254380226135, "learning_rate": 6.9317809884509285e-06, "loss": 0.0465, "step": 92110 }, { "epoch": 2.5843736849488006, "grad_norm": 0.7026904821395874, "learning_rate": 6.927105250853322e-06, "loss": 0.0126, "step": 92120 }, { "epoch": 2.584654229204657, "grad_norm": 1.3573887348175049, "learning_rate": 6.9224295132557165e-06, "loss": 0.0179, "step": 92130 }, { "epoch": 2.5849347734605135, "grad_norm": 1.7432869672775269, "learning_rate": 6.9177537756581105e-06, "loss": 0.0191, "step": 92140 }, { "epoch": 2.58521531771637, "grad_norm": 0.2771148085594177, "learning_rate": 6.913078038060504e-06, "loss": 0.0375, "step": 92150 }, { "epoch": 2.585495861972226, "grad_norm": 0.6589834094047546, "learning_rate": 6.908402300462899e-06, "loss": 0.0252, "step": 92160 }, { "epoch": 2.5857764062280824, "grad_norm": 0.14448265731334686, "learning_rate": 6.9037265628652925e-06, "loss": 0.0104, "step": 92170 }, { "epoch": 2.586056950483939, "grad_norm": 0.042061127722263336, "learning_rate": 6.8990508252676865e-06, "loss": 0.0119, "step": 92180 }, { "epoch": 2.5863374947397952, "grad_norm": 0.054628886282444, "learning_rate": 6.89437508767008e-06, "loss": 0.0278, "step": 92190 }, { "epoch": 2.5866180389956517, "grad_norm": 1.6362786293029785, "learning_rate": 6.8896993500724745e-06, "loss": 0.0241, "step": 92200 }, { "epoch": 2.586898583251508, "grad_norm": 0.11659703403711319, "learning_rate": 6.8850236124748685e-06, "loss": 0.0201, "step": 92210 }, { "epoch": 2.5871791275073646, "grad_norm": 0.020350664854049683, "learning_rate": 6.880347874877262e-06, "loss": 0.0219, "step": 92220 }, { "epoch": 2.5874596717632206, "grad_norm": 0.261180579662323, "learning_rate": 6.8756721372796565e-06, "loss": 0.0034, "step": 92230 }, { "epoch": 2.587740216019077, "grad_norm": 0.12892001867294312, "learning_rate": 6.8709963996820505e-06, "loss": 0.0106, "step": 92240 }, { "epoch": 2.5880207602749334, "grad_norm": 0.4004945158958435, "learning_rate": 6.866320662084444e-06, "loss": 0.0248, "step": 92250 }, { "epoch": 2.58830130453079, "grad_norm": 0.112893246114254, "learning_rate": 6.861644924486838e-06, "loss": 0.0651, "step": 92260 }, { "epoch": 2.588581848786646, "grad_norm": 0.04118068888783455, "learning_rate": 6.8569691868892325e-06, "loss": 0.0196, "step": 92270 }, { "epoch": 2.5888623930425023, "grad_norm": 0.046650126576423645, "learning_rate": 6.852293449291626e-06, "loss": 0.0273, "step": 92280 }, { "epoch": 2.5891429372983588, "grad_norm": 0.099962517619133, "learning_rate": 6.84761771169402e-06, "loss": 0.0097, "step": 92290 }, { "epoch": 2.589423481554215, "grad_norm": 0.3030370771884918, "learning_rate": 6.8429419740964145e-06, "loss": 0.0098, "step": 92300 }, { "epoch": 2.5897040258100716, "grad_norm": 0.1964871734380722, "learning_rate": 6.8382662364988085e-06, "loss": 0.0041, "step": 92310 }, { "epoch": 2.589984570065928, "grad_norm": 0.049234312027692795, "learning_rate": 6.833590498901202e-06, "loss": 0.0418, "step": 92320 }, { "epoch": 2.5902651143217845, "grad_norm": 0.4645415246486664, "learning_rate": 6.828914761303596e-06, "loss": 0.0211, "step": 92330 }, { "epoch": 2.5905456585776405, "grad_norm": 0.6581798791885376, "learning_rate": 6.8242390237059905e-06, "loss": 0.0156, "step": 92340 }, { "epoch": 2.590826202833497, "grad_norm": 0.042681287974119186, "learning_rate": 6.819563286108384e-06, "loss": 0.0235, "step": 92350 }, { "epoch": 2.5911067470893534, "grad_norm": 0.43204987049102783, "learning_rate": 6.814887548510778e-06, "loss": 0.0315, "step": 92360 }, { "epoch": 2.59138729134521, "grad_norm": 0.022578690201044083, "learning_rate": 6.8102118109131726e-06, "loss": 0.0159, "step": 92370 }, { "epoch": 2.591667835601066, "grad_norm": 0.019702592864632607, "learning_rate": 6.805536073315566e-06, "loss": 0.011, "step": 92380 }, { "epoch": 2.5919483798569223, "grad_norm": 0.07944194227457047, "learning_rate": 6.80086033571796e-06, "loss": 0.0033, "step": 92390 }, { "epoch": 2.5922289241127787, "grad_norm": 0.3751118779182434, "learning_rate": 6.796184598120353e-06, "loss": 0.029, "step": 92400 }, { "epoch": 2.592509468368635, "grad_norm": 0.4063441753387451, "learning_rate": 6.791508860522748e-06, "loss": 0.0113, "step": 92410 }, { "epoch": 2.5927900126244916, "grad_norm": 0.02174752950668335, "learning_rate": 6.786833122925142e-06, "loss": 0.0095, "step": 92420 }, { "epoch": 2.593070556880348, "grad_norm": 0.028738021850585938, "learning_rate": 6.782157385327535e-06, "loss": 0.0152, "step": 92430 }, { "epoch": 2.5933511011362045, "grad_norm": 0.05442225933074951, "learning_rate": 6.7774816477299306e-06, "loss": 0.002, "step": 92440 }, { "epoch": 2.5936316453920605, "grad_norm": 0.05624712258577347, "learning_rate": 6.772805910132324e-06, "loss": 0.0206, "step": 92450 }, { "epoch": 2.593912189647917, "grad_norm": 0.2862336039543152, "learning_rate": 6.768130172534718e-06, "loss": 0.0082, "step": 92460 }, { "epoch": 2.5941927339037734, "grad_norm": 0.73844313621521, "learning_rate": 6.763454434937113e-06, "loss": 0.0283, "step": 92470 }, { "epoch": 2.59447327815963, "grad_norm": 0.054828446358442307, "learning_rate": 6.758778697339506e-06, "loss": 0.043, "step": 92480 }, { "epoch": 2.594753822415486, "grad_norm": 2.036386489868164, "learning_rate": 6.7541029597419e-06, "loss": 0.0243, "step": 92490 }, { "epoch": 2.5950343666713422, "grad_norm": 0.04403664916753769, "learning_rate": 6.749427222144293e-06, "loss": 0.006, "step": 92500 }, { "epoch": 2.5953149109271987, "grad_norm": 0.2957034111022949, "learning_rate": 6.744751484546688e-06, "loss": 0.0084, "step": 92510 }, { "epoch": 2.595595455183055, "grad_norm": 0.025383083149790764, "learning_rate": 6.740075746949082e-06, "loss": 0.0088, "step": 92520 }, { "epoch": 2.5958759994389116, "grad_norm": 1.1608484983444214, "learning_rate": 6.735400009351475e-06, "loss": 0.0277, "step": 92530 }, { "epoch": 2.596156543694768, "grad_norm": 0.3494510054588318, "learning_rate": 6.73072427175387e-06, "loss": 0.0084, "step": 92540 }, { "epoch": 2.5964370879506244, "grad_norm": 0.047561466693878174, "learning_rate": 6.726048534156264e-06, "loss": 0.0351, "step": 92550 }, { "epoch": 2.5967176322064804, "grad_norm": 0.04027519002556801, "learning_rate": 6.721372796558657e-06, "loss": 0.0395, "step": 92560 }, { "epoch": 2.596998176462337, "grad_norm": 0.07877081632614136, "learning_rate": 6.716697058961051e-06, "loss": 0.0022, "step": 92570 }, { "epoch": 2.5972787207181933, "grad_norm": 0.0055034528486430645, "learning_rate": 6.712021321363446e-06, "loss": 0.0026, "step": 92580 }, { "epoch": 2.5975592649740498, "grad_norm": 0.026342378929257393, "learning_rate": 6.70734558376584e-06, "loss": 0.0074, "step": 92590 }, { "epoch": 2.5978398092299058, "grad_norm": 0.03398865833878517, "learning_rate": 6.702669846168233e-06, "loss": 0.0366, "step": 92600 }, { "epoch": 2.598120353485762, "grad_norm": 0.1557305008172989, "learning_rate": 6.697994108570628e-06, "loss": 0.0274, "step": 92610 }, { "epoch": 2.5984008977416186, "grad_norm": 0.6205382347106934, "learning_rate": 6.693318370973022e-06, "loss": 0.0173, "step": 92620 }, { "epoch": 2.598681441997475, "grad_norm": 0.011713451705873013, "learning_rate": 6.688642633375415e-06, "loss": 0.0033, "step": 92630 }, { "epoch": 2.5989619862533315, "grad_norm": 0.040254589170217514, "learning_rate": 6.683966895777809e-06, "loss": 0.0172, "step": 92640 }, { "epoch": 2.599242530509188, "grad_norm": 0.18079496920108795, "learning_rate": 6.679291158180204e-06, "loss": 0.0209, "step": 92650 }, { "epoch": 2.5995230747650444, "grad_norm": 0.030427129939198494, "learning_rate": 6.674615420582597e-06, "loss": 0.0333, "step": 92660 }, { "epoch": 2.5998036190209004, "grad_norm": 0.09968544542789459, "learning_rate": 6.669939682984991e-06, "loss": 0.0148, "step": 92670 }, { "epoch": 2.600084163276757, "grad_norm": 0.25290632247924805, "learning_rate": 6.665263945387386e-06, "loss": 0.0289, "step": 92680 }, { "epoch": 2.6003647075326133, "grad_norm": 0.09680721163749695, "learning_rate": 6.66058820778978e-06, "loss": 0.0176, "step": 92690 }, { "epoch": 2.6006452517884697, "grad_norm": 0.012948950752615929, "learning_rate": 6.655912470192173e-06, "loss": 0.0268, "step": 92700 }, { "epoch": 2.6009257960443257, "grad_norm": 0.3347121775150299, "learning_rate": 6.651236732594567e-06, "loss": 0.011, "step": 92710 }, { "epoch": 2.601206340300182, "grad_norm": 0.3707331120967865, "learning_rate": 6.646560994996962e-06, "loss": 0.0162, "step": 92720 }, { "epoch": 2.6014868845560386, "grad_norm": 0.6162655353546143, "learning_rate": 6.641885257399355e-06, "loss": 0.0385, "step": 92730 }, { "epoch": 2.601767428811895, "grad_norm": 0.07337074726819992, "learning_rate": 6.637209519801749e-06, "loss": 0.0265, "step": 92740 }, { "epoch": 2.6020479730677515, "grad_norm": 0.20087552070617676, "learning_rate": 6.632533782204144e-06, "loss": 0.0252, "step": 92750 }, { "epoch": 2.602328517323608, "grad_norm": 0.7484074234962463, "learning_rate": 6.627858044606537e-06, "loss": 0.01, "step": 92760 }, { "epoch": 2.6026090615794644, "grad_norm": 0.4413428008556366, "learning_rate": 6.623182307008931e-06, "loss": 0.0155, "step": 92770 }, { "epoch": 2.6028896058353204, "grad_norm": 0.3011499047279358, "learning_rate": 6.618506569411324e-06, "loss": 0.02, "step": 92780 }, { "epoch": 2.603170150091177, "grad_norm": 0.0338798351585865, "learning_rate": 6.613830831813719e-06, "loss": 0.0103, "step": 92790 }, { "epoch": 2.6034506943470332, "grad_norm": 0.8993493914604187, "learning_rate": 6.609155094216113e-06, "loss": 0.0257, "step": 92800 }, { "epoch": 2.6037312386028897, "grad_norm": 3.7295236587524414, "learning_rate": 6.604479356618506e-06, "loss": 0.0185, "step": 92810 }, { "epoch": 2.604011782858746, "grad_norm": 0.8548086285591125, "learning_rate": 6.599803619020902e-06, "loss": 0.018, "step": 92820 }, { "epoch": 2.604292327114602, "grad_norm": 2.271604537963867, "learning_rate": 6.595127881423295e-06, "loss": 0.0402, "step": 92830 }, { "epoch": 2.6045728713704586, "grad_norm": 0.6635966897010803, "learning_rate": 6.590452143825689e-06, "loss": 0.0338, "step": 92840 }, { "epoch": 2.604853415626315, "grad_norm": 0.03419581428170204, "learning_rate": 6.585776406228082e-06, "loss": 0.0227, "step": 92850 }, { "epoch": 2.6051339598821714, "grad_norm": 0.19676168262958527, "learning_rate": 6.581100668630477e-06, "loss": 0.0207, "step": 92860 }, { "epoch": 2.605414504138028, "grad_norm": 0.03450972959399223, "learning_rate": 6.576424931032871e-06, "loss": 0.0127, "step": 92870 }, { "epoch": 2.6056950483938843, "grad_norm": 0.4542117118835449, "learning_rate": 6.571749193435264e-06, "loss": 0.0082, "step": 92880 }, { "epoch": 2.6059755926497408, "grad_norm": 0.38142937421798706, "learning_rate": 6.567073455837659e-06, "loss": 0.0122, "step": 92890 }, { "epoch": 2.6062561369055968, "grad_norm": 0.03968219831585884, "learning_rate": 6.562397718240053e-06, "loss": 0.013, "step": 92900 }, { "epoch": 2.606536681161453, "grad_norm": 0.14374566078186035, "learning_rate": 6.557721980642446e-06, "loss": 0.0061, "step": 92910 }, { "epoch": 2.6068172254173096, "grad_norm": 0.27471017837524414, "learning_rate": 6.55304624304484e-06, "loss": 0.0115, "step": 92920 }, { "epoch": 2.607097769673166, "grad_norm": 0.5538093447685242, "learning_rate": 6.548370505447235e-06, "loss": 0.0107, "step": 92930 }, { "epoch": 2.607378313929022, "grad_norm": 1.4618821144104004, "learning_rate": 6.543694767849628e-06, "loss": 0.0263, "step": 92940 }, { "epoch": 2.6076588581848785, "grad_norm": 0.03565118834376335, "learning_rate": 6.539019030252022e-06, "loss": 0.0258, "step": 92950 }, { "epoch": 2.607939402440735, "grad_norm": 0.40444573760032654, "learning_rate": 6.534343292654417e-06, "loss": 0.0151, "step": 92960 }, { "epoch": 2.6082199466965914, "grad_norm": 0.7804310917854309, "learning_rate": 6.529667555056811e-06, "loss": 0.0108, "step": 92970 }, { "epoch": 2.608500490952448, "grad_norm": 0.29721733927726746, "learning_rate": 6.524991817459204e-06, "loss": 0.0249, "step": 92980 }, { "epoch": 2.6087810352083043, "grad_norm": 0.03139522299170494, "learning_rate": 6.520316079861598e-06, "loss": 0.0289, "step": 92990 }, { "epoch": 2.6090615794641607, "grad_norm": 0.2656564712524414, "learning_rate": 6.515640342263993e-06, "loss": 0.0178, "step": 93000 }, { "epoch": 2.6093421237200167, "grad_norm": 0.21420566737651825, "learning_rate": 6.510964604666386e-06, "loss": 0.0187, "step": 93010 }, { "epoch": 2.609622667975873, "grad_norm": 0.3895813524723053, "learning_rate": 6.50628886706878e-06, "loss": 0.006, "step": 93020 }, { "epoch": 2.6099032122317296, "grad_norm": 0.042965132743120193, "learning_rate": 6.501613129471175e-06, "loss": 0.0243, "step": 93030 }, { "epoch": 2.610183756487586, "grad_norm": 0.008362570777535439, "learning_rate": 6.496937391873568e-06, "loss": 0.0089, "step": 93040 }, { "epoch": 2.610464300743442, "grad_norm": 0.283778578042984, "learning_rate": 6.492261654275962e-06, "loss": 0.0038, "step": 93050 }, { "epoch": 2.6107448449992985, "grad_norm": 0.004619672894477844, "learning_rate": 6.487585916678355e-06, "loss": 0.0322, "step": 93060 }, { "epoch": 2.611025389255155, "grad_norm": 0.9823833703994751, "learning_rate": 6.482910179080751e-06, "loss": 0.015, "step": 93070 }, { "epoch": 2.6113059335110114, "grad_norm": 0.524641752243042, "learning_rate": 6.478234441483144e-06, "loss": 0.012, "step": 93080 }, { "epoch": 2.611586477766868, "grad_norm": 0.06694815307855606, "learning_rate": 6.473558703885538e-06, "loss": 0.0107, "step": 93090 }, { "epoch": 2.6118670220227242, "grad_norm": 0.17992839217185974, "learning_rate": 6.468882966287933e-06, "loss": 0.0331, "step": 93100 }, { "epoch": 2.6121475662785807, "grad_norm": 0.03887186944484711, "learning_rate": 6.464207228690326e-06, "loss": 0.0212, "step": 93110 }, { "epoch": 2.6124281105344367, "grad_norm": 0.30840811133384705, "learning_rate": 6.45953149109272e-06, "loss": 0.0194, "step": 93120 }, { "epoch": 2.612708654790293, "grad_norm": 0.061374764889478683, "learning_rate": 6.454855753495115e-06, "loss": 0.0302, "step": 93130 }, { "epoch": 2.6129891990461496, "grad_norm": 0.03317554295063019, "learning_rate": 6.450180015897508e-06, "loss": 0.0307, "step": 93140 }, { "epoch": 2.613269743302006, "grad_norm": 1.6149544715881348, "learning_rate": 6.445504278299902e-06, "loss": 0.0313, "step": 93150 }, { "epoch": 2.613550287557862, "grad_norm": 0.014725767076015472, "learning_rate": 6.440828540702295e-06, "loss": 0.0046, "step": 93160 }, { "epoch": 2.6138308318137184, "grad_norm": 0.061113063246011734, "learning_rate": 6.43615280310469e-06, "loss": 0.0066, "step": 93170 }, { "epoch": 2.614111376069575, "grad_norm": 0.02655063010752201, "learning_rate": 6.431477065507084e-06, "loss": 0.0075, "step": 93180 }, { "epoch": 2.6143919203254313, "grad_norm": 0.20301364362239838, "learning_rate": 6.426801327909477e-06, "loss": 0.0228, "step": 93190 }, { "epoch": 2.6146724645812878, "grad_norm": 0.03759315609931946, "learning_rate": 6.422125590311873e-06, "loss": 0.0077, "step": 93200 }, { "epoch": 2.614953008837144, "grad_norm": 0.069157175719738, "learning_rate": 6.417449852714266e-06, "loss": 0.0061, "step": 93210 }, { "epoch": 2.6152335530930007, "grad_norm": 0.05284278839826584, "learning_rate": 6.41277411511666e-06, "loss": 0.0143, "step": 93220 }, { "epoch": 2.6155140973488566, "grad_norm": 0.07927066087722778, "learning_rate": 6.408098377519053e-06, "loss": 0.007, "step": 93230 }, { "epoch": 2.615794641604713, "grad_norm": 0.020628413185477257, "learning_rate": 6.403422639921448e-06, "loss": 0.0248, "step": 93240 }, { "epoch": 2.6160751858605695, "grad_norm": 0.027692126110196114, "learning_rate": 6.398746902323842e-06, "loss": 0.029, "step": 93250 }, { "epoch": 2.616355730116426, "grad_norm": 0.03247276693582535, "learning_rate": 6.394071164726235e-06, "loss": 0.0022, "step": 93260 }, { "epoch": 2.616636274372282, "grad_norm": 0.9355870485305786, "learning_rate": 6.38939542712863e-06, "loss": 0.0394, "step": 93270 }, { "epoch": 2.6169168186281384, "grad_norm": 0.22167262434959412, "learning_rate": 6.384719689531024e-06, "loss": 0.0371, "step": 93280 }, { "epoch": 2.617197362883995, "grad_norm": 0.12275718152523041, "learning_rate": 6.380043951933417e-06, "loss": 0.0214, "step": 93290 }, { "epoch": 2.6174779071398513, "grad_norm": 0.952373743057251, "learning_rate": 6.375368214335811e-06, "loss": 0.037, "step": 93300 }, { "epoch": 2.6177584513957077, "grad_norm": 0.046285081654787064, "learning_rate": 6.370692476738206e-06, "loss": 0.0033, "step": 93310 }, { "epoch": 2.618038995651564, "grad_norm": 0.058222465217113495, "learning_rate": 6.366016739140599e-06, "loss": 0.0105, "step": 93320 }, { "epoch": 2.6183195399074206, "grad_norm": 0.08121522516012192, "learning_rate": 6.361341001542993e-06, "loss": 0.0149, "step": 93330 }, { "epoch": 2.6186000841632766, "grad_norm": 0.02380973845720291, "learning_rate": 6.356665263945388e-06, "loss": 0.0157, "step": 93340 }, { "epoch": 2.618880628419133, "grad_norm": 0.0358676053583622, "learning_rate": 6.351989526347782e-06, "loss": 0.005, "step": 93350 }, { "epoch": 2.6191611726749895, "grad_norm": 2.188495635986328, "learning_rate": 6.347313788750175e-06, "loss": 0.0277, "step": 93360 }, { "epoch": 2.619441716930846, "grad_norm": 0.03799628093838692, "learning_rate": 6.342638051152569e-06, "loss": 0.0116, "step": 93370 }, { "epoch": 2.619722261186702, "grad_norm": 0.03551414608955383, "learning_rate": 6.337962313554964e-06, "loss": 0.0119, "step": 93380 }, { "epoch": 2.6200028054425584, "grad_norm": 0.2758725583553314, "learning_rate": 6.333286575957357e-06, "loss": 0.0467, "step": 93390 }, { "epoch": 2.620283349698415, "grad_norm": 0.04809044674038887, "learning_rate": 6.328610838359751e-06, "loss": 0.0043, "step": 93400 }, { "epoch": 2.6205638939542713, "grad_norm": 0.8502752184867859, "learning_rate": 6.323935100762146e-06, "loss": 0.0204, "step": 93410 }, { "epoch": 2.6208444382101277, "grad_norm": 0.01745164394378662, "learning_rate": 6.319259363164539e-06, "loss": 0.0124, "step": 93420 }, { "epoch": 2.621124982465984, "grad_norm": 0.5978236198425293, "learning_rate": 6.314583625566933e-06, "loss": 0.0269, "step": 93430 }, { "epoch": 2.6214055267218406, "grad_norm": 2.7160871028900146, "learning_rate": 6.3099078879693266e-06, "loss": 0.0639, "step": 93440 }, { "epoch": 2.6216860709776966, "grad_norm": 0.012893921695649624, "learning_rate": 6.305232150371721e-06, "loss": 0.01, "step": 93450 }, { "epoch": 2.621966615233553, "grad_norm": 0.35138335824012756, "learning_rate": 6.300556412774115e-06, "loss": 0.0355, "step": 93460 }, { "epoch": 2.6222471594894095, "grad_norm": 0.06212165579199791, "learning_rate": 6.2958806751765086e-06, "loss": 0.0177, "step": 93470 }, { "epoch": 2.622527703745266, "grad_norm": 0.05908626317977905, "learning_rate": 6.291204937578904e-06, "loss": 0.0129, "step": 93480 }, { "epoch": 2.6228082480011223, "grad_norm": 0.08548450469970703, "learning_rate": 6.286529199981297e-06, "loss": 0.033, "step": 93490 }, { "epoch": 2.6230887922569783, "grad_norm": 0.16198481619358063, "learning_rate": 6.2818534623836914e-06, "loss": 0.0249, "step": 93500 }, { "epoch": 2.6233693365128348, "grad_norm": 0.06516390293836594, "learning_rate": 6.277177724786085e-06, "loss": 0.0367, "step": 93510 }, { "epoch": 2.623649880768691, "grad_norm": 0.04047662764787674, "learning_rate": 6.2725019871884794e-06, "loss": 0.0194, "step": 93520 }, { "epoch": 2.6239304250245477, "grad_norm": 0.10144496709108353, "learning_rate": 6.2678262495908734e-06, "loss": 0.0179, "step": 93530 }, { "epoch": 2.624210969280404, "grad_norm": 0.5674777626991272, "learning_rate": 6.263150511993267e-06, "loss": 0.0185, "step": 93540 }, { "epoch": 2.6244915135362605, "grad_norm": 0.04648958519101143, "learning_rate": 6.2584747743956614e-06, "loss": 0.0068, "step": 93550 }, { "epoch": 2.624772057792117, "grad_norm": 0.18344752490520477, "learning_rate": 6.2537990367980554e-06, "loss": 0.0128, "step": 93560 }, { "epoch": 2.625052602047973, "grad_norm": 1.5745512247085571, "learning_rate": 6.249123299200449e-06, "loss": 0.0332, "step": 93570 }, { "epoch": 2.6253331463038294, "grad_norm": 0.254210501909256, "learning_rate": 6.2444475616028434e-06, "loss": 0.014, "step": 93580 }, { "epoch": 2.625613690559686, "grad_norm": 0.0036162782926112413, "learning_rate": 6.2397718240052374e-06, "loss": 0.0437, "step": 93590 }, { "epoch": 2.6258942348155423, "grad_norm": 0.10358763486146927, "learning_rate": 6.2350960864076314e-06, "loss": 0.0415, "step": 93600 }, { "epoch": 2.6261747790713983, "grad_norm": 0.5112340450286865, "learning_rate": 6.2304203488100254e-06, "loss": 0.029, "step": 93610 }, { "epoch": 2.6264553233272547, "grad_norm": 0.7039194107055664, "learning_rate": 6.225744611212419e-06, "loss": 0.0193, "step": 93620 }, { "epoch": 2.626735867583111, "grad_norm": 0.05544985085725784, "learning_rate": 6.2210688736148135e-06, "loss": 0.024, "step": 93630 }, { "epoch": 2.6270164118389676, "grad_norm": 0.6100120544433594, "learning_rate": 6.216393136017207e-06, "loss": 0.0062, "step": 93640 }, { "epoch": 2.627296956094824, "grad_norm": 0.17142590880393982, "learning_rate": 6.211717398419601e-06, "loss": 0.007, "step": 93650 }, { "epoch": 2.6275775003506805, "grad_norm": 0.04513739421963692, "learning_rate": 6.2070416608219955e-06, "loss": 0.0094, "step": 93660 }, { "epoch": 2.627858044606537, "grad_norm": 0.2252022922039032, "learning_rate": 6.202365923224389e-06, "loss": 0.0045, "step": 93670 }, { "epoch": 2.628138588862393, "grad_norm": 0.017601313069462776, "learning_rate": 6.1976901856267835e-06, "loss": 0.0215, "step": 93680 }, { "epoch": 2.6284191331182494, "grad_norm": 0.3527851998806, "learning_rate": 6.193014448029177e-06, "loss": 0.0152, "step": 93690 }, { "epoch": 2.628699677374106, "grad_norm": 0.24145445227622986, "learning_rate": 6.188338710431571e-06, "loss": 0.0149, "step": 93700 }, { "epoch": 2.6289802216299623, "grad_norm": 0.011826543137431145, "learning_rate": 6.183662972833965e-06, "loss": 0.0224, "step": 93710 }, { "epoch": 2.6292607658858183, "grad_norm": 0.029372435063123703, "learning_rate": 6.178987235236359e-06, "loss": 0.0362, "step": 93720 }, { "epoch": 2.6295413101416747, "grad_norm": 0.05840552970767021, "learning_rate": 6.1743114976387535e-06, "loss": 0.0142, "step": 93730 }, { "epoch": 2.629821854397531, "grad_norm": 0.3731819689273834, "learning_rate": 6.169635760041147e-06, "loss": 0.0142, "step": 93740 }, { "epoch": 2.6301023986533876, "grad_norm": 0.08519764244556427, "learning_rate": 6.164960022443541e-06, "loss": 0.0079, "step": 93750 }, { "epoch": 2.630382942909244, "grad_norm": 1.7832558155059814, "learning_rate": 6.160284284845935e-06, "loss": 0.0404, "step": 93760 }, { "epoch": 2.6306634871651005, "grad_norm": 0.9587083458900452, "learning_rate": 6.155608547248329e-06, "loss": 0.021, "step": 93770 }, { "epoch": 2.630944031420957, "grad_norm": 0.038258545100688934, "learning_rate": 6.150932809650723e-06, "loss": 0.0121, "step": 93780 }, { "epoch": 2.631224575676813, "grad_norm": 0.103483185172081, "learning_rate": 6.146257072053117e-06, "loss": 0.0248, "step": 93790 }, { "epoch": 2.6315051199326693, "grad_norm": 0.17355681955814362, "learning_rate": 6.141581334455511e-06, "loss": 0.012, "step": 93800 }, { "epoch": 2.6317856641885258, "grad_norm": 1.7359800338745117, "learning_rate": 6.136905596857905e-06, "loss": 0.0119, "step": 93810 }, { "epoch": 2.632066208444382, "grad_norm": 0.25137969851493835, "learning_rate": 6.132229859260299e-06, "loss": 0.0103, "step": 93820 }, { "epoch": 2.632346752700238, "grad_norm": 0.03139668330550194, "learning_rate": 6.127554121662693e-06, "loss": 0.0028, "step": 93830 }, { "epoch": 2.6326272969560947, "grad_norm": 2.4609968662261963, "learning_rate": 6.122878384065087e-06, "loss": 0.0151, "step": 93840 }, { "epoch": 2.632907841211951, "grad_norm": 0.39763614535331726, "learning_rate": 6.11820264646748e-06, "loss": 0.0042, "step": 93850 }, { "epoch": 2.6331883854678075, "grad_norm": 0.03993985056877136, "learning_rate": 6.113526908869875e-06, "loss": 0.0352, "step": 93860 }, { "epoch": 2.633468929723664, "grad_norm": 0.04265530779957771, "learning_rate": 6.108851171272269e-06, "loss": 0.0166, "step": 93870 }, { "epoch": 2.6337494739795204, "grad_norm": 9.180434226989746, "learning_rate": 6.104175433674663e-06, "loss": 0.0303, "step": 93880 }, { "epoch": 2.634030018235377, "grad_norm": 0.1129777729511261, "learning_rate": 6.099499696077057e-06, "loss": 0.0097, "step": 93890 }, { "epoch": 2.634310562491233, "grad_norm": 0.0192536823451519, "learning_rate": 6.09482395847945e-06, "loss": 0.0115, "step": 93900 }, { "epoch": 2.6345911067470893, "grad_norm": 0.015942154452204704, "learning_rate": 6.090148220881845e-06, "loss": 0.0152, "step": 93910 }, { "epoch": 2.6348716510029457, "grad_norm": 2.4929966926574707, "learning_rate": 6.085472483284239e-06, "loss": 0.0194, "step": 93920 }, { "epoch": 2.635152195258802, "grad_norm": 0.024395544081926346, "learning_rate": 6.080796745686633e-06, "loss": 0.025, "step": 93930 }, { "epoch": 2.635432739514658, "grad_norm": 0.1062832772731781, "learning_rate": 6.076121008089027e-06, "loss": 0.0144, "step": 93940 }, { "epoch": 2.6357132837705146, "grad_norm": 0.04037969559431076, "learning_rate": 6.07144527049142e-06, "loss": 0.0148, "step": 93950 }, { "epoch": 2.635993828026371, "grad_norm": 0.10298105329275131, "learning_rate": 6.066769532893815e-06, "loss": 0.0148, "step": 93960 }, { "epoch": 2.6362743722822275, "grad_norm": 3.3330490589141846, "learning_rate": 6.062093795296208e-06, "loss": 0.0291, "step": 93970 }, { "epoch": 2.636554916538084, "grad_norm": 0.07591880112886429, "learning_rate": 6.057418057698603e-06, "loss": 0.0118, "step": 93980 }, { "epoch": 2.6368354607939404, "grad_norm": 0.07687719911336899, "learning_rate": 6.052742320100997e-06, "loss": 0.0542, "step": 93990 }, { "epoch": 2.637116005049797, "grad_norm": 0.03808651864528656, "learning_rate": 6.04806658250339e-06, "loss": 0.0112, "step": 94000 }, { "epoch": 2.637396549305653, "grad_norm": 0.26164695620536804, "learning_rate": 6.043390844905785e-06, "loss": 0.0139, "step": 94010 }, { "epoch": 2.6376770935615093, "grad_norm": 0.7242552638053894, "learning_rate": 6.038715107308178e-06, "loss": 0.0067, "step": 94020 }, { "epoch": 2.6379576378173657, "grad_norm": 0.012748421169817448, "learning_rate": 6.034039369710572e-06, "loss": 0.0167, "step": 94030 }, { "epoch": 2.638238182073222, "grad_norm": 0.04396756365895271, "learning_rate": 6.029363632112966e-06, "loss": 0.0287, "step": 94040 }, { "epoch": 2.6385187263290786, "grad_norm": 0.028823906555771828, "learning_rate": 6.02468789451536e-06, "loss": 0.0042, "step": 94050 }, { "epoch": 2.6387992705849346, "grad_norm": 0.6680933237075806, "learning_rate": 6.020012156917755e-06, "loss": 0.0431, "step": 94060 }, { "epoch": 2.639079814840791, "grad_norm": 0.09847007691860199, "learning_rate": 6.015336419320148e-06, "loss": 0.0061, "step": 94070 }, { "epoch": 2.6393603590966475, "grad_norm": 0.03454943001270294, "learning_rate": 6.010660681722542e-06, "loss": 0.0074, "step": 94080 }, { "epoch": 2.639640903352504, "grad_norm": 0.05600763112306595, "learning_rate": 6.005984944124936e-06, "loss": 0.0129, "step": 94090 }, { "epoch": 2.6399214476083603, "grad_norm": 1.2661117315292358, "learning_rate": 6.00130920652733e-06, "loss": 0.0225, "step": 94100 }, { "epoch": 2.640201991864217, "grad_norm": 0.03252699226140976, "learning_rate": 5.996633468929724e-06, "loss": 0.031, "step": 94110 }, { "epoch": 2.6404825361200728, "grad_norm": 0.017742076888680458, "learning_rate": 5.991957731332118e-06, "loss": 0.0157, "step": 94120 }, { "epoch": 2.640763080375929, "grad_norm": 0.12932422757148743, "learning_rate": 5.987281993734512e-06, "loss": 0.0233, "step": 94130 }, { "epoch": 2.6410436246317857, "grad_norm": 0.38040778040885925, "learning_rate": 5.982606256136906e-06, "loss": 0.0118, "step": 94140 }, { "epoch": 2.641324168887642, "grad_norm": 0.07154135406017303, "learning_rate": 5.9779305185393e-06, "loss": 0.0203, "step": 94150 }, { "epoch": 2.6416047131434985, "grad_norm": 0.02921401336789131, "learning_rate": 5.973254780941694e-06, "loss": 0.0118, "step": 94160 }, { "epoch": 2.6418852573993545, "grad_norm": 0.317409485578537, "learning_rate": 5.968579043344088e-06, "loss": 0.0172, "step": 94170 }, { "epoch": 2.642165801655211, "grad_norm": 0.17114491760730743, "learning_rate": 5.963903305746481e-06, "loss": 0.0303, "step": 94180 }, { "epoch": 2.6424463459110674, "grad_norm": 1.1616231203079224, "learning_rate": 5.959227568148876e-06, "loss": 0.0226, "step": 94190 }, { "epoch": 2.642726890166924, "grad_norm": 0.06368871033191681, "learning_rate": 5.95455183055127e-06, "loss": 0.0107, "step": 94200 }, { "epoch": 2.6430074344227803, "grad_norm": 0.05033921077847481, "learning_rate": 5.949876092953664e-06, "loss": 0.0323, "step": 94210 }, { "epoch": 2.6432879786786367, "grad_norm": 0.08448673039674759, "learning_rate": 5.945200355356058e-06, "loss": 0.0021, "step": 94220 }, { "epoch": 2.643568522934493, "grad_norm": 1.244439959526062, "learning_rate": 5.940524617758451e-06, "loss": 0.0382, "step": 94230 }, { "epoch": 2.643849067190349, "grad_norm": 0.06314977258443832, "learning_rate": 5.935848880160846e-06, "loss": 0.0077, "step": 94240 }, { "epoch": 2.6441296114462056, "grad_norm": 0.027601156383752823, "learning_rate": 5.93117314256324e-06, "loss": 0.0224, "step": 94250 }, { "epoch": 2.644410155702062, "grad_norm": 0.7346709966659546, "learning_rate": 5.926497404965634e-06, "loss": 0.009, "step": 94260 }, { "epoch": 2.6446906999579185, "grad_norm": 1.0180472135543823, "learning_rate": 5.921821667368028e-06, "loss": 0.0245, "step": 94270 }, { "epoch": 2.6449712442137745, "grad_norm": 0.13873441517353058, "learning_rate": 5.917145929770421e-06, "loss": 0.0415, "step": 94280 }, { "epoch": 2.645251788469631, "grad_norm": 0.06633155792951584, "learning_rate": 5.912470192172816e-06, "loss": 0.0171, "step": 94290 }, { "epoch": 2.6455323327254874, "grad_norm": 0.1636257916688919, "learning_rate": 5.907794454575209e-06, "loss": 0.0126, "step": 94300 }, { "epoch": 2.645812876981344, "grad_norm": 0.14030243456363678, "learning_rate": 5.903118716977604e-06, "loss": 0.0103, "step": 94310 }, { "epoch": 2.6460934212372003, "grad_norm": 0.1144748330116272, "learning_rate": 5.898442979379998e-06, "loss": 0.0113, "step": 94320 }, { "epoch": 2.6463739654930567, "grad_norm": 0.05041157826781273, "learning_rate": 5.893767241782391e-06, "loss": 0.0141, "step": 94330 }, { "epoch": 2.646654509748913, "grad_norm": 0.3371381461620331, "learning_rate": 5.889091504184786e-06, "loss": 0.0192, "step": 94340 }, { "epoch": 2.646935054004769, "grad_norm": 1.9816093444824219, "learning_rate": 5.884415766587179e-06, "loss": 0.0239, "step": 94350 }, { "epoch": 2.6472155982606256, "grad_norm": 0.022449204698204994, "learning_rate": 5.879740028989573e-06, "loss": 0.0056, "step": 94360 }, { "epoch": 2.647496142516482, "grad_norm": 0.3991550803184509, "learning_rate": 5.875064291391967e-06, "loss": 0.0277, "step": 94370 }, { "epoch": 2.6477766867723385, "grad_norm": 0.011739492416381836, "learning_rate": 5.870388553794361e-06, "loss": 0.0033, "step": 94380 }, { "epoch": 2.6480572310281945, "grad_norm": 0.013250928372144699, "learning_rate": 5.865712816196756e-06, "loss": 0.0066, "step": 94390 }, { "epoch": 2.648337775284051, "grad_norm": 0.08292299509048462, "learning_rate": 5.861037078599149e-06, "loss": 0.0241, "step": 94400 }, { "epoch": 2.6486183195399073, "grad_norm": 0.7939534187316895, "learning_rate": 5.856361341001543e-06, "loss": 0.032, "step": 94410 }, { "epoch": 2.648898863795764, "grad_norm": 0.5469419956207275, "learning_rate": 5.851685603403937e-06, "loss": 0.0366, "step": 94420 }, { "epoch": 2.64917940805162, "grad_norm": 0.029058441519737244, "learning_rate": 5.847009865806331e-06, "loss": 0.004, "step": 94430 }, { "epoch": 2.6494599523074767, "grad_norm": 0.09791234880685806, "learning_rate": 5.842334128208725e-06, "loss": 0.0372, "step": 94440 }, { "epoch": 2.649740496563333, "grad_norm": 0.02864735759794712, "learning_rate": 5.837658390611119e-06, "loss": 0.0062, "step": 94450 }, { "epoch": 2.650021040819189, "grad_norm": 0.05195807293057442, "learning_rate": 5.832982653013513e-06, "loss": 0.0041, "step": 94460 }, { "epoch": 2.6503015850750455, "grad_norm": 0.5883620381355286, "learning_rate": 5.828306915415907e-06, "loss": 0.0056, "step": 94470 }, { "epoch": 2.650582129330902, "grad_norm": 0.27168717980384827, "learning_rate": 5.823631177818301e-06, "loss": 0.0168, "step": 94480 }, { "epoch": 2.6508626735867584, "grad_norm": 0.10194215178489685, "learning_rate": 5.818955440220695e-06, "loss": 0.013, "step": 94490 }, { "epoch": 2.6511432178426144, "grad_norm": 0.08329019695520401, "learning_rate": 5.814279702623089e-06, "loss": 0.0031, "step": 94500 }, { "epoch": 2.651423762098471, "grad_norm": 0.21109184622764587, "learning_rate": 5.809603965025483e-06, "loss": 0.0062, "step": 94510 }, { "epoch": 2.6517043063543273, "grad_norm": 0.041931651532649994, "learning_rate": 5.804928227427877e-06, "loss": 0.01, "step": 94520 }, { "epoch": 2.6519848506101837, "grad_norm": 0.036720871925354004, "learning_rate": 5.800252489830271e-06, "loss": 0.0091, "step": 94530 }, { "epoch": 2.65226539486604, "grad_norm": 0.5068579912185669, "learning_rate": 5.795576752232665e-06, "loss": 0.0393, "step": 94540 }, { "epoch": 2.6525459391218966, "grad_norm": 0.02136615663766861, "learning_rate": 5.790901014635059e-06, "loss": 0.0094, "step": 94550 }, { "epoch": 2.652826483377753, "grad_norm": 0.035920798778533936, "learning_rate": 5.786225277037452e-06, "loss": 0.03, "step": 94560 }, { "epoch": 2.653107027633609, "grad_norm": 0.012333646416664124, "learning_rate": 5.781549539439847e-06, "loss": 0.038, "step": 94570 }, { "epoch": 2.6533875718894655, "grad_norm": 0.27167221903800964, "learning_rate": 5.776873801842241e-06, "loss": 0.0053, "step": 94580 }, { "epoch": 2.653668116145322, "grad_norm": 2.001936197280884, "learning_rate": 5.772198064244635e-06, "loss": 0.0126, "step": 94590 }, { "epoch": 2.6539486604011784, "grad_norm": 0.11493530124425888, "learning_rate": 5.767522326647029e-06, "loss": 0.0104, "step": 94600 }, { "epoch": 2.6542292046570344, "grad_norm": 1.2207458019256592, "learning_rate": 5.762846589049422e-06, "loss": 0.032, "step": 94610 }, { "epoch": 2.654509748912891, "grad_norm": 0.023453354835510254, "learning_rate": 5.758170851451817e-06, "loss": 0.0206, "step": 94620 }, { "epoch": 2.6547902931687473, "grad_norm": 0.040538687258958817, "learning_rate": 5.75349511385421e-06, "loss": 0.0084, "step": 94630 }, { "epoch": 2.6550708374246037, "grad_norm": 0.03621288388967514, "learning_rate": 5.748819376256605e-06, "loss": 0.0218, "step": 94640 }, { "epoch": 2.65535138168046, "grad_norm": 0.3688080608844757, "learning_rate": 5.744143638658999e-06, "loss": 0.0391, "step": 94650 }, { "epoch": 2.6556319259363166, "grad_norm": 0.024187341332435608, "learning_rate": 5.739467901061392e-06, "loss": 0.0182, "step": 94660 }, { "epoch": 2.655912470192173, "grad_norm": 0.4191220998764038, "learning_rate": 5.734792163463787e-06, "loss": 0.0346, "step": 94670 }, { "epoch": 2.656193014448029, "grad_norm": 0.7702041268348694, "learning_rate": 5.73011642586618e-06, "loss": 0.0126, "step": 94680 }, { "epoch": 2.6564735587038855, "grad_norm": 1.0039496421813965, "learning_rate": 5.725440688268575e-06, "loss": 0.0335, "step": 94690 }, { "epoch": 2.656754102959742, "grad_norm": 0.17885608971118927, "learning_rate": 5.720764950670968e-06, "loss": 0.0364, "step": 94700 }, { "epoch": 2.6570346472155983, "grad_norm": 0.040894314646720886, "learning_rate": 5.716089213073362e-06, "loss": 0.0204, "step": 94710 }, { "epoch": 2.657315191471455, "grad_norm": 0.010724330320954323, "learning_rate": 5.711413475475757e-06, "loss": 0.0391, "step": 94720 }, { "epoch": 2.657595735727311, "grad_norm": 0.021184219047427177, "learning_rate": 5.70673773787815e-06, "loss": 0.0335, "step": 94730 }, { "epoch": 2.6578762799831672, "grad_norm": 0.03806246817111969, "learning_rate": 5.702062000280544e-06, "loss": 0.0235, "step": 94740 }, { "epoch": 2.6581568242390237, "grad_norm": 0.03813634067773819, "learning_rate": 5.697386262682938e-06, "loss": 0.0094, "step": 94750 }, { "epoch": 2.65843736849488, "grad_norm": 0.01904417760670185, "learning_rate": 5.692710525085332e-06, "loss": 0.0041, "step": 94760 }, { "epoch": 2.6587179127507365, "grad_norm": 0.04483436048030853, "learning_rate": 5.688034787487726e-06, "loss": 0.0116, "step": 94770 }, { "epoch": 2.658998457006593, "grad_norm": 0.5985316038131714, "learning_rate": 5.68335904989012e-06, "loss": 0.0334, "step": 94780 }, { "epoch": 2.6592790012624494, "grad_norm": 0.06432241201400757, "learning_rate": 5.678683312292514e-06, "loss": 0.0145, "step": 94790 }, { "epoch": 2.6595595455183054, "grad_norm": 0.009192799217998981, "learning_rate": 5.674007574694908e-06, "loss": 0.0158, "step": 94800 }, { "epoch": 2.659840089774162, "grad_norm": 0.23797552287578583, "learning_rate": 5.669331837097302e-06, "loss": 0.01, "step": 94810 }, { "epoch": 2.6601206340300183, "grad_norm": 0.8020152449607849, "learning_rate": 5.664656099499696e-06, "loss": 0.0365, "step": 94820 }, { "epoch": 2.6604011782858747, "grad_norm": 0.2840784788131714, "learning_rate": 5.65998036190209e-06, "loss": 0.013, "step": 94830 }, { "epoch": 2.6606817225417307, "grad_norm": 29.907419204711914, "learning_rate": 5.655304624304484e-06, "loss": 0.0424, "step": 94840 }, { "epoch": 2.660962266797587, "grad_norm": 0.03070438653230667, "learning_rate": 5.650628886706878e-06, "loss": 0.0156, "step": 94850 }, { "epoch": 2.6612428110534436, "grad_norm": 0.3631376326084137, "learning_rate": 5.645953149109272e-06, "loss": 0.0146, "step": 94860 }, { "epoch": 2.6615233553093, "grad_norm": 0.12323234975337982, "learning_rate": 5.641277411511666e-06, "loss": 0.0069, "step": 94870 }, { "epoch": 2.6618038995651565, "grad_norm": 0.04028039425611496, "learning_rate": 5.63660167391406e-06, "loss": 0.0076, "step": 94880 }, { "epoch": 2.662084443821013, "grad_norm": 0.07887875288724899, "learning_rate": 5.631925936316454e-06, "loss": 0.0236, "step": 94890 }, { "epoch": 2.6623649880768694, "grad_norm": 0.2959318459033966, "learning_rate": 5.627250198718848e-06, "loss": 0.0078, "step": 94900 }, { "epoch": 2.6626455323327254, "grad_norm": 0.02161570079624653, "learning_rate": 5.622574461121242e-06, "loss": 0.0023, "step": 94910 }, { "epoch": 2.662926076588582, "grad_norm": 0.21276050806045532, "learning_rate": 5.617898723523636e-06, "loss": 0.0069, "step": 94920 }, { "epoch": 2.6632066208444383, "grad_norm": 0.059329621493816376, "learning_rate": 5.61322298592603e-06, "loss": 0.0065, "step": 94930 }, { "epoch": 2.6634871651002947, "grad_norm": 0.10545063018798828, "learning_rate": 5.6085472483284235e-06, "loss": 0.0439, "step": 94940 }, { "epoch": 2.6637677093561507, "grad_norm": 0.021208738908171654, "learning_rate": 5.603871510730818e-06, "loss": 0.0219, "step": 94950 }, { "epoch": 2.664048253612007, "grad_norm": 0.9301719069480896, "learning_rate": 5.5991957731332115e-06, "loss": 0.0421, "step": 94960 }, { "epoch": 2.6643287978678636, "grad_norm": 0.5467734932899475, "learning_rate": 5.594520035535606e-06, "loss": 0.0452, "step": 94970 }, { "epoch": 2.66460934212372, "grad_norm": 0.7983806729316711, "learning_rate": 5.589844297938e-06, "loss": 0.011, "step": 94980 }, { "epoch": 2.6648898863795765, "grad_norm": 0.43837451934814453, "learning_rate": 5.5851685603403935e-06, "loss": 0.0115, "step": 94990 }, { "epoch": 2.665170430635433, "grad_norm": 0.06129346042871475, "learning_rate": 5.580492822742788e-06, "loss": 0.0095, "step": 95000 }, { "epoch": 2.6654509748912893, "grad_norm": 0.0682043731212616, "learning_rate": 5.5758170851451815e-06, "loss": 0.015, "step": 95010 }, { "epoch": 2.6657315191471453, "grad_norm": 0.2932196259498596, "learning_rate": 5.571141347547576e-06, "loss": 0.0174, "step": 95020 }, { "epoch": 2.666012063403002, "grad_norm": 0.10038435459136963, "learning_rate": 5.5664656099499695e-06, "loss": 0.0209, "step": 95030 }, { "epoch": 2.6662926076588582, "grad_norm": 0.5265293121337891, "learning_rate": 5.5617898723523635e-06, "loss": 0.0303, "step": 95040 }, { "epoch": 2.6665731519147147, "grad_norm": 0.38886138796806335, "learning_rate": 5.557114134754758e-06, "loss": 0.0133, "step": 95050 }, { "epoch": 2.6668536961705707, "grad_norm": 0.4622909724712372, "learning_rate": 5.5524383971571515e-06, "loss": 0.0252, "step": 95060 }, { "epoch": 2.667134240426427, "grad_norm": 0.25921815633773804, "learning_rate": 5.5477626595595455e-06, "loss": 0.0032, "step": 95070 }, { "epoch": 2.6674147846822835, "grad_norm": 0.6024050116539001, "learning_rate": 5.5430869219619395e-06, "loss": 0.012, "step": 95080 }, { "epoch": 2.66769532893814, "grad_norm": 1.1931805610656738, "learning_rate": 5.5384111843643335e-06, "loss": 0.047, "step": 95090 }, { "epoch": 2.6679758731939964, "grad_norm": 0.48534801602363586, "learning_rate": 5.5337354467667275e-06, "loss": 0.0066, "step": 95100 }, { "epoch": 2.668256417449853, "grad_norm": 0.06333350390195847, "learning_rate": 5.5290597091691216e-06, "loss": 0.0415, "step": 95110 }, { "epoch": 2.6685369617057093, "grad_norm": 0.23697492480278015, "learning_rate": 5.5243839715715156e-06, "loss": 0.0088, "step": 95120 }, { "epoch": 2.6688175059615653, "grad_norm": 0.024309180676937103, "learning_rate": 5.5197082339739096e-06, "loss": 0.0179, "step": 95130 }, { "epoch": 2.6690980502174217, "grad_norm": 0.02920120395720005, "learning_rate": 5.5150324963763036e-06, "loss": 0.0396, "step": 95140 }, { "epoch": 2.669378594473278, "grad_norm": 0.08250229805707932, "learning_rate": 5.5103567587786976e-06, "loss": 0.0137, "step": 95150 }, { "epoch": 2.6696591387291346, "grad_norm": 0.03183743357658386, "learning_rate": 5.5056810211810916e-06, "loss": 0.0047, "step": 95160 }, { "epoch": 2.6699396829849906, "grad_norm": 0.04446398839354515, "learning_rate": 5.5010052835834856e-06, "loss": 0.0269, "step": 95170 }, { "epoch": 2.670220227240847, "grad_norm": 0.2548999786376953, "learning_rate": 5.4963295459858796e-06, "loss": 0.0027, "step": 95180 }, { "epoch": 2.6705007714967035, "grad_norm": 0.7198638916015625, "learning_rate": 5.4916538083882736e-06, "loss": 0.0437, "step": 95190 }, { "epoch": 2.67078131575256, "grad_norm": 0.6939931511878967, "learning_rate": 5.4869780707906676e-06, "loss": 0.013, "step": 95200 }, { "epoch": 2.6710618600084164, "grad_norm": 0.0346861332654953, "learning_rate": 5.4823023331930616e-06, "loss": 0.0123, "step": 95210 }, { "epoch": 2.671342404264273, "grad_norm": 0.13218152523040771, "learning_rate": 5.477626595595456e-06, "loss": 0.0366, "step": 95220 }, { "epoch": 2.6716229485201293, "grad_norm": 0.11417555063962936, "learning_rate": 5.47295085799785e-06, "loss": 0.0053, "step": 95230 }, { "epoch": 2.6719034927759853, "grad_norm": 0.042047467082738876, "learning_rate": 5.468275120400244e-06, "loss": 0.0119, "step": 95240 }, { "epoch": 2.6721840370318417, "grad_norm": 0.0436420775949955, "learning_rate": 5.463599382802638e-06, "loss": 0.0096, "step": 95250 }, { "epoch": 2.672464581287698, "grad_norm": 0.07409342378377914, "learning_rate": 5.458923645205032e-06, "loss": 0.0233, "step": 95260 }, { "epoch": 2.6727451255435546, "grad_norm": 0.0446830689907074, "learning_rate": 5.454247907607425e-06, "loss": 0.0222, "step": 95270 }, { "epoch": 2.6730256697994106, "grad_norm": 0.021277491003274918, "learning_rate": 5.44957217000982e-06, "loss": 0.0164, "step": 95280 }, { "epoch": 2.673306214055267, "grad_norm": 1.416226863861084, "learning_rate": 5.444896432412213e-06, "loss": 0.0139, "step": 95290 }, { "epoch": 2.6735867583111235, "grad_norm": 0.05907720699906349, "learning_rate": 5.440220694814608e-06, "loss": 0.0387, "step": 95300 }, { "epoch": 2.67386730256698, "grad_norm": 0.08186223357915878, "learning_rate": 5.435544957217002e-06, "loss": 0.0111, "step": 95310 }, { "epoch": 2.6741478468228363, "grad_norm": 0.3308317959308624, "learning_rate": 5.430869219619395e-06, "loss": 0.0113, "step": 95320 }, { "epoch": 2.674428391078693, "grad_norm": 0.019197124987840652, "learning_rate": 5.42619348202179e-06, "loss": 0.0209, "step": 95330 }, { "epoch": 2.6747089353345492, "grad_norm": 0.05263557657599449, "learning_rate": 5.421517744424183e-06, "loss": 0.0332, "step": 95340 }, { "epoch": 2.6749894795904052, "grad_norm": 0.05493134632706642, "learning_rate": 5.416842006826578e-06, "loss": 0.007, "step": 95350 }, { "epoch": 2.6752700238462617, "grad_norm": 0.07367304712533951, "learning_rate": 5.412166269228971e-06, "loss": 0.0073, "step": 95360 }, { "epoch": 2.675550568102118, "grad_norm": 0.04882495850324631, "learning_rate": 5.407490531631365e-06, "loss": 0.0057, "step": 95370 }, { "epoch": 2.6758311123579746, "grad_norm": 0.24379558861255646, "learning_rate": 5.40281479403376e-06, "loss": 0.0198, "step": 95380 }, { "epoch": 2.676111656613831, "grad_norm": 0.32002565264701843, "learning_rate": 5.398139056436153e-06, "loss": 0.0046, "step": 95390 }, { "epoch": 2.676392200869687, "grad_norm": 0.014101327396929264, "learning_rate": 5.393463318838548e-06, "loss": 0.0029, "step": 95400 }, { "epoch": 2.6766727451255434, "grad_norm": 0.05295850709080696, "learning_rate": 5.388787581240941e-06, "loss": 0.0102, "step": 95410 }, { "epoch": 2.6769532893814, "grad_norm": 0.029863713309168816, "learning_rate": 5.384111843643335e-06, "loss": 0.0367, "step": 95420 }, { "epoch": 2.6772338336372563, "grad_norm": 0.32757672667503357, "learning_rate": 5.379436106045729e-06, "loss": 0.0232, "step": 95430 }, { "epoch": 2.6775143778931128, "grad_norm": 0.11546316742897034, "learning_rate": 5.374760368448123e-06, "loss": 0.0333, "step": 95440 }, { "epoch": 2.677794922148969, "grad_norm": 0.4147520661354065, "learning_rate": 5.370084630850517e-06, "loss": 0.0173, "step": 95450 }, { "epoch": 2.6780754664048256, "grad_norm": 0.12565863132476807, "learning_rate": 5.365408893252911e-06, "loss": 0.0337, "step": 95460 }, { "epoch": 2.6783560106606816, "grad_norm": 0.040481943637132645, "learning_rate": 5.360733155655305e-06, "loss": 0.013, "step": 95470 }, { "epoch": 2.678636554916538, "grad_norm": 0.30013880133628845, "learning_rate": 5.356057418057699e-06, "loss": 0.0205, "step": 95480 }, { "epoch": 2.6789170991723945, "grad_norm": 0.211915522813797, "learning_rate": 5.351381680460093e-06, "loss": 0.0204, "step": 95490 }, { "epoch": 2.679197643428251, "grad_norm": 0.1670711785554886, "learning_rate": 5.346705942862487e-06, "loss": 0.0103, "step": 95500 }, { "epoch": 2.679478187684107, "grad_norm": 0.014661166816949844, "learning_rate": 5.342030205264881e-06, "loss": 0.0165, "step": 95510 }, { "epoch": 2.6797587319399634, "grad_norm": 0.03953642025589943, "learning_rate": 5.337354467667275e-06, "loss": 0.0252, "step": 95520 }, { "epoch": 2.68003927619582, "grad_norm": 0.028398629277944565, "learning_rate": 5.332678730069669e-06, "loss": 0.0509, "step": 95530 }, { "epoch": 2.6803198204516763, "grad_norm": 0.1480003148317337, "learning_rate": 5.328002992472063e-06, "loss": 0.0144, "step": 95540 }, { "epoch": 2.6806003647075327, "grad_norm": 0.06985750794410706, "learning_rate": 5.323327254874457e-06, "loss": 0.0346, "step": 95550 }, { "epoch": 2.680880908963389, "grad_norm": 0.4662548005580902, "learning_rate": 5.318651517276851e-06, "loss": 0.0244, "step": 95560 }, { "epoch": 2.6811614532192456, "grad_norm": 0.03812149539589882, "learning_rate": 5.313975779679245e-06, "loss": 0.0283, "step": 95570 }, { "epoch": 2.6814419974751016, "grad_norm": 0.013611434027552605, "learning_rate": 5.309300042081639e-06, "loss": 0.0186, "step": 95580 }, { "epoch": 2.681722541730958, "grad_norm": 0.04604942351579666, "learning_rate": 5.304624304484033e-06, "loss": 0.0068, "step": 95590 }, { "epoch": 2.6820030859868145, "grad_norm": 0.17754101753234863, "learning_rate": 5.299948566886427e-06, "loss": 0.0092, "step": 95600 }, { "epoch": 2.682283630242671, "grad_norm": 0.017690766602754593, "learning_rate": 5.295272829288821e-06, "loss": 0.0187, "step": 95610 }, { "epoch": 2.682564174498527, "grad_norm": 0.01934981159865856, "learning_rate": 5.290597091691214e-06, "loss": 0.0156, "step": 95620 }, { "epoch": 2.6828447187543834, "grad_norm": 0.030130306258797646, "learning_rate": 5.285921354093609e-06, "loss": 0.0097, "step": 95630 }, { "epoch": 2.68312526301024, "grad_norm": 0.08133537322282791, "learning_rate": 5.281245616496003e-06, "loss": 0.0081, "step": 95640 }, { "epoch": 2.6834058072660962, "grad_norm": 0.34528496861457825, "learning_rate": 5.276569878898396e-06, "loss": 0.0118, "step": 95650 }, { "epoch": 2.6836863515219527, "grad_norm": 0.05467841774225235, "learning_rate": 5.271894141300791e-06, "loss": 0.0143, "step": 95660 }, { "epoch": 2.683966895777809, "grad_norm": 0.014675184153020382, "learning_rate": 5.267218403703184e-06, "loss": 0.0162, "step": 95670 }, { "epoch": 2.6842474400336656, "grad_norm": 0.020231906324625015, "learning_rate": 5.262542666105579e-06, "loss": 0.0046, "step": 95680 }, { "epoch": 2.6845279842895216, "grad_norm": 0.02655744180083275, "learning_rate": 5.257866928507972e-06, "loss": 0.0155, "step": 95690 }, { "epoch": 2.684808528545378, "grad_norm": 0.9748234748840332, "learning_rate": 5.253191190910366e-06, "loss": 0.0264, "step": 95700 }, { "epoch": 2.6850890728012344, "grad_norm": 0.13208572566509247, "learning_rate": 5.248515453312761e-06, "loss": 0.0168, "step": 95710 }, { "epoch": 2.685369617057091, "grad_norm": 0.029207894578576088, "learning_rate": 5.243839715715154e-06, "loss": 0.0184, "step": 95720 }, { "epoch": 2.685650161312947, "grad_norm": 0.05800323933362961, "learning_rate": 5.239163978117549e-06, "loss": 0.0138, "step": 95730 }, { "epoch": 2.6859307055688033, "grad_norm": 0.09734246134757996, "learning_rate": 5.234488240519942e-06, "loss": 0.0237, "step": 95740 }, { "epoch": 2.6862112498246598, "grad_norm": 4.506526470184326, "learning_rate": 5.229812502922336e-06, "loss": 0.0094, "step": 95750 }, { "epoch": 2.686491794080516, "grad_norm": 0.3165908753871918, "learning_rate": 5.22513676532473e-06, "loss": 0.0186, "step": 95760 }, { "epoch": 2.6867723383363726, "grad_norm": 0.02463820017874241, "learning_rate": 5.220461027727124e-06, "loss": 0.0119, "step": 95770 }, { "epoch": 2.687052882592229, "grad_norm": 0.9365989565849304, "learning_rate": 5.215785290129519e-06, "loss": 0.017, "step": 95780 }, { "epoch": 2.6873334268480855, "grad_norm": 0.5715805888175964, "learning_rate": 5.211109552531912e-06, "loss": 0.0139, "step": 95790 }, { "epoch": 2.6876139711039415, "grad_norm": 0.48876458406448364, "learning_rate": 5.206433814934306e-06, "loss": 0.0092, "step": 95800 }, { "epoch": 2.687894515359798, "grad_norm": 0.009520095773041248, "learning_rate": 5.2017580773367e-06, "loss": 0.0118, "step": 95810 }, { "epoch": 2.6881750596156544, "grad_norm": 1.884472370147705, "learning_rate": 5.197082339739094e-06, "loss": 0.0329, "step": 95820 }, { "epoch": 2.688455603871511, "grad_norm": 0.41484230756759644, "learning_rate": 5.192406602141488e-06, "loss": 0.0131, "step": 95830 }, { "epoch": 2.688736148127367, "grad_norm": 0.13384133577346802, "learning_rate": 5.187730864543882e-06, "loss": 0.0359, "step": 95840 }, { "epoch": 2.6890166923832233, "grad_norm": 0.39461156725883484, "learning_rate": 5.183055126946276e-06, "loss": 0.0486, "step": 95850 }, { "epoch": 2.6892972366390797, "grad_norm": 0.04835928976535797, "learning_rate": 5.17837938934867e-06, "loss": 0.0069, "step": 95860 }, { "epoch": 2.689577780894936, "grad_norm": 0.05275988206267357, "learning_rate": 5.173703651751064e-06, "loss": 0.0306, "step": 95870 }, { "epoch": 2.6898583251507926, "grad_norm": 0.1275354027748108, "learning_rate": 5.169027914153458e-06, "loss": 0.0294, "step": 95880 }, { "epoch": 2.690138869406649, "grad_norm": 0.059843018651008606, "learning_rate": 5.164352176555852e-06, "loss": 0.0357, "step": 95890 }, { "epoch": 2.6904194136625055, "grad_norm": 0.05200657621026039, "learning_rate": 5.159676438958246e-06, "loss": 0.0038, "step": 95900 }, { "epoch": 2.6906999579183615, "grad_norm": 0.023155955597758293, "learning_rate": 5.15500070136064e-06, "loss": 0.0167, "step": 95910 }, { "epoch": 2.690980502174218, "grad_norm": 0.012541609816253185, "learning_rate": 5.150324963763034e-06, "loss": 0.0079, "step": 95920 }, { "epoch": 2.6912610464300744, "grad_norm": 0.05078069120645523, "learning_rate": 5.145649226165428e-06, "loss": 0.0232, "step": 95930 }, { "epoch": 2.691541590685931, "grad_norm": 0.18340876698493958, "learning_rate": 5.140973488567822e-06, "loss": 0.0307, "step": 95940 }, { "epoch": 2.691822134941787, "grad_norm": 0.39083072543144226, "learning_rate": 5.136297750970215e-06, "loss": 0.0062, "step": 95950 }, { "epoch": 2.6921026791976432, "grad_norm": 0.15031041204929352, "learning_rate": 5.13162201337261e-06, "loss": 0.0194, "step": 95960 }, { "epoch": 2.6923832234534997, "grad_norm": 0.02250741235911846, "learning_rate": 5.126946275775004e-06, "loss": 0.0032, "step": 95970 }, { "epoch": 2.692663767709356, "grad_norm": 0.020318390801548958, "learning_rate": 5.122270538177398e-06, "loss": 0.0081, "step": 95980 }, { "epoch": 2.6929443119652126, "grad_norm": 0.11771787703037262, "learning_rate": 5.117594800579792e-06, "loss": 0.0061, "step": 95990 }, { "epoch": 2.693224856221069, "grad_norm": 0.13000687956809998, "learning_rate": 5.112919062982185e-06, "loss": 0.034, "step": 96000 }, { "epoch": 2.6935054004769254, "grad_norm": 0.1622113585472107, "learning_rate": 5.10824332538458e-06, "loss": 0.0185, "step": 96010 }, { "epoch": 2.6937859447327814, "grad_norm": 0.06524700671434402, "learning_rate": 5.103567587786973e-06, "loss": 0.0164, "step": 96020 }, { "epoch": 2.694066488988638, "grad_norm": 0.108612559735775, "learning_rate": 5.098891850189367e-06, "loss": 0.054, "step": 96030 }, { "epoch": 2.6943470332444943, "grad_norm": 0.037708766758441925, "learning_rate": 5.094216112591762e-06, "loss": 0.0077, "step": 96040 }, { "epoch": 2.6946275775003508, "grad_norm": 3.071929693222046, "learning_rate": 5.089540374994155e-06, "loss": 0.0184, "step": 96050 }, { "epoch": 2.694908121756207, "grad_norm": 0.926609992980957, "learning_rate": 5.08486463739655e-06, "loss": 0.0318, "step": 96060 }, { "epoch": 2.695188666012063, "grad_norm": 0.0753755047917366, "learning_rate": 5.080188899798943e-06, "loss": 0.0207, "step": 96070 }, { "epoch": 2.6954692102679196, "grad_norm": 0.19814170897006989, "learning_rate": 5.075513162201337e-06, "loss": 0.0379, "step": 96080 }, { "epoch": 2.695749754523776, "grad_norm": 0.14339686930179596, "learning_rate": 5.070837424603731e-06, "loss": 0.0246, "step": 96090 }, { "epoch": 2.6960302987796325, "grad_norm": 0.32115986943244934, "learning_rate": 5.066161687006125e-06, "loss": 0.0087, "step": 96100 }, { "epoch": 2.696310843035489, "grad_norm": 0.027671201154589653, "learning_rate": 5.06148594940852e-06, "loss": 0.0149, "step": 96110 }, { "epoch": 2.6965913872913454, "grad_norm": 0.033116940408945084, "learning_rate": 5.056810211810913e-06, "loss": 0.0079, "step": 96120 }, { "epoch": 2.696871931547202, "grad_norm": 1.1049343347549438, "learning_rate": 5.052134474213307e-06, "loss": 0.0347, "step": 96130 }, { "epoch": 2.697152475803058, "grad_norm": 0.12677264213562012, "learning_rate": 5.047458736615701e-06, "loss": 0.0132, "step": 96140 }, { "epoch": 2.6974330200589143, "grad_norm": 0.016986677423119545, "learning_rate": 5.042782999018095e-06, "loss": 0.0088, "step": 96150 }, { "epoch": 2.6977135643147707, "grad_norm": 0.01468813605606556, "learning_rate": 5.038107261420489e-06, "loss": 0.017, "step": 96160 }, { "epoch": 2.697994108570627, "grad_norm": 0.03733847662806511, "learning_rate": 5.033431523822883e-06, "loss": 0.0114, "step": 96170 }, { "epoch": 2.698274652826483, "grad_norm": 1.9639136791229248, "learning_rate": 5.028755786225277e-06, "loss": 0.0102, "step": 96180 }, { "epoch": 2.6985551970823396, "grad_norm": 0.027758602052927017, "learning_rate": 5.024080048627671e-06, "loss": 0.0179, "step": 96190 }, { "epoch": 2.698835741338196, "grad_norm": 0.4510365426540375, "learning_rate": 5.019404311030065e-06, "loss": 0.0097, "step": 96200 }, { "epoch": 2.6991162855940525, "grad_norm": 1.4302831888198853, "learning_rate": 5.014728573432459e-06, "loss": 0.0329, "step": 96210 }, { "epoch": 2.699396829849909, "grad_norm": 0.016439007595181465, "learning_rate": 5.010052835834853e-06, "loss": 0.0072, "step": 96220 }, { "epoch": 2.6996773741057654, "grad_norm": 0.023010708391666412, "learning_rate": 5.005377098237247e-06, "loss": 0.0086, "step": 96230 }, { "epoch": 2.699957918361622, "grad_norm": 0.00975987408310175, "learning_rate": 5.000701360639641e-06, "loss": 0.019, "step": 96240 }, { "epoch": 2.700238462617478, "grad_norm": 2.1444315910339355, "learning_rate": 4.996025623042035e-06, "loss": 0.0442, "step": 96250 }, { "epoch": 2.7005190068733342, "grad_norm": 0.17264337837696075, "learning_rate": 4.991349885444429e-06, "loss": 0.0353, "step": 96260 }, { "epoch": 2.7007995511291907, "grad_norm": 0.44232720136642456, "learning_rate": 4.986674147846823e-06, "loss": 0.0241, "step": 96270 }, { "epoch": 2.701080095385047, "grad_norm": 0.12520861625671387, "learning_rate": 4.9819984102492164e-06, "loss": 0.0057, "step": 96280 }, { "epoch": 2.701360639640903, "grad_norm": 0.021245693787932396, "learning_rate": 4.977322672651611e-06, "loss": 0.0144, "step": 96290 }, { "epoch": 2.7016411838967596, "grad_norm": 0.39218008518218994, "learning_rate": 4.972646935054005e-06, "loss": 0.0203, "step": 96300 }, { "epoch": 2.701921728152616, "grad_norm": 0.014810794033110142, "learning_rate": 4.967971197456399e-06, "loss": 0.0091, "step": 96310 }, { "epoch": 2.7022022724084724, "grad_norm": 2.6546132564544678, "learning_rate": 4.963295459858793e-06, "loss": 0.0272, "step": 96320 }, { "epoch": 2.702482816664329, "grad_norm": 0.014806770719587803, "learning_rate": 4.9586197222611864e-06, "loss": 0.0213, "step": 96330 }, { "epoch": 2.7027633609201853, "grad_norm": 0.05026613920927048, "learning_rate": 4.953943984663581e-06, "loss": 0.0052, "step": 96340 }, { "epoch": 2.7030439051760418, "grad_norm": 0.5728306174278259, "learning_rate": 4.9492682470659744e-06, "loss": 0.0234, "step": 96350 }, { "epoch": 2.7033244494318978, "grad_norm": 1.8666737079620361, "learning_rate": 4.9445925094683684e-06, "loss": 0.0411, "step": 96360 }, { "epoch": 2.703604993687754, "grad_norm": 0.6678399443626404, "learning_rate": 4.939916771870763e-06, "loss": 0.037, "step": 96370 }, { "epoch": 2.7038855379436106, "grad_norm": 0.49820929765701294, "learning_rate": 4.9352410342731565e-06, "loss": 0.0107, "step": 96380 }, { "epoch": 2.704166082199467, "grad_norm": 0.2627655267715454, "learning_rate": 4.930565296675551e-06, "loss": 0.0256, "step": 96390 }, { "epoch": 2.704446626455323, "grad_norm": 0.04680115729570389, "learning_rate": 4.9258895590779445e-06, "loss": 0.0104, "step": 96400 }, { "epoch": 2.7047271707111795, "grad_norm": 1.4354987144470215, "learning_rate": 4.9212138214803385e-06, "loss": 0.0345, "step": 96410 }, { "epoch": 2.705007714967036, "grad_norm": 0.018880145624279976, "learning_rate": 4.9165380838827325e-06, "loss": 0.0104, "step": 96420 }, { "epoch": 2.7052882592228924, "grad_norm": 0.10500292479991913, "learning_rate": 4.9118623462851265e-06, "loss": 0.008, "step": 96430 }, { "epoch": 2.705568803478749, "grad_norm": 0.6700795888900757, "learning_rate": 4.907186608687521e-06, "loss": 0.0134, "step": 96440 }, { "epoch": 2.7058493477346053, "grad_norm": 0.5274437069892883, "learning_rate": 4.9025108710899145e-06, "loss": 0.0072, "step": 96450 }, { "epoch": 2.7061298919904617, "grad_norm": 0.041492439806461334, "learning_rate": 4.8978351334923085e-06, "loss": 0.0385, "step": 96460 }, { "epoch": 2.7064104362463177, "grad_norm": 0.2636822760105133, "learning_rate": 4.8931593958947025e-06, "loss": 0.0096, "step": 96470 }, { "epoch": 2.706690980502174, "grad_norm": 0.056010954082012177, "learning_rate": 4.8884836582970965e-06, "loss": 0.0184, "step": 96480 }, { "epoch": 2.7069715247580306, "grad_norm": 0.31842926144599915, "learning_rate": 4.883807920699491e-06, "loss": 0.028, "step": 96490 }, { "epoch": 2.707252069013887, "grad_norm": 0.061719704419374466, "learning_rate": 4.8791321831018845e-06, "loss": 0.007, "step": 96500 }, { "epoch": 2.707532613269743, "grad_norm": 0.3137947916984558, "learning_rate": 4.8744564455042785e-06, "loss": 0.0276, "step": 96510 }, { "epoch": 2.7078131575255995, "grad_norm": 0.02317063882946968, "learning_rate": 4.8697807079066725e-06, "loss": 0.0057, "step": 96520 }, { "epoch": 2.708093701781456, "grad_norm": 0.4262639582157135, "learning_rate": 4.8651049703090665e-06, "loss": 0.0219, "step": 96530 }, { "epoch": 2.7083742460373124, "grad_norm": 0.027890386059880257, "learning_rate": 4.8604292327114605e-06, "loss": 0.0096, "step": 96540 }, { "epoch": 2.708654790293169, "grad_norm": 0.586562991142273, "learning_rate": 4.8557534951138545e-06, "loss": 0.0213, "step": 96550 }, { "epoch": 2.7089353345490252, "grad_norm": 0.26901260018348694, "learning_rate": 4.8510777575162485e-06, "loss": 0.0281, "step": 96560 }, { "epoch": 2.7092158788048817, "grad_norm": 0.04063730686903, "learning_rate": 4.8464020199186425e-06, "loss": 0.0124, "step": 96570 }, { "epoch": 2.7094964230607377, "grad_norm": 0.06899912655353546, "learning_rate": 4.8417262823210365e-06, "loss": 0.0123, "step": 96580 }, { "epoch": 2.709776967316594, "grad_norm": 0.03477238491177559, "learning_rate": 4.8370505447234305e-06, "loss": 0.0084, "step": 96590 }, { "epoch": 2.7100575115724506, "grad_norm": 0.06349623948335648, "learning_rate": 4.8323748071258245e-06, "loss": 0.0226, "step": 96600 }, { "epoch": 2.710338055828307, "grad_norm": 2.2034504413604736, "learning_rate": 4.827699069528218e-06, "loss": 0.0353, "step": 96610 }, { "epoch": 2.7106186000841634, "grad_norm": 0.11316626518964767, "learning_rate": 4.8230233319306125e-06, "loss": 0.0287, "step": 96620 }, { "epoch": 2.7108991443400194, "grad_norm": 0.16803567111492157, "learning_rate": 4.8183475943330065e-06, "loss": 0.0163, "step": 96630 }, { "epoch": 2.711179688595876, "grad_norm": 0.19759467244148254, "learning_rate": 4.8136718567354005e-06, "loss": 0.0072, "step": 96640 }, { "epoch": 2.7114602328517323, "grad_norm": 0.7256720066070557, "learning_rate": 4.8089961191377945e-06, "loss": 0.0179, "step": 96650 }, { "epoch": 2.7117407771075888, "grad_norm": 5.696694850921631, "learning_rate": 4.804320381540188e-06, "loss": 0.0414, "step": 96660 }, { "epoch": 2.712021321363445, "grad_norm": 0.09859981387853622, "learning_rate": 4.7996446439425825e-06, "loss": 0.005, "step": 96670 }, { "epoch": 2.7123018656193016, "grad_norm": 0.06470633298158646, "learning_rate": 4.794968906344976e-06, "loss": 0.0046, "step": 96680 }, { "epoch": 2.7125824098751576, "grad_norm": 0.4526364803314209, "learning_rate": 4.7902931687473705e-06, "loss": 0.026, "step": 96690 }, { "epoch": 2.712862954131014, "grad_norm": 0.018169932067394257, "learning_rate": 4.7856174311497645e-06, "loss": 0.0329, "step": 96700 }, { "epoch": 2.7131434983868705, "grad_norm": 0.07389392703771591, "learning_rate": 4.780941693552158e-06, "loss": 0.0267, "step": 96710 }, { "epoch": 2.713424042642727, "grad_norm": 0.7813485860824585, "learning_rate": 4.7762659559545525e-06, "loss": 0.0376, "step": 96720 }, { "epoch": 2.7137045868985834, "grad_norm": 0.454476922750473, "learning_rate": 4.771590218356946e-06, "loss": 0.0182, "step": 96730 }, { "epoch": 2.7139851311544394, "grad_norm": 0.8978266716003418, "learning_rate": 4.76691448075934e-06, "loss": 0.0181, "step": 96740 }, { "epoch": 2.714265675410296, "grad_norm": 0.8688557744026184, "learning_rate": 4.762238743161734e-06, "loss": 0.0347, "step": 96750 }, { "epoch": 2.7145462196661523, "grad_norm": 0.02494489774107933, "learning_rate": 4.757563005564128e-06, "loss": 0.0134, "step": 96760 }, { "epoch": 2.7148267639220087, "grad_norm": 0.018234241753816605, "learning_rate": 4.7528872679665225e-06, "loss": 0.0124, "step": 96770 }, { "epoch": 2.715107308177865, "grad_norm": 0.09787583351135254, "learning_rate": 4.748211530368916e-06, "loss": 0.0246, "step": 96780 }, { "epoch": 2.7153878524337216, "grad_norm": 0.44244271516799927, "learning_rate": 4.74353579277131e-06, "loss": 0.0118, "step": 96790 }, { "epoch": 2.715668396689578, "grad_norm": 0.06280747801065445, "learning_rate": 4.738860055173704e-06, "loss": 0.0055, "step": 96800 }, { "epoch": 2.715948940945434, "grad_norm": 9.647136688232422, "learning_rate": 4.734184317576098e-06, "loss": 0.0234, "step": 96810 }, { "epoch": 2.7162294852012905, "grad_norm": 0.020243816077709198, "learning_rate": 4.7295085799784926e-06, "loss": 0.0171, "step": 96820 }, { "epoch": 2.716510029457147, "grad_norm": 0.10148458927869797, "learning_rate": 4.724832842380886e-06, "loss": 0.0153, "step": 96830 }, { "epoch": 2.7167905737130034, "grad_norm": 0.05266784876585007, "learning_rate": 4.72015710478328e-06, "loss": 0.0059, "step": 96840 }, { "epoch": 2.7170711179688594, "grad_norm": 0.2865549623966217, "learning_rate": 4.715481367185674e-06, "loss": 0.0114, "step": 96850 }, { "epoch": 2.717351662224716, "grad_norm": 4.380252361297607, "learning_rate": 4.710805629588068e-06, "loss": 0.0116, "step": 96860 }, { "epoch": 2.7176322064805722, "grad_norm": 0.2900189757347107, "learning_rate": 4.706129891990462e-06, "loss": 0.026, "step": 96870 }, { "epoch": 2.7179127507364287, "grad_norm": 0.011949995532631874, "learning_rate": 4.701454154392856e-06, "loss": 0.0098, "step": 96880 }, { "epoch": 2.718193294992285, "grad_norm": 0.010264729149639606, "learning_rate": 4.69677841679525e-06, "loss": 0.0118, "step": 96890 }, { "epoch": 2.7184738392481416, "grad_norm": 0.01663404144346714, "learning_rate": 4.692102679197644e-06, "loss": 0.0283, "step": 96900 }, { "epoch": 2.718754383503998, "grad_norm": 0.1509384661912918, "learning_rate": 4.687426941600038e-06, "loss": 0.0142, "step": 96910 }, { "epoch": 2.719034927759854, "grad_norm": 0.47080349922180176, "learning_rate": 4.682751204002432e-06, "loss": 0.0342, "step": 96920 }, { "epoch": 2.7193154720157104, "grad_norm": 3.0140297412872314, "learning_rate": 4.678075466404826e-06, "loss": 0.0159, "step": 96930 }, { "epoch": 2.719596016271567, "grad_norm": 0.09034015238285065, "learning_rate": 4.673399728807219e-06, "loss": 0.0063, "step": 96940 }, { "epoch": 2.7198765605274233, "grad_norm": 0.429349422454834, "learning_rate": 4.668723991209614e-06, "loss": 0.0374, "step": 96950 }, { "epoch": 2.7201571047832793, "grad_norm": 0.0631941705942154, "learning_rate": 4.664048253612008e-06, "loss": 0.0327, "step": 96960 }, { "epoch": 2.7204376490391358, "grad_norm": 0.015663262456655502, "learning_rate": 4.659372516014402e-06, "loss": 0.0418, "step": 96970 }, { "epoch": 2.720718193294992, "grad_norm": 0.8524253368377686, "learning_rate": 4.654696778416796e-06, "loss": 0.0159, "step": 96980 }, { "epoch": 2.7209987375508486, "grad_norm": 0.11144157499074936, "learning_rate": 4.650021040819189e-06, "loss": 0.0183, "step": 96990 }, { "epoch": 2.721279281806705, "grad_norm": 0.7088815569877625, "learning_rate": 4.645345303221584e-06, "loss": 0.0519, "step": 97000 }, { "epoch": 2.7215598260625615, "grad_norm": 0.3590463697910309, "learning_rate": 4.640669565623977e-06, "loss": 0.0096, "step": 97010 }, { "epoch": 2.721840370318418, "grad_norm": 0.22986623644828796, "learning_rate": 4.635993828026372e-06, "loss": 0.0103, "step": 97020 }, { "epoch": 2.722120914574274, "grad_norm": 0.046965714544057846, "learning_rate": 4.631318090428766e-06, "loss": 0.0302, "step": 97030 }, { "epoch": 2.7224014588301304, "grad_norm": 0.12317749112844467, "learning_rate": 4.626642352831159e-06, "loss": 0.0204, "step": 97040 }, { "epoch": 2.722682003085987, "grad_norm": 0.08137939125299454, "learning_rate": 4.621966615233554e-06, "loss": 0.0119, "step": 97050 }, { "epoch": 2.7229625473418433, "grad_norm": 0.19878114759922028, "learning_rate": 4.617290877635947e-06, "loss": 0.0105, "step": 97060 }, { "epoch": 2.7232430915976993, "grad_norm": 0.5173349380493164, "learning_rate": 4.612615140038341e-06, "loss": 0.0073, "step": 97070 }, { "epoch": 2.7235236358535557, "grad_norm": 0.041129838675260544, "learning_rate": 4.607939402440735e-06, "loss": 0.0136, "step": 97080 }, { "epoch": 2.723804180109412, "grad_norm": 0.023018499836325645, "learning_rate": 4.603263664843129e-06, "loss": 0.0144, "step": 97090 }, { "epoch": 2.7240847243652686, "grad_norm": 0.5185394883155823, "learning_rate": 4.598587927245524e-06, "loss": 0.0373, "step": 97100 }, { "epoch": 2.724365268621125, "grad_norm": 0.11436716467142105, "learning_rate": 4.593912189647917e-06, "loss": 0.005, "step": 97110 }, { "epoch": 2.7246458128769815, "grad_norm": 0.023389184847474098, "learning_rate": 4.589236452050311e-06, "loss": 0.011, "step": 97120 }, { "epoch": 2.724926357132838, "grad_norm": 0.04633180424571037, "learning_rate": 4.584560714452705e-06, "loss": 0.012, "step": 97130 }, { "epoch": 2.725206901388694, "grad_norm": 0.0876665785908699, "learning_rate": 4.579884976855099e-06, "loss": 0.0139, "step": 97140 }, { "epoch": 2.7254874456445504, "grad_norm": 0.39418721199035645, "learning_rate": 4.575209239257494e-06, "loss": 0.0163, "step": 97150 }, { "epoch": 2.725767989900407, "grad_norm": 0.008377982303500175, "learning_rate": 4.570533501659887e-06, "loss": 0.0145, "step": 97160 }, { "epoch": 2.7260485341562632, "grad_norm": 0.021236460655927658, "learning_rate": 4.565857764062281e-06, "loss": 0.0306, "step": 97170 }, { "epoch": 2.7263290784121192, "grad_norm": 1.3045552968978882, "learning_rate": 4.561182026464675e-06, "loss": 0.0246, "step": 97180 }, { "epoch": 2.7266096226679757, "grad_norm": 0.5446003079414368, "learning_rate": 4.556506288867069e-06, "loss": 0.0132, "step": 97190 }, { "epoch": 2.726890166923832, "grad_norm": 0.05723093822598457, "learning_rate": 4.551830551269463e-06, "loss": 0.0056, "step": 97200 }, { "epoch": 2.7271707111796886, "grad_norm": 0.309341698884964, "learning_rate": 4.547154813671857e-06, "loss": 0.0045, "step": 97210 }, { "epoch": 2.727451255435545, "grad_norm": 0.018036969006061554, "learning_rate": 4.542479076074251e-06, "loss": 0.0107, "step": 97220 }, { "epoch": 2.7277317996914014, "grad_norm": 0.12035676091909409, "learning_rate": 4.537803338476645e-06, "loss": 0.0043, "step": 97230 }, { "epoch": 2.728012343947258, "grad_norm": 0.011024261824786663, "learning_rate": 4.533127600879039e-06, "loss": 0.0146, "step": 97240 }, { "epoch": 2.728292888203114, "grad_norm": 0.10273898392915726, "learning_rate": 4.528451863281433e-06, "loss": 0.0069, "step": 97250 }, { "epoch": 2.7285734324589703, "grad_norm": 0.02254326455295086, "learning_rate": 4.523776125683827e-06, "loss": 0.0277, "step": 97260 }, { "epoch": 2.7288539767148268, "grad_norm": 0.01623787358403206, "learning_rate": 4.51910038808622e-06, "loss": 0.0127, "step": 97270 }, { "epoch": 2.729134520970683, "grad_norm": 0.044248055666685104, "learning_rate": 4.514424650488615e-06, "loss": 0.0137, "step": 97280 }, { "epoch": 2.7294150652265396, "grad_norm": 0.18413566052913666, "learning_rate": 4.509748912891009e-06, "loss": 0.0177, "step": 97290 }, { "epoch": 2.7296956094823956, "grad_norm": 0.526718020439148, "learning_rate": 4.505073175293403e-06, "loss": 0.008, "step": 97300 }, { "epoch": 2.729976153738252, "grad_norm": 0.015199770219624043, "learning_rate": 4.500397437695797e-06, "loss": 0.0065, "step": 97310 }, { "epoch": 2.7302566979941085, "grad_norm": 0.19563697278499603, "learning_rate": 4.49572170009819e-06, "loss": 0.0246, "step": 97320 }, { "epoch": 2.730537242249965, "grad_norm": 1.6128565073013306, "learning_rate": 4.491045962500585e-06, "loss": 0.0352, "step": 97330 }, { "epoch": 2.7308177865058214, "grad_norm": 0.007921867072582245, "learning_rate": 4.486370224902978e-06, "loss": 0.0189, "step": 97340 }, { "epoch": 2.731098330761678, "grad_norm": 0.043405789881944656, "learning_rate": 4.481694487305373e-06, "loss": 0.0277, "step": 97350 }, { "epoch": 2.7313788750175343, "grad_norm": 0.4610677659511566, "learning_rate": 4.477018749707767e-06, "loss": 0.0186, "step": 97360 }, { "epoch": 2.7316594192733903, "grad_norm": 1.177526831626892, "learning_rate": 4.47234301211016e-06, "loss": 0.0548, "step": 97370 }, { "epoch": 2.7319399635292467, "grad_norm": 0.052110325545072556, "learning_rate": 4.467667274512555e-06, "loss": 0.026, "step": 97380 }, { "epoch": 2.732220507785103, "grad_norm": 0.031395524740219116, "learning_rate": 4.462991536914948e-06, "loss": 0.0198, "step": 97390 }, { "epoch": 2.7325010520409596, "grad_norm": 0.1882261484861374, "learning_rate": 4.458315799317343e-06, "loss": 0.0166, "step": 97400 }, { "epoch": 2.7327815962968156, "grad_norm": 0.16173626482486725, "learning_rate": 4.453640061719736e-06, "loss": 0.0139, "step": 97410 }, { "epoch": 2.733062140552672, "grad_norm": 0.2895037531852722, "learning_rate": 4.44896432412213e-06, "loss": 0.0151, "step": 97420 }, { "epoch": 2.7333426848085285, "grad_norm": 0.030069053173065186, "learning_rate": 4.444288586524525e-06, "loss": 0.0244, "step": 97430 }, { "epoch": 2.733623229064385, "grad_norm": 0.012016207911074162, "learning_rate": 4.439612848926918e-06, "loss": 0.0168, "step": 97440 }, { "epoch": 2.7339037733202414, "grad_norm": 0.30833929777145386, "learning_rate": 4.434937111329312e-06, "loss": 0.0584, "step": 97450 }, { "epoch": 2.734184317576098, "grad_norm": 0.06442336738109589, "learning_rate": 4.430261373731706e-06, "loss": 0.0085, "step": 97460 }, { "epoch": 2.7344648618319543, "grad_norm": 0.01747160404920578, "learning_rate": 4.4255856361341e-06, "loss": 0.0314, "step": 97470 }, { "epoch": 2.7347454060878102, "grad_norm": 0.23468336462974548, "learning_rate": 4.420909898536495e-06, "loss": 0.0117, "step": 97480 }, { "epoch": 2.7350259503436667, "grad_norm": 0.23641163110733032, "learning_rate": 4.416234160938888e-06, "loss": 0.0044, "step": 97490 }, { "epoch": 2.735306494599523, "grad_norm": 0.34138864278793335, "learning_rate": 4.411558423341282e-06, "loss": 0.0149, "step": 97500 }, { "epoch": 2.7355870388553796, "grad_norm": 0.6996464133262634, "learning_rate": 4.406882685743676e-06, "loss": 0.0253, "step": 97510 }, { "epoch": 2.7358675831112356, "grad_norm": 0.03984682261943817, "learning_rate": 4.40220694814607e-06, "loss": 0.0047, "step": 97520 }, { "epoch": 2.736148127367092, "grad_norm": 0.17951934039592743, "learning_rate": 4.397531210548464e-06, "loss": 0.0157, "step": 97530 }, { "epoch": 2.7364286716229484, "grad_norm": 0.5557291507720947, "learning_rate": 4.392855472950858e-06, "loss": 0.0354, "step": 97540 }, { "epoch": 2.736709215878805, "grad_norm": 0.6642881035804749, "learning_rate": 4.388179735353252e-06, "loss": 0.0205, "step": 97550 }, { "epoch": 2.7369897601346613, "grad_norm": 0.6662353277206421, "learning_rate": 4.383503997755646e-06, "loss": 0.0129, "step": 97560 }, { "epoch": 2.7372703043905178, "grad_norm": 0.30541595816612244, "learning_rate": 4.37882826015804e-06, "loss": 0.0462, "step": 97570 }, { "epoch": 2.737550848646374, "grad_norm": 0.48681342601776123, "learning_rate": 4.374152522560434e-06, "loss": 0.0196, "step": 97580 }, { "epoch": 2.73783139290223, "grad_norm": 0.6455636620521545, "learning_rate": 4.369476784962828e-06, "loss": 0.035, "step": 97590 }, { "epoch": 2.7381119371580867, "grad_norm": 0.0359266996383667, "learning_rate": 4.364801047365222e-06, "loss": 0.006, "step": 97600 }, { "epoch": 2.738392481413943, "grad_norm": 0.08056779950857162, "learning_rate": 4.360125309767616e-06, "loss": 0.0059, "step": 97610 }, { "epoch": 2.7386730256697995, "grad_norm": 0.026766326278448105, "learning_rate": 4.35544957217001e-06, "loss": 0.0138, "step": 97620 }, { "epoch": 2.7389535699256555, "grad_norm": 0.11636752635240555, "learning_rate": 4.350773834572404e-06, "loss": 0.0409, "step": 97630 }, { "epoch": 2.739234114181512, "grad_norm": 0.38340264558792114, "learning_rate": 4.346098096974798e-06, "loss": 0.0097, "step": 97640 }, { "epoch": 2.7395146584373684, "grad_norm": 1.7026065587997437, "learning_rate": 4.341422359377191e-06, "loss": 0.0638, "step": 97650 }, { "epoch": 2.739795202693225, "grad_norm": 0.0924546867609024, "learning_rate": 4.336746621779586e-06, "loss": 0.012, "step": 97660 }, { "epoch": 2.7400757469490813, "grad_norm": 0.01608145795762539, "learning_rate": 4.332070884181979e-06, "loss": 0.0205, "step": 97670 }, { "epoch": 2.7403562912049377, "grad_norm": 0.05502180755138397, "learning_rate": 4.327395146584374e-06, "loss": 0.0184, "step": 97680 }, { "epoch": 2.740636835460794, "grad_norm": 0.32280975580215454, "learning_rate": 4.322719408986768e-06, "loss": 0.0096, "step": 97690 }, { "epoch": 2.74091737971665, "grad_norm": 0.08199383318424225, "learning_rate": 4.318043671389161e-06, "loss": 0.044, "step": 97700 }, { "epoch": 2.7411979239725066, "grad_norm": 0.3945506513118744, "learning_rate": 4.313367933791556e-06, "loss": 0.017, "step": 97710 }, { "epoch": 2.741478468228363, "grad_norm": 0.28826844692230225, "learning_rate": 4.308692196193949e-06, "loss": 0.0168, "step": 97720 }, { "epoch": 2.7417590124842195, "grad_norm": 0.23766936361789703, "learning_rate": 4.304016458596344e-06, "loss": 0.013, "step": 97730 }, { "epoch": 2.7420395567400755, "grad_norm": 0.06458599120378494, "learning_rate": 4.299340720998737e-06, "loss": 0.0176, "step": 97740 }, { "epoch": 2.742320100995932, "grad_norm": 0.49679839611053467, "learning_rate": 4.294664983401131e-06, "loss": 0.0122, "step": 97750 }, { "epoch": 2.7426006452517884, "grad_norm": 0.6684425473213196, "learning_rate": 4.289989245803526e-06, "loss": 0.0105, "step": 97760 }, { "epoch": 2.742881189507645, "grad_norm": 0.014896619133651257, "learning_rate": 4.285313508205919e-06, "loss": 0.0114, "step": 97770 }, { "epoch": 2.7431617337635013, "grad_norm": 0.28014546632766724, "learning_rate": 4.280637770608314e-06, "loss": 0.0048, "step": 97780 }, { "epoch": 2.7434422780193577, "grad_norm": 0.7044385671615601, "learning_rate": 4.275962033010707e-06, "loss": 0.0039, "step": 97790 }, { "epoch": 2.743722822275214, "grad_norm": 0.3779221475124359, "learning_rate": 4.271286295413101e-06, "loss": 0.0219, "step": 97800 }, { "epoch": 2.74400336653107, "grad_norm": 0.3977547585964203, "learning_rate": 4.266610557815496e-06, "loss": 0.0072, "step": 97810 }, { "epoch": 2.7442839107869266, "grad_norm": 0.06259335577487946, "learning_rate": 4.261934820217889e-06, "loss": 0.0256, "step": 97820 }, { "epoch": 2.744564455042783, "grad_norm": 0.02200593426823616, "learning_rate": 4.257259082620283e-06, "loss": 0.02, "step": 97830 }, { "epoch": 2.7448449992986395, "grad_norm": 0.2792532742023468, "learning_rate": 4.252583345022677e-06, "loss": 0.0078, "step": 97840 }, { "epoch": 2.7451255435544955, "grad_norm": 0.04898834973573685, "learning_rate": 4.247907607425071e-06, "loss": 0.004, "step": 97850 }, { "epoch": 2.745406087810352, "grad_norm": 0.32332780957221985, "learning_rate": 4.243231869827465e-06, "loss": 0.0113, "step": 97860 }, { "epoch": 2.7456866320662083, "grad_norm": 0.02107483707368374, "learning_rate": 4.238556132229859e-06, "loss": 0.063, "step": 97870 }, { "epoch": 2.7459671763220648, "grad_norm": 0.01559491828083992, "learning_rate": 4.233880394632253e-06, "loss": 0.017, "step": 97880 }, { "epoch": 2.746247720577921, "grad_norm": 0.05894310772418976, "learning_rate": 4.229204657034647e-06, "loss": 0.0254, "step": 97890 }, { "epoch": 2.7465282648337777, "grad_norm": 0.8746993541717529, "learning_rate": 4.224528919437041e-06, "loss": 0.0105, "step": 97900 }, { "epoch": 2.746808809089634, "grad_norm": 0.04991190880537033, "learning_rate": 4.219853181839435e-06, "loss": 0.0091, "step": 97910 }, { "epoch": 2.74708935334549, "grad_norm": 0.028718626126646996, "learning_rate": 4.215177444241829e-06, "loss": 0.0045, "step": 97920 }, { "epoch": 2.7473698976013465, "grad_norm": 0.939889669418335, "learning_rate": 4.210501706644223e-06, "loss": 0.0455, "step": 97930 }, { "epoch": 2.747650441857203, "grad_norm": 0.0069495574571192265, "learning_rate": 4.205825969046617e-06, "loss": 0.0183, "step": 97940 }, { "epoch": 2.7479309861130594, "grad_norm": 0.030550742521882057, "learning_rate": 4.201150231449011e-06, "loss": 0.0287, "step": 97950 }, { "epoch": 2.748211530368916, "grad_norm": 0.24104171991348267, "learning_rate": 4.196474493851405e-06, "loss": 0.0227, "step": 97960 }, { "epoch": 2.748492074624772, "grad_norm": 0.010856451466679573, "learning_rate": 4.191798756253799e-06, "loss": 0.0497, "step": 97970 }, { "epoch": 2.7487726188806283, "grad_norm": 0.050284769386053085, "learning_rate": 4.187123018656193e-06, "loss": 0.0107, "step": 97980 }, { "epoch": 2.7490531631364847, "grad_norm": 0.4712170660495758, "learning_rate": 4.1824472810585874e-06, "loss": 0.0224, "step": 97990 }, { "epoch": 2.749333707392341, "grad_norm": 0.25176116824150085, "learning_rate": 4.177771543460981e-06, "loss": 0.0205, "step": 98000 }, { "epoch": 2.7496142516481976, "grad_norm": 0.054529473185539246, "learning_rate": 4.1730958058633754e-06, "loss": 0.0208, "step": 98010 }, { "epoch": 2.749894795904054, "grad_norm": 0.15739431977272034, "learning_rate": 4.1684200682657694e-06, "loss": 0.0187, "step": 98020 }, { "epoch": 2.7501753401599105, "grad_norm": 0.7280672192573547, "learning_rate": 4.163744330668163e-06, "loss": 0.0321, "step": 98030 }, { "epoch": 2.7504558844157665, "grad_norm": 0.47603148221969604, "learning_rate": 4.1590685930705574e-06, "loss": 0.0111, "step": 98040 }, { "epoch": 2.750736428671623, "grad_norm": 0.031408101320266724, "learning_rate": 4.154392855472951e-06, "loss": 0.0073, "step": 98050 }, { "epoch": 2.7510169729274794, "grad_norm": 0.050200771540403366, "learning_rate": 4.1497171178753454e-06, "loss": 0.0473, "step": 98060 }, { "epoch": 2.751297517183336, "grad_norm": 0.5481410622596741, "learning_rate": 4.145041380277739e-06, "loss": 0.0136, "step": 98070 }, { "epoch": 2.751578061439192, "grad_norm": 0.39717963337898254, "learning_rate": 4.140365642680133e-06, "loss": 0.0226, "step": 98080 }, { "epoch": 2.7518586056950483, "grad_norm": 0.8599832057952881, "learning_rate": 4.1356899050825274e-06, "loss": 0.0387, "step": 98090 }, { "epoch": 2.7521391499509047, "grad_norm": 0.03148459643125534, "learning_rate": 4.131014167484921e-06, "loss": 0.0048, "step": 98100 }, { "epoch": 2.752419694206761, "grad_norm": 0.40717852115631104, "learning_rate": 4.1263384298873155e-06, "loss": 0.0171, "step": 98110 }, { "epoch": 2.7527002384626176, "grad_norm": 0.24973945319652557, "learning_rate": 4.121662692289709e-06, "loss": 0.0156, "step": 98120 }, { "epoch": 2.752980782718474, "grad_norm": 0.04176067188382149, "learning_rate": 4.116986954692103e-06, "loss": 0.0085, "step": 98130 }, { "epoch": 2.7532613269743305, "grad_norm": 0.2432372272014618, "learning_rate": 4.1123112170944975e-06, "loss": 0.0656, "step": 98140 }, { "epoch": 2.7535418712301865, "grad_norm": 0.03799721971154213, "learning_rate": 4.107635479496891e-06, "loss": 0.0037, "step": 98150 }, { "epoch": 2.753822415486043, "grad_norm": 0.05845782533288002, "learning_rate": 4.102959741899285e-06, "loss": 0.021, "step": 98160 }, { "epoch": 2.7541029597418993, "grad_norm": 0.03665563836693764, "learning_rate": 4.098284004301679e-06, "loss": 0.0161, "step": 98170 }, { "epoch": 2.7543835039977558, "grad_norm": 0.0855654925107956, "learning_rate": 4.093608266704073e-06, "loss": 0.0257, "step": 98180 }, { "epoch": 2.7546640482536118, "grad_norm": 0.4654264450073242, "learning_rate": 4.088932529106467e-06, "loss": 0.0298, "step": 98190 }, { "epoch": 2.754944592509468, "grad_norm": 0.2804160714149475, "learning_rate": 4.084256791508861e-06, "loss": 0.0315, "step": 98200 }, { "epoch": 2.7552251367653247, "grad_norm": 0.8214215636253357, "learning_rate": 4.079581053911255e-06, "loss": 0.0135, "step": 98210 }, { "epoch": 2.755505681021181, "grad_norm": 0.11476704478263855, "learning_rate": 4.074905316313649e-06, "loss": 0.0187, "step": 98220 }, { "epoch": 2.7557862252770375, "grad_norm": 0.04712029546499252, "learning_rate": 4.070229578716043e-06, "loss": 0.0198, "step": 98230 }, { "epoch": 2.756066769532894, "grad_norm": 0.07745998352766037, "learning_rate": 4.065553841118437e-06, "loss": 0.0282, "step": 98240 }, { "epoch": 2.7563473137887504, "grad_norm": 0.03934316709637642, "learning_rate": 4.060878103520831e-06, "loss": 0.0105, "step": 98250 }, { "epoch": 2.7566278580446064, "grad_norm": 0.5845766067504883, "learning_rate": 4.056202365923225e-06, "loss": 0.0092, "step": 98260 }, { "epoch": 2.756908402300463, "grad_norm": 0.35577309131622314, "learning_rate": 4.051526628325619e-06, "loss": 0.0172, "step": 98270 }, { "epoch": 2.7571889465563193, "grad_norm": 0.35813382267951965, "learning_rate": 4.046850890728013e-06, "loss": 0.012, "step": 98280 }, { "epoch": 2.7574694908121757, "grad_norm": 0.3195952773094177, "learning_rate": 4.042175153130407e-06, "loss": 0.0216, "step": 98290 }, { "epoch": 2.7577500350680317, "grad_norm": 0.016118209809064865, "learning_rate": 4.037499415532801e-06, "loss": 0.019, "step": 98300 }, { "epoch": 2.758030579323888, "grad_norm": 1.5460902452468872, "learning_rate": 4.032823677935195e-06, "loss": 0.0456, "step": 98310 }, { "epoch": 2.7583111235797446, "grad_norm": 0.3829297125339508, "learning_rate": 4.028147940337589e-06, "loss": 0.0333, "step": 98320 }, { "epoch": 2.758591667835601, "grad_norm": 0.6489512324333191, "learning_rate": 4.023472202739982e-06, "loss": 0.0146, "step": 98330 }, { "epoch": 2.7588722120914575, "grad_norm": 1.8976432085037231, "learning_rate": 4.018796465142377e-06, "loss": 0.0102, "step": 98340 }, { "epoch": 2.759152756347314, "grad_norm": 0.9889737963676453, "learning_rate": 4.014120727544771e-06, "loss": 0.0295, "step": 98350 }, { "epoch": 2.7594333006031704, "grad_norm": 0.7141300439834595, "learning_rate": 4.009444989947164e-06, "loss": 0.0194, "step": 98360 }, { "epoch": 2.7597138448590264, "grad_norm": 0.6071731448173523, "learning_rate": 4.004769252349559e-06, "loss": 0.0291, "step": 98370 }, { "epoch": 2.759994389114883, "grad_norm": 0.056936267763376236, "learning_rate": 4.000093514751952e-06, "loss": 0.0105, "step": 98380 }, { "epoch": 2.7602749333707393, "grad_norm": 0.010615776292979717, "learning_rate": 3.995417777154347e-06, "loss": 0.0124, "step": 98390 }, { "epoch": 2.7605554776265957, "grad_norm": 0.011840260587632656, "learning_rate": 3.990742039556741e-06, "loss": 0.0382, "step": 98400 }, { "epoch": 2.7608360218824517, "grad_norm": 0.008473563939332962, "learning_rate": 3.986066301959134e-06, "loss": 0.011, "step": 98410 }, { "epoch": 2.761116566138308, "grad_norm": 0.35384488105773926, "learning_rate": 3.981390564361529e-06, "loss": 0.0085, "step": 98420 }, { "epoch": 2.7613971103941646, "grad_norm": 0.15556305646896362, "learning_rate": 3.976714826763922e-06, "loss": 0.0054, "step": 98430 }, { "epoch": 2.761677654650021, "grad_norm": 1.0046367645263672, "learning_rate": 3.972039089166317e-06, "loss": 0.0147, "step": 98440 }, { "epoch": 2.7619581989058775, "grad_norm": 0.021275006234645844, "learning_rate": 3.96736335156871e-06, "loss": 0.0367, "step": 98450 }, { "epoch": 2.762238743161734, "grad_norm": 0.02899477444589138, "learning_rate": 3.962687613971104e-06, "loss": 0.0118, "step": 98460 }, { "epoch": 2.7625192874175903, "grad_norm": 0.011770063079893589, "learning_rate": 3.958011876373499e-06, "loss": 0.0093, "step": 98470 }, { "epoch": 2.7627998316734463, "grad_norm": 0.23549823462963104, "learning_rate": 3.953336138775892e-06, "loss": 0.0102, "step": 98480 }, { "epoch": 2.763080375929303, "grad_norm": 2.310311794281006, "learning_rate": 3.948660401178287e-06, "loss": 0.02, "step": 98490 }, { "epoch": 2.763360920185159, "grad_norm": 0.23475505411624908, "learning_rate": 3.94398466358068e-06, "loss": 0.0102, "step": 98500 }, { "epoch": 2.7636414644410157, "grad_norm": 0.8987151980400085, "learning_rate": 3.939308925983074e-06, "loss": 0.053, "step": 98510 }, { "epoch": 2.7639220086968717, "grad_norm": 0.09099803119897842, "learning_rate": 3.934633188385468e-06, "loss": 0.0367, "step": 98520 }, { "epoch": 2.764202552952728, "grad_norm": 0.9385293126106262, "learning_rate": 3.929957450787862e-06, "loss": 0.0345, "step": 98530 }, { "epoch": 2.7644830972085845, "grad_norm": 0.05004371330142021, "learning_rate": 3.925281713190256e-06, "loss": 0.054, "step": 98540 }, { "epoch": 2.764763641464441, "grad_norm": 0.04122493416070938, "learning_rate": 3.92060597559265e-06, "loss": 0.0365, "step": 98550 }, { "epoch": 2.7650441857202974, "grad_norm": 0.5973609685897827, "learning_rate": 3.915930237995044e-06, "loss": 0.0089, "step": 98560 }, { "epoch": 2.765324729976154, "grad_norm": 0.6229075193405151, "learning_rate": 3.911254500397438e-06, "loss": 0.0226, "step": 98570 }, { "epoch": 2.7656052742320103, "grad_norm": 0.2052660584449768, "learning_rate": 3.906578762799832e-06, "loss": 0.0328, "step": 98580 }, { "epoch": 2.7658858184878663, "grad_norm": 0.017743758857250214, "learning_rate": 3.901903025202226e-06, "loss": 0.0334, "step": 98590 }, { "epoch": 2.7661663627437227, "grad_norm": 1.342902421951294, "learning_rate": 3.89722728760462e-06, "loss": 0.0259, "step": 98600 }, { "epoch": 2.766446906999579, "grad_norm": 0.00732979504391551, "learning_rate": 3.892551550007014e-06, "loss": 0.0278, "step": 98610 }, { "epoch": 2.7667274512554356, "grad_norm": 0.20016871392726898, "learning_rate": 3.887875812409408e-06, "loss": 0.0205, "step": 98620 }, { "epoch": 2.767007995511292, "grad_norm": 0.23656903207302094, "learning_rate": 3.883200074811802e-06, "loss": 0.0142, "step": 98630 }, { "epoch": 2.767288539767148, "grad_norm": 2.699535369873047, "learning_rate": 3.878524337214196e-06, "loss": 0.0466, "step": 98640 }, { "epoch": 2.7675690840230045, "grad_norm": 0.052501313388347626, "learning_rate": 3.87384859961659e-06, "loss": 0.0217, "step": 98650 }, { "epoch": 2.767849628278861, "grad_norm": 0.3493613004684448, "learning_rate": 3.869172862018983e-06, "loss": 0.0097, "step": 98660 }, { "epoch": 2.7681301725347174, "grad_norm": 0.028407234698534012, "learning_rate": 3.864497124421378e-06, "loss": 0.0281, "step": 98670 }, { "epoch": 2.768410716790574, "grad_norm": 0.07707203924655914, "learning_rate": 3.859821386823772e-06, "loss": 0.0114, "step": 98680 }, { "epoch": 2.7686912610464303, "grad_norm": 0.36227181553840637, "learning_rate": 3.855145649226166e-06, "loss": 0.0138, "step": 98690 }, { "epoch": 2.7689718053022867, "grad_norm": 0.13211360573768616, "learning_rate": 3.85046991162856e-06, "loss": 0.0185, "step": 98700 }, { "epoch": 2.7692523495581427, "grad_norm": 0.035928864032030106, "learning_rate": 3.845794174030953e-06, "loss": 0.0139, "step": 98710 }, { "epoch": 2.769532893813999, "grad_norm": 0.5473422408103943, "learning_rate": 3.841118436433348e-06, "loss": 0.0266, "step": 98720 }, { "epoch": 2.7698134380698556, "grad_norm": 0.03475858271121979, "learning_rate": 3.836442698835742e-06, "loss": 0.0302, "step": 98730 }, { "epoch": 2.770093982325712, "grad_norm": 0.023337554186582565, "learning_rate": 3.831766961238135e-06, "loss": 0.0324, "step": 98740 }, { "epoch": 2.770374526581568, "grad_norm": 0.09213779121637344, "learning_rate": 3.82709122364053e-06, "loss": 0.021, "step": 98750 }, { "epoch": 2.7706550708374245, "grad_norm": 1.6012266874313354, "learning_rate": 3.822415486042923e-06, "loss": 0.0387, "step": 98760 }, { "epoch": 2.770935615093281, "grad_norm": 0.40534958243370056, "learning_rate": 3.817739748445318e-06, "loss": 0.0084, "step": 98770 }, { "epoch": 2.7712161593491373, "grad_norm": 0.06647509336471558, "learning_rate": 3.813064010847711e-06, "loss": 0.0054, "step": 98780 }, { "epoch": 2.771496703604994, "grad_norm": 0.7185389399528503, "learning_rate": 3.8083882732501055e-06, "loss": 0.0116, "step": 98790 }, { "epoch": 2.7717772478608502, "grad_norm": 0.22461585700511932, "learning_rate": 3.8037125356524995e-06, "loss": 0.022, "step": 98800 }, { "epoch": 2.7720577921167067, "grad_norm": 0.32688090205192566, "learning_rate": 3.799036798054893e-06, "loss": 0.0218, "step": 98810 }, { "epoch": 2.7723383363725627, "grad_norm": 0.017110729590058327, "learning_rate": 3.7943610604572875e-06, "loss": 0.0054, "step": 98820 }, { "epoch": 2.772618880628419, "grad_norm": 0.3572097420692444, "learning_rate": 3.789685322859681e-06, "loss": 0.0149, "step": 98830 }, { "epoch": 2.7728994248842755, "grad_norm": 0.5192736983299255, "learning_rate": 3.7850095852620755e-06, "loss": 0.0067, "step": 98840 }, { "epoch": 2.773179969140132, "grad_norm": 0.2849927544593811, "learning_rate": 3.780333847664469e-06, "loss": 0.0147, "step": 98850 }, { "epoch": 2.773460513395988, "grad_norm": 2.7321698665618896, "learning_rate": 3.775658110066863e-06, "loss": 0.032, "step": 98860 }, { "epoch": 2.7737410576518444, "grad_norm": 0.022323476150631905, "learning_rate": 3.7709823724692575e-06, "loss": 0.0164, "step": 98870 }, { "epoch": 2.774021601907701, "grad_norm": 0.356204092502594, "learning_rate": 3.766306634871651e-06, "loss": 0.0108, "step": 98880 }, { "epoch": 2.7743021461635573, "grad_norm": 0.020547978579998016, "learning_rate": 3.7616308972740455e-06, "loss": 0.016, "step": 98890 }, { "epoch": 2.7745826904194137, "grad_norm": 0.022181060165166855, "learning_rate": 3.756955159676439e-06, "loss": 0.0174, "step": 98900 }, { "epoch": 2.77486323467527, "grad_norm": 0.09108683466911316, "learning_rate": 3.752279422078833e-06, "loss": 0.0055, "step": 98910 }, { "epoch": 2.7751437789311266, "grad_norm": 1.3851414918899536, "learning_rate": 3.7476036844812267e-06, "loss": 0.0419, "step": 98920 }, { "epoch": 2.7754243231869826, "grad_norm": 4.224987030029297, "learning_rate": 3.742927946883621e-06, "loss": 0.0256, "step": 98930 }, { "epoch": 2.775704867442839, "grad_norm": 0.003407210810109973, "learning_rate": 3.7382522092860155e-06, "loss": 0.0041, "step": 98940 }, { "epoch": 2.7759854116986955, "grad_norm": 0.441933810710907, "learning_rate": 3.733576471688409e-06, "loss": 0.0105, "step": 98950 }, { "epoch": 2.776265955954552, "grad_norm": 0.06306535005569458, "learning_rate": 3.728900734090803e-06, "loss": 0.0069, "step": 98960 }, { "epoch": 2.776546500210408, "grad_norm": 0.013743095099925995, "learning_rate": 3.7242249964931967e-06, "loss": 0.0188, "step": 98970 }, { "epoch": 2.7768270444662644, "grad_norm": 0.07606027275323868, "learning_rate": 3.719549258895591e-06, "loss": 0.0171, "step": 98980 }, { "epoch": 2.777107588722121, "grad_norm": 0.07514607161283493, "learning_rate": 3.7148735212979847e-06, "loss": 0.008, "step": 98990 }, { "epoch": 2.7773881329779773, "grad_norm": 2.7925593852996826, "learning_rate": 3.7101977837003787e-06, "loss": 0.0357, "step": 99000 }, { "epoch": 2.7776686772338337, "grad_norm": 0.30503445863723755, "learning_rate": 3.705522046102773e-06, "loss": 0.0191, "step": 99010 }, { "epoch": 2.77794922148969, "grad_norm": 3.4797070026397705, "learning_rate": 3.7008463085051667e-06, "loss": 0.0267, "step": 99020 }, { "epoch": 2.7782297657455466, "grad_norm": 0.5590865612030029, "learning_rate": 3.696170570907561e-06, "loss": 0.0288, "step": 99030 }, { "epoch": 2.7785103100014026, "grad_norm": 0.17575904726982117, "learning_rate": 3.6914948333099547e-06, "loss": 0.0289, "step": 99040 }, { "epoch": 2.778790854257259, "grad_norm": 0.21583981812000275, "learning_rate": 3.6868190957123487e-06, "loss": 0.0083, "step": 99050 }, { "epoch": 2.7790713985131155, "grad_norm": 0.03670915588736534, "learning_rate": 3.682143358114743e-06, "loss": 0.0117, "step": 99060 }, { "epoch": 2.779351942768972, "grad_norm": 0.42946943640708923, "learning_rate": 3.6774676205171367e-06, "loss": 0.0115, "step": 99070 }, { "epoch": 2.779632487024828, "grad_norm": 0.4494803845882416, "learning_rate": 3.672791882919531e-06, "loss": 0.0147, "step": 99080 }, { "epoch": 2.7799130312806843, "grad_norm": 0.15112651884555817, "learning_rate": 3.6681161453219247e-06, "loss": 0.0244, "step": 99090 }, { "epoch": 2.780193575536541, "grad_norm": 0.11501306295394897, "learning_rate": 3.6634404077243187e-06, "loss": 0.0086, "step": 99100 }, { "epoch": 2.7804741197923972, "grad_norm": 0.5109874606132507, "learning_rate": 3.6587646701267123e-06, "loss": 0.0102, "step": 99110 }, { "epoch": 2.7807546640482537, "grad_norm": 0.9834601283073425, "learning_rate": 3.6540889325291067e-06, "loss": 0.0226, "step": 99120 }, { "epoch": 2.78103520830411, "grad_norm": 0.038835544139146805, "learning_rate": 3.649413194931501e-06, "loss": 0.0126, "step": 99130 }, { "epoch": 2.7813157525599665, "grad_norm": 0.4023655951023102, "learning_rate": 3.6447374573338947e-06, "loss": 0.0349, "step": 99140 }, { "epoch": 2.7815962968158225, "grad_norm": 0.3255198001861572, "learning_rate": 3.6400617197362887e-06, "loss": 0.0145, "step": 99150 }, { "epoch": 2.781876841071679, "grad_norm": 0.15960194170475006, "learning_rate": 3.6353859821386823e-06, "loss": 0.0069, "step": 99160 }, { "epoch": 2.7821573853275354, "grad_norm": 0.43496257066726685, "learning_rate": 3.6307102445410767e-06, "loss": 0.0051, "step": 99170 }, { "epoch": 2.782437929583392, "grad_norm": 0.2953956127166748, "learning_rate": 3.6260345069434703e-06, "loss": 0.0266, "step": 99180 }, { "epoch": 2.7827184738392483, "grad_norm": 0.005629365798085928, "learning_rate": 3.6213587693458643e-06, "loss": 0.0077, "step": 99190 }, { "epoch": 2.7829990180951043, "grad_norm": 0.02399521879851818, "learning_rate": 3.6166830317482587e-06, "loss": 0.0265, "step": 99200 }, { "epoch": 2.7832795623509607, "grad_norm": 0.7489615082740784, "learning_rate": 3.6120072941506523e-06, "loss": 0.0467, "step": 99210 }, { "epoch": 2.783560106606817, "grad_norm": 0.5073890686035156, "learning_rate": 3.6073315565530467e-06, "loss": 0.0159, "step": 99220 }, { "epoch": 2.7838406508626736, "grad_norm": 0.06048720329999924, "learning_rate": 3.6026558189554403e-06, "loss": 0.0177, "step": 99230 }, { "epoch": 2.78412119511853, "grad_norm": 0.47885075211524963, "learning_rate": 3.5979800813578343e-06, "loss": 0.0097, "step": 99240 }, { "epoch": 2.7844017393743865, "grad_norm": 0.04538440331816673, "learning_rate": 3.593304343760228e-06, "loss": 0.0095, "step": 99250 }, { "epoch": 2.7846822836302425, "grad_norm": 0.16227447986602783, "learning_rate": 3.5886286061626223e-06, "loss": 0.0074, "step": 99260 }, { "epoch": 2.784962827886099, "grad_norm": 0.0335659384727478, "learning_rate": 3.5839528685650168e-06, "loss": 0.0157, "step": 99270 }, { "epoch": 2.7852433721419554, "grad_norm": 0.2582368850708008, "learning_rate": 3.5792771309674103e-06, "loss": 0.0123, "step": 99280 }, { "epoch": 2.785523916397812, "grad_norm": 0.053035371005535126, "learning_rate": 3.5746013933698043e-06, "loss": 0.0051, "step": 99290 }, { "epoch": 2.7858044606536683, "grad_norm": 0.12173629552125931, "learning_rate": 3.569925655772198e-06, "loss": 0.0136, "step": 99300 }, { "epoch": 2.7860850049095243, "grad_norm": 0.0875578299164772, "learning_rate": 3.5652499181745923e-06, "loss": 0.0293, "step": 99310 }, { "epoch": 2.7863655491653807, "grad_norm": 0.7933509349822998, "learning_rate": 3.560574180576986e-06, "loss": 0.0292, "step": 99320 }, { "epoch": 2.786646093421237, "grad_norm": 0.5108590722084045, "learning_rate": 3.55589844297938e-06, "loss": 0.0053, "step": 99330 }, { "epoch": 2.7869266376770936, "grad_norm": 0.5607981085777283, "learning_rate": 3.5512227053817743e-06, "loss": 0.0224, "step": 99340 }, { "epoch": 2.78720718193295, "grad_norm": 0.2698008716106415, "learning_rate": 3.546546967784168e-06, "loss": 0.011, "step": 99350 }, { "epoch": 2.7874877261888065, "grad_norm": 0.28693991899490356, "learning_rate": 3.5418712301865623e-06, "loss": 0.026, "step": 99360 }, { "epoch": 2.787768270444663, "grad_norm": 0.12157543748617172, "learning_rate": 3.537195492588956e-06, "loss": 0.0174, "step": 99370 }, { "epoch": 2.788048814700519, "grad_norm": 0.029140714555978775, "learning_rate": 3.53251975499135e-06, "loss": 0.0334, "step": 99380 }, { "epoch": 2.7883293589563753, "grad_norm": 0.10487908869981766, "learning_rate": 3.5278440173937444e-06, "loss": 0.0114, "step": 99390 }, { "epoch": 2.788609903212232, "grad_norm": 0.02065090462565422, "learning_rate": 3.523168279796138e-06, "loss": 0.0085, "step": 99400 }, { "epoch": 2.7888904474680882, "grad_norm": 0.03276306018233299, "learning_rate": 3.5184925421985324e-06, "loss": 0.0038, "step": 99410 }, { "epoch": 2.7891709917239442, "grad_norm": 0.08562154322862625, "learning_rate": 3.513816804600926e-06, "loss": 0.0117, "step": 99420 }, { "epoch": 2.7894515359798007, "grad_norm": 0.019470542669296265, "learning_rate": 3.50914106700332e-06, "loss": 0.0072, "step": 99430 }, { "epoch": 2.789732080235657, "grad_norm": 0.09442698210477829, "learning_rate": 3.5044653294057135e-06, "loss": 0.0267, "step": 99440 }, { "epoch": 2.7900126244915135, "grad_norm": 0.013705488294363022, "learning_rate": 3.499789591808108e-06, "loss": 0.0343, "step": 99450 }, { "epoch": 2.79029316874737, "grad_norm": 0.3048751652240753, "learning_rate": 3.4951138542105024e-06, "loss": 0.0239, "step": 99460 }, { "epoch": 2.7905737130032264, "grad_norm": 0.023429974913597107, "learning_rate": 3.490438116612896e-06, "loss": 0.012, "step": 99470 }, { "epoch": 2.790854257259083, "grad_norm": 0.024814056232571602, "learning_rate": 3.48576237901529e-06, "loss": 0.0138, "step": 99480 }, { "epoch": 2.791134801514939, "grad_norm": 0.3057454824447632, "learning_rate": 3.4810866414176835e-06, "loss": 0.0079, "step": 99490 }, { "epoch": 2.7914153457707953, "grad_norm": 0.5083408355712891, "learning_rate": 3.476410903820078e-06, "loss": 0.0159, "step": 99500 }, { "epoch": 2.7916958900266517, "grad_norm": 1.923302173614502, "learning_rate": 3.4717351662224715e-06, "loss": 0.0232, "step": 99510 }, { "epoch": 2.791976434282508, "grad_norm": 0.033664409071207047, "learning_rate": 3.4670594286248655e-06, "loss": 0.038, "step": 99520 }, { "epoch": 2.792256978538364, "grad_norm": 0.018868178129196167, "learning_rate": 3.46238369102726e-06, "loss": 0.033, "step": 99530 }, { "epoch": 2.7925375227942206, "grad_norm": 3.7288589477539062, "learning_rate": 3.4577079534296535e-06, "loss": 0.0438, "step": 99540 }, { "epoch": 2.792818067050077, "grad_norm": 0.347543329000473, "learning_rate": 3.453032215832048e-06, "loss": 0.0339, "step": 99550 }, { "epoch": 2.7930986113059335, "grad_norm": 0.005717538297176361, "learning_rate": 3.4483564782344415e-06, "loss": 0.0219, "step": 99560 }, { "epoch": 2.79337915556179, "grad_norm": 0.5537550449371338, "learning_rate": 3.4436807406368355e-06, "loss": 0.0053, "step": 99570 }, { "epoch": 2.7936596998176464, "grad_norm": 0.049639638513326645, "learning_rate": 3.439005003039229e-06, "loss": 0.0095, "step": 99580 }, { "epoch": 2.793940244073503, "grad_norm": 1.0985755920410156, "learning_rate": 3.4343292654416236e-06, "loss": 0.0247, "step": 99590 }, { "epoch": 2.794220788329359, "grad_norm": 2.522463798522949, "learning_rate": 3.429653527844018e-06, "loss": 0.0565, "step": 99600 }, { "epoch": 2.7945013325852153, "grad_norm": 0.023190785199403763, "learning_rate": 3.4249777902464116e-06, "loss": 0.0236, "step": 99610 }, { "epoch": 2.7947818768410717, "grad_norm": 0.02697625756263733, "learning_rate": 3.4203020526488056e-06, "loss": 0.0263, "step": 99620 }, { "epoch": 2.795062421096928, "grad_norm": 0.48661866784095764, "learning_rate": 3.415626315051199e-06, "loss": 0.0216, "step": 99630 }, { "epoch": 2.795342965352784, "grad_norm": 0.03856627270579338, "learning_rate": 3.4109505774535936e-06, "loss": 0.0114, "step": 99640 }, { "epoch": 2.7956235096086406, "grad_norm": 0.004134817980229855, "learning_rate": 3.406274839855987e-06, "loss": 0.021, "step": 99650 }, { "epoch": 2.795904053864497, "grad_norm": 0.20701968669891357, "learning_rate": 3.4015991022583816e-06, "loss": 0.0111, "step": 99660 }, { "epoch": 2.7961845981203535, "grad_norm": 0.06883914023637772, "learning_rate": 3.3969233646607756e-06, "loss": 0.0101, "step": 99670 }, { "epoch": 2.79646514237621, "grad_norm": 0.30808624625205994, "learning_rate": 3.392247627063169e-06, "loss": 0.0117, "step": 99680 }, { "epoch": 2.7967456866320664, "grad_norm": 0.02510407753288746, "learning_rate": 3.3875718894655636e-06, "loss": 0.0551, "step": 99690 }, { "epoch": 2.797026230887923, "grad_norm": 0.11034011840820312, "learning_rate": 3.382896151867957e-06, "loss": 0.0023, "step": 99700 }, { "epoch": 2.797306775143779, "grad_norm": 0.8990176916122437, "learning_rate": 3.378220414270351e-06, "loss": 0.0075, "step": 99710 }, { "epoch": 2.7975873193996352, "grad_norm": 0.09746234118938446, "learning_rate": 3.3735446766727456e-06, "loss": 0.0165, "step": 99720 }, { "epoch": 2.7978678636554917, "grad_norm": 0.4511845111846924, "learning_rate": 3.368868939075139e-06, "loss": 0.0284, "step": 99730 }, { "epoch": 2.798148407911348, "grad_norm": 0.4160049557685852, "learning_rate": 3.3641932014775336e-06, "loss": 0.0069, "step": 99740 }, { "epoch": 2.798428952167204, "grad_norm": 0.05822250619530678, "learning_rate": 3.359517463879927e-06, "loss": 0.008, "step": 99750 }, { "epoch": 2.7987094964230605, "grad_norm": 0.011694948188960552, "learning_rate": 3.354841726282321e-06, "loss": 0.0367, "step": 99760 }, { "epoch": 2.798990040678917, "grad_norm": 0.16537149250507355, "learning_rate": 3.3501659886847147e-06, "loss": 0.0106, "step": 99770 }, { "epoch": 2.7992705849347734, "grad_norm": 0.22696562111377716, "learning_rate": 3.345490251087109e-06, "loss": 0.043, "step": 99780 }, { "epoch": 2.79955112919063, "grad_norm": 0.07160148024559021, "learning_rate": 3.3408145134895036e-06, "loss": 0.0207, "step": 99790 }, { "epoch": 2.7998316734464863, "grad_norm": 0.7503019571304321, "learning_rate": 3.336138775891897e-06, "loss": 0.0137, "step": 99800 }, { "epoch": 2.8001122177023428, "grad_norm": 0.01047939620912075, "learning_rate": 3.331463038294291e-06, "loss": 0.0406, "step": 99810 }, { "epoch": 2.8003927619581988, "grad_norm": 0.12166433036327362, "learning_rate": 3.3267873006966848e-06, "loss": 0.0319, "step": 99820 }, { "epoch": 2.800673306214055, "grad_norm": 1.1988554000854492, "learning_rate": 3.322111563099079e-06, "loss": 0.0155, "step": 99830 }, { "epoch": 2.8009538504699116, "grad_norm": 0.0515449196100235, "learning_rate": 3.3174358255014728e-06, "loss": 0.0155, "step": 99840 }, { "epoch": 2.801234394725768, "grad_norm": 0.20219603180885315, "learning_rate": 3.312760087903867e-06, "loss": 0.0257, "step": 99850 }, { "epoch": 2.8015149389816245, "grad_norm": 0.24043798446655273, "learning_rate": 3.308084350306261e-06, "loss": 0.0085, "step": 99860 }, { "epoch": 2.8017954832374805, "grad_norm": 0.04579060524702072, "learning_rate": 3.3034086127086548e-06, "loss": 0.0153, "step": 99870 }, { "epoch": 2.802076027493337, "grad_norm": 0.7531810402870178, "learning_rate": 3.298732875111049e-06, "loss": 0.0122, "step": 99880 }, { "epoch": 2.8023565717491934, "grad_norm": 0.0645771324634552, "learning_rate": 3.2940571375134428e-06, "loss": 0.0078, "step": 99890 }, { "epoch": 2.80263711600505, "grad_norm": 0.041157424449920654, "learning_rate": 3.2893813999158368e-06, "loss": 0.0241, "step": 99900 }, { "epoch": 2.8029176602609063, "grad_norm": 0.2374466359615326, "learning_rate": 3.2847056623182304e-06, "loss": 0.0165, "step": 99910 }, { "epoch": 2.8031982045167627, "grad_norm": 0.2927073836326599, "learning_rate": 3.2800299247206248e-06, "loss": 0.0106, "step": 99920 }, { "epoch": 2.803478748772619, "grad_norm": 0.03379327431321144, "learning_rate": 3.275354187123019e-06, "loss": 0.0258, "step": 99930 }, { "epoch": 2.803759293028475, "grad_norm": 0.7113417387008667, "learning_rate": 3.2706784495254128e-06, "loss": 0.0193, "step": 99940 }, { "epoch": 2.8040398372843316, "grad_norm": 0.07308351248502731, "learning_rate": 3.2660027119278068e-06, "loss": 0.0118, "step": 99950 }, { "epoch": 2.804320381540188, "grad_norm": 0.36304667592048645, "learning_rate": 3.2613269743302004e-06, "loss": 0.0444, "step": 99960 }, { "epoch": 2.8046009257960445, "grad_norm": 0.03560710325837135, "learning_rate": 3.256651236732595e-06, "loss": 0.0029, "step": 99970 }, { "epoch": 2.8048814700519005, "grad_norm": 0.03401888534426689, "learning_rate": 3.2519754991349884e-06, "loss": 0.0095, "step": 99980 }, { "epoch": 2.805162014307757, "grad_norm": 0.07616101950407028, "learning_rate": 3.247299761537383e-06, "loss": 0.0234, "step": 99990 }, { "epoch": 2.8054425585636134, "grad_norm": 0.06490380316972733, "learning_rate": 3.242624023939777e-06, "loss": 0.0595, "step": 100000 }, { "epoch": 2.80572310281947, "grad_norm": 0.09158789366483688, "learning_rate": 3.2379482863421704e-06, "loss": 0.0341, "step": 100010 }, { "epoch": 2.8060036470753262, "grad_norm": 0.30142587423324585, "learning_rate": 3.233272548744565e-06, "loss": 0.0103, "step": 100020 }, { "epoch": 2.8062841913311827, "grad_norm": 0.05337204411625862, "learning_rate": 3.2285968111469584e-06, "loss": 0.0194, "step": 100030 }, { "epoch": 2.806564735587039, "grad_norm": 0.023686731234192848, "learning_rate": 3.223921073549353e-06, "loss": 0.0131, "step": 100040 }, { "epoch": 2.806845279842895, "grad_norm": 1.828624963760376, "learning_rate": 3.219245335951747e-06, "loss": 0.0177, "step": 100050 }, { "epoch": 2.8071258240987516, "grad_norm": 3.509481906890869, "learning_rate": 3.2145695983541404e-06, "loss": 0.0234, "step": 100060 }, { "epoch": 2.807406368354608, "grad_norm": 0.04959489405155182, "learning_rate": 3.209893860756535e-06, "loss": 0.003, "step": 100070 }, { "epoch": 2.8076869126104644, "grad_norm": 0.08381042629480362, "learning_rate": 3.2052181231589284e-06, "loss": 0.0082, "step": 100080 }, { "epoch": 2.8079674568663204, "grad_norm": 0.10672687739133835, "learning_rate": 3.2005423855613224e-06, "loss": 0.023, "step": 100090 }, { "epoch": 2.808248001122177, "grad_norm": 0.018705494701862335, "learning_rate": 3.195866647963716e-06, "loss": 0.0044, "step": 100100 }, { "epoch": 2.8085285453780333, "grad_norm": 0.030295949429273605, "learning_rate": 3.1911909103661104e-06, "loss": 0.0117, "step": 100110 }, { "epoch": 2.8088090896338898, "grad_norm": 0.022872161120176315, "learning_rate": 3.186515172768505e-06, "loss": 0.0131, "step": 100120 }, { "epoch": 2.809089633889746, "grad_norm": 0.22153793275356293, "learning_rate": 3.1818394351708984e-06, "loss": 0.0177, "step": 100130 }, { "epoch": 2.8093701781456026, "grad_norm": 0.4060366749763489, "learning_rate": 3.1771636975732924e-06, "loss": 0.009, "step": 100140 }, { "epoch": 2.809650722401459, "grad_norm": 0.048789430409669876, "learning_rate": 3.172487959975686e-06, "loss": 0.008, "step": 100150 }, { "epoch": 2.809931266657315, "grad_norm": 0.15111422538757324, "learning_rate": 3.1678122223780804e-06, "loss": 0.007, "step": 100160 }, { "epoch": 2.8102118109131715, "grad_norm": 0.724479615688324, "learning_rate": 3.163136484780474e-06, "loss": 0.0286, "step": 100170 }, { "epoch": 2.810492355169028, "grad_norm": 0.056888505816459656, "learning_rate": 3.1584607471828684e-06, "loss": 0.0451, "step": 100180 }, { "epoch": 2.8107728994248844, "grad_norm": 0.32426926493644714, "learning_rate": 3.1537850095852624e-06, "loss": 0.0144, "step": 100190 }, { "epoch": 2.8110534436807404, "grad_norm": 0.05629437044262886, "learning_rate": 3.149109271987656e-06, "loss": 0.0222, "step": 100200 }, { "epoch": 2.811333987936597, "grad_norm": 0.4132240414619446, "learning_rate": 3.1444335343900504e-06, "loss": 0.0096, "step": 100210 }, { "epoch": 2.8116145321924533, "grad_norm": 0.014409046620130539, "learning_rate": 3.139757796792444e-06, "loss": 0.035, "step": 100220 }, { "epoch": 2.8118950764483097, "grad_norm": 0.6847306489944458, "learning_rate": 3.1350820591948384e-06, "loss": 0.0366, "step": 100230 }, { "epoch": 2.812175620704166, "grad_norm": 0.544266402721405, "learning_rate": 3.130406321597232e-06, "loss": 0.0258, "step": 100240 }, { "epoch": 2.8124561649600226, "grad_norm": 0.06569482386112213, "learning_rate": 3.125730583999626e-06, "loss": 0.0047, "step": 100250 }, { "epoch": 2.812736709215879, "grad_norm": 0.6795011162757874, "learning_rate": 3.12105484640202e-06, "loss": 0.0064, "step": 100260 }, { "epoch": 2.813017253471735, "grad_norm": 0.4190938472747803, "learning_rate": 3.116379108804414e-06, "loss": 0.0134, "step": 100270 }, { "epoch": 2.8132977977275915, "grad_norm": 0.09363698959350586, "learning_rate": 3.111703371206808e-06, "loss": 0.0186, "step": 100280 }, { "epoch": 2.813578341983448, "grad_norm": 0.4203953444957733, "learning_rate": 3.107027633609202e-06, "loss": 0.0104, "step": 100290 }, { "epoch": 2.8138588862393044, "grad_norm": 0.27393007278442383, "learning_rate": 3.102351896011596e-06, "loss": 0.0285, "step": 100300 }, { "epoch": 2.8141394304951604, "grad_norm": 0.034625183790922165, "learning_rate": 3.09767615841399e-06, "loss": 0.0185, "step": 100310 }, { "epoch": 2.814419974751017, "grad_norm": 0.1508517563343048, "learning_rate": 3.093000420816384e-06, "loss": 0.012, "step": 100320 }, { "epoch": 2.8147005190068732, "grad_norm": 0.14943139255046844, "learning_rate": 3.0883246832187776e-06, "loss": 0.016, "step": 100330 }, { "epoch": 2.8149810632627297, "grad_norm": 0.3109113872051239, "learning_rate": 3.0836489456211716e-06, "loss": 0.0073, "step": 100340 }, { "epoch": 2.815261607518586, "grad_norm": 0.881848931312561, "learning_rate": 3.078973208023566e-06, "loss": 0.0075, "step": 100350 }, { "epoch": 2.8155421517744426, "grad_norm": 0.6625425815582275, "learning_rate": 3.07429747042596e-06, "loss": 0.0128, "step": 100360 }, { "epoch": 2.815822696030299, "grad_norm": 1.7777518033981323, "learning_rate": 3.069621732828354e-06, "loss": 0.0299, "step": 100370 }, { "epoch": 2.816103240286155, "grad_norm": 0.035904087126255035, "learning_rate": 3.0649459952307476e-06, "loss": 0.0313, "step": 100380 }, { "epoch": 2.8163837845420114, "grad_norm": 0.2001873254776001, "learning_rate": 3.0602702576331416e-06, "loss": 0.0139, "step": 100390 }, { "epoch": 2.816664328797868, "grad_norm": 0.07648865878582001, "learning_rate": 3.0555945200355356e-06, "loss": 0.0136, "step": 100400 }, { "epoch": 2.8169448730537243, "grad_norm": 0.050621867179870605, "learning_rate": 3.0509187824379296e-06, "loss": 0.005, "step": 100410 }, { "epoch": 2.8172254173095803, "grad_norm": 0.012882952578365803, "learning_rate": 3.0462430448403236e-06, "loss": 0.031, "step": 100420 }, { "epoch": 2.8175059615654368, "grad_norm": 0.02770945616066456, "learning_rate": 3.0415673072427176e-06, "loss": 0.0325, "step": 100430 }, { "epoch": 2.817786505821293, "grad_norm": 0.038032758980989456, "learning_rate": 3.0368915696451116e-06, "loss": 0.0114, "step": 100440 }, { "epoch": 2.8180670500771496, "grad_norm": 0.29376456141471863, "learning_rate": 3.0322158320475056e-06, "loss": 0.0299, "step": 100450 }, { "epoch": 2.818347594333006, "grad_norm": 0.0160361360758543, "learning_rate": 3.0275400944498996e-06, "loss": 0.0065, "step": 100460 }, { "epoch": 2.8186281385888625, "grad_norm": 0.548561692237854, "learning_rate": 3.0228643568522936e-06, "loss": 0.0264, "step": 100470 }, { "epoch": 2.818908682844719, "grad_norm": 0.6623975038528442, "learning_rate": 3.0181886192546876e-06, "loss": 0.0204, "step": 100480 }, { "epoch": 2.819189227100575, "grad_norm": 0.07576477527618408, "learning_rate": 3.0135128816570816e-06, "loss": 0.022, "step": 100490 }, { "epoch": 2.8194697713564314, "grad_norm": 0.48991909623146057, "learning_rate": 3.0088371440594756e-06, "loss": 0.0396, "step": 100500 }, { "epoch": 2.819750315612288, "grad_norm": 0.24669404327869415, "learning_rate": 3.0041614064618696e-06, "loss": 0.0229, "step": 100510 }, { "epoch": 2.8200308598681443, "grad_norm": 0.03417018800973892, "learning_rate": 2.9994856688642632e-06, "loss": 0.0154, "step": 100520 }, { "epoch": 2.8203114041240007, "grad_norm": 0.47977814078330994, "learning_rate": 2.9948099312666572e-06, "loss": 0.0251, "step": 100530 }, { "epoch": 2.8205919483798567, "grad_norm": 0.037322141230106354, "learning_rate": 2.9901341936690512e-06, "loss": 0.0065, "step": 100540 }, { "epoch": 2.820872492635713, "grad_norm": 0.04803347960114479, "learning_rate": 2.9854584560714457e-06, "loss": 0.0314, "step": 100550 }, { "epoch": 2.8211530368915696, "grad_norm": 0.016182012856006622, "learning_rate": 2.9807827184738397e-06, "loss": 0.0092, "step": 100560 }, { "epoch": 2.821433581147426, "grad_norm": 0.1992042064666748, "learning_rate": 2.9761069808762332e-06, "loss": 0.0241, "step": 100570 }, { "epoch": 2.8217141254032825, "grad_norm": 2.7936432361602783, "learning_rate": 2.9714312432786272e-06, "loss": 0.0302, "step": 100580 }, { "epoch": 2.821994669659139, "grad_norm": 0.3739040195941925, "learning_rate": 2.9667555056810212e-06, "loss": 0.0092, "step": 100590 }, { "epoch": 2.8222752139149954, "grad_norm": 0.006091832183301449, "learning_rate": 2.9620797680834152e-06, "loss": 0.0027, "step": 100600 }, { "epoch": 2.8225557581708514, "grad_norm": 0.05503499135375023, "learning_rate": 2.9574040304858092e-06, "loss": 0.005, "step": 100610 }, { "epoch": 2.822836302426708, "grad_norm": 0.026654815301299095, "learning_rate": 2.9527282928882032e-06, "loss": 0.0164, "step": 100620 }, { "epoch": 2.8231168466825642, "grad_norm": 0.011836270801723003, "learning_rate": 2.9480525552905972e-06, "loss": 0.011, "step": 100630 }, { "epoch": 2.8233973909384207, "grad_norm": 0.04823334515094757, "learning_rate": 2.9433768176929912e-06, "loss": 0.0141, "step": 100640 }, { "epoch": 2.8236779351942767, "grad_norm": 0.025055812671780586, "learning_rate": 2.9387010800953853e-06, "loss": 0.0183, "step": 100650 }, { "epoch": 2.823958479450133, "grad_norm": 0.825127363204956, "learning_rate": 2.9340253424977793e-06, "loss": 0.0272, "step": 100660 }, { "epoch": 2.8242390237059896, "grad_norm": 0.027495209127664566, "learning_rate": 2.929349604900173e-06, "loss": 0.0184, "step": 100670 }, { "epoch": 2.824519567961846, "grad_norm": 0.8383206725120544, "learning_rate": 2.9246738673025673e-06, "loss": 0.048, "step": 100680 }, { "epoch": 2.8248001122177024, "grad_norm": 1.9365991353988647, "learning_rate": 2.9199981297049613e-06, "loss": 0.0212, "step": 100690 }, { "epoch": 2.825080656473559, "grad_norm": 0.11450410634279251, "learning_rate": 2.9153223921073553e-06, "loss": 0.0087, "step": 100700 }, { "epoch": 2.8253612007294153, "grad_norm": 0.20494678616523743, "learning_rate": 2.910646654509749e-06, "loss": 0.0016, "step": 100710 }, { "epoch": 2.8256417449852713, "grad_norm": 0.04467964544892311, "learning_rate": 2.905970916912143e-06, "loss": 0.005, "step": 100720 }, { "epoch": 2.8259222892411278, "grad_norm": 0.046153098344802856, "learning_rate": 2.901295179314537e-06, "loss": 0.0164, "step": 100730 }, { "epoch": 2.826202833496984, "grad_norm": 1.633514642715454, "learning_rate": 2.8966194417169313e-06, "loss": 0.0209, "step": 100740 }, { "epoch": 2.8264833777528406, "grad_norm": 0.6352909803390503, "learning_rate": 2.8919437041193253e-06, "loss": 0.0198, "step": 100750 }, { "epoch": 2.8267639220086966, "grad_norm": 0.025056717917323112, "learning_rate": 2.887267966521719e-06, "loss": 0.0164, "step": 100760 }, { "epoch": 2.827044466264553, "grad_norm": 0.977267861366272, "learning_rate": 2.882592228924113e-06, "loss": 0.0641, "step": 100770 }, { "epoch": 2.8273250105204095, "grad_norm": 0.7060287594795227, "learning_rate": 2.877916491326507e-06, "loss": 0.0161, "step": 100780 }, { "epoch": 2.827605554776266, "grad_norm": 0.6874880194664001, "learning_rate": 2.873240753728901e-06, "loss": 0.0125, "step": 100790 }, { "epoch": 2.8278860990321224, "grad_norm": 0.0582369863986969, "learning_rate": 2.868565016131295e-06, "loss": 0.0251, "step": 100800 }, { "epoch": 2.828166643287979, "grad_norm": 0.04916049540042877, "learning_rate": 2.863889278533689e-06, "loss": 0.0282, "step": 100810 }, { "epoch": 2.8284471875438353, "grad_norm": 0.059217412024736404, "learning_rate": 2.859213540936083e-06, "loss": 0.0184, "step": 100820 }, { "epoch": 2.8287277317996913, "grad_norm": 0.37716102600097656, "learning_rate": 2.854537803338477e-06, "loss": 0.0062, "step": 100830 }, { "epoch": 2.8290082760555477, "grad_norm": 0.05099337175488472, "learning_rate": 2.849862065740871e-06, "loss": 0.0103, "step": 100840 }, { "epoch": 2.829288820311404, "grad_norm": 0.559553325176239, "learning_rate": 2.845186328143265e-06, "loss": 0.0128, "step": 100850 }, { "epoch": 2.8295693645672606, "grad_norm": 0.049727872014045715, "learning_rate": 2.8405105905456585e-06, "loss": 0.0156, "step": 100860 }, { "epoch": 2.8298499088231166, "grad_norm": 1.216301441192627, "learning_rate": 2.8358348529480525e-06, "loss": 0.0231, "step": 100870 }, { "epoch": 2.830130453078973, "grad_norm": 0.11188387125730515, "learning_rate": 2.831159115350447e-06, "loss": 0.0061, "step": 100880 }, { "epoch": 2.8304109973348295, "grad_norm": 0.26690006256103516, "learning_rate": 2.826483377752841e-06, "loss": 0.0121, "step": 100890 }, { "epoch": 2.830691541590686, "grad_norm": 0.0202318225055933, "learning_rate": 2.8218076401552345e-06, "loss": 0.0428, "step": 100900 }, { "epoch": 2.8309720858465424, "grad_norm": 0.021009109914302826, "learning_rate": 2.8171319025576285e-06, "loss": 0.0134, "step": 100910 }, { "epoch": 2.831252630102399, "grad_norm": 0.06530445069074631, "learning_rate": 2.8124561649600225e-06, "loss": 0.0178, "step": 100920 }, { "epoch": 2.8315331743582552, "grad_norm": 0.1235971599817276, "learning_rate": 2.8077804273624165e-06, "loss": 0.0233, "step": 100930 }, { "epoch": 2.8318137186141112, "grad_norm": 0.030188925564289093, "learning_rate": 2.803104689764811e-06, "loss": 0.0351, "step": 100940 }, { "epoch": 2.8320942628699677, "grad_norm": 0.054572612047195435, "learning_rate": 2.7984289521672045e-06, "loss": 0.0139, "step": 100950 }, { "epoch": 2.832374807125824, "grad_norm": 0.007388267666101456, "learning_rate": 2.7937532145695985e-06, "loss": 0.0266, "step": 100960 }, { "epoch": 2.8326553513816806, "grad_norm": 0.8937549591064453, "learning_rate": 2.7890774769719925e-06, "loss": 0.0207, "step": 100970 }, { "epoch": 2.8329358956375366, "grad_norm": 0.2261933833360672, "learning_rate": 2.7844017393743865e-06, "loss": 0.0187, "step": 100980 }, { "epoch": 2.833216439893393, "grad_norm": 0.09428255259990692, "learning_rate": 2.7797260017767805e-06, "loss": 0.0307, "step": 100990 }, { "epoch": 2.8334969841492494, "grad_norm": 0.15543486177921295, "learning_rate": 2.775050264179174e-06, "loss": 0.0173, "step": 101000 }, { "epoch": 2.833777528405106, "grad_norm": 0.056580521166324615, "learning_rate": 2.7703745265815685e-06, "loss": 0.0347, "step": 101010 }, { "epoch": 2.8340580726609623, "grad_norm": 0.10529971122741699, "learning_rate": 2.7656987889839625e-06, "loss": 0.0246, "step": 101020 }, { "epoch": 2.8343386169168188, "grad_norm": 0.052244994789361954, "learning_rate": 2.7610230513863565e-06, "loss": 0.01, "step": 101030 }, { "epoch": 2.834619161172675, "grad_norm": 0.05586251616477966, "learning_rate": 2.7563473137887505e-06, "loss": 0.0141, "step": 101040 }, { "epoch": 2.834899705428531, "grad_norm": 0.1440400630235672, "learning_rate": 2.751671576191144e-06, "loss": 0.0074, "step": 101050 }, { "epoch": 2.8351802496843876, "grad_norm": 0.012930287979543209, "learning_rate": 2.746995838593538e-06, "loss": 0.0241, "step": 101060 }, { "epoch": 2.835460793940244, "grad_norm": 0.03930787742137909, "learning_rate": 2.7423201009959325e-06, "loss": 0.0197, "step": 101070 }, { "epoch": 2.8357413381961005, "grad_norm": 0.016853999346494675, "learning_rate": 2.7376443633983265e-06, "loss": 0.0178, "step": 101080 }, { "epoch": 2.8360218824519565, "grad_norm": 0.12905895709991455, "learning_rate": 2.73296862580072e-06, "loss": 0.0398, "step": 101090 }, { "epoch": 2.836302426707813, "grad_norm": 0.12378742545843124, "learning_rate": 2.728292888203114e-06, "loss": 0.0334, "step": 101100 }, { "epoch": 2.8365829709636694, "grad_norm": 0.3287613093852997, "learning_rate": 2.723617150605508e-06, "loss": 0.0089, "step": 101110 }, { "epoch": 2.836863515219526, "grad_norm": 0.02104973793029785, "learning_rate": 2.718941413007902e-06, "loss": 0.0339, "step": 101120 }, { "epoch": 2.8371440594753823, "grad_norm": 0.05810534209012985, "learning_rate": 2.714265675410296e-06, "loss": 0.0212, "step": 101130 }, { "epoch": 2.8374246037312387, "grad_norm": 0.11074046045541763, "learning_rate": 2.70958993781269e-06, "loss": 0.036, "step": 101140 }, { "epoch": 2.837705147987095, "grad_norm": 0.04407990351319313, "learning_rate": 2.704914200215084e-06, "loss": 0.0534, "step": 101150 }, { "epoch": 2.837985692242951, "grad_norm": 0.05506325885653496, "learning_rate": 2.700238462617478e-06, "loss": 0.0159, "step": 101160 }, { "epoch": 2.8382662364988076, "grad_norm": 0.16501569747924805, "learning_rate": 2.695562725019872e-06, "loss": 0.0064, "step": 101170 }, { "epoch": 2.838546780754664, "grad_norm": 0.1036754846572876, "learning_rate": 2.690886987422266e-06, "loss": 0.0104, "step": 101180 }, { "epoch": 2.8388273250105205, "grad_norm": 1.890191674232483, "learning_rate": 2.6862112498246597e-06, "loss": 0.0249, "step": 101190 }, { "epoch": 2.839107869266377, "grad_norm": 1.311317801475525, "learning_rate": 2.6815355122270537e-06, "loss": 0.0129, "step": 101200 }, { "epoch": 2.839388413522233, "grad_norm": 0.5384807586669922, "learning_rate": 2.676859774629448e-06, "loss": 0.0148, "step": 101210 }, { "epoch": 2.8396689577780894, "grad_norm": 0.011199303902685642, "learning_rate": 2.672184037031842e-06, "loss": 0.0047, "step": 101220 }, { "epoch": 2.839949502033946, "grad_norm": 0.015190325677394867, "learning_rate": 2.667508299434236e-06, "loss": 0.0329, "step": 101230 }, { "epoch": 2.8402300462898022, "grad_norm": 0.07444542646408081, "learning_rate": 2.6628325618366297e-06, "loss": 0.0118, "step": 101240 }, { "epoch": 2.8405105905456587, "grad_norm": 1.9685451984405518, "learning_rate": 2.6581568242390237e-06, "loss": 0.0313, "step": 101250 }, { "epoch": 2.840791134801515, "grad_norm": 0.7890541553497314, "learning_rate": 2.6534810866414177e-06, "loss": 0.0149, "step": 101260 }, { "epoch": 2.8410716790573716, "grad_norm": 0.8877725005149841, "learning_rate": 2.648805349043812e-06, "loss": 0.02, "step": 101270 }, { "epoch": 2.8413522233132276, "grad_norm": 0.035054415464401245, "learning_rate": 2.6441296114462057e-06, "loss": 0.0144, "step": 101280 }, { "epoch": 2.841632767569084, "grad_norm": 0.19756083190441132, "learning_rate": 2.6394538738485997e-06, "loss": 0.0106, "step": 101290 }, { "epoch": 2.8419133118249404, "grad_norm": 0.6988120079040527, "learning_rate": 2.6347781362509937e-06, "loss": 0.0076, "step": 101300 }, { "epoch": 2.842193856080797, "grad_norm": 0.03551902994513512, "learning_rate": 2.6301023986533877e-06, "loss": 0.021, "step": 101310 }, { "epoch": 2.842474400336653, "grad_norm": 0.07052966207265854, "learning_rate": 2.6254266610557817e-06, "loss": 0.0094, "step": 101320 }, { "epoch": 2.8427549445925093, "grad_norm": 0.015479432418942451, "learning_rate": 2.6207509234581753e-06, "loss": 0.0368, "step": 101330 }, { "epoch": 2.8430354888483658, "grad_norm": 0.014145748689770699, "learning_rate": 2.6160751858605697e-06, "loss": 0.0253, "step": 101340 }, { "epoch": 2.843316033104222, "grad_norm": 0.03251979872584343, "learning_rate": 2.6113994482629637e-06, "loss": 0.0069, "step": 101350 }, { "epoch": 2.8435965773600786, "grad_norm": 0.04938603565096855, "learning_rate": 2.6067237106653577e-06, "loss": 0.0107, "step": 101360 }, { "epoch": 2.843877121615935, "grad_norm": 0.03028246946632862, "learning_rate": 2.6020479730677517e-06, "loss": 0.0155, "step": 101370 }, { "epoch": 2.8441576658717915, "grad_norm": 0.011493098922073841, "learning_rate": 2.5973722354701453e-06, "loss": 0.0213, "step": 101380 }, { "epoch": 2.8444382101276475, "grad_norm": 0.2678868770599365, "learning_rate": 2.5926964978725393e-06, "loss": 0.03, "step": 101390 }, { "epoch": 2.844718754383504, "grad_norm": 0.2667733132839203, "learning_rate": 2.5880207602749337e-06, "loss": 0.0591, "step": 101400 }, { "epoch": 2.8449992986393604, "grad_norm": 2.005033493041992, "learning_rate": 2.5833450226773277e-06, "loss": 0.0118, "step": 101410 }, { "epoch": 2.845279842895217, "grad_norm": 0.02174452133476734, "learning_rate": 2.5786692850797213e-06, "loss": 0.0108, "step": 101420 }, { "epoch": 2.845560387151073, "grad_norm": 0.08302337676286697, "learning_rate": 2.5739935474821153e-06, "loss": 0.0218, "step": 101430 }, { "epoch": 2.8458409314069293, "grad_norm": 0.4720892310142517, "learning_rate": 2.5693178098845093e-06, "loss": 0.006, "step": 101440 }, { "epoch": 2.8461214756627857, "grad_norm": 0.04522472247481346, "learning_rate": 2.5646420722869033e-06, "loss": 0.0044, "step": 101450 }, { "epoch": 2.846402019918642, "grad_norm": 1.6361771821975708, "learning_rate": 2.5599663346892973e-06, "loss": 0.0215, "step": 101460 }, { "epoch": 2.8466825641744986, "grad_norm": 1.3371423482894897, "learning_rate": 2.5552905970916913e-06, "loss": 0.0312, "step": 101470 }, { "epoch": 2.846963108430355, "grad_norm": 0.25854596495628357, "learning_rate": 2.5506148594940853e-06, "loss": 0.0281, "step": 101480 }, { "epoch": 2.8472436526862115, "grad_norm": 0.09782785922288895, "learning_rate": 2.5459391218964793e-06, "loss": 0.0153, "step": 101490 }, { "epoch": 2.8475241969420675, "grad_norm": 0.05939691141247749, "learning_rate": 2.5412633842988733e-06, "loss": 0.0111, "step": 101500 }, { "epoch": 2.847804741197924, "grad_norm": 0.380016952753067, "learning_rate": 2.5365876467012673e-06, "loss": 0.0155, "step": 101510 }, { "epoch": 2.8480852854537804, "grad_norm": 1.377068042755127, "learning_rate": 2.531911909103661e-06, "loss": 0.0125, "step": 101520 }, { "epoch": 2.848365829709637, "grad_norm": 0.4133104383945465, "learning_rate": 2.527236171506055e-06, "loss": 0.0104, "step": 101530 }, { "epoch": 2.848646373965493, "grad_norm": 0.019785204902291298, "learning_rate": 2.5225604339084493e-06, "loss": 0.0095, "step": 101540 }, { "epoch": 2.8489269182213492, "grad_norm": 0.018518850207328796, "learning_rate": 2.5178846963108433e-06, "loss": 0.0163, "step": 101550 }, { "epoch": 2.8492074624772057, "grad_norm": 0.1758873164653778, "learning_rate": 2.5132089587132373e-06, "loss": 0.0081, "step": 101560 }, { "epoch": 2.849488006733062, "grad_norm": 0.04260603338479996, "learning_rate": 2.508533221115631e-06, "loss": 0.01, "step": 101570 }, { "epoch": 2.8497685509889186, "grad_norm": 0.2956300377845764, "learning_rate": 2.503857483518025e-06, "loss": 0.0595, "step": 101580 }, { "epoch": 2.850049095244775, "grad_norm": 0.7623551487922668, "learning_rate": 2.499181745920419e-06, "loss": 0.0205, "step": 101590 }, { "epoch": 2.8503296395006315, "grad_norm": 2.866107225418091, "learning_rate": 2.4945060083228133e-06, "loss": 0.0359, "step": 101600 }, { "epoch": 2.8506101837564874, "grad_norm": 1.258488655090332, "learning_rate": 2.489830270725207e-06, "loss": 0.017, "step": 101610 }, { "epoch": 2.850890728012344, "grad_norm": 0.6631786823272705, "learning_rate": 2.485154533127601e-06, "loss": 0.0227, "step": 101620 }, { "epoch": 2.8511712722682003, "grad_norm": 0.04346822574734688, "learning_rate": 2.480478795529995e-06, "loss": 0.0149, "step": 101630 }, { "epoch": 2.8514518165240568, "grad_norm": 0.6706247329711914, "learning_rate": 2.475803057932389e-06, "loss": 0.0114, "step": 101640 }, { "epoch": 2.8517323607799128, "grad_norm": 0.04872418940067291, "learning_rate": 2.471127320334783e-06, "loss": 0.012, "step": 101650 }, { "epoch": 2.852012905035769, "grad_norm": 0.28696855902671814, "learning_rate": 2.466451582737177e-06, "loss": 0.0237, "step": 101660 }, { "epoch": 2.8522934492916256, "grad_norm": 0.08809927850961685, "learning_rate": 2.461775845139571e-06, "loss": 0.0117, "step": 101670 }, { "epoch": 2.852573993547482, "grad_norm": 0.2427254319190979, "learning_rate": 2.457100107541965e-06, "loss": 0.0039, "step": 101680 }, { "epoch": 2.8528545378033385, "grad_norm": 0.6704059839248657, "learning_rate": 2.452424369944359e-06, "loss": 0.0093, "step": 101690 }, { "epoch": 2.853135082059195, "grad_norm": 0.0845332145690918, "learning_rate": 2.447748632346753e-06, "loss": 0.0088, "step": 101700 }, { "epoch": 2.8534156263150514, "grad_norm": 0.15916532278060913, "learning_rate": 2.4430728947491465e-06, "loss": 0.0196, "step": 101710 }, { "epoch": 2.8536961705709074, "grad_norm": 0.037186264991760254, "learning_rate": 2.4383971571515405e-06, "loss": 0.0299, "step": 101720 }, { "epoch": 2.853976714826764, "grad_norm": 0.10095835477113724, "learning_rate": 2.433721419553935e-06, "loss": 0.0249, "step": 101730 }, { "epoch": 2.8542572590826203, "grad_norm": 0.17840126156806946, "learning_rate": 2.429045681956329e-06, "loss": 0.0192, "step": 101740 }, { "epoch": 2.8545378033384767, "grad_norm": 0.7701758742332458, "learning_rate": 2.424369944358723e-06, "loss": 0.0189, "step": 101750 }, { "epoch": 2.854818347594333, "grad_norm": 1.2738862037658691, "learning_rate": 2.4196942067611165e-06, "loss": 0.0456, "step": 101760 }, { "epoch": 2.855098891850189, "grad_norm": 0.005705833900719881, "learning_rate": 2.4150184691635105e-06, "loss": 0.0098, "step": 101770 }, { "epoch": 2.8553794361060456, "grad_norm": 0.13938865065574646, "learning_rate": 2.4103427315659045e-06, "loss": 0.0295, "step": 101780 }, { "epoch": 2.855659980361902, "grad_norm": 0.055885639041662216, "learning_rate": 2.4056669939682985e-06, "loss": 0.0303, "step": 101790 }, { "epoch": 2.8559405246177585, "grad_norm": 0.10447242856025696, "learning_rate": 2.4009912563706925e-06, "loss": 0.0163, "step": 101800 }, { "epoch": 2.856221068873615, "grad_norm": 0.08088366687297821, "learning_rate": 2.3963155187730865e-06, "loss": 0.0216, "step": 101810 }, { "epoch": 2.8565016131294714, "grad_norm": 0.07395732402801514, "learning_rate": 2.3916397811754806e-06, "loss": 0.0043, "step": 101820 }, { "epoch": 2.8567821573853274, "grad_norm": 1.2332302331924438, "learning_rate": 2.3869640435778746e-06, "loss": 0.0102, "step": 101830 }, { "epoch": 2.857062701641184, "grad_norm": 0.08113298565149307, "learning_rate": 2.3822883059802686e-06, "loss": 0.0068, "step": 101840 }, { "epoch": 2.8573432458970403, "grad_norm": 0.02871810831129551, "learning_rate": 2.3776125683826626e-06, "loss": 0.0215, "step": 101850 }, { "epoch": 2.8576237901528967, "grad_norm": 1.775007963180542, "learning_rate": 2.372936830785056e-06, "loss": 0.0215, "step": 101860 }, { "epoch": 2.857904334408753, "grad_norm": 0.35086801648139954, "learning_rate": 2.3682610931874506e-06, "loss": 0.02, "step": 101870 }, { "epoch": 2.858184878664609, "grad_norm": 0.04810934513807297, "learning_rate": 2.3635853555898446e-06, "loss": 0.0119, "step": 101880 }, { "epoch": 2.8584654229204656, "grad_norm": 0.1035749688744545, "learning_rate": 2.3589096179922386e-06, "loss": 0.0067, "step": 101890 }, { "epoch": 2.858745967176322, "grad_norm": 0.019799070432782173, "learning_rate": 2.354233880394632e-06, "loss": 0.0412, "step": 101900 }, { "epoch": 2.8590265114321785, "grad_norm": 0.1770394891500473, "learning_rate": 2.349558142797026e-06, "loss": 0.0269, "step": 101910 }, { "epoch": 2.859307055688035, "grad_norm": 0.4304809272289276, "learning_rate": 2.34488240519942e-06, "loss": 0.009, "step": 101920 }, { "epoch": 2.8595875999438913, "grad_norm": 0.386322021484375, "learning_rate": 2.3402066676018146e-06, "loss": 0.0078, "step": 101930 }, { "epoch": 2.8598681441997478, "grad_norm": 0.5853291153907776, "learning_rate": 2.3355309300042086e-06, "loss": 0.0096, "step": 101940 }, { "epoch": 2.8601486884556038, "grad_norm": 1.175545573234558, "learning_rate": 2.330855192406602e-06, "loss": 0.0113, "step": 101950 }, { "epoch": 2.86042923271146, "grad_norm": 0.40423810482025146, "learning_rate": 2.326179454808996e-06, "loss": 0.0088, "step": 101960 }, { "epoch": 2.8607097769673167, "grad_norm": 0.40322259068489075, "learning_rate": 2.32150371721139e-06, "loss": 0.0216, "step": 101970 }, { "epoch": 2.860990321223173, "grad_norm": 0.011633923277258873, "learning_rate": 2.316827979613784e-06, "loss": 0.0103, "step": 101980 }, { "epoch": 2.861270865479029, "grad_norm": 0.07702820748090744, "learning_rate": 2.312152242016178e-06, "loss": 0.0057, "step": 101990 }, { "epoch": 2.8615514097348855, "grad_norm": 0.017001204192638397, "learning_rate": 2.307476504418572e-06, "loss": 0.0064, "step": 102000 }, { "epoch": 2.861831953990742, "grad_norm": 0.9849240779876709, "learning_rate": 2.302800766820966e-06, "loss": 0.0162, "step": 102010 }, { "epoch": 2.8621124982465984, "grad_norm": 0.023978905752301216, "learning_rate": 2.29812502922336e-06, "loss": 0.0247, "step": 102020 }, { "epoch": 2.862393042502455, "grad_norm": 0.24320273101329803, "learning_rate": 2.293449291625754e-06, "loss": 0.016, "step": 102030 }, { "epoch": 2.8626735867583113, "grad_norm": 0.23214992880821228, "learning_rate": 2.288773554028148e-06, "loss": 0.0249, "step": 102040 }, { "epoch": 2.8629541310141677, "grad_norm": 0.3334801197052002, "learning_rate": 2.2840978164305418e-06, "loss": 0.015, "step": 102050 }, { "epoch": 2.8632346752700237, "grad_norm": 0.0895843431353569, "learning_rate": 2.279422078832936e-06, "loss": 0.0104, "step": 102060 }, { "epoch": 2.86351521952588, "grad_norm": 1.7160677909851074, "learning_rate": 2.27474634123533e-06, "loss": 0.0418, "step": 102070 }, { "epoch": 2.8637957637817366, "grad_norm": 0.005498518235981464, "learning_rate": 2.270070603637724e-06, "loss": 0.0343, "step": 102080 }, { "epoch": 2.864076308037593, "grad_norm": 0.021441712975502014, "learning_rate": 2.2653948660401178e-06, "loss": 0.0146, "step": 102090 }, { "epoch": 2.864356852293449, "grad_norm": 0.6328451037406921, "learning_rate": 2.2607191284425118e-06, "loss": 0.0124, "step": 102100 }, { "epoch": 2.8646373965493055, "grad_norm": 0.19318722188472748, "learning_rate": 2.2560433908449058e-06, "loss": 0.0097, "step": 102110 }, { "epoch": 2.864917940805162, "grad_norm": 0.04223921522498131, "learning_rate": 2.2513676532472998e-06, "loss": 0.0138, "step": 102120 }, { "epoch": 2.8651984850610184, "grad_norm": 0.08697471767663956, "learning_rate": 2.246691915649694e-06, "loss": 0.0335, "step": 102130 }, { "epoch": 2.865479029316875, "grad_norm": 0.3472469747066498, "learning_rate": 2.2420161780520878e-06, "loss": 0.0487, "step": 102140 }, { "epoch": 2.8657595735727313, "grad_norm": 0.42497140169143677, "learning_rate": 2.2373404404544818e-06, "loss": 0.0316, "step": 102150 }, { "epoch": 2.8660401178285877, "grad_norm": 0.04961823672056198, "learning_rate": 2.2326647028568758e-06, "loss": 0.0126, "step": 102160 }, { "epoch": 2.8663206620844437, "grad_norm": 0.033549483865499496, "learning_rate": 2.2279889652592698e-06, "loss": 0.0317, "step": 102170 }, { "epoch": 2.8666012063403, "grad_norm": 1.8218973875045776, "learning_rate": 2.2233132276616638e-06, "loss": 0.0188, "step": 102180 }, { "epoch": 2.8668817505961566, "grad_norm": 1.8874074220657349, "learning_rate": 2.2186374900640578e-06, "loss": 0.0217, "step": 102190 }, { "epoch": 2.867162294852013, "grad_norm": 0.9411609768867493, "learning_rate": 2.213961752466452e-06, "loss": 0.0276, "step": 102200 }, { "epoch": 2.867442839107869, "grad_norm": 1.0427964925765991, "learning_rate": 2.209286014868846e-06, "loss": 0.0193, "step": 102210 }, { "epoch": 2.8677233833637255, "grad_norm": 0.356328547000885, "learning_rate": 2.20461027727124e-06, "loss": 0.0116, "step": 102220 }, { "epoch": 2.868003927619582, "grad_norm": 2.9587509632110596, "learning_rate": 2.1999345396736334e-06, "loss": 0.0271, "step": 102230 }, { "epoch": 2.8682844718754383, "grad_norm": 1.182389259338379, "learning_rate": 2.1952588020760274e-06, "loss": 0.0318, "step": 102240 }, { "epoch": 2.8685650161312948, "grad_norm": 0.27493295073509216, "learning_rate": 2.1905830644784214e-06, "loss": 0.0228, "step": 102250 }, { "epoch": 2.868845560387151, "grad_norm": 0.297309011220932, "learning_rate": 2.185907326880816e-06, "loss": 0.0228, "step": 102260 }, { "epoch": 2.8691261046430077, "grad_norm": 0.07145434617996216, "learning_rate": 2.18123158928321e-06, "loss": 0.0167, "step": 102270 }, { "epoch": 2.8694066488988637, "grad_norm": 0.23543450236320496, "learning_rate": 2.1765558516856034e-06, "loss": 0.0135, "step": 102280 }, { "epoch": 2.86968719315472, "grad_norm": 0.029050925746560097, "learning_rate": 2.1718801140879974e-06, "loss": 0.0121, "step": 102290 }, { "epoch": 2.8699677374105765, "grad_norm": 0.13811829686164856, "learning_rate": 2.1672043764903914e-06, "loss": 0.0281, "step": 102300 }, { "epoch": 2.870248281666433, "grad_norm": 1.9883190393447876, "learning_rate": 2.1625286388927854e-06, "loss": 0.0177, "step": 102310 }, { "epoch": 2.870528825922289, "grad_norm": 0.010529661551117897, "learning_rate": 2.1578529012951794e-06, "loss": 0.0344, "step": 102320 }, { "epoch": 2.8708093701781454, "grad_norm": 0.16486522555351257, "learning_rate": 2.1531771636975734e-06, "loss": 0.0383, "step": 102330 }, { "epoch": 2.871089914434002, "grad_norm": 2.0800018310546875, "learning_rate": 2.1485014260999674e-06, "loss": 0.0263, "step": 102340 }, { "epoch": 2.8713704586898583, "grad_norm": 0.011997311376035213, "learning_rate": 2.1438256885023614e-06, "loss": 0.0196, "step": 102350 }, { "epoch": 2.8716510029457147, "grad_norm": 0.9804973006248474, "learning_rate": 2.1391499509047554e-06, "loss": 0.0294, "step": 102360 }, { "epoch": 2.871931547201571, "grad_norm": 0.02155475877225399, "learning_rate": 2.1344742133071494e-06, "loss": 0.0033, "step": 102370 }, { "epoch": 2.8722120914574276, "grad_norm": 0.6589000225067139, "learning_rate": 2.129798475709543e-06, "loss": 0.015, "step": 102380 }, { "epoch": 2.8724926357132836, "grad_norm": 0.37673047184944153, "learning_rate": 2.1251227381119374e-06, "loss": 0.0146, "step": 102390 }, { "epoch": 2.87277317996914, "grad_norm": 0.7099005579948425, "learning_rate": 2.1204470005143314e-06, "loss": 0.0463, "step": 102400 }, { "epoch": 2.8730537242249965, "grad_norm": 1.782771110534668, "learning_rate": 2.1157712629167254e-06, "loss": 0.0294, "step": 102410 }, { "epoch": 2.873334268480853, "grad_norm": 0.17518660426139832, "learning_rate": 2.111095525319119e-06, "loss": 0.0244, "step": 102420 }, { "epoch": 2.8736148127367094, "grad_norm": 0.3174799084663391, "learning_rate": 2.106419787721513e-06, "loss": 0.0246, "step": 102430 }, { "epoch": 2.8738953569925654, "grad_norm": 0.4976891875267029, "learning_rate": 2.101744050123907e-06, "loss": 0.0323, "step": 102440 }, { "epoch": 2.874175901248422, "grad_norm": 0.6414636373519897, "learning_rate": 2.097068312526301e-06, "loss": 0.0216, "step": 102450 }, { "epoch": 2.8744564455042783, "grad_norm": 0.024001482874155045, "learning_rate": 2.0923925749286954e-06, "loss": 0.0124, "step": 102460 }, { "epoch": 2.8747369897601347, "grad_norm": 0.024039283394813538, "learning_rate": 2.087716837331089e-06, "loss": 0.0527, "step": 102470 }, { "epoch": 2.875017534015991, "grad_norm": 0.3203933537006378, "learning_rate": 2.083041099733483e-06, "loss": 0.036, "step": 102480 }, { "epoch": 2.8752980782718476, "grad_norm": 0.05187439173460007, "learning_rate": 2.078365362135877e-06, "loss": 0.0242, "step": 102490 }, { "epoch": 2.875578622527704, "grad_norm": 0.013693132437765598, "learning_rate": 2.073689624538271e-06, "loss": 0.0099, "step": 102500 }, { "epoch": 2.87585916678356, "grad_norm": 0.5383554100990295, "learning_rate": 2.069013886940665e-06, "loss": 0.0095, "step": 102510 }, { "epoch": 2.8761397110394165, "grad_norm": 0.18465563654899597, "learning_rate": 2.064338149343059e-06, "loss": 0.0033, "step": 102520 }, { "epoch": 2.876420255295273, "grad_norm": 0.0664994865655899, "learning_rate": 2.059662411745453e-06, "loss": 0.0085, "step": 102530 }, { "epoch": 2.8767007995511293, "grad_norm": 0.0766683891415596, "learning_rate": 2.054986674147847e-06, "loss": 0.0183, "step": 102540 }, { "epoch": 2.8769813438069853, "grad_norm": 0.05560008063912392, "learning_rate": 2.050310936550241e-06, "loss": 0.0279, "step": 102550 }, { "epoch": 2.8772618880628418, "grad_norm": 0.4050937294960022, "learning_rate": 2.045635198952635e-06, "loss": 0.0528, "step": 102560 }, { "epoch": 2.877542432318698, "grad_norm": 0.0191530529409647, "learning_rate": 2.0409594613550286e-06, "loss": 0.0119, "step": 102570 }, { "epoch": 2.8778229765745547, "grad_norm": 0.5818196535110474, "learning_rate": 2.0362837237574226e-06, "loss": 0.0263, "step": 102580 }, { "epoch": 2.878103520830411, "grad_norm": 0.18283802270889282, "learning_rate": 2.031607986159817e-06, "loss": 0.038, "step": 102590 }, { "epoch": 2.8783840650862675, "grad_norm": 0.10622741281986237, "learning_rate": 2.026932248562211e-06, "loss": 0.0177, "step": 102600 }, { "epoch": 2.878664609342124, "grad_norm": 0.34289562702178955, "learning_rate": 2.0222565109646046e-06, "loss": 0.0455, "step": 102610 }, { "epoch": 2.87894515359798, "grad_norm": 0.05401809141039848, "learning_rate": 2.0175807733669986e-06, "loss": 0.0444, "step": 102620 }, { "epoch": 2.8792256978538364, "grad_norm": 0.25120627880096436, "learning_rate": 2.0129050357693926e-06, "loss": 0.0487, "step": 102630 }, { "epoch": 2.879506242109693, "grad_norm": 0.2757810652256012, "learning_rate": 2.0082292981717866e-06, "loss": 0.0192, "step": 102640 }, { "epoch": 2.8797867863655493, "grad_norm": 0.142717182636261, "learning_rate": 2.0035535605741806e-06, "loss": 0.0152, "step": 102650 }, { "epoch": 2.8800673306214053, "grad_norm": 0.39275115728378296, "learning_rate": 1.9988778229765746e-06, "loss": 0.0124, "step": 102660 }, { "epoch": 2.8803478748772617, "grad_norm": 0.11275318264961243, "learning_rate": 1.9942020853789686e-06, "loss": 0.0232, "step": 102670 }, { "epoch": 2.880628419133118, "grad_norm": 1.1482245922088623, "learning_rate": 1.9895263477813626e-06, "loss": 0.0394, "step": 102680 }, { "epoch": 2.8809089633889746, "grad_norm": 0.3395359218120575, "learning_rate": 1.9848506101837566e-06, "loss": 0.013, "step": 102690 }, { "epoch": 2.881189507644831, "grad_norm": 0.017915265634655952, "learning_rate": 1.9801748725861506e-06, "loss": 0.0203, "step": 102700 }, { "epoch": 2.8814700519006875, "grad_norm": 0.15651968121528625, "learning_rate": 1.975499134988544e-06, "loss": 0.0079, "step": 102710 }, { "epoch": 2.881750596156544, "grad_norm": 0.04793728142976761, "learning_rate": 1.9708233973909386e-06, "loss": 0.0079, "step": 102720 }, { "epoch": 2.8820311404124, "grad_norm": 0.04636568948626518, "learning_rate": 1.9661476597933326e-06, "loss": 0.0149, "step": 102730 }, { "epoch": 2.8823116846682564, "grad_norm": 0.019528940320014954, "learning_rate": 1.9614719221957266e-06, "loss": 0.0066, "step": 102740 }, { "epoch": 2.882592228924113, "grad_norm": 0.6370291709899902, "learning_rate": 1.9567961845981206e-06, "loss": 0.0107, "step": 102750 }, { "epoch": 2.8828727731799693, "grad_norm": 0.042535215616226196, "learning_rate": 1.9521204470005142e-06, "loss": 0.0123, "step": 102760 }, { "epoch": 2.8831533174358253, "grad_norm": 0.3139062821865082, "learning_rate": 1.9474447094029082e-06, "loss": 0.0096, "step": 102770 }, { "epoch": 2.8834338616916817, "grad_norm": 0.05865916982293129, "learning_rate": 1.9427689718053022e-06, "loss": 0.0191, "step": 102780 }, { "epoch": 2.883714405947538, "grad_norm": 0.10181321948766708, "learning_rate": 1.9380932342076967e-06, "loss": 0.0035, "step": 102790 }, { "epoch": 2.8839949502033946, "grad_norm": 0.10031558573246002, "learning_rate": 1.9334174966100902e-06, "loss": 0.0131, "step": 102800 }, { "epoch": 2.884275494459251, "grad_norm": 0.16539478302001953, "learning_rate": 1.9287417590124842e-06, "loss": 0.0142, "step": 102810 }, { "epoch": 2.8845560387151075, "grad_norm": 1.2579761743545532, "learning_rate": 1.9240660214148782e-06, "loss": 0.0343, "step": 102820 }, { "epoch": 2.884836582970964, "grad_norm": 0.062036290764808655, "learning_rate": 1.9193902838172722e-06, "loss": 0.0454, "step": 102830 }, { "epoch": 2.88511712722682, "grad_norm": 0.027116799727082253, "learning_rate": 1.9147145462196662e-06, "loss": 0.0115, "step": 102840 }, { "epoch": 2.8853976714826763, "grad_norm": 1.128970980644226, "learning_rate": 1.9100388086220602e-06, "loss": 0.0303, "step": 102850 }, { "epoch": 2.885678215738533, "grad_norm": 0.010018359869718552, "learning_rate": 1.9053630710244542e-06, "loss": 0.0128, "step": 102860 }, { "epoch": 2.8859587599943892, "grad_norm": 0.04169579595327377, "learning_rate": 1.9006873334268482e-06, "loss": 0.0159, "step": 102870 }, { "epoch": 2.886239304250245, "grad_norm": 0.2859645187854767, "learning_rate": 1.8960115958292422e-06, "loss": 0.0081, "step": 102880 }, { "epoch": 2.8865198485061017, "grad_norm": 0.6862057447433472, "learning_rate": 1.891335858231636e-06, "loss": 0.0349, "step": 102890 }, { "epoch": 2.886800392761958, "grad_norm": 1.0968458652496338, "learning_rate": 1.88666012063403e-06, "loss": 0.0061, "step": 102900 }, { "epoch": 2.8870809370178145, "grad_norm": 0.03730802237987518, "learning_rate": 1.8819843830364238e-06, "loss": 0.0104, "step": 102910 }, { "epoch": 2.887361481273671, "grad_norm": 0.09799923747777939, "learning_rate": 1.8773086454388183e-06, "loss": 0.0086, "step": 102920 }, { "epoch": 2.8876420255295274, "grad_norm": 1.4123953580856323, "learning_rate": 1.872632907841212e-06, "loss": 0.0193, "step": 102930 }, { "epoch": 2.887922569785384, "grad_norm": 0.15352970361709595, "learning_rate": 1.867957170243606e-06, "loss": 0.0125, "step": 102940 }, { "epoch": 2.88820311404124, "grad_norm": 0.4005698561668396, "learning_rate": 1.863281432646e-06, "loss": 0.0079, "step": 102950 }, { "epoch": 2.8884836582970963, "grad_norm": 19.285825729370117, "learning_rate": 1.8586056950483938e-06, "loss": 0.0116, "step": 102960 }, { "epoch": 2.8887642025529527, "grad_norm": 0.4997519254684448, "learning_rate": 1.8539299574507878e-06, "loss": 0.0092, "step": 102970 }, { "epoch": 2.889044746808809, "grad_norm": 0.513091504573822, "learning_rate": 1.8492542198531818e-06, "loss": 0.0222, "step": 102980 }, { "epoch": 2.889325291064665, "grad_norm": 0.023457450792193413, "learning_rate": 1.844578482255576e-06, "loss": 0.0298, "step": 102990 }, { "epoch": 2.8896058353205216, "grad_norm": 0.07803378254175186, "learning_rate": 1.8399027446579699e-06, "loss": 0.0094, "step": 103000 }, { "epoch": 2.889886379576378, "grad_norm": 0.021065419539809227, "learning_rate": 1.8352270070603639e-06, "loss": 0.004, "step": 103010 }, { "epoch": 2.8901669238322345, "grad_norm": 1.0358675718307495, "learning_rate": 1.8305512694627579e-06, "loss": 0.054, "step": 103020 }, { "epoch": 2.890447468088091, "grad_norm": 0.4106592535972595, "learning_rate": 1.8258755318651516e-06, "loss": 0.0079, "step": 103030 }, { "epoch": 2.8907280123439474, "grad_norm": 0.014747955836355686, "learning_rate": 1.8211997942675456e-06, "loss": 0.0206, "step": 103040 }, { "epoch": 2.891008556599804, "grad_norm": 1.9296139478683472, "learning_rate": 1.8165240566699399e-06, "loss": 0.0294, "step": 103050 }, { "epoch": 2.89128910085566, "grad_norm": 0.27656233310699463, "learning_rate": 1.8118483190723339e-06, "loss": 0.0092, "step": 103060 }, { "epoch": 2.8915696451115163, "grad_norm": 0.03806574270129204, "learning_rate": 1.8071725814747279e-06, "loss": 0.0152, "step": 103070 }, { "epoch": 2.8918501893673727, "grad_norm": 0.010451585054397583, "learning_rate": 1.8024968438771217e-06, "loss": 0.0376, "step": 103080 }, { "epoch": 2.892130733623229, "grad_norm": 0.5201473832130432, "learning_rate": 1.7978211062795157e-06, "loss": 0.0053, "step": 103090 }, { "epoch": 2.8924112778790856, "grad_norm": 0.004889001604169607, "learning_rate": 1.7931453686819095e-06, "loss": 0.0038, "step": 103100 }, { "epoch": 2.8926918221349416, "grad_norm": 0.8812286257743835, "learning_rate": 1.7884696310843035e-06, "loss": 0.0401, "step": 103110 }, { "epoch": 2.892972366390798, "grad_norm": 0.679766058921814, "learning_rate": 1.7837938934866977e-06, "loss": 0.0265, "step": 103120 }, { "epoch": 2.8932529106466545, "grad_norm": 0.01892062835395336, "learning_rate": 1.7791181558890917e-06, "loss": 0.0044, "step": 103130 }, { "epoch": 2.893533454902511, "grad_norm": 1.373426079750061, "learning_rate": 1.7744424182914857e-06, "loss": 0.0147, "step": 103140 }, { "epoch": 2.8938139991583673, "grad_norm": 0.041015464812517166, "learning_rate": 1.7697666806938795e-06, "loss": 0.0099, "step": 103150 }, { "epoch": 2.894094543414224, "grad_norm": 0.27150240540504456, "learning_rate": 1.7650909430962735e-06, "loss": 0.0124, "step": 103160 }, { "epoch": 2.8943750876700802, "grad_norm": 0.2687862515449524, "learning_rate": 1.7604152054986675e-06, "loss": 0.0133, "step": 103170 }, { "epoch": 2.8946556319259362, "grad_norm": 0.5635643005371094, "learning_rate": 1.7557394679010617e-06, "loss": 0.044, "step": 103180 }, { "epoch": 2.8949361761817927, "grad_norm": 0.023642536252737045, "learning_rate": 1.7510637303034555e-06, "loss": 0.0265, "step": 103190 }, { "epoch": 2.895216720437649, "grad_norm": 0.4453774392604828, "learning_rate": 1.7463879927058495e-06, "loss": 0.0146, "step": 103200 }, { "epoch": 2.8954972646935055, "grad_norm": 1.1752134561538696, "learning_rate": 1.7417122551082435e-06, "loss": 0.0118, "step": 103210 }, { "epoch": 2.8957778089493615, "grad_norm": 0.12439398467540741, "learning_rate": 1.7370365175106373e-06, "loss": 0.0076, "step": 103220 }, { "epoch": 2.896058353205218, "grad_norm": 0.21112163364887238, "learning_rate": 1.7323607799130313e-06, "loss": 0.0298, "step": 103230 }, { "epoch": 2.8963388974610744, "grad_norm": 0.05436503887176514, "learning_rate": 1.7276850423154253e-06, "loss": 0.018, "step": 103240 }, { "epoch": 2.896619441716931, "grad_norm": 0.061942946165800095, "learning_rate": 1.7230093047178195e-06, "loss": 0.0062, "step": 103250 }, { "epoch": 2.8968999859727873, "grad_norm": 0.12283123284578323, "learning_rate": 1.7183335671202135e-06, "loss": 0.0022, "step": 103260 }, { "epoch": 2.8971805302286437, "grad_norm": 0.3721179962158203, "learning_rate": 1.7136578295226073e-06, "loss": 0.0203, "step": 103270 }, { "epoch": 2.8974610744845, "grad_norm": 0.20424634218215942, "learning_rate": 1.7089820919250013e-06, "loss": 0.0251, "step": 103280 }, { "epoch": 2.897741618740356, "grad_norm": 0.06426917016506195, "learning_rate": 1.704306354327395e-06, "loss": 0.0171, "step": 103290 }, { "epoch": 2.8980221629962126, "grad_norm": 0.08544386923313141, "learning_rate": 1.699630616729789e-06, "loss": 0.0135, "step": 103300 }, { "epoch": 2.898302707252069, "grad_norm": 0.19021093845367432, "learning_rate": 1.694954879132183e-06, "loss": 0.0111, "step": 103310 }, { "epoch": 2.8985832515079255, "grad_norm": 0.5261613726615906, "learning_rate": 1.6902791415345773e-06, "loss": 0.0448, "step": 103320 }, { "epoch": 2.8988637957637815, "grad_norm": 0.028960872441530228, "learning_rate": 1.6856034039369713e-06, "loss": 0.0105, "step": 103330 }, { "epoch": 2.899144340019638, "grad_norm": 0.061404746025800705, "learning_rate": 1.680927666339365e-06, "loss": 0.0108, "step": 103340 }, { "epoch": 2.8994248842754944, "grad_norm": 0.06872653216123581, "learning_rate": 1.676251928741759e-06, "loss": 0.004, "step": 103350 }, { "epoch": 2.899705428531351, "grad_norm": 0.6110506057739258, "learning_rate": 1.671576191144153e-06, "loss": 0.0174, "step": 103360 }, { "epoch": 2.8999859727872073, "grad_norm": 2.9792542457580566, "learning_rate": 1.6669004535465469e-06, "loss": 0.0363, "step": 103370 }, { "epoch": 2.9002665170430637, "grad_norm": 0.018417788669466972, "learning_rate": 1.662224715948941e-06, "loss": 0.0082, "step": 103380 }, { "epoch": 2.90054706129892, "grad_norm": 0.3878297805786133, "learning_rate": 1.657548978351335e-06, "loss": 0.0134, "step": 103390 }, { "epoch": 2.900827605554776, "grad_norm": 0.011827114969491959, "learning_rate": 1.652873240753729e-06, "loss": 0.016, "step": 103400 }, { "epoch": 2.9011081498106326, "grad_norm": 0.031813811510801315, "learning_rate": 1.6481975031561229e-06, "loss": 0.0277, "step": 103410 }, { "epoch": 2.901388694066489, "grad_norm": 0.3024926781654358, "learning_rate": 1.6435217655585169e-06, "loss": 0.0173, "step": 103420 }, { "epoch": 2.9016692383223455, "grad_norm": 0.04168740287423134, "learning_rate": 1.6388460279609109e-06, "loss": 0.0044, "step": 103430 }, { "epoch": 2.9019497825782015, "grad_norm": 0.01103522814810276, "learning_rate": 1.6341702903633047e-06, "loss": 0.013, "step": 103440 }, { "epoch": 2.902230326834058, "grad_norm": 0.12023364752531052, "learning_rate": 1.629494552765699e-06, "loss": 0.0199, "step": 103450 }, { "epoch": 2.9025108710899143, "grad_norm": 0.026647675782442093, "learning_rate": 1.624818815168093e-06, "loss": 0.0239, "step": 103460 }, { "epoch": 2.902791415345771, "grad_norm": 0.07406295090913773, "learning_rate": 1.620143077570487e-06, "loss": 0.0081, "step": 103470 }, { "epoch": 2.9030719596016272, "grad_norm": 3.286874532699585, "learning_rate": 1.6154673399728807e-06, "loss": 0.0229, "step": 103480 }, { "epoch": 2.9033525038574837, "grad_norm": 1.6723582744598389, "learning_rate": 1.6107916023752747e-06, "loss": 0.0251, "step": 103490 }, { "epoch": 2.90363304811334, "grad_norm": 0.0269312784075737, "learning_rate": 1.6061158647776687e-06, "loss": 0.0067, "step": 103500 }, { "epoch": 2.903913592369196, "grad_norm": 0.1462041437625885, "learning_rate": 1.601440127180063e-06, "loss": 0.0026, "step": 103510 }, { "epoch": 2.9041941366250525, "grad_norm": 0.6469116806983948, "learning_rate": 1.596764389582457e-06, "loss": 0.0113, "step": 103520 }, { "epoch": 2.904474680880909, "grad_norm": 0.03240128234028816, "learning_rate": 1.5920886519848507e-06, "loss": 0.006, "step": 103530 }, { "epoch": 2.9047552251367654, "grad_norm": 0.3085421621799469, "learning_rate": 1.5874129143872447e-06, "loss": 0.0129, "step": 103540 }, { "epoch": 2.9050357693926214, "grad_norm": 0.983409583568573, "learning_rate": 1.5827371767896385e-06, "loss": 0.0371, "step": 103550 }, { "epoch": 2.905316313648478, "grad_norm": 0.18716095387935638, "learning_rate": 1.5780614391920325e-06, "loss": 0.0184, "step": 103560 }, { "epoch": 2.9055968579043343, "grad_norm": 0.05059640854597092, "learning_rate": 1.5733857015944265e-06, "loss": 0.0155, "step": 103570 }, { "epoch": 2.9058774021601907, "grad_norm": 0.024403883144259453, "learning_rate": 1.5687099639968207e-06, "loss": 0.0094, "step": 103580 }, { "epoch": 2.906157946416047, "grad_norm": 0.034068118780851364, "learning_rate": 1.5640342263992147e-06, "loss": 0.0154, "step": 103590 }, { "epoch": 2.9064384906719036, "grad_norm": 0.06257811933755875, "learning_rate": 1.5593584888016085e-06, "loss": 0.0119, "step": 103600 }, { "epoch": 2.90671903492776, "grad_norm": 1.6473479270935059, "learning_rate": 1.5546827512040025e-06, "loss": 0.0275, "step": 103610 }, { "epoch": 2.906999579183616, "grad_norm": 0.42956534028053284, "learning_rate": 1.5500070136063965e-06, "loss": 0.0051, "step": 103620 }, { "epoch": 2.9072801234394725, "grad_norm": 0.039546508342027664, "learning_rate": 1.5453312760087905e-06, "loss": 0.0161, "step": 103630 }, { "epoch": 2.907560667695329, "grad_norm": 0.04486985504627228, "learning_rate": 1.5406555384111845e-06, "loss": 0.0249, "step": 103640 }, { "epoch": 2.9078412119511854, "grad_norm": 0.04705080762505531, "learning_rate": 1.5359798008135783e-06, "loss": 0.0031, "step": 103650 }, { "epoch": 2.9081217562070414, "grad_norm": 0.02041994407773018, "learning_rate": 1.5313040632159725e-06, "loss": 0.0048, "step": 103660 }, { "epoch": 2.908402300462898, "grad_norm": 0.18467459082603455, "learning_rate": 1.5266283256183663e-06, "loss": 0.0251, "step": 103670 }, { "epoch": 2.9086828447187543, "grad_norm": 1.6289552450180054, "learning_rate": 1.5219525880207603e-06, "loss": 0.0292, "step": 103680 }, { "epoch": 2.9089633889746107, "grad_norm": 0.195382222533226, "learning_rate": 1.5172768504231543e-06, "loss": 0.0087, "step": 103690 }, { "epoch": 2.909243933230467, "grad_norm": 0.02184424363076687, "learning_rate": 1.5126011128255483e-06, "loss": 0.0171, "step": 103700 }, { "epoch": 2.9095244774863236, "grad_norm": 0.05437736213207245, "learning_rate": 1.5079253752279423e-06, "loss": 0.0165, "step": 103710 }, { "epoch": 2.90980502174218, "grad_norm": 0.3513372242450714, "learning_rate": 1.5032496376303361e-06, "loss": 0.0319, "step": 103720 }, { "epoch": 2.910085565998036, "grad_norm": 0.35120633244514465, "learning_rate": 1.4985739000327303e-06, "loss": 0.0235, "step": 103730 }, { "epoch": 2.9103661102538925, "grad_norm": 1.2677499055862427, "learning_rate": 1.4938981624351241e-06, "loss": 0.027, "step": 103740 }, { "epoch": 2.910646654509749, "grad_norm": 0.010491810739040375, "learning_rate": 1.4892224248375181e-06, "loss": 0.0355, "step": 103750 }, { "epoch": 2.9109271987656054, "grad_norm": 0.03596312552690506, "learning_rate": 1.4845466872399123e-06, "loss": 0.0141, "step": 103760 }, { "epoch": 2.911207743021462, "grad_norm": 0.17907798290252686, "learning_rate": 1.4798709496423061e-06, "loss": 0.0198, "step": 103770 }, { "epoch": 2.911488287277318, "grad_norm": 0.0046021593734622, "learning_rate": 1.4751952120447001e-06, "loss": 0.0201, "step": 103780 }, { "epoch": 2.9117688315331742, "grad_norm": 0.7495173811912537, "learning_rate": 1.4705194744470941e-06, "loss": 0.0311, "step": 103790 }, { "epoch": 2.9120493757890307, "grad_norm": 0.1029186099767685, "learning_rate": 1.4658437368494881e-06, "loss": 0.0097, "step": 103800 }, { "epoch": 2.912329920044887, "grad_norm": 0.04479917138814926, "learning_rate": 1.4611679992518821e-06, "loss": 0.0235, "step": 103810 }, { "epoch": 2.9126104643007436, "grad_norm": 0.07454480230808258, "learning_rate": 1.456492261654276e-06, "loss": 0.0255, "step": 103820 }, { "epoch": 2.9128910085566, "grad_norm": 0.02404061146080494, "learning_rate": 1.4518165240566701e-06, "loss": 0.0063, "step": 103830 }, { "epoch": 2.9131715528124564, "grad_norm": 0.015419269911944866, "learning_rate": 1.447140786459064e-06, "loss": 0.0133, "step": 103840 }, { "epoch": 2.9134520970683124, "grad_norm": 0.02661055326461792, "learning_rate": 1.442465048861458e-06, "loss": 0.0143, "step": 103850 }, { "epoch": 2.913732641324169, "grad_norm": 0.302204966545105, "learning_rate": 1.437789311263852e-06, "loss": 0.023, "step": 103860 }, { "epoch": 2.9140131855800253, "grad_norm": 0.004901766777038574, "learning_rate": 1.433113573666246e-06, "loss": 0.0219, "step": 103870 }, { "epoch": 2.9142937298358818, "grad_norm": 0.514171302318573, "learning_rate": 1.42843783606864e-06, "loss": 0.0373, "step": 103880 }, { "epoch": 2.9145742740917377, "grad_norm": 0.5339508056640625, "learning_rate": 1.423762098471034e-06, "loss": 0.024, "step": 103890 }, { "epoch": 2.914854818347594, "grad_norm": 0.22060003876686096, "learning_rate": 1.419086360873428e-06, "loss": 0.0153, "step": 103900 }, { "epoch": 2.9151353626034506, "grad_norm": 0.019667508080601692, "learning_rate": 1.4144106232758217e-06, "loss": 0.0072, "step": 103910 }, { "epoch": 2.915415906859307, "grad_norm": 0.4152342677116394, "learning_rate": 1.409734885678216e-06, "loss": 0.0259, "step": 103920 }, { "epoch": 2.9156964511151635, "grad_norm": 0.15824441611766815, "learning_rate": 1.4050591480806097e-06, "loss": 0.0049, "step": 103930 }, { "epoch": 2.91597699537102, "grad_norm": 0.5765893459320068, "learning_rate": 1.4003834104830037e-06, "loss": 0.0288, "step": 103940 }, { "epoch": 2.9162575396268764, "grad_norm": 0.04963994771242142, "learning_rate": 1.3957076728853977e-06, "loss": 0.0023, "step": 103950 }, { "epoch": 2.9165380838827324, "grad_norm": 1.2491014003753662, "learning_rate": 1.3910319352877917e-06, "loss": 0.0155, "step": 103960 }, { "epoch": 2.916818628138589, "grad_norm": 2.310744285583496, "learning_rate": 1.3863561976901857e-06, "loss": 0.041, "step": 103970 }, { "epoch": 2.9170991723944453, "grad_norm": 0.09162010997533798, "learning_rate": 1.3816804600925795e-06, "loss": 0.005, "step": 103980 }, { "epoch": 2.9173797166503017, "grad_norm": 0.11886753141880035, "learning_rate": 1.3770047224949737e-06, "loss": 0.0081, "step": 103990 }, { "epoch": 2.9176602609061577, "grad_norm": 0.05963073670864105, "learning_rate": 1.3723289848973675e-06, "loss": 0.0048, "step": 104000 }, { "epoch": 2.917940805162014, "grad_norm": 0.051910869777202606, "learning_rate": 1.3676532472997615e-06, "loss": 0.0218, "step": 104010 }, { "epoch": 2.9182213494178706, "grad_norm": 0.02649329975247383, "learning_rate": 1.3629775097021558e-06, "loss": 0.016, "step": 104020 }, { "epoch": 2.918501893673727, "grad_norm": 0.018655523657798767, "learning_rate": 1.3583017721045495e-06, "loss": 0.006, "step": 104030 }, { "epoch": 2.9187824379295835, "grad_norm": 0.08279748260974884, "learning_rate": 1.3536260345069435e-06, "loss": 0.0197, "step": 104040 }, { "epoch": 2.91906298218544, "grad_norm": 0.5100643634796143, "learning_rate": 1.3489502969093373e-06, "loss": 0.013, "step": 104050 }, { "epoch": 2.9193435264412964, "grad_norm": 0.5989148020744324, "learning_rate": 1.3442745593117316e-06, "loss": 0.0361, "step": 104060 }, { "epoch": 2.9196240706971524, "grad_norm": 0.5745933055877686, "learning_rate": 1.3395988217141256e-06, "loss": 0.0245, "step": 104070 }, { "epoch": 2.919904614953009, "grad_norm": 0.025547334924340248, "learning_rate": 1.3349230841165193e-06, "loss": 0.0179, "step": 104080 }, { "epoch": 2.9201851592088652, "grad_norm": 4.112996578216553, "learning_rate": 1.3302473465189136e-06, "loss": 0.0162, "step": 104090 }, { "epoch": 2.9204657034647217, "grad_norm": 0.03983807563781738, "learning_rate": 1.3255716089213073e-06, "loss": 0.0042, "step": 104100 }, { "epoch": 2.9207462477205777, "grad_norm": 0.03853464499115944, "learning_rate": 1.3208958713237014e-06, "loss": 0.0079, "step": 104110 }, { "epoch": 2.921026791976434, "grad_norm": 0.08411505818367004, "learning_rate": 1.3162201337260954e-06, "loss": 0.0085, "step": 104120 }, { "epoch": 2.9213073362322906, "grad_norm": 0.02531927265226841, "learning_rate": 1.3115443961284894e-06, "loss": 0.008, "step": 104130 }, { "epoch": 2.921587880488147, "grad_norm": 0.035729121416807175, "learning_rate": 1.3068686585308834e-06, "loss": 0.0199, "step": 104140 }, { "epoch": 2.9218684247440034, "grad_norm": 1.871687650680542, "learning_rate": 1.3021929209332771e-06, "loss": 0.0181, "step": 104150 }, { "epoch": 2.92214896899986, "grad_norm": 0.2683276832103729, "learning_rate": 1.2975171833356714e-06, "loss": 0.0105, "step": 104160 }, { "epoch": 2.9224295132557163, "grad_norm": 0.12599776685237885, "learning_rate": 1.2928414457380652e-06, "loss": 0.0289, "step": 104170 }, { "epoch": 2.9227100575115723, "grad_norm": 0.006474985741078854, "learning_rate": 1.2881657081404592e-06, "loss": 0.0681, "step": 104180 }, { "epoch": 2.9229906017674288, "grad_norm": 0.7494492530822754, "learning_rate": 1.2834899705428532e-06, "loss": 0.0144, "step": 104190 }, { "epoch": 2.923271146023285, "grad_norm": 0.046957507729530334, "learning_rate": 1.2788142329452472e-06, "loss": 0.0057, "step": 104200 }, { "epoch": 2.9235516902791416, "grad_norm": 0.18559250235557556, "learning_rate": 1.2741384953476412e-06, "loss": 0.0279, "step": 104210 }, { "epoch": 2.9238322345349976, "grad_norm": 0.01870197430253029, "learning_rate": 1.2694627577500352e-06, "loss": 0.0053, "step": 104220 }, { "epoch": 2.924112778790854, "grad_norm": 0.5169330835342407, "learning_rate": 1.2647870201524292e-06, "loss": 0.0144, "step": 104230 }, { "epoch": 2.9243933230467105, "grad_norm": 0.48837199807167053, "learning_rate": 1.260111282554823e-06, "loss": 0.0255, "step": 104240 }, { "epoch": 2.924673867302567, "grad_norm": 0.026098079979419708, "learning_rate": 1.2554355449572172e-06, "loss": 0.0095, "step": 104250 }, { "epoch": 2.9249544115584234, "grad_norm": 0.27137231826782227, "learning_rate": 1.2507598073596112e-06, "loss": 0.0201, "step": 104260 }, { "epoch": 2.92523495581428, "grad_norm": 0.03895046189427376, "learning_rate": 1.246084069762005e-06, "loss": 0.0313, "step": 104270 }, { "epoch": 2.9255155000701363, "grad_norm": 0.1152832955121994, "learning_rate": 1.241408332164399e-06, "loss": 0.0222, "step": 104280 }, { "epoch": 2.9257960443259923, "grad_norm": 0.5210082530975342, "learning_rate": 1.236732594566793e-06, "loss": 0.0158, "step": 104290 }, { "epoch": 2.9260765885818487, "grad_norm": 0.18435320258140564, "learning_rate": 1.232056856969187e-06, "loss": 0.0242, "step": 104300 }, { "epoch": 2.926357132837705, "grad_norm": 0.351788192987442, "learning_rate": 1.227381119371581e-06, "loss": 0.0139, "step": 104310 }, { "epoch": 2.9266376770935616, "grad_norm": 0.008039392530918121, "learning_rate": 1.222705381773975e-06, "loss": 0.0171, "step": 104320 }, { "epoch": 2.926918221349418, "grad_norm": 0.03005852736532688, "learning_rate": 1.218029644176369e-06, "loss": 0.0099, "step": 104330 }, { "epoch": 2.927198765605274, "grad_norm": 0.0455145426094532, "learning_rate": 1.2133539065787628e-06, "loss": 0.0068, "step": 104340 }, { "epoch": 2.9274793098611305, "grad_norm": 0.6928292512893677, "learning_rate": 1.208678168981157e-06, "loss": 0.0163, "step": 104350 }, { "epoch": 2.927759854116987, "grad_norm": 0.26795655488967896, "learning_rate": 1.2040024313835508e-06, "loss": 0.0183, "step": 104360 }, { "epoch": 2.9280403983728434, "grad_norm": 0.023917347192764282, "learning_rate": 1.1993266937859448e-06, "loss": 0.0015, "step": 104370 }, { "epoch": 2.9283209426287, "grad_norm": 0.3423047661781311, "learning_rate": 1.1946509561883388e-06, "loss": 0.0209, "step": 104380 }, { "epoch": 2.9286014868845562, "grad_norm": 0.7405719757080078, "learning_rate": 1.1899752185907328e-06, "loss": 0.0115, "step": 104390 }, { "epoch": 2.9288820311404122, "grad_norm": 0.01211511343717575, "learning_rate": 1.1852994809931268e-06, "loss": 0.0208, "step": 104400 }, { "epoch": 2.9291625753962687, "grad_norm": 0.6789849996566772, "learning_rate": 1.1806237433955206e-06, "loss": 0.036, "step": 104410 }, { "epoch": 2.929443119652125, "grad_norm": 0.4395073652267456, "learning_rate": 1.1759480057979148e-06, "loss": 0.0151, "step": 104420 }, { "epoch": 2.9297236639079816, "grad_norm": 0.04608667641878128, "learning_rate": 1.1712722682003086e-06, "loss": 0.0025, "step": 104430 }, { "epoch": 2.930004208163838, "grad_norm": 1.834648847579956, "learning_rate": 1.1665965306027026e-06, "loss": 0.0206, "step": 104440 }, { "epoch": 2.930284752419694, "grad_norm": 0.9628393054008484, "learning_rate": 1.1619207930050966e-06, "loss": 0.0187, "step": 104450 }, { "epoch": 2.9305652966755504, "grad_norm": 0.015189438126981258, "learning_rate": 1.1572450554074906e-06, "loss": 0.0229, "step": 104460 }, { "epoch": 2.930845840931407, "grad_norm": 0.34028124809265137, "learning_rate": 1.1525693178098846e-06, "loss": 0.0105, "step": 104470 }, { "epoch": 2.9311263851872633, "grad_norm": 0.043859899044036865, "learning_rate": 1.1478935802122786e-06, "loss": 0.0091, "step": 104480 }, { "epoch": 2.9314069294431198, "grad_norm": 0.37176698446273804, "learning_rate": 1.1432178426146726e-06, "loss": 0.01, "step": 104490 }, { "epoch": 2.931687473698976, "grad_norm": 0.04456387832760811, "learning_rate": 1.1385421050170664e-06, "loss": 0.0106, "step": 104500 }, { "epoch": 2.9319680179548326, "grad_norm": 0.12474516034126282, "learning_rate": 1.1338663674194604e-06, "loss": 0.0071, "step": 104510 }, { "epoch": 2.9322485622106886, "grad_norm": 1.1437714099884033, "learning_rate": 1.1291906298218546e-06, "loss": 0.0419, "step": 104520 }, { "epoch": 2.932529106466545, "grad_norm": 0.3471706509590149, "learning_rate": 1.1245148922242484e-06, "loss": 0.0423, "step": 104530 }, { "epoch": 2.9328096507224015, "grad_norm": 0.19881848990917206, "learning_rate": 1.1198391546266424e-06, "loss": 0.023, "step": 104540 }, { "epoch": 2.933090194978258, "grad_norm": 0.014767300337553024, "learning_rate": 1.1151634170290364e-06, "loss": 0.0132, "step": 104550 }, { "epoch": 2.933370739234114, "grad_norm": 0.10918967425823212, "learning_rate": 1.1104876794314304e-06, "loss": 0.0031, "step": 104560 }, { "epoch": 2.9336512834899704, "grad_norm": 0.3696046471595764, "learning_rate": 1.1058119418338244e-06, "loss": 0.0119, "step": 104570 }, { "epoch": 2.933931827745827, "grad_norm": 0.1496376097202301, "learning_rate": 1.1011362042362184e-06, "loss": 0.0195, "step": 104580 }, { "epoch": 2.9342123720016833, "grad_norm": 0.1555151641368866, "learning_rate": 1.0964604666386124e-06, "loss": 0.03, "step": 104590 }, { "epoch": 2.9344929162575397, "grad_norm": 0.09054514020681381, "learning_rate": 1.0917847290410062e-06, "loss": 0.0109, "step": 104600 }, { "epoch": 2.934773460513396, "grad_norm": 0.1636245995759964, "learning_rate": 1.0871089914434002e-06, "loss": 0.024, "step": 104610 }, { "epoch": 2.9350540047692526, "grad_norm": 0.02421155944466591, "learning_rate": 1.0824332538457942e-06, "loss": 0.0036, "step": 104620 }, { "epoch": 2.9353345490251086, "grad_norm": 1.1721463203430176, "learning_rate": 1.0777575162481882e-06, "loss": 0.0568, "step": 104630 }, { "epoch": 2.935615093280965, "grad_norm": 0.03088798001408577, "learning_rate": 1.0730817786505822e-06, "loss": 0.0351, "step": 104640 }, { "epoch": 2.9358956375368215, "grad_norm": 0.033711668103933334, "learning_rate": 1.0684060410529762e-06, "loss": 0.0079, "step": 104650 }, { "epoch": 2.936176181792678, "grad_norm": 0.2522633969783783, "learning_rate": 1.0637303034553702e-06, "loss": 0.0066, "step": 104660 }, { "epoch": 2.936456726048534, "grad_norm": 0.8301776647567749, "learning_rate": 1.059054565857764e-06, "loss": 0.0533, "step": 104670 }, { "epoch": 2.9367372703043904, "grad_norm": 0.03067716956138611, "learning_rate": 1.0543788282601582e-06, "loss": 0.0281, "step": 104680 }, { "epoch": 2.937017814560247, "grad_norm": 0.24369950592517853, "learning_rate": 1.049703090662552e-06, "loss": 0.017, "step": 104690 }, { "epoch": 2.9372983588161032, "grad_norm": 0.3593599796295166, "learning_rate": 1.045027353064946e-06, "loss": 0.0253, "step": 104700 }, { "epoch": 2.9375789030719597, "grad_norm": 0.33232608437538147, "learning_rate": 1.04035161546734e-06, "loss": 0.0099, "step": 104710 }, { "epoch": 2.937859447327816, "grad_norm": 0.049838464707136154, "learning_rate": 1.035675877869734e-06, "loss": 0.0118, "step": 104720 }, { "epoch": 2.9381399915836726, "grad_norm": 0.03844582289457321, "learning_rate": 1.031000140272128e-06, "loss": 0.0024, "step": 104730 }, { "epoch": 2.9384205358395286, "grad_norm": 0.3146943747997284, "learning_rate": 1.0263244026745218e-06, "loss": 0.0268, "step": 104740 }, { "epoch": 2.938701080095385, "grad_norm": 0.04282139986753464, "learning_rate": 1.021648665076916e-06, "loss": 0.0237, "step": 104750 }, { "epoch": 2.9389816243512414, "grad_norm": 0.02549874037504196, "learning_rate": 1.01697292747931e-06, "loss": 0.0306, "step": 104760 }, { "epoch": 2.939262168607098, "grad_norm": 0.4198112487792969, "learning_rate": 1.0122971898817038e-06, "loss": 0.0133, "step": 104770 }, { "epoch": 2.939542712862954, "grad_norm": 0.2533705234527588, "learning_rate": 1.007621452284098e-06, "loss": 0.0363, "step": 104780 }, { "epoch": 2.9398232571188103, "grad_norm": 0.31768232583999634, "learning_rate": 1.0029457146864918e-06, "loss": 0.0345, "step": 104790 }, { "epoch": 2.9401038013746668, "grad_norm": 1.1788225173950195, "learning_rate": 9.982699770888858e-07, "loss": 0.0429, "step": 104800 }, { "epoch": 2.940384345630523, "grad_norm": 0.08350925892591476, "learning_rate": 9.935942394912798e-07, "loss": 0.0141, "step": 104810 }, { "epoch": 2.9406648898863796, "grad_norm": 0.013355121947824955, "learning_rate": 9.889185018936738e-07, "loss": 0.0061, "step": 104820 }, { "epoch": 2.940945434142236, "grad_norm": 0.5641232132911682, "learning_rate": 9.842427642960678e-07, "loss": 0.0093, "step": 104830 }, { "epoch": 2.9412259783980925, "grad_norm": 0.030952421948313713, "learning_rate": 9.795670266984616e-07, "loss": 0.013, "step": 104840 }, { "epoch": 2.9415065226539485, "grad_norm": 0.0722162202000618, "learning_rate": 9.748912891008558e-07, "loss": 0.0216, "step": 104850 }, { "epoch": 2.941787066909805, "grad_norm": 0.22227679193019867, "learning_rate": 9.702155515032496e-07, "loss": 0.0129, "step": 104860 }, { "epoch": 2.9420676111656614, "grad_norm": 0.044333506375551224, "learning_rate": 9.655398139056436e-07, "loss": 0.0329, "step": 104870 }, { "epoch": 2.942348155421518, "grad_norm": 0.014114942401647568, "learning_rate": 9.608640763080376e-07, "loss": 0.0273, "step": 104880 }, { "epoch": 2.942628699677374, "grad_norm": 0.030017416924238205, "learning_rate": 9.561883387104316e-07, "loss": 0.0321, "step": 104890 }, { "epoch": 2.9429092439332303, "grad_norm": 0.15347926318645477, "learning_rate": 9.515126011128255e-07, "loss": 0.0205, "step": 104900 }, { "epoch": 2.9431897881890867, "grad_norm": 0.07889334112405777, "learning_rate": 9.468368635152196e-07, "loss": 0.0242, "step": 104910 }, { "epoch": 2.943470332444943, "grad_norm": 0.019811240956187248, "learning_rate": 9.421611259176135e-07, "loss": 0.0083, "step": 104920 }, { "epoch": 2.9437508767007996, "grad_norm": 0.06732822954654694, "learning_rate": 9.374853883200075e-07, "loss": 0.0099, "step": 104930 }, { "epoch": 2.944031420956656, "grad_norm": 0.24352723360061646, "learning_rate": 9.328096507224014e-07, "loss": 0.0087, "step": 104940 }, { "epoch": 2.9443119652125125, "grad_norm": 0.74444580078125, "learning_rate": 9.281339131247955e-07, "loss": 0.019, "step": 104950 }, { "epoch": 2.9445925094683685, "grad_norm": 0.09255576133728027, "learning_rate": 9.234581755271894e-07, "loss": 0.0144, "step": 104960 }, { "epoch": 2.944873053724225, "grad_norm": 1.6036851406097412, "learning_rate": 9.187824379295833e-07, "loss": 0.023, "step": 104970 }, { "epoch": 2.9451535979800814, "grad_norm": 0.6489961743354797, "learning_rate": 9.141067003319774e-07, "loss": 0.0117, "step": 104980 }, { "epoch": 2.945434142235938, "grad_norm": 1.362121820449829, "learning_rate": 9.094309627343714e-07, "loss": 0.009, "step": 104990 }, { "epoch": 2.9457146864917942, "grad_norm": 0.030160069465637207, "learning_rate": 9.047552251367653e-07, "loss": 0.0168, "step": 105000 }, { "epoch": 2.9459952307476502, "grad_norm": 0.1654166579246521, "learning_rate": 9.000794875391594e-07, "loss": 0.0096, "step": 105010 }, { "epoch": 2.9462757750035067, "grad_norm": 1.0653605461120605, "learning_rate": 8.954037499415533e-07, "loss": 0.0449, "step": 105020 }, { "epoch": 2.946556319259363, "grad_norm": 0.3053143322467804, "learning_rate": 8.907280123439472e-07, "loss": 0.0229, "step": 105030 }, { "epoch": 2.9468368635152196, "grad_norm": 1.2157979011535645, "learning_rate": 8.860522747463412e-07, "loss": 0.0245, "step": 105040 }, { "epoch": 2.947117407771076, "grad_norm": 0.006724233739078045, "learning_rate": 8.813765371487353e-07, "loss": 0.005, "step": 105050 }, { "epoch": 2.9473979520269324, "grad_norm": 0.01480527687817812, "learning_rate": 8.767007995511292e-07, "loss": 0.0317, "step": 105060 }, { "epoch": 2.947678496282789, "grad_norm": 0.22601833939552307, "learning_rate": 8.720250619535231e-07, "loss": 0.0311, "step": 105070 }, { "epoch": 2.947959040538645, "grad_norm": 0.050489641726017, "learning_rate": 8.673493243559172e-07, "loss": 0.0043, "step": 105080 }, { "epoch": 2.9482395847945013, "grad_norm": 1.1686146259307861, "learning_rate": 8.626735867583111e-07, "loss": 0.0362, "step": 105090 }, { "epoch": 2.9485201290503578, "grad_norm": 0.3680408298969269, "learning_rate": 8.579978491607051e-07, "loss": 0.0192, "step": 105100 }, { "epoch": 2.948800673306214, "grad_norm": 0.24276359379291534, "learning_rate": 8.533221115630991e-07, "loss": 0.0217, "step": 105110 }, { "epoch": 2.94908121756207, "grad_norm": 0.02505679614841938, "learning_rate": 8.486463739654931e-07, "loss": 0.0087, "step": 105120 }, { "epoch": 2.9493617618179266, "grad_norm": 1.0839229822158813, "learning_rate": 8.43970636367887e-07, "loss": 0.0244, "step": 105130 }, { "epoch": 2.949642306073783, "grad_norm": 0.09000681340694427, "learning_rate": 8.392948987702811e-07, "loss": 0.01, "step": 105140 }, { "epoch": 2.9499228503296395, "grad_norm": 0.5920588374137878, "learning_rate": 8.34619161172675e-07, "loss": 0.0172, "step": 105150 }, { "epoch": 2.950203394585496, "grad_norm": 0.01843724027276039, "learning_rate": 8.299434235750689e-07, "loss": 0.0151, "step": 105160 }, { "epoch": 2.9504839388413524, "grad_norm": 0.06361169368028641, "learning_rate": 8.252676859774629e-07, "loss": 0.0104, "step": 105170 }, { "epoch": 2.950764483097209, "grad_norm": 0.06798262149095535, "learning_rate": 8.20591948379857e-07, "loss": 0.0351, "step": 105180 }, { "epoch": 2.951045027353065, "grad_norm": 1.4484113454818726, "learning_rate": 8.159162107822509e-07, "loss": 0.04, "step": 105190 }, { "epoch": 2.9513255716089213, "grad_norm": 0.1105637401342392, "learning_rate": 8.112404731846448e-07, "loss": 0.0138, "step": 105200 }, { "epoch": 2.9516061158647777, "grad_norm": 0.8745942115783691, "learning_rate": 8.06564735587039e-07, "loss": 0.0444, "step": 105210 }, { "epoch": 2.951886660120634, "grad_norm": 0.05481605976819992, "learning_rate": 8.018889979894328e-07, "loss": 0.0141, "step": 105220 }, { "epoch": 2.95216720437649, "grad_norm": 0.025442123413085938, "learning_rate": 7.972132603918269e-07, "loss": 0.0195, "step": 105230 }, { "epoch": 2.9524477486323466, "grad_norm": 2.9656014442443848, "learning_rate": 7.925375227942209e-07, "loss": 0.0195, "step": 105240 }, { "epoch": 2.952728292888203, "grad_norm": 1.0658146142959595, "learning_rate": 7.878617851966149e-07, "loss": 0.0114, "step": 105250 }, { "epoch": 2.9530088371440595, "grad_norm": 0.07838192582130432, "learning_rate": 7.831860475990088e-07, "loss": 0.0092, "step": 105260 }, { "epoch": 2.953289381399916, "grad_norm": 0.5551972389221191, "learning_rate": 7.785103100014028e-07, "loss": 0.0256, "step": 105270 }, { "epoch": 2.9535699256557724, "grad_norm": 0.033490341156721115, "learning_rate": 7.738345724037968e-07, "loss": 0.0377, "step": 105280 }, { "epoch": 2.953850469911629, "grad_norm": 0.3661574125289917, "learning_rate": 7.691588348061907e-07, "loss": 0.0262, "step": 105290 }, { "epoch": 2.954131014167485, "grad_norm": 0.01492878794670105, "learning_rate": 7.644830972085848e-07, "loss": 0.0094, "step": 105300 }, { "epoch": 2.9544115584233412, "grad_norm": 1.174522876739502, "learning_rate": 7.598073596109787e-07, "loss": 0.0103, "step": 105310 }, { "epoch": 2.9546921026791977, "grad_norm": 0.22091910243034363, "learning_rate": 7.551316220133727e-07, "loss": 0.017, "step": 105320 }, { "epoch": 2.954972646935054, "grad_norm": 0.0607893243432045, "learning_rate": 7.504558844157667e-07, "loss": 0.0049, "step": 105330 }, { "epoch": 2.95525319119091, "grad_norm": 0.1586768925189972, "learning_rate": 7.457801468181606e-07, "loss": 0.0196, "step": 105340 }, { "epoch": 2.9555337354467666, "grad_norm": 0.4014427661895752, "learning_rate": 7.411044092205546e-07, "loss": 0.0101, "step": 105350 }, { "epoch": 2.955814279702623, "grad_norm": 1.3560878038406372, "learning_rate": 7.364286716229486e-07, "loss": 0.0401, "step": 105360 }, { "epoch": 2.9560948239584794, "grad_norm": 0.044926516711711884, "learning_rate": 7.317529340253426e-07, "loss": 0.025, "step": 105370 }, { "epoch": 2.956375368214336, "grad_norm": 0.7227925658226013, "learning_rate": 7.270771964277366e-07, "loss": 0.0094, "step": 105380 }, { "epoch": 2.9566559124701923, "grad_norm": 0.2185213267803192, "learning_rate": 7.224014588301305e-07, "loss": 0.0197, "step": 105390 }, { "epoch": 2.9569364567260488, "grad_norm": 0.0777713879942894, "learning_rate": 7.177257212325245e-07, "loss": 0.0071, "step": 105400 }, { "epoch": 2.9572170009819048, "grad_norm": 2.3115618228912354, "learning_rate": 7.130499836349184e-07, "loss": 0.016, "step": 105410 }, { "epoch": 2.957497545237761, "grad_norm": 0.054590243846178055, "learning_rate": 7.083742460373124e-07, "loss": 0.025, "step": 105420 }, { "epoch": 2.9577780894936176, "grad_norm": 0.14335399866104126, "learning_rate": 7.036985084397065e-07, "loss": 0.0196, "step": 105430 }, { "epoch": 2.958058633749474, "grad_norm": 0.6822972297668457, "learning_rate": 6.990227708421004e-07, "loss": 0.0429, "step": 105440 }, { "epoch": 2.95833917800533, "grad_norm": 0.15236955881118774, "learning_rate": 6.943470332444944e-07, "loss": 0.0086, "step": 105450 }, { "epoch": 2.9586197222611865, "grad_norm": 0.39362671971321106, "learning_rate": 6.896712956468883e-07, "loss": 0.0481, "step": 105460 }, { "epoch": 2.958900266517043, "grad_norm": 2.7698214054107666, "learning_rate": 6.849955580492823e-07, "loss": 0.0327, "step": 105470 }, { "epoch": 2.9591808107728994, "grad_norm": 0.20050425827503204, "learning_rate": 6.803198204516763e-07, "loss": 0.0325, "step": 105480 }, { "epoch": 2.959461355028756, "grad_norm": 0.48470041155815125, "learning_rate": 6.756440828540703e-07, "loss": 0.0139, "step": 105490 }, { "epoch": 2.9597418992846123, "grad_norm": 0.2409341037273407, "learning_rate": 6.709683452564643e-07, "loss": 0.013, "step": 105500 }, { "epoch": 2.9600224435404687, "grad_norm": 0.09651729464530945, "learning_rate": 6.662926076588583e-07, "loss": 0.0066, "step": 105510 }, { "epoch": 2.9603029877963247, "grad_norm": 0.01795782893896103, "learning_rate": 6.616168700612522e-07, "loss": 0.0144, "step": 105520 }, { "epoch": 2.960583532052181, "grad_norm": 0.03530487045645714, "learning_rate": 6.569411324636462e-07, "loss": 0.0037, "step": 105530 }, { "epoch": 2.9608640763080376, "grad_norm": 0.04937652125954628, "learning_rate": 6.522653948660401e-07, "loss": 0.0112, "step": 105540 }, { "epoch": 2.961144620563894, "grad_norm": 0.1732945591211319, "learning_rate": 6.475896572684342e-07, "loss": 0.0234, "step": 105550 }, { "epoch": 2.96142516481975, "grad_norm": 2.3823812007904053, "learning_rate": 6.429139196708282e-07, "loss": 0.0186, "step": 105560 }, { "epoch": 2.9617057090756065, "grad_norm": 0.15581014752388, "learning_rate": 6.382381820732221e-07, "loss": 0.0147, "step": 105570 }, { "epoch": 2.961986253331463, "grad_norm": 0.6481471061706543, "learning_rate": 6.335624444756161e-07, "loss": 0.044, "step": 105580 }, { "epoch": 2.9622667975873194, "grad_norm": 2.202317953109741, "learning_rate": 6.2888670687801e-07, "loss": 0.0208, "step": 105590 }, { "epoch": 2.962547341843176, "grad_norm": 1.0136646032333374, "learning_rate": 6.24210969280404e-07, "loss": 0.0086, "step": 105600 }, { "epoch": 2.9628278860990322, "grad_norm": 0.07154334336519241, "learning_rate": 6.19535231682798e-07, "loss": 0.037, "step": 105610 }, { "epoch": 2.9631084303548887, "grad_norm": 0.041414774954319, "learning_rate": 6.14859494085192e-07, "loss": 0.0253, "step": 105620 }, { "epoch": 2.9633889746107447, "grad_norm": 0.055881865322589874, "learning_rate": 6.10183756487586e-07, "loss": 0.0292, "step": 105630 }, { "epoch": 2.963669518866601, "grad_norm": 1.634820580482483, "learning_rate": 6.055080188899799e-07, "loss": 0.0179, "step": 105640 }, { "epoch": 2.9639500631224576, "grad_norm": 0.19082342088222504, "learning_rate": 6.008322812923739e-07, "loss": 0.0425, "step": 105650 }, { "epoch": 2.964230607378314, "grad_norm": 0.07103191316127777, "learning_rate": 5.961565436947679e-07, "loss": 0.0364, "step": 105660 }, { "epoch": 2.9645111516341704, "grad_norm": 0.010790450498461723, "learning_rate": 5.914808060971618e-07, "loss": 0.0251, "step": 105670 }, { "epoch": 2.9647916958900264, "grad_norm": 0.948830783367157, "learning_rate": 5.868050684995559e-07, "loss": 0.024, "step": 105680 }, { "epoch": 2.965072240145883, "grad_norm": 0.0798567533493042, "learning_rate": 5.821293309019498e-07, "loss": 0.0348, "step": 105690 }, { "epoch": 2.9653527844017393, "grad_norm": 0.03327339142560959, "learning_rate": 5.774535933043438e-07, "loss": 0.035, "step": 105700 }, { "epoch": 2.9656333286575958, "grad_norm": 0.07046222686767578, "learning_rate": 5.727778557067378e-07, "loss": 0.025, "step": 105710 }, { "epoch": 2.965913872913452, "grad_norm": 0.1261732429265976, "learning_rate": 5.681021181091317e-07, "loss": 0.0282, "step": 105720 }, { "epoch": 2.9661944171693087, "grad_norm": 0.3958442211151123, "learning_rate": 5.634263805115257e-07, "loss": 0.0377, "step": 105730 }, { "epoch": 2.966474961425165, "grad_norm": 0.030193351209163666, "learning_rate": 5.587506429139197e-07, "loss": 0.0386, "step": 105740 }, { "epoch": 2.966755505681021, "grad_norm": 0.5185174345970154, "learning_rate": 5.540749053163137e-07, "loss": 0.015, "step": 105750 }, { "epoch": 2.9670360499368775, "grad_norm": 0.3005850613117218, "learning_rate": 5.493991677187077e-07, "loss": 0.014, "step": 105760 }, { "epoch": 2.967316594192734, "grad_norm": 0.1115543469786644, "learning_rate": 5.447234301211016e-07, "loss": 0.0056, "step": 105770 }, { "epoch": 2.9675971384485904, "grad_norm": 0.027783846482634544, "learning_rate": 5.400476925234956e-07, "loss": 0.011, "step": 105780 }, { "epoch": 2.9678776827044464, "grad_norm": 0.3256705105304718, "learning_rate": 5.353719549258896e-07, "loss": 0.014, "step": 105790 }, { "epoch": 2.968158226960303, "grad_norm": 0.1154978945851326, "learning_rate": 5.306962173282836e-07, "loss": 0.0165, "step": 105800 }, { "epoch": 2.9684387712161593, "grad_norm": 0.017267635092139244, "learning_rate": 5.260204797306776e-07, "loss": 0.0035, "step": 105810 }, { "epoch": 2.9687193154720157, "grad_norm": 0.056331753730773926, "learning_rate": 5.213447421330715e-07, "loss": 0.0068, "step": 105820 }, { "epoch": 2.968999859727872, "grad_norm": 0.011419291608035564, "learning_rate": 5.166690045354655e-07, "loss": 0.0305, "step": 105830 }, { "epoch": 2.9692804039837286, "grad_norm": 0.036169666796922684, "learning_rate": 5.119932669378595e-07, "loss": 0.0282, "step": 105840 }, { "epoch": 2.969560948239585, "grad_norm": 0.0700477659702301, "learning_rate": 5.073175293402534e-07, "loss": 0.0141, "step": 105850 }, { "epoch": 2.969841492495441, "grad_norm": 0.050442907959222794, "learning_rate": 5.026417917426474e-07, "loss": 0.0283, "step": 105860 }, { "epoch": 2.9701220367512975, "grad_norm": 0.16829898953437805, "learning_rate": 4.979660541450414e-07, "loss": 0.0106, "step": 105870 }, { "epoch": 2.970402581007154, "grad_norm": 0.42418399453163147, "learning_rate": 4.932903165474354e-07, "loss": 0.0073, "step": 105880 }, { "epoch": 2.9706831252630104, "grad_norm": 0.5197057127952576, "learning_rate": 4.886145789498294e-07, "loss": 0.0102, "step": 105890 }, { "epoch": 2.9709636695188664, "grad_norm": 0.6216461062431335, "learning_rate": 4.839388413522233e-07, "loss": 0.0108, "step": 105900 }, { "epoch": 2.971244213774723, "grad_norm": 0.047130465507507324, "learning_rate": 4.792631037546173e-07, "loss": 0.0192, "step": 105910 }, { "epoch": 2.9715247580305792, "grad_norm": 0.03316589444875717, "learning_rate": 4.7458736615701126e-07, "loss": 0.0125, "step": 105920 }, { "epoch": 2.9718053022864357, "grad_norm": 0.07141388207674026, "learning_rate": 4.6991162855940526e-07, "loss": 0.0052, "step": 105930 }, { "epoch": 2.972085846542292, "grad_norm": 0.02531948685646057, "learning_rate": 4.6523589096179926e-07, "loss": 0.0043, "step": 105940 }, { "epoch": 2.9723663907981486, "grad_norm": 0.02700342983007431, "learning_rate": 4.605601533641932e-07, "loss": 0.0276, "step": 105950 }, { "epoch": 2.972646935054005, "grad_norm": 0.04894736409187317, "learning_rate": 4.558844157665872e-07, "loss": 0.0182, "step": 105960 }, { "epoch": 2.972927479309861, "grad_norm": 0.15919183194637299, "learning_rate": 4.5120867816898116e-07, "loss": 0.042, "step": 105970 }, { "epoch": 2.9732080235657175, "grad_norm": 0.31537535786628723, "learning_rate": 4.4653294057137517e-07, "loss": 0.0367, "step": 105980 }, { "epoch": 2.973488567821574, "grad_norm": 0.019639046862721443, "learning_rate": 4.4185720297376917e-07, "loss": 0.0323, "step": 105990 }, { "epoch": 2.9737691120774303, "grad_norm": 0.37779080867767334, "learning_rate": 4.3718146537616307e-07, "loss": 0.0159, "step": 106000 }, { "epoch": 2.9740496563332863, "grad_norm": 0.08432567864656448, "learning_rate": 4.325057277785571e-07, "loss": 0.0229, "step": 106010 }, { "epoch": 2.9743302005891428, "grad_norm": 0.2908511161804199, "learning_rate": 4.27829990180951e-07, "loss": 0.0029, "step": 106020 }, { "epoch": 2.974610744844999, "grad_norm": 0.771959662437439, "learning_rate": 4.23154252583345e-07, "loss": 0.0597, "step": 106030 }, { "epoch": 2.9748912891008557, "grad_norm": 1.8663444519042969, "learning_rate": 4.184785149857391e-07, "loss": 0.0176, "step": 106040 }, { "epoch": 2.975171833356712, "grad_norm": 0.09415959566831589, "learning_rate": 4.1380277738813297e-07, "loss": 0.0094, "step": 106050 }, { "epoch": 2.9754523776125685, "grad_norm": 0.08607950806617737, "learning_rate": 4.0912703979052697e-07, "loss": 0.013, "step": 106060 }, { "epoch": 2.975732921868425, "grad_norm": 0.05412375181913376, "learning_rate": 4.04451302192921e-07, "loss": 0.0296, "step": 106070 }, { "epoch": 2.976013466124281, "grad_norm": 0.050997160375118256, "learning_rate": 3.997755645953149e-07, "loss": 0.0057, "step": 106080 }, { "epoch": 2.9762940103801374, "grad_norm": 0.018107540905475616, "learning_rate": 3.9509982699770893e-07, "loss": 0.0425, "step": 106090 }, { "epoch": 2.976574554635994, "grad_norm": 0.03529658541083336, "learning_rate": 3.904240894001029e-07, "loss": 0.0249, "step": 106100 }, { "epoch": 2.9768550988918503, "grad_norm": 0.02697630040347576, "learning_rate": 3.857483518024969e-07, "loss": 0.0308, "step": 106110 }, { "epoch": 2.9771356431477063, "grad_norm": 1.5541963577270508, "learning_rate": 3.8107261420489083e-07, "loss": 0.0267, "step": 106120 }, { "epoch": 2.9774161874035627, "grad_norm": 0.9327417016029358, "learning_rate": 3.7639687660728483e-07, "loss": 0.0182, "step": 106130 }, { "epoch": 2.977696731659419, "grad_norm": 0.1004006415605545, "learning_rate": 3.7172113900967883e-07, "loss": 0.0165, "step": 106140 }, { "epoch": 2.9779772759152756, "grad_norm": 0.5204915404319763, "learning_rate": 3.670454014120728e-07, "loss": 0.0139, "step": 106150 }, { "epoch": 2.978257820171132, "grad_norm": 0.9390612244606018, "learning_rate": 3.6236966381446673e-07, "loss": 0.0402, "step": 106160 }, { "epoch": 2.9785383644269885, "grad_norm": 0.08614825457334518, "learning_rate": 3.5769392621686073e-07, "loss": 0.0038, "step": 106170 }, { "epoch": 2.978818908682845, "grad_norm": 0.058124035596847534, "learning_rate": 3.530181886192547e-07, "loss": 0.0302, "step": 106180 }, { "epoch": 2.979099452938701, "grad_norm": 0.11031382530927658, "learning_rate": 3.483424510216487e-07, "loss": 0.0207, "step": 106190 }, { "epoch": 2.9793799971945574, "grad_norm": 0.26111581921577454, "learning_rate": 3.436667134240427e-07, "loss": 0.0039, "step": 106200 }, { "epoch": 2.979660541450414, "grad_norm": 0.553304135799408, "learning_rate": 3.3899097582643664e-07, "loss": 0.0176, "step": 106210 }, { "epoch": 2.9799410857062703, "grad_norm": 0.10810796916484833, "learning_rate": 3.343152382288306e-07, "loss": 0.0301, "step": 106220 }, { "epoch": 2.9802216299621263, "grad_norm": 0.06880349665880203, "learning_rate": 3.296395006312246e-07, "loss": 0.0177, "step": 106230 }, { "epoch": 2.9805021742179827, "grad_norm": 0.05537901818752289, "learning_rate": 3.249637630336186e-07, "loss": 0.002, "step": 106240 }, { "epoch": 2.980782718473839, "grad_norm": 1.4013233184814453, "learning_rate": 3.2028802543601254e-07, "loss": 0.0121, "step": 106250 }, { "epoch": 2.9810632627296956, "grad_norm": 0.6145200133323669, "learning_rate": 3.1561228783840654e-07, "loss": 0.0375, "step": 106260 }, { "epoch": 2.981343806985552, "grad_norm": 0.007210355717688799, "learning_rate": 3.109365502408005e-07, "loss": 0.0234, "step": 106270 }, { "epoch": 2.9816243512414085, "grad_norm": 1.736182689666748, "learning_rate": 3.0626081264319444e-07, "loss": 0.055, "step": 106280 }, { "epoch": 2.981904895497265, "grad_norm": 0.8828563094139099, "learning_rate": 3.0158507504558844e-07, "loss": 0.0237, "step": 106290 }, { "epoch": 2.982185439753121, "grad_norm": 0.6054939031600952, "learning_rate": 2.9690933744798244e-07, "loss": 0.0498, "step": 106300 }, { "epoch": 2.9824659840089773, "grad_norm": 0.21033717691898346, "learning_rate": 2.922335998503764e-07, "loss": 0.019, "step": 106310 }, { "epoch": 2.9827465282648338, "grad_norm": 3.2566659450531006, "learning_rate": 2.875578622527704e-07, "loss": 0.0154, "step": 106320 }, { "epoch": 2.98302707252069, "grad_norm": 1.1765236854553223, "learning_rate": 2.828821246551644e-07, "loss": 0.0231, "step": 106330 }, { "epoch": 2.9833076167765467, "grad_norm": 0.0703609511256218, "learning_rate": 2.7820638705755835e-07, "loss": 0.0282, "step": 106340 }, { "epoch": 2.9835881610324027, "grad_norm": 0.06554500013589859, "learning_rate": 2.735306494599523e-07, "loss": 0.0111, "step": 106350 }, { "epoch": 2.983868705288259, "grad_norm": 0.2463730126619339, "learning_rate": 2.688549118623463e-07, "loss": 0.038, "step": 106360 }, { "epoch": 2.9841492495441155, "grad_norm": 0.02527710795402527, "learning_rate": 2.6417917426474025e-07, "loss": 0.0327, "step": 106370 }, { "epoch": 2.984429793799972, "grad_norm": 0.6298306584358215, "learning_rate": 2.5950343666713425e-07, "loss": 0.0258, "step": 106380 }, { "epoch": 2.9847103380558284, "grad_norm": 1.1767257452011108, "learning_rate": 2.5482769906952825e-07, "loss": 0.0209, "step": 106390 }, { "epoch": 2.984990882311685, "grad_norm": 0.2245771288871765, "learning_rate": 2.501519614719222e-07, "loss": 0.0194, "step": 106400 }, { "epoch": 2.9852714265675413, "grad_norm": 0.8835018873214722, "learning_rate": 2.4547622387431615e-07, "loss": 0.0415, "step": 106410 }, { "epoch": 2.9855519708233973, "grad_norm": 1.0156302452087402, "learning_rate": 2.4080048627671015e-07, "loss": 0.0358, "step": 106420 }, { "epoch": 2.9858325150792537, "grad_norm": 0.03269356116652489, "learning_rate": 2.3612474867910416e-07, "loss": 0.0173, "step": 106430 }, { "epoch": 2.98611305933511, "grad_norm": 0.6813226342201233, "learning_rate": 2.3144901108149813e-07, "loss": 0.0126, "step": 106440 }, { "epoch": 2.9863936035909666, "grad_norm": 1.4566065073013306, "learning_rate": 2.2677327348389208e-07, "loss": 0.0375, "step": 106450 }, { "epoch": 2.9866741478468226, "grad_norm": 0.05154797062277794, "learning_rate": 2.2209753588628606e-07, "loss": 0.0467, "step": 106460 }, { "epoch": 2.986954692102679, "grad_norm": 0.25796645879745483, "learning_rate": 2.1742179828868006e-07, "loss": 0.0075, "step": 106470 }, { "epoch": 2.9872352363585355, "grad_norm": 0.3114092946052551, "learning_rate": 2.1274606069107404e-07, "loss": 0.0297, "step": 106480 }, { "epoch": 2.987515780614392, "grad_norm": 0.1841002106666565, "learning_rate": 2.08070323093468e-07, "loss": 0.0131, "step": 106490 }, { "epoch": 2.9877963248702484, "grad_norm": 0.2577696442604065, "learning_rate": 2.0339458549586196e-07, "loss": 0.0085, "step": 106500 }, { "epoch": 2.988076869126105, "grad_norm": 0.1216774582862854, "learning_rate": 1.9871884789825594e-07, "loss": 0.0229, "step": 106510 }, { "epoch": 2.9883574133819613, "grad_norm": 0.056432392448186874, "learning_rate": 1.9404311030064994e-07, "loss": 0.0081, "step": 106520 }, { "epoch": 2.9886379576378173, "grad_norm": 0.5276334285736084, "learning_rate": 1.8936737270304391e-07, "loss": 0.0101, "step": 106530 }, { "epoch": 2.9889185018936737, "grad_norm": 1.5775359869003296, "learning_rate": 1.846916351054379e-07, "loss": 0.0442, "step": 106540 }, { "epoch": 2.98919904614953, "grad_norm": 1.4748293161392212, "learning_rate": 1.8001589750783187e-07, "loss": 0.0294, "step": 106550 }, { "epoch": 2.9894795904053866, "grad_norm": 0.15841378271579742, "learning_rate": 1.7534015991022584e-07, "loss": 0.0268, "step": 106560 }, { "epoch": 2.9897601346612426, "grad_norm": 1.7957254648208618, "learning_rate": 1.7066442231261982e-07, "loss": 0.0198, "step": 106570 }, { "epoch": 2.990040678917099, "grad_norm": 0.0439264215528965, "learning_rate": 1.659886847150138e-07, "loss": 0.0069, "step": 106580 }, { "epoch": 2.9903212231729555, "grad_norm": 0.07253038883209229, "learning_rate": 1.6131294711740777e-07, "loss": 0.0194, "step": 106590 }, { "epoch": 2.990601767428812, "grad_norm": 0.05028906464576721, "learning_rate": 1.5663720951980177e-07, "loss": 0.008, "step": 106600 }, { "epoch": 2.9908823116846683, "grad_norm": 0.04587754234671593, "learning_rate": 1.5196147192219572e-07, "loss": 0.0108, "step": 106610 }, { "epoch": 2.991162855940525, "grad_norm": 0.03947416692972183, "learning_rate": 1.4728573432458972e-07, "loss": 0.0125, "step": 106620 }, { "epoch": 2.991443400196381, "grad_norm": 0.11357130855321884, "learning_rate": 1.426099967269837e-07, "loss": 0.0397, "step": 106630 }, { "epoch": 2.991723944452237, "grad_norm": 0.06484346836805344, "learning_rate": 1.3793425912937765e-07, "loss": 0.0051, "step": 106640 }, { "epoch": 2.9920044887080937, "grad_norm": 0.08865038305521011, "learning_rate": 1.3325852153177165e-07, "loss": 0.0045, "step": 106650 }, { "epoch": 2.99228503296395, "grad_norm": 0.43360424041748047, "learning_rate": 1.2858278393416563e-07, "loss": 0.0179, "step": 106660 }, { "epoch": 2.9925655772198065, "grad_norm": 0.03234716132283211, "learning_rate": 1.239070463365596e-07, "loss": 0.0157, "step": 106670 }, { "epoch": 2.9928461214756625, "grad_norm": 0.1303013563156128, "learning_rate": 1.1923130873895358e-07, "loss": 0.0368, "step": 106680 }, { "epoch": 2.993126665731519, "grad_norm": 0.43130385875701904, "learning_rate": 1.1455557114134757e-07, "loss": 0.0142, "step": 106690 }, { "epoch": 2.9934072099873754, "grad_norm": 0.3262905776500702, "learning_rate": 1.0987983354374153e-07, "loss": 0.0156, "step": 106700 }, { "epoch": 2.993687754243232, "grad_norm": 0.47026899456977844, "learning_rate": 1.052040959461355e-07, "loss": 0.0103, "step": 106710 }, { "epoch": 2.9939682984990883, "grad_norm": 0.14596804976463318, "learning_rate": 1.005283583485295e-07, "loss": 0.0216, "step": 106720 }, { "epoch": 2.9942488427549447, "grad_norm": 0.07478600740432739, "learning_rate": 9.585262075092346e-08, "loss": 0.0355, "step": 106730 }, { "epoch": 2.994529387010801, "grad_norm": 0.028799375519156456, "learning_rate": 9.117688315331743e-08, "loss": 0.0196, "step": 106740 }, { "epoch": 2.994809931266657, "grad_norm": 0.7213448882102966, "learning_rate": 8.650114555571142e-08, "loss": 0.0162, "step": 106750 }, { "epoch": 2.9950904755225136, "grad_norm": 0.06582538783550262, "learning_rate": 8.18254079581054e-08, "loss": 0.0256, "step": 106760 }, { "epoch": 2.99537101977837, "grad_norm": 1.7119001150131226, "learning_rate": 7.714967036049937e-08, "loss": 0.0199, "step": 106770 }, { "epoch": 2.9956515640342265, "grad_norm": 0.03047153539955616, "learning_rate": 7.247393276289335e-08, "loss": 0.0581, "step": 106780 }, { "epoch": 2.9959321082900825, "grad_norm": 0.019939130172133446, "learning_rate": 6.779819516528734e-08, "loss": 0.0266, "step": 106790 }, { "epoch": 2.996212652545939, "grad_norm": 0.028703441843390465, "learning_rate": 6.31224575676813e-08, "loss": 0.0145, "step": 106800 }, { "epoch": 2.9964931968017954, "grad_norm": 0.13955998420715332, "learning_rate": 5.8446719970075276e-08, "loss": 0.0336, "step": 106810 }, { "epoch": 2.996773741057652, "grad_norm": 0.06898689270019531, "learning_rate": 5.377098237246926e-08, "loss": 0.0187, "step": 106820 }, { "epoch": 2.9970542853135083, "grad_norm": 0.09711993485689163, "learning_rate": 4.9095244774863234e-08, "loss": 0.0166, "step": 106830 }, { "epoch": 2.9973348295693647, "grad_norm": 0.01836164854466915, "learning_rate": 4.441950717725722e-08, "loss": 0.0143, "step": 106840 }, { "epoch": 2.997615373825221, "grad_norm": 0.051848605275154114, "learning_rate": 3.974376957965119e-08, "loss": 0.0078, "step": 106850 }, { "epoch": 2.997895918081077, "grad_norm": 0.2767074406147003, "learning_rate": 3.506803198204517e-08, "loss": 0.013, "step": 106860 }, { "epoch": 2.9981764623369336, "grad_norm": 0.15412354469299316, "learning_rate": 3.0392294384439144e-08, "loss": 0.0071, "step": 106870 }, { "epoch": 2.99845700659279, "grad_norm": 1.3813947439193726, "learning_rate": 2.5716556786833127e-08, "loss": 0.0194, "step": 106880 }, { "epoch": 2.9987375508486465, "grad_norm": 0.5003724098205566, "learning_rate": 2.1040819189227102e-08, "loss": 0.0148, "step": 106890 }, { "epoch": 2.999018095104503, "grad_norm": 0.11276789754629135, "learning_rate": 1.6365081591621078e-08, "loss": 0.0078, "step": 106900 }, { "epoch": 2.999298639360359, "grad_norm": 1.2795422077178955, "learning_rate": 1.1689343994015057e-08, "loss": 0.021, "step": 106910 }, { "epoch": 2.9995791836162153, "grad_norm": 1.4640471935272217, "learning_rate": 7.013606396409034e-09, "loss": 0.0308, "step": 106920 }, { "epoch": 2.999859727872072, "grad_norm": 0.542366087436676, "learning_rate": 2.3378687988030113e-09, "loss": 0.0161, "step": 106930 } ], "logging_steps": 10, "max_steps": 106935, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.588658407821248e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }