diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10759 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 15310, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006531678641410843, + "grad_norm": 50.13682207454293, + "learning_rate": 1.948051948051948e-06, + "loss": 1.429, + "step": 10 + }, + { + "epoch": 0.0013063357282821686, + "grad_norm": 6.483749028520749, + "learning_rate": 3.896103896103896e-06, + "loss": 1.4295, + "step": 20 + }, + { + "epoch": 0.001959503592423253, + "grad_norm": 11.258923869500972, + "learning_rate": 5.844155844155845e-06, + "loss": 1.4339, + "step": 30 + }, + { + "epoch": 0.002612671456564337, + "grad_norm": 6.7183749781782875, + "learning_rate": 7.792207792207792e-06, + "loss": 1.4304, + "step": 40 + }, + { + "epoch": 0.0032658393207054214, + "grad_norm": 11.099196437583155, + "learning_rate": 9.74025974025974e-06, + "loss": 1.3604, + "step": 50 + }, + { + "epoch": 0.003919007184846506, + "grad_norm": 15.685328846269847, + "learning_rate": 1.168831168831169e-05, + "loss": 1.3311, + "step": 60 + }, + { + "epoch": 0.0045721750489875895, + "grad_norm": 14.152624074300729, + "learning_rate": 1.3636363636363637e-05, + "loss": 1.1945, + "step": 70 + }, + { + "epoch": 0.005225342913128674, + "grad_norm": 9.464879217329338, + "learning_rate": 1.5584415584415583e-05, + "loss": 1.0984, + "step": 80 + }, + { + "epoch": 0.005878510777269758, + "grad_norm": 4.346270152613842, + "learning_rate": 1.753246753246753e-05, + "loss": 1.0648, + "step": 90 + }, + { + "epoch": 0.006531678641410843, + "grad_norm": 7.209147883203971, + "learning_rate": 1.948051948051948e-05, + "loss": 1.0054, + "step": 100 + }, + { + "epoch": 0.007184846505551927, + "grad_norm": 7.838309905384678, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.9712, + "step": 110 + }, + { + "epoch": 0.007838014369693011, + "grad_norm": 3.9193862374214126, + "learning_rate": 2.337662337662338e-05, + "loss": 0.9499, + "step": 120 + }, + { + "epoch": 0.008491182233834096, + "grad_norm": 8.131745340798473, + "learning_rate": 2.5324675324675325e-05, + "loss": 0.8816, + "step": 130 + }, + { + "epoch": 0.009144350097975179, + "grad_norm": 10.711063220109187, + "learning_rate": 2.7272727272727273e-05, + "loss": 0.8879, + "step": 140 + }, + { + "epoch": 0.009797517962116264, + "grad_norm": 8.374944885983533, + "learning_rate": 2.922077922077922e-05, + "loss": 0.8724, + "step": 150 + }, + { + "epoch": 0.010450685826257348, + "grad_norm": 14.67427097049884, + "learning_rate": 2.9999988399031158e-05, + "loss": 0.845, + "step": 160 + }, + { + "epoch": 0.011103853690398433, + "grad_norm": 8.900791570201601, + "learning_rate": 2.9999917504286554e-05, + "loss": 0.8545, + "step": 170 + }, + { + "epoch": 0.011757021554539516, + "grad_norm": 4.858043034268298, + "learning_rate": 2.999978216008427e-05, + "loss": 0.8396, + "step": 180 + }, + { + "epoch": 0.012410189418680601, + "grad_norm": 13.951574224432301, + "learning_rate": 2.999958236700584e-05, + "loss": 0.8306, + "step": 190 + }, + { + "epoch": 0.013063357282821686, + "grad_norm": 9.887416710966955, + "learning_rate": 2.99993181259097e-05, + "loss": 0.8278, + "step": 200 + }, + { + "epoch": 0.013716525146962769, + "grad_norm": 3.113643304504475, + "learning_rate": 2.9998989437931214e-05, + "loss": 0.8318, + "step": 210 + }, + { + "epoch": 0.014369693011103853, + "grad_norm": 3.547332330871949, + "learning_rate": 2.999859630448263e-05, + "loss": 0.831, + "step": 220 + }, + { + "epoch": 0.015022860875244938, + "grad_norm": 8.422255470301957, + "learning_rate": 2.999813872725311e-05, + "loss": 0.8438, + "step": 230 + }, + { + "epoch": 0.015676028739386023, + "grad_norm": 966.0900579138572, + "learning_rate": 2.9997616708208702e-05, + "loss": 0.7792, + "step": 240 + }, + { + "epoch": 0.016329196603527107, + "grad_norm": 9.933693210645407, + "learning_rate": 2.9997030249592345e-05, + "loss": 0.8272, + "step": 250 + }, + { + "epoch": 0.016982364467668192, + "grad_norm": 4.180403660535027, + "learning_rate": 2.9996379353923846e-05, + "loss": 0.8057, + "step": 260 + }, + { + "epoch": 0.017635532331809273, + "grad_norm": 3.9202496933723787, + "learning_rate": 2.999566402399988e-05, + "loss": 0.8032, + "step": 270 + }, + { + "epoch": 0.018288700195950358, + "grad_norm": 7.743246766273075, + "learning_rate": 2.9994884262893974e-05, + "loss": 0.8045, + "step": 280 + }, + { + "epoch": 0.018941868060091443, + "grad_norm": 3.6560068946289226, + "learning_rate": 2.9994040073956487e-05, + "loss": 0.8272, + "step": 290 + }, + { + "epoch": 0.019595035924232528, + "grad_norm": 3.92164288932011, + "learning_rate": 2.9993131460814615e-05, + "loss": 0.8225, + "step": 300 + }, + { + "epoch": 0.020248203788373612, + "grad_norm": 8.75806817966988, + "learning_rate": 2.9992158427372346e-05, + "loss": 0.8036, + "step": 310 + }, + { + "epoch": 0.020901371652514697, + "grad_norm": 4.701880610395596, + "learning_rate": 2.999112097781047e-05, + "loss": 0.7996, + "step": 320 + }, + { + "epoch": 0.02155453951665578, + "grad_norm": 7.380549266606561, + "learning_rate": 2.9990019116586555e-05, + "loss": 0.8004, + "step": 330 + }, + { + "epoch": 0.022207707380796866, + "grad_norm": 5.255075090708445, + "learning_rate": 2.998885284843491e-05, + "loss": 0.8035, + "step": 340 + }, + { + "epoch": 0.022860875244937948, + "grad_norm": 10.529978870685822, + "learning_rate": 2.9987622178366593e-05, + "loss": 0.7879, + "step": 350 + }, + { + "epoch": 0.023514043109079032, + "grad_norm": 25.688231817434417, + "learning_rate": 2.998632711166936e-05, + "loss": 0.8008, + "step": 360 + }, + { + "epoch": 0.024167210973220117, + "grad_norm": 5.948674658618657, + "learning_rate": 2.998496765390767e-05, + "loss": 0.791, + "step": 370 + }, + { + "epoch": 0.024820378837361202, + "grad_norm": 13.793404614679359, + "learning_rate": 2.998354381092264e-05, + "loss": 0.8319, + "step": 380 + }, + { + "epoch": 0.025473546701502287, + "grad_norm": 4.171433100352656, + "learning_rate": 2.9982055588832035e-05, + "loss": 0.8031, + "step": 390 + }, + { + "epoch": 0.02612671456564337, + "grad_norm": 10.096197156428968, + "learning_rate": 2.9980502994030224e-05, + "loss": 0.7615, + "step": 400 + }, + { + "epoch": 0.026779882429784456, + "grad_norm": 30.522248705094206, + "learning_rate": 2.9978886033188174e-05, + "loss": 0.7885, + "step": 410 + }, + { + "epoch": 0.027433050293925537, + "grad_norm": 6.45218649775459, + "learning_rate": 2.997720471325341e-05, + "loss": 0.776, + "step": 420 + }, + { + "epoch": 0.028086218158066622, + "grad_norm": 4.498444394274958, + "learning_rate": 2.9975459041449976e-05, + "loss": 0.8379, + "step": 430 + }, + { + "epoch": 0.028739386022207707, + "grad_norm": 2.914112407676264, + "learning_rate": 2.997364902527842e-05, + "loss": 0.8202, + "step": 440 + }, + { + "epoch": 0.02939255388634879, + "grad_norm": 3.68499903809575, + "learning_rate": 2.997177467251576e-05, + "loss": 0.79, + "step": 450 + }, + { + "epoch": 0.030045721750489876, + "grad_norm": 3.434793273613202, + "learning_rate": 2.996983599121544e-05, + "loss": 0.8031, + "step": 460 + }, + { + "epoch": 0.03069888961463096, + "grad_norm": 10.108430844869435, + "learning_rate": 2.99678329897073e-05, + "loss": 0.7911, + "step": 470 + }, + { + "epoch": 0.031352057478772045, + "grad_norm": 10.811615130428738, + "learning_rate": 2.996576567659755e-05, + "loss": 0.7627, + "step": 480 + }, + { + "epoch": 0.03200522534291313, + "grad_norm": 5.292945867397493, + "learning_rate": 2.9963634060768714e-05, + "loss": 0.7886, + "step": 490 + }, + { + "epoch": 0.032658393207054215, + "grad_norm": 5.789433805885046, + "learning_rate": 2.996143815137961e-05, + "loss": 0.7936, + "step": 500 + }, + { + "epoch": 0.033311561071195296, + "grad_norm": 4.780827921269806, + "learning_rate": 2.9959177957865304e-05, + "loss": 0.7361, + "step": 510 + }, + { + "epoch": 0.033964728935336384, + "grad_norm": 3.19844524641641, + "learning_rate": 2.9956853489937063e-05, + "loss": 0.7536, + "step": 520 + }, + { + "epoch": 0.034617896799477466, + "grad_norm": 58.45926650107535, + "learning_rate": 2.9954464757582325e-05, + "loss": 0.7816, + "step": 530 + }, + { + "epoch": 0.03527106466361855, + "grad_norm": 7.287791706613491, + "learning_rate": 2.995201177106464e-05, + "loss": 0.7597, + "step": 540 + }, + { + "epoch": 0.035924232527759635, + "grad_norm": 5.025544527071579, + "learning_rate": 2.9949494540923645e-05, + "loss": 0.7975, + "step": 550 + }, + { + "epoch": 0.036577400391900716, + "grad_norm": 16.859946084801752, + "learning_rate": 2.9946913077975016e-05, + "loss": 0.752, + "step": 560 + }, + { + "epoch": 0.037230568256041804, + "grad_norm": 9.540096992303516, + "learning_rate": 2.9944267393310395e-05, + "loss": 0.7701, + "step": 570 + }, + { + "epoch": 0.037883736120182886, + "grad_norm": 3.905591008866653, + "learning_rate": 2.994155749829738e-05, + "loss": 0.7681, + "step": 580 + }, + { + "epoch": 0.038536903984323974, + "grad_norm": 5.130639113979418, + "learning_rate": 2.993878340457945e-05, + "loss": 0.7719, + "step": 590 + }, + { + "epoch": 0.039190071848465055, + "grad_norm": 6.976901500022458, + "learning_rate": 2.9935945124075926e-05, + "loss": 0.8087, + "step": 600 + }, + { + "epoch": 0.039843239712606136, + "grad_norm": 3.2874593928044766, + "learning_rate": 2.9933042668981924e-05, + "loss": 0.7503, + "step": 610 + }, + { + "epoch": 0.040496407576747225, + "grad_norm": 6.152264074384876, + "learning_rate": 2.993007605176828e-05, + "loss": 0.7865, + "step": 620 + }, + { + "epoch": 0.041149575440888306, + "grad_norm": 4.05841892368891, + "learning_rate": 2.9927045285181526e-05, + "loss": 0.8067, + "step": 630 + }, + { + "epoch": 0.041802743305029394, + "grad_norm": 8.56078728023433, + "learning_rate": 2.992395038224382e-05, + "loss": 0.7806, + "step": 640 + }, + { + "epoch": 0.042455911169170475, + "grad_norm": 4.239768109368226, + "learning_rate": 2.992079135625289e-05, + "loss": 0.7711, + "step": 650 + }, + { + "epoch": 0.04310907903331156, + "grad_norm": 8.718034174803794, + "learning_rate": 2.9917568220781976e-05, + "loss": 0.7526, + "step": 660 + }, + { + "epoch": 0.043762246897452645, + "grad_norm": 6.8733904254859635, + "learning_rate": 2.9914280989679778e-05, + "loss": 0.7822, + "step": 670 + }, + { + "epoch": 0.04441541476159373, + "grad_norm": 4.682363111426855, + "learning_rate": 2.9910929677070387e-05, + "loss": 0.7724, + "step": 680 + }, + { + "epoch": 0.045068582625734814, + "grad_norm": 6.781911228833026, + "learning_rate": 2.9907514297353243e-05, + "loss": 0.7858, + "step": 690 + }, + { + "epoch": 0.045721750489875895, + "grad_norm": 4.18015613246061, + "learning_rate": 2.9904034865203037e-05, + "loss": 0.7328, + "step": 700 + }, + { + "epoch": 0.046374918354016983, + "grad_norm": 8.73866715173814, + "learning_rate": 2.99004913955697e-05, + "loss": 0.7489, + "step": 710 + }, + { + "epoch": 0.047028086218158065, + "grad_norm": 2.1158891476329287, + "learning_rate": 2.9896883903678288e-05, + "loss": 0.7838, + "step": 720 + }, + { + "epoch": 0.04768125408229915, + "grad_norm": 4.40400562888192, + "learning_rate": 2.9893212405028946e-05, + "loss": 0.7756, + "step": 730 + }, + { + "epoch": 0.048334421946440234, + "grad_norm": 3.2733836021629332, + "learning_rate": 2.9889476915396834e-05, + "loss": 0.8157, + "step": 740 + }, + { + "epoch": 0.04898758981058132, + "grad_norm": 9.691053563211746, + "learning_rate": 2.9885677450832064e-05, + "loss": 0.7459, + "step": 750 + }, + { + "epoch": 0.049640757674722404, + "grad_norm": 5.514889762655123, + "learning_rate": 2.9881814027659618e-05, + "loss": 0.7647, + "step": 760 + }, + { + "epoch": 0.050293925538863485, + "grad_norm": 5.300888116668702, + "learning_rate": 2.9877886662479287e-05, + "loss": 0.7747, + "step": 770 + }, + { + "epoch": 0.05094709340300457, + "grad_norm": 5.100283311940675, + "learning_rate": 2.98738953721656e-05, + "loss": 0.8026, + "step": 780 + }, + { + "epoch": 0.051600261267145654, + "grad_norm": 3.765640195443926, + "learning_rate": 2.986984017386776e-05, + "loss": 0.7805, + "step": 790 + }, + { + "epoch": 0.05225342913128674, + "grad_norm": 8.129135253261586, + "learning_rate": 2.986572108500954e-05, + "loss": 0.7925, + "step": 800 + }, + { + "epoch": 0.052906596995427824, + "grad_norm": 6.170634084606177, + "learning_rate": 2.9861538123289246e-05, + "loss": 0.7931, + "step": 810 + }, + { + "epoch": 0.05355976485956891, + "grad_norm": 8.434072292757108, + "learning_rate": 2.9857291306679617e-05, + "loss": 0.8074, + "step": 820 + }, + { + "epoch": 0.05421293272370999, + "grad_norm": 3.0182177214560464, + "learning_rate": 2.985298065342776e-05, + "loss": 0.7815, + "step": 830 + }, + { + "epoch": 0.054866100587851074, + "grad_norm": 4.281090058172293, + "learning_rate": 2.984860618205505e-05, + "loss": 0.7628, + "step": 840 + }, + { + "epoch": 0.05551926845199216, + "grad_norm": 7.669782613486172, + "learning_rate": 2.9844167911357088e-05, + "loss": 0.8085, + "step": 850 + }, + { + "epoch": 0.056172436316133244, + "grad_norm": 10.278577834140307, + "learning_rate": 2.983966586040358e-05, + "loss": 0.7776, + "step": 860 + }, + { + "epoch": 0.05682560418027433, + "grad_norm": 5.472825455483349, + "learning_rate": 2.9835100048538293e-05, + "loss": 0.7852, + "step": 870 + }, + { + "epoch": 0.05747877204441541, + "grad_norm": 18.571831717674197, + "learning_rate": 2.9830470495378928e-05, + "loss": 0.7604, + "step": 880 + }, + { + "epoch": 0.0581319399085565, + "grad_norm": 45.51617747690787, + "learning_rate": 2.9825777220817087e-05, + "loss": 0.7829, + "step": 890 + }, + { + "epoch": 0.05878510777269758, + "grad_norm": 21.44711612975514, + "learning_rate": 2.9821020245018137e-05, + "loss": 0.7728, + "step": 900 + }, + { + "epoch": 0.05943827563683867, + "grad_norm": 2.5662956039754348, + "learning_rate": 2.981619958842116e-05, + "loss": 0.8107, + "step": 910 + }, + { + "epoch": 0.06009144350097975, + "grad_norm": 6.927752270821966, + "learning_rate": 2.9811315271738854e-05, + "loss": 0.7859, + "step": 920 + }, + { + "epoch": 0.06074461136512083, + "grad_norm": 4.418111864600093, + "learning_rate": 2.9806367315957434e-05, + "loss": 0.7631, + "step": 930 + }, + { + "epoch": 0.06139777922926192, + "grad_norm": 27.811599133935445, + "learning_rate": 2.980135574233656e-05, + "loss": 0.7858, + "step": 940 + }, + { + "epoch": 0.062050947093403, + "grad_norm": 1.5924527741860024, + "learning_rate": 2.979628057240923e-05, + "loss": 0.7261, + "step": 950 + }, + { + "epoch": 0.06270411495754409, + "grad_norm": 2.631126701746248, + "learning_rate": 2.9791141827981684e-05, + "loss": 0.7734, + "step": 960 + }, + { + "epoch": 0.06335728282168518, + "grad_norm": 9.827368484273071, + "learning_rate": 2.9785939531133343e-05, + "loss": 0.7732, + "step": 970 + }, + { + "epoch": 0.06401045068582625, + "grad_norm": 3.482548940825404, + "learning_rate": 2.978067370421667e-05, + "loss": 0.762, + "step": 980 + }, + { + "epoch": 0.06466361854996734, + "grad_norm": 4.301384574294822, + "learning_rate": 2.9775344369857102e-05, + "loss": 0.7824, + "step": 990 + }, + { + "epoch": 0.06531678641410843, + "grad_norm": 6.880269476026657, + "learning_rate": 2.976995155095295e-05, + "loss": 0.7596, + "step": 1000 + }, + { + "epoch": 0.0659699542782495, + "grad_norm": 4.441018871903774, + "learning_rate": 2.9764495270675286e-05, + "loss": 0.7105, + "step": 1010 + }, + { + "epoch": 0.06662312214239059, + "grad_norm": 5.327517477621752, + "learning_rate": 2.975897555246786e-05, + "loss": 0.8219, + "step": 1020 + }, + { + "epoch": 0.06727629000653168, + "grad_norm": 6.659492383519359, + "learning_rate": 2.9753392420047e-05, + "loss": 0.7628, + "step": 1030 + }, + { + "epoch": 0.06792945787067277, + "grad_norm": 3.460680812160345, + "learning_rate": 2.9747745897401487e-05, + "loss": 0.7538, + "step": 1040 + }, + { + "epoch": 0.06858262573481384, + "grad_norm": 5.411519563956954, + "learning_rate": 2.9742036008792472e-05, + "loss": 0.7394, + "step": 1050 + }, + { + "epoch": 0.06923579359895493, + "grad_norm": 18.5454870281513, + "learning_rate": 2.9736262778753382e-05, + "loss": 0.7538, + "step": 1060 + }, + { + "epoch": 0.06988896146309602, + "grad_norm": 3.491031463918346, + "learning_rate": 2.9730426232089786e-05, + "loss": 0.7784, + "step": 1070 + }, + { + "epoch": 0.0705421293272371, + "grad_norm": 5.651477971498172, + "learning_rate": 2.9724526393879303e-05, + "loss": 0.7284, + "step": 1080 + }, + { + "epoch": 0.07119529719137818, + "grad_norm": 6.328779021951344, + "learning_rate": 2.9718563289471506e-05, + "loss": 0.7461, + "step": 1090 + }, + { + "epoch": 0.07184846505551927, + "grad_norm": 6.698982150067563, + "learning_rate": 2.9712536944487777e-05, + "loss": 0.7337, + "step": 1100 + }, + { + "epoch": 0.07250163291966036, + "grad_norm": 8.045318038088014, + "learning_rate": 2.970644738482125e-05, + "loss": 0.7653, + "step": 1110 + }, + { + "epoch": 0.07315480078380143, + "grad_norm": 7.165574308433186, + "learning_rate": 2.9700294636636652e-05, + "loss": 0.7282, + "step": 1120 + }, + { + "epoch": 0.07380796864794252, + "grad_norm": 28.043634475051775, + "learning_rate": 2.9694078726370218e-05, + "loss": 0.7691, + "step": 1130 + }, + { + "epoch": 0.07446113651208361, + "grad_norm": 3.415324032603739, + "learning_rate": 2.9687799680729552e-05, + "loss": 0.7767, + "step": 1140 + }, + { + "epoch": 0.07511430437622468, + "grad_norm": 5.9188102098763995, + "learning_rate": 2.9681457526693553e-05, + "loss": 0.7984, + "step": 1150 + }, + { + "epoch": 0.07576747224036577, + "grad_norm": 12.190132678391851, + "learning_rate": 2.9675052291512262e-05, + "loss": 0.7411, + "step": 1160 + }, + { + "epoch": 0.07642064010450686, + "grad_norm": 3.8362362001431265, + "learning_rate": 2.966858400270676e-05, + "loss": 0.7234, + "step": 1170 + }, + { + "epoch": 0.07707380796864795, + "grad_norm": 7.046244861654371, + "learning_rate": 2.966205268806904e-05, + "loss": 0.7348, + "step": 1180 + }, + { + "epoch": 0.07772697583278902, + "grad_norm": 3.5760388018830613, + "learning_rate": 2.9655458375661913e-05, + "loss": 0.7434, + "step": 1190 + }, + { + "epoch": 0.07838014369693011, + "grad_norm": 4.7886859191892714, + "learning_rate": 2.9648801093818846e-05, + "loss": 0.7574, + "step": 1200 + }, + { + "epoch": 0.0790333115610712, + "grad_norm": 3.8124034431612475, + "learning_rate": 2.964208087114389e-05, + "loss": 0.7775, + "step": 1210 + }, + { + "epoch": 0.07968647942521227, + "grad_norm": 4.986098227783862, + "learning_rate": 2.9635297736511505e-05, + "loss": 0.7407, + "step": 1220 + }, + { + "epoch": 0.08033964728935336, + "grad_norm": 5.391683192672441, + "learning_rate": 2.962845171906648e-05, + "loss": 0.7119, + "step": 1230 + }, + { + "epoch": 0.08099281515349445, + "grad_norm": 5.190092794658212, + "learning_rate": 2.9621542848223787e-05, + "loss": 0.7643, + "step": 1240 + }, + { + "epoch": 0.08164598301763554, + "grad_norm": 17.85573580184784, + "learning_rate": 2.961457115366845e-05, + "loss": 0.771, + "step": 1250 + }, + { + "epoch": 0.08229915088177661, + "grad_norm": 3.112489760048809, + "learning_rate": 2.960753666535543e-05, + "loss": 0.7856, + "step": 1260 + }, + { + "epoch": 0.0829523187459177, + "grad_norm": 8.048692812727564, + "learning_rate": 2.9600439413509496e-05, + "loss": 0.7928, + "step": 1270 + }, + { + "epoch": 0.08360548661005879, + "grad_norm": 6.309739519597919, + "learning_rate": 2.9593279428625078e-05, + "loss": 0.7604, + "step": 1280 + }, + { + "epoch": 0.08425865447419988, + "grad_norm": 6.314599622778636, + "learning_rate": 2.958605674146615e-05, + "loss": 0.7681, + "step": 1290 + }, + { + "epoch": 0.08491182233834095, + "grad_norm": 6.952462496966706, + "learning_rate": 2.9578771383066117e-05, + "loss": 0.7606, + "step": 1300 + }, + { + "epoch": 0.08556499020248204, + "grad_norm": 5.972111738893218, + "learning_rate": 2.9571423384727632e-05, + "loss": 0.7344, + "step": 1310 + }, + { + "epoch": 0.08621815806662313, + "grad_norm": 6.369021435943961, + "learning_rate": 2.9564012778022506e-05, + "loss": 0.7873, + "step": 1320 + }, + { + "epoch": 0.0868713259307642, + "grad_norm": 8.110777158554084, + "learning_rate": 2.955653959479155e-05, + "loss": 0.7646, + "step": 1330 + }, + { + "epoch": 0.08752449379490529, + "grad_norm": 4.019702548129581, + "learning_rate": 2.9549003867144453e-05, + "loss": 0.7408, + "step": 1340 + }, + { + "epoch": 0.08817766165904638, + "grad_norm": 13.973111625931505, + "learning_rate": 2.9541405627459627e-05, + "loss": 0.6958, + "step": 1350 + }, + { + "epoch": 0.08883082952318747, + "grad_norm": 2.718868998411597, + "learning_rate": 2.9533744908384074e-05, + "loss": 0.7668, + "step": 1360 + }, + { + "epoch": 0.08948399738732854, + "grad_norm": 2.8832063069963643, + "learning_rate": 2.9526021742833267e-05, + "loss": 0.7744, + "step": 1370 + }, + { + "epoch": 0.09013716525146963, + "grad_norm": 2.802666831448759, + "learning_rate": 2.951823616399097e-05, + "loss": 0.7265, + "step": 1380 + }, + { + "epoch": 0.09079033311561072, + "grad_norm": 7.0402793044257965, + "learning_rate": 2.9510388205309123e-05, + "loss": 0.7341, + "step": 1390 + }, + { + "epoch": 0.09144350097975179, + "grad_norm": 3.9078944292597333, + "learning_rate": 2.9502477900507694e-05, + "loss": 0.7243, + "step": 1400 + }, + { + "epoch": 0.09209666884389288, + "grad_norm": 3.570504815662888, + "learning_rate": 2.949450528357452e-05, + "loss": 0.7714, + "step": 1410 + }, + { + "epoch": 0.09274983670803397, + "grad_norm": 3.261257189226981, + "learning_rate": 2.9486470388765183e-05, + "loss": 0.784, + "step": 1420 + }, + { + "epoch": 0.09340300457217506, + "grad_norm": 8.416258345667368, + "learning_rate": 2.9478373250602844e-05, + "loss": 0.7665, + "step": 1430 + }, + { + "epoch": 0.09405617243631613, + "grad_norm": 4.735214914280431, + "learning_rate": 2.9470213903878108e-05, + "loss": 0.725, + "step": 1440 + }, + { + "epoch": 0.09470934030045722, + "grad_norm": 4.961243146957466, + "learning_rate": 2.946199238364887e-05, + "loss": 0.7589, + "step": 1450 + }, + { + "epoch": 0.0953625081645983, + "grad_norm": 4.324872388776254, + "learning_rate": 2.9453708725240154e-05, + "loss": 0.7854, + "step": 1460 + }, + { + "epoch": 0.09601567602873938, + "grad_norm": 4.738763468609272, + "learning_rate": 2.944536296424397e-05, + "loss": 0.7405, + "step": 1470 + }, + { + "epoch": 0.09666884389288047, + "grad_norm": 22.601514566359473, + "learning_rate": 2.943695513651918e-05, + "loss": 0.7413, + "step": 1480 + }, + { + "epoch": 0.09732201175702156, + "grad_norm": 75.19621793286049, + "learning_rate": 2.9428485278191295e-05, + "loss": 0.7446, + "step": 1490 + }, + { + "epoch": 0.09797517962116264, + "grad_norm": 5.707450334330352, + "learning_rate": 2.941995342565238e-05, + "loss": 0.7351, + "step": 1500 + }, + { + "epoch": 0.09862834748530372, + "grad_norm": 5.128650388705621, + "learning_rate": 2.941135961556085e-05, + "loss": 0.759, + "step": 1510 + }, + { + "epoch": 0.09928151534944481, + "grad_norm": 8.469700383423309, + "learning_rate": 2.940270388484134e-05, + "loss": 0.7141, + "step": 1520 + }, + { + "epoch": 0.0999346832135859, + "grad_norm": 5.139404894150258, + "learning_rate": 2.939398627068452e-05, + "loss": 0.7814, + "step": 1530 + }, + { + "epoch": 0.10058785107772697, + "grad_norm": 12.839584083356975, + "learning_rate": 2.938520681054697e-05, + "loss": 0.7216, + "step": 1540 + }, + { + "epoch": 0.10124101894186806, + "grad_norm": 12.828728051827055, + "learning_rate": 2.9376365542150997e-05, + "loss": 0.7255, + "step": 1550 + }, + { + "epoch": 0.10189418680600915, + "grad_norm": 3.271346778822822, + "learning_rate": 2.9367462503484467e-05, + "loss": 0.7404, + "step": 1560 + }, + { + "epoch": 0.10254735467015023, + "grad_norm": 5.95331499091922, + "learning_rate": 2.935849773280066e-05, + "loss": 0.7715, + "step": 1570 + }, + { + "epoch": 0.10320052253429131, + "grad_norm": 6.876843846760272, + "learning_rate": 2.9349471268618096e-05, + "loss": 0.7226, + "step": 1580 + }, + { + "epoch": 0.1038536903984324, + "grad_norm": 4.813657150858963, + "learning_rate": 2.9340383149720373e-05, + "loss": 0.7405, + "step": 1590 + }, + { + "epoch": 0.10450685826257348, + "grad_norm": 18.641291309864204, + "learning_rate": 2.9331233415155986e-05, + "loss": 0.7195, + "step": 1600 + }, + { + "epoch": 0.10516002612671456, + "grad_norm": 4.725836680776646, + "learning_rate": 2.9322022104238183e-05, + "loss": 0.7936, + "step": 1610 + }, + { + "epoch": 0.10581319399085565, + "grad_norm": 2.7058577643128507, + "learning_rate": 2.9312749256544788e-05, + "loss": 0.7124, + "step": 1620 + }, + { + "epoch": 0.10646636185499674, + "grad_norm": 2.076137194339343, + "learning_rate": 2.9303414911918015e-05, + "loss": 0.7315, + "step": 1630 + }, + { + "epoch": 0.10711952971913782, + "grad_norm": 3.1215441095992587, + "learning_rate": 2.9294019110464318e-05, + "loss": 0.7024, + "step": 1640 + }, + { + "epoch": 0.1077726975832789, + "grad_norm": 6.738642426914526, + "learning_rate": 2.92845618925542e-05, + "loss": 0.7719, + "step": 1650 + }, + { + "epoch": 0.10842586544741999, + "grad_norm": 61.13561345420421, + "learning_rate": 2.9275043298822054e-05, + "loss": 0.7832, + "step": 1660 + }, + { + "epoch": 0.10907903331156107, + "grad_norm": 5.770332213677231, + "learning_rate": 2.9265463370165997e-05, + "loss": 0.7037, + "step": 1670 + }, + { + "epoch": 0.10973220117570215, + "grad_norm": 4.453443656361965, + "learning_rate": 2.9255822147747658e-05, + "loss": 0.7521, + "step": 1680 + }, + { + "epoch": 0.11038536903984324, + "grad_norm": 3.881967986158134, + "learning_rate": 2.924611967299204e-05, + "loss": 0.7312, + "step": 1690 + }, + { + "epoch": 0.11103853690398433, + "grad_norm": 4.7459812229458604, + "learning_rate": 2.9236355987587325e-05, + "loss": 0.7439, + "step": 1700 + }, + { + "epoch": 0.11169170476812541, + "grad_norm": 3.9650318420593953, + "learning_rate": 2.9226531133484685e-05, + "loss": 0.7815, + "step": 1710 + }, + { + "epoch": 0.11234487263226649, + "grad_norm": 3.287402968914717, + "learning_rate": 2.9216645152898125e-05, + "loss": 0.7322, + "step": 1720 + }, + { + "epoch": 0.11299804049640758, + "grad_norm": 4.898218858819153, + "learning_rate": 2.9206698088304276e-05, + "loss": 0.6943, + "step": 1730 + }, + { + "epoch": 0.11365120836054866, + "grad_norm": 8.742472435174454, + "learning_rate": 2.919668998244225e-05, + "loss": 0.7678, + "step": 1740 + }, + { + "epoch": 0.11430437622468975, + "grad_norm": 11.256465495078574, + "learning_rate": 2.9186620878313404e-05, + "loss": 0.7589, + "step": 1750 + }, + { + "epoch": 0.11495754408883083, + "grad_norm": 4.467024036364152, + "learning_rate": 2.9176490819181196e-05, + "loss": 0.7555, + "step": 1760 + }, + { + "epoch": 0.11561071195297191, + "grad_norm": 7.081866692189798, + "learning_rate": 2.9166299848570993e-05, + "loss": 0.7166, + "step": 1770 + }, + { + "epoch": 0.116263879817113, + "grad_norm": 2.909828362138725, + "learning_rate": 2.9156048010269866e-05, + "loss": 0.7397, + "step": 1780 + }, + { + "epoch": 0.11691704768125408, + "grad_norm": 14.007574484697054, + "learning_rate": 2.9145735348326426e-05, + "loss": 0.7565, + "step": 1790 + }, + { + "epoch": 0.11757021554539517, + "grad_norm": 3.294438333989939, + "learning_rate": 2.9135361907050604e-05, + "loss": 0.7642, + "step": 1800 + }, + { + "epoch": 0.11822338340953625, + "grad_norm": 11.798762007605262, + "learning_rate": 2.9124927731013496e-05, + "loss": 0.7116, + "step": 1810 + }, + { + "epoch": 0.11887655127367734, + "grad_norm": 2.710752541336547, + "learning_rate": 2.9114432865047144e-05, + "loss": 0.7397, + "step": 1820 + }, + { + "epoch": 0.11952971913781842, + "grad_norm": 10.017624011337196, + "learning_rate": 2.9103877354244362e-05, + "loss": 0.7187, + "step": 1830 + }, + { + "epoch": 0.1201828870019595, + "grad_norm": 6.811000553967279, + "learning_rate": 2.9093261243958528e-05, + "loss": 0.7354, + "step": 1840 + }, + { + "epoch": 0.12083605486610059, + "grad_norm": 6.731318583444567, + "learning_rate": 2.908258457980339e-05, + "loss": 0.7384, + "step": 1850 + }, + { + "epoch": 0.12148922273024167, + "grad_norm": 5.388750098539709, + "learning_rate": 2.9071847407652892e-05, + "loss": 0.7755, + "step": 1860 + }, + { + "epoch": 0.12214239059438275, + "grad_norm": 5.4169301286838625, + "learning_rate": 2.9061049773640943e-05, + "loss": 0.7103, + "step": 1870 + }, + { + "epoch": 0.12279555845852384, + "grad_norm": 4.42587627635462, + "learning_rate": 2.905019172416124e-05, + "loss": 0.7438, + "step": 1880 + }, + { + "epoch": 0.12344872632266493, + "grad_norm": 31.054664897790516, + "learning_rate": 2.903927330586707e-05, + "loss": 0.7478, + "step": 1890 + }, + { + "epoch": 0.124101894186806, + "grad_norm": 7.972291406323152, + "learning_rate": 2.9028294565671097e-05, + "loss": 0.7421, + "step": 1900 + }, + { + "epoch": 0.1247550620509471, + "grad_norm": 7.274125571643045, + "learning_rate": 2.9017255550745174e-05, + "loss": 0.7979, + "step": 1910 + }, + { + "epoch": 0.12540822991508818, + "grad_norm": 2.8814760330477016, + "learning_rate": 2.900615630852013e-05, + "loss": 0.7396, + "step": 1920 + }, + { + "epoch": 0.12606139777922926, + "grad_norm": 4.905355189661924, + "learning_rate": 2.8994996886685567e-05, + "loss": 0.7547, + "step": 1930 + }, + { + "epoch": 0.12671456564337036, + "grad_norm": 9.505898100329482, + "learning_rate": 2.8983777333189662e-05, + "loss": 0.7304, + "step": 1940 + }, + { + "epoch": 0.12736773350751143, + "grad_norm": 3.323201006898466, + "learning_rate": 2.8972497696238954e-05, + "loss": 0.697, + "step": 1950 + }, + { + "epoch": 0.1280209013716525, + "grad_norm": 29.662689056629922, + "learning_rate": 2.8961158024298148e-05, + "loss": 0.7271, + "step": 1960 + }, + { + "epoch": 0.1286740692357936, + "grad_norm": 7.41746219554662, + "learning_rate": 2.894975836608989e-05, + "loss": 0.7538, + "step": 1970 + }, + { + "epoch": 0.12932723709993468, + "grad_norm": 4.4427785506369775, + "learning_rate": 2.8938298770594568e-05, + "loss": 0.7547, + "step": 1980 + }, + { + "epoch": 0.12998040496407576, + "grad_norm": 7.768812174099432, + "learning_rate": 2.89267792870501e-05, + "loss": 0.7507, + "step": 1990 + }, + { + "epoch": 0.13063357282821686, + "grad_norm": 6.238069973821311, + "learning_rate": 2.891519996495172e-05, + "loss": 0.7371, + "step": 2000 + }, + { + "epoch": 0.13128674069235793, + "grad_norm": 3.5939260688441923, + "learning_rate": 2.8903560854051777e-05, + "loss": 0.7411, + "step": 2010 + }, + { + "epoch": 0.131939908556499, + "grad_norm": 14.321069542353115, + "learning_rate": 2.8891862004359495e-05, + "loss": 0.7161, + "step": 2020 + }, + { + "epoch": 0.1325930764206401, + "grad_norm": 3.8064655011185975, + "learning_rate": 2.8880103466140798e-05, + "loss": 0.753, + "step": 2030 + }, + { + "epoch": 0.13324624428478118, + "grad_norm": 3.1071775713204413, + "learning_rate": 2.8868285289918044e-05, + "loss": 0.7919, + "step": 2040 + }, + { + "epoch": 0.1338994121489223, + "grad_norm": 7.793511836887719, + "learning_rate": 2.885640752646986e-05, + "loss": 0.7162, + "step": 2050 + }, + { + "epoch": 0.13455258001306336, + "grad_norm": 4.22301736393872, + "learning_rate": 2.8844470226830882e-05, + "loss": 0.7529, + "step": 2060 + }, + { + "epoch": 0.13520574787720444, + "grad_norm": 9.346993486220331, + "learning_rate": 2.883247344229156e-05, + "loss": 0.7674, + "step": 2070 + }, + { + "epoch": 0.13585891574134554, + "grad_norm": 15.933630619441285, + "learning_rate": 2.882041722439793e-05, + "loss": 0.7193, + "step": 2080 + }, + { + "epoch": 0.1365120836054866, + "grad_norm": 4.190682866936528, + "learning_rate": 2.880830162495138e-05, + "loss": 0.7673, + "step": 2090 + }, + { + "epoch": 0.13716525146962769, + "grad_norm": 7.037534897279013, + "learning_rate": 2.8796126696008465e-05, + "loss": 0.7254, + "step": 2100 + }, + { + "epoch": 0.1378184193337688, + "grad_norm": 2.6255399965993043, + "learning_rate": 2.8783892489880636e-05, + "loss": 0.7119, + "step": 2110 + }, + { + "epoch": 0.13847158719790986, + "grad_norm": 42.27865008405173, + "learning_rate": 2.8771599059134048e-05, + "loss": 0.7424, + "step": 2120 + }, + { + "epoch": 0.13912475506205094, + "grad_norm": 5.132678536767327, + "learning_rate": 2.875924645658932e-05, + "loss": 0.7242, + "step": 2130 + }, + { + "epoch": 0.13977792292619204, + "grad_norm": 3.603668060800223, + "learning_rate": 2.874683473532131e-05, + "loss": 0.7573, + "step": 2140 + }, + { + "epoch": 0.1404310907903331, + "grad_norm": 4.072902799746948, + "learning_rate": 2.8734363948658892e-05, + "loss": 0.715, + "step": 2150 + }, + { + "epoch": 0.1410842586544742, + "grad_norm": 22.401733113408323, + "learning_rate": 2.8721834150184728e-05, + "loss": 0.8006, + "step": 2160 + }, + { + "epoch": 0.1417374265186153, + "grad_norm": 6.604851078546388, + "learning_rate": 2.8709245393735028e-05, + "loss": 0.7337, + "step": 2170 + }, + { + "epoch": 0.14239059438275636, + "grad_norm": 52.26699830034364, + "learning_rate": 2.869659773339932e-05, + "loss": 0.7302, + "step": 2180 + }, + { + "epoch": 0.14304376224689747, + "grad_norm": 13.544021605130228, + "learning_rate": 2.8683891223520228e-05, + "loss": 0.7646, + "step": 2190 + }, + { + "epoch": 0.14369693011103854, + "grad_norm": 9.634615764152278, + "learning_rate": 2.8671125918693235e-05, + "loss": 0.7597, + "step": 2200 + }, + { + "epoch": 0.14435009797517961, + "grad_norm": 24.241467420017734, + "learning_rate": 2.865830187376643e-05, + "loss": 0.725, + "step": 2210 + }, + { + "epoch": 0.14500326583932072, + "grad_norm": 5.0945694153152745, + "learning_rate": 2.8645419143840317e-05, + "loss": 0.7535, + "step": 2220 + }, + { + "epoch": 0.1456564337034618, + "grad_norm": 4.106753597412838, + "learning_rate": 2.8632477784267512e-05, + "loss": 0.6977, + "step": 2230 + }, + { + "epoch": 0.14630960156760286, + "grad_norm": 6.34150573419331, + "learning_rate": 2.8619477850652566e-05, + "loss": 0.7453, + "step": 2240 + }, + { + "epoch": 0.14696276943174397, + "grad_norm": 4.133675670803359, + "learning_rate": 2.8606419398851704e-05, + "loss": 0.7155, + "step": 2250 + }, + { + "epoch": 0.14761593729588504, + "grad_norm": 6.462096694964287, + "learning_rate": 2.859330248497257e-05, + "loss": 0.7986, + "step": 2260 + }, + { + "epoch": 0.14826910516002612, + "grad_norm": 6.261836630889005, + "learning_rate": 2.8580127165374016e-05, + "loss": 0.6918, + "step": 2270 + }, + { + "epoch": 0.14892227302416722, + "grad_norm": 2.233205529252193, + "learning_rate": 2.8566893496665826e-05, + "loss": 0.7545, + "step": 2280 + }, + { + "epoch": 0.1495754408883083, + "grad_norm": 6.483858754393392, + "learning_rate": 2.8553601535708498e-05, + "loss": 0.7325, + "step": 2290 + }, + { + "epoch": 0.15022860875244937, + "grad_norm": 5.925565913090568, + "learning_rate": 2.8540251339612986e-05, + "loss": 0.7474, + "step": 2300 + }, + { + "epoch": 0.15088177661659047, + "grad_norm": 8.4240823942178, + "learning_rate": 2.852684296574048e-05, + "loss": 0.7397, + "step": 2310 + }, + { + "epoch": 0.15153494448073154, + "grad_norm": 3.4197951314588866, + "learning_rate": 2.851337647170211e-05, + "loss": 0.7321, + "step": 2320 + }, + { + "epoch": 0.15218811234487264, + "grad_norm": 3.051619840631103, + "learning_rate": 2.849985191535875e-05, + "loss": 0.6953, + "step": 2330 + }, + { + "epoch": 0.15284128020901372, + "grad_norm": 5.255096504929576, + "learning_rate": 2.8486269354820743e-05, + "loss": 0.7239, + "step": 2340 + }, + { + "epoch": 0.1534944480731548, + "grad_norm": 4.604713902237571, + "learning_rate": 2.847262884844765e-05, + "loss": 0.772, + "step": 2350 + }, + { + "epoch": 0.1541476159372959, + "grad_norm": 4.944285498551487, + "learning_rate": 2.8458930454848014e-05, + "loss": 0.7378, + "step": 2360 + }, + { + "epoch": 0.15480078380143697, + "grad_norm": 5.82955432372675, + "learning_rate": 2.8445174232879087e-05, + "loss": 0.7163, + "step": 2370 + }, + { + "epoch": 0.15545395166557804, + "grad_norm": 3.2222937177713744, + "learning_rate": 2.8431360241646605e-05, + "loss": 0.7484, + "step": 2380 + }, + { + "epoch": 0.15610711952971915, + "grad_norm": 9.753891388404778, + "learning_rate": 2.8417488540504504e-05, + "loss": 0.7307, + "step": 2390 + }, + { + "epoch": 0.15676028739386022, + "grad_norm": 3.5077449392766566, + "learning_rate": 2.8403559189054692e-05, + "loss": 0.7638, + "step": 2400 + }, + { + "epoch": 0.1574134552580013, + "grad_norm": 5.776464182538349, + "learning_rate": 2.8389572247146772e-05, + "loss": 0.7199, + "step": 2410 + }, + { + "epoch": 0.1580666231221424, + "grad_norm": 8.250958473708572, + "learning_rate": 2.8375527774877795e-05, + "loss": 0.7389, + "step": 2420 + }, + { + "epoch": 0.15871979098628347, + "grad_norm": 5.5029051650684755, + "learning_rate": 2.8361425832592002e-05, + "loss": 0.7483, + "step": 2430 + }, + { + "epoch": 0.15937295885042455, + "grad_norm": 5.755468339066939, + "learning_rate": 2.8347266480880563e-05, + "loss": 0.7284, + "step": 2440 + }, + { + "epoch": 0.16002612671456565, + "grad_norm": 2.397047503653553, + "learning_rate": 2.833304978058131e-05, + "loss": 0.7442, + "step": 2450 + }, + { + "epoch": 0.16067929457870672, + "grad_norm": 5.380685859554883, + "learning_rate": 2.8318775792778497e-05, + "loss": 0.7466, + "step": 2460 + }, + { + "epoch": 0.16133246244284782, + "grad_norm": 44.37031913032137, + "learning_rate": 2.83044445788025e-05, + "loss": 0.7185, + "step": 2470 + }, + { + "epoch": 0.1619856303069889, + "grad_norm": 66.50805064637852, + "learning_rate": 2.82900562002296e-05, + "loss": 0.7074, + "step": 2480 + }, + { + "epoch": 0.16263879817112997, + "grad_norm": 8.037240610638753, + "learning_rate": 2.827561071888168e-05, + "loss": 0.7179, + "step": 2490 + }, + { + "epoch": 0.16329196603527107, + "grad_norm": 5.618766212652152, + "learning_rate": 2.8261108196825972e-05, + "loss": 0.7523, + "step": 2500 + }, + { + "epoch": 0.16394513389941215, + "grad_norm": 4.727081825163079, + "learning_rate": 2.8246548696374808e-05, + "loss": 0.7568, + "step": 2510 + }, + { + "epoch": 0.16459830176355322, + "grad_norm": 6.317400248396289, + "learning_rate": 2.8231932280085312e-05, + "loss": 0.7294, + "step": 2520 + }, + { + "epoch": 0.16525146962769433, + "grad_norm": 4.105422755599874, + "learning_rate": 2.8217259010759185e-05, + "loss": 0.7123, + "step": 2530 + }, + { + "epoch": 0.1659046374918354, + "grad_norm": 3.912817233451184, + "learning_rate": 2.820252895144238e-05, + "loss": 0.696, + "step": 2540 + }, + { + "epoch": 0.16655780535597647, + "grad_norm": 2.792808858923082, + "learning_rate": 2.8187742165424867e-05, + "loss": 0.6958, + "step": 2550 + }, + { + "epoch": 0.16721097322011758, + "grad_norm": 3.3697500015033515, + "learning_rate": 2.8172898716240358e-05, + "loss": 0.7514, + "step": 2560 + }, + { + "epoch": 0.16786414108425865, + "grad_norm": 12.842632607073732, + "learning_rate": 2.8157998667666014e-05, + "loss": 0.7384, + "step": 2570 + }, + { + "epoch": 0.16851730894839975, + "grad_norm": 3.272066033879194, + "learning_rate": 2.8143042083722196e-05, + "loss": 0.7468, + "step": 2580 + }, + { + "epoch": 0.16917047681254083, + "grad_norm": 7.731601163741944, + "learning_rate": 2.8128029028672165e-05, + "loss": 0.7475, + "step": 2590 + }, + { + "epoch": 0.1698236446766819, + "grad_norm": 3.297949304055716, + "learning_rate": 2.8112959567021837e-05, + "loss": 0.7589, + "step": 2600 + }, + { + "epoch": 0.170476812540823, + "grad_norm": 14.339494194357066, + "learning_rate": 2.809783376351947e-05, + "loss": 0.7546, + "step": 2610 + }, + { + "epoch": 0.17112998040496408, + "grad_norm": 4.46409481839347, + "learning_rate": 2.808265168315541e-05, + "loss": 0.7425, + "step": 2620 + }, + { + "epoch": 0.17178314826910515, + "grad_norm": 9.466307069996041, + "learning_rate": 2.80674133911618e-05, + "loss": 0.7441, + "step": 2630 + }, + { + "epoch": 0.17243631613324625, + "grad_norm": 7.071614823083529, + "learning_rate": 2.805211895301233e-05, + "loss": 0.72, + "step": 2640 + }, + { + "epoch": 0.17308948399738733, + "grad_norm": 4.5445298901654745, + "learning_rate": 2.803676843442189e-05, + "loss": 0.7564, + "step": 2650 + }, + { + "epoch": 0.1737426518615284, + "grad_norm": 4.607374060860952, + "learning_rate": 2.8021361901346356e-05, + "loss": 0.7509, + "step": 2660 + }, + { + "epoch": 0.1743958197256695, + "grad_norm": 3.916261462631034, + "learning_rate": 2.8005899419982276e-05, + "loss": 0.7363, + "step": 2670 + }, + { + "epoch": 0.17504898758981058, + "grad_norm": 4.285270418160644, + "learning_rate": 2.7990381056766583e-05, + "loss": 0.6819, + "step": 2680 + }, + { + "epoch": 0.17570215545395165, + "grad_norm": 30.066405996248537, + "learning_rate": 2.7974806878376312e-05, + "loss": 0.7257, + "step": 2690 + }, + { + "epoch": 0.17635532331809275, + "grad_norm": 6.215186550197811, + "learning_rate": 2.7959176951728326e-05, + "loss": 0.7205, + "step": 2700 + }, + { + "epoch": 0.17700849118223383, + "grad_norm": 6.606929967094239, + "learning_rate": 2.7943491343979012e-05, + "loss": 0.6971, + "step": 2710 + }, + { + "epoch": 0.17766165904637493, + "grad_norm": 4.870287647251215, + "learning_rate": 2.7927750122524004e-05, + "loss": 0.738, + "step": 2720 + }, + { + "epoch": 0.178314826910516, + "grad_norm": 3.840109737338219, + "learning_rate": 2.7911953354997882e-05, + "loss": 0.7033, + "step": 2730 + }, + { + "epoch": 0.17896799477465708, + "grad_norm": 8.663692208470097, + "learning_rate": 2.78961011092739e-05, + "loss": 0.7536, + "step": 2740 + }, + { + "epoch": 0.17962116263879818, + "grad_norm": 8.706739021144726, + "learning_rate": 2.7880193453463664e-05, + "loss": 0.7418, + "step": 2750 + }, + { + "epoch": 0.18027433050293926, + "grad_norm": 4.286484907893715, + "learning_rate": 2.786423045591688e-05, + "loss": 0.7254, + "step": 2760 + }, + { + "epoch": 0.18092749836708033, + "grad_norm": 4.090802478364384, + "learning_rate": 2.7848212185221025e-05, + "loss": 0.7362, + "step": 2770 + }, + { + "epoch": 0.18158066623122143, + "grad_norm": 11.328580271023869, + "learning_rate": 2.783213871020106e-05, + "loss": 0.7241, + "step": 2780 + }, + { + "epoch": 0.1822338340953625, + "grad_norm": 8.48461958671395, + "learning_rate": 2.7816010099919157e-05, + "loss": 0.6719, + "step": 2790 + }, + { + "epoch": 0.18288700195950358, + "grad_norm": 3.2686306933480678, + "learning_rate": 2.7799826423674376e-05, + "loss": 0.7089, + "step": 2800 + }, + { + "epoch": 0.18354016982364468, + "grad_norm": 65.07123500441651, + "learning_rate": 2.7783587751002373e-05, + "loss": 0.7057, + "step": 2810 + }, + { + "epoch": 0.18419333768778576, + "grad_norm": 10.1627349338379, + "learning_rate": 2.776729415167511e-05, + "loss": 0.7236, + "step": 2820 + }, + { + "epoch": 0.18484650555192683, + "grad_norm": 2.5258850888099715, + "learning_rate": 2.7750945695700545e-05, + "loss": 0.7182, + "step": 2830 + }, + { + "epoch": 0.18549967341606793, + "grad_norm": 2.9763625671042457, + "learning_rate": 2.773454245332234e-05, + "loss": 0.7246, + "step": 2840 + }, + { + "epoch": 0.186152841280209, + "grad_norm": 4.674364122198806, + "learning_rate": 2.771808449501956e-05, + "loss": 0.7512, + "step": 2850 + }, + { + "epoch": 0.1868060091443501, + "grad_norm": 8.972997636490977, + "learning_rate": 2.770157189150635e-05, + "loss": 0.7218, + "step": 2860 + }, + { + "epoch": 0.18745917700849118, + "grad_norm": 6.751308738143618, + "learning_rate": 2.7685004713731667e-05, + "loss": 0.7219, + "step": 2870 + }, + { + "epoch": 0.18811234487263226, + "grad_norm": 9.421091936001982, + "learning_rate": 2.766838303287894e-05, + "loss": 0.6971, + "step": 2880 + }, + { + "epoch": 0.18876551273677336, + "grad_norm": 25.71006507749567, + "learning_rate": 2.7651706920365778e-05, + "loss": 0.7211, + "step": 2890 + }, + { + "epoch": 0.18941868060091444, + "grad_norm": 7.097688766178269, + "learning_rate": 2.7634976447843673e-05, + "loss": 0.7289, + "step": 2900 + }, + { + "epoch": 0.1900718484650555, + "grad_norm": 3.012421324402975, + "learning_rate": 2.761819168719768e-05, + "loss": 0.7334, + "step": 2910 + }, + { + "epoch": 0.1907250163291966, + "grad_norm": 6.763816256941492, + "learning_rate": 2.760135271054611e-05, + "loss": 0.7342, + "step": 2920 + }, + { + "epoch": 0.19137818419333769, + "grad_norm": 12.339869717446561, + "learning_rate": 2.7584459590240213e-05, + "loss": 0.7673, + "step": 2930 + }, + { + "epoch": 0.19203135205747876, + "grad_norm": 9.943986419226533, + "learning_rate": 2.75675123988639e-05, + "loss": 0.7316, + "step": 2940 + }, + { + "epoch": 0.19268451992161986, + "grad_norm": 3.9625775254424487, + "learning_rate": 2.7550511209233377e-05, + "loss": 0.7656, + "step": 2950 + }, + { + "epoch": 0.19333768778576094, + "grad_norm": 6.646184770465973, + "learning_rate": 2.753345609439689e-05, + "loss": 0.7096, + "step": 2960 + }, + { + "epoch": 0.19399085564990204, + "grad_norm": 3.363167542629984, + "learning_rate": 2.751634712763435e-05, + "loss": 0.773, + "step": 2970 + }, + { + "epoch": 0.1946440235140431, + "grad_norm": 9.64459690768457, + "learning_rate": 2.749918438245709e-05, + "loss": 0.7521, + "step": 2980 + }, + { + "epoch": 0.1952971913781842, + "grad_norm": 7.66917331463007, + "learning_rate": 2.7481967932607478e-05, + "loss": 0.7682, + "step": 2990 + }, + { + "epoch": 0.1959503592423253, + "grad_norm": 7.4386800903665975, + "learning_rate": 2.7464697852058648e-05, + "loss": 0.6853, + "step": 3000 + }, + { + "epoch": 0.19660352710646636, + "grad_norm": 5.948601553568303, + "learning_rate": 2.7447374215014157e-05, + "loss": 0.7079, + "step": 3010 + }, + { + "epoch": 0.19725669497060744, + "grad_norm": 4.017629160392304, + "learning_rate": 2.742999709590769e-05, + "loss": 0.6856, + "step": 3020 + }, + { + "epoch": 0.19790986283474854, + "grad_norm": 6.6087233490439905, + "learning_rate": 2.741256656940272e-05, + "loss": 0.7229, + "step": 3030 + }, + { + "epoch": 0.19856303069888961, + "grad_norm": 5.738147627571181, + "learning_rate": 2.7395082710392183e-05, + "loss": 0.721, + "step": 3040 + }, + { + "epoch": 0.1992161985630307, + "grad_norm": 4.753298367179536, + "learning_rate": 2.7377545593998178e-05, + "loss": 0.7019, + "step": 3050 + }, + { + "epoch": 0.1998693664271718, + "grad_norm": 6.611797204555651, + "learning_rate": 2.7359955295571624e-05, + "loss": 0.7115, + "step": 3060 + }, + { + "epoch": 0.20052253429131286, + "grad_norm": 4.432909888406863, + "learning_rate": 2.7342311890691957e-05, + "loss": 0.7427, + "step": 3070 + }, + { + "epoch": 0.20117570215545394, + "grad_norm": 12.633636844045604, + "learning_rate": 2.7324615455166778e-05, + "loss": 0.7014, + "step": 3080 + }, + { + "epoch": 0.20182887001959504, + "grad_norm": 3.514345470279303, + "learning_rate": 2.7306866065031562e-05, + "loss": 0.7306, + "step": 3090 + }, + { + "epoch": 0.20248203788373612, + "grad_norm": 7.274767695647385, + "learning_rate": 2.728906379654929e-05, + "loss": 0.7801, + "step": 3100 + }, + { + "epoch": 0.20313520574787722, + "grad_norm": 3.0139129997105454, + "learning_rate": 2.727120872621015e-05, + "loss": 0.7143, + "step": 3110 + }, + { + "epoch": 0.2037883736120183, + "grad_norm": 5.802657459099795, + "learning_rate": 2.7253300930731212e-05, + "loss": 0.7374, + "step": 3120 + }, + { + "epoch": 0.20444154147615937, + "grad_norm": 3.3669875366596598, + "learning_rate": 2.7235340487056074e-05, + "loss": 0.7172, + "step": 3130 + }, + { + "epoch": 0.20509470934030047, + "grad_norm": 5.503009319660073, + "learning_rate": 2.7217327472354555e-05, + "loss": 0.7321, + "step": 3140 + }, + { + "epoch": 0.20574787720444154, + "grad_norm": 5.726618455558819, + "learning_rate": 2.7199261964022345e-05, + "loss": 0.7416, + "step": 3150 + }, + { + "epoch": 0.20640104506858262, + "grad_norm": 14.751272372954604, + "learning_rate": 2.7181144039680688e-05, + "loss": 0.732, + "step": 3160 + }, + { + "epoch": 0.20705421293272372, + "grad_norm": 6.756690771244652, + "learning_rate": 2.7162973777176033e-05, + "loss": 0.7102, + "step": 3170 + }, + { + "epoch": 0.2077073807968648, + "grad_norm": 5.508517519150598, + "learning_rate": 2.7144751254579727e-05, + "loss": 0.7163, + "step": 3180 + }, + { + "epoch": 0.20836054866100587, + "grad_norm": 3.295298267114904, + "learning_rate": 2.7126476550187635e-05, + "loss": 0.7882, + "step": 3190 + }, + { + "epoch": 0.20901371652514697, + "grad_norm": 4.285106842952411, + "learning_rate": 2.7108149742519842e-05, + "loss": 0.7027, + "step": 3200 + }, + { + "epoch": 0.20966688438928804, + "grad_norm": 4.589013804618004, + "learning_rate": 2.7089770910320312e-05, + "loss": 0.7351, + "step": 3210 + }, + { + "epoch": 0.21032005225342912, + "grad_norm": 7.427253910226353, + "learning_rate": 2.7071340132556518e-05, + "loss": 0.7341, + "step": 3220 + }, + { + "epoch": 0.21097322011757022, + "grad_norm": 7.242467453040407, + "learning_rate": 2.7052857488419146e-05, + "loss": 0.7321, + "step": 3230 + }, + { + "epoch": 0.2116263879817113, + "grad_norm": 3.3718873257981823, + "learning_rate": 2.703432305732172e-05, + "loss": 0.7518, + "step": 3240 + }, + { + "epoch": 0.2122795558458524, + "grad_norm": 4.628943334503347, + "learning_rate": 2.701573691890029e-05, + "loss": 0.7681, + "step": 3250 + }, + { + "epoch": 0.21293272370999347, + "grad_norm": 3.692360083654589, + "learning_rate": 2.6997099153013053e-05, + "loss": 0.7438, + "step": 3260 + }, + { + "epoch": 0.21358589157413455, + "grad_norm": 6.026452562923238, + "learning_rate": 2.6978409839740045e-05, + "loss": 0.6755, + "step": 3270 + }, + { + "epoch": 0.21423905943827565, + "grad_norm": 8.388146401979114, + "learning_rate": 2.6959669059382787e-05, + "loss": 0.7152, + "step": 3280 + }, + { + "epoch": 0.21489222730241672, + "grad_norm": 5.489947061474543, + "learning_rate": 2.6940876892463924e-05, + "loss": 0.7331, + "step": 3290 + }, + { + "epoch": 0.2155453951665578, + "grad_norm": 5.238150530271462, + "learning_rate": 2.6922033419726903e-05, + "loss": 0.7216, + "step": 3300 + }, + { + "epoch": 0.2161985630306989, + "grad_norm": 5.76694380401119, + "learning_rate": 2.690313872213561e-05, + "loss": 0.7041, + "step": 3310 + }, + { + "epoch": 0.21685173089483997, + "grad_norm": 4.334325872361498, + "learning_rate": 2.6884192880874018e-05, + "loss": 0.7035, + "step": 3320 + }, + { + "epoch": 0.21750489875898105, + "grad_norm": 3.646615269686379, + "learning_rate": 2.6865195977345864e-05, + "loss": 0.7135, + "step": 3330 + }, + { + "epoch": 0.21815806662312215, + "grad_norm": 3.2031026699869516, + "learning_rate": 2.6846148093174266e-05, + "loss": 0.7478, + "step": 3340 + }, + { + "epoch": 0.21881123448726322, + "grad_norm": 3.294578873577567, + "learning_rate": 2.6827049310201392e-05, + "loss": 0.6966, + "step": 3350 + }, + { + "epoch": 0.2194644023514043, + "grad_norm": 13.531582658656262, + "learning_rate": 2.6807899710488118e-05, + "loss": 0.7714, + "step": 3360 + }, + { + "epoch": 0.2201175702155454, + "grad_norm": 3.1667911772760817, + "learning_rate": 2.6788699376313635e-05, + "loss": 0.6883, + "step": 3370 + }, + { + "epoch": 0.22077073807968647, + "grad_norm": 12.709667365090638, + "learning_rate": 2.6769448390175156e-05, + "loss": 0.7083, + "step": 3380 + }, + { + "epoch": 0.22142390594382758, + "grad_norm": 4.90247272458579, + "learning_rate": 2.67501468347875e-05, + "loss": 0.7254, + "step": 3390 + }, + { + "epoch": 0.22207707380796865, + "grad_norm": 6.586314850084481, + "learning_rate": 2.673079479308277e-05, + "loss": 0.7053, + "step": 3400 + }, + { + "epoch": 0.22273024167210972, + "grad_norm": 8.67302524395751, + "learning_rate": 2.671139234821001e-05, + "loss": 0.7461, + "step": 3410 + }, + { + "epoch": 0.22338340953625083, + "grad_norm": 4.199679298872582, + "learning_rate": 2.669193958353481e-05, + "loss": 0.7108, + "step": 3420 + }, + { + "epoch": 0.2240365774003919, + "grad_norm": 3.7113020225982543, + "learning_rate": 2.6672436582638962e-05, + "loss": 0.7379, + "step": 3430 + }, + { + "epoch": 0.22468974526453298, + "grad_norm": 3.1660823450229953, + "learning_rate": 2.6652883429320127e-05, + "loss": 0.7458, + "step": 3440 + }, + { + "epoch": 0.22534291312867408, + "grad_norm": 4.557885122260074, + "learning_rate": 2.6633280207591434e-05, + "loss": 0.7384, + "step": 3450 + }, + { + "epoch": 0.22599608099281515, + "grad_norm": 3.206803010538826, + "learning_rate": 2.6613627001681156e-05, + "loss": 0.7837, + "step": 3460 + }, + { + "epoch": 0.22664924885695623, + "grad_norm": 5.426168451300572, + "learning_rate": 2.659392389603232e-05, + "loss": 0.7091, + "step": 3470 + }, + { + "epoch": 0.22730241672109733, + "grad_norm": 5.983994434496897, + "learning_rate": 2.6574170975302347e-05, + "loss": 0.7322, + "step": 3480 + }, + { + "epoch": 0.2279555845852384, + "grad_norm": 4.1889732388324346, + "learning_rate": 2.6554368324362716e-05, + "loss": 0.7274, + "step": 3490 + }, + { + "epoch": 0.2286087524493795, + "grad_norm": 4.279384234331947, + "learning_rate": 2.653451602829856e-05, + "loss": 0.7167, + "step": 3500 + }, + { + "epoch": 0.22926192031352058, + "grad_norm": 10.493028415319717, + "learning_rate": 2.6514614172408342e-05, + "loss": 0.7518, + "step": 3510 + }, + { + "epoch": 0.22991508817766165, + "grad_norm": 5.127178380468306, + "learning_rate": 2.649466284220344e-05, + "loss": 0.7087, + "step": 3520 + }, + { + "epoch": 0.23056825604180275, + "grad_norm": 4.447652673443472, + "learning_rate": 2.6474662123407827e-05, + "loss": 0.7088, + "step": 3530 + }, + { + "epoch": 0.23122142390594383, + "grad_norm": 4.424791216908705, + "learning_rate": 2.6454612101957676e-05, + "loss": 0.7059, + "step": 3540 + }, + { + "epoch": 0.2318745917700849, + "grad_norm": 5.883257966824098, + "learning_rate": 2.6434512864000988e-05, + "loss": 0.7456, + "step": 3550 + }, + { + "epoch": 0.232527759634226, + "grad_norm": 4.3204215451597845, + "learning_rate": 2.6414364495897242e-05, + "loss": 0.7413, + "step": 3560 + }, + { + "epoch": 0.23318092749836708, + "grad_norm": 6.554390974464567, + "learning_rate": 2.6394167084217005e-05, + "loss": 0.7181, + "step": 3570 + }, + { + "epoch": 0.23383409536250815, + "grad_norm": 6.3921681525273994, + "learning_rate": 2.637392071574157e-05, + "loss": 0.7177, + "step": 3580 + }, + { + "epoch": 0.23448726322664926, + "grad_norm": 4.851980784102187, + "learning_rate": 2.635362547746258e-05, + "loss": 0.7402, + "step": 3590 + }, + { + "epoch": 0.23514043109079033, + "grad_norm": 3.4119093890395322, + "learning_rate": 2.6333281456581654e-05, + "loss": 0.691, + "step": 3600 + }, + { + "epoch": 0.2357935989549314, + "grad_norm": 3.8612085843913864, + "learning_rate": 2.631288874051002e-05, + "loss": 0.7416, + "step": 3610 + }, + { + "epoch": 0.2364467668190725, + "grad_norm": 17.80518970057431, + "learning_rate": 2.6292447416868113e-05, + "loss": 0.7185, + "step": 3620 + }, + { + "epoch": 0.23709993468321358, + "grad_norm": 8.691379915690442, + "learning_rate": 2.6271957573485244e-05, + "loss": 0.7232, + "step": 3630 + }, + { + "epoch": 0.23775310254735468, + "grad_norm": 7.889136326928524, + "learning_rate": 2.6251419298399176e-05, + "loss": 0.7473, + "step": 3640 + }, + { + "epoch": 0.23840627041149576, + "grad_norm": 4.379578292960399, + "learning_rate": 2.6230832679855773e-05, + "loss": 0.7269, + "step": 3650 + }, + { + "epoch": 0.23905943827563683, + "grad_norm": 5.353751120664064, + "learning_rate": 2.6210197806308617e-05, + "loss": 0.7662, + "step": 3660 + }, + { + "epoch": 0.23971260613977793, + "grad_norm": 2.7709561275339194, + "learning_rate": 2.6189514766418625e-05, + "loss": 0.7094, + "step": 3670 + }, + { + "epoch": 0.240365774003919, + "grad_norm": 3.9103422168330204, + "learning_rate": 2.6168783649053666e-05, + "loss": 0.7028, + "step": 3680 + }, + { + "epoch": 0.24101894186806008, + "grad_norm": 9.505874423721908, + "learning_rate": 2.6148004543288178e-05, + "loss": 0.7255, + "step": 3690 + }, + { + "epoch": 0.24167210973220118, + "grad_norm": 7.653416946344283, + "learning_rate": 2.6127177538402795e-05, + "loss": 0.7207, + "step": 3700 + }, + { + "epoch": 0.24232527759634226, + "grad_norm": 6.251847566275377, + "learning_rate": 2.6106302723883952e-05, + "loss": 0.7724, + "step": 3710 + }, + { + "epoch": 0.24297844546048333, + "grad_norm": 7.31365044828085, + "learning_rate": 2.60853801894235e-05, + "loss": 0.694, + "step": 3720 + }, + { + "epoch": 0.24363161332462444, + "grad_norm": 10.798141206505173, + "learning_rate": 2.6064410024918352e-05, + "loss": 0.7268, + "step": 3730 + }, + { + "epoch": 0.2442847811887655, + "grad_norm": 7.3726815927726195, + "learning_rate": 2.6043392320470033e-05, + "loss": 0.7419, + "step": 3740 + }, + { + "epoch": 0.24493794905290658, + "grad_norm": 8.397591708864457, + "learning_rate": 2.6022327166384363e-05, + "loss": 0.7248, + "step": 3750 + }, + { + "epoch": 0.24559111691704769, + "grad_norm": 6.615639461597031, + "learning_rate": 2.600121465317102e-05, + "loss": 0.737, + "step": 3760 + }, + { + "epoch": 0.24624428478118876, + "grad_norm": 12.252576907293445, + "learning_rate": 2.5980054871543167e-05, + "loss": 0.678, + "step": 3770 + }, + { + "epoch": 0.24689745264532986, + "grad_norm": 6.008549638823585, + "learning_rate": 2.5958847912417065e-05, + "loss": 0.7436, + "step": 3780 + }, + { + "epoch": 0.24755062050947094, + "grad_norm": 9.731321783664672, + "learning_rate": 2.5937593866911694e-05, + "loss": 0.6884, + "step": 3790 + }, + { + "epoch": 0.248203788373612, + "grad_norm": 4.872437219190829, + "learning_rate": 2.5916292826348327e-05, + "loss": 0.7551, + "step": 3800 + }, + { + "epoch": 0.2488569562377531, + "grad_norm": 3.2546621774905304, + "learning_rate": 2.5894944882250177e-05, + "loss": 0.7235, + "step": 3810 + }, + { + "epoch": 0.2495101241018942, + "grad_norm": 9.424086986405833, + "learning_rate": 2.5873550126341963e-05, + "loss": 0.6996, + "step": 3820 + }, + { + "epoch": 0.25016329196603526, + "grad_norm": 5.514870221849538, + "learning_rate": 2.585210865054957e-05, + "loss": 0.7349, + "step": 3830 + }, + { + "epoch": 0.25081645983017636, + "grad_norm": 6.9013441794446315, + "learning_rate": 2.5830620546999587e-05, + "loss": 0.7647, + "step": 3840 + }, + { + "epoch": 0.25146962769431747, + "grad_norm": 6.6845750918764875, + "learning_rate": 2.580908590801897e-05, + "loss": 0.7193, + "step": 3850 + }, + { + "epoch": 0.2521227955584585, + "grad_norm": 10.961278212697293, + "learning_rate": 2.5787504826134613e-05, + "loss": 0.7236, + "step": 3860 + }, + { + "epoch": 0.2527759634225996, + "grad_norm": 12.837907116460197, + "learning_rate": 2.5765877394072965e-05, + "loss": 0.6781, + "step": 3870 + }, + { + "epoch": 0.2534291312867407, + "grad_norm": 10.64515408154912, + "learning_rate": 2.5744203704759616e-05, + "loss": 0.7158, + "step": 3880 + }, + { + "epoch": 0.25408229915088176, + "grad_norm": 5.828039582881938, + "learning_rate": 2.572248385131892e-05, + "loss": 0.7243, + "step": 3890 + }, + { + "epoch": 0.25473546701502287, + "grad_norm": 4.7378551420496064, + "learning_rate": 2.5700717927073572e-05, + "loss": 0.686, + "step": 3900 + }, + { + "epoch": 0.25538863487916397, + "grad_norm": 5.101256286871471, + "learning_rate": 2.5678906025544212e-05, + "loss": 0.6929, + "step": 3910 + }, + { + "epoch": 0.256041802743305, + "grad_norm": 4.945280487776523, + "learning_rate": 2.5657048240449055e-05, + "loss": 0.7152, + "step": 3920 + }, + { + "epoch": 0.2566949706074461, + "grad_norm": 6.325141259828677, + "learning_rate": 2.5635144665703425e-05, + "loss": 0.7386, + "step": 3930 + }, + { + "epoch": 0.2573481384715872, + "grad_norm": 3.328094625558282, + "learning_rate": 2.5613195395419422e-05, + "loss": 0.7853, + "step": 3940 + }, + { + "epoch": 0.25800130633572826, + "grad_norm": 14.77901385371437, + "learning_rate": 2.559120052390546e-05, + "loss": 0.7757, + "step": 3950 + }, + { + "epoch": 0.25865447419986937, + "grad_norm": 2.926012995140229, + "learning_rate": 2.55691601456659e-05, + "loss": 0.6735, + "step": 3960 + }, + { + "epoch": 0.25930764206401047, + "grad_norm": 5.225338186608472, + "learning_rate": 2.5547074355400615e-05, + "loss": 0.7265, + "step": 3970 + }, + { + "epoch": 0.2599608099281515, + "grad_norm": 4.206340254882354, + "learning_rate": 2.5524943248004618e-05, + "loss": 0.7145, + "step": 3980 + }, + { + "epoch": 0.2606139777922926, + "grad_norm": 6.368887270719955, + "learning_rate": 2.550276691856762e-05, + "loss": 0.7423, + "step": 3990 + }, + { + "epoch": 0.2612671456564337, + "grad_norm": 3.0483125865376963, + "learning_rate": 2.548054546237364e-05, + "loss": 0.7132, + "step": 4000 + }, + { + "epoch": 0.26192031352057477, + "grad_norm": 13.296685653396278, + "learning_rate": 2.5458278974900587e-05, + "loss": 0.7084, + "step": 4010 + }, + { + "epoch": 0.26257348138471587, + "grad_norm": 38.457562726457006, + "learning_rate": 2.5435967551819856e-05, + "loss": 0.7265, + "step": 4020 + }, + { + "epoch": 0.26322664924885697, + "grad_norm": 109.85436026280432, + "learning_rate": 2.5413611288995915e-05, + "loss": 0.729, + "step": 4030 + }, + { + "epoch": 0.263879817112998, + "grad_norm": 8.077784990426677, + "learning_rate": 2.53912102824859e-05, + "loss": 0.7015, + "step": 4040 + }, + { + "epoch": 0.2645329849771391, + "grad_norm": 8.861234493320074, + "learning_rate": 2.5368764628539184e-05, + "loss": 0.7495, + "step": 4050 + }, + { + "epoch": 0.2651861528412802, + "grad_norm": 3.554047847668293, + "learning_rate": 2.5346274423596973e-05, + "loss": 0.7424, + "step": 4060 + }, + { + "epoch": 0.26583932070542127, + "grad_norm": 2.927566126492755, + "learning_rate": 2.5323739764291912e-05, + "loss": 0.7611, + "step": 4070 + }, + { + "epoch": 0.26649248856956237, + "grad_norm": 3.4058451280668876, + "learning_rate": 2.5301160747447627e-05, + "loss": 0.6972, + "step": 4080 + }, + { + "epoch": 0.26714565643370347, + "grad_norm": 3.0373695188526497, + "learning_rate": 2.5278537470078352e-05, + "loss": 0.7033, + "step": 4090 + }, + { + "epoch": 0.2677988242978446, + "grad_norm": 5.525119120836987, + "learning_rate": 2.525587002938848e-05, + "loss": 0.7608, + "step": 4100 + }, + { + "epoch": 0.2684519921619856, + "grad_norm": 5.173021264413592, + "learning_rate": 2.5233158522772166e-05, + "loss": 0.7528, + "step": 4110 + }, + { + "epoch": 0.2691051600261267, + "grad_norm": 7.9505349609036235, + "learning_rate": 2.5210403047812896e-05, + "loss": 0.7787, + "step": 4120 + }, + { + "epoch": 0.2697583278902678, + "grad_norm": 8.093183932153524, + "learning_rate": 2.518760370228308e-05, + "loss": 0.7288, + "step": 4130 + }, + { + "epoch": 0.27041149575440887, + "grad_norm": 7.35362845406468, + "learning_rate": 2.516476058414362e-05, + "loss": 0.7048, + "step": 4140 + }, + { + "epoch": 0.27106466361855, + "grad_norm": 6.063117241653435, + "learning_rate": 2.5141873791543494e-05, + "loss": 0.7099, + "step": 4150 + }, + { + "epoch": 0.2717178314826911, + "grad_norm": 6.2465652182913445, + "learning_rate": 2.511894342281933e-05, + "loss": 0.7636, + "step": 4160 + }, + { + "epoch": 0.2723709993468321, + "grad_norm": 2.4005462620859435, + "learning_rate": 2.5095969576494998e-05, + "loss": 0.7347, + "step": 4170 + }, + { + "epoch": 0.2730241672109732, + "grad_norm": 4.287489527920315, + "learning_rate": 2.5072952351281166e-05, + "loss": 0.7047, + "step": 4180 + }, + { + "epoch": 0.2736773350751143, + "grad_norm": 10.924198105492543, + "learning_rate": 2.504989184607489e-05, + "loss": 0.7135, + "step": 4190 + }, + { + "epoch": 0.27433050293925537, + "grad_norm": 3.344600105958022, + "learning_rate": 2.502678815995919e-05, + "loss": 0.7566, + "step": 4200 + }, + { + "epoch": 0.2749836708033965, + "grad_norm": 8.178872258203794, + "learning_rate": 2.500364139220261e-05, + "loss": 0.7012, + "step": 4210 + }, + { + "epoch": 0.2756368386675376, + "grad_norm": 13.792796170330694, + "learning_rate": 2.4980451642258807e-05, + "loss": 0.7214, + "step": 4220 + }, + { + "epoch": 0.2762900065316786, + "grad_norm": 4.686318820906931, + "learning_rate": 2.495721900976611e-05, + "loss": 0.7561, + "step": 4230 + }, + { + "epoch": 0.2769431743958197, + "grad_norm": 52.23368325147696, + "learning_rate": 2.4933943594547116e-05, + "loss": 0.7079, + "step": 4240 + }, + { + "epoch": 0.2775963422599608, + "grad_norm": 3.3254162864810706, + "learning_rate": 2.4910625496608227e-05, + "loss": 0.7151, + "step": 4250 + }, + { + "epoch": 0.2782495101241019, + "grad_norm": 6.428077097460673, + "learning_rate": 2.488726481613925e-05, + "loss": 0.7109, + "step": 4260 + }, + { + "epoch": 0.278902677988243, + "grad_norm": 13.474459188221212, + "learning_rate": 2.4863861653512947e-05, + "loss": 0.7513, + "step": 4270 + }, + { + "epoch": 0.2795558458523841, + "grad_norm": 5.57604753719738, + "learning_rate": 2.484041610928461e-05, + "loss": 0.7523, + "step": 4280 + }, + { + "epoch": 0.2802090137165251, + "grad_norm": 7.417290679168985, + "learning_rate": 2.4816928284191642e-05, + "loss": 0.7464, + "step": 4290 + }, + { + "epoch": 0.2808621815806662, + "grad_norm": 7.4368198628749616, + "learning_rate": 2.4793398279153098e-05, + "loss": 0.7309, + "step": 4300 + }, + { + "epoch": 0.28151534944480733, + "grad_norm": 5.404604799289123, + "learning_rate": 2.4769826195269276e-05, + "loss": 0.6812, + "step": 4310 + }, + { + "epoch": 0.2821685173089484, + "grad_norm": 5.67385871603199, + "learning_rate": 2.474621213382126e-05, + "loss": 0.7485, + "step": 4320 + }, + { + "epoch": 0.2828216851730895, + "grad_norm": 3.327246034991726, + "learning_rate": 2.4722556196270516e-05, + "loss": 0.7491, + "step": 4330 + }, + { + "epoch": 0.2834748530372306, + "grad_norm": 4.298927510493578, + "learning_rate": 2.4698858484258413e-05, + "loss": 0.7447, + "step": 4340 + }, + { + "epoch": 0.2841280209013716, + "grad_norm": 2.830645539219132, + "learning_rate": 2.4675119099605832e-05, + "loss": 0.6611, + "step": 4350 + }, + { + "epoch": 0.2847811887655127, + "grad_norm": 2.892594280107655, + "learning_rate": 2.46513381443127e-05, + "loss": 0.7574, + "step": 4360 + }, + { + "epoch": 0.28543435662965383, + "grad_norm": 2.8096820701671392, + "learning_rate": 2.462751572055755e-05, + "loss": 0.7352, + "step": 4370 + }, + { + "epoch": 0.28608752449379493, + "grad_norm": 6.625013819266248, + "learning_rate": 2.46036519306971e-05, + "loss": 0.6894, + "step": 4380 + }, + { + "epoch": 0.286740692357936, + "grad_norm": 5.553982675208715, + "learning_rate": 2.457974687726581e-05, + "loss": 0.69, + "step": 4390 + }, + { + "epoch": 0.2873938602220771, + "grad_norm": 10.463948438278225, + "learning_rate": 2.4555800662975415e-05, + "loss": 0.7381, + "step": 4400 + }, + { + "epoch": 0.2880470280862182, + "grad_norm": 5.237933324400403, + "learning_rate": 2.4531813390714523e-05, + "loss": 0.705, + "step": 4410 + }, + { + "epoch": 0.28870019595035923, + "grad_norm": 29.3661171326619, + "learning_rate": 2.4507785163548145e-05, + "loss": 0.6982, + "step": 4420 + }, + { + "epoch": 0.28935336381450033, + "grad_norm": 5.569785158681364, + "learning_rate": 2.448371608471726e-05, + "loss": 0.7602, + "step": 4430 + }, + { + "epoch": 0.29000653167864143, + "grad_norm": 2.7943484866446475, + "learning_rate": 2.4459606257638375e-05, + "loss": 0.7343, + "step": 4440 + }, + { + "epoch": 0.2906596995427825, + "grad_norm": 2.9668248409437354, + "learning_rate": 2.4435455785903088e-05, + "loss": 0.6952, + "step": 4450 + }, + { + "epoch": 0.2913128674069236, + "grad_norm": 4.0679875797768545, + "learning_rate": 2.441126477327761e-05, + "loss": 0.7106, + "step": 4460 + }, + { + "epoch": 0.2919660352710647, + "grad_norm": 12.16575396175707, + "learning_rate": 2.4387033323702364e-05, + "loss": 0.712, + "step": 4470 + }, + { + "epoch": 0.29261920313520573, + "grad_norm": 2.8440806437673234, + "learning_rate": 2.4362761541291502e-05, + "loss": 0.7658, + "step": 4480 + }, + { + "epoch": 0.29327237099934683, + "grad_norm": 2.970610346903458, + "learning_rate": 2.433844953033249e-05, + "loss": 0.7424, + "step": 4490 + }, + { + "epoch": 0.29392553886348793, + "grad_norm": 6.591526614830109, + "learning_rate": 2.431409739528562e-05, + "loss": 0.7057, + "step": 4500 + }, + { + "epoch": 0.294578706727629, + "grad_norm": 3.5000170298570765, + "learning_rate": 2.42897052407836e-05, + "loss": 0.7207, + "step": 4510 + }, + { + "epoch": 0.2952318745917701, + "grad_norm": 4.09870087115134, + "learning_rate": 2.4265273171631077e-05, + "loss": 0.7326, + "step": 4520 + }, + { + "epoch": 0.2958850424559112, + "grad_norm": 3.634194650545928, + "learning_rate": 2.42408012928042e-05, + "loss": 0.7509, + "step": 4530 + }, + { + "epoch": 0.29653821032005223, + "grad_norm": 4.634696693084635, + "learning_rate": 2.4216289709450176e-05, + "loss": 0.7639, + "step": 4540 + }, + { + "epoch": 0.29719137818419333, + "grad_norm": 1.674361047291477, + "learning_rate": 2.4191738526886794e-05, + "loss": 0.7451, + "step": 4550 + }, + { + "epoch": 0.29784454604833444, + "grad_norm": 4.679840951482941, + "learning_rate": 2.4167147850601998e-05, + "loss": 0.6997, + "step": 4560 + }, + { + "epoch": 0.2984977139124755, + "grad_norm": 15.092169321509138, + "learning_rate": 2.414251778625342e-05, + "loss": 0.7105, + "step": 4570 + }, + { + "epoch": 0.2991508817766166, + "grad_norm": 4.95638690004173, + "learning_rate": 2.411784843966793e-05, + "loss": 0.7067, + "step": 4580 + }, + { + "epoch": 0.2998040496407577, + "grad_norm": 9.521604766867073, + "learning_rate": 2.4093139916841172e-05, + "loss": 0.7225, + "step": 4590 + }, + { + "epoch": 0.30045721750489873, + "grad_norm": 10.371086393594378, + "learning_rate": 2.4068392323937125e-05, + "loss": 0.6966, + "step": 4600 + }, + { + "epoch": 0.30111038536903983, + "grad_norm": 6.1990655256347535, + "learning_rate": 2.4043605767287643e-05, + "loss": 0.7165, + "step": 4610 + }, + { + "epoch": 0.30176355323318094, + "grad_norm": 7.466217760942189, + "learning_rate": 2.4018780353391978e-05, + "loss": 0.6912, + "step": 4620 + }, + { + "epoch": 0.30241672109732204, + "grad_norm": 5.232860562458902, + "learning_rate": 2.3993916188916348e-05, + "loss": 0.7015, + "step": 4630 + }, + { + "epoch": 0.3030698889614631, + "grad_norm": 7.8628232759082195, + "learning_rate": 2.396901338069348e-05, + "loss": 0.701, + "step": 4640 + }, + { + "epoch": 0.3037230568256042, + "grad_norm": 5.6841565040341315, + "learning_rate": 2.394407203572211e-05, + "loss": 0.7673, + "step": 4650 + }, + { + "epoch": 0.3043762246897453, + "grad_norm": 8.54205331314945, + "learning_rate": 2.3919092261166584e-05, + "loss": 0.6904, + "step": 4660 + }, + { + "epoch": 0.30502939255388634, + "grad_norm": 11.156476357778907, + "learning_rate": 2.3894074164356353e-05, + "loss": 0.6855, + "step": 4670 + }, + { + "epoch": 0.30568256041802744, + "grad_norm": 7.870284591240595, + "learning_rate": 2.3869017852785525e-05, + "loss": 0.724, + "step": 4680 + }, + { + "epoch": 0.30633572828216854, + "grad_norm": 3.3237816492979007, + "learning_rate": 2.3843923434112402e-05, + "loss": 0.7228, + "step": 4690 + }, + { + "epoch": 0.3069888961463096, + "grad_norm": 2.480450775802249, + "learning_rate": 2.3818791016159022e-05, + "loss": 0.7184, + "step": 4700 + }, + { + "epoch": 0.3076420640104507, + "grad_norm": 7.916224283272688, + "learning_rate": 2.3793620706910696e-05, + "loss": 0.7388, + "step": 4710 + }, + { + "epoch": 0.3082952318745918, + "grad_norm": 2.943344146547367, + "learning_rate": 2.3768412614515536e-05, + "loss": 0.6956, + "step": 4720 + }, + { + "epoch": 0.30894839973873284, + "grad_norm": 3.327029912173773, + "learning_rate": 2.3743166847283995e-05, + "loss": 0.7233, + "step": 4730 + }, + { + "epoch": 0.30960156760287394, + "grad_norm": 6.533958348071706, + "learning_rate": 2.3717883513688405e-05, + "loss": 0.7083, + "step": 4740 + }, + { + "epoch": 0.31025473546701504, + "grad_norm": 2.2289245925613885, + "learning_rate": 2.3692562722362508e-05, + "loss": 0.746, + "step": 4750 + }, + { + "epoch": 0.3109079033311561, + "grad_norm": 3.7748651745700323, + "learning_rate": 2.3667204582100984e-05, + "loss": 0.6574, + "step": 4760 + }, + { + "epoch": 0.3115610711952972, + "grad_norm": 3.0117659311519964, + "learning_rate": 2.3641809201858996e-05, + "loss": 0.732, + "step": 4770 + }, + { + "epoch": 0.3122142390594383, + "grad_norm": 6.5912357978805245, + "learning_rate": 2.3616376690751703e-05, + "loss": 0.7292, + "step": 4780 + }, + { + "epoch": 0.31286740692357934, + "grad_norm": 4.055781986644913, + "learning_rate": 2.359090715805381e-05, + "loss": 0.7227, + "step": 4790 + }, + { + "epoch": 0.31352057478772044, + "grad_norm": 5.128394514646037, + "learning_rate": 2.3565400713199095e-05, + "loss": 0.7592, + "step": 4800 + }, + { + "epoch": 0.31417374265186154, + "grad_norm": 8.779130673134816, + "learning_rate": 2.3539857465779925e-05, + "loss": 0.6676, + "step": 4810 + }, + { + "epoch": 0.3148269105160026, + "grad_norm": 5.429036887728827, + "learning_rate": 2.3514277525546803e-05, + "loss": 0.7009, + "step": 4820 + }, + { + "epoch": 0.3154800783801437, + "grad_norm": 7.8542004760770014, + "learning_rate": 2.348866100240789e-05, + "loss": 0.7233, + "step": 4830 + }, + { + "epoch": 0.3161332462442848, + "grad_norm": 6.478615135277363, + "learning_rate": 2.3463008006428506e-05, + "loss": 0.7015, + "step": 4840 + }, + { + "epoch": 0.31678641410842584, + "grad_norm": 3.4721596945324875, + "learning_rate": 2.343731864783073e-05, + "loss": 0.7295, + "step": 4850 + }, + { + "epoch": 0.31743958197256694, + "grad_norm": 5.134712506262902, + "learning_rate": 2.3411593036992835e-05, + "loss": 0.7464, + "step": 4860 + }, + { + "epoch": 0.31809274983670804, + "grad_norm": 5.049954351842052, + "learning_rate": 2.3385831284448873e-05, + "loss": 0.7473, + "step": 4870 + }, + { + "epoch": 0.3187459177008491, + "grad_norm": 7.027799717697589, + "learning_rate": 2.336003350088819e-05, + "loss": 0.688, + "step": 4880 + }, + { + "epoch": 0.3193990855649902, + "grad_norm": 4.380900953812384, + "learning_rate": 2.3334199797154936e-05, + "loss": 0.6974, + "step": 4890 + }, + { + "epoch": 0.3200522534291313, + "grad_norm": 1.802352319241488, + "learning_rate": 2.3308330284247605e-05, + "loss": 0.6911, + "step": 4900 + }, + { + "epoch": 0.3207054212932724, + "grad_norm": 3.6480027783199236, + "learning_rate": 2.3282425073318546e-05, + "loss": 0.7089, + "step": 4910 + }, + { + "epoch": 0.32135858915741344, + "grad_norm": 8.846036258285976, + "learning_rate": 2.3256484275673486e-05, + "loss": 0.7138, + "step": 4920 + }, + { + "epoch": 0.32201175702155455, + "grad_norm": 5.168086297852975, + "learning_rate": 2.3230508002771067e-05, + "loss": 0.7055, + "step": 4930 + }, + { + "epoch": 0.32266492488569565, + "grad_norm": 3.9475777647489396, + "learning_rate": 2.320449636622235e-05, + "loss": 0.7257, + "step": 4940 + }, + { + "epoch": 0.3233180927498367, + "grad_norm": 4.931806927889069, + "learning_rate": 2.3178449477790325e-05, + "loss": 0.7071, + "step": 4950 + }, + { + "epoch": 0.3239712606139778, + "grad_norm": 2.2215574528530437, + "learning_rate": 2.3152367449389483e-05, + "loss": 0.7037, + "step": 4960 + }, + { + "epoch": 0.3246244284781189, + "grad_norm": 8.690398027898421, + "learning_rate": 2.312625039308528e-05, + "loss": 0.7256, + "step": 4970 + }, + { + "epoch": 0.32527759634225994, + "grad_norm": 10.206026981904913, + "learning_rate": 2.3100098421093655e-05, + "loss": 0.6887, + "step": 4980 + }, + { + "epoch": 0.32593076420640105, + "grad_norm": 3.80442191887017, + "learning_rate": 2.3073911645780602e-05, + "loss": 0.7179, + "step": 4990 + }, + { + "epoch": 0.32658393207054215, + "grad_norm": 4.728308085787767, + "learning_rate": 2.304769017966163e-05, + "loss": 0.7809, + "step": 5000 + }, + { + "epoch": 0.3272370999346832, + "grad_norm": 11.391935045550042, + "learning_rate": 2.302143413540132e-05, + "loss": 0.7669, + "step": 5010 + }, + { + "epoch": 0.3278902677988243, + "grad_norm": 11.911310517206102, + "learning_rate": 2.2995143625812804e-05, + "loss": 0.7085, + "step": 5020 + }, + { + "epoch": 0.3285434356629654, + "grad_norm": 7.262195721290905, + "learning_rate": 2.296881876385731e-05, + "loss": 0.7089, + "step": 5030 + }, + { + "epoch": 0.32919660352710645, + "grad_norm": 8.385837204775681, + "learning_rate": 2.2942459662643667e-05, + "loss": 0.7249, + "step": 5040 + }, + { + "epoch": 0.32984977139124755, + "grad_norm": 9.484288524755074, + "learning_rate": 2.291606643542782e-05, + "loss": 0.7342, + "step": 5050 + }, + { + "epoch": 0.33050293925538865, + "grad_norm": 4.6070263067396535, + "learning_rate": 2.288963919561233e-05, + "loss": 0.7352, + "step": 5060 + }, + { + "epoch": 0.3311561071195297, + "grad_norm": 4.701683051508864, + "learning_rate": 2.2863178056745913e-05, + "loss": 0.737, + "step": 5070 + }, + { + "epoch": 0.3318092749836708, + "grad_norm": 3.5407757854087034, + "learning_rate": 2.2836683132522927e-05, + "loss": 0.7017, + "step": 5080 + }, + { + "epoch": 0.3324624428478119, + "grad_norm": 8.776372349033894, + "learning_rate": 2.2810154536782903e-05, + "loss": 0.6874, + "step": 5090 + }, + { + "epoch": 0.33311561071195295, + "grad_norm": 14.30802960940852, + "learning_rate": 2.2783592383510038e-05, + "loss": 0.7418, + "step": 5100 + }, + { + "epoch": 0.33376877857609405, + "grad_norm": 15.268248599076554, + "learning_rate": 2.275699678683272e-05, + "loss": 0.7243, + "step": 5110 + }, + { + "epoch": 0.33442194644023515, + "grad_norm": 3.8522854449552493, + "learning_rate": 2.2730367861023023e-05, + "loss": 0.6963, + "step": 5120 + }, + { + "epoch": 0.3350751143043762, + "grad_norm": 15.097323735018765, + "learning_rate": 2.2703705720496235e-05, + "loss": 0.7404, + "step": 5130 + }, + { + "epoch": 0.3357282821685173, + "grad_norm": 3.289494356993917, + "learning_rate": 2.2677010479810362e-05, + "loss": 0.7096, + "step": 5140 + }, + { + "epoch": 0.3363814500326584, + "grad_norm": 3.3373795905520107, + "learning_rate": 2.2650282253665605e-05, + "loss": 0.7068, + "step": 5150 + }, + { + "epoch": 0.3370346178967995, + "grad_norm": 22.639667962001727, + "learning_rate": 2.2623521156903914e-05, + "loss": 0.6999, + "step": 5160 + }, + { + "epoch": 0.33768778576094055, + "grad_norm": 8.066217556055694, + "learning_rate": 2.2596727304508474e-05, + "loss": 0.7185, + "step": 5170 + }, + { + "epoch": 0.33834095362508165, + "grad_norm": 3.5470860624323395, + "learning_rate": 2.256990081160319e-05, + "loss": 0.6573, + "step": 5180 + }, + { + "epoch": 0.33899412148922276, + "grad_norm": 2.833517423894922, + "learning_rate": 2.2543041793452228e-05, + "loss": 0.6982, + "step": 5190 + }, + { + "epoch": 0.3396472893533638, + "grad_norm": 5.640485006270031, + "learning_rate": 2.2516150365459507e-05, + "loss": 0.7177, + "step": 5200 + }, + { + "epoch": 0.3403004572175049, + "grad_norm": 3.839458519984891, + "learning_rate": 2.2489226643168183e-05, + "loss": 0.7174, + "step": 5210 + }, + { + "epoch": 0.340953625081646, + "grad_norm": 6.786572033585219, + "learning_rate": 2.246227074226018e-05, + "loss": 0.7393, + "step": 5220 + }, + { + "epoch": 0.34160679294578705, + "grad_norm": 8.830342296217026, + "learning_rate": 2.243528277855568e-05, + "loss": 0.6876, + "step": 5230 + }, + { + "epoch": 0.34225996080992815, + "grad_norm": 12.293778646461922, + "learning_rate": 2.2408262868012635e-05, + "loss": 0.7381, + "step": 5240 + }, + { + "epoch": 0.34291312867406926, + "grad_norm": 6.77122748377005, + "learning_rate": 2.2381211126726255e-05, + "loss": 0.7216, + "step": 5250 + }, + { + "epoch": 0.3435662965382103, + "grad_norm": 3.839666787432626, + "learning_rate": 2.2354127670928513e-05, + "loss": 0.6704, + "step": 5260 + }, + { + "epoch": 0.3442194644023514, + "grad_norm": 3.5517692240588605, + "learning_rate": 2.2327012616987646e-05, + "loss": 0.6805, + "step": 5270 + }, + { + "epoch": 0.3448726322664925, + "grad_norm": 5.315718450066826, + "learning_rate": 2.2299866081407676e-05, + "loss": 0.729, + "step": 5280 + }, + { + "epoch": 0.34552580013063355, + "grad_norm": 6.848787865977037, + "learning_rate": 2.227268818082787e-05, + "loss": 0.6739, + "step": 5290 + }, + { + "epoch": 0.34617896799477466, + "grad_norm": 3.6917748968535564, + "learning_rate": 2.2245479032022272e-05, + "loss": 0.7269, + "step": 5300 + }, + { + "epoch": 0.34683213585891576, + "grad_norm": 9.160335319411757, + "learning_rate": 2.2218238751899174e-05, + "loss": 0.6839, + "step": 5310 + }, + { + "epoch": 0.3474853037230568, + "grad_norm": 5.424577814365955, + "learning_rate": 2.2190967457500646e-05, + "loss": 0.6939, + "step": 5320 + }, + { + "epoch": 0.3481384715871979, + "grad_norm": 3.6648874879145015, + "learning_rate": 2.2163665266002007e-05, + "loss": 0.7073, + "step": 5330 + }, + { + "epoch": 0.348791639451339, + "grad_norm": 9.13369002747884, + "learning_rate": 2.213633229471133e-05, + "loss": 0.7327, + "step": 5340 + }, + { + "epoch": 0.34944480731548005, + "grad_norm": 2.881754363096986, + "learning_rate": 2.210896866106894e-05, + "loss": 0.7052, + "step": 5350 + }, + { + "epoch": 0.35009797517962116, + "grad_norm": 5.529284801624932, + "learning_rate": 2.2081574482646903e-05, + "loss": 0.7253, + "step": 5360 + }, + { + "epoch": 0.35075114304376226, + "grad_norm": 2.9786955632123235, + "learning_rate": 2.205414987714854e-05, + "loss": 0.6812, + "step": 5370 + }, + { + "epoch": 0.3514043109079033, + "grad_norm": 3.993821378974443, + "learning_rate": 2.202669496240788e-05, + "loss": 0.6951, + "step": 5380 + }, + { + "epoch": 0.3520574787720444, + "grad_norm": 10.693892502406564, + "learning_rate": 2.1999209856389215e-05, + "loss": 0.6938, + "step": 5390 + }, + { + "epoch": 0.3527106466361855, + "grad_norm": 10.783647265180912, + "learning_rate": 2.1971694677186523e-05, + "loss": 0.7188, + "step": 5400 + }, + { + "epoch": 0.35336381450032656, + "grad_norm": 5.596700220664436, + "learning_rate": 2.194414954302302e-05, + "loss": 0.7373, + "step": 5410 + }, + { + "epoch": 0.35401698236446766, + "grad_norm": 10.407517181316326, + "learning_rate": 2.191657457225062e-05, + "loss": 0.7397, + "step": 5420 + }, + { + "epoch": 0.35467015022860876, + "grad_norm": 5.081907583105732, + "learning_rate": 2.1888969883349436e-05, + "loss": 0.6908, + "step": 5430 + }, + { + "epoch": 0.35532331809274986, + "grad_norm": 30.697198457905934, + "learning_rate": 2.1861335594927264e-05, + "loss": 0.7265, + "step": 5440 + }, + { + "epoch": 0.3559764859568909, + "grad_norm": 3.973571491869874, + "learning_rate": 2.1833671825719092e-05, + "loss": 0.7273, + "step": 5450 + }, + { + "epoch": 0.356629653821032, + "grad_norm": 3.6562494783319606, + "learning_rate": 2.1805978694586564e-05, + "loss": 0.742, + "step": 5460 + }, + { + "epoch": 0.3572828216851731, + "grad_norm": 17.645169020048908, + "learning_rate": 2.1778256320517485e-05, + "loss": 0.7066, + "step": 5470 + }, + { + "epoch": 0.35793598954931416, + "grad_norm": 4.218794080715886, + "learning_rate": 2.1750504822625316e-05, + "loss": 0.6779, + "step": 5480 + }, + { + "epoch": 0.35858915741345526, + "grad_norm": 6.070136540242526, + "learning_rate": 2.172272432014864e-05, + "loss": 0.6884, + "step": 5490 + }, + { + "epoch": 0.35924232527759636, + "grad_norm": 4.296131638468927, + "learning_rate": 2.169491493245066e-05, + "loss": 0.7549, + "step": 5500 + }, + { + "epoch": 0.3598954931417374, + "grad_norm": 4.45693239441751, + "learning_rate": 2.1667076779018708e-05, + "loss": 0.7081, + "step": 5510 + }, + { + "epoch": 0.3605486610058785, + "grad_norm": 6.126532226736765, + "learning_rate": 2.16392099794637e-05, + "loss": 0.7233, + "step": 5520 + }, + { + "epoch": 0.3612018288700196, + "grad_norm": 10.690768628798947, + "learning_rate": 2.1611314653519633e-05, + "loss": 0.7494, + "step": 5530 + }, + { + "epoch": 0.36185499673416066, + "grad_norm": 6.005580532144763, + "learning_rate": 2.1583390921043074e-05, + "loss": 0.7013, + "step": 5540 + }, + { + "epoch": 0.36250816459830176, + "grad_norm": 4.400593474851796, + "learning_rate": 2.1555438902012644e-05, + "loss": 0.6549, + "step": 5550 + }, + { + "epoch": 0.36316133246244287, + "grad_norm": 2.4175705589561884, + "learning_rate": 2.152745871652851e-05, + "loss": 0.7459, + "step": 5560 + }, + { + "epoch": 0.3638145003265839, + "grad_norm": 8.380975214402772, + "learning_rate": 2.1499450484811836e-05, + "loss": 0.7475, + "step": 5570 + }, + { + "epoch": 0.364467668190725, + "grad_norm": 7.535104105735855, + "learning_rate": 2.1471414327204325e-05, + "loss": 0.7463, + "step": 5580 + }, + { + "epoch": 0.3651208360548661, + "grad_norm": 10.183550297183858, + "learning_rate": 2.1443350364167635e-05, + "loss": 0.7225, + "step": 5590 + }, + { + "epoch": 0.36577400391900716, + "grad_norm": 6.961617279255035, + "learning_rate": 2.1415258716282912e-05, + "loss": 0.7151, + "step": 5600 + }, + { + "epoch": 0.36642717178314826, + "grad_norm": 1.940684132435257, + "learning_rate": 2.1387139504250254e-05, + "loss": 0.7195, + "step": 5610 + }, + { + "epoch": 0.36708033964728937, + "grad_norm": 3.744684393791611, + "learning_rate": 2.135899284888819e-05, + "loss": 0.6486, + "step": 5620 + }, + { + "epoch": 0.3677335075114304, + "grad_norm": 6.069719872334578, + "learning_rate": 2.1330818871133164e-05, + "loss": 0.7101, + "step": 5630 + }, + { + "epoch": 0.3683866753755715, + "grad_norm": 12.369927493771815, + "learning_rate": 2.130261769203901e-05, + "loss": 0.7299, + "step": 5640 + }, + { + "epoch": 0.3690398432397126, + "grad_norm": 5.184241316936024, + "learning_rate": 2.1274389432776442e-05, + "loss": 0.6976, + "step": 5650 + }, + { + "epoch": 0.36969301110385366, + "grad_norm": 5.144146765126566, + "learning_rate": 2.124613421463253e-05, + "loss": 0.7232, + "step": 5660 + }, + { + "epoch": 0.37034617896799477, + "grad_norm": 4.746113040084255, + "learning_rate": 2.121785215901018e-05, + "loss": 0.7095, + "step": 5670 + }, + { + "epoch": 0.37099934683213587, + "grad_norm": 5.32268400735139, + "learning_rate": 2.118954338742759e-05, + "loss": 0.7106, + "step": 5680 + }, + { + "epoch": 0.37165251469627697, + "grad_norm": 7.237241370763546, + "learning_rate": 2.1161208021517766e-05, + "loss": 0.7085, + "step": 5690 + }, + { + "epoch": 0.372305682560418, + "grad_norm": 4.077001658098951, + "learning_rate": 2.1132846183027978e-05, + "loss": 0.7134, + "step": 5700 + }, + { + "epoch": 0.3729588504245591, + "grad_norm": 5.488216261292269, + "learning_rate": 2.1104457993819237e-05, + "loss": 0.7269, + "step": 5710 + }, + { + "epoch": 0.3736120182887002, + "grad_norm": 17.11430526139107, + "learning_rate": 2.1076043575865768e-05, + "loss": 0.6564, + "step": 5720 + }, + { + "epoch": 0.37426518615284127, + "grad_norm": 10.97778991135696, + "learning_rate": 2.10476030512545e-05, + "loss": 0.7019, + "step": 5730 + }, + { + "epoch": 0.37491835401698237, + "grad_norm": 8.044336485211486, + "learning_rate": 2.1019136542184534e-05, + "loss": 0.6942, + "step": 5740 + }, + { + "epoch": 0.37557152188112347, + "grad_norm": 18.021968575001992, + "learning_rate": 2.099064417096662e-05, + "loss": 0.7102, + "step": 5750 + }, + { + "epoch": 0.3762246897452645, + "grad_norm": 7.597040915393705, + "learning_rate": 2.0962126060022603e-05, + "loss": 0.726, + "step": 5760 + }, + { + "epoch": 0.3768778576094056, + "grad_norm": 4.866355469739272, + "learning_rate": 2.0933582331884967e-05, + "loss": 0.6813, + "step": 5770 + }, + { + "epoch": 0.3775310254735467, + "grad_norm": 9.713816871869412, + "learning_rate": 2.0905013109196217e-05, + "loss": 0.6868, + "step": 5780 + }, + { + "epoch": 0.37818419333768777, + "grad_norm": 10.125773082465265, + "learning_rate": 2.0876418514708442e-05, + "loss": 0.7506, + "step": 5790 + }, + { + "epoch": 0.37883736120182887, + "grad_norm": 6.982779164809807, + "learning_rate": 2.0847798671282706e-05, + "loss": 0.6706, + "step": 5800 + }, + { + "epoch": 0.37949052906597, + "grad_norm": 3.4051762559620142, + "learning_rate": 2.081915370188859e-05, + "loss": 0.7132, + "step": 5810 + }, + { + "epoch": 0.380143696930111, + "grad_norm": 3.4718152498585506, + "learning_rate": 2.0790483729603624e-05, + "loss": 0.6742, + "step": 5820 + }, + { + "epoch": 0.3807968647942521, + "grad_norm": 22.035510447982027, + "learning_rate": 2.0761788877612746e-05, + "loss": 0.7255, + "step": 5830 + }, + { + "epoch": 0.3814500326583932, + "grad_norm": 2.0183516401641395, + "learning_rate": 2.0733069269207828e-05, + "loss": 0.7506, + "step": 5840 + }, + { + "epoch": 0.38210320052253427, + "grad_norm": 40.64552816683451, + "learning_rate": 2.0704325027787085e-05, + "loss": 0.7109, + "step": 5850 + }, + { + "epoch": 0.38275636838667537, + "grad_norm": 11.592439068405213, + "learning_rate": 2.0675556276854588e-05, + "loss": 0.6948, + "step": 5860 + }, + { + "epoch": 0.3834095362508165, + "grad_norm": 8.94221169646604, + "learning_rate": 2.0646763140019702e-05, + "loss": 0.7036, + "step": 5870 + }, + { + "epoch": 0.3840627041149575, + "grad_norm": 3.4153432996898463, + "learning_rate": 2.0617945740996583e-05, + "loss": 0.7016, + "step": 5880 + }, + { + "epoch": 0.3847158719790986, + "grad_norm": 4.83458645240433, + "learning_rate": 2.0589104203603624e-05, + "loss": 0.6892, + "step": 5890 + }, + { + "epoch": 0.3853690398432397, + "grad_norm": 2.992026860080857, + "learning_rate": 2.056023865176294e-05, + "loss": 0.7367, + "step": 5900 + }, + { + "epoch": 0.38602220770738077, + "grad_norm": 7.321261903148404, + "learning_rate": 2.0531349209499822e-05, + "loss": 0.7121, + "step": 5910 + }, + { + "epoch": 0.3866753755715219, + "grad_norm": 4.135261026651754, + "learning_rate": 2.0502436000942206e-05, + "loss": 0.7015, + "step": 5920 + }, + { + "epoch": 0.387328543435663, + "grad_norm": 5.739710682346368, + "learning_rate": 2.047349915032016e-05, + "loss": 0.6811, + "step": 5930 + }, + { + "epoch": 0.3879817112998041, + "grad_norm": 7.817442528367328, + "learning_rate": 2.0444538781965324e-05, + "loss": 0.7079, + "step": 5940 + }, + { + "epoch": 0.3886348791639451, + "grad_norm": 1.7125273681458524, + "learning_rate": 2.041555502031037e-05, + "loss": 0.6939, + "step": 5950 + }, + { + "epoch": 0.3892880470280862, + "grad_norm": 7.063769471170878, + "learning_rate": 2.0386547989888514e-05, + "loss": 0.7409, + "step": 5960 + }, + { + "epoch": 0.38994121489222733, + "grad_norm": 6.649069768334491, + "learning_rate": 2.0357517815332918e-05, + "loss": 0.7039, + "step": 5970 + }, + { + "epoch": 0.3905943827563684, + "grad_norm": 4.9949670256327225, + "learning_rate": 2.0328464621376216e-05, + "loss": 0.708, + "step": 5980 + }, + { + "epoch": 0.3912475506205095, + "grad_norm": 10.223302544529933, + "learning_rate": 2.0299388532849922e-05, + "loss": 0.7625, + "step": 5990 + }, + { + "epoch": 0.3919007184846506, + "grad_norm": 3.4792253561647155, + "learning_rate": 2.027028967468394e-05, + "loss": 0.7313, + "step": 6000 + }, + { + "epoch": 0.3925538863487916, + "grad_norm": 6.173284673862917, + "learning_rate": 2.0241168171906002e-05, + "loss": 0.6961, + "step": 6010 + }, + { + "epoch": 0.3932070542129327, + "grad_norm": 9.069327472622437, + "learning_rate": 2.0212024149641124e-05, + "loss": 0.6839, + "step": 6020 + }, + { + "epoch": 0.39386022207707383, + "grad_norm": 4.354312267637299, + "learning_rate": 2.0182857733111094e-05, + "loss": 0.7299, + "step": 6030 + }, + { + "epoch": 0.3945133899412149, + "grad_norm": 10.938547313138859, + "learning_rate": 2.015366904763392e-05, + "loss": 0.737, + "step": 6040 + }, + { + "epoch": 0.395166557805356, + "grad_norm": 3.368581390407729, + "learning_rate": 2.012445821862329e-05, + "loss": 0.6936, + "step": 6050 + }, + { + "epoch": 0.3958197256694971, + "grad_norm": 22.651820687722875, + "learning_rate": 2.0095225371588023e-05, + "loss": 0.6701, + "step": 6060 + }, + { + "epoch": 0.3964728935336381, + "grad_norm": 7.087662693961875, + "learning_rate": 2.006597063213156e-05, + "loss": 0.7057, + "step": 6070 + }, + { + "epoch": 0.39712606139777923, + "grad_norm": 3.6259318502326736, + "learning_rate": 2.0036694125951395e-05, + "loss": 0.747, + "step": 6080 + }, + { + "epoch": 0.39777922926192033, + "grad_norm": 2.4208395436437615, + "learning_rate": 2.0007395978838556e-05, + "loss": 0.7042, + "step": 6090 + }, + { + "epoch": 0.3984323971260614, + "grad_norm": 3.7718790757505682, + "learning_rate": 1.9978076316677035e-05, + "loss": 0.7039, + "step": 6100 + }, + { + "epoch": 0.3990855649902025, + "grad_norm": 4.715238565697955, + "learning_rate": 1.9948735265443297e-05, + "loss": 0.7006, + "step": 6110 + }, + { + "epoch": 0.3997387328543436, + "grad_norm": 3.656750114993287, + "learning_rate": 1.9919372951205675e-05, + "loss": 0.6848, + "step": 6120 + }, + { + "epoch": 0.40039190071848463, + "grad_norm": 6.2809029098948335, + "learning_rate": 1.9889989500123896e-05, + "loss": 0.7516, + "step": 6130 + }, + { + "epoch": 0.40104506858262573, + "grad_norm": 4.45791400589284, + "learning_rate": 1.9860585038448472e-05, + "loss": 0.7293, + "step": 6140 + }, + { + "epoch": 0.40169823644676683, + "grad_norm": 6.671689461980347, + "learning_rate": 1.9831159692520208e-05, + "loss": 0.7139, + "step": 6150 + }, + { + "epoch": 0.4023514043109079, + "grad_norm": 7.413217349665978, + "learning_rate": 1.9801713588769643e-05, + "loss": 0.6818, + "step": 6160 + }, + { + "epoch": 0.403004572175049, + "grad_norm": 3.0801533633605467, + "learning_rate": 1.9772246853716497e-05, + "loss": 0.6924, + "step": 6170 + }, + { + "epoch": 0.4036577400391901, + "grad_norm": 8.663708499869044, + "learning_rate": 1.9742759613969136e-05, + "loss": 0.7205, + "step": 6180 + }, + { + "epoch": 0.40431090790333113, + "grad_norm": 6.16162556292061, + "learning_rate": 1.9713251996224037e-05, + "loss": 0.7135, + "step": 6190 + }, + { + "epoch": 0.40496407576747223, + "grad_norm": 4.790487901849723, + "learning_rate": 1.9683724127265228e-05, + "loss": 0.7015, + "step": 6200 + }, + { + "epoch": 0.40561724363161333, + "grad_norm": 3.6618688723110346, + "learning_rate": 1.965417613396375e-05, + "loss": 0.6915, + "step": 6210 + }, + { + "epoch": 0.40627041149575444, + "grad_norm": 16.05473312037314, + "learning_rate": 1.96246081432771e-05, + "loss": 0.7518, + "step": 6220 + }, + { + "epoch": 0.4069235793598955, + "grad_norm": 4.88766672771986, + "learning_rate": 1.959502028224872e-05, + "loss": 0.7345, + "step": 6230 + }, + { + "epoch": 0.4075767472240366, + "grad_norm": 12.426776499556185, + "learning_rate": 1.9565412678007414e-05, + "loss": 0.7051, + "step": 6240 + }, + { + "epoch": 0.4082299150881777, + "grad_norm": 17.259950283504946, + "learning_rate": 1.9535785457766816e-05, + "loss": 0.6707, + "step": 6250 + }, + { + "epoch": 0.40888308295231873, + "grad_norm": 3.9776106316833317, + "learning_rate": 1.950613874882484e-05, + "loss": 0.6904, + "step": 6260 + }, + { + "epoch": 0.40953625081645983, + "grad_norm": 4.209309505641302, + "learning_rate": 1.947647267856314e-05, + "loss": 0.7336, + "step": 6270 + }, + { + "epoch": 0.41018941868060094, + "grad_norm": 4.510058288197555, + "learning_rate": 1.9446787374446574e-05, + "loss": 0.7202, + "step": 6280 + }, + { + "epoch": 0.410842586544742, + "grad_norm": 6.194243484377249, + "learning_rate": 1.9417082964022605e-05, + "loss": 0.6833, + "step": 6290 + }, + { + "epoch": 0.4114957544088831, + "grad_norm": 3.311152634015392, + "learning_rate": 1.938735957492083e-05, + "loss": 0.6833, + "step": 6300 + }, + { + "epoch": 0.4121489222730242, + "grad_norm": 57.47319772756037, + "learning_rate": 1.935761733485236e-05, + "loss": 0.7215, + "step": 6310 + }, + { + "epoch": 0.41280209013716523, + "grad_norm": 6.884796376261111, + "learning_rate": 1.9327856371609327e-05, + "loss": 0.7029, + "step": 6320 + }, + { + "epoch": 0.41345525800130634, + "grad_norm": 4.169035556055065, + "learning_rate": 1.9298076813064282e-05, + "loss": 0.7448, + "step": 6330 + }, + { + "epoch": 0.41410842586544744, + "grad_norm": 5.96368320968989, + "learning_rate": 1.9268278787169696e-05, + "loss": 0.7163, + "step": 6340 + }, + { + "epoch": 0.4147615937295885, + "grad_norm": 4.222858152405998, + "learning_rate": 1.923846242195738e-05, + "loss": 0.7119, + "step": 6350 + }, + { + "epoch": 0.4154147615937296, + "grad_norm": 11.408875045751458, + "learning_rate": 1.9208627845537946e-05, + "loss": 0.7135, + "step": 6360 + }, + { + "epoch": 0.4160679294578707, + "grad_norm": 6.302076439551584, + "learning_rate": 1.9178775186100245e-05, + "loss": 0.6941, + "step": 6370 + }, + { + "epoch": 0.41672109732201174, + "grad_norm": 7.347215719366371, + "learning_rate": 1.914890457191083e-05, + "loss": 0.7243, + "step": 6380 + }, + { + "epoch": 0.41737426518615284, + "grad_norm": 4.592029067528999, + "learning_rate": 1.91190161313134e-05, + "loss": 0.6766, + "step": 6390 + }, + { + "epoch": 0.41802743305029394, + "grad_norm": 5.2659124188411335, + "learning_rate": 1.9089109992728253e-05, + "loss": 0.6972, + "step": 6400 + }, + { + "epoch": 0.418680600914435, + "grad_norm": 3.8701089324208575, + "learning_rate": 1.9059186284651714e-05, + "loss": 0.6983, + "step": 6410 + }, + { + "epoch": 0.4193337687785761, + "grad_norm": 9.765641406236218, + "learning_rate": 1.902924513565561e-05, + "loss": 0.713, + "step": 6420 + }, + { + "epoch": 0.4199869366427172, + "grad_norm": 47.84550535643343, + "learning_rate": 1.8999286674386712e-05, + "loss": 0.7405, + "step": 6430 + }, + { + "epoch": 0.42064010450685824, + "grad_norm": 4.640806868871516, + "learning_rate": 1.8969311029566158e-05, + "loss": 0.7386, + "step": 6440 + }, + { + "epoch": 0.42129327237099934, + "grad_norm": 4.9456925811064645, + "learning_rate": 1.8939318329988924e-05, + "loss": 0.6878, + "step": 6450 + }, + { + "epoch": 0.42194644023514044, + "grad_norm": 3.7569417916267196, + "learning_rate": 1.890930870452327e-05, + "loss": 0.719, + "step": 6460 + }, + { + "epoch": 0.42259960809928154, + "grad_norm": 4.046929672048653, + "learning_rate": 1.8879282282110183e-05, + "loss": 0.6635, + "step": 6470 + }, + { + "epoch": 0.4232527759634226, + "grad_norm": 3.241109572317114, + "learning_rate": 1.8849239191762807e-05, + "loss": 0.7271, + "step": 6480 + }, + { + "epoch": 0.4239059438275637, + "grad_norm": 3.5379414582873, + "learning_rate": 1.881917956256591e-05, + "loss": 0.7063, + "step": 6490 + }, + { + "epoch": 0.4245591116917048, + "grad_norm": 5.3498102814022355, + "learning_rate": 1.878910352367533e-05, + "loss": 0.666, + "step": 6500 + }, + { + "epoch": 0.42521227955584584, + "grad_norm": 3.062766083037805, + "learning_rate": 1.8759011204317403e-05, + "loss": 0.6783, + "step": 6510 + }, + { + "epoch": 0.42586544741998694, + "grad_norm": 11.9116282093037, + "learning_rate": 1.872890273378841e-05, + "loss": 0.6585, + "step": 6520 + }, + { + "epoch": 0.42651861528412804, + "grad_norm": 18.417492786135245, + "learning_rate": 1.8698778241454048e-05, + "loss": 0.6759, + "step": 6530 + }, + { + "epoch": 0.4271717831482691, + "grad_norm": 8.22022213823648, + "learning_rate": 1.8668637856748826e-05, + "loss": 0.7064, + "step": 6540 + }, + { + "epoch": 0.4278249510124102, + "grad_norm": 9.406437361869447, + "learning_rate": 1.8638481709175566e-05, + "loss": 0.7451, + "step": 6550 + }, + { + "epoch": 0.4284781188765513, + "grad_norm": 24.665543539235813, + "learning_rate": 1.8608309928304797e-05, + "loss": 0.7097, + "step": 6560 + }, + { + "epoch": 0.42913128674069234, + "grad_norm": 6.918551299318292, + "learning_rate": 1.857812264377423e-05, + "loss": 0.7246, + "step": 6570 + }, + { + "epoch": 0.42978445460483344, + "grad_norm": 4.661685101269544, + "learning_rate": 1.8547919985288183e-05, + "loss": 0.7204, + "step": 6580 + }, + { + "epoch": 0.43043762246897455, + "grad_norm": 6.459041072664668, + "learning_rate": 1.851770208261704e-05, + "loss": 0.6513, + "step": 6590 + }, + { + "epoch": 0.4310907903331156, + "grad_norm": 15.875963076716992, + "learning_rate": 1.8487469065596668e-05, + "loss": 0.7152, + "step": 6600 + }, + { + "epoch": 0.4317439581972567, + "grad_norm": 5.891032642456025, + "learning_rate": 1.845722106412789e-05, + "loss": 0.6918, + "step": 6610 + }, + { + "epoch": 0.4323971260613978, + "grad_norm": 5.232261663805651, + "learning_rate": 1.842695820817591e-05, + "loss": 0.6852, + "step": 6620 + }, + { + "epoch": 0.43305029392553884, + "grad_norm": 12.336035217709867, + "learning_rate": 1.8396680627769753e-05, + "loss": 0.7119, + "step": 6630 + }, + { + "epoch": 0.43370346178967994, + "grad_norm": 11.538495193823614, + "learning_rate": 1.8366388453001702e-05, + "loss": 0.7276, + "step": 6640 + }, + { + "epoch": 0.43435662965382105, + "grad_norm": 2.5758268343611985, + "learning_rate": 1.833608181402676e-05, + "loss": 0.6689, + "step": 6650 + }, + { + "epoch": 0.4350097975179621, + "grad_norm": 4.745025640163145, + "learning_rate": 1.830576084106208e-05, + "loss": 0.7138, + "step": 6660 + }, + { + "epoch": 0.4356629653821032, + "grad_norm": 5.142215007475423, + "learning_rate": 1.8275425664386385e-05, + "loss": 0.6803, + "step": 6670 + }, + { + "epoch": 0.4363161332462443, + "grad_norm": 5.252139790751175, + "learning_rate": 1.8245076414339438e-05, + "loss": 0.6771, + "step": 6680 + }, + { + "epoch": 0.43696930111038534, + "grad_norm": 5.820963942150845, + "learning_rate": 1.821471322132148e-05, + "loss": 0.7338, + "step": 6690 + }, + { + "epoch": 0.43762246897452645, + "grad_norm": 13.496280864296235, + "learning_rate": 1.8184336215792644e-05, + "loss": 0.6952, + "step": 6700 + }, + { + "epoch": 0.43827563683866755, + "grad_norm": 9.016756353678497, + "learning_rate": 1.8153945528272415e-05, + "loss": 0.7139, + "step": 6710 + }, + { + "epoch": 0.4389288047028086, + "grad_norm": 4.822974955108365, + "learning_rate": 1.8123541289339068e-05, + "loss": 0.7025, + "step": 6720 + }, + { + "epoch": 0.4395819725669497, + "grad_norm": 4.348630290498651, + "learning_rate": 1.8093123629629105e-05, + "loss": 0.7191, + "step": 6730 + }, + { + "epoch": 0.4402351404310908, + "grad_norm": 4.015291548959369, + "learning_rate": 1.8062692679836684e-05, + "loss": 0.707, + "step": 6740 + }, + { + "epoch": 0.4408883082952319, + "grad_norm": 5.595847306715769, + "learning_rate": 1.803224857071307e-05, + "loss": 0.708, + "step": 6750 + }, + { + "epoch": 0.44154147615937295, + "grad_norm": 6.389126553007975, + "learning_rate": 1.8001791433066082e-05, + "loss": 0.6708, + "step": 6760 + }, + { + "epoch": 0.44219464402351405, + "grad_norm": 8.602448545134093, + "learning_rate": 1.7971321397759495e-05, + "loss": 0.7064, + "step": 6770 + }, + { + "epoch": 0.44284781188765515, + "grad_norm": 9.192310968011101, + "learning_rate": 1.7940838595712522e-05, + "loss": 0.6585, + "step": 6780 + }, + { + "epoch": 0.4435009797517962, + "grad_norm": 6.114239953281097, + "learning_rate": 1.791034315789921e-05, + "loss": 0.7215, + "step": 6790 + }, + { + "epoch": 0.4441541476159373, + "grad_norm": 5.817934510545314, + "learning_rate": 1.7879835215347915e-05, + "loss": 0.7057, + "step": 6800 + }, + { + "epoch": 0.4448073154800784, + "grad_norm": 5.461675111129483, + "learning_rate": 1.784931489914072e-05, + "loss": 0.6925, + "step": 6810 + }, + { + "epoch": 0.44546048334421945, + "grad_norm": 13.13059369734387, + "learning_rate": 1.781878234041286e-05, + "loss": 0.6884, + "step": 6820 + }, + { + "epoch": 0.44611365120836055, + "grad_norm": 7.136465651727822, + "learning_rate": 1.7788237670352176e-05, + "loss": 0.737, + "step": 6830 + }, + { + "epoch": 0.44676681907250165, + "grad_norm": 13.671072521091867, + "learning_rate": 1.7757681020198557e-05, + "loss": 0.7404, + "step": 6840 + }, + { + "epoch": 0.4474199869366427, + "grad_norm": 5.397882237012338, + "learning_rate": 1.7727112521243362e-05, + "loss": 0.7256, + "step": 6850 + }, + { + "epoch": 0.4480731548007838, + "grad_norm": 6.12908954723118, + "learning_rate": 1.769653230482886e-05, + "loss": 0.6802, + "step": 6860 + }, + { + "epoch": 0.4487263226649249, + "grad_norm": 3.6504708286902066, + "learning_rate": 1.7665940502347654e-05, + "loss": 0.7045, + "step": 6870 + }, + { + "epoch": 0.44937949052906595, + "grad_norm": 58.966599142676735, + "learning_rate": 1.763533724524215e-05, + "loss": 0.741, + "step": 6880 + }, + { + "epoch": 0.45003265839320705, + "grad_norm": 3.9881784274514915, + "learning_rate": 1.760472266500396e-05, + "loss": 0.7162, + "step": 6890 + }, + { + "epoch": 0.45068582625734815, + "grad_norm": 4.002812827071041, + "learning_rate": 1.7574096893173336e-05, + "loss": 0.724, + "step": 6900 + }, + { + "epoch": 0.4513389941214892, + "grad_norm": 4.939553714933158, + "learning_rate": 1.7543460061338636e-05, + "loss": 0.6901, + "step": 6910 + }, + { + "epoch": 0.4519921619856303, + "grad_norm": 5.030454395011258, + "learning_rate": 1.7512812301135726e-05, + "loss": 0.7254, + "step": 6920 + }, + { + "epoch": 0.4526453298497714, + "grad_norm": 22.27855846953453, + "learning_rate": 1.748215374424744e-05, + "loss": 0.7051, + "step": 6930 + }, + { + "epoch": 0.45329849771391245, + "grad_norm": 7.470444746348905, + "learning_rate": 1.7451484522402983e-05, + "loss": 0.7057, + "step": 6940 + }, + { + "epoch": 0.45395166557805355, + "grad_norm": 11.682170774889052, + "learning_rate": 1.7420804767377398e-05, + "loss": 0.7149, + "step": 6950 + }, + { + "epoch": 0.45460483344219466, + "grad_norm": 5.335499780442737, + "learning_rate": 1.739011461099098e-05, + "loss": 0.7296, + "step": 6960 + }, + { + "epoch": 0.4552580013063357, + "grad_norm": 4.741493237122223, + "learning_rate": 1.7359414185108727e-05, + "loss": 0.6507, + "step": 6970 + }, + { + "epoch": 0.4559111691704768, + "grad_norm": 3.041945945425016, + "learning_rate": 1.7328703621639737e-05, + "loss": 0.7245, + "step": 6980 + }, + { + "epoch": 0.4565643370346179, + "grad_norm": 4.6202574462500365, + "learning_rate": 1.7297983052536683e-05, + "loss": 0.6651, + "step": 6990 + }, + { + "epoch": 0.457217504898759, + "grad_norm": 11.336858163854028, + "learning_rate": 1.7267252609795236e-05, + "loss": 0.7212, + "step": 7000 + }, + { + "epoch": 0.45787067276290006, + "grad_norm": 6.600581819725192, + "learning_rate": 1.723651242545347e-05, + "loss": 0.7195, + "step": 7010 + }, + { + "epoch": 0.45852384062704116, + "grad_norm": 7.634299938630818, + "learning_rate": 1.7205762631591323e-05, + "loss": 0.6876, + "step": 7020 + }, + { + "epoch": 0.45917700849118226, + "grad_norm": 5.545937214410351, + "learning_rate": 1.7175003360330027e-05, + "loss": 0.7441, + "step": 7030 + }, + { + "epoch": 0.4598301763553233, + "grad_norm": 2.453391382612166, + "learning_rate": 1.7144234743831538e-05, + "loss": 0.7199, + "step": 7040 + }, + { + "epoch": 0.4604833442194644, + "grad_norm": 5.482371080990207, + "learning_rate": 1.7113456914297956e-05, + "loss": 0.7049, + "step": 7050 + }, + { + "epoch": 0.4611365120836055, + "grad_norm": 5.677867067700385, + "learning_rate": 1.7082670003970968e-05, + "loss": 0.7171, + "step": 7060 + }, + { + "epoch": 0.46178967994774656, + "grad_norm": 6.625917766987666, + "learning_rate": 1.7051874145131276e-05, + "loss": 0.7625, + "step": 7070 + }, + { + "epoch": 0.46244284781188766, + "grad_norm": 3.9787150554458712, + "learning_rate": 1.7021069470098048e-05, + "loss": 0.6928, + "step": 7080 + }, + { + "epoch": 0.46309601567602876, + "grad_norm": 6.607647609209365, + "learning_rate": 1.6990256111228306e-05, + "loss": 0.7183, + "step": 7090 + }, + { + "epoch": 0.4637491835401698, + "grad_norm": 26.05901856805948, + "learning_rate": 1.69594342009164e-05, + "loss": 0.7199, + "step": 7100 + }, + { + "epoch": 0.4644023514043109, + "grad_norm": 2.609610541214694, + "learning_rate": 1.6928603871593417e-05, + "loss": 0.6709, + "step": 7110 + }, + { + "epoch": 0.465055519268452, + "grad_norm": 3.437628310149699, + "learning_rate": 1.6897765255726626e-05, + "loss": 0.6821, + "step": 7120 + }, + { + "epoch": 0.46570868713259306, + "grad_norm": 3.694418672043116, + "learning_rate": 1.6866918485818883e-05, + "loss": 0.7009, + "step": 7130 + }, + { + "epoch": 0.46636185499673416, + "grad_norm": 5.064209072648378, + "learning_rate": 1.6836063694408095e-05, + "loss": 0.7264, + "step": 7140 + }, + { + "epoch": 0.46701502286087526, + "grad_norm": 4.097679523587385, + "learning_rate": 1.680520101406663e-05, + "loss": 0.7179, + "step": 7150 + }, + { + "epoch": 0.4676681907250163, + "grad_norm": 6.233472103811337, + "learning_rate": 1.6774330577400752e-05, + "loss": 0.6983, + "step": 7160 + }, + { + "epoch": 0.4683213585891574, + "grad_norm": 5.792189357233483, + "learning_rate": 1.6743452517050048e-05, + "loss": 0.7063, + "step": 7170 + }, + { + "epoch": 0.4689745264532985, + "grad_norm": 7.292501637926463, + "learning_rate": 1.6712566965686864e-05, + "loss": 0.7, + "step": 7180 + }, + { + "epoch": 0.46962769431743956, + "grad_norm": 10.071292424078878, + "learning_rate": 1.6681674056015738e-05, + "loss": 0.7445, + "step": 7190 + }, + { + "epoch": 0.47028086218158066, + "grad_norm": 3.1886962441070827, + "learning_rate": 1.6650773920772813e-05, + "loss": 0.726, + "step": 7200 + }, + { + "epoch": 0.47093403004572176, + "grad_norm": 4.3591848899379215, + "learning_rate": 1.661986669272528e-05, + "loss": 0.7052, + "step": 7210 + }, + { + "epoch": 0.4715871979098628, + "grad_norm": 6.972903805566907, + "learning_rate": 1.658895250467081e-05, + "loss": 0.7293, + "step": 7220 + }, + { + "epoch": 0.4722403657740039, + "grad_norm": 4.495721902724383, + "learning_rate": 1.6558031489436987e-05, + "loss": 0.6583, + "step": 7230 + }, + { + "epoch": 0.472893533638145, + "grad_norm": 3.427324388987157, + "learning_rate": 1.652710377988071e-05, + "loss": 0.6369, + "step": 7240 + }, + { + "epoch": 0.47354670150228606, + "grad_norm": 2.2380193054416533, + "learning_rate": 1.6496169508887645e-05, + "loss": 0.6559, + "step": 7250 + }, + { + "epoch": 0.47419986936642716, + "grad_norm": 5.534199246287345, + "learning_rate": 1.6465228809371666e-05, + "loss": 0.7328, + "step": 7260 + }, + { + "epoch": 0.47485303723056826, + "grad_norm": 5.194906990459634, + "learning_rate": 1.6434281814274257e-05, + "loss": 0.7506, + "step": 7270 + }, + { + "epoch": 0.47550620509470937, + "grad_norm": 25.079647296086396, + "learning_rate": 1.6403328656563948e-05, + "loss": 0.6865, + "step": 7280 + }, + { + "epoch": 0.4761593729588504, + "grad_norm": 29.382249562541475, + "learning_rate": 1.6372369469235756e-05, + "loss": 0.705, + "step": 7290 + }, + { + "epoch": 0.4768125408229915, + "grad_norm": 6.934988306059051, + "learning_rate": 1.6341404385310592e-05, + "loss": 0.7332, + "step": 7300 + }, + { + "epoch": 0.4774657086871326, + "grad_norm": 5.0690173359655555, + "learning_rate": 1.631043353783473e-05, + "loss": 0.7018, + "step": 7310 + }, + { + "epoch": 0.47811887655127366, + "grad_norm": 6.086800862922415, + "learning_rate": 1.6279457059879173e-05, + "loss": 0.7089, + "step": 7320 + }, + { + "epoch": 0.47877204441541477, + "grad_norm": 3.6627542400974775, + "learning_rate": 1.6248475084539137e-05, + "loss": 0.7287, + "step": 7330 + }, + { + "epoch": 0.47942521227955587, + "grad_norm": 9.57059828973796, + "learning_rate": 1.6217487744933466e-05, + "loss": 0.7164, + "step": 7340 + }, + { + "epoch": 0.4800783801436969, + "grad_norm": 5.553447257648302, + "learning_rate": 1.618649517420403e-05, + "loss": 0.7384, + "step": 7350 + }, + { + "epoch": 0.480731548007838, + "grad_norm": 17.982629659825115, + "learning_rate": 1.615549750551519e-05, + "loss": 0.7176, + "step": 7360 + }, + { + "epoch": 0.4813847158719791, + "grad_norm": 5.054362597628958, + "learning_rate": 1.6124494872053204e-05, + "loss": 0.6956, + "step": 7370 + }, + { + "epoch": 0.48203788373612017, + "grad_norm": 5.3668664117133345, + "learning_rate": 1.609348740702567e-05, + "loss": 0.6896, + "step": 7380 + }, + { + "epoch": 0.48269105160026127, + "grad_norm": 6.833581067068372, + "learning_rate": 1.6062475243660942e-05, + "loss": 0.6546, + "step": 7390 + }, + { + "epoch": 0.48334421946440237, + "grad_norm": 8.426135738805158, + "learning_rate": 1.6031458515207552e-05, + "loss": 0.7207, + "step": 7400 + }, + { + "epoch": 0.4839973873285434, + "grad_norm": 6.445906991995997, + "learning_rate": 1.6000437354933664e-05, + "loss": 0.6955, + "step": 7410 + }, + { + "epoch": 0.4846505551926845, + "grad_norm": 9.185210273507879, + "learning_rate": 1.5969411896126465e-05, + "loss": 0.7361, + "step": 7420 + }, + { + "epoch": 0.4853037230568256, + "grad_norm": 9.628498580999864, + "learning_rate": 1.5938382272091635e-05, + "loss": 0.6671, + "step": 7430 + }, + { + "epoch": 0.48595689092096667, + "grad_norm": 18.780973765647442, + "learning_rate": 1.5907348616152722e-05, + "loss": 0.6964, + "step": 7440 + }, + { + "epoch": 0.48661005878510777, + "grad_norm": 3.4109938598953313, + "learning_rate": 1.5876311061650625e-05, + "loss": 0.7258, + "step": 7450 + }, + { + "epoch": 0.48726322664924887, + "grad_norm": 5.148159268491969, + "learning_rate": 1.584526974194297e-05, + "loss": 0.7413, + "step": 7460 + }, + { + "epoch": 0.4879163945133899, + "grad_norm": 5.825994107139637, + "learning_rate": 1.581422479040358e-05, + "loss": 0.6995, + "step": 7470 + }, + { + "epoch": 0.488569562377531, + "grad_norm": 8.15661433199093, + "learning_rate": 1.5783176340421877e-05, + "loss": 0.737, + "step": 7480 + }, + { + "epoch": 0.4892227302416721, + "grad_norm": 2.488291310719702, + "learning_rate": 1.57521245254023e-05, + "loss": 0.6931, + "step": 7490 + }, + { + "epoch": 0.48987589810581317, + "grad_norm": 4.250185794135119, + "learning_rate": 1.572106947876377e-05, + "loss": 0.6854, + "step": 7500 + }, + { + "epoch": 0.49052906596995427, + "grad_norm": 3.8635990640808355, + "learning_rate": 1.5690011333939074e-05, + "loss": 0.7205, + "step": 7510 + }, + { + "epoch": 0.49118223383409537, + "grad_norm": 9.085150254077263, + "learning_rate": 1.565895022437432e-05, + "loss": 0.7455, + "step": 7520 + }, + { + "epoch": 0.4918354016982365, + "grad_norm": 4.516782829977407, + "learning_rate": 1.562788628352836e-05, + "loss": 0.6965, + "step": 7530 + }, + { + "epoch": 0.4924885695623775, + "grad_norm": 4.2329680297711265, + "learning_rate": 1.5596819644872195e-05, + "loss": 0.7156, + "step": 7540 + }, + { + "epoch": 0.4931417374265186, + "grad_norm": 13.379779282646771, + "learning_rate": 1.556575044188843e-05, + "loss": 0.688, + "step": 7550 + }, + { + "epoch": 0.4937949052906597, + "grad_norm": 10.879140832861319, + "learning_rate": 1.553467880807069e-05, + "loss": 0.6622, + "step": 7560 + }, + { + "epoch": 0.49444807315480077, + "grad_norm": 4.213562665524711, + "learning_rate": 1.5503604876923035e-05, + "loss": 0.7061, + "step": 7570 + }, + { + "epoch": 0.4951012410189419, + "grad_norm": 3.7923437638813424, + "learning_rate": 1.5472528781959402e-05, + "loss": 0.7046, + "step": 7580 + }, + { + "epoch": 0.495754408883083, + "grad_norm": 4.996725536064961, + "learning_rate": 1.5441450656703012e-05, + "loss": 0.6894, + "step": 7590 + }, + { + "epoch": 0.496407576747224, + "grad_norm": 13.749534460336342, + "learning_rate": 1.5410370634685835e-05, + "loss": 0.7193, + "step": 7600 + }, + { + "epoch": 0.4970607446113651, + "grad_norm": 14.370136532914689, + "learning_rate": 1.5379288849447964e-05, + "loss": 0.7396, + "step": 7610 + }, + { + "epoch": 0.4977139124755062, + "grad_norm": 2.9206834291164174, + "learning_rate": 1.5348205434537098e-05, + "loss": 0.6699, + "step": 7620 + }, + { + "epoch": 0.4983670803396473, + "grad_norm": 22.590699893533344, + "learning_rate": 1.5317120523507904e-05, + "loss": 0.706, + "step": 7630 + }, + { + "epoch": 0.4990202482037884, + "grad_norm": 39.07888094030937, + "learning_rate": 1.5286034249921495e-05, + "loss": 0.7308, + "step": 7640 + }, + { + "epoch": 0.4996734160679295, + "grad_norm": 6.180342416505246, + "learning_rate": 1.5254946747344843e-05, + "loss": 0.7015, + "step": 7650 + }, + { + "epoch": 0.5003265839320705, + "grad_norm": 3.506114659272752, + "learning_rate": 1.52238581493502e-05, + "loss": 0.6883, + "step": 7660 + }, + { + "epoch": 0.5009797517962117, + "grad_norm": 3.506978434952958, + "learning_rate": 1.5192768589514508e-05, + "loss": 0.7279, + "step": 7670 + }, + { + "epoch": 0.5016329196603527, + "grad_norm": 6.328901379843207, + "learning_rate": 1.5161678201418857e-05, + "loss": 0.7585, + "step": 7680 + }, + { + "epoch": 0.5022860875244938, + "grad_norm": 3.59766393737303, + "learning_rate": 1.5130587118647891e-05, + "loss": 0.6758, + "step": 7690 + }, + { + "epoch": 0.5029392553886349, + "grad_norm": 3.098792850125791, + "learning_rate": 1.5099495474789243e-05, + "loss": 0.7318, + "step": 7700 + }, + { + "epoch": 0.503592423252776, + "grad_norm": 4.313613747783113, + "learning_rate": 1.5068403403432948e-05, + "loss": 0.6997, + "step": 7710 + }, + { + "epoch": 0.504245591116917, + "grad_norm": 4.960621068279318, + "learning_rate": 1.5037311038170888e-05, + "loss": 0.6933, + "step": 7720 + }, + { + "epoch": 0.5048987589810582, + "grad_norm": 4.50244603965065, + "learning_rate": 1.5006218512596204e-05, + "loss": 0.7122, + "step": 7730 + }, + { + "epoch": 0.5055519268451992, + "grad_norm": 5.402529456090525, + "learning_rate": 1.4975125960302718e-05, + "loss": 0.7356, + "step": 7740 + }, + { + "epoch": 0.5062050947093403, + "grad_norm": 5.251731768589659, + "learning_rate": 1.4944033514884378e-05, + "loss": 0.7091, + "step": 7750 + }, + { + "epoch": 0.5068582625734814, + "grad_norm": 4.527582618132294, + "learning_rate": 1.4912941309934673e-05, + "loss": 0.7349, + "step": 7760 + }, + { + "epoch": 0.5075114304376225, + "grad_norm": 5.063886664473719, + "learning_rate": 1.4881849479046042e-05, + "loss": 0.7231, + "step": 7770 + }, + { + "epoch": 0.5081645983017635, + "grad_norm": 7.340580162798091, + "learning_rate": 1.485075815580934e-05, + "loss": 0.6708, + "step": 7780 + }, + { + "epoch": 0.5088177661659047, + "grad_norm": 8.279851370689158, + "learning_rate": 1.481966747381323e-05, + "loss": 0.6817, + "step": 7790 + }, + { + "epoch": 0.5094709340300457, + "grad_norm": 4.007759673902586, + "learning_rate": 1.4788577566643612e-05, + "loss": 0.7361, + "step": 7800 + }, + { + "epoch": 0.5101241018941868, + "grad_norm": 3.607676977732233, + "learning_rate": 1.4757488567883066e-05, + "loss": 0.6834, + "step": 7810 + }, + { + "epoch": 0.5107772697583279, + "grad_norm": 6.825186579463042, + "learning_rate": 1.472640061111027e-05, + "loss": 0.6966, + "step": 7820 + }, + { + "epoch": 0.511430437622469, + "grad_norm": 4.199100198181181, + "learning_rate": 1.4695313829899421e-05, + "loss": 0.7256, + "step": 7830 + }, + { + "epoch": 0.51208360548661, + "grad_norm": 3.8617562426661523, + "learning_rate": 1.4664228357819667e-05, + "loss": 0.7319, + "step": 7840 + }, + { + "epoch": 0.5127367733507512, + "grad_norm": 59.94700214694851, + "learning_rate": 1.4633144328434534e-05, + "loss": 0.7075, + "step": 7850 + }, + { + "epoch": 0.5133899412148922, + "grad_norm": 4.66758709867641, + "learning_rate": 1.4602061875301339e-05, + "loss": 0.6796, + "step": 7860 + }, + { + "epoch": 0.5140431090790333, + "grad_norm": 9.719456098118954, + "learning_rate": 1.4570981131970636e-05, + "loss": 0.7226, + "step": 7870 + }, + { + "epoch": 0.5146962769431744, + "grad_norm": 5.463175870167361, + "learning_rate": 1.4539902231985631e-05, + "loss": 0.6783, + "step": 7880 + }, + { + "epoch": 0.5153494448073155, + "grad_norm": 5.382574555075144, + "learning_rate": 1.4508825308881605e-05, + "loss": 0.6674, + "step": 7890 + }, + { + "epoch": 0.5160026126714565, + "grad_norm": 4.868251531153773, + "learning_rate": 1.4477750496185348e-05, + "loss": 0.7024, + "step": 7900 + }, + { + "epoch": 0.5166557805355977, + "grad_norm": 3.7294213547214325, + "learning_rate": 1.4446677927414587e-05, + "loss": 0.7643, + "step": 7910 + }, + { + "epoch": 0.5173089483997387, + "grad_norm": 3.3941905879029948, + "learning_rate": 1.44156077360774e-05, + "loss": 0.6935, + "step": 7920 + }, + { + "epoch": 0.5179621162638798, + "grad_norm": 11.955624228102595, + "learning_rate": 1.4384540055671652e-05, + "loss": 0.6563, + "step": 7930 + }, + { + "epoch": 0.5186152841280209, + "grad_norm": 4.223751917576418, + "learning_rate": 1.4353475019684431e-05, + "loss": 0.709, + "step": 7940 + }, + { + "epoch": 0.519268451992162, + "grad_norm": 3.743270303247067, + "learning_rate": 1.4322412761591441e-05, + "loss": 0.6992, + "step": 7950 + }, + { + "epoch": 0.519921619856303, + "grad_norm": 5.432489476270528, + "learning_rate": 1.4291353414856466e-05, + "loss": 0.6729, + "step": 7960 + }, + { + "epoch": 0.5205747877204442, + "grad_norm": 4.5886135319405374, + "learning_rate": 1.4260297112930774e-05, + "loss": 0.654, + "step": 7970 + }, + { + "epoch": 0.5212279555845852, + "grad_norm": 3.5771128459787525, + "learning_rate": 1.4229243989252554e-05, + "loss": 0.6779, + "step": 7980 + }, + { + "epoch": 0.5218811234487263, + "grad_norm": 7.088599397946011, + "learning_rate": 1.4198194177246343e-05, + "loss": 0.7128, + "step": 7990 + }, + { + "epoch": 0.5225342913128674, + "grad_norm": 10.356939935810402, + "learning_rate": 1.4167147810322438e-05, + "loss": 0.718, + "step": 8000 + }, + { + "epoch": 0.5231874591770085, + "grad_norm": 7.230293589180331, + "learning_rate": 1.4136105021876346e-05, + "loss": 0.6918, + "step": 8010 + }, + { + "epoch": 0.5238406270411495, + "grad_norm": 3.9909166676063683, + "learning_rate": 1.4105065945288196e-05, + "loss": 0.7007, + "step": 8020 + }, + { + "epoch": 0.5244937949052907, + "grad_norm": 5.794003497753727, + "learning_rate": 1.4074030713922151e-05, + "loss": 0.6887, + "step": 8030 + }, + { + "epoch": 0.5251469627694317, + "grad_norm": 2.9783249490709203, + "learning_rate": 1.4042999461125876e-05, + "loss": 0.7596, + "step": 8040 + }, + { + "epoch": 0.5258001306335728, + "grad_norm": 26.692386939122034, + "learning_rate": 1.4011972320229934e-05, + "loss": 0.6974, + "step": 8050 + }, + { + "epoch": 0.5264532984977139, + "grad_norm": 2.6819712524637076, + "learning_rate": 1.398094942454721e-05, + "loss": 0.6884, + "step": 8060 + }, + { + "epoch": 0.527106466361855, + "grad_norm": 4.990492364294924, + "learning_rate": 1.3949930907372363e-05, + "loss": 0.6969, + "step": 8070 + }, + { + "epoch": 0.527759634225996, + "grad_norm": 5.327689323623585, + "learning_rate": 1.3918916901981234e-05, + "loss": 0.7193, + "step": 8080 + }, + { + "epoch": 0.5284128020901372, + "grad_norm": 2.7492206509522057, + "learning_rate": 1.3887907541630272e-05, + "loss": 0.7205, + "step": 8090 + }, + { + "epoch": 0.5290659699542782, + "grad_norm": 2.8357777616212565, + "learning_rate": 1.3856902959555987e-05, + "loss": 0.7318, + "step": 8100 + }, + { + "epoch": 0.5297191378184193, + "grad_norm": 3.9368304939861085, + "learning_rate": 1.3825903288974329e-05, + "loss": 0.6837, + "step": 8110 + }, + { + "epoch": 0.5303723056825604, + "grad_norm": 3.977284424009092, + "learning_rate": 1.3794908663080165e-05, + "loss": 0.6738, + "step": 8120 + }, + { + "epoch": 0.5310254735467015, + "grad_norm": 8.306483794883848, + "learning_rate": 1.376391921504669e-05, + "loss": 0.6866, + "step": 8130 + }, + { + "epoch": 0.5316786414108425, + "grad_norm": 14.970355431587016, + "learning_rate": 1.3732935078024839e-05, + "loss": 0.688, + "step": 8140 + }, + { + "epoch": 0.5323318092749837, + "grad_norm": 3.481786339363183, + "learning_rate": 1.3701956385142732e-05, + "loss": 0.6571, + "step": 8150 + }, + { + "epoch": 0.5329849771391247, + "grad_norm": 4.091950698182643, + "learning_rate": 1.3670983269505098e-05, + "loss": 0.6786, + "step": 8160 + }, + { + "epoch": 0.5336381450032658, + "grad_norm": 4.133504020078944, + "learning_rate": 1.3640015864192709e-05, + "loss": 0.7041, + "step": 8170 + }, + { + "epoch": 0.5342913128674069, + "grad_norm": 4.756759050876008, + "learning_rate": 1.3609054302261787e-05, + "loss": 0.718, + "step": 8180 + }, + { + "epoch": 0.534944480731548, + "grad_norm": 4.278232151247267, + "learning_rate": 1.3578098716743457e-05, + "loss": 0.7106, + "step": 8190 + }, + { + "epoch": 0.5355976485956891, + "grad_norm": 2.0249684555045726, + "learning_rate": 1.3547149240643165e-05, + "loss": 0.6737, + "step": 8200 + }, + { + "epoch": 0.5362508164598302, + "grad_norm": 5.223443576125692, + "learning_rate": 1.3516206006940108e-05, + "loss": 0.6646, + "step": 8210 + }, + { + "epoch": 0.5369039843239712, + "grad_norm": 4.667809596201585, + "learning_rate": 1.3485269148586655e-05, + "loss": 0.7258, + "step": 8220 + }, + { + "epoch": 0.5375571521881124, + "grad_norm": 7.584805216491663, + "learning_rate": 1.3454338798507793e-05, + "loss": 0.7059, + "step": 8230 + }, + { + "epoch": 0.5382103200522534, + "grad_norm": 3.369443911847391, + "learning_rate": 1.3423415089600531e-05, + "loss": 0.718, + "step": 8240 + }, + { + "epoch": 0.5388634879163945, + "grad_norm": 2.1903022256230056, + "learning_rate": 1.3392498154733359e-05, + "loss": 0.6787, + "step": 8250 + }, + { + "epoch": 0.5395166557805356, + "grad_norm": 16.6815682487673, + "learning_rate": 1.3361588126745646e-05, + "loss": 0.7269, + "step": 8260 + }, + { + "epoch": 0.5401698236446767, + "grad_norm": 3.702406157567106, + "learning_rate": 1.3330685138447095e-05, + "loss": 0.7176, + "step": 8270 + }, + { + "epoch": 0.5408229915088177, + "grad_norm": 3.4301661546239473, + "learning_rate": 1.3299789322617156e-05, + "loss": 0.6611, + "step": 8280 + }, + { + "epoch": 0.5414761593729589, + "grad_norm": 4.73145186322431, + "learning_rate": 1.3268900812004468e-05, + "loss": 0.7399, + "step": 8290 + }, + { + "epoch": 0.5421293272371, + "grad_norm": 6.065859463546294, + "learning_rate": 1.3238019739326275e-05, + "loss": 0.7051, + "step": 8300 + }, + { + "epoch": 0.542782495101241, + "grad_norm": 3.5219919830092383, + "learning_rate": 1.3207146237267866e-05, + "loss": 0.7082, + "step": 8310 + }, + { + "epoch": 0.5434356629653821, + "grad_norm": 5.153036144779057, + "learning_rate": 1.3176280438482007e-05, + "loss": 0.696, + "step": 8320 + }, + { + "epoch": 0.5440888308295232, + "grad_norm": 18.004351619708324, + "learning_rate": 1.3145422475588357e-05, + "loss": 0.7294, + "step": 8330 + }, + { + "epoch": 0.5447419986936642, + "grad_norm": 3.241110896411055, + "learning_rate": 1.3114572481172905e-05, + "loss": 0.6854, + "step": 8340 + }, + { + "epoch": 0.5453951665578054, + "grad_norm": 29.532297848473245, + "learning_rate": 1.3083730587787416e-05, + "loss": 0.7101, + "step": 8350 + }, + { + "epoch": 0.5460483344219464, + "grad_norm": 7.991969846743042, + "learning_rate": 1.305289692794883e-05, + "loss": 0.6968, + "step": 8360 + }, + { + "epoch": 0.5467015022860875, + "grad_norm": 3.3552311379053354, + "learning_rate": 1.3022071634138723e-05, + "loss": 0.6995, + "step": 8370 + }, + { + "epoch": 0.5473546701502287, + "grad_norm": 3.2979728714825955, + "learning_rate": 1.2991254838802722e-05, + "loss": 0.7522, + "step": 8380 + }, + { + "epoch": 0.5480078380143697, + "grad_norm": 14.98352824852725, + "learning_rate": 1.2960446674349939e-05, + "loss": 0.7072, + "step": 8390 + }, + { + "epoch": 0.5486610058785107, + "grad_norm": 8.870482678617899, + "learning_rate": 1.2929647273152407e-05, + "loss": 0.7086, + "step": 8400 + }, + { + "epoch": 0.5493141737426519, + "grad_norm": 13.259488145434055, + "learning_rate": 1.2898856767544486e-05, + "loss": 0.6932, + "step": 8410 + }, + { + "epoch": 0.549967341606793, + "grad_norm": 6.786398062995833, + "learning_rate": 1.286807528982234e-05, + "loss": 0.6994, + "step": 8420 + }, + { + "epoch": 0.550620509470934, + "grad_norm": 21.918977139408003, + "learning_rate": 1.2837302972243331e-05, + "loss": 0.6888, + "step": 8430 + }, + { + "epoch": 0.5512736773350752, + "grad_norm": 5.569225574398577, + "learning_rate": 1.2806539947025465e-05, + "loss": 0.7531, + "step": 8440 + }, + { + "epoch": 0.5519268451992162, + "grad_norm": 7.651003912583063, + "learning_rate": 1.277578634634682e-05, + "loss": 0.6703, + "step": 8450 + }, + { + "epoch": 0.5525800130633572, + "grad_norm": 3.8123580405899546, + "learning_rate": 1.274504230234498e-05, + "loss": 0.7247, + "step": 8460 + }, + { + "epoch": 0.5532331809274984, + "grad_norm": 4.623355057456077, + "learning_rate": 1.2714307947116473e-05, + "loss": 0.7145, + "step": 8470 + }, + { + "epoch": 0.5538863487916394, + "grad_norm": 2.574327944116888, + "learning_rate": 1.2683583412716202e-05, + "loss": 0.6898, + "step": 8480 + }, + { + "epoch": 0.5545395166557805, + "grad_norm": 3.2663389456748404, + "learning_rate": 1.2652868831156846e-05, + "loss": 0.7128, + "step": 8490 + }, + { + "epoch": 0.5551926845199217, + "grad_norm": 4.498572229008819, + "learning_rate": 1.262216433440835e-05, + "loss": 0.6927, + "step": 8500 + }, + { + "epoch": 0.5558458523840627, + "grad_norm": 3.2539273521224508, + "learning_rate": 1.2591470054397316e-05, + "loss": 0.7144, + "step": 8510 + }, + { + "epoch": 0.5564990202482037, + "grad_norm": 6.184850260248766, + "learning_rate": 1.256078612300645e-05, + "loss": 0.7184, + "step": 8520 + }, + { + "epoch": 0.5571521881123449, + "grad_norm": 4.851817445306759, + "learning_rate": 1.253011267207399e-05, + "loss": 0.7139, + "step": 8530 + }, + { + "epoch": 0.557805355976486, + "grad_norm": 4.124875615874441, + "learning_rate": 1.2499449833393147e-05, + "loss": 0.7228, + "step": 8540 + }, + { + "epoch": 0.558458523840627, + "grad_norm": 16.611404251919918, + "learning_rate": 1.2468797738711543e-05, + "loss": 0.7002, + "step": 8550 + }, + { + "epoch": 0.5591116917047682, + "grad_norm": 5.330949767268735, + "learning_rate": 1.2438156519730613e-05, + "loss": 0.6904, + "step": 8560 + }, + { + "epoch": 0.5597648595689092, + "grad_norm": 3.6213033835624406, + "learning_rate": 1.240752630810508e-05, + "loss": 0.7664, + "step": 8570 + }, + { + "epoch": 0.5604180274330502, + "grad_norm": 5.470156417925566, + "learning_rate": 1.2376907235442377e-05, + "loss": 0.6961, + "step": 8580 + }, + { + "epoch": 0.5610711952971914, + "grad_norm": 4.024325315426769, + "learning_rate": 1.2346299433302067e-05, + "loss": 0.7538, + "step": 8590 + }, + { + "epoch": 0.5617243631613325, + "grad_norm": 5.837701451748462, + "learning_rate": 1.2315703033195285e-05, + "loss": 0.7085, + "step": 8600 + }, + { + "epoch": 0.5623775310254735, + "grad_norm": 4.8934203133055, + "learning_rate": 1.228511816658419e-05, + "loss": 0.6848, + "step": 8610 + }, + { + "epoch": 0.5630306988896147, + "grad_norm": 3.7706313667515503, + "learning_rate": 1.2254544964881364e-05, + "loss": 0.7374, + "step": 8620 + }, + { + "epoch": 0.5636838667537557, + "grad_norm": 4.437415968408255, + "learning_rate": 1.2223983559449292e-05, + "loss": 0.6693, + "step": 8630 + }, + { + "epoch": 0.5643370346178967, + "grad_norm": 10.73476128251022, + "learning_rate": 1.2193434081599758e-05, + "loss": 0.7481, + "step": 8640 + }, + { + "epoch": 0.5649902024820379, + "grad_norm": 4.852001422186861, + "learning_rate": 1.2162896662593297e-05, + "loss": 0.6515, + "step": 8650 + }, + { + "epoch": 0.565643370346179, + "grad_norm": 5.824228247078944, + "learning_rate": 1.2132371433638643e-05, + "loss": 0.6588, + "step": 8660 + }, + { + "epoch": 0.56629653821032, + "grad_norm": 5.839446905935213, + "learning_rate": 1.2101858525892147e-05, + "loss": 0.7222, + "step": 8670 + }, + { + "epoch": 0.5669497060744612, + "grad_norm": 3.12927883944411, + "learning_rate": 1.207135807045722e-05, + "loss": 0.676, + "step": 8680 + }, + { + "epoch": 0.5676028739386022, + "grad_norm": 4.3550286177561, + "learning_rate": 1.204087019838377e-05, + "loss": 0.7436, + "step": 8690 + }, + { + "epoch": 0.5682560418027433, + "grad_norm": 4.882541008077315, + "learning_rate": 1.2010395040667642e-05, + "loss": 0.664, + "step": 8700 + }, + { + "epoch": 0.5689092096668844, + "grad_norm": 5.572176970355917, + "learning_rate": 1.1979932728250045e-05, + "loss": 0.7028, + "step": 8710 + }, + { + "epoch": 0.5695623775310255, + "grad_norm": 9.180272587737047, + "learning_rate": 1.1949483392016997e-05, + "loss": 0.7212, + "step": 8720 + }, + { + "epoch": 0.5702155453951666, + "grad_norm": 7.546332108773005, + "learning_rate": 1.1919047162798773e-05, + "loss": 0.6982, + "step": 8730 + }, + { + "epoch": 0.5708687132593077, + "grad_norm": 10.472934899896995, + "learning_rate": 1.1888624171369315e-05, + "loss": 0.6695, + "step": 8740 + }, + { + "epoch": 0.5715218811234487, + "grad_norm": 6.384248494945576, + "learning_rate": 1.1858214548445698e-05, + "loss": 0.6871, + "step": 8750 + }, + { + "epoch": 0.5721750489875899, + "grad_norm": 2.402182869835395, + "learning_rate": 1.1827818424687554e-05, + "loss": 0.7061, + "step": 8760 + }, + { + "epoch": 0.5728282168517309, + "grad_norm": 6.648042445206821, + "learning_rate": 1.1797435930696518e-05, + "loss": 0.7003, + "step": 8770 + }, + { + "epoch": 0.573481384715872, + "grad_norm": 9.54863736482344, + "learning_rate": 1.1767067197015658e-05, + "loss": 0.6984, + "step": 8780 + }, + { + "epoch": 0.5741345525800131, + "grad_norm": 19.88307419400473, + "learning_rate": 1.1736712354128914e-05, + "loss": 0.6891, + "step": 8790 + }, + { + "epoch": 0.5747877204441542, + "grad_norm": 10.644397576568926, + "learning_rate": 1.1706371532460546e-05, + "loss": 0.679, + "step": 8800 + }, + { + "epoch": 0.5754408883082952, + "grad_norm": 2.3336898554043213, + "learning_rate": 1.1676044862374584e-05, + "loss": 0.7158, + "step": 8810 + }, + { + "epoch": 0.5760940561724364, + "grad_norm": 6.048589734364383, + "learning_rate": 1.1645732474174225e-05, + "loss": 0.6977, + "step": 8820 + }, + { + "epoch": 0.5767472240365774, + "grad_norm": 4.169729302491312, + "learning_rate": 1.1615434498101325e-05, + "loss": 0.6741, + "step": 8830 + }, + { + "epoch": 0.5774003919007185, + "grad_norm": 14.92365469180896, + "learning_rate": 1.1585151064335811e-05, + "loss": 0.7175, + "step": 8840 + }, + { + "epoch": 0.5780535597648596, + "grad_norm": 10.701959120465434, + "learning_rate": 1.1554882302995118e-05, + "loss": 0.6797, + "step": 8850 + }, + { + "epoch": 0.5787067276290007, + "grad_norm": 8.957942605184153, + "learning_rate": 1.1524628344133653e-05, + "loss": 0.7207, + "step": 8860 + }, + { + "epoch": 0.5793598954931417, + "grad_norm": 4.787159179029991, + "learning_rate": 1.1494389317742204e-05, + "loss": 0.7303, + "step": 8870 + }, + { + "epoch": 0.5800130633572829, + "grad_norm": 4.314912867368067, + "learning_rate": 1.1464165353747412e-05, + "loss": 0.7074, + "step": 8880 + }, + { + "epoch": 0.5806662312214239, + "grad_norm": 11.290676584978511, + "learning_rate": 1.14339565820112e-05, + "loss": 0.6827, + "step": 8890 + }, + { + "epoch": 0.581319399085565, + "grad_norm": 16.338731976431227, + "learning_rate": 1.1403763132330214e-05, + "loss": 0.7293, + "step": 8900 + }, + { + "epoch": 0.5819725669497061, + "grad_norm": 4.281523981168196, + "learning_rate": 1.1373585134435257e-05, + "loss": 0.6684, + "step": 8910 + }, + { + "epoch": 0.5826257348138472, + "grad_norm": 2.8456531905150486, + "learning_rate": 1.1343422717990753e-05, + "loss": 0.6874, + "step": 8920 + }, + { + "epoch": 0.5832789026779882, + "grad_norm": 52.77777843453033, + "learning_rate": 1.1313276012594174e-05, + "loss": 0.721, + "step": 8930 + }, + { + "epoch": 0.5839320705421294, + "grad_norm": 7.929333909142418, + "learning_rate": 1.1283145147775493e-05, + "loss": 0.6996, + "step": 8940 + }, + { + "epoch": 0.5845852384062704, + "grad_norm": 2.8746395714437747, + "learning_rate": 1.12530302529966e-05, + "loss": 0.725, + "step": 8950 + }, + { + "epoch": 0.5852384062704115, + "grad_norm": 4.914356701994025, + "learning_rate": 1.1222931457650792e-05, + "loss": 0.7305, + "step": 8960 + }, + { + "epoch": 0.5858915741345526, + "grad_norm": 7.837217247347996, + "learning_rate": 1.1192848891062181e-05, + "loss": 0.6508, + "step": 8970 + }, + { + "epoch": 0.5865447419986937, + "grad_norm": 3.4712253021031962, + "learning_rate": 1.1162782682485152e-05, + "loss": 0.6367, + "step": 8980 + }, + { + "epoch": 0.5871979098628347, + "grad_norm": 2.877607904262043, + "learning_rate": 1.1132732961103808e-05, + "loss": 0.6545, + "step": 8990 + }, + { + "epoch": 0.5878510777269759, + "grad_norm": 8.107637605161019, + "learning_rate": 1.11026998560314e-05, + "loss": 0.7188, + "step": 9000 + }, + { + "epoch": 0.5885042455911169, + "grad_norm": 4.6981324726393945, + "learning_rate": 1.1072683496309804e-05, + "loss": 0.7503, + "step": 9010 + }, + { + "epoch": 0.589157413455258, + "grad_norm": 3.394614970378694, + "learning_rate": 1.1042684010908929e-05, + "loss": 0.7021, + "step": 9020 + }, + { + "epoch": 0.5898105813193991, + "grad_norm": 8.911408439658569, + "learning_rate": 1.1012701528726187e-05, + "loss": 0.7031, + "step": 9030 + }, + { + "epoch": 0.5904637491835402, + "grad_norm": 4.013093499233945, + "learning_rate": 1.0982736178585939e-05, + "loss": 0.6802, + "step": 9040 + }, + { + "epoch": 0.5911169170476812, + "grad_norm": 7.257045873716964, + "learning_rate": 1.0952788089238924e-05, + "loss": 0.7254, + "step": 9050 + }, + { + "epoch": 0.5917700849118224, + "grad_norm": 14.982688068983514, + "learning_rate": 1.0922857389361734e-05, + "loss": 0.6788, + "step": 9060 + }, + { + "epoch": 0.5924232527759634, + "grad_norm": 91.51916103165227, + "learning_rate": 1.0892944207556227e-05, + "loss": 0.6519, + "step": 9070 + }, + { + "epoch": 0.5930764206401045, + "grad_norm": 6.205600657688591, + "learning_rate": 1.0863048672349008e-05, + "loss": 0.7368, + "step": 9080 + }, + { + "epoch": 0.5937295885042456, + "grad_norm": 6.509641392641457, + "learning_rate": 1.0833170912190846e-05, + "loss": 0.6625, + "step": 9090 + }, + { + "epoch": 0.5943827563683867, + "grad_norm": 2.6445889487797176, + "learning_rate": 1.0803311055456139e-05, + "loss": 0.7142, + "step": 9100 + }, + { + "epoch": 0.5950359242325277, + "grad_norm": 5.711217703774923, + "learning_rate": 1.0773469230442372e-05, + "loss": 0.6801, + "step": 9110 + }, + { + "epoch": 0.5956890920966689, + "grad_norm": 7.889073950278876, + "learning_rate": 1.074364556536954e-05, + "loss": 0.6735, + "step": 9120 + }, + { + "epoch": 0.5963422599608099, + "grad_norm": 3.353698192285854, + "learning_rate": 1.071384018837962e-05, + "loss": 0.657, + "step": 9130 + }, + { + "epoch": 0.596995427824951, + "grad_norm": 2.6333231897601395, + "learning_rate": 1.0684053227536007e-05, + "loss": 0.7279, + "step": 9140 + }, + { + "epoch": 0.5976485956890921, + "grad_norm": 5.586013814764972, + "learning_rate": 1.0654284810822972e-05, + "loss": 0.71, + "step": 9150 + }, + { + "epoch": 0.5983017635532332, + "grad_norm": 7.257685041180998, + "learning_rate": 1.0624535066145103e-05, + "loss": 0.6757, + "step": 9160 + }, + { + "epoch": 0.5989549314173742, + "grad_norm": 6.444891084708104, + "learning_rate": 1.0594804121326773e-05, + "loss": 0.7004, + "step": 9170 + }, + { + "epoch": 0.5996080992815154, + "grad_norm": 6.434512732255839, + "learning_rate": 1.0565092104111555e-05, + "loss": 0.6643, + "step": 9180 + }, + { + "epoch": 0.6002612671456564, + "grad_norm": 4.260366922972642, + "learning_rate": 1.0535399142161722e-05, + "loss": 0.7153, + "step": 9190 + }, + { + "epoch": 0.6009144350097975, + "grad_norm": 17.9026416385046, + "learning_rate": 1.050572536305765e-05, + "loss": 0.6703, + "step": 9200 + }, + { + "epoch": 0.6015676028739386, + "grad_norm": 6.1533366634295135, + "learning_rate": 1.0476070894297319e-05, + "loss": 0.6978, + "step": 9210 + }, + { + "epoch": 0.6022207707380797, + "grad_norm": 6.951694298470321, + "learning_rate": 1.0446435863295713e-05, + "loss": 0.6933, + "step": 9220 + }, + { + "epoch": 0.6028739386022207, + "grad_norm": 4.479107145165328, + "learning_rate": 1.041682039738432e-05, + "loss": 0.6497, + "step": 9230 + }, + { + "epoch": 0.6035271064663619, + "grad_norm": 7.84728121965074, + "learning_rate": 1.0387224623810553e-05, + "loss": 0.7165, + "step": 9240 + }, + { + "epoch": 0.6041802743305029, + "grad_norm": 3.5520424192576256, + "learning_rate": 1.0357648669737207e-05, + "loss": 0.7187, + "step": 9250 + }, + { + "epoch": 0.6048334421946441, + "grad_norm": 11.145100811330805, + "learning_rate": 1.0328092662241934e-05, + "loss": 0.7626, + "step": 9260 + }, + { + "epoch": 0.6054866100587851, + "grad_norm": 4.628007890145608, + "learning_rate": 1.0298556728316677e-05, + "loss": 0.7583, + "step": 9270 + }, + { + "epoch": 0.6061397779229262, + "grad_norm": 2.4723596379906847, + "learning_rate": 1.0269040994867126e-05, + "loss": 0.7218, + "step": 9280 + }, + { + "epoch": 0.6067929457870673, + "grad_norm": 3.549410413755828, + "learning_rate": 1.023954558871218e-05, + "loss": 0.7301, + "step": 9290 + }, + { + "epoch": 0.6074461136512084, + "grad_norm": 3.8402240987792906, + "learning_rate": 1.0210070636583397e-05, + "loss": 0.7194, + "step": 9300 + }, + { + "epoch": 0.6080992815153494, + "grad_norm": 9.342316890776898, + "learning_rate": 1.0180616265124454e-05, + "loss": 0.6945, + "step": 9310 + }, + { + "epoch": 0.6087524493794906, + "grad_norm": 8.938765856323773, + "learning_rate": 1.0151182600890605e-05, + "loss": 0.6951, + "step": 9320 + }, + { + "epoch": 0.6094056172436316, + "grad_norm": 6.274849282692598, + "learning_rate": 1.012176977034811e-05, + "loss": 0.6848, + "step": 9330 + }, + { + "epoch": 0.6100587851077727, + "grad_norm": 4.769425874462992, + "learning_rate": 1.0092377899873738e-05, + "loss": 0.6752, + "step": 9340 + }, + { + "epoch": 0.6107119529719138, + "grad_norm": 3.329658405780108, + "learning_rate": 1.006300711575419e-05, + "loss": 0.6937, + "step": 9350 + }, + { + "epoch": 0.6113651208360549, + "grad_norm": 5.471425475561112, + "learning_rate": 1.0033657544185567e-05, + "loss": 0.6984, + "step": 9360 + }, + { + "epoch": 0.6120182887001959, + "grad_norm": 4.918442658610056, + "learning_rate": 1.0004329311272832e-05, + "loss": 0.6955, + "step": 9370 + }, + { + "epoch": 0.6126714565643371, + "grad_norm": 4.707819525216195, + "learning_rate": 9.97502254302925e-06, + "loss": 0.6808, + "step": 9380 + }, + { + "epoch": 0.6133246244284781, + "grad_norm": 5.203240104066543, + "learning_rate": 9.945737365375876e-06, + "loss": 0.6939, + "step": 9390 + }, + { + "epoch": 0.6139777922926192, + "grad_norm": 9.333326975502391, + "learning_rate": 9.916473904140984e-06, + "loss": 0.6521, + "step": 9400 + }, + { + "epoch": 0.6146309601567603, + "grad_norm": 3.09437801267708, + "learning_rate": 9.887232285059548e-06, + "loss": 0.6954, + "step": 9410 + }, + { + "epoch": 0.6152841280209014, + "grad_norm": 32.01391680411232, + "learning_rate": 9.85801263377269e-06, + "loss": 0.7049, + "step": 9420 + }, + { + "epoch": 0.6159372958850424, + "grad_norm": 5.260083641866695, + "learning_rate": 9.828815075827148e-06, + "loss": 0.6741, + "step": 9430 + }, + { + "epoch": 0.6165904637491836, + "grad_norm": 4.0416270513002255, + "learning_rate": 9.799639736674729e-06, + "loss": 0.7333, + "step": 9440 + }, + { + "epoch": 0.6172436316133246, + "grad_norm": 5.155925948624099, + "learning_rate": 9.770486741671777e-06, + "loss": 0.7013, + "step": 9450 + }, + { + "epoch": 0.6178967994774657, + "grad_norm": 3.91699790906619, + "learning_rate": 9.74135621607863e-06, + "loss": 0.7214, + "step": 9460 + }, + { + "epoch": 0.6185499673416068, + "grad_norm": 6.112641926933419, + "learning_rate": 9.712248285059079e-06, + "loss": 0.6964, + "step": 9470 + }, + { + "epoch": 0.6192031352057479, + "grad_norm": 3.154827892850623, + "learning_rate": 9.683163073679831e-06, + "loss": 0.6868, + "step": 9480 + }, + { + "epoch": 0.6198563030698889, + "grad_norm": 4.0062924006415805, + "learning_rate": 9.65410070690999e-06, + "loss": 0.671, + "step": 9490 + }, + { + "epoch": 0.6205094709340301, + "grad_norm": 3.703242561631074, + "learning_rate": 9.625061309620487e-06, + "loss": 0.6945, + "step": 9500 + }, + { + "epoch": 0.6211626387981711, + "grad_norm": 6.392048497671309, + "learning_rate": 9.59604500658357e-06, + "loss": 0.6774, + "step": 9510 + }, + { + "epoch": 0.6218158066623122, + "grad_norm": 5.178076400022287, + "learning_rate": 9.56705192247226e-06, + "loss": 0.7251, + "step": 9520 + }, + { + "epoch": 0.6224689745264533, + "grad_norm": 2.5070876740750947, + "learning_rate": 9.53808218185981e-06, + "loss": 0.6909, + "step": 9530 + }, + { + "epoch": 0.6231221423905944, + "grad_norm": 4.151677190296909, + "learning_rate": 9.509135909219178e-06, + "loss": 0.6889, + "step": 9540 + }, + { + "epoch": 0.6237753102547354, + "grad_norm": 3.542263040633155, + "learning_rate": 9.48021322892249e-06, + "loss": 0.6894, + "step": 9550 + }, + { + "epoch": 0.6244284781188766, + "grad_norm": 6.583784192577517, + "learning_rate": 9.451314265240489e-06, + "loss": 0.6659, + "step": 9560 + }, + { + "epoch": 0.6250816459830176, + "grad_norm": 5.24457887775372, + "learning_rate": 9.422439142342035e-06, + "loss": 0.6829, + "step": 9570 + }, + { + "epoch": 0.6257348138471587, + "grad_norm": 7.216483152433425, + "learning_rate": 9.393587984293546e-06, + "loss": 0.7424, + "step": 9580 + }, + { + "epoch": 0.6263879817112998, + "grad_norm": 5.40830328432894, + "learning_rate": 9.36476091505846e-06, + "loss": 0.7183, + "step": 9590 + }, + { + "epoch": 0.6270411495754409, + "grad_norm": 2.612648013361852, + "learning_rate": 9.335958058496734e-06, + "loss": 0.6786, + "step": 9600 + }, + { + "epoch": 0.6276943174395819, + "grad_norm": 3.9662598735254826, + "learning_rate": 9.307179538364274e-06, + "loss": 0.7128, + "step": 9610 + }, + { + "epoch": 0.6283474853037231, + "grad_norm": 5.322804409368759, + "learning_rate": 9.278425478312437e-06, + "loss": 0.6438, + "step": 9620 + }, + { + "epoch": 0.6290006531678641, + "grad_norm": 4.24650699461021, + "learning_rate": 9.249696001887462e-06, + "loss": 0.7191, + "step": 9630 + }, + { + "epoch": 0.6296538210320052, + "grad_norm": 4.311181129354523, + "learning_rate": 9.220991232529977e-06, + "loss": 0.6694, + "step": 9640 + }, + { + "epoch": 0.6303069888961463, + "grad_norm": 2.438796652186847, + "learning_rate": 9.192311293574452e-06, + "loss": 0.6893, + "step": 9650 + }, + { + "epoch": 0.6309601567602874, + "grad_norm": 2.3407674504161, + "learning_rate": 9.163656308248666e-06, + "loss": 0.6814, + "step": 9660 + }, + { + "epoch": 0.6316133246244284, + "grad_norm": 5.193439989443432, + "learning_rate": 9.135026399673175e-06, + "loss": 0.7126, + "step": 9670 + }, + { + "epoch": 0.6322664924885696, + "grad_norm": 11.270517585736275, + "learning_rate": 9.106421690860796e-06, + "loss": 0.7057, + "step": 9680 + }, + { + "epoch": 0.6329196603527106, + "grad_norm": 5.1149136915038165, + "learning_rate": 9.077842304716069e-06, + "loss": 0.7197, + "step": 9690 + }, + { + "epoch": 0.6335728282168517, + "grad_norm": 4.952294505678321, + "learning_rate": 9.049288364034742e-06, + "loss": 0.6765, + "step": 9700 + }, + { + "epoch": 0.6342259960809928, + "grad_norm": 7.154019337112617, + "learning_rate": 9.020759991503207e-06, + "loss": 0.726, + "step": 9710 + }, + { + "epoch": 0.6348791639451339, + "grad_norm": 16.238808871780552, + "learning_rate": 8.99225730969802e-06, + "loss": 0.7069, + "step": 9720 + }, + { + "epoch": 0.6355323318092749, + "grad_norm": 2.991481926898025, + "learning_rate": 8.963780441085347e-06, + "loss": 0.6926, + "step": 9730 + }, + { + "epoch": 0.6361854996734161, + "grad_norm": 3.3243848532788682, + "learning_rate": 8.935329508020446e-06, + "loss": 0.7167, + "step": 9740 + }, + { + "epoch": 0.6368386675375571, + "grad_norm": 4.070596461723911, + "learning_rate": 8.906904632747137e-06, + "loss": 0.7356, + "step": 9750 + }, + { + "epoch": 0.6374918354016982, + "grad_norm": 3.7465849267000575, + "learning_rate": 8.878505937397272e-06, + "loss": 0.6961, + "step": 9760 + }, + { + "epoch": 0.6381450032658393, + "grad_norm": 3.790695499226875, + "learning_rate": 8.850133543990228e-06, + "loss": 0.6806, + "step": 9770 + }, + { + "epoch": 0.6387981711299804, + "grad_norm": 7.764921037477569, + "learning_rate": 8.821787574432371e-06, + "loss": 0.6691, + "step": 9780 + }, + { + "epoch": 0.6394513389941215, + "grad_norm": 5.566876179450992, + "learning_rate": 8.793468150516517e-06, + "loss": 0.7063, + "step": 9790 + }, + { + "epoch": 0.6401045068582626, + "grad_norm": 3.4053216071478345, + "learning_rate": 8.765175393921441e-06, + "loss": 0.6836, + "step": 9800 + }, + { + "epoch": 0.6407576747224036, + "grad_norm": 5.502089363662013, + "learning_rate": 8.736909426211335e-06, + "loss": 0.6388, + "step": 9810 + }, + { + "epoch": 0.6414108425865448, + "grad_norm": 2.7638694234276517, + "learning_rate": 8.708670368835286e-06, + "loss": 0.6598, + "step": 9820 + }, + { + "epoch": 0.6420640104506858, + "grad_norm": 12.233396527562816, + "learning_rate": 8.680458343126753e-06, + "loss": 0.7278, + "step": 9830 + }, + { + "epoch": 0.6427171783148269, + "grad_norm": 4.917255280169664, + "learning_rate": 8.65227347030306e-06, + "loss": 0.6496, + "step": 9840 + }, + { + "epoch": 0.643370346178968, + "grad_norm": 4.7377241016215725, + "learning_rate": 8.624115871464852e-06, + "loss": 0.6733, + "step": 9850 + }, + { + "epoch": 0.6440235140431091, + "grad_norm": 2.7426327074990207, + "learning_rate": 8.595985667595596e-06, + "loss": 0.7166, + "step": 9860 + }, + { + "epoch": 0.6446766819072501, + "grad_norm": 73.36784262898324, + "learning_rate": 8.56788297956104e-06, + "loss": 0.7123, + "step": 9870 + }, + { + "epoch": 0.6453298497713913, + "grad_norm": 3.134712080929007, + "learning_rate": 8.539807928108728e-06, + "loss": 0.6646, + "step": 9880 + }, + { + "epoch": 0.6459830176355323, + "grad_norm": 8.323293003427095, + "learning_rate": 8.511760633867436e-06, + "loss": 0.649, + "step": 9890 + }, + { + "epoch": 0.6466361854996734, + "grad_norm": 7.07615085043799, + "learning_rate": 8.483741217346696e-06, + "loss": 0.6633, + "step": 9900 + }, + { + "epoch": 0.6472893533638145, + "grad_norm": 7.961109043478078, + "learning_rate": 8.455749798936245e-06, + "loss": 0.7161, + "step": 9910 + }, + { + "epoch": 0.6479425212279556, + "grad_norm": 2.4133702739326104, + "learning_rate": 8.42778649890552e-06, + "loss": 0.665, + "step": 9920 + }, + { + "epoch": 0.6485956890920966, + "grad_norm": 7.42532223602706, + "learning_rate": 8.399851437403172e-06, + "loss": 0.6916, + "step": 9930 + }, + { + "epoch": 0.6492488569562378, + "grad_norm": 3.5245869658336026, + "learning_rate": 8.371944734456469e-06, + "loss": 0.7338, + "step": 9940 + }, + { + "epoch": 0.6499020248203788, + "grad_norm": 5.328557956142334, + "learning_rate": 8.344066509970884e-06, + "loss": 0.6911, + "step": 9950 + }, + { + "epoch": 0.6505551926845199, + "grad_norm": 7.668967882961343, + "learning_rate": 8.316216883729493e-06, + "loss": 0.6889, + "step": 9960 + }, + { + "epoch": 0.651208360548661, + "grad_norm": 8.063885237104227, + "learning_rate": 8.288395975392515e-06, + "loss": 0.7142, + "step": 9970 + }, + { + "epoch": 0.6518615284128021, + "grad_norm": 10.741056048367856, + "learning_rate": 8.260603904496769e-06, + "loss": 0.7316, + "step": 9980 + }, + { + "epoch": 0.6525146962769431, + "grad_norm": 4.3002787477499185, + "learning_rate": 8.232840790455173e-06, + "loss": 0.6964, + "step": 9990 + }, + { + "epoch": 0.6531678641410843, + "grad_norm": 4.695114258502196, + "learning_rate": 8.205106752556227e-06, + "loss": 0.7238, + "step": 10000 + }, + { + "epoch": 0.6538210320052253, + "grad_norm": 11.537353885010582, + "learning_rate": 8.177401909963496e-06, + "loss": 0.7204, + "step": 10010 + }, + { + "epoch": 0.6544741998693664, + "grad_norm": 13.72025313430244, + "learning_rate": 8.149726381715108e-06, + "loss": 0.7004, + "step": 10020 + }, + { + "epoch": 0.6551273677335075, + "grad_norm": 4.275313668148756, + "learning_rate": 8.122080286723233e-06, + "loss": 0.6829, + "step": 10030 + }, + { + "epoch": 0.6557805355976486, + "grad_norm": 2.261053906695268, + "learning_rate": 8.094463743773587e-06, + "loss": 0.7081, + "step": 10040 + }, + { + "epoch": 0.6564337034617896, + "grad_norm": 15.40058816554506, + "learning_rate": 8.066876871524893e-06, + "loss": 0.7099, + "step": 10050 + }, + { + "epoch": 0.6570868713259308, + "grad_norm": 9.434056746531544, + "learning_rate": 8.039319788508413e-06, + "loss": 0.6964, + "step": 10060 + }, + { + "epoch": 0.6577400391900718, + "grad_norm": 4.916230564927696, + "learning_rate": 8.011792613127389e-06, + "loss": 0.6974, + "step": 10070 + }, + { + "epoch": 0.6583932070542129, + "grad_norm": 3.415752629777518, + "learning_rate": 7.984295463656591e-06, + "loss": 0.6811, + "step": 10080 + }, + { + "epoch": 0.659046374918354, + "grad_norm": 3.159574664982398, + "learning_rate": 7.956828458241738e-06, + "loss": 0.7241, + "step": 10090 + }, + { + "epoch": 0.6596995427824951, + "grad_norm": 7.408267490822411, + "learning_rate": 7.929391714899066e-06, + "loss": 0.6528, + "step": 10100 + }, + { + "epoch": 0.6603527106466361, + "grad_norm": 8.520105705500969, + "learning_rate": 7.901985351514772e-06, + "loss": 0.7304, + "step": 10110 + }, + { + "epoch": 0.6610058785107773, + "grad_norm": 4.930976711189574, + "learning_rate": 7.874609485844513e-06, + "loss": 0.6879, + "step": 10120 + }, + { + "epoch": 0.6616590463749183, + "grad_norm": 4.4618731317544285, + "learning_rate": 7.847264235512924e-06, + "loss": 0.6824, + "step": 10130 + }, + { + "epoch": 0.6623122142390594, + "grad_norm": 5.219787677232259, + "learning_rate": 7.819949718013077e-06, + "loss": 0.7147, + "step": 10140 + }, + { + "epoch": 0.6629653821032006, + "grad_norm": 3.99345983845127, + "learning_rate": 7.792666050706023e-06, + "loss": 0.6897, + "step": 10150 + }, + { + "epoch": 0.6636185499673416, + "grad_norm": 5.542787577146197, + "learning_rate": 7.765413350820236e-06, + "loss": 0.6884, + "step": 10160 + }, + { + "epoch": 0.6642717178314826, + "grad_norm": 30.100215102532907, + "learning_rate": 7.73819173545114e-06, + "loss": 0.67, + "step": 10170 + }, + { + "epoch": 0.6649248856956238, + "grad_norm": 4.793597664333635, + "learning_rate": 7.711001321560596e-06, + "loss": 0.685, + "step": 10180 + }, + { + "epoch": 0.6655780535597648, + "grad_norm": 8.678629566067418, + "learning_rate": 7.683842225976423e-06, + "loss": 0.6828, + "step": 10190 + }, + { + "epoch": 0.6662312214239059, + "grad_norm": 3.6349803731984025, + "learning_rate": 7.656714565391852e-06, + "loss": 0.6821, + "step": 10200 + }, + { + "epoch": 0.666884389288047, + "grad_norm": 8.673927297972229, + "learning_rate": 7.629618456365055e-06, + "loss": 0.7103, + "step": 10210 + }, + { + "epoch": 0.6675375571521881, + "grad_norm": 8.057835375428068, + "learning_rate": 7.60255401531865e-06, + "loss": 0.6935, + "step": 10220 + }, + { + "epoch": 0.6681907250163291, + "grad_norm": 9.950279991005985, + "learning_rate": 7.5755213585391775e-06, + "loss": 0.6699, + "step": 10230 + }, + { + "epoch": 0.6688438928804703, + "grad_norm": 6.578518164839038, + "learning_rate": 7.548520602176613e-06, + "loss": 0.6962, + "step": 10240 + }, + { + "epoch": 0.6694970607446113, + "grad_norm": 3.6595824615281742, + "learning_rate": 7.521551862243861e-06, + "loss": 0.6775, + "step": 10250 + }, + { + "epoch": 0.6701502286087524, + "grad_norm": 6.542277248906721, + "learning_rate": 7.4946152546162815e-06, + "loss": 0.7179, + "step": 10260 + }, + { + "epoch": 0.6708033964728936, + "grad_norm": 3.1715830400767593, + "learning_rate": 7.46771089503115e-06, + "loss": 0.6577, + "step": 10270 + }, + { + "epoch": 0.6714565643370346, + "grad_norm": 9.166993800019167, + "learning_rate": 7.4408388990872086e-06, + "loss": 0.7217, + "step": 10280 + }, + { + "epoch": 0.6721097322011756, + "grad_norm": 3.1457370128580266, + "learning_rate": 7.41399938224412e-06, + "loss": 0.7238, + "step": 10290 + }, + { + "epoch": 0.6727629000653168, + "grad_norm": 5.087128249324293, + "learning_rate": 7.387192459822002e-06, + "loss": 0.6834, + "step": 10300 + }, + { + "epoch": 0.6734160679294579, + "grad_norm": 3.3234894038686136, + "learning_rate": 7.360418247000945e-06, + "loss": 0.6807, + "step": 10310 + }, + { + "epoch": 0.674069235793599, + "grad_norm": 3.0582560502629152, + "learning_rate": 7.333676858820461e-06, + "loss": 0.721, + "step": 10320 + }, + { + "epoch": 0.67472240365774, + "grad_norm": 2.7130806226776465, + "learning_rate": 7.3069684101790594e-06, + "loss": 0.6809, + "step": 10330 + }, + { + "epoch": 0.6753755715218811, + "grad_norm": 3.7810845457086617, + "learning_rate": 7.2802930158336974e-06, + "loss": 0.7271, + "step": 10340 + }, + { + "epoch": 0.6760287393860223, + "grad_norm": 5.206439454660348, + "learning_rate": 7.253650790399333e-06, + "loss": 0.6402, + "step": 10350 + }, + { + "epoch": 0.6766819072501633, + "grad_norm": 10.397540139012824, + "learning_rate": 7.2270418483483785e-06, + "loss": 0.6979, + "step": 10360 + }, + { + "epoch": 0.6773350751143044, + "grad_norm": 14.072917690695897, + "learning_rate": 7.2004663040102666e-06, + "loss": 0.7332, + "step": 10370 + }, + { + "epoch": 0.6779882429784455, + "grad_norm": 20.837793142434133, + "learning_rate": 7.173924271570917e-06, + "loss": 0.7167, + "step": 10380 + }, + { + "epoch": 0.6786414108425866, + "grad_norm": 2.95535033006669, + "learning_rate": 7.147415865072263e-06, + "loss": 0.691, + "step": 10390 + }, + { + "epoch": 0.6792945787067276, + "grad_norm": 5.20187262048636, + "learning_rate": 7.120941198411757e-06, + "loss": 0.7248, + "step": 10400 + }, + { + "epoch": 0.6799477465708688, + "grad_norm": 14.561008884563188, + "learning_rate": 7.094500385341882e-06, + "loss": 0.7156, + "step": 10410 + }, + { + "epoch": 0.6806009144350098, + "grad_norm": 6.659356337135833, + "learning_rate": 7.068093539469674e-06, + "loss": 0.68, + "step": 10420 + }, + { + "epoch": 0.6812540822991509, + "grad_norm": 7.950217324411396, + "learning_rate": 7.0417207742562106e-06, + "loss": 0.6544, + "step": 10430 + }, + { + "epoch": 0.681907250163292, + "grad_norm": 2.5071824848115853, + "learning_rate": 7.015382203016151e-06, + "loss": 0.7305, + "step": 10440 + }, + { + "epoch": 0.6825604180274331, + "grad_norm": 6.4374528901681884, + "learning_rate": 6.989077938917218e-06, + "loss": 0.7117, + "step": 10450 + }, + { + "epoch": 0.6832135858915741, + "grad_norm": 3.7611806134016508, + "learning_rate": 6.96280809497975e-06, + "loss": 0.6836, + "step": 10460 + }, + { + "epoch": 0.6838667537557153, + "grad_norm": 5.700978499632452, + "learning_rate": 6.93657278407616e-06, + "loss": 0.6625, + "step": 10470 + }, + { + "epoch": 0.6845199216198563, + "grad_norm": 2.581514477053975, + "learning_rate": 6.910372118930523e-06, + "loss": 0.6672, + "step": 10480 + }, + { + "epoch": 0.6851730894839974, + "grad_norm": 12.617504349736718, + "learning_rate": 6.8842062121180274e-06, + "loss": 0.6884, + "step": 10490 + }, + { + "epoch": 0.6858262573481385, + "grad_norm": 2.5495646497807947, + "learning_rate": 6.858075176064523e-06, + "loss": 0.7237, + "step": 10500 + }, + { + "epoch": 0.6864794252122796, + "grad_norm": 9.075860218651238, + "learning_rate": 6.831979123046042e-06, + "loss": 0.6929, + "step": 10510 + }, + { + "epoch": 0.6871325930764206, + "grad_norm": 11.79800801678243, + "learning_rate": 6.805918165188288e-06, + "loss": 0.6983, + "step": 10520 + }, + { + "epoch": 0.6877857609405618, + "grad_norm": 16.602144291276336, + "learning_rate": 6.779892414466196e-06, + "loss": 0.7003, + "step": 10530 + }, + { + "epoch": 0.6884389288047028, + "grad_norm": 16.30785786139135, + "learning_rate": 6.75390198270341e-06, + "loss": 0.7112, + "step": 10540 + }, + { + "epoch": 0.6890920966688439, + "grad_norm": 2.458437276248381, + "learning_rate": 6.727946981571826e-06, + "loss": 0.7271, + "step": 10550 + }, + { + "epoch": 0.689745264532985, + "grad_norm": 5.955957574861067, + "learning_rate": 6.702027522591101e-06, + "loss": 0.6631, + "step": 10560 + }, + { + "epoch": 0.6903984323971261, + "grad_norm": 2.8278119667432966, + "learning_rate": 6.676143717128197e-06, + "loss": 0.6936, + "step": 10570 + }, + { + "epoch": 0.6910516002612671, + "grad_norm": 4.3483105327397835, + "learning_rate": 6.65029567639687e-06, + "loss": 0.6828, + "step": 10580 + }, + { + "epoch": 0.6917047681254083, + "grad_norm": 3.947818013403805, + "learning_rate": 6.624483511457204e-06, + "loss": 0.712, + "step": 10590 + }, + { + "epoch": 0.6923579359895493, + "grad_norm": 3.374901805438211, + "learning_rate": 6.598707333215154e-06, + "loss": 0.7021, + "step": 10600 + }, + { + "epoch": 0.6930111038536904, + "grad_norm": 6.057687009033569, + "learning_rate": 6.5729672524220365e-06, + "loss": 0.6851, + "step": 10610 + }, + { + "epoch": 0.6936642717178315, + "grad_norm": 3.843916602517143, + "learning_rate": 6.5472633796740885e-06, + "loss": 0.6767, + "step": 10620 + }, + { + "epoch": 0.6943174395819726, + "grad_norm": 4.808172683191776, + "learning_rate": 6.521595825411942e-06, + "loss": 0.7512, + "step": 10630 + }, + { + "epoch": 0.6949706074461136, + "grad_norm": 2.3894514806323017, + "learning_rate": 6.495964699920215e-06, + "loss": 0.674, + "step": 10640 + }, + { + "epoch": 0.6956237753102548, + "grad_norm": 4.533425486319881, + "learning_rate": 6.4703701133269795e-06, + "loss": 0.7744, + "step": 10650 + }, + { + "epoch": 0.6962769431743958, + "grad_norm": 4.777684895896499, + "learning_rate": 6.444812175603333e-06, + "loss": 0.6787, + "step": 10660 + }, + { + "epoch": 0.6969301110385369, + "grad_norm": 6.419177600320241, + "learning_rate": 6.419290996562885e-06, + "loss": 0.6809, + "step": 10670 + }, + { + "epoch": 0.697583278902678, + "grad_norm": 7.25828815129346, + "learning_rate": 6.393806685861316e-06, + "loss": 0.7134, + "step": 10680 + }, + { + "epoch": 0.6982364467668191, + "grad_norm": 4.27661929331523, + "learning_rate": 6.368359352995906e-06, + "loss": 0.7359, + "step": 10690 + }, + { + "epoch": 0.6988896146309601, + "grad_norm": 3.8979699842570548, + "learning_rate": 6.342949107305026e-06, + "loss": 0.6823, + "step": 10700 + }, + { + "epoch": 0.6995427824951013, + "grad_norm": 3.649185659413299, + "learning_rate": 6.317576057967728e-06, + "loss": 0.6787, + "step": 10710 + }, + { + "epoch": 0.7001959503592423, + "grad_norm": 10.476570510576728, + "learning_rate": 6.292240314003217e-06, + "loss": 0.6764, + "step": 10720 + }, + { + "epoch": 0.7008491182233834, + "grad_norm": 20.378338312121556, + "learning_rate": 6.266941984270434e-06, + "loss": 0.6936, + "step": 10730 + }, + { + "epoch": 0.7015022860875245, + "grad_norm": 4.145143282154411, + "learning_rate": 6.241681177467542e-06, + "loss": 0.648, + "step": 10740 + }, + { + "epoch": 0.7021554539516656, + "grad_norm": 4.968717562971461, + "learning_rate": 6.216458002131502e-06, + "loss": 0.6815, + "step": 10750 + }, + { + "epoch": 0.7028086218158066, + "grad_norm": 3.49150362081721, + "learning_rate": 6.1912725666375695e-06, + "loss": 0.6583, + "step": 10760 + }, + { + "epoch": 0.7034617896799478, + "grad_norm": 3.0513662248766322, + "learning_rate": 6.166124979198849e-06, + "loss": 0.6995, + "step": 10770 + }, + { + "epoch": 0.7041149575440888, + "grad_norm": 5.397307127156423, + "learning_rate": 6.141015347865828e-06, + "loss": 0.6558, + "step": 10780 + }, + { + "epoch": 0.7047681254082299, + "grad_norm": 3.1775879172611146, + "learning_rate": 6.1159437805259e-06, + "loss": 0.7086, + "step": 10790 + }, + { + "epoch": 0.705421293272371, + "grad_norm": 6.070511077028698, + "learning_rate": 6.090910384902932e-06, + "loss": 0.7152, + "step": 10800 + }, + { + "epoch": 0.7060744611365121, + "grad_norm": 11.451698703609093, + "learning_rate": 6.065915268556756e-06, + "loss": 0.6721, + "step": 10810 + }, + { + "epoch": 0.7067276290006531, + "grad_norm": 4.040931608599787, + "learning_rate": 6.040958538882752e-06, + "loss": 0.6974, + "step": 10820 + }, + { + "epoch": 0.7073807968647943, + "grad_norm": 8.50160291928914, + "learning_rate": 6.016040303111346e-06, + "loss": 0.6284, + "step": 10830 + }, + { + "epoch": 0.7080339647289353, + "grad_norm": 9.50722108755325, + "learning_rate": 5.991160668307587e-06, + "loss": 0.6882, + "step": 10840 + }, + { + "epoch": 0.7086871325930765, + "grad_norm": 7.7646464109268205, + "learning_rate": 5.966319741370658e-06, + "loss": 0.7234, + "step": 10850 + }, + { + "epoch": 0.7093403004572175, + "grad_norm": 5.587117786690515, + "learning_rate": 5.941517629033432e-06, + "loss": 0.6888, + "step": 10860 + }, + { + "epoch": 0.7099934683213586, + "grad_norm": 4.801651654593505, + "learning_rate": 5.916754437862004e-06, + "loss": 0.7223, + "step": 10870 + }, + { + "epoch": 0.7106466361854997, + "grad_norm": 10.77526477037024, + "learning_rate": 5.89203027425524e-06, + "loss": 0.667, + "step": 10880 + }, + { + "epoch": 0.7112998040496408, + "grad_norm": 4.811645630139854, + "learning_rate": 5.867345244444328e-06, + "loss": 0.6292, + "step": 10890 + }, + { + "epoch": 0.7119529719137818, + "grad_norm": 3.9493363511020103, + "learning_rate": 5.8426994544922955e-06, + "loss": 0.7109, + "step": 10900 + }, + { + "epoch": 0.712606139777923, + "grad_norm": 4.308805569433444, + "learning_rate": 5.818093010293586e-06, + "loss": 0.722, + "step": 10910 + }, + { + "epoch": 0.713259307642064, + "grad_norm": 3.8841534453939537, + "learning_rate": 5.793526017573577e-06, + "loss": 0.7473, + "step": 10920 + }, + { + "epoch": 0.7139124755062051, + "grad_norm": 2.9837089714102847, + "learning_rate": 5.768998581888138e-06, + "loss": 0.6982, + "step": 10930 + }, + { + "epoch": 0.7145656433703462, + "grad_norm": 2.9982269133645367, + "learning_rate": 5.7445108086231715e-06, + "loss": 0.6836, + "step": 10940 + }, + { + "epoch": 0.7152188112344873, + "grad_norm": 4.547014476826066, + "learning_rate": 5.720062802994181e-06, + "loss": 0.6605, + "step": 10950 + }, + { + "epoch": 0.7158719790986283, + "grad_norm": 2.1769079718774704, + "learning_rate": 5.6956546700457885e-06, + "loss": 0.6883, + "step": 10960 + }, + { + "epoch": 0.7165251469627695, + "grad_norm": 5.03574147163945, + "learning_rate": 5.671286514651289e-06, + "loss": 0.7189, + "step": 10970 + }, + { + "epoch": 0.7171783148269105, + "grad_norm": 4.862398145535597, + "learning_rate": 5.646958441512234e-06, + "loss": 0.7095, + "step": 10980 + }, + { + "epoch": 0.7178314826910516, + "grad_norm": 15.308086895953474, + "learning_rate": 5.622670555157924e-06, + "loss": 0.6784, + "step": 10990 + }, + { + "epoch": 0.7184846505551927, + "grad_norm": 9.436744508124669, + "learning_rate": 5.5984229599450275e-06, + "loss": 0.709, + "step": 11000 + }, + { + "epoch": 0.7191378184193338, + "grad_norm": 2.532456946200012, + "learning_rate": 5.57421576005705e-06, + "loss": 0.7034, + "step": 11010 + }, + { + "epoch": 0.7197909862834748, + "grad_norm": 5.041112107392264, + "learning_rate": 5.550049059503976e-06, + "loss": 0.711, + "step": 11020 + }, + { + "epoch": 0.720444154147616, + "grad_norm": 5.007525312309261, + "learning_rate": 5.525922962121746e-06, + "loss": 0.6772, + "step": 11030 + }, + { + "epoch": 0.721097322011757, + "grad_norm": 3.8826844100241056, + "learning_rate": 5.50183757157187e-06, + "loss": 0.6671, + "step": 11040 + }, + { + "epoch": 0.7217504898758981, + "grad_norm": 3.8447345000076814, + "learning_rate": 5.477792991340932e-06, + "loss": 0.7535, + "step": 11050 + }, + { + "epoch": 0.7224036577400392, + "grad_norm": 6.033444336490015, + "learning_rate": 5.453789324740175e-06, + "loss": 0.7139, + "step": 11060 + }, + { + "epoch": 0.7230568256041803, + "grad_norm": 4.553578681833169, + "learning_rate": 5.4298266749050616e-06, + "loss": 0.6643, + "step": 11070 + }, + { + "epoch": 0.7237099934683213, + "grad_norm": 7.790695891534031, + "learning_rate": 5.405905144794807e-06, + "loss": 0.7111, + "step": 11080 + }, + { + "epoch": 0.7243631613324625, + "grad_norm": 22.8654485053664, + "learning_rate": 5.38202483719195e-06, + "loss": 0.6625, + "step": 11090 + }, + { + "epoch": 0.7250163291966035, + "grad_norm": 7.7791755086408925, + "learning_rate": 5.3581858547019095e-06, + "loss": 0.7008, + "step": 11100 + }, + { + "epoch": 0.7256694970607446, + "grad_norm": 3.286051116880111, + "learning_rate": 5.334388299752559e-06, + "loss": 0.6656, + "step": 11110 + }, + { + "epoch": 0.7263226649248857, + "grad_norm": 3.587098841551492, + "learning_rate": 5.310632274593751e-06, + "loss": 0.6798, + "step": 11120 + }, + { + "epoch": 0.7269758327890268, + "grad_norm": 7.054947046720567, + "learning_rate": 5.286917881296918e-06, + "loss": 0.7412, + "step": 11130 + }, + { + "epoch": 0.7276290006531678, + "grad_norm": 14.569440636615642, + "learning_rate": 5.263245221754604e-06, + "loss": 0.6485, + "step": 11140 + }, + { + "epoch": 0.728282168517309, + "grad_norm": 4.516233832158538, + "learning_rate": 5.239614397680038e-06, + "loss": 0.6619, + "step": 11150 + }, + { + "epoch": 0.72893533638145, + "grad_norm": 4.493184756832641, + "learning_rate": 5.216025510606698e-06, + "loss": 0.6696, + "step": 11160 + }, + { + "epoch": 0.7295885042455911, + "grad_norm": 5.640137712136578, + "learning_rate": 5.192478661887869e-06, + "loss": 0.681, + "step": 11170 + }, + { + "epoch": 0.7302416721097322, + "grad_norm": 6.6556384741462, + "learning_rate": 5.168973952696225e-06, + "loss": 0.7231, + "step": 11180 + }, + { + "epoch": 0.7308948399738733, + "grad_norm": 7.515936640823098, + "learning_rate": 5.1455114840233636e-06, + "loss": 0.673, + "step": 11190 + }, + { + "epoch": 0.7315480078380143, + "grad_norm": 3.5093368242859024, + "learning_rate": 5.122091356679405e-06, + "loss": 0.7244, + "step": 11200 + }, + { + "epoch": 0.7322011757021555, + "grad_norm": 15.219204185021983, + "learning_rate": 5.098713671292531e-06, + "loss": 0.6844, + "step": 11210 + }, + { + "epoch": 0.7328543435662965, + "grad_norm": 4.243075170782062, + "learning_rate": 5.075378528308577e-06, + "loss": 0.6838, + "step": 11220 + }, + { + "epoch": 0.7335075114304376, + "grad_norm": 5.773093578537657, + "learning_rate": 5.052086027990578e-06, + "loss": 0.6706, + "step": 11230 + }, + { + "epoch": 0.7341606792945787, + "grad_norm": 2.574491124656257, + "learning_rate": 5.028836270418352e-06, + "loss": 0.7104, + "step": 11240 + }, + { + "epoch": 0.7348138471587198, + "grad_norm": 3.6829042406272094, + "learning_rate": 5.005629355488066e-06, + "loss": 0.6982, + "step": 11250 + }, + { + "epoch": 0.7354670150228608, + "grad_norm": 8.464757675064218, + "learning_rate": 4.9824653829118015e-06, + "loss": 0.7169, + "step": 11260 + }, + { + "epoch": 0.736120182887002, + "grad_norm": 8.172113535446657, + "learning_rate": 4.959344452217148e-06, + "loss": 0.7048, + "step": 11270 + }, + { + "epoch": 0.736773350751143, + "grad_norm": 5.29669855324662, + "learning_rate": 4.936266662746737e-06, + "loss": 0.7039, + "step": 11280 + }, + { + "epoch": 0.7374265186152841, + "grad_norm": 3.060578035099832, + "learning_rate": 4.91323211365786e-06, + "loss": 0.6704, + "step": 11290 + }, + { + "epoch": 0.7380796864794252, + "grad_norm": 4.77220405088633, + "learning_rate": 4.890240903922002e-06, + "loss": 0.6563, + "step": 11300 + }, + { + "epoch": 0.7387328543435663, + "grad_norm": 5.885766735187112, + "learning_rate": 4.867293132324439e-06, + "loss": 0.6889, + "step": 11310 + }, + { + "epoch": 0.7393860222077073, + "grad_norm": 4.012360786090751, + "learning_rate": 4.8443888974638035e-06, + "loss": 0.6678, + "step": 11320 + }, + { + "epoch": 0.7400391900718485, + "grad_norm": 7.5356316067983915, + "learning_rate": 4.821528297751682e-06, + "loss": 0.7078, + "step": 11330 + }, + { + "epoch": 0.7406923579359895, + "grad_norm": 6.360271118911102, + "learning_rate": 4.798711431412161e-06, + "loss": 0.7228, + "step": 11340 + }, + { + "epoch": 0.7413455258001306, + "grad_norm": 7.053731961346386, + "learning_rate": 4.775938396481417e-06, + "loss": 0.6698, + "step": 11350 + }, + { + "epoch": 0.7419986936642717, + "grad_norm": 3.6864863150034894, + "learning_rate": 4.753209290807314e-06, + "loss": 0.6992, + "step": 11360 + }, + { + "epoch": 0.7426518615284128, + "grad_norm": 5.584661164553851, + "learning_rate": 4.730524212048951e-06, + "loss": 0.7142, + "step": 11370 + }, + { + "epoch": 0.7433050293925539, + "grad_norm": 3.6761825942391004, + "learning_rate": 4.7078832576762796e-06, + "loss": 0.697, + "step": 11380 + }, + { + "epoch": 0.743958197256695, + "grad_norm": 6.215721121189116, + "learning_rate": 4.685286524969629e-06, + "loss": 0.7307, + "step": 11390 + }, + { + "epoch": 0.744611365120836, + "grad_norm": 2.1010462470722593, + "learning_rate": 4.66273411101936e-06, + "loss": 0.721, + "step": 11400 + }, + { + "epoch": 0.7452645329849772, + "grad_norm": 2.9120407659276193, + "learning_rate": 4.640226112725385e-06, + "loss": 0.7089, + "step": 11410 + }, + { + "epoch": 0.7459177008491182, + "grad_norm": 3.369323477827884, + "learning_rate": 4.6177626267967995e-06, + "loss": 0.6807, + "step": 11420 + }, + { + "epoch": 0.7465708687132593, + "grad_norm": 6.767124353473997, + "learning_rate": 4.595343749751426e-06, + "loss": 0.6976, + "step": 11430 + }, + { + "epoch": 0.7472240365774004, + "grad_norm": 8.934916516358554, + "learning_rate": 4.5729695779154226e-06, + "loss": 0.6557, + "step": 11440 + }, + { + "epoch": 0.7478772044415415, + "grad_norm": 5.360081573264151, + "learning_rate": 4.550640207422877e-06, + "loss": 0.6952, + "step": 11450 + }, + { + "epoch": 0.7485303723056825, + "grad_norm": 9.06782995876813, + "learning_rate": 4.528355734215366e-06, + "loss": 0.7008, + "step": 11460 + }, + { + "epoch": 0.7491835401698237, + "grad_norm": 4.1971231598815235, + "learning_rate": 4.506116254041564e-06, + "loss": 0.6627, + "step": 11470 + }, + { + "epoch": 0.7498367080339647, + "grad_norm": 17.670004667271773, + "learning_rate": 4.483921862456819e-06, + "loss": 0.6703, + "step": 11480 + }, + { + "epoch": 0.7504898758981058, + "grad_norm": 7.1153667981089255, + "learning_rate": 4.4617726548227675e-06, + "loss": 0.7008, + "step": 11490 + }, + { + "epoch": 0.7511430437622469, + "grad_norm": 6.198148561728875, + "learning_rate": 4.439668726306884e-06, + "loss": 0.6873, + "step": 11500 + }, + { + "epoch": 0.751796211626388, + "grad_norm": 4.730018751820983, + "learning_rate": 4.417610171882114e-06, + "loss": 0.6713, + "step": 11510 + }, + { + "epoch": 0.752449379490529, + "grad_norm": 11.858758681696122, + "learning_rate": 4.395597086326432e-06, + "loss": 0.7241, + "step": 11520 + }, + { + "epoch": 0.7531025473546702, + "grad_norm": 2.5604194009971657, + "learning_rate": 4.373629564222452e-06, + "loss": 0.7211, + "step": 11530 + }, + { + "epoch": 0.7537557152188112, + "grad_norm": 6.127721376563953, + "learning_rate": 4.351707699957022e-06, + "loss": 0.6998, + "step": 11540 + }, + { + "epoch": 0.7544088830829523, + "grad_norm": 4.5795583726944695, + "learning_rate": 4.329831587720802e-06, + "loss": 0.7306, + "step": 11550 + }, + { + "epoch": 0.7550620509470934, + "grad_norm": 5.700473953084509, + "learning_rate": 4.308001321507894e-06, + "loss": 0.7182, + "step": 11560 + }, + { + "epoch": 0.7557152188112345, + "grad_norm": 8.045066259856924, + "learning_rate": 4.2862169951153876e-06, + "loss": 0.6862, + "step": 11570 + }, + { + "epoch": 0.7563683866753755, + "grad_norm": 30.07752840317419, + "learning_rate": 4.264478702143012e-06, + "loss": 0.7133, + "step": 11580 + }, + { + "epoch": 0.7570215545395167, + "grad_norm": 9.266849327574121, + "learning_rate": 4.242786535992684e-06, + "loss": 0.7105, + "step": 11590 + }, + { + "epoch": 0.7576747224036577, + "grad_norm": 6.445159799327492, + "learning_rate": 4.221140589868147e-06, + "loss": 0.6981, + "step": 11600 + }, + { + "epoch": 0.7583278902677988, + "grad_norm": 6.142205187903812, + "learning_rate": 4.199540956774541e-06, + "loss": 0.6832, + "step": 11610 + }, + { + "epoch": 0.75898105813194, + "grad_norm": 4.052417391594583, + "learning_rate": 4.177987729518021e-06, + "loss": 0.6577, + "step": 11620 + }, + { + "epoch": 0.759634225996081, + "grad_norm": 6.323134425360704, + "learning_rate": 4.156481000705346e-06, + "loss": 0.6844, + "step": 11630 + }, + { + "epoch": 0.760287393860222, + "grad_norm": 3.8227515050705354, + "learning_rate": 4.135020862743491e-06, + "loss": 0.6884, + "step": 11640 + }, + { + "epoch": 0.7609405617243632, + "grad_norm": 6.410463017777649, + "learning_rate": 4.113607407839253e-06, + "loss": 0.6684, + "step": 11650 + }, + { + "epoch": 0.7615937295885042, + "grad_norm": 4.813746699567568, + "learning_rate": 4.0922407279988335e-06, + "loss": 0.6729, + "step": 11660 + }, + { + "epoch": 0.7622468974526453, + "grad_norm": 3.2257376071290094, + "learning_rate": 4.070920915027476e-06, + "loss": 0.6794, + "step": 11670 + }, + { + "epoch": 0.7629000653167864, + "grad_norm": 2.888089279221371, + "learning_rate": 4.049648060529033e-06, + "loss": 0.6889, + "step": 11680 + }, + { + "epoch": 0.7635532331809275, + "grad_norm": 5.197354252520824, + "learning_rate": 4.028422255905616e-06, + "loss": 0.7189, + "step": 11690 + }, + { + "epoch": 0.7642064010450685, + "grad_norm": 4.325704069656428, + "learning_rate": 4.007243592357151e-06, + "loss": 0.6961, + "step": 11700 + }, + { + "epoch": 0.7648595689092097, + "grad_norm": 2.574492698992469, + "learning_rate": 3.98611216088104e-06, + "loss": 0.7196, + "step": 11710 + }, + { + "epoch": 0.7655127367733507, + "grad_norm": 15.270702380581223, + "learning_rate": 3.965028052271734e-06, + "loss": 0.6795, + "step": 11720 + }, + { + "epoch": 0.7661659046374918, + "grad_norm": 4.859452519917749, + "learning_rate": 3.94399135712035e-06, + "loss": 0.6708, + "step": 11730 + }, + { + "epoch": 0.766819072501633, + "grad_norm": 6.767605181222972, + "learning_rate": 3.923002165814301e-06, + "loss": 0.6824, + "step": 11740 + }, + { + "epoch": 0.767472240365774, + "grad_norm": 26.047999750760614, + "learning_rate": 3.902060568536873e-06, + "loss": 0.6562, + "step": 11750 + }, + { + "epoch": 0.768125408229915, + "grad_norm": 5.14331132911694, + "learning_rate": 3.881166655266879e-06, + "loss": 0.7051, + "step": 11760 + }, + { + "epoch": 0.7687785760940562, + "grad_norm": 3.0823853728057458, + "learning_rate": 3.860320515778224e-06, + "loss": 0.6721, + "step": 11770 + }, + { + "epoch": 0.7694317439581972, + "grad_norm": 4.808685772880641, + "learning_rate": 3.8395222396395685e-06, + "loss": 0.6915, + "step": 11780 + }, + { + "epoch": 0.7700849118223383, + "grad_norm": 5.018712314639996, + "learning_rate": 3.818771916213906e-06, + "loss": 0.6796, + "step": 11790 + }, + { + "epoch": 0.7707380796864794, + "grad_norm": 3.4740197644766804, + "learning_rate": 3.798069634658208e-06, + "loss": 0.6998, + "step": 11800 + }, + { + "epoch": 0.7713912475506205, + "grad_norm": 12.853249556821103, + "learning_rate": 3.7774154839230135e-06, + "loss": 0.6859, + "step": 11810 + }, + { + "epoch": 0.7720444154147615, + "grad_norm": 5.520376088143, + "learning_rate": 3.756809552752059e-06, + "loss": 0.7069, + "step": 11820 + }, + { + "epoch": 0.7726975832789027, + "grad_norm": 7.54059220839262, + "learning_rate": 3.736251929681914e-06, + "loss": 0.6804, + "step": 11830 + }, + { + "epoch": 0.7733507511430437, + "grad_norm": 5.109724027045723, + "learning_rate": 3.7157427030415714e-06, + "loss": 0.6751, + "step": 11840 + }, + { + "epoch": 0.7740039190071848, + "grad_norm": 7.887641085709507, + "learning_rate": 3.6952819609520826e-06, + "loss": 0.716, + "step": 11850 + }, + { + "epoch": 0.774657086871326, + "grad_norm": 4.475583913204593, + "learning_rate": 3.674869791326179e-06, + "loss": 0.7377, + "step": 11860 + }, + { + "epoch": 0.775310254735467, + "grad_norm": 10.331787568014837, + "learning_rate": 3.654506281867898e-06, + "loss": 0.7288, + "step": 11870 + }, + { + "epoch": 0.7759634225996082, + "grad_norm": 7.437261240997636, + "learning_rate": 3.634191520072191e-06, + "loss": 0.7145, + "step": 11880 + }, + { + "epoch": 0.7766165904637492, + "grad_norm": 3.740876734779163, + "learning_rate": 3.6139255932245707e-06, + "loss": 0.6993, + "step": 11890 + }, + { + "epoch": 0.7772697583278902, + "grad_norm": 7.71460714629273, + "learning_rate": 3.593708588400714e-06, + "loss": 0.7172, + "step": 11900 + }, + { + "epoch": 0.7779229261920314, + "grad_norm": 5.226329667132641, + "learning_rate": 3.5735405924660914e-06, + "loss": 0.6739, + "step": 11910 + }, + { + "epoch": 0.7785760940561725, + "grad_norm": 6.898912769850468, + "learning_rate": 3.5534216920756185e-06, + "loss": 0.6955, + "step": 11920 + }, + { + "epoch": 0.7792292619203135, + "grad_norm": 3.142973519932529, + "learning_rate": 3.533351973673238e-06, + "loss": 0.7032, + "step": 11930 + }, + { + "epoch": 0.7798824297844547, + "grad_norm": 3.9251708509836387, + "learning_rate": 3.5133315234915984e-06, + "loss": 0.6849, + "step": 11940 + }, + { + "epoch": 0.7805355976485957, + "grad_norm": 9.332540569242788, + "learning_rate": 3.493360427551643e-06, + "loss": 0.7404, + "step": 11950 + }, + { + "epoch": 0.7811887655127367, + "grad_norm": 7.326016966754033, + "learning_rate": 3.4734387716622724e-06, + "loss": 0.6932, + "step": 11960 + }, + { + "epoch": 0.7818419333768779, + "grad_norm": 8.101112816744926, + "learning_rate": 3.453566641419942e-06, + "loss": 0.6652, + "step": 11970 + }, + { + "epoch": 0.782495101241019, + "grad_norm": 43.286231944771565, + "learning_rate": 3.4337441222083316e-06, + "loss": 0.6901, + "step": 11980 + }, + { + "epoch": 0.78314826910516, + "grad_norm": 4.646261266777329, + "learning_rate": 3.4139712991979487e-06, + "loss": 0.6991, + "step": 11990 + }, + { + "epoch": 0.7838014369693012, + "grad_norm": 4.87134473654209, + "learning_rate": 3.3942482573457716e-06, + "loss": 0.7127, + "step": 12000 + }, + { + "epoch": 0.7844546048334422, + "grad_norm": 2.988937963991205, + "learning_rate": 3.374575081394891e-06, + "loss": 0.7015, + "step": 12010 + }, + { + "epoch": 0.7851077726975833, + "grad_norm": 4.634966859174349, + "learning_rate": 3.354951855874136e-06, + "loss": 0.6939, + "step": 12020 + }, + { + "epoch": 0.7857609405617244, + "grad_norm": 4.259983454210986, + "learning_rate": 3.33537866509773e-06, + "loss": 0.6993, + "step": 12030 + }, + { + "epoch": 0.7864141084258655, + "grad_norm": 3.421594338251067, + "learning_rate": 3.3158555931648915e-06, + "loss": 0.691, + "step": 12040 + }, + { + "epoch": 0.7870672762900065, + "grad_norm": 4.756566377761871, + "learning_rate": 3.296382723959521e-06, + "loss": 0.6625, + "step": 12050 + }, + { + "epoch": 0.7877204441541477, + "grad_norm": 6.204055063009476, + "learning_rate": 3.2769601411497917e-06, + "loss": 0.6749, + "step": 12060 + }, + { + "epoch": 0.7883736120182887, + "grad_norm": 4.677416114890872, + "learning_rate": 3.2575879281878387e-06, + "loss": 0.7238, + "step": 12070 + }, + { + "epoch": 0.7890267798824298, + "grad_norm": 7.883616018032296, + "learning_rate": 3.238266168309341e-06, + "loss": 0.6868, + "step": 12080 + }, + { + "epoch": 0.7896799477465709, + "grad_norm": 4.685982827389975, + "learning_rate": 3.218994944533235e-06, + "loss": 0.7452, + "step": 12090 + }, + { + "epoch": 0.790333115610712, + "grad_norm": 7.214355980990887, + "learning_rate": 3.199774339661299e-06, + "loss": 0.6869, + "step": 12100 + }, + { + "epoch": 0.790986283474853, + "grad_norm": 4.079648255912239, + "learning_rate": 3.1806044362778184e-06, + "loss": 0.6876, + "step": 12110 + }, + { + "epoch": 0.7916394513389942, + "grad_norm": 3.734472169769081, + "learning_rate": 3.161485316749248e-06, + "loss": 0.6945, + "step": 12120 + }, + { + "epoch": 0.7922926192031352, + "grad_norm": 2.452217663175586, + "learning_rate": 3.142417063223822e-06, + "loss": 0.7043, + "step": 12130 + }, + { + "epoch": 0.7929457870672763, + "grad_norm": 3.530193609704299, + "learning_rate": 3.1233997576312453e-06, + "loss": 0.6959, + "step": 12140 + }, + { + "epoch": 0.7935989549314174, + "grad_norm": 2.5668358755276413, + "learning_rate": 3.1044334816822856e-06, + "loss": 0.7136, + "step": 12150 + }, + { + "epoch": 0.7942521227955585, + "grad_norm": 4.931477990820825, + "learning_rate": 3.085518316868482e-06, + "loss": 0.6985, + "step": 12160 + }, + { + "epoch": 0.7949052906596995, + "grad_norm": 8.704483227583205, + "learning_rate": 3.06665434446175e-06, + "loss": 0.6761, + "step": 12170 + }, + { + "epoch": 0.7955584585238407, + "grad_norm": 20.461859228952505, + "learning_rate": 3.04784164551406e-06, + "loss": 0.7015, + "step": 12180 + }, + { + "epoch": 0.7962116263879817, + "grad_norm": 3.6639577366669425, + "learning_rate": 3.0290803008570716e-06, + "loss": 0.7254, + "step": 12190 + }, + { + "epoch": 0.7968647942521228, + "grad_norm": 8.385447348465478, + "learning_rate": 3.010370391101788e-06, + "loss": 0.6877, + "step": 12200 + }, + { + "epoch": 0.7975179621162639, + "grad_norm": 92.3924258343859, + "learning_rate": 2.9917119966382296e-06, + "loss": 0.681, + "step": 12210 + }, + { + "epoch": 0.798171129980405, + "grad_norm": 6.476743615060319, + "learning_rate": 2.9731051976350605e-06, + "loss": 0.7358, + "step": 12220 + }, + { + "epoch": 0.798824297844546, + "grad_norm": 6.862513630059609, + "learning_rate": 2.954550074039258e-06, + "loss": 0.7084, + "step": 12230 + }, + { + "epoch": 0.7994774657086872, + "grad_norm": 5.881076997488694, + "learning_rate": 2.93604670557577e-06, + "loss": 0.7179, + "step": 12240 + }, + { + "epoch": 0.8001306335728282, + "grad_norm": 2.7363017242268954, + "learning_rate": 2.917595171747178e-06, + "loss": 0.6615, + "step": 12250 + }, + { + "epoch": 0.8007838014369693, + "grad_norm": 13.487462756624321, + "learning_rate": 2.8991955518333353e-06, + "loss": 0.6833, + "step": 12260 + }, + { + "epoch": 0.8014369693011104, + "grad_norm": 4.4447295706736645, + "learning_rate": 2.8808479248910484e-06, + "loss": 0.7112, + "step": 12270 + }, + { + "epoch": 0.8020901371652515, + "grad_norm": 6.063889806264584, + "learning_rate": 2.862552369753725e-06, + "loss": 0.6724, + "step": 12280 + }, + { + "epoch": 0.8027433050293925, + "grad_norm": 6.6405287791742165, + "learning_rate": 2.8443089650310313e-06, + "loss": 0.7205, + "step": 12290 + }, + { + "epoch": 0.8033964728935337, + "grad_norm": 3.9419553466853516, + "learning_rate": 2.8261177891085803e-06, + "loss": 0.6971, + "step": 12300 + }, + { + "epoch": 0.8040496407576747, + "grad_norm": 4.354713052784848, + "learning_rate": 2.807978920147547e-06, + "loss": 0.6968, + "step": 12310 + }, + { + "epoch": 0.8047028086218158, + "grad_norm": 13.069194629609031, + "learning_rate": 2.789892436084393e-06, + "loss": 0.6771, + "step": 12320 + }, + { + "epoch": 0.8053559764859569, + "grad_norm": 8.064868194770664, + "learning_rate": 2.7718584146304727e-06, + "loss": 0.6947, + "step": 12330 + }, + { + "epoch": 0.806009144350098, + "grad_norm": 2.3428135336163436, + "learning_rate": 2.7538769332717486e-06, + "loss": 0.6939, + "step": 12340 + }, + { + "epoch": 0.806662312214239, + "grad_norm": 7.1717066526912445, + "learning_rate": 2.73594806926842e-06, + "loss": 0.6727, + "step": 12350 + }, + { + "epoch": 0.8073154800783802, + "grad_norm": 7.3523299073095165, + "learning_rate": 2.7180718996546223e-06, + "loss": 0.6875, + "step": 12360 + }, + { + "epoch": 0.8079686479425212, + "grad_norm": 3.342750043282877, + "learning_rate": 2.700248501238068e-06, + "loss": 0.6709, + "step": 12370 + }, + { + "epoch": 0.8086218158066623, + "grad_norm": 2.8621407562447256, + "learning_rate": 2.6824779505997387e-06, + "loss": 0.6844, + "step": 12380 + }, + { + "epoch": 0.8092749836708034, + "grad_norm": 5.293840737674243, + "learning_rate": 2.6647603240935416e-06, + "loss": 0.7063, + "step": 12390 + }, + { + "epoch": 0.8099281515349445, + "grad_norm": 2.758612843514663, + "learning_rate": 2.6470956978459894e-06, + "loss": 0.7293, + "step": 12400 + }, + { + "epoch": 0.8105813193990856, + "grad_norm": 3.4769473430781326, + "learning_rate": 2.6294841477558746e-06, + "loss": 0.6904, + "step": 12410 + }, + { + "epoch": 0.8112344872632267, + "grad_norm": 3.5772224187425494, + "learning_rate": 2.6119257494939338e-06, + "loss": 0.7102, + "step": 12420 + }, + { + "epoch": 0.8118876551273677, + "grad_norm": 49.35632558244181, + "learning_rate": 2.594420578502537e-06, + "loss": 0.6637, + "step": 12430 + }, + { + "epoch": 0.8125408229915089, + "grad_norm": 2.4670140404148997, + "learning_rate": 2.576968709995342e-06, + "loss": 0.6894, + "step": 12440 + }, + { + "epoch": 0.8131939908556499, + "grad_norm": 3.1293163245147513, + "learning_rate": 2.5595702189570034e-06, + "loss": 0.6928, + "step": 12450 + }, + { + "epoch": 0.813847158719791, + "grad_norm": 4.722752774421984, + "learning_rate": 2.542225180142807e-06, + "loss": 0.7045, + "step": 12460 + }, + { + "epoch": 0.8145003265839321, + "grad_norm": 4.282456499492312, + "learning_rate": 2.524933668078393e-06, + "loss": 0.6874, + "step": 12470 + }, + { + "epoch": 0.8151534944480732, + "grad_norm": 7.445379875334576, + "learning_rate": 2.507695757059406e-06, + "loss": 0.693, + "step": 12480 + }, + { + "epoch": 0.8158066623122142, + "grad_norm": 9.103161742545142, + "learning_rate": 2.490511521151187e-06, + "loss": 0.6902, + "step": 12490 + }, + { + "epoch": 0.8164598301763554, + "grad_norm": 23.66529921042094, + "learning_rate": 2.473381034188457e-06, + "loss": 0.6828, + "step": 12500 + }, + { + "epoch": 0.8171129980404964, + "grad_norm": 7.930931677716769, + "learning_rate": 2.45630436977499e-06, + "loss": 0.6911, + "step": 12510 + }, + { + "epoch": 0.8177661659046375, + "grad_norm": 6.202448810456874, + "learning_rate": 2.439281601283313e-06, + "loss": 0.7151, + "step": 12520 + }, + { + "epoch": 0.8184193337687786, + "grad_norm": 4.083723529675006, + "learning_rate": 2.4223128018543698e-06, + "loss": 0.6664, + "step": 12530 + }, + { + "epoch": 0.8190725016329197, + "grad_norm": 7.755204698823513, + "learning_rate": 2.4053980443972262e-06, + "loss": 0.7058, + "step": 12540 + }, + { + "epoch": 0.8197256694970607, + "grad_norm": 2.526437776845295, + "learning_rate": 2.388537401588738e-06, + "loss": 0.7069, + "step": 12550 + }, + { + "epoch": 0.8203788373612019, + "grad_norm": 7.229351499395132, + "learning_rate": 2.371730945873264e-06, + "loss": 0.6185, + "step": 12560 + }, + { + "epoch": 0.8210320052253429, + "grad_norm": 2.0170822198816527, + "learning_rate": 2.3549787494623277e-06, + "loss": 0.6377, + "step": 12570 + }, + { + "epoch": 0.821685173089484, + "grad_norm": 6.849124278873261, + "learning_rate": 2.3382808843343225e-06, + "loss": 0.6372, + "step": 12580 + }, + { + "epoch": 0.8223383409536251, + "grad_norm": 3.949714123967933, + "learning_rate": 2.321637422234203e-06, + "loss": 0.6714, + "step": 12590 + }, + { + "epoch": 0.8229915088177662, + "grad_norm": 5.466741225656826, + "learning_rate": 2.305048434673168e-06, + "loss": 0.6814, + "step": 12600 + }, + { + "epoch": 0.8236446766819072, + "grad_norm": 9.13517549226144, + "learning_rate": 2.2885139929283605e-06, + "loss": 0.6825, + "step": 12610 + }, + { + "epoch": 0.8242978445460484, + "grad_norm": 3.5636581412270987, + "learning_rate": 2.2720341680425514e-06, + "loss": 0.6953, + "step": 12620 + }, + { + "epoch": 0.8249510124101894, + "grad_norm": 5.3332834638807185, + "learning_rate": 2.255609030823859e-06, + "loss": 0.706, + "step": 12630 + }, + { + "epoch": 0.8256041802743305, + "grad_norm": 6.947948938053646, + "learning_rate": 2.239238651845409e-06, + "loss": 0.7044, + "step": 12640 + }, + { + "epoch": 0.8262573481384716, + "grad_norm": 10.389458566460489, + "learning_rate": 2.2229231014450648e-06, + "loss": 0.6701, + "step": 12650 + }, + { + "epoch": 0.8269105160026127, + "grad_norm": 4.448630973164709, + "learning_rate": 2.2066624497251005e-06, + "loss": 0.7235, + "step": 12660 + }, + { + "epoch": 0.8275636838667537, + "grad_norm": 7.655730311603453, + "learning_rate": 2.1904567665519086e-06, + "loss": 0.7614, + "step": 12670 + }, + { + "epoch": 0.8282168517308949, + "grad_norm": 10.563719188142853, + "learning_rate": 2.1743061215557148e-06, + "loss": 0.732, + "step": 12680 + }, + { + "epoch": 0.8288700195950359, + "grad_norm": 11.322455477804937, + "learning_rate": 2.1582105841302425e-06, + "loss": 0.6881, + "step": 12690 + }, + { + "epoch": 0.829523187459177, + "grad_norm": 10.512372674522478, + "learning_rate": 2.1421702234324587e-06, + "loss": 0.6815, + "step": 12700 + }, + { + "epoch": 0.8301763553233181, + "grad_norm": 4.34459453029744, + "learning_rate": 2.1261851083822383e-06, + "loss": 0.6837, + "step": 12710 + }, + { + "epoch": 0.8308295231874592, + "grad_norm": 15.14965475463287, + "learning_rate": 2.110255307662101e-06, + "loss": 0.6867, + "step": 12720 + }, + { + "epoch": 0.8314826910516002, + "grad_norm": 6.047047851615411, + "learning_rate": 2.094380889716881e-06, + "loss": 0.7015, + "step": 12730 + }, + { + "epoch": 0.8321358589157414, + "grad_norm": 3.228008989758837, + "learning_rate": 2.078561922753471e-06, + "loss": 0.6774, + "step": 12740 + }, + { + "epoch": 0.8327890267798824, + "grad_norm": 5.460843195940031, + "learning_rate": 2.062798474740496e-06, + "loss": 0.6863, + "step": 12750 + }, + { + "epoch": 0.8334421946440235, + "grad_norm": 29.725573858395695, + "learning_rate": 2.047090613408043e-06, + "loss": 0.6753, + "step": 12760 + }, + { + "epoch": 0.8340953625081646, + "grad_norm": 4.378649687928872, + "learning_rate": 2.0314384062473564e-06, + "loss": 0.7087, + "step": 12770 + }, + { + "epoch": 0.8347485303723057, + "grad_norm": 6.7044943809033395, + "learning_rate": 2.0158419205105545e-06, + "loss": 0.6614, + "step": 12780 + }, + { + "epoch": 0.8354016982364467, + "grad_norm": 16.866259310380592, + "learning_rate": 2.0003012232103496e-06, + "loss": 0.6765, + "step": 12790 + }, + { + "epoch": 0.8360548661005879, + "grad_norm": 4.040076241104312, + "learning_rate": 1.9848163811197375e-06, + "loss": 0.6687, + "step": 12800 + }, + { + "epoch": 0.8367080339647289, + "grad_norm": 3.751237410791482, + "learning_rate": 1.9693874607717334e-06, + "loss": 0.674, + "step": 12810 + }, + { + "epoch": 0.83736120182887, + "grad_norm": 6.810935024428175, + "learning_rate": 1.9540145284590656e-06, + "loss": 0.7296, + "step": 12820 + }, + { + "epoch": 0.8380143696930111, + "grad_norm": 22.30397259690559, + "learning_rate": 1.9386976502339195e-06, + "loss": 0.6802, + "step": 12830 + }, + { + "epoch": 0.8386675375571522, + "grad_norm": 9.640858454172971, + "learning_rate": 1.923436891907608e-06, + "loss": 0.697, + "step": 12840 + }, + { + "epoch": 0.8393207054212932, + "grad_norm": 5.1550539421271075, + "learning_rate": 1.9082323190503403e-06, + "loss": 0.6743, + "step": 12850 + }, + { + "epoch": 0.8399738732854344, + "grad_norm": 3.6143646760410846, + "learning_rate": 1.8930839969909075e-06, + "loss": 0.7415, + "step": 12860 + }, + { + "epoch": 0.8406270411495754, + "grad_norm": 5.464168525956508, + "learning_rate": 1.877991990816405e-06, + "loss": 0.6898, + "step": 12870 + }, + { + "epoch": 0.8412802090137165, + "grad_norm": 3.456997732226285, + "learning_rate": 1.8629563653719705e-06, + "loss": 0.6824, + "step": 12880 + }, + { + "epoch": 0.8419333768778576, + "grad_norm": 3.9004033393633533, + "learning_rate": 1.8479771852604805e-06, + "loss": 0.7254, + "step": 12890 + }, + { + "epoch": 0.8425865447419987, + "grad_norm": 14.7940208316794, + "learning_rate": 1.8330545148422966e-06, + "loss": 0.6697, + "step": 12900 + }, + { + "epoch": 0.8432397126061397, + "grad_norm": 5.865402823921409, + "learning_rate": 1.8181884182349707e-06, + "loss": 0.6707, + "step": 12910 + }, + { + "epoch": 0.8438928804702809, + "grad_norm": 5.3692230407006765, + "learning_rate": 1.8033789593129763e-06, + "loss": 0.7178, + "step": 12920 + }, + { + "epoch": 0.8445460483344219, + "grad_norm": 2.7141742127684214, + "learning_rate": 1.788626201707434e-06, + "loss": 0.6836, + "step": 12930 + }, + { + "epoch": 0.8451992161985631, + "grad_norm": 13.91576329266843, + "learning_rate": 1.773930208805849e-06, + "loss": 0.6617, + "step": 12940 + }, + { + "epoch": 0.8458523840627041, + "grad_norm": 6.069386768818718, + "learning_rate": 1.7592910437518134e-06, + "loss": 0.6836, + "step": 12950 + }, + { + "epoch": 0.8465055519268452, + "grad_norm": 6.304680844366065, + "learning_rate": 1.7447087694447577e-06, + "loss": 0.693, + "step": 12960 + }, + { + "epoch": 0.8471587197909863, + "grad_norm": 3.3827802202829838, + "learning_rate": 1.7301834485396733e-06, + "loss": 0.7326, + "step": 12970 + }, + { + "epoch": 0.8478118876551274, + "grad_norm": 3.4211838951874456, + "learning_rate": 1.7157151434468371e-06, + "loss": 0.7205, + "step": 12980 + }, + { + "epoch": 0.8484650555192684, + "grad_norm": 4.341322186081593, + "learning_rate": 1.7013039163315602e-06, + "loss": 0.6569, + "step": 12990 + }, + { + "epoch": 0.8491182233834096, + "grad_norm": 4.980096876435401, + "learning_rate": 1.6869498291138886e-06, + "loss": 0.6956, + "step": 13000 + }, + { + "epoch": 0.8497713912475506, + "grad_norm": 4.701793190597957, + "learning_rate": 1.6726529434683808e-06, + "loss": 0.6784, + "step": 13010 + }, + { + "epoch": 0.8504245591116917, + "grad_norm": 2.3121382971719107, + "learning_rate": 1.6584133208238023e-06, + "loss": 0.6594, + "step": 13020 + }, + { + "epoch": 0.8510777269758328, + "grad_norm": 4.607070398447369, + "learning_rate": 1.6442310223628936e-06, + "loss": 0.6819, + "step": 13030 + }, + { + "epoch": 0.8517308948399739, + "grad_norm": 5.658518973644752, + "learning_rate": 1.6301061090220825e-06, + "loss": 0.6855, + "step": 13040 + }, + { + "epoch": 0.8523840627041149, + "grad_norm": 3.675512970915112, + "learning_rate": 1.6160386414912354e-06, + "loss": 0.6815, + "step": 13050 + }, + { + "epoch": 0.8530372305682561, + "grad_norm": 3.722639598188754, + "learning_rate": 1.6020286802134027e-06, + "loss": 0.6526, + "step": 13060 + }, + { + "epoch": 0.8536903984323971, + "grad_norm": 6.002349711311793, + "learning_rate": 1.5880762853845294e-06, + "loss": 0.6802, + "step": 13070 + }, + { + "epoch": 0.8543435662965382, + "grad_norm": 20.86184076601306, + "learning_rate": 1.5741815169532398e-06, + "loss": 0.6974, + "step": 13080 + }, + { + "epoch": 0.8549967341606793, + "grad_norm": 3.7061664951431528, + "learning_rate": 1.560344434620543e-06, + "loss": 0.6771, + "step": 13090 + }, + { + "epoch": 0.8556499020248204, + "grad_norm": 3.2473742808850674, + "learning_rate": 1.5465650978396035e-06, + "loss": 0.6931, + "step": 13100 + }, + { + "epoch": 0.8563030698889614, + "grad_norm": 4.710037980625067, + "learning_rate": 1.5328435658154565e-06, + "loss": 0.694, + "step": 13110 + }, + { + "epoch": 0.8569562377531026, + "grad_norm": 6.6459503004091, + "learning_rate": 1.5191798975047889e-06, + "loss": 0.6468, + "step": 13120 + }, + { + "epoch": 0.8576094056172436, + "grad_norm": 24.670829765303676, + "learning_rate": 1.5055741516156519e-06, + "loss": 0.7096, + "step": 13130 + }, + { + "epoch": 0.8582625734813847, + "grad_norm": 4.199668625974335, + "learning_rate": 1.4920263866072314e-06, + "loss": 0.718, + "step": 13140 + }, + { + "epoch": 0.8589157413455258, + "grad_norm": 31.907476179303377, + "learning_rate": 1.4785366606895879e-06, + "loss": 0.6497, + "step": 13150 + }, + { + "epoch": 0.8595689092096669, + "grad_norm": 7.202527863416497, + "learning_rate": 1.4651050318234055e-06, + "loss": 0.7042, + "step": 13160 + }, + { + "epoch": 0.8602220770738079, + "grad_norm": 40.873267834092985, + "learning_rate": 1.451731557719752e-06, + "loss": 0.7122, + "step": 13170 + }, + { + "epoch": 0.8608752449379491, + "grad_norm": 5.618784882943198, + "learning_rate": 1.4384162958398166e-06, + "loss": 0.703, + "step": 13180 + }, + { + "epoch": 0.8615284128020901, + "grad_norm": 4.327833475699335, + "learning_rate": 1.4251593033946803e-06, + "loss": 0.6976, + "step": 13190 + }, + { + "epoch": 0.8621815806662312, + "grad_norm": 3.571729116055556, + "learning_rate": 1.4119606373450455e-06, + "loss": 0.6785, + "step": 13200 + }, + { + "epoch": 0.8628347485303723, + "grad_norm": 3.4649354047382745, + "learning_rate": 1.3988203544010292e-06, + "loss": 0.6709, + "step": 13210 + }, + { + "epoch": 0.8634879163945134, + "grad_norm": 7.301633145272006, + "learning_rate": 1.3857385110218668e-06, + "loss": 0.6924, + "step": 13220 + }, + { + "epoch": 0.8641410842586544, + "grad_norm": 3.4298172146330854, + "learning_rate": 1.3727151634157249e-06, + "loss": 0.7395, + "step": 13230 + }, + { + "epoch": 0.8647942521227956, + "grad_norm": 107.76265151943878, + "learning_rate": 1.3597503675394225e-06, + "loss": 0.6282, + "step": 13240 + }, + { + "epoch": 0.8654474199869366, + "grad_norm": 3.904019382647567, + "learning_rate": 1.3468441790981983e-06, + "loss": 0.7327, + "step": 13250 + }, + { + "epoch": 0.8661005878510777, + "grad_norm": 5.708366264006957, + "learning_rate": 1.3339966535454861e-06, + "loss": 0.6733, + "step": 13260 + }, + { + "epoch": 0.8667537557152188, + "grad_norm": 4.535509514665049, + "learning_rate": 1.321207846082656e-06, + "loss": 0.7116, + "step": 13270 + }, + { + "epoch": 0.8674069235793599, + "grad_norm": 11.624418510742403, + "learning_rate": 1.3084778116587948e-06, + "loss": 0.6779, + "step": 13280 + }, + { + "epoch": 0.8680600914435009, + "grad_norm": 4.095446751856266, + "learning_rate": 1.2958066049704564e-06, + "loss": 0.6732, + "step": 13290 + }, + { + "epoch": 0.8687132593076421, + "grad_norm": 3.6972034608317315, + "learning_rate": 1.2831942804614306e-06, + "loss": 0.7488, + "step": 13300 + }, + { + "epoch": 0.8693664271717831, + "grad_norm": 3.3586477512686637, + "learning_rate": 1.2706408923225138e-06, + "loss": 0.7328, + "step": 13310 + }, + { + "epoch": 0.8700195950359242, + "grad_norm": 4.6623427704591744, + "learning_rate": 1.2581464944912774e-06, + "loss": 0.7201, + "step": 13320 + }, + { + "epoch": 0.8706727629000653, + "grad_norm": 15.700944344857493, + "learning_rate": 1.245711140651825e-06, + "loss": 0.6937, + "step": 13330 + }, + { + "epoch": 0.8713259307642064, + "grad_norm": 4.451147450157866, + "learning_rate": 1.2333348842345687e-06, + "loss": 0.6852, + "step": 13340 + }, + { + "epoch": 0.8719790986283474, + "grad_norm": 2.738144548734267, + "learning_rate": 1.2210177784160064e-06, + "loss": 0.7138, + "step": 13350 + }, + { + "epoch": 0.8726322664924886, + "grad_norm": 2.8632800842532586, + "learning_rate": 1.2087598761184765e-06, + "loss": 0.6942, + "step": 13360 + }, + { + "epoch": 0.8732854343566296, + "grad_norm": 7.694040880817023, + "learning_rate": 1.1965612300099555e-06, + "loss": 0.7027, + "step": 13370 + }, + { + "epoch": 0.8739386022207707, + "grad_norm": 14.48342526973317, + "learning_rate": 1.1844218925037953e-06, + "loss": 0.6937, + "step": 13380 + }, + { + "epoch": 0.8745917700849118, + "grad_norm": 2.2731035649293525, + "learning_rate": 1.1723419157585386e-06, + "loss": 0.7297, + "step": 13390 + }, + { + "epoch": 0.8752449379490529, + "grad_norm": 5.450732911551555, + "learning_rate": 1.16032135167766e-06, + "loss": 0.6678, + "step": 13400 + }, + { + "epoch": 0.8758981058131939, + "grad_norm": 2.4707591898591983, + "learning_rate": 1.148360251909374e-06, + "loss": 0.7047, + "step": 13410 + }, + { + "epoch": 0.8765512736773351, + "grad_norm": 4.69178149194048, + "learning_rate": 1.1364586678463868e-06, + "loss": 0.6913, + "step": 13420 + }, + { + "epoch": 0.8772044415414761, + "grad_norm": 5.475472291406911, + "learning_rate": 1.1246166506256834e-06, + "loss": 0.7222, + "step": 13430 + }, + { + "epoch": 0.8778576094056172, + "grad_norm": 6.649352605756714, + "learning_rate": 1.1128342511283278e-06, + "loss": 0.6908, + "step": 13440 + }, + { + "epoch": 0.8785107772697583, + "grad_norm": 4.445498497797621, + "learning_rate": 1.1011115199792032e-06, + "loss": 0.6752, + "step": 13450 + }, + { + "epoch": 0.8791639451338994, + "grad_norm": 10.860958812672406, + "learning_rate": 1.0894485075468385e-06, + "loss": 0.6967, + "step": 13460 + }, + { + "epoch": 0.8798171129980406, + "grad_norm": 19.130271434463495, + "learning_rate": 1.0778452639431585e-06, + "loss": 0.6903, + "step": 13470 + }, + { + "epoch": 0.8804702808621816, + "grad_norm": 6.029789723135012, + "learning_rate": 1.0663018390232947e-06, + "loss": 0.6964, + "step": 13480 + }, + { + "epoch": 0.8811234487263226, + "grad_norm": 4.7575663485688935, + "learning_rate": 1.0548182823853463e-06, + "loss": 0.7022, + "step": 13490 + }, + { + "epoch": 0.8817766165904638, + "grad_norm": 5.457678175572029, + "learning_rate": 1.0433946433701896e-06, + "loss": 0.7079, + "step": 13500 + }, + { + "epoch": 0.8824297844546048, + "grad_norm": 7.560645262211808, + "learning_rate": 1.0320309710612469e-06, + "loss": 0.6716, + "step": 13510 + }, + { + "epoch": 0.8830829523187459, + "grad_norm": 12.568101839096094, + "learning_rate": 1.0207273142842899e-06, + "loss": 0.6721, + "step": 13520 + }, + { + "epoch": 0.883736120182887, + "grad_norm": 5.211823395284773, + "learning_rate": 1.00948372160722e-06, + "loss": 0.6953, + "step": 13530 + }, + { + "epoch": 0.8843892880470281, + "grad_norm": 6.709679162531643, + "learning_rate": 9.983002413398635e-07, + "loss": 0.694, + "step": 13540 + }, + { + "epoch": 0.8850424559111691, + "grad_norm": 9.576647297562653, + "learning_rate": 9.871769215337744e-07, + "loss": 0.6808, + "step": 13550 + }, + { + "epoch": 0.8856956237753103, + "grad_norm": 3.263554550975047, + "learning_rate": 9.76113809982006e-07, + "loss": 0.6822, + "step": 13560 + }, + { + "epoch": 0.8863487916394513, + "grad_norm": 2.721918474062616, + "learning_rate": 9.651109542189246e-07, + "loss": 0.6771, + "step": 13570 + }, + { + "epoch": 0.8870019595035924, + "grad_norm": 6.588269310751218, + "learning_rate": 9.541684015199937e-07, + "loss": 0.6984, + "step": 13580 + }, + { + "epoch": 0.8876551273677336, + "grad_norm": 4.5199986169188415, + "learning_rate": 9.432861989015806e-07, + "loss": 0.6777, + "step": 13590 + }, + { + "epoch": 0.8883082952318746, + "grad_norm": 4.463527161594297, + "learning_rate": 9.324643931207438e-07, + "loss": 0.6895, + "step": 13600 + }, + { + "epoch": 0.8889614630960156, + "grad_norm": 2.5564012291530616, + "learning_rate": 9.217030306750424e-07, + "loss": 0.6993, + "step": 13610 + }, + { + "epoch": 0.8896146309601568, + "grad_norm": 3.939372389518821, + "learning_rate": 9.110021578023265e-07, + "loss": 0.7517, + "step": 13620 + }, + { + "epoch": 0.8902677988242979, + "grad_norm": 3.3916239167939524, + "learning_rate": 9.003618204805458e-07, + "loss": 0.6961, + "step": 13630 + }, + { + "epoch": 0.8909209666884389, + "grad_norm": 3.676860784580647, + "learning_rate": 8.897820644275517e-07, + "loss": 0.6922, + "step": 13640 + }, + { + "epoch": 0.89157413455258, + "grad_norm": 84.63930247489932, + "learning_rate": 8.792629351008935e-07, + "loss": 0.7671, + "step": 13650 + }, + { + "epoch": 0.8922273024167211, + "grad_norm": 3.7924065060240566, + "learning_rate": 8.688044776976373e-07, + "loss": 0.7007, + "step": 13660 + }, + { + "epoch": 0.8928804702808621, + "grad_norm": 5.923617546949608, + "learning_rate": 8.584067371541543e-07, + "loss": 0.669, + "step": 13670 + }, + { + "epoch": 0.8935336381450033, + "grad_norm": 6.261930142057152, + "learning_rate": 8.480697581459379e-07, + "loss": 0.6811, + "step": 13680 + }, + { + "epoch": 0.8941868060091444, + "grad_norm": 4.144322540176693, + "learning_rate": 8.377935850874136e-07, + "loss": 0.6428, + "step": 13690 + }, + { + "epoch": 0.8948399738732854, + "grad_norm": 5.285507743069797, + "learning_rate": 8.275782621317424e-07, + "loss": 0.6897, + "step": 13700 + }, + { + "epoch": 0.8954931417374266, + "grad_norm": 5.880334907388088, + "learning_rate": 8.174238331706346e-07, + "loss": 0.7171, + "step": 13710 + }, + { + "epoch": 0.8961463096015676, + "grad_norm": 4.729785268155556, + "learning_rate": 8.073303418341582e-07, + "loss": 0.7273, + "step": 13720 + }, + { + "epoch": 0.8967994774657086, + "grad_norm": 4.18389236139603, + "learning_rate": 7.972978314905572e-07, + "loss": 0.6712, + "step": 13730 + }, + { + "epoch": 0.8974526453298498, + "grad_norm": 36.68213932833151, + "learning_rate": 7.873263452460533e-07, + "loss": 0.7055, + "step": 13740 + }, + { + "epoch": 0.8981058131939909, + "grad_norm": 6.203815873346086, + "learning_rate": 7.774159259446834e-07, + "loss": 0.7088, + "step": 13750 + }, + { + "epoch": 0.8987589810581319, + "grad_norm": 6.0828838792008675, + "learning_rate": 7.675666161680822e-07, + "loss": 0.6817, + "step": 13760 + }, + { + "epoch": 0.8994121489222731, + "grad_norm": 15.42583812742743, + "learning_rate": 7.577784582353314e-07, + "loss": 0.6922, + "step": 13770 + }, + { + "epoch": 0.9000653167864141, + "grad_norm": 5.40224506897626, + "learning_rate": 7.480514942027595e-07, + "loss": 0.6646, + "step": 13780 + }, + { + "epoch": 0.9007184846505552, + "grad_norm": 11.010252596168959, + "learning_rate": 7.383857658637699e-07, + "loss": 0.6413, + "step": 13790 + }, + { + "epoch": 0.9013716525146963, + "grad_norm": 10.280050004770693, + "learning_rate": 7.287813147486522e-07, + "loss": 0.6916, + "step": 13800 + }, + { + "epoch": 0.9020248203788374, + "grad_norm": 27.838447999335717, + "learning_rate": 7.192381821244076e-07, + "loss": 0.6672, + "step": 13810 + }, + { + "epoch": 0.9026779882429784, + "grad_norm": 3.5603793340741716, + "learning_rate": 7.097564089945819e-07, + "loss": 0.6974, + "step": 13820 + }, + { + "epoch": 0.9033311561071196, + "grad_norm": 1.8945309353857664, + "learning_rate": 7.003360360990713e-07, + "loss": 0.695, + "step": 13830 + }, + { + "epoch": 0.9039843239712606, + "grad_norm": 3.734134937694426, + "learning_rate": 6.909771039139618e-07, + "loss": 0.6432, + "step": 13840 + }, + { + "epoch": 0.9046374918354017, + "grad_norm": 8.454806289303644, + "learning_rate": 6.816796526513469e-07, + "loss": 0.6882, + "step": 13850 + }, + { + "epoch": 0.9052906596995428, + "grad_norm": 6.13037154100083, + "learning_rate": 6.724437222591601e-07, + "loss": 0.7562, + "step": 13860 + }, + { + "epoch": 0.9059438275636839, + "grad_norm": 37.35666127481901, + "learning_rate": 6.632693524209993e-07, + "loss": 0.693, + "step": 13870 + }, + { + "epoch": 0.9065969954278249, + "grad_norm": 2.5934276150658575, + "learning_rate": 6.541565825559608e-07, + "loss": 0.6659, + "step": 13880 + }, + { + "epoch": 0.9072501632919661, + "grad_norm": 4.4479991658024725, + "learning_rate": 6.451054518184613e-07, + "loss": 0.7033, + "step": 13890 + }, + { + "epoch": 0.9079033311561071, + "grad_norm": 3.9779535944376616, + "learning_rate": 6.361159990980836e-07, + "loss": 0.7081, + "step": 13900 + }, + { + "epoch": 0.9085564990202482, + "grad_norm": 10.652074806950573, + "learning_rate": 6.271882630193931e-07, + "loss": 0.7306, + "step": 13910 + }, + { + "epoch": 0.9092096668843893, + "grad_norm": 8.959014393548467, + "learning_rate": 6.183222819417822e-07, + "loss": 0.7099, + "step": 13920 + }, + { + "epoch": 0.9098628347485304, + "grad_norm": 7.508628205476429, + "learning_rate": 6.09518093959312e-07, + "loss": 0.6944, + "step": 13930 + }, + { + "epoch": 0.9105160026126714, + "grad_norm": 3.6530980274780824, + "learning_rate": 6.007757369005278e-07, + "loss": 0.6975, + "step": 13940 + }, + { + "epoch": 0.9111691704768126, + "grad_norm": 16.31255518878356, + "learning_rate": 5.920952483283159e-07, + "loss": 0.6187, + "step": 13950 + }, + { + "epoch": 0.9118223383409536, + "grad_norm": 3.3490188300490993, + "learning_rate": 5.834766655397334e-07, + "loss": 0.6923, + "step": 13960 + }, + { + "epoch": 0.9124755062050947, + "grad_norm": 5.306609893923255, + "learning_rate": 5.749200255658516e-07, + "loss": 0.6937, + "step": 13970 + }, + { + "epoch": 0.9131286740692358, + "grad_norm": 7.073860955629541, + "learning_rate": 5.664253651715917e-07, + "loss": 0.6991, + "step": 13980 + }, + { + "epoch": 0.9137818419333769, + "grad_norm": 8.503460348036263, + "learning_rate": 5.579927208555713e-07, + "loss": 0.7047, + "step": 13990 + }, + { + "epoch": 0.914435009797518, + "grad_norm": 8.502466629263433, + "learning_rate": 5.496221288499459e-07, + "loss": 0.6506, + "step": 14000 + }, + { + "epoch": 0.9150881776616591, + "grad_norm": 3.533571813440642, + "learning_rate": 5.413136251202544e-07, + "loss": 0.7007, + "step": 14010 + }, + { + "epoch": 0.9157413455258001, + "grad_norm": 4.645811939047689, + "learning_rate": 5.330672453652657e-07, + "loss": 0.7376, + "step": 14020 + }, + { + "epoch": 0.9163945133899413, + "grad_norm": 9.018121912413273, + "learning_rate": 5.248830250168174e-07, + "loss": 0.7021, + "step": 14030 + }, + { + "epoch": 0.9170476812540823, + "grad_norm": 2.049483638954049, + "learning_rate": 5.167609992396788e-07, + "loss": 0.691, + "step": 14040 + }, + { + "epoch": 0.9177008491182234, + "grad_norm": 3.587822578628003, + "learning_rate": 5.087012029313832e-07, + "loss": 0.7162, + "step": 14050 + }, + { + "epoch": 0.9183540169823645, + "grad_norm": 5.337187010546555, + "learning_rate": 5.007036707220874e-07, + "loss": 0.6885, + "step": 14060 + }, + { + "epoch": 0.9190071848465056, + "grad_norm": 4.327237365263435, + "learning_rate": 4.927684369744195e-07, + "loss": 0.7006, + "step": 14070 + }, + { + "epoch": 0.9196603527106466, + "grad_norm": 4.396522769226511, + "learning_rate": 4.848955357833396e-07, + "loss": 0.6899, + "step": 14080 + }, + { + "epoch": 0.9203135205747878, + "grad_norm": 5.83333667573379, + "learning_rate": 4.770850009759769e-07, + "loss": 0.7049, + "step": 14090 + }, + { + "epoch": 0.9209666884389288, + "grad_norm": 3.0477622280944137, + "learning_rate": 4.693368661114988e-07, + "loss": 0.7459, + "step": 14100 + }, + { + "epoch": 0.9216198563030699, + "grad_norm": 8.532140217261535, + "learning_rate": 4.6165116448096346e-07, + "loss": 0.6397, + "step": 14110 + }, + { + "epoch": 0.922273024167211, + "grad_norm": 10.03387663966036, + "learning_rate": 4.5402792910717026e-07, + "loss": 0.6632, + "step": 14120 + }, + { + "epoch": 0.9229261920313521, + "grad_norm": 7.781927975452916, + "learning_rate": 4.4646719274452685e-07, + "loss": 0.6627, + "step": 14130 + }, + { + "epoch": 0.9235793598954931, + "grad_norm": 4.884147473757335, + "learning_rate": 4.3896898787889885e-07, + "loss": 0.643, + "step": 14140 + }, + { + "epoch": 0.9242325277596343, + "grad_norm": 2.4835514646988126, + "learning_rate": 4.315333467274851e-07, + "loss": 0.7385, + "step": 14150 + }, + { + "epoch": 0.9248856956237753, + "grad_norm": 3.3527394609036048, + "learning_rate": 4.2416030123865634e-07, + "loss": 0.6989, + "step": 14160 + }, + { + "epoch": 0.9255388634879164, + "grad_norm": 6.4679488271034815, + "learning_rate": 4.1684988309184656e-07, + "loss": 0.6573, + "step": 14170 + }, + { + "epoch": 0.9261920313520575, + "grad_norm": 13.210624488086639, + "learning_rate": 4.0960212369739016e-07, + "loss": 0.6825, + "step": 14180 + }, + { + "epoch": 0.9268451992161986, + "grad_norm": 3.6521328951919405, + "learning_rate": 4.024170541964017e-07, + "loss": 0.6929, + "step": 14190 + }, + { + "epoch": 0.9274983670803396, + "grad_norm": 7.8804629314783465, + "learning_rate": 3.9529470546064315e-07, + "loss": 0.705, + "step": 14200 + }, + { + "epoch": 0.9281515349444808, + "grad_norm": 6.128700539787899, + "learning_rate": 3.8823510809238184e-07, + "loss": 0.6672, + "step": 14210 + }, + { + "epoch": 0.9288047028086218, + "grad_norm": 15.312114352605443, + "learning_rate": 3.8123829242426577e-07, + "loss": 0.6954, + "step": 14220 + }, + { + "epoch": 0.9294578706727629, + "grad_norm": 9.102448069176269, + "learning_rate": 3.743042885191922e-07, + "loss": 0.6824, + "step": 14230 + }, + { + "epoch": 0.930111038536904, + "grad_norm": 2.7931428014768844, + "learning_rate": 3.6743312617017745e-07, + "loss": 0.6554, + "step": 14240 + }, + { + "epoch": 0.9307642064010451, + "grad_norm": 2.903822346384219, + "learning_rate": 3.6062483490023056e-07, + "loss": 0.7012, + "step": 14250 + }, + { + "epoch": 0.9314173742651861, + "grad_norm": 11.335648094600034, + "learning_rate": 3.538794439622234e-07, + "loss": 0.6825, + "step": 14260 + }, + { + "epoch": 0.9320705421293273, + "grad_norm": 4.648415752969088, + "learning_rate": 3.471969823387705e-07, + "loss": 0.6567, + "step": 14270 + }, + { + "epoch": 0.9327237099934683, + "grad_norm": 5.642285892343845, + "learning_rate": 3.4057747874209457e-07, + "loss": 0.6932, + "step": 14280 + }, + { + "epoch": 0.9333768778576094, + "grad_norm": 2.1561865313490833, + "learning_rate": 3.340209616139145e-07, + "loss": 0.6706, + "step": 14290 + }, + { + "epoch": 0.9340300457217505, + "grad_norm": 4.556269114261926, + "learning_rate": 3.2752745912531743e-07, + "loss": 0.6885, + "step": 14300 + }, + { + "epoch": 0.9346832135858916, + "grad_norm": 3.377248791911224, + "learning_rate": 3.2109699917663713e-07, + "loss": 0.6943, + "step": 14310 + }, + { + "epoch": 0.9353363814500326, + "grad_norm": 4.49289672560348, + "learning_rate": 3.1472960939733566e-07, + "loss": 0.6961, + "step": 14320 + }, + { + "epoch": 0.9359895493141738, + "grad_norm": 5.042183316243519, + "learning_rate": 3.0842531714588673e-07, + "loss": 0.6841, + "step": 14330 + }, + { + "epoch": 0.9366427171783148, + "grad_norm": 8.043654543674085, + "learning_rate": 3.0218414950964944e-07, + "loss": 0.6923, + "step": 14340 + }, + { + "epoch": 0.9372958850424559, + "grad_norm": 4.451546800933181, + "learning_rate": 2.9600613330476814e-07, + "loss": 0.6797, + "step": 14350 + }, + { + "epoch": 0.937949052906597, + "grad_norm": 3.1482886191633104, + "learning_rate": 2.8989129507603904e-07, + "loss": 0.6939, + "step": 14360 + }, + { + "epoch": 0.9386022207707381, + "grad_norm": 3.7098862614638266, + "learning_rate": 2.8383966109680747e-07, + "loss": 0.6863, + "step": 14370 + }, + { + "epoch": 0.9392553886348791, + "grad_norm": 5.3282925308156, + "learning_rate": 2.778512573688491e-07, + "loss": 0.6897, + "step": 14380 + }, + { + "epoch": 0.9399085564990203, + "grad_norm": 4.219253623006551, + "learning_rate": 2.719261096222669e-07, + "loss": 0.7354, + "step": 14390 + }, + { + "epoch": 0.9405617243631613, + "grad_norm": 3.929316539205132, + "learning_rate": 2.660642433153698e-07, + "loss": 0.6779, + "step": 14400 + }, + { + "epoch": 0.9412148922273024, + "grad_norm": 5.935845074156229, + "learning_rate": 2.602656836345707e-07, + "loss": 0.7155, + "step": 14410 + }, + { + "epoch": 0.9418680600914435, + "grad_norm": 17.773980074135935, + "learning_rate": 2.545304554942751e-07, + "loss": 0.7328, + "step": 14420 + }, + { + "epoch": 0.9425212279555846, + "grad_norm": 7.140119338547583, + "learning_rate": 2.4885858353677295e-07, + "loss": 0.6906, + "step": 14430 + }, + { + "epoch": 0.9431743958197256, + "grad_norm": 3.561871657255324, + "learning_rate": 2.4325009213214177e-07, + "loss": 0.696, + "step": 14440 + }, + { + "epoch": 0.9438275636838668, + "grad_norm": 9.022172530381402, + "learning_rate": 2.3770500537812211e-07, + "loss": 0.6945, + "step": 14450 + }, + { + "epoch": 0.9444807315480078, + "grad_norm": 2.9858760553941734, + "learning_rate": 2.32223347100039e-07, + "loss": 0.7145, + "step": 14460 + }, + { + "epoch": 0.9451338994121489, + "grad_norm": 10.192499162843738, + "learning_rate": 2.2680514085068049e-07, + "loss": 0.6805, + "step": 14470 + }, + { + "epoch": 0.94578706727629, + "grad_norm": 2.867441607864355, + "learning_rate": 2.214504099102044e-07, + "loss": 0.6791, + "step": 14480 + }, + { + "epoch": 0.9464402351404311, + "grad_norm": 3.496606271353713, + "learning_rate": 2.161591772860383e-07, + "loss": 0.6719, + "step": 14490 + }, + { + "epoch": 0.9470934030045721, + "grad_norm": 9.951317352040162, + "learning_rate": 2.109314657127781e-07, + "loss": 0.7256, + "step": 14500 + }, + { + "epoch": 0.9477465708687133, + "grad_norm": 3.795601588076029, + "learning_rate": 2.0576729765209468e-07, + "loss": 0.6505, + "step": 14510 + }, + { + "epoch": 0.9483997387328543, + "grad_norm": 3.0206069510686486, + "learning_rate": 2.0066669529262726e-07, + "loss": 0.7397, + "step": 14520 + }, + { + "epoch": 0.9490529065969955, + "grad_norm": 4.714697506081651, + "learning_rate": 1.9562968054990693e-07, + "loss": 0.6803, + "step": 14530 + }, + { + "epoch": 0.9497060744611365, + "grad_norm": 3.9588049089040362, + "learning_rate": 1.9065627506623663e-07, + "loss": 0.6761, + "step": 14540 + }, + { + "epoch": 0.9503592423252776, + "grad_norm": 2.6319925633183385, + "learning_rate": 1.8574650021062622e-07, + "loss": 0.6709, + "step": 14550 + }, + { + "epoch": 0.9510124101894187, + "grad_norm": 7.501251879162482, + "learning_rate": 1.8090037707867602e-07, + "loss": 0.697, + "step": 14560 + }, + { + "epoch": 0.9516655780535598, + "grad_norm": 11.39937787951777, + "learning_rate": 1.7611792649250168e-07, + "loss": 0.7192, + "step": 14570 + }, + { + "epoch": 0.9523187459177008, + "grad_norm": 26.26072372101657, + "learning_rate": 1.7139916900064111e-07, + "loss": 0.6729, + "step": 14580 + }, + { + "epoch": 0.952971913781842, + "grad_norm": 22.837025290547228, + "learning_rate": 1.6674412487796109e-07, + "loss": 0.6705, + "step": 14590 + }, + { + "epoch": 0.953625081645983, + "grad_norm": 17.314522635400518, + "learning_rate": 1.6215281412557737e-07, + "loss": 0.7002, + "step": 14600 + }, + { + "epoch": 0.9542782495101241, + "grad_norm": 5.138758914896164, + "learning_rate": 1.5762525647076308e-07, + "loss": 0.6723, + "step": 14610 + }, + { + "epoch": 0.9549314173742652, + "grad_norm": 11.749498072515893, + "learning_rate": 1.5316147136687053e-07, + "loss": 0.7342, + "step": 14620 + }, + { + "epoch": 0.9555845852384063, + "grad_norm": 9.208858239128203, + "learning_rate": 1.4876147799323613e-07, + "loss": 0.6792, + "step": 14630 + }, + { + "epoch": 0.9562377531025473, + "grad_norm": 2.8501151814525647, + "learning_rate": 1.4442529525511395e-07, + "loss": 0.6651, + "step": 14640 + }, + { + "epoch": 0.9568909209666885, + "grad_norm": 3.7411306397999913, + "learning_rate": 1.4015294178357895e-07, + "loss": 0.6937, + "step": 14650 + }, + { + "epoch": 0.9575440888308295, + "grad_norm": 17.34969017785557, + "learning_rate": 1.3594443593545724e-07, + "loss": 0.6594, + "step": 14660 + }, + { + "epoch": 0.9581972566949706, + "grad_norm": 5.181828903418344, + "learning_rate": 1.3179979579324265e-07, + "loss": 0.6961, + "step": 14670 + }, + { + "epoch": 0.9588504245591117, + "grad_norm": 3.4066746998174007, + "learning_rate": 1.2771903916502014e-07, + "loss": 0.6623, + "step": 14680 + }, + { + "epoch": 0.9595035924232528, + "grad_norm": 5.360315357154784, + "learning_rate": 1.23702183584391e-07, + "loss": 0.6627, + "step": 14690 + }, + { + "epoch": 0.9601567602873938, + "grad_norm": 3.624361179231387, + "learning_rate": 1.1974924631039108e-07, + "loss": 0.6754, + "step": 14700 + }, + { + "epoch": 0.960809928151535, + "grad_norm": 6.004317002153827, + "learning_rate": 1.1586024432742759e-07, + "loss": 0.6712, + "step": 14710 + }, + { + "epoch": 0.961463096015676, + "grad_norm": 3.1624037173846915, + "learning_rate": 1.1203519434519582e-07, + "loss": 0.6449, + "step": 14720 + }, + { + "epoch": 0.9621162638798171, + "grad_norm": 12.210848778068598, + "learning_rate": 1.082741127986142e-07, + "loss": 0.6341, + "step": 14730 + }, + { + "epoch": 0.9627694317439582, + "grad_norm": 4.19471051605192, + "learning_rate": 1.0457701584774936e-07, + "loss": 0.6853, + "step": 14740 + }, + { + "epoch": 0.9634225996080993, + "grad_norm": 7.093214107218928, + "learning_rate": 1.0094391937774617e-07, + "loss": 0.7483, + "step": 14750 + }, + { + "epoch": 0.9640757674722403, + "grad_norm": 45.34717692511988, + "learning_rate": 9.737483899876443e-08, + "loss": 0.682, + "step": 14760 + }, + { + "epoch": 0.9647289353363815, + "grad_norm": 3.034575064258203, + "learning_rate": 9.386979004590734e-08, + "loss": 0.7301, + "step": 14770 + }, + { + "epoch": 0.9653821032005225, + "grad_norm": 7.736430440263213, + "learning_rate": 9.042878757915984e-08, + "loss": 0.6526, + "step": 14780 + }, + { + "epoch": 0.9660352710646636, + "grad_norm": 6.196227728838487, + "learning_rate": 8.705184638331698e-08, + "loss": 0.6768, + "step": 14790 + }, + { + "epoch": 0.9666884389288047, + "grad_norm": 3.6317626748795067, + "learning_rate": 8.373898096793065e-08, + "loss": 0.6733, + "step": 14800 + }, + { + "epoch": 0.9673416067929458, + "grad_norm": 21.752350077330007, + "learning_rate": 8.049020556723464e-08, + "loss": 0.6687, + "step": 14810 + }, + { + "epoch": 0.9679947746570868, + "grad_norm": 2.414092721101793, + "learning_rate": 7.730553414009466e-08, + "loss": 0.6846, + "step": 14820 + }, + { + "epoch": 0.968647942521228, + "grad_norm": 6.9056862993753825, + "learning_rate": 7.418498036994182e-08, + "loss": 0.7105, + "step": 14830 + }, + { + "epoch": 0.969301110385369, + "grad_norm": 20.5521136085634, + "learning_rate": 7.112855766471749e-08, + "loss": 0.6715, + "step": 14840 + }, + { + "epoch": 0.9699542782495101, + "grad_norm": 6.202034573357447, + "learning_rate": 6.813627915681186e-08, + "loss": 0.6898, + "step": 14850 + }, + { + "epoch": 0.9706074461136512, + "grad_norm": 5.266374212104467, + "learning_rate": 6.520815770301058e-08, + "loss": 0.7049, + "step": 14860 + }, + { + "epoch": 0.9712606139777923, + "grad_norm": 8.003394360461222, + "learning_rate": 6.234420588443978e-08, + "loss": 0.6853, + "step": 14870 + }, + { + "epoch": 0.9719137818419333, + "grad_norm": 21.539384830513455, + "learning_rate": 5.954443600650783e-08, + "loss": 0.6738, + "step": 14880 + }, + { + "epoch": 0.9725669497060745, + "grad_norm": 13.756086077616871, + "learning_rate": 5.680886009886199e-08, + "loss": 0.7059, + "step": 14890 + }, + { + "epoch": 0.9732201175702155, + "grad_norm": 4.776557963789963, + "learning_rate": 5.413748991532019e-08, + "loss": 0.6703, + "step": 14900 + }, + { + "epoch": 0.9738732854343566, + "grad_norm": 5.150915542256727, + "learning_rate": 5.153033693384101e-08, + "loss": 0.6762, + "step": 14910 + }, + { + "epoch": 0.9745264532984977, + "grad_norm": 4.569623220258107, + "learning_rate": 4.898741235645543e-08, + "loss": 0.711, + "step": 14920 + }, + { + "epoch": 0.9751796211626388, + "grad_norm": 2.9152966869512986, + "learning_rate": 4.650872710923349e-08, + "loss": 0.6874, + "step": 14930 + }, + { + "epoch": 0.9758327890267798, + "grad_norm": 4.391462470174728, + "learning_rate": 4.4094291842227684e-08, + "loss": 0.663, + "step": 14940 + }, + { + "epoch": 0.976485956890921, + "grad_norm": 4.087919553170458, + "learning_rate": 4.174411692943136e-08, + "loss": 0.677, + "step": 14950 + }, + { + "epoch": 0.977139124755062, + "grad_norm": 2.1554312879370157, + "learning_rate": 3.945821246873205e-08, + "loss": 0.6425, + "step": 14960 + }, + { + "epoch": 0.9777922926192031, + "grad_norm": 9.7690536949704, + "learning_rate": 3.723658828187149e-08, + "loss": 0.7033, + "step": 14970 + }, + { + "epoch": 0.9784454604833442, + "grad_norm": 6.956938857033174, + "learning_rate": 3.50792539144007e-08, + "loss": 0.643, + "step": 14980 + }, + { + "epoch": 0.9790986283474853, + "grad_norm": 18.59785275114106, + "learning_rate": 3.298621863564e-08, + "loss": 0.6852, + "step": 14990 + }, + { + "epoch": 0.9797517962116263, + "grad_norm": 6.774665694768924, + "learning_rate": 3.095749143863735e-08, + "loss": 0.6704, + "step": 15000 + }, + { + "epoch": 0.9804049640757675, + "grad_norm": 6.242109276731404, + "learning_rate": 2.8993081040130098e-08, + "loss": 0.7305, + "step": 15010 + }, + { + "epoch": 0.9810581319399085, + "grad_norm": 6.9854383235254, + "learning_rate": 2.7092995880513283e-08, + "loss": 0.7011, + "step": 15020 + }, + { + "epoch": 0.9817112998040496, + "grad_norm": 2.284336666928396, + "learning_rate": 2.525724412379471e-08, + "loss": 0.7013, + "step": 15030 + }, + { + "epoch": 0.9823644676681907, + "grad_norm": 25.699388263709825, + "learning_rate": 2.3485833657563293e-08, + "loss": 0.6755, + "step": 15040 + }, + { + "epoch": 0.9830176355323318, + "grad_norm": 6.009184166094514, + "learning_rate": 2.1778772092959086e-08, + "loss": 0.7308, + "step": 15050 + }, + { + "epoch": 0.983670803396473, + "grad_norm": 11.053594248061634, + "learning_rate": 2.013606676463331e-08, + "loss": 0.7425, + "step": 15060 + }, + { + "epoch": 0.984323971260614, + "grad_norm": 3.607102772922988, + "learning_rate": 1.8557724730725035e-08, + "loss": 0.6991, + "step": 15070 + }, + { + "epoch": 0.984977139124755, + "grad_norm": 3.205612333365301, + "learning_rate": 1.7043752772822886e-08, + "loss": 0.6464, + "step": 15080 + }, + { + "epoch": 0.9856303069888962, + "grad_norm": 5.556643714300547, + "learning_rate": 1.5594157395940056e-08, + "loss": 0.7331, + "step": 15090 + }, + { + "epoch": 0.9862834748530372, + "grad_norm": 3.5464258119596814, + "learning_rate": 1.4208944828486003e-08, + "loss": 0.7062, + "step": 15100 + }, + { + "epoch": 0.9869366427171783, + "grad_norm": 3.722206624948899, + "learning_rate": 1.2888121022243126e-08, + "loss": 0.6512, + "step": 15110 + }, + { + "epoch": 0.9875898105813194, + "grad_norm": 2.2409677410944533, + "learning_rate": 1.16316916523318e-08, + "loss": 0.7349, + "step": 15120 + }, + { + "epoch": 0.9882429784454605, + "grad_norm": 3.878672305578926, + "learning_rate": 1.043966211719538e-08, + "loss": 0.7203, + "step": 15130 + }, + { + "epoch": 0.9888961463096015, + "grad_norm": 5.454242056298144, + "learning_rate": 9.312037538571905e-09, + "loss": 0.717, + "step": 15140 + }, + { + "epoch": 0.9895493141737427, + "grad_norm": 5.726823288208327, + "learning_rate": 8.24882276147576e-09, + "loss": 0.7076, + "step": 15150 + }, + { + "epoch": 0.9902024820378837, + "grad_norm": 14.755082628670916, + "learning_rate": 7.250022354171048e-09, + "loss": 0.6764, + "step": 15160 + }, + { + "epoch": 0.9908556499020248, + "grad_norm": 3.6312013396320113, + "learning_rate": 6.315640608158257e-09, + "loss": 0.669, + "step": 15170 + }, + { + "epoch": 0.991508817766166, + "grad_norm": 4.409239275794741, + "learning_rate": 5.445681538154279e-09, + "loss": 0.6649, + "step": 15180 + }, + { + "epoch": 0.992161985630307, + "grad_norm": 5.127595755856114, + "learning_rate": 4.640148882069095e-09, + "loss": 0.6749, + "step": 15190 + }, + { + "epoch": 0.992815153494448, + "grad_norm": 4.635445835436082, + "learning_rate": 3.899046101000781e-09, + "loss": 0.7167, + "step": 15200 + }, + { + "epoch": 0.9934683213585892, + "grad_norm": 7.473855033630282, + "learning_rate": 3.2223763792121884e-09, + "loss": 0.7007, + "step": 15210 + }, + { + "epoch": 0.9941214892227302, + "grad_norm": 2.7252881588946205, + "learning_rate": 2.610142624115963e-09, + "loss": 0.7108, + "step": 15220 + }, + { + "epoch": 0.9947746570868713, + "grad_norm": 13.861173872125017, + "learning_rate": 2.0623474662712085e-09, + "loss": 0.6722, + "step": 15230 + }, + { + "epoch": 0.9954278249510125, + "grad_norm": 6.165396310361432, + "learning_rate": 1.5789932593635037e-09, + "loss": 0.6937, + "step": 15240 + }, + { + "epoch": 0.9960809928151535, + "grad_norm": 2.3860816417389623, + "learning_rate": 1.1600820801982437e-09, + "loss": 0.6757, + "step": 15250 + }, + { + "epoch": 0.9967341606792945, + "grad_norm": 6.100593050284992, + "learning_rate": 8.056157286923104e-10, + "loss": 0.6631, + "step": 15260 + }, + { + "epoch": 0.9973873285434357, + "grad_norm": 21.92836751794244, + "learning_rate": 5.155957278657475e-10, + "loss": 0.7129, + "step": 15270 + }, + { + "epoch": 0.9980404964075767, + "grad_norm": 3.9263976263245834, + "learning_rate": 2.900233238334327e-10, + "loss": 0.6716, + "step": 15280 + }, + { + "epoch": 0.9986936642717178, + "grad_norm": 8.924314362505408, + "learning_rate": 1.2889948580174783e-10, + "loss": 0.7055, + "step": 15290 + }, + { + "epoch": 0.999346832135859, + "grad_norm": 15.640463539532494, + "learning_rate": 3.222490606524797e-11, + "loss": 0.7206, + "step": 15300 + }, + { + "epoch": 1.0, + "grad_norm": 3.167310766334676, + "learning_rate": 0.0, + "loss": 0.6538, + "step": 15310 + }, + { + "epoch": 1.0, + "step": 15310, + "total_flos": 4.185096573664887e+19, + "train_loss": 0.7165804752721108, + "train_runtime": 78866.3439, + "train_samples_per_second": 12.424, + "train_steps_per_second": 0.194 + } + ], + "logging_steps": 10, + "max_steps": 15310, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 400, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.185096573664887e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}