diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7523 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9999981318415465, + "eval_steps": 500, + "global_step": 535286, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0018681584534637058, + "grad_norm": 7.906698703765869, + "learning_rate": 9.999978643682902e-05, + "loss": 3.2962, + "step": 500 + }, + { + "epoch": 0.0037363169069274116, + "grad_norm": 8.97018051147461, + "learning_rate": 9.99991423149794e-05, + "loss": 3.1699, + "step": 1000 + }, + { + "epoch": 0.005604475360391117, + "grad_norm": 15.646541595458984, + "learning_rate": 9.999806763655335e-05, + "loss": 3.0952, + "step": 1500 + }, + { + "epoch": 0.007472633813854823, + "grad_norm": 7.570132732391357, + "learning_rate": 9.999656241080522e-05, + "loss": 3.0763, + "step": 2000 + }, + { + "epoch": 0.009340792267318529, + "grad_norm": 5.853657245635986, + "learning_rate": 9.999462665069693e-05, + "loss": 3.0355, + "step": 2500 + }, + { + "epoch": 0.011208950720782235, + "grad_norm": 6.587205410003662, + "learning_rate": 9.999226553509718e-05, + "loss": 3.0291, + "step": 3000 + }, + { + "epoch": 0.01307710917424594, + "grad_norm": 6.537953853607178, + "learning_rate": 9.998946962095583e-05, + "loss": 2.9902, + "step": 3500 + }, + { + "epoch": 0.014945267627709646, + "grad_norm": 5.199572563171387, + "learning_rate": 9.998624323353232e-05, + "loss": 2.9598, + "step": 4000 + }, + { + "epoch": 0.016813426081173352, + "grad_norm": 6.676388263702393, + "learning_rate": 9.998258640060996e-05, + "loss": 2.9677, + "step": 4500 + }, + { + "epoch": 0.018681584534637058, + "grad_norm": 5.827391147613525, + "learning_rate": 9.997849915367876e-05, + "loss": 2.9525, + "step": 5000 + }, + { + "epoch": 0.020549742988100764, + "grad_norm": 7.3195953369140625, + "learning_rate": 9.997398152793517e-05, + "loss": 2.8857, + "step": 5500 + }, + { + "epoch": 0.02241790144156447, + "grad_norm": 5.648918151855469, + "learning_rate": 9.99690335622817e-05, + "loss": 2.8672, + "step": 6000 + }, + { + "epoch": 0.024286059895028175, + "grad_norm": 5.5112528800964355, + "learning_rate": 9.996366648525912e-05, + "loss": 2.9211, + "step": 6500 + }, + { + "epoch": 0.02615421834849188, + "grad_norm": 5.33510684967041, + "learning_rate": 9.995785883176955e-05, + "loss": 2.8746, + "step": 7000 + }, + { + "epoch": 0.028022376801955587, + "grad_norm": 4.034855365753174, + "learning_rate": 9.995162097720716e-05, + "loss": 2.8976, + "step": 7500 + }, + { + "epoch": 0.029890535255419293, + "grad_norm": 6.461133003234863, + "learning_rate": 9.994495297528784e-05, + "loss": 2.8834, + "step": 8000 + }, + { + "epoch": 0.031758693708883, + "grad_norm": 5.551321029663086, + "learning_rate": 9.993785488343162e-05, + "loss": 2.7976, + "step": 8500 + }, + { + "epoch": 0.033626852162346704, + "grad_norm": 4.432355880737305, + "learning_rate": 9.993032676276217e-05, + "loss": 2.8252, + "step": 9000 + }, + { + "epoch": 0.03549501061581041, + "grad_norm": 4.1577839851379395, + "learning_rate": 9.99223686781062e-05, + "loss": 2.8277, + "step": 9500 + }, + { + "epoch": 0.037363169069274116, + "grad_norm": 4.840776443481445, + "learning_rate": 9.991398069799303e-05, + "loss": 2.8151, + "step": 10000 + }, + { + "epoch": 0.03923132752273782, + "grad_norm": 5.0539093017578125, + "learning_rate": 9.99051628946539e-05, + "loss": 2.8342, + "step": 10500 + }, + { + "epoch": 0.04109948597620153, + "grad_norm": 4.669162750244141, + "learning_rate": 9.989593426795811e-05, + "loss": 2.8473, + "step": 11000 + }, + { + "epoch": 0.04296764442966523, + "grad_norm": 6.490626811981201, + "learning_rate": 9.98862579089188e-05, + "loss": 2.7918, + "step": 11500 + }, + { + "epoch": 0.04483580288312894, + "grad_norm": 5.523123741149902, + "learning_rate": 9.98761519653822e-05, + "loss": 2.8058, + "step": 12000 + }, + { + "epoch": 0.046703961336592645, + "grad_norm": 4.8026227951049805, + "learning_rate": 9.98656165243734e-05, + "loss": 2.76, + "step": 12500 + }, + { + "epoch": 0.04857211979005635, + "grad_norm": 4.3018083572387695, + "learning_rate": 9.985467403479736e-05, + "loss": 2.7533, + "step": 13000 + }, + { + "epoch": 0.050440278243520056, + "grad_norm": 3.7984395027160645, + "learning_rate": 9.984330394823319e-05, + "loss": 2.7928, + "step": 13500 + }, + { + "epoch": 0.05230843669698376, + "grad_norm": 3.971073627471924, + "learning_rate": 9.983148229059621e-05, + "loss": 2.7542, + "step": 14000 + }, + { + "epoch": 0.05417659515044747, + "grad_norm": 4.415925979614258, + "learning_rate": 9.98192315201501e-05, + "loss": 2.7767, + "step": 14500 + }, + { + "epoch": 0.056044753603911174, + "grad_norm": 4.695183277130127, + "learning_rate": 9.980655174238964e-05, + "loss": 2.7724, + "step": 15000 + }, + { + "epoch": 0.05791291205737488, + "grad_norm": 5.4851484298706055, + "learning_rate": 9.979344306650395e-05, + "loss": 2.7768, + "step": 15500 + }, + { + "epoch": 0.059781070510838585, + "grad_norm": 4.120709419250488, + "learning_rate": 9.977990560537549e-05, + "loss": 2.7775, + "step": 16000 + }, + { + "epoch": 0.06164922896430229, + "grad_norm": 3.63053560256958, + "learning_rate": 9.976593947557912e-05, + "loss": 2.7329, + "step": 16500 + }, + { + "epoch": 0.063517387417766, + "grad_norm": 4.178781509399414, + "learning_rate": 9.97515447973811e-05, + "loss": 2.7428, + "step": 17000 + }, + { + "epoch": 0.0653855458712297, + "grad_norm": 3.8429136276245117, + "learning_rate": 9.973675176842667e-05, + "loss": 2.7136, + "step": 17500 + }, + { + "epoch": 0.06725370432469341, + "grad_norm": 3.6935720443725586, + "learning_rate": 9.972150122544814e-05, + "loss": 2.6918, + "step": 18000 + }, + { + "epoch": 0.06912186277815711, + "grad_norm": 4.678779125213623, + "learning_rate": 9.970582251673812e-05, + "loss": 2.686, + "step": 18500 + }, + { + "epoch": 0.07099002123162082, + "grad_norm": 5.219886779785156, + "learning_rate": 9.968971577731036e-05, + "loss": 2.7664, + "step": 19000 + }, + { + "epoch": 0.07285817968508453, + "grad_norm": 3.985466241836548, + "learning_rate": 9.967318114586451e-05, + "loss": 2.7409, + "step": 19500 + }, + { + "epoch": 0.07472633813854823, + "grad_norm": 5.018237590789795, + "learning_rate": 9.965621876478483e-05, + "loss": 2.7278, + "step": 20000 + }, + { + "epoch": 0.07659449659201194, + "grad_norm": 4.305635452270508, + "learning_rate": 9.963882878013921e-05, + "loss": 2.7453, + "step": 20500 + }, + { + "epoch": 0.07846265504547564, + "grad_norm": 3.6431195735931396, + "learning_rate": 9.962101134167761e-05, + "loss": 2.6693, + "step": 21000 + }, + { + "epoch": 0.08033081349893935, + "grad_norm": 3.750077962875366, + "learning_rate": 9.960280351865064e-05, + "loss": 2.7108, + "step": 21500 + }, + { + "epoch": 0.08219897195240305, + "grad_norm": 3.730613946914673, + "learning_rate": 9.95841324906568e-05, + "loss": 2.6607, + "step": 22000 + }, + { + "epoch": 0.08406713040586676, + "grad_norm": 4.009971618652344, + "learning_rate": 9.956503447985205e-05, + "loss": 2.7232, + "step": 22500 + }, + { + "epoch": 0.08593528885933047, + "grad_norm": 3.1298115253448486, + "learning_rate": 9.954550965069465e-05, + "loss": 2.6655, + "step": 23000 + }, + { + "epoch": 0.08780344731279417, + "grad_norm": 3.9897303581237793, + "learning_rate": 9.952555817131835e-05, + "loss": 2.6755, + "step": 23500 + }, + { + "epoch": 0.08967160576625788, + "grad_norm": 5.565286636352539, + "learning_rate": 9.950522139495593e-05, + "loss": 2.6854, + "step": 24000 + }, + { + "epoch": 0.09153976421972158, + "grad_norm": 3.5274269580841064, + "learning_rate": 9.948441798666596e-05, + "loss": 2.6821, + "step": 24500 + }, + { + "epoch": 0.09340792267318529, + "grad_norm": 4.026999473571777, + "learning_rate": 9.946323133845033e-05, + "loss": 2.6389, + "step": 25000 + }, + { + "epoch": 0.095276081126649, + "grad_norm": 4.1627326011657715, + "learning_rate": 9.944157671638854e-05, + "loss": 2.6786, + "step": 25500 + }, + { + "epoch": 0.0971442395801127, + "grad_norm": 3.341585159301758, + "learning_rate": 9.94194963391034e-05, + "loss": 2.6419, + "step": 26000 + }, + { + "epoch": 0.09901239803357641, + "grad_norm": 3.5735983848571777, + "learning_rate": 9.939699039673516e-05, + "loss": 2.652, + "step": 26500 + }, + { + "epoch": 0.10088055648704011, + "grad_norm": 3.736764669418335, + "learning_rate": 9.937405908308882e-05, + "loss": 2.701, + "step": 27000 + }, + { + "epoch": 0.10274871494050382, + "grad_norm": 3.172218084335327, + "learning_rate": 9.935070259563231e-05, + "loss": 2.6086, + "step": 27500 + }, + { + "epoch": 0.10461687339396752, + "grad_norm": 3.945516347885132, + "learning_rate": 9.932692113549484e-05, + "loss": 2.6714, + "step": 28000 + }, + { + "epoch": 0.10648503184743123, + "grad_norm": 2.7730209827423096, + "learning_rate": 9.930271490746525e-05, + "loss": 2.6346, + "step": 28500 + }, + { + "epoch": 0.10835319030089494, + "grad_norm": 3.7872776985168457, + "learning_rate": 9.92780841199901e-05, + "loss": 2.6376, + "step": 29000 + }, + { + "epoch": 0.11022134875435864, + "grad_norm": 3.9411559104919434, + "learning_rate": 9.925302898517198e-05, + "loss": 2.6674, + "step": 29500 + }, + { + "epoch": 0.11208950720782235, + "grad_norm": 4.368437767028809, + "learning_rate": 9.922760110043857e-05, + "loss": 2.6232, + "step": 30000 + }, + { + "epoch": 0.11395766566128605, + "grad_norm": 4.385318279266357, + "learning_rate": 9.920169876946009e-05, + "loss": 2.595, + "step": 30500 + }, + { + "epoch": 0.11582582411474976, + "grad_norm": 3.4636647701263428, + "learning_rate": 9.917537274891421e-05, + "loss": 2.6073, + "step": 31000 + }, + { + "epoch": 0.11769398256821346, + "grad_norm": 2.474412202835083, + "learning_rate": 9.914862326550168e-05, + "loss": 2.655, + "step": 31500 + }, + { + "epoch": 0.11956214102167717, + "grad_norm": 3.5162529945373535, + "learning_rate": 9.912145054956974e-05, + "loss": 2.6259, + "step": 32000 + }, + { + "epoch": 0.12143029947514088, + "grad_norm": 3.149369716644287, + "learning_rate": 9.909385483511026e-05, + "loss": 2.6045, + "step": 32500 + }, + { + "epoch": 0.12329845792860458, + "grad_norm": 3.873689651489258, + "learning_rate": 9.906583635975763e-05, + "loss": 2.6476, + "step": 33000 + }, + { + "epoch": 0.1251666163820683, + "grad_norm": 4.371992588043213, + "learning_rate": 9.90374526682891e-05, + "loss": 2.6149, + "step": 33500 + }, + { + "epoch": 0.127034774835532, + "grad_norm": 4.554148197174072, + "learning_rate": 9.900859024291592e-05, + "loss": 2.6146, + "step": 34000 + }, + { + "epoch": 0.1289029332889957, + "grad_norm": 4.277965545654297, + "learning_rate": 9.897930579088681e-05, + "loss": 2.5902, + "step": 34500 + }, + { + "epoch": 0.1307710917424594, + "grad_norm": 4.317843914031982, + "learning_rate": 9.894959956437835e-05, + "loss": 2.6276, + "step": 35000 + }, + { + "epoch": 0.13263925019592313, + "grad_norm": 3.6088337898254395, + "learning_rate": 9.891953249519332e-05, + "loss": 2.5647, + "step": 35500 + }, + { + "epoch": 0.13450740864938682, + "grad_norm": 2.6994011402130127, + "learning_rate": 9.888898433303897e-05, + "loss": 2.6306, + "step": 36000 + }, + { + "epoch": 0.13637556710285054, + "grad_norm": 3.670053005218506, + "learning_rate": 9.885801517418857e-05, + "loss": 2.6103, + "step": 36500 + }, + { + "epoch": 0.13824372555631423, + "grad_norm": 3.3493151664733887, + "learning_rate": 9.882662528532621e-05, + "loss": 2.5293, + "step": 37000 + }, + { + "epoch": 0.14011188400977795, + "grad_norm": 4.308838844299316, + "learning_rate": 9.879481493675895e-05, + "loss": 2.5701, + "step": 37500 + }, + { + "epoch": 0.14198004246324164, + "grad_norm": 3.550856828689575, + "learning_rate": 9.876258440241463e-05, + "loss": 2.5949, + "step": 38000 + }, + { + "epoch": 0.14384820091670536, + "grad_norm": 3.9775218963623047, + "learning_rate": 9.872999967960666e-05, + "loss": 2.5844, + "step": 38500 + }, + { + "epoch": 0.14571635937016905, + "grad_norm": 3.936997413635254, + "learning_rate": 9.869693044893364e-05, + "loss": 2.5558, + "step": 39000 + }, + { + "epoch": 0.14758451782363277, + "grad_norm": 4.209615707397461, + "learning_rate": 9.866344187539423e-05, + "loss": 2.5605, + "step": 39500 + }, + { + "epoch": 0.14945267627709646, + "grad_norm": 4.603176116943359, + "learning_rate": 9.862960248064681e-05, + "loss": 2.6045, + "step": 40000 + }, + { + "epoch": 0.15132083473056018, + "grad_norm": 3.0863678455352783, + "learning_rate": 9.859527692735271e-05, + "loss": 2.5638, + "step": 40500 + }, + { + "epoch": 0.15318899318402387, + "grad_norm": 3.8357596397399902, + "learning_rate": 9.856053290655904e-05, + "loss": 2.5569, + "step": 41000 + }, + { + "epoch": 0.1550571516374876, + "grad_norm": 3.3822269439697266, + "learning_rate": 9.85253707174563e-05, + "loss": 2.5459, + "step": 41500 + }, + { + "epoch": 0.1569253100909513, + "grad_norm": 4.058901309967041, + "learning_rate": 9.848979066283589e-05, + "loss": 2.6128, + "step": 42000 + }, + { + "epoch": 0.158793468544415, + "grad_norm": 4.78932523727417, + "learning_rate": 9.84537930490876e-05, + "loss": 2.5862, + "step": 42500 + }, + { + "epoch": 0.1606616269978787, + "grad_norm": 3.3654229640960693, + "learning_rate": 9.841737818619692e-05, + "loss": 2.5509, + "step": 43000 + }, + { + "epoch": 0.16252978545134242, + "grad_norm": 3.9686570167541504, + "learning_rate": 9.838054638774244e-05, + "loss": 2.5089, + "step": 43500 + }, + { + "epoch": 0.1643979439048061, + "grad_norm": 2.973649740219116, + "learning_rate": 9.834329797089303e-05, + "loss": 2.5321, + "step": 44000 + }, + { + "epoch": 0.16626610235826983, + "grad_norm": 2.5326201915740967, + "learning_rate": 9.83056332564052e-05, + "loss": 2.5408, + "step": 44500 + }, + { + "epoch": 0.16813426081173352, + "grad_norm": 3.884883165359497, + "learning_rate": 9.826762914491992e-05, + "loss": 2.5352, + "step": 45000 + }, + { + "epoch": 0.17000241926519724, + "grad_norm": 3.9567508697509766, + "learning_rate": 9.822913364272259e-05, + "loss": 2.5619, + "step": 45500 + }, + { + "epoch": 0.17187057771866093, + "grad_norm": 3.041057825088501, + "learning_rate": 9.819022282598776e-05, + "loss": 2.555, + "step": 46000 + }, + { + "epoch": 0.17373873617212465, + "grad_norm": 3.1877288818359375, + "learning_rate": 9.815089702978735e-05, + "loss": 2.5458, + "step": 46500 + }, + { + "epoch": 0.17560689462558834, + "grad_norm": 3.142703056335449, + "learning_rate": 9.811115659276677e-05, + "loss": 2.5607, + "step": 47000 + }, + { + "epoch": 0.17747505307905206, + "grad_norm": 3.609555959701538, + "learning_rate": 9.807100185714202e-05, + "loss": 2.5683, + "step": 47500 + }, + { + "epoch": 0.17934321153251576, + "grad_norm": 3.200345277786255, + "learning_rate": 9.803051471896693e-05, + "loss": 2.5496, + "step": 48000 + }, + { + "epoch": 0.18121136998597948, + "grad_norm": 3.56850266456604, + "learning_rate": 9.798953325390536e-05, + "loss": 2.5425, + "step": 48500 + }, + { + "epoch": 0.18307952843944317, + "grad_norm": 3.4314849376678467, + "learning_rate": 9.794813853757214e-05, + "loss": 2.5238, + "step": 49000 + }, + { + "epoch": 0.1849476868929069, + "grad_norm": 3.024343967437744, + "learning_rate": 9.790633092642875e-05, + "loss": 2.5786, + "step": 49500 + }, + { + "epoch": 0.18681584534637058, + "grad_norm": 3.2595534324645996, + "learning_rate": 9.786419563225273e-05, + "loss": 2.5386, + "step": 50000 + }, + { + "epoch": 0.1886840037998343, + "grad_norm": 3.6985089778900146, + "learning_rate": 9.782156413906974e-05, + "loss": 2.5338, + "step": 50500 + }, + { + "epoch": 0.190552162253298, + "grad_norm": 2.9342880249023438, + "learning_rate": 9.777852084104404e-05, + "loss": 2.4992, + "step": 51000 + }, + { + "epoch": 0.1924203207067617, + "grad_norm": 2.8690543174743652, + "learning_rate": 9.773506610883352e-05, + "loss": 2.571, + "step": 51500 + }, + { + "epoch": 0.1942884791602254, + "grad_norm": 2.8353734016418457, + "learning_rate": 9.769120031663902e-05, + "loss": 2.4895, + "step": 52000 + }, + { + "epoch": 0.19615663761368912, + "grad_norm": 3.6773738861083984, + "learning_rate": 9.764692384220111e-05, + "loss": 2.5121, + "step": 52500 + }, + { + "epoch": 0.19802479606715281, + "grad_norm": 3.3569443225860596, + "learning_rate": 9.760223706679688e-05, + "loss": 2.527, + "step": 53000 + }, + { + "epoch": 0.19989295452061653, + "grad_norm": 2.970712184906006, + "learning_rate": 9.755714037523662e-05, + "loss": 2.5337, + "step": 53500 + }, + { + "epoch": 0.20176111297408023, + "grad_norm": 3.2004318237304688, + "learning_rate": 9.751172557674817e-05, + "loss": 2.5342, + "step": 54000 + }, + { + "epoch": 0.20362927142754395, + "grad_norm": 3.16782546043396, + "learning_rate": 9.746581103930153e-05, + "loss": 2.524, + "step": 54500 + }, + { + "epoch": 0.20549742988100764, + "grad_norm": 3.3260490894317627, + "learning_rate": 9.741948776050147e-05, + "loss": 2.4701, + "step": 55000 + }, + { + "epoch": 0.20736558833447136, + "grad_norm": 3.6631577014923096, + "learning_rate": 9.737275613925072e-05, + "loss": 2.5314, + "step": 55500 + }, + { + "epoch": 0.20923374678793505, + "grad_norm": 2.5733258724212646, + "learning_rate": 9.732561657796828e-05, + "loss": 2.5362, + "step": 56000 + }, + { + "epoch": 0.21110190524139877, + "grad_norm": 3.8227956295013428, + "learning_rate": 9.727816498322433e-05, + "loss": 2.4807, + "step": 56500 + }, + { + "epoch": 0.21297006369486246, + "grad_norm": 3.5182738304138184, + "learning_rate": 9.723021157702207e-05, + "loss": 2.5263, + "step": 57000 + }, + { + "epoch": 0.21483822214832618, + "grad_norm": 3.405224084854126, + "learning_rate": 9.71818514582792e-05, + "loss": 2.5105, + "step": 57500 + }, + { + "epoch": 0.21670638060178987, + "grad_norm": 2.988802671432495, + "learning_rate": 9.713308504343815e-05, + "loss": 2.5297, + "step": 58000 + }, + { + "epoch": 0.2185745390552536, + "grad_norm": 2.3862366676330566, + "learning_rate": 9.708391275244016e-05, + "loss": 2.5006, + "step": 58500 + }, + { + "epoch": 0.22044269750871728, + "grad_norm": 3.3643691539764404, + "learning_rate": 9.703433500872156e-05, + "loss": 2.5255, + "step": 59000 + }, + { + "epoch": 0.222310855962181, + "grad_norm": 3.6664035320281982, + "learning_rate": 9.698435223921016e-05, + "loss": 2.4421, + "step": 59500 + }, + { + "epoch": 0.2241790144156447, + "grad_norm": 3.3508718013763428, + "learning_rate": 9.693396487432153e-05, + "loss": 2.4893, + "step": 60000 + }, + { + "epoch": 0.22604717286910841, + "grad_norm": 3.5202372074127197, + "learning_rate": 9.688337731857194e-05, + "loss": 2.505, + "step": 60500 + }, + { + "epoch": 0.2279153313225721, + "grad_norm": 4.265177249908447, + "learning_rate": 9.683218368212872e-05, + "loss": 2.5134, + "step": 61000 + }, + { + "epoch": 0.22978348977603583, + "grad_norm": 3.761479377746582, + "learning_rate": 9.67805867606742e-05, + "loss": 2.477, + "step": 61500 + }, + { + "epoch": 0.23165164822949952, + "grad_norm": 3.254711866378784, + "learning_rate": 9.67285869985239e-05, + "loss": 2.4894, + "step": 62000 + }, + { + "epoch": 0.23351980668296324, + "grad_norm": 3.4447569847106934, + "learning_rate": 9.667629004906115e-05, + "loss": 2.5338, + "step": 62500 + }, + { + "epoch": 0.23538796513642693, + "grad_norm": 3.283677577972412, + "learning_rate": 9.662348675576849e-05, + "loss": 2.5028, + "step": 63000 + }, + { + "epoch": 0.23725612358989065, + "grad_norm": 3.641008138656616, + "learning_rate": 9.657028197461201e-05, + "loss": 2.5102, + "step": 63500 + }, + { + "epoch": 0.23912428204335434, + "grad_norm": 2.3239517211914062, + "learning_rate": 9.651667616375301e-05, + "loss": 2.4692, + "step": 64000 + }, + { + "epoch": 0.24099244049681806, + "grad_norm": 2.590287446975708, + "learning_rate": 9.646266978480605e-05, + "loss": 2.4753, + "step": 64500 + }, + { + "epoch": 0.24286059895028175, + "grad_norm": 3.5106756687164307, + "learning_rate": 9.640826330283514e-05, + "loss": 2.4541, + "step": 65000 + }, + { + "epoch": 0.24472875740374547, + "grad_norm": 2.9911463260650635, + "learning_rate": 9.635345718634972e-05, + "loss": 2.5228, + "step": 65500 + }, + { + "epoch": 0.24659691585720916, + "grad_norm": 3.7811479568481445, + "learning_rate": 9.629825190730053e-05, + "loss": 2.468, + "step": 66000 + }, + { + "epoch": 0.24846507431067288, + "grad_norm": 3.073608875274658, + "learning_rate": 9.624275954658023e-05, + "loss": 2.5416, + "step": 66500 + }, + { + "epoch": 0.2503332327641366, + "grad_norm": 2.943208932876587, + "learning_rate": 9.618675816793752e-05, + "loss": 2.4685, + "step": 67000 + }, + { + "epoch": 0.25220139121760027, + "grad_norm": 2.2683610916137695, + "learning_rate": 9.613047225704368e-05, + "loss": 2.4953, + "step": 67500 + }, + { + "epoch": 0.254069549671064, + "grad_norm": 3.0341203212738037, + "learning_rate": 9.607367670392133e-05, + "loss": 2.4601, + "step": 68000 + }, + { + "epoch": 0.2559377081245277, + "grad_norm": 3.2594239711761475, + "learning_rate": 9.60164843975031e-05, + "loss": 2.4339, + "step": 68500 + }, + { + "epoch": 0.2578058665779914, + "grad_norm": 3.045818328857422, + "learning_rate": 9.595889583028791e-05, + "loss": 2.4237, + "step": 69000 + }, + { + "epoch": 0.2596740250314551, + "grad_norm": 3.0980165004730225, + "learning_rate": 9.590091149818697e-05, + "loss": 2.5111, + "step": 69500 + }, + { + "epoch": 0.2615421834849188, + "grad_norm": 2.206389904022217, + "learning_rate": 9.584253190051957e-05, + "loss": 2.4885, + "step": 70000 + }, + { + "epoch": 0.26341034193838253, + "grad_norm": 3.909090518951416, + "learning_rate": 9.578387548236723e-05, + "loss": 2.4945, + "step": 70500 + }, + { + "epoch": 0.26527850039184625, + "grad_norm": 3.3355019092559814, + "learning_rate": 9.572470765314143e-05, + "loss": 2.4225, + "step": 71000 + }, + { + "epoch": 0.2671466588453099, + "grad_norm": 2.9104554653167725, + "learning_rate": 9.56651460756897e-05, + "loss": 2.4666, + "step": 71500 + }, + { + "epoch": 0.26901481729877363, + "grad_norm": 2.195571184158325, + "learning_rate": 9.560519126291337e-05, + "loss": 2.4738, + "step": 72000 + }, + { + "epoch": 0.27088297575223735, + "grad_norm": 2.8600668907165527, + "learning_rate": 9.554484373110011e-05, + "loss": 2.3982, + "step": 72500 + }, + { + "epoch": 0.2727511342057011, + "grad_norm": 2.985612630844116, + "learning_rate": 9.54842258704496e-05, + "loss": 2.4708, + "step": 73000 + }, + { + "epoch": 0.27461929265916474, + "grad_norm": 2.609339475631714, + "learning_rate": 9.542309524577655e-05, + "loss": 2.4385, + "step": 73500 + }, + { + "epoch": 0.27648745111262846, + "grad_norm": 2.9328203201293945, + "learning_rate": 9.536157347014623e-05, + "loss": 2.3942, + "step": 74000 + }, + { + "epoch": 0.2783556095660922, + "grad_norm": 3.242722511291504, + "learning_rate": 9.529966107333978e-05, + "loss": 2.4568, + "step": 74500 + }, + { + "epoch": 0.2802237680195559, + "grad_norm": 2.90252423286438, + "learning_rate": 9.523735858850218e-05, + "loss": 2.4495, + "step": 75000 + }, + { + "epoch": 0.2820919264730196, + "grad_norm": 2.491132974624634, + "learning_rate": 9.517466655213752e-05, + "loss": 2.4401, + "step": 75500 + }, + { + "epoch": 0.2839600849264833, + "grad_norm": 2.714989185333252, + "learning_rate": 9.511171205407364e-05, + "loss": 2.4607, + "step": 76000 + }, + { + "epoch": 0.285828243379947, + "grad_norm": 3.1541576385498047, + "learning_rate": 9.50482433139732e-05, + "loss": 2.4522, + "step": 76500 + }, + { + "epoch": 0.2876964018334107, + "grad_norm": 3.280564546585083, + "learning_rate": 9.498438665087013e-05, + "loss": 2.4696, + "step": 77000 + }, + { + "epoch": 0.28956456028687444, + "grad_norm": 3.0421793460845947, + "learning_rate": 9.492014261465201e-05, + "loss": 2.482, + "step": 77500 + }, + { + "epoch": 0.2914327187403381, + "grad_norm": 2.658756971359253, + "learning_rate": 9.485551175854214e-05, + "loss": 2.4464, + "step": 78000 + }, + { + "epoch": 0.2933008771938018, + "grad_norm": 4.537105083465576, + "learning_rate": 9.479049463909488e-05, + "loss": 2.444, + "step": 78500 + }, + { + "epoch": 0.29516903564726554, + "grad_norm": 2.9097115993499756, + "learning_rate": 9.472509181619083e-05, + "loss": 2.4631, + "step": 79000 + }, + { + "epoch": 0.29703719410072926, + "grad_norm": 2.133843421936035, + "learning_rate": 9.465943581295223e-05, + "loss": 2.4159, + "step": 79500 + }, + { + "epoch": 0.2989053525541929, + "grad_norm": 2.5699055194854736, + "learning_rate": 9.459326404463687e-05, + "loss": 2.4392, + "step": 80000 + }, + { + "epoch": 0.30077351100765665, + "grad_norm": 2.927656412124634, + "learning_rate": 9.452684176567582e-05, + "loss": 2.4121, + "step": 80500 + }, + { + "epoch": 0.30264166946112037, + "grad_norm": 3.3542892932891846, + "learning_rate": 9.44599033266823e-05, + "loss": 2.4138, + "step": 81000 + }, + { + "epoch": 0.3045098279145841, + "grad_norm": 2.9518256187438965, + "learning_rate": 9.439258203104611e-05, + "loss": 2.4193, + "step": 81500 + }, + { + "epoch": 0.30637798636804775, + "grad_norm": 2.9476184844970703, + "learning_rate": 9.432487845848965e-05, + "loss": 2.3944, + "step": 82000 + }, + { + "epoch": 0.30824614482151147, + "grad_norm": 2.688512086868286, + "learning_rate": 9.425679319202733e-05, + "loss": 2.4331, + "step": 82500 + }, + { + "epoch": 0.3101143032749752, + "grad_norm": 2.971700429916382, + "learning_rate": 9.418832681796042e-05, + "loss": 2.4513, + "step": 83000 + }, + { + "epoch": 0.3119824617284389, + "grad_norm": 2.495612382888794, + "learning_rate": 9.411947992587194e-05, + "loss": 2.3972, + "step": 83500 + }, + { + "epoch": 0.3138506201819026, + "grad_norm": 3.071038246154785, + "learning_rate": 9.405025310862172e-05, + "loss": 2.4309, + "step": 84000 + }, + { + "epoch": 0.3157187786353663, + "grad_norm": 3.627650260925293, + "learning_rate": 9.398064696234121e-05, + "loss": 2.4297, + "step": 84500 + }, + { + "epoch": 0.31758693708883, + "grad_norm": 2.077777147293091, + "learning_rate": 9.391066208642838e-05, + "loss": 2.4245, + "step": 85000 + }, + { + "epoch": 0.31945509554229373, + "grad_norm": 3.0603654384613037, + "learning_rate": 9.384044018651683e-05, + "loss": 2.4145, + "step": 85500 + }, + { + "epoch": 0.3213232539957574, + "grad_norm": 2.993283271789551, + "learning_rate": 9.37697004170087e-05, + "loss": 2.4095, + "step": 86000 + }, + { + "epoch": 0.3231914124492211, + "grad_norm": 2.8521878719329834, + "learning_rate": 9.369858373438785e-05, + "loss": 2.3967, + "step": 86500 + }, + { + "epoch": 0.32505957090268484, + "grad_norm": 3.297847032546997, + "learning_rate": 9.362709075105988e-05, + "loss": 2.4343, + "step": 87000 + }, + { + "epoch": 0.32692772935614856, + "grad_norm": 2.3240292072296143, + "learning_rate": 9.355522208267086e-05, + "loss": 2.3947, + "step": 87500 + }, + { + "epoch": 0.3287958878096122, + "grad_norm": 3.8041253089904785, + "learning_rate": 9.348297834810195e-05, + "loss": 2.4111, + "step": 88000 + }, + { + "epoch": 0.33066404626307594, + "grad_norm": 2.6961183547973633, + "learning_rate": 9.341036016946413e-05, + "loss": 2.4159, + "step": 88500 + }, + { + "epoch": 0.33253220471653966, + "grad_norm": 3.0299246311187744, + "learning_rate": 9.33373681720928e-05, + "loss": 2.4012, + "step": 89000 + }, + { + "epoch": 0.3344003631700034, + "grad_norm": 2.75026273727417, + "learning_rate": 9.326415008694199e-05, + "loss": 2.3755, + "step": 89500 + }, + { + "epoch": 0.33626852162346704, + "grad_norm": 2.4696195125579834, + "learning_rate": 9.319056093086089e-05, + "loss": 2.3953, + "step": 90000 + }, + { + "epoch": 0.33813668007693076, + "grad_norm": 2.428610324859619, + "learning_rate": 9.311645274788967e-05, + "loss": 2.4433, + "step": 90500 + }, + { + "epoch": 0.3400048385303945, + "grad_norm": 2.851217269897461, + "learning_rate": 9.304197327710381e-05, + "loss": 2.429, + "step": 91000 + }, + { + "epoch": 0.3418729969838582, + "grad_norm": 3.0488922595977783, + "learning_rate": 9.296712315986686e-05, + "loss": 2.417, + "step": 91500 + }, + { + "epoch": 0.34374115543732187, + "grad_norm": 2.7306880950927734, + "learning_rate": 9.289190304073406e-05, + "loss": 2.4539, + "step": 92000 + }, + { + "epoch": 0.3456093138907856, + "grad_norm": 3.2483866214752197, + "learning_rate": 9.281631356744687e-05, + "loss": 2.3616, + "step": 92500 + }, + { + "epoch": 0.3474774723442493, + "grad_norm": 2.66874098777771, + "learning_rate": 9.274035539092736e-05, + "loss": 2.3984, + "step": 93000 + }, + { + "epoch": 0.349345630797713, + "grad_norm": 2.5911643505096436, + "learning_rate": 9.266402916527259e-05, + "loss": 2.4403, + "step": 93500 + }, + { + "epoch": 0.3512137892511767, + "grad_norm": 3.084787607192993, + "learning_rate": 9.258748930120269e-05, + "loss": 2.3685, + "step": 94000 + }, + { + "epoch": 0.3530819477046404, + "grad_norm": 3.077162742614746, + "learning_rate": 9.251042968504211e-05, + "loss": 2.4033, + "step": 94500 + }, + { + "epoch": 0.35495010615810413, + "grad_norm": 2.7327165603637695, + "learning_rate": 9.243300399970075e-05, + "loss": 2.357, + "step": 95000 + }, + { + "epoch": 0.35681826461156785, + "grad_norm": 2.942444324493408, + "learning_rate": 9.235521291191276e-05, + "loss": 2.4114, + "step": 95500 + }, + { + "epoch": 0.3586864230650315, + "grad_norm": 2.504429817199707, + "learning_rate": 9.227705709155896e-05, + "loss": 2.3763, + "step": 96000 + }, + { + "epoch": 0.36055458151849523, + "grad_norm": 3.322981119155884, + "learning_rate": 9.219853721166094e-05, + "loss": 2.4037, + "step": 96500 + }, + { + "epoch": 0.36242273997195895, + "grad_norm": 2.8509936332702637, + "learning_rate": 9.21196539483753e-05, + "loss": 2.4089, + "step": 97000 + }, + { + "epoch": 0.36429089842542267, + "grad_norm": 3.585662603378296, + "learning_rate": 9.204040798098783e-05, + "loss": 2.4132, + "step": 97500 + }, + { + "epoch": 0.36615905687888634, + "grad_norm": 2.8213889598846436, + "learning_rate": 9.196095956872841e-05, + "loss": 2.3647, + "step": 98000 + }, + { + "epoch": 0.36802721533235006, + "grad_norm": 3.3626108169555664, + "learning_rate": 9.188099096546838e-05, + "loss": 2.4143, + "step": 98500 + }, + { + "epoch": 0.3698953737858138, + "grad_norm": 2.993591785430908, + "learning_rate": 9.180066171330013e-05, + "loss": 2.3806, + "step": 99000 + }, + { + "epoch": 0.3717635322392775, + "grad_norm": 2.9788472652435303, + "learning_rate": 9.171997250396128e-05, + "loss": 2.3571, + "step": 99500 + }, + { + "epoch": 0.37363169069274116, + "grad_norm": 2.3888766765594482, + "learning_rate": 9.163908648731292e-05, + "loss": 2.3841, + "step": 100000 + }, + { + "epoch": 0.3754998491462049, + "grad_norm": 3.0424160957336426, + "learning_rate": 9.155768016766876e-05, + "loss": 2.4152, + "step": 100500 + }, + { + "epoch": 0.3773680075996686, + "grad_norm": 2.592036724090576, + "learning_rate": 9.147591598323593e-05, + "loss": 2.3465, + "step": 101000 + }, + { + "epoch": 0.3792361660531323, + "grad_norm": 2.8690261840820312, + "learning_rate": 9.139379463810866e-05, + "loss": 2.3974, + "step": 101500 + }, + { + "epoch": 0.381104324506596, + "grad_norm": 2.7227180004119873, + "learning_rate": 9.131148215032317e-05, + "loss": 2.3688, + "step": 102000 + }, + { + "epoch": 0.3829724829600597, + "grad_norm": 2.856623888015747, + "learning_rate": 9.12286493191618e-05, + "loss": 2.4341, + "step": 102500 + }, + { + "epoch": 0.3848406414135234, + "grad_norm": 2.56028151512146, + "learning_rate": 9.114546145658827e-05, + "loss": 2.427, + "step": 103000 + }, + { + "epoch": 0.38670879986698714, + "grad_norm": 3.3118507862091064, + "learning_rate": 9.106208671644056e-05, + "loss": 2.3166, + "step": 103500 + }, + { + "epoch": 0.3885769583204508, + "grad_norm": 3.2025699615478516, + "learning_rate": 9.097819164962692e-05, + "loss": 2.4462, + "step": 104000 + }, + { + "epoch": 0.3904451167739145, + "grad_norm": 3.240300416946411, + "learning_rate": 9.089394370816208e-05, + "loss": 2.4285, + "step": 104500 + }, + { + "epoch": 0.39231327522737824, + "grad_norm": 3.5723962783813477, + "learning_rate": 9.080934361752857e-05, + "loss": 2.355, + "step": 105000 + }, + { + "epoch": 0.39418143368084196, + "grad_norm": 3.186774253845215, + "learning_rate": 9.072456235949608e-05, + "loss": 2.4029, + "step": 105500 + }, + { + "epoch": 0.39604959213430563, + "grad_norm": 2.629359006881714, + "learning_rate": 9.063926085974259e-05, + "loss": 2.3459, + "step": 106000 + }, + { + "epoch": 0.39791775058776935, + "grad_norm": 3.2429652214050293, + "learning_rate": 9.055360940396558e-05, + "loss": 2.3847, + "step": 106500 + }, + { + "epoch": 0.39978590904123307, + "grad_norm": 2.427645206451416, + "learning_rate": 9.046760872973364e-05, + "loss": 2.3435, + "step": 107000 + }, + { + "epoch": 0.4016540674946968, + "grad_norm": 2.556652784347534, + "learning_rate": 9.038143262321399e-05, + "loss": 2.4121, + "step": 107500 + }, + { + "epoch": 0.40352222594816045, + "grad_norm": 2.9563798904418945, + "learning_rate": 9.029473643152501e-05, + "loss": 2.3786, + "step": 108000 + }, + { + "epoch": 0.40539038440162417, + "grad_norm": 2.457141876220703, + "learning_rate": 9.020769325060857e-05, + "loss": 2.3734, + "step": 108500 + }, + { + "epoch": 0.4072585428550879, + "grad_norm": 2.489871025085449, + "learning_rate": 9.012030383001778e-05, + "loss": 2.3934, + "step": 109000 + }, + { + "epoch": 0.4091267013085516, + "grad_norm": 2.9061882495880127, + "learning_rate": 9.003256892228738e-05, + "loss": 2.3507, + "step": 109500 + }, + { + "epoch": 0.4109948597620153, + "grad_norm": 3.2263598442077637, + "learning_rate": 8.994448928292711e-05, + "loss": 2.3866, + "step": 110000 + }, + { + "epoch": 0.412863018215479, + "grad_norm": 2.9006874561309814, + "learning_rate": 8.985606567041537e-05, + "loss": 2.3546, + "step": 110500 + }, + { + "epoch": 0.4147311766689427, + "grad_norm": 2.51509428024292, + "learning_rate": 8.976747672185874e-05, + "loss": 2.3669, + "step": 111000 + }, + { + "epoch": 0.41659933512240643, + "grad_norm": 2.6938908100128174, + "learning_rate": 8.967836813445061e-05, + "loss": 2.3485, + "step": 111500 + }, + { + "epoch": 0.4184674935758701, + "grad_norm": 2.7218174934387207, + "learning_rate": 8.958891786553452e-05, + "loss": 2.3798, + "step": 112000 + }, + { + "epoch": 0.4203356520293338, + "grad_norm": 3.0031161308288574, + "learning_rate": 8.949912668539173e-05, + "loss": 2.3501, + "step": 112500 + }, + { + "epoch": 0.42220381048279754, + "grad_norm": 2.5878889560699463, + "learning_rate": 8.940899536723916e-05, + "loss": 2.3512, + "step": 113000 + }, + { + "epoch": 0.42407196893626126, + "grad_norm": 2.7273967266082764, + "learning_rate": 8.931852468722277e-05, + "loss": 2.3394, + "step": 113500 + }, + { + "epoch": 0.4259401273897249, + "grad_norm": 2.3990983963012695, + "learning_rate": 8.922771542441081e-05, + "loss": 2.3104, + "step": 114000 + }, + { + "epoch": 0.42780828584318864, + "grad_norm": 3.0549476146698, + "learning_rate": 8.913656836078725e-05, + "loss": 2.3557, + "step": 114500 + }, + { + "epoch": 0.42967644429665236, + "grad_norm": 2.417224168777466, + "learning_rate": 8.904508428124488e-05, + "loss": 2.32, + "step": 115000 + }, + { + "epoch": 0.4315446027501161, + "grad_norm": 2.56392502784729, + "learning_rate": 8.895363192352878e-05, + "loss": 2.3651, + "step": 115500 + }, + { + "epoch": 0.43341276120357974, + "grad_norm": 2.027083396911621, + "learning_rate": 8.886147751859986e-05, + "loss": 2.3277, + "step": 116000 + }, + { + "epoch": 0.43528091965704346, + "grad_norm": 1.902034044265747, + "learning_rate": 8.876898846663621e-05, + "loss": 2.3185, + "step": 116500 + }, + { + "epoch": 0.4371490781105072, + "grad_norm": 2.7564985752105713, + "learning_rate": 8.867616556408684e-05, + "loss": 2.3674, + "step": 117000 + }, + { + "epoch": 0.4390172365639709, + "grad_norm": 3.024198532104492, + "learning_rate": 8.858300961027575e-05, + "loss": 2.3832, + "step": 117500 + }, + { + "epoch": 0.44088539501743457, + "grad_norm": 2.2952866554260254, + "learning_rate": 8.84895214073948e-05, + "loss": 2.3799, + "step": 118000 + }, + { + "epoch": 0.4427535534708983, + "grad_norm": 2.352498769760132, + "learning_rate": 8.839570176049705e-05, + "loss": 2.3958, + "step": 118500 + }, + { + "epoch": 0.444621711924362, + "grad_norm": 3.565748453140259, + "learning_rate": 8.830155147748969e-05, + "loss": 2.3614, + "step": 119000 + }, + { + "epoch": 0.4464898703778257, + "grad_norm": 3.0577287673950195, + "learning_rate": 8.82072606579692e-05, + "loss": 2.3458, + "step": 119500 + }, + { + "epoch": 0.4483580288312894, + "grad_norm": 2.6253695487976074, + "learning_rate": 8.81124521950556e-05, + "loss": 2.3273, + "step": 120000 + }, + { + "epoch": 0.4502261872847531, + "grad_norm": 2.1585161685943604, + "learning_rate": 8.801731553517346e-05, + "loss": 2.3298, + "step": 120500 + }, + { + "epoch": 0.45209434573821683, + "grad_norm": 2.5908641815185547, + "learning_rate": 8.792185149757116e-05, + "loss": 2.323, + "step": 121000 + }, + { + "epoch": 0.45396250419168055, + "grad_norm": 1.9700515270233154, + "learning_rate": 8.78262528108574e-05, + "loss": 2.3285, + "step": 121500 + }, + { + "epoch": 0.4558306626451442, + "grad_norm": 2.0091867446899414, + "learning_rate": 8.773013713746569e-05, + "loss": 2.3353, + "step": 122000 + }, + { + "epoch": 0.45769882109860793, + "grad_norm": 3.026522159576416, + "learning_rate": 8.763369655932719e-05, + "loss": 2.3478, + "step": 122500 + }, + { + "epoch": 0.45956697955207165, + "grad_norm": 2.7834973335266113, + "learning_rate": 8.753693190691863e-05, + "loss": 2.3256, + "step": 123000 + }, + { + "epoch": 0.4614351380055354, + "grad_norm": 3.004798173904419, + "learning_rate": 8.743984401350747e-05, + "loss": 2.3466, + "step": 123500 + }, + { + "epoch": 0.46330329645899904, + "grad_norm": 2.611668586730957, + "learning_rate": 8.734262885694443e-05, + "loss": 2.3222, + "step": 124000 + }, + { + "epoch": 0.46517145491246276, + "grad_norm": 2.902439594268799, + "learning_rate": 8.72448976347505e-05, + "loss": 2.3485, + "step": 124500 + }, + { + "epoch": 0.4670396133659265, + "grad_norm": 2.932037353515625, + "learning_rate": 8.714684568634262e-05, + "loss": 2.3258, + "step": 125000 + }, + { + "epoch": 0.4689077718193902, + "grad_norm": 2.526458263397217, + "learning_rate": 8.70484738560735e-05, + "loss": 2.3549, + "step": 125500 + }, + { + "epoch": 0.47077593027285386, + "grad_norm": 2.8670670986175537, + "learning_rate": 8.694978299105044e-05, + "loss": 2.3685, + "step": 126000 + }, + { + "epoch": 0.4726440887263176, + "grad_norm": 2.95123553276062, + "learning_rate": 8.685077394112803e-05, + "loss": 2.327, + "step": 126500 + }, + { + "epoch": 0.4745122471797813, + "grad_norm": 3.010820150375366, + "learning_rate": 8.675164652779493e-05, + "loss": 2.3247, + "step": 127000 + }, + { + "epoch": 0.476380405633245, + "grad_norm": 1.896767258644104, + "learning_rate": 8.665200430068873e-05, + "loss": 2.3158, + "step": 127500 + }, + { + "epoch": 0.4782485640867087, + "grad_norm": 2.559565305709839, + "learning_rate": 8.655204645293866e-05, + "loss": 2.3425, + "step": 128000 + }, + { + "epoch": 0.4801167225401724, + "grad_norm": 2.658048391342163, + "learning_rate": 8.645177384530965e-05, + "loss": 2.3565, + "step": 128500 + }, + { + "epoch": 0.4819848809936361, + "grad_norm": 1.818748116493225, + "learning_rate": 8.635118734127712e-05, + "loss": 2.3441, + "step": 129000 + }, + { + "epoch": 0.48385303944709984, + "grad_norm": 2.627014398574829, + "learning_rate": 8.625028780701953e-05, + "loss": 2.3296, + "step": 129500 + }, + { + "epoch": 0.4857211979005635, + "grad_norm": 2.687391519546509, + "learning_rate": 8.614907611141099e-05, + "loss": 2.3334, + "step": 130000 + }, + { + "epoch": 0.4875893563540272, + "grad_norm": 3.092353582382202, + "learning_rate": 8.604755312601363e-05, + "loss": 2.3278, + "step": 130500 + }, + { + "epoch": 0.48945751480749095, + "grad_norm": 3.0431768894195557, + "learning_rate": 8.59459237010844e-05, + "loss": 2.299, + "step": 131000 + }, + { + "epoch": 0.49132567326095467, + "grad_norm": 2.2302520275115967, + "learning_rate": 8.584378137971116e-05, + "loss": 2.2837, + "step": 131500 + }, + { + "epoch": 0.49319383171441833, + "grad_norm": 2.7669031620025635, + "learning_rate": 8.574133039752728e-05, + "loss": 2.3202, + "step": 132000 + }, + { + "epoch": 0.49506199016788205, + "grad_norm": 2.6957993507385254, + "learning_rate": 8.563857163676681e-05, + "loss": 2.3214, + "step": 132500 + }, + { + "epoch": 0.49693014862134577, + "grad_norm": 2.662504196166992, + "learning_rate": 8.553571241931346e-05, + "loss": 2.2907, + "step": 133000 + }, + { + "epoch": 0.4987983070748095, + "grad_norm": 2.6600215435028076, + "learning_rate": 8.54323413698205e-05, + "loss": 2.2866, + "step": 133500 + }, + { + "epoch": 0.5006664655282732, + "grad_norm": 1.6196849346160889, + "learning_rate": 8.532866520254174e-05, + "loss": 2.3064, + "step": 134000 + }, + { + "epoch": 0.5025346239817369, + "grad_norm": 2.3502981662750244, + "learning_rate": 8.522468481026161e-05, + "loss": 2.3447, + "step": 134500 + }, + { + "epoch": 0.5044027824352005, + "grad_norm": 2.94901442527771, + "learning_rate": 8.512040108838428e-05, + "loss": 2.3602, + "step": 135000 + }, + { + "epoch": 0.5062709408886643, + "grad_norm": 2.749366283416748, + "learning_rate": 8.501581493492603e-05, + "loss": 2.3389, + "step": 135500 + }, + { + "epoch": 0.508139099342128, + "grad_norm": 3.2299070358276367, + "learning_rate": 8.491113732620424e-05, + "loss": 2.3348, + "step": 136000 + }, + { + "epoch": 0.5100072577955918, + "grad_norm": 2.3727314472198486, + "learning_rate": 8.480616028924504e-05, + "loss": 2.2864, + "step": 136500 + }, + { + "epoch": 0.5118754162490554, + "grad_norm": 1.8499844074249268, + "learning_rate": 8.470067345222588e-05, + "loss": 2.271, + "step": 137000 + }, + { + "epoch": 0.5137435747025191, + "grad_norm": 3.1945462226867676, + "learning_rate": 8.459488779801767e-05, + "loss": 2.2967, + "step": 137500 + }, + { + "epoch": 0.5156117331559829, + "grad_norm": 2.6457462310791016, + "learning_rate": 8.448880423757021e-05, + "loss": 2.2784, + "step": 138000 + }, + { + "epoch": 0.5174798916094465, + "grad_norm": 2.016098976135254, + "learning_rate": 8.438242368439869e-05, + "loss": 2.3013, + "step": 138500 + }, + { + "epoch": 0.5193480500629102, + "grad_norm": 1.97508704662323, + "learning_rate": 8.42757470545757e-05, + "loss": 2.3232, + "step": 139000 + }, + { + "epoch": 0.521216208516374, + "grad_norm": 2.349184274673462, + "learning_rate": 8.416877526672355e-05, + "loss": 2.3266, + "step": 139500 + }, + { + "epoch": 0.5230843669698376, + "grad_norm": 2.6522152423858643, + "learning_rate": 8.406150924200616e-05, + "loss": 2.2941, + "step": 140000 + }, + { + "epoch": 0.5249525254233014, + "grad_norm": 3.5393903255462646, + "learning_rate": 8.395394990412121e-05, + "loss": 2.3459, + "step": 140500 + }, + { + "epoch": 0.5268206838767651, + "grad_norm": 2.5476553440093994, + "learning_rate": 8.38460981792922e-05, + "loss": 2.2942, + "step": 141000 + }, + { + "epoch": 0.5286888423302287, + "grad_norm": 2.8197927474975586, + "learning_rate": 8.373817157288324e-05, + "loss": 2.3426, + "step": 141500 + }, + { + "epoch": 0.5305570007836925, + "grad_norm": 2.1316707134246826, + "learning_rate": 8.362973844302275e-05, + "loss": 2.2985, + "step": 142000 + }, + { + "epoch": 0.5324251592371562, + "grad_norm": 1.9890694618225098, + "learning_rate": 8.352101571809362e-05, + "loss": 2.2896, + "step": 142500 + }, + { + "epoch": 0.5342933176906198, + "grad_norm": 3.057724952697754, + "learning_rate": 8.34120043343376e-05, + "loss": 2.3079, + "step": 143000 + }, + { + "epoch": 0.5361614761440836, + "grad_norm": 2.373011350631714, + "learning_rate": 8.330270523048216e-05, + "loss": 2.3294, + "step": 143500 + }, + { + "epoch": 0.5380296345975473, + "grad_norm": 2.1205389499664307, + "learning_rate": 8.31931193477324e-05, + "loss": 2.2969, + "step": 144000 + }, + { + "epoch": 0.539897793051011, + "grad_norm": 2.767277956008911, + "learning_rate": 8.308324762976294e-05, + "loss": 2.2901, + "step": 144500 + }, + { + "epoch": 0.5417659515044747, + "grad_norm": 2.847618579864502, + "learning_rate": 8.297309102270986e-05, + "loss": 2.3128, + "step": 145000 + }, + { + "epoch": 0.5436341099579384, + "grad_norm": 2.3643147945404053, + "learning_rate": 8.286287163899844e-05, + "loss": 2.2991, + "step": 145500 + }, + { + "epoch": 0.5455022684114021, + "grad_norm": 3.874725103378296, + "learning_rate": 8.275214866701926e-05, + "loss": 2.2602, + "step": 146000 + }, + { + "epoch": 0.5473704268648658, + "grad_norm": 2.4457411766052246, + "learning_rate": 8.264114365714206e-05, + "loss": 2.3038, + "step": 146500 + }, + { + "epoch": 0.5492385853183295, + "grad_norm": 2.56156063079834, + "learning_rate": 8.252985756526198e-05, + "loss": 2.3193, + "step": 147000 + }, + { + "epoch": 0.5511067437717933, + "grad_norm": 3.2425754070281982, + "learning_rate": 8.241851476105105e-05, + "loss": 2.294, + "step": 147500 + }, + { + "epoch": 0.5529749022252569, + "grad_norm": 3.299207925796509, + "learning_rate": 8.23066699398898e-05, + "loss": 2.2933, + "step": 148000 + }, + { + "epoch": 0.5548430606787207, + "grad_norm": 2.3422181606292725, + "learning_rate": 8.219454691697226e-05, + "loss": 2.3066, + "step": 148500 + }, + { + "epoch": 0.5567112191321844, + "grad_norm": 2.9155092239379883, + "learning_rate": 8.208214665782109e-05, + "loss": 2.2698, + "step": 149000 + }, + { + "epoch": 0.558579377585648, + "grad_norm": 3.0940420627593994, + "learning_rate": 8.196969575847251e-05, + "loss": 2.2787, + "step": 149500 + }, + { + "epoch": 0.5604475360391118, + "grad_norm": 3.761610507965088, + "learning_rate": 8.185674448258929e-05, + "loss": 2.3008, + "step": 150000 + }, + { + "epoch": 0.5623156944925755, + "grad_norm": 2.735173463821411, + "learning_rate": 8.174374560372093e-05, + "loss": 2.3122, + "step": 150500 + }, + { + "epoch": 0.5641838529460392, + "grad_norm": 2.3430800437927246, + "learning_rate": 8.163024719393988e-05, + "loss": 2.2645, + "step": 151000 + }, + { + "epoch": 0.5660520113995029, + "grad_norm": 2.489206314086914, + "learning_rate": 8.151647640726769e-05, + "loss": 2.2695, + "step": 151500 + }, + { + "epoch": 0.5679201698529666, + "grad_norm": 3.2072606086730957, + "learning_rate": 8.140243422341638e-05, + "loss": 2.2641, + "step": 152000 + }, + { + "epoch": 0.5697883283064303, + "grad_norm": 3.0480380058288574, + "learning_rate": 8.128812162443502e-05, + "loss": 2.3294, + "step": 152500 + }, + { + "epoch": 0.571656486759894, + "grad_norm": 3.000128746032715, + "learning_rate": 8.117353959470134e-05, + "loss": 2.2637, + "step": 153000 + }, + { + "epoch": 0.5735246452133577, + "grad_norm": 3.1820998191833496, + "learning_rate": 8.105868912091317e-05, + "loss": 2.2759, + "step": 153500 + }, + { + "epoch": 0.5753928036668214, + "grad_norm": 2.6837666034698486, + "learning_rate": 8.094357119208004e-05, + "loss": 2.2549, + "step": 154000 + }, + { + "epoch": 0.5772609621202851, + "grad_norm": 2.4082396030426025, + "learning_rate": 8.082841783357048e-05, + "loss": 2.3007, + "step": 154500 + }, + { + "epoch": 0.5791291205737489, + "grad_norm": 2.461305618286133, + "learning_rate": 8.0712768500827e-05, + "loss": 2.2654, + "step": 155000 + }, + { + "epoch": 0.5809972790272125, + "grad_norm": 2.9279286861419678, + "learning_rate": 8.059708678275976e-05, + "loss": 2.2669, + "step": 155500 + }, + { + "epoch": 0.5828654374806762, + "grad_norm": 2.3760006427764893, + "learning_rate": 8.048091002168906e-05, + "loss": 2.2429, + "step": 156000 + }, + { + "epoch": 0.58473359593414, + "grad_norm": 2.879556894302368, + "learning_rate": 8.036447078099056e-05, + "loss": 2.2694, + "step": 156500 + }, + { + "epoch": 0.5866017543876036, + "grad_norm": 1.9433120489120483, + "learning_rate": 8.024777006335506e-05, + "loss": 2.243, + "step": 157000 + }, + { + "epoch": 0.5884699128410673, + "grad_norm": 2.5363948345184326, + "learning_rate": 8.013080887372506e-05, + "loss": 2.267, + "step": 157500 + }, + { + "epoch": 0.5903380712945311, + "grad_norm": 2.3004775047302246, + "learning_rate": 8.001358821928599e-05, + "loss": 2.2711, + "step": 158000 + }, + { + "epoch": 0.5922062297479948, + "grad_norm": 2.1187326908111572, + "learning_rate": 7.989610910945766e-05, + "loss": 2.2733, + "step": 158500 + }, + { + "epoch": 0.5940743882014585, + "grad_norm": 2.612976312637329, + "learning_rate": 7.977860828524794e-05, + "loss": 2.2617, + "step": 159000 + }, + { + "epoch": 0.5959425466549222, + "grad_norm": 2.5254204273223877, + "learning_rate": 7.96606158136407e-05, + "loss": 2.2624, + "step": 159500 + }, + { + "epoch": 0.5978107051083859, + "grad_norm": 2.352216958999634, + "learning_rate": 7.954236792618814e-05, + "loss": 2.2923, + "step": 160000 + }, + { + "epoch": 0.5996788635618496, + "grad_norm": 2.5276451110839844, + "learning_rate": 7.942386564115584e-05, + "loss": 2.281, + "step": 160500 + }, + { + "epoch": 0.6015470220153133, + "grad_norm": 2.3592355251312256, + "learning_rate": 7.930510997900007e-05, + "loss": 2.252, + "step": 161000 + }, + { + "epoch": 0.603415180468777, + "grad_norm": 3.495464324951172, + "learning_rate": 7.918610196235899e-05, + "loss": 2.2379, + "step": 161500 + }, + { + "epoch": 0.6052833389222407, + "grad_norm": 2.2157094478607178, + "learning_rate": 7.906684261604388e-05, + "loss": 2.2813, + "step": 162000 + }, + { + "epoch": 0.6071514973757044, + "grad_norm": 3.170558452606201, + "learning_rate": 7.894733296703025e-05, + "loss": 2.2457, + "step": 162500 + }, + { + "epoch": 0.6090196558291682, + "grad_norm": 3.1325762271881104, + "learning_rate": 7.882781381038415e-05, + "loss": 2.2531, + "step": 163000 + }, + { + "epoch": 0.6108878142826318, + "grad_norm": 2.3855438232421875, + "learning_rate": 7.87078071409669e-05, + "loss": 2.2665, + "step": 163500 + }, + { + "epoch": 0.6127559727360955, + "grad_norm": 2.261495351791382, + "learning_rate": 7.858755326060588e-05, + "loss": 2.2769, + "step": 164000 + }, + { + "epoch": 0.6146241311895593, + "grad_norm": 3.212700128555298, + "learning_rate": 7.846705320484082e-05, + "loss": 2.2719, + "step": 164500 + }, + { + "epoch": 0.6164922896430229, + "grad_norm": 2.875687837600708, + "learning_rate": 7.83465497456751e-05, + "loss": 2.2756, + "step": 165000 + }, + { + "epoch": 0.6183604480964866, + "grad_norm": 3.213188886642456, + "learning_rate": 7.822556094134869e-05, + "loss": 2.2475, + "step": 165500 + }, + { + "epoch": 0.6202286065499504, + "grad_norm": 2.9114816188812256, + "learning_rate": 7.81043290788352e-05, + "loss": 2.2411, + "step": 166000 + }, + { + "epoch": 0.622096765003414, + "grad_norm": 2.960690498352051, + "learning_rate": 7.798285520209603e-05, + "loss": 2.2823, + "step": 166500 + }, + { + "epoch": 0.6239649234568778, + "grad_norm": 2.9522547721862793, + "learning_rate": 7.786138402665644e-05, + "loss": 2.2186, + "step": 167000 + }, + { + "epoch": 0.6258330819103415, + "grad_norm": 2.8541057109832764, + "learning_rate": 7.773942974047013e-05, + "loss": 2.2735, + "step": 167500 + }, + { + "epoch": 0.6277012403638051, + "grad_norm": 2.182999849319458, + "learning_rate": 7.761723658230827e-05, + "loss": 2.2556, + "step": 168000 + }, + { + "epoch": 0.6295693988172689, + "grad_norm": 2.0711419582366943, + "learning_rate": 7.749480560441025e-05, + "loss": 2.2949, + "step": 168500 + }, + { + "epoch": 0.6314375572707326, + "grad_norm": 2.7931690216064453, + "learning_rate": 7.737238343214024e-05, + "loss": 2.2579, + "step": 169000 + }, + { + "epoch": 0.6333057157241962, + "grad_norm": 2.2357709407806396, + "learning_rate": 7.724948045003347e-05, + "loss": 2.2145, + "step": 169500 + }, + { + "epoch": 0.63517387417766, + "grad_norm": 2.4123311042785645, + "learning_rate": 7.712634281504125e-05, + "loss": 2.2908, + "step": 170000 + }, + { + "epoch": 0.6370420326311237, + "grad_norm": 3.390855312347412, + "learning_rate": 7.700321856241075e-05, + "loss": 2.1975, + "step": 170500 + }, + { + "epoch": 0.6389101910845875, + "grad_norm": 2.8016293048858643, + "learning_rate": 7.687961526877562e-05, + "loss": 2.2842, + "step": 171000 + }, + { + "epoch": 0.6407783495380511, + "grad_norm": 2.734112501144409, + "learning_rate": 7.675578050726744e-05, + "loss": 2.2881, + "step": 171500 + }, + { + "epoch": 0.6426465079915148, + "grad_norm": 2.7221627235412598, + "learning_rate": 7.66317153442619e-05, + "loss": 2.2748, + "step": 172000 + }, + { + "epoch": 0.6445146664449786, + "grad_norm": 2.9320507049560547, + "learning_rate": 7.650766966527448e-05, + "loss": 2.2157, + "step": 172500 + }, + { + "epoch": 0.6463828248984422, + "grad_norm": 2.428924798965454, + "learning_rate": 7.638314736178451e-05, + "loss": 2.2613, + "step": 173000 + }, + { + "epoch": 0.6482509833519059, + "grad_norm": 2.5038206577301025, + "learning_rate": 7.62583978656453e-05, + "loss": 2.2606, + "step": 173500 + }, + { + "epoch": 0.6501191418053697, + "grad_norm": 2.3970868587493896, + "learning_rate": 7.613342225110954e-05, + "loss": 2.2383, + "step": 174000 + }, + { + "epoch": 0.6519873002588333, + "grad_norm": 2.124425172805786, + "learning_rate": 7.60082215943772e-05, + "loss": 2.2513, + "step": 174500 + }, + { + "epoch": 0.6538554587122971, + "grad_norm": 3.180497884750366, + "learning_rate": 7.58830480456262e-05, + "loss": 2.2722, + "step": 175000 + }, + { + "epoch": 0.6557236171657608, + "grad_norm": 2.8902299404144287, + "learning_rate": 7.575740098553152e-05, + "loss": 2.2439, + "step": 175500 + }, + { + "epoch": 0.6575917756192244, + "grad_norm": 2.987680196762085, + "learning_rate": 7.563153212126435e-05, + "loss": 2.233, + "step": 176000 + }, + { + "epoch": 0.6594599340726882, + "grad_norm": 2.5328335762023926, + "learning_rate": 7.550544253671663e-05, + "loss": 2.2434, + "step": 176500 + }, + { + "epoch": 0.6613280925261519, + "grad_norm": 2.5823991298675537, + "learning_rate": 7.537913331768098e-05, + "loss": 2.2261, + "step": 177000 + }, + { + "epoch": 0.6631962509796155, + "grad_norm": 3.252668619155884, + "learning_rate": 7.525260555184135e-05, + "loss": 2.2626, + "step": 177500 + }, + { + "epoch": 0.6650644094330793, + "grad_norm": 2.427614688873291, + "learning_rate": 7.512586032876367e-05, + "loss": 2.2249, + "step": 178000 + }, + { + "epoch": 0.666932567886543, + "grad_norm": 2.6210880279541016, + "learning_rate": 7.49988987398865e-05, + "loss": 2.2602, + "step": 178500 + }, + { + "epoch": 0.6688007263400068, + "grad_norm": 2.7572479248046875, + "learning_rate": 7.487223101332892e-05, + "loss": 2.2325, + "step": 179000 + }, + { + "epoch": 0.6706688847934704, + "grad_norm": 3.2144672870635986, + "learning_rate": 7.474484082913688e-05, + "loss": 2.2835, + "step": 179500 + }, + { + "epoch": 0.6725370432469341, + "grad_norm": 2.4524009227752686, + "learning_rate": 7.461723756021062e-05, + "loss": 2.274, + "step": 180000 + }, + { + "epoch": 0.6744052017003979, + "grad_norm": 2.676546335220337, + "learning_rate": 7.44894223053775e-05, + "loss": 2.2941, + "step": 180500 + }, + { + "epoch": 0.6762733601538615, + "grad_norm": 3.0090246200561523, + "learning_rate": 7.43613961652904e-05, + "loss": 2.2545, + "step": 181000 + }, + { + "epoch": 0.6781415186073252, + "grad_norm": 2.6397953033447266, + "learning_rate": 7.423316024241814e-05, + "loss": 2.2541, + "step": 181500 + }, + { + "epoch": 0.680009677060789, + "grad_norm": 3.0165371894836426, + "learning_rate": 7.410471564103606e-05, + "loss": 2.2319, + "step": 182000 + }, + { + "epoch": 0.6818778355142526, + "grad_norm": 2.1070499420166016, + "learning_rate": 7.39760634672165e-05, + "loss": 2.2617, + "step": 182500 + }, + { + "epoch": 0.6837459939677164, + "grad_norm": 2.777233123779297, + "learning_rate": 7.384746275141047e-05, + "loss": 2.2206, + "step": 183000 + }, + { + "epoch": 0.6856141524211801, + "grad_norm": 2.188089370727539, + "learning_rate": 7.371839916767453e-05, + "loss": 2.2428, + "step": 183500 + }, + { + "epoch": 0.6874823108746437, + "grad_norm": 2.427400827407837, + "learning_rate": 7.358913133818016e-05, + "loss": 2.2161, + "step": 184000 + }, + { + "epoch": 0.6893504693281075, + "grad_norm": 2.542616605758667, + "learning_rate": 7.34596603760887e-05, + "loss": 2.266, + "step": 184500 + }, + { + "epoch": 0.6912186277815712, + "grad_norm": 2.6249241828918457, + "learning_rate": 7.333024694314207e-05, + "loss": 2.2383, + "step": 185000 + }, + { + "epoch": 0.6930867862350348, + "grad_norm": 2.5798895359039307, + "learning_rate": 7.320037346301442e-05, + "loss": 2.2524, + "step": 185500 + }, + { + "epoch": 0.6949549446884986, + "grad_norm": 2.9020352363586426, + "learning_rate": 7.307030019799232e-05, + "loss": 2.2251, + "step": 186000 + }, + { + "epoch": 0.6968231031419623, + "grad_norm": 3.3277840614318848, + "learning_rate": 7.294002826817298e-05, + "loss": 2.2608, + "step": 186500 + }, + { + "epoch": 0.698691261595426, + "grad_norm": 2.6658146381378174, + "learning_rate": 7.280955879536435e-05, + "loss": 2.2689, + "step": 187000 + }, + { + "epoch": 0.7005594200488897, + "grad_norm": 2.736542224884033, + "learning_rate": 7.267915443013911e-05, + "loss": 2.2004, + "step": 187500 + }, + { + "epoch": 0.7024275785023534, + "grad_norm": 2.440765619277954, + "learning_rate": 7.254829363303503e-05, + "loss": 2.2541, + "step": 188000 + }, + { + "epoch": 0.7042957369558172, + "grad_norm": 2.6804561614990234, + "learning_rate": 7.241723866627799e-05, + "loss": 2.2647, + "step": 188500 + }, + { + "epoch": 0.7061638954092808, + "grad_norm": 2.6702585220336914, + "learning_rate": 7.228599065841891e-05, + "loss": 2.2004, + "step": 189000 + }, + { + "epoch": 0.7080320538627445, + "grad_norm": 2.5987019538879395, + "learning_rate": 7.215481381028357e-05, + "loss": 2.2509, + "step": 189500 + }, + { + "epoch": 0.7099002123162083, + "grad_norm": 2.9680731296539307, + "learning_rate": 7.20231834929401e-05, + "loss": 2.2262, + "step": 190000 + }, + { + "epoch": 0.7117683707696719, + "grad_norm": 3.8419201374053955, + "learning_rate": 7.189136352781376e-05, + "loss": 2.2313, + "step": 190500 + }, + { + "epoch": 0.7136365292231357, + "grad_norm": 2.6179468631744385, + "learning_rate": 7.175935505004304e-05, + "loss": 2.2466, + "step": 191000 + }, + { + "epoch": 0.7155046876765994, + "grad_norm": 1.9412791728973389, + "learning_rate": 7.162742377434187e-05, + "loss": 2.2336, + "step": 191500 + }, + { + "epoch": 0.717372846130063, + "grad_norm": 2.312648057937622, + "learning_rate": 7.149504205451939e-05, + "loss": 2.2124, + "step": 192000 + }, + { + "epoch": 0.7192410045835268, + "grad_norm": 2.4080445766448975, + "learning_rate": 7.136247523488743e-05, + "loss": 2.2103, + "step": 192500 + }, + { + "epoch": 0.7211091630369905, + "grad_norm": 3.0859153270721436, + "learning_rate": 7.122972445701587e-05, + "loss": 2.1961, + "step": 193000 + }, + { + "epoch": 0.7229773214904541, + "grad_norm": 3.438227415084839, + "learning_rate": 7.10970569129335e-05, + "loss": 2.2128, + "step": 193500 + }, + { + "epoch": 0.7248454799439179, + "grad_norm": 2.6577913761138916, + "learning_rate": 7.096394201181632e-05, + "loss": 2.2254, + "step": 194000 + }, + { + "epoch": 0.7267136383973816, + "grad_norm": 2.579580068588257, + "learning_rate": 7.083064658434042e-05, + "loss": 2.2562, + "step": 194500 + }, + { + "epoch": 0.7285817968508453, + "grad_norm": 2.957392454147339, + "learning_rate": 7.069717177834997e-05, + "loss": 2.2762, + "step": 195000 + }, + { + "epoch": 0.730449955304309, + "grad_norm": 1.9975017309188843, + "learning_rate": 7.056378622641193e-05, + "loss": 2.2385, + "step": 195500 + }, + { + "epoch": 0.7323181137577727, + "grad_norm": 3.1538219451904297, + "learning_rate": 7.042995646610036e-05, + "loss": 2.2086, + "step": 196000 + }, + { + "epoch": 0.7341862722112364, + "grad_norm": 2.2817578315734863, + "learning_rate": 7.02959507777287e-05, + "loss": 2.2153, + "step": 196500 + }, + { + "epoch": 0.7360544306647001, + "grad_norm": 2.5474236011505127, + "learning_rate": 7.016177031525738e-05, + "loss": 2.2388, + "step": 197000 + }, + { + "epoch": 0.7379225891181638, + "grad_norm": 2.5271482467651367, + "learning_rate": 7.002795399479169e-05, + "loss": 2.2344, + "step": 197500 + }, + { + "epoch": 0.7397907475716275, + "grad_norm": 1.9711894989013672, + "learning_rate": 6.989342813955246e-05, + "loss": 2.1875, + "step": 198000 + }, + { + "epoch": 0.7416589060250912, + "grad_norm": 2.832296133041382, + "learning_rate": 6.97587309764484e-05, + "loss": 2.2378, + "step": 198500 + }, + { + "epoch": 0.743527064478555, + "grad_norm": 3.224106788635254, + "learning_rate": 6.962386366539439e-05, + "loss": 2.1749, + "step": 199000 + }, + { + "epoch": 0.7453952229320187, + "grad_norm": 2.2426908016204834, + "learning_rate": 6.948882736777054e-05, + "loss": 2.1997, + "step": 199500 + }, + { + "epoch": 0.7472633813854823, + "grad_norm": 2.7945656776428223, + "learning_rate": 6.935362324641206e-05, + "loss": 2.2217, + "step": 200000 + }, + { + "epoch": 0.7491315398389461, + "grad_norm": 2.7567574977874756, + "learning_rate": 6.921825246559942e-05, + "loss": 2.2296, + "step": 200500 + }, + { + "epoch": 0.7509996982924098, + "grad_norm": 2.5919723510742188, + "learning_rate": 6.908298742798458e-05, + "loss": 2.2364, + "step": 201000 + }, + { + "epoch": 0.7528678567458734, + "grad_norm": 2.993880271911621, + "learning_rate": 6.894728715432299e-05, + "loss": 2.2065, + "step": 201500 + }, + { + "epoch": 0.7547360151993372, + "grad_norm": 2.4301109313964844, + "learning_rate": 6.881142372028077e-05, + "loss": 2.2457, + "step": 202000 + }, + { + "epoch": 0.7566041736528009, + "grad_norm": 2.623084783554077, + "learning_rate": 6.867539829581595e-05, + "loss": 2.1742, + "step": 202500 + }, + { + "epoch": 0.7584723321062646, + "grad_norm": 3.4304981231689453, + "learning_rate": 6.853921205228139e-05, + "loss": 2.2292, + "step": 203000 + }, + { + "epoch": 0.7603404905597283, + "grad_norm": 1.7889618873596191, + "learning_rate": 6.84028661624149e-05, + "loss": 2.217, + "step": 203500 + }, + { + "epoch": 0.762208649013192, + "grad_norm": 2.954709053039551, + "learning_rate": 6.8266361800329e-05, + "loss": 2.2491, + "step": 204000 + }, + { + "epoch": 0.7640768074666557, + "grad_norm": 2.892221212387085, + "learning_rate": 6.812970014150086e-05, + "loss": 2.2431, + "step": 204500 + }, + { + "epoch": 0.7659449659201194, + "grad_norm": 1.9717577695846558, + "learning_rate": 6.799315615334446e-05, + "loss": 2.2397, + "step": 205000 + }, + { + "epoch": 0.7678131243735831, + "grad_norm": 2.904269218444824, + "learning_rate": 6.785618374157811e-05, + "loss": 2.1972, + "step": 205500 + }, + { + "epoch": 0.7696812828270468, + "grad_norm": 3.807295083999634, + "learning_rate": 6.771933197025247e-05, + "loss": 2.2292, + "step": 206000 + }, + { + "epoch": 0.7715494412805105, + "grad_norm": 3.4538333415985107, + "learning_rate": 6.758205351413722e-05, + "loss": 2.1935, + "step": 206500 + }, + { + "epoch": 0.7734175997339743, + "grad_norm": 2.769444227218628, + "learning_rate": 6.744462365404948e-05, + "loss": 2.1709, + "step": 207000 + }, + { + "epoch": 0.775285758187438, + "grad_norm": 3.002584934234619, + "learning_rate": 6.730704357343616e-05, + "loss": 2.1863, + "step": 207500 + }, + { + "epoch": 0.7771539166409016, + "grad_norm": 2.559108257293701, + "learning_rate": 6.716959006322012e-05, + "loss": 2.2118, + "step": 208000 + }, + { + "epoch": 0.7790220750943654, + "grad_norm": 3.1521153450012207, + "learning_rate": 6.703171339157552e-05, + "loss": 2.19, + "step": 208500 + }, + { + "epoch": 0.780890233547829, + "grad_norm": 2.7111008167266846, + "learning_rate": 6.689369005509088e-05, + "loss": 2.2044, + "step": 209000 + }, + { + "epoch": 0.7827583920012927, + "grad_norm": 2.8580000400543213, + "learning_rate": 6.675552124232371e-05, + "loss": 2.2458, + "step": 209500 + }, + { + "epoch": 0.7846265504547565, + "grad_norm": 2.7248494625091553, + "learning_rate": 6.661720814308425e-05, + "loss": 2.2096, + "step": 210000 + }, + { + "epoch": 0.7864947089082202, + "grad_norm": 3.5847723484039307, + "learning_rate": 6.647875194842521e-05, + "loss": 2.2238, + "step": 210500 + }, + { + "epoch": 0.7883628673616839, + "grad_norm": 3.013185977935791, + "learning_rate": 6.634015385063155e-05, + "loss": 2.2128, + "step": 211000 + }, + { + "epoch": 0.7902310258151476, + "grad_norm": 3.160470962524414, + "learning_rate": 6.620141504321021e-05, + "loss": 2.2604, + "step": 211500 + }, + { + "epoch": 0.7920991842686113, + "grad_norm": 3.009772300720215, + "learning_rate": 6.606281461596562e-05, + "loss": 2.2169, + "step": 212000 + }, + { + "epoch": 0.793967342722075, + "grad_norm": 2.7089791297912598, + "learning_rate": 6.592379825008977e-05, + "loss": 2.1894, + "step": 212500 + }, + { + "epoch": 0.7958355011755387, + "grad_norm": 2.2874131202697754, + "learning_rate": 6.578492320297462e-05, + "loss": 2.2472, + "step": 213000 + }, + { + "epoch": 0.7977036596290024, + "grad_norm": 3.115208864212036, + "learning_rate": 6.564563405749691e-05, + "loss": 2.1696, + "step": 213500 + }, + { + "epoch": 0.7995718180824661, + "grad_norm": 3.074309825897217, + "learning_rate": 6.550621018309538e-05, + "loss": 2.2022, + "step": 214000 + }, + { + "epoch": 0.8014399765359298, + "grad_norm": 2.6160593032836914, + "learning_rate": 6.536665278038796e-05, + "loss": 2.2136, + "step": 214500 + }, + { + "epoch": 0.8033081349893936, + "grad_norm": 2.875887155532837, + "learning_rate": 6.522696305114238e-05, + "loss": 2.222, + "step": 215000 + }, + { + "epoch": 0.8051762934428572, + "grad_norm": 1.9582101106643677, + "learning_rate": 6.508714219826595e-05, + "loss": 2.1975, + "step": 215500 + }, + { + "epoch": 0.8070444518963209, + "grad_norm": 3.11397647857666, + "learning_rate": 6.494719142579506e-05, + "loss": 2.2285, + "step": 216000 + }, + { + "epoch": 0.8089126103497847, + "grad_norm": 2.7110836505889893, + "learning_rate": 6.480711193888488e-05, + "loss": 2.1638, + "step": 216500 + }, + { + "epoch": 0.8107807688032483, + "grad_norm": 2.2085702419281006, + "learning_rate": 6.4666904943799e-05, + "loss": 2.2144, + "step": 217000 + }, + { + "epoch": 0.812648927256712, + "grad_norm": 3.44262957572937, + "learning_rate": 6.452657164789899e-05, + "loss": 2.2248, + "step": 217500 + }, + { + "epoch": 0.8145170857101758, + "grad_norm": 2.770791530609131, + "learning_rate": 6.438639430044904e-05, + "loss": 2.1861, + "step": 218000 + }, + { + "epoch": 0.8163852441636394, + "grad_norm": 3.2068679332733154, + "learning_rate": 6.424581227590346e-05, + "loss": 2.1691, + "step": 218500 + }, + { + "epoch": 0.8182534026171032, + "grad_norm": 3.264312744140625, + "learning_rate": 6.410510757669032e-05, + "loss": 2.159, + "step": 219000 + }, + { + "epoch": 0.8201215610705669, + "grad_norm": 3.264051675796509, + "learning_rate": 6.396428141445709e-05, + "loss": 2.1775, + "step": 219500 + }, + { + "epoch": 0.8219897195240305, + "grad_norm": 2.961418867111206, + "learning_rate": 6.382333500189714e-05, + "loss": 2.1851, + "step": 220000 + }, + { + "epoch": 0.8238578779774943, + "grad_norm": 4.034390449523926, + "learning_rate": 6.368226955273941e-05, + "loss": 2.1552, + "step": 220500 + }, + { + "epoch": 0.825726036430958, + "grad_norm": 2.0030012130737305, + "learning_rate": 6.354136876505816e-05, + "loss": 2.1762, + "step": 221000 + }, + { + "epoch": 0.8275941948844217, + "grad_norm": 2.7552449703216553, + "learning_rate": 6.340006911997954e-05, + "loss": 2.1758, + "step": 221500 + }, + { + "epoch": 0.8294623533378854, + "grad_norm": 2.4928476810455322, + "learning_rate": 6.325865408316381e-05, + "loss": 2.1951, + "step": 222000 + }, + { + "epoch": 0.8313305117913491, + "grad_norm": 2.8218753337860107, + "learning_rate": 6.311712487237538e-05, + "loss": 2.1348, + "step": 222500 + }, + { + "epoch": 0.8331986702448129, + "grad_norm": 3.4085326194763184, + "learning_rate": 6.297548270636179e-05, + "loss": 2.2058, + "step": 223000 + }, + { + "epoch": 0.8350668286982765, + "grad_norm": 3.3644134998321533, + "learning_rate": 6.283372880484332e-05, + "loss": 2.1574, + "step": 223500 + }, + { + "epoch": 0.8369349871517402, + "grad_norm": 3.0675761699676514, + "learning_rate": 6.269186438850234e-05, + "loss": 2.1725, + "step": 224000 + }, + { + "epoch": 0.838803145605204, + "grad_norm": 2.6877012252807617, + "learning_rate": 6.2549890678973e-05, + "loss": 2.1889, + "step": 224500 + }, + { + "epoch": 0.8406713040586676, + "grad_norm": 3.4169256687164307, + "learning_rate": 6.240837743960651e-05, + "loss": 2.1423, + "step": 225000 + }, + { + "epoch": 0.8425394625121313, + "grad_norm": 3.0024383068084717, + "learning_rate": 6.22661892373068e-05, + "loss": 2.178, + "step": 225500 + }, + { + "epoch": 0.8444076209655951, + "grad_norm": 3.079028606414795, + "learning_rate": 6.212389540742632e-05, + "loss": 2.2295, + "step": 226000 + }, + { + "epoch": 0.8462757794190587, + "grad_norm": 2.90077805519104, + "learning_rate": 6.198149717529692e-05, + "loss": 2.1684, + "step": 226500 + }, + { + "epoch": 0.8481439378725225, + "grad_norm": 3.053629159927368, + "learning_rate": 6.18389957671496e-05, + "loss": 2.1738, + "step": 227000 + }, + { + "epoch": 0.8500120963259862, + "grad_norm": 3.0925843715667725, + "learning_rate": 6.16963924101038e-05, + "loss": 2.1551, + "step": 227500 + }, + { + "epoch": 0.8518802547794498, + "grad_norm": 3.0221009254455566, + "learning_rate": 6.155368833215677e-05, + "loss": 2.1966, + "step": 228000 + }, + { + "epoch": 0.8537484132329136, + "grad_norm": 2.5803329944610596, + "learning_rate": 6.141088476217323e-05, + "loss": 2.164, + "step": 228500 + }, + { + "epoch": 0.8556165716863773, + "grad_norm": 3.4956555366516113, + "learning_rate": 6.126826883078718e-05, + "loss": 2.1776, + "step": 229000 + }, + { + "epoch": 0.8574847301398411, + "grad_norm": 2.8954169750213623, + "learning_rate": 6.112527015957583e-05, + "loss": 2.1944, + "step": 229500 + }, + { + "epoch": 0.8593528885933047, + "grad_norm": 3.2150614261627197, + "learning_rate": 6.0982175685556475e-05, + "loss": 2.1942, + "step": 230000 + }, + { + "epoch": 0.8612210470467684, + "grad_norm": 2.8969147205352783, + "learning_rate": 6.083898664095558e-05, + "loss": 2.152, + "step": 230500 + }, + { + "epoch": 0.8630892055002322, + "grad_norm": 2.898751974105835, + "learning_rate": 6.069599091590918e-05, + "loss": 2.1624, + "step": 231000 + }, + { + "epoch": 0.8649573639536958, + "grad_norm": 3.5042660236358643, + "learning_rate": 6.05529034527542e-05, + "loss": 2.1428, + "step": 231500 + }, + { + "epoch": 0.8668255224071595, + "grad_norm": 3.0192151069641113, + "learning_rate": 6.040943845887397e-05, + "loss": 2.1942, + "step": 232000 + }, + { + "epoch": 0.8686936808606233, + "grad_norm": 3.0444955825805664, + "learning_rate": 6.026588382641243e-05, + "loss": 2.1533, + "step": 232500 + }, + { + "epoch": 0.8705618393140869, + "grad_norm": 3.1138992309570312, + "learning_rate": 6.012224079155855e-05, + "loss": 2.1841, + "step": 233000 + }, + { + "epoch": 0.8724299977675507, + "grad_norm": 2.3980443477630615, + "learning_rate": 5.997879813783181e-05, + "loss": 2.1724, + "step": 233500 + }, + { + "epoch": 0.8742981562210144, + "grad_norm": 2.9543912410736084, + "learning_rate": 5.9834982180414524e-05, + "loss": 2.1502, + "step": 234000 + }, + { + "epoch": 0.876166314674478, + "grad_norm": 2.555027961730957, + "learning_rate": 5.969108153121932e-05, + "loss": 2.1499, + "step": 234500 + }, + { + "epoch": 0.8780344731279418, + "grad_norm": 2.4806180000305176, + "learning_rate": 5.954709742941489e-05, + "loss": 2.1733, + "step": 235000 + }, + { + "epoch": 0.8799026315814055, + "grad_norm": 2.855769634246826, + "learning_rate": 5.9403031114888505e-05, + "loss": 2.1783, + "step": 235500 + }, + { + "epoch": 0.8817707900348691, + "grad_norm": 2.85447359085083, + "learning_rate": 5.9258883828235466e-05, + "loss": 2.1684, + "step": 236000 + }, + { + "epoch": 0.8836389484883329, + "grad_norm": 3.5129261016845703, + "learning_rate": 5.911494534352925e-05, + "loss": 2.1825, + "step": 236500 + }, + { + "epoch": 0.8855071069417966, + "grad_norm": 3.9751412868499756, + "learning_rate": 5.8970639992924826e-05, + "loss": 2.1827, + "step": 237000 + }, + { + "epoch": 0.8873752653952603, + "grad_norm": 3.1551120281219482, + "learning_rate": 5.882625739363443e-05, + "loss": 2.2232, + "step": 237500 + }, + { + "epoch": 0.889243423848724, + "grad_norm": 3.2931878566741943, + "learning_rate": 5.868179878897693e-05, + "loss": 2.1291, + "step": 238000 + }, + { + "epoch": 0.8911115823021877, + "grad_norm": 3.2662160396575928, + "learning_rate": 5.853726542292572e-05, + "loss": 2.1776, + "step": 238500 + }, + { + "epoch": 0.8929797407556515, + "grad_norm": 2.764841079711914, + "learning_rate": 5.8392658540097975e-05, + "loss": 2.1069, + "step": 239000 + }, + { + "epoch": 0.8948478992091151, + "grad_norm": 1.903836965560913, + "learning_rate": 5.8247979385743945e-05, + "loss": 2.1436, + "step": 239500 + }, + { + "epoch": 0.8967160576625788, + "grad_norm": 2.859905481338501, + "learning_rate": 5.8103229205736235e-05, + "loss": 2.1426, + "step": 240000 + }, + { + "epoch": 0.8985842161160426, + "grad_norm": 3.1984663009643555, + "learning_rate": 5.79586989552882e-05, + "loss": 2.1798, + "step": 240500 + }, + { + "epoch": 0.9004523745695062, + "grad_norm": 2.157151222229004, + "learning_rate": 5.781381059984584e-05, + "loss": 2.1766, + "step": 241000 + }, + { + "epoch": 0.90232053302297, + "grad_norm": 3.674839973449707, + "learning_rate": 5.7668854957498444e-05, + "loss": 2.1925, + "step": 241500 + }, + { + "epoch": 0.9041886914764337, + "grad_norm": 2.9118549823760986, + "learning_rate": 5.752383327649953e-05, + "loss": 2.1655, + "step": 242000 + }, + { + "epoch": 0.9060568499298973, + "grad_norm": 3.0006792545318604, + "learning_rate": 5.737903704244284e-05, + "loss": 2.1639, + "step": 242500 + }, + { + "epoch": 0.9079250083833611, + "grad_norm": 3.3966879844665527, + "learning_rate": 5.723388715699902e-05, + "loss": 2.1106, + "step": 243000 + }, + { + "epoch": 0.9097931668368248, + "grad_norm": 3.6091904640197754, + "learning_rate": 5.708896546422721e-05, + "loss": 2.1847, + "step": 243500 + }, + { + "epoch": 0.9116613252902884, + "grad_norm": 2.7571775913238525, + "learning_rate": 5.694369236403816e-05, + "loss": 2.1453, + "step": 244000 + }, + { + "epoch": 0.9135294837437522, + "grad_norm": 3.4625306129455566, + "learning_rate": 5.6798359469775195e-05, + "loss": 2.1599, + "step": 244500 + }, + { + "epoch": 0.9153976421972159, + "grad_norm": 2.573812246322632, + "learning_rate": 5.665296803294042e-05, + "loss": 2.1393, + "step": 245000 + }, + { + "epoch": 0.9172658006506796, + "grad_norm": 2.3979828357696533, + "learning_rate": 5.650751930554011e-05, + "loss": 2.1714, + "step": 245500 + }, + { + "epoch": 0.9191339591041433, + "grad_norm": 3.1871445178985596, + "learning_rate": 5.6362014540073884e-05, + "loss": 2.1164, + "step": 246000 + }, + { + "epoch": 0.921002117557607, + "grad_norm": 2.8169736862182617, + "learning_rate": 5.6216454989523906e-05, + "loss": 2.1343, + "step": 246500 + }, + { + "epoch": 0.9228702760110707, + "grad_norm": 3.2970011234283447, + "learning_rate": 5.607113318609965e-05, + "loss": 2.1403, + "step": 247000 + }, + { + "epoch": 0.9247384344645344, + "grad_norm": 2.7862350940704346, + "learning_rate": 5.5925467929508655e-05, + "loss": 2.148, + "step": 247500 + }, + { + "epoch": 0.9266065929179981, + "grad_norm": 2.888575553894043, + "learning_rate": 5.5779751647058663e-05, + "loss": 2.184, + "step": 248000 + }, + { + "epoch": 0.9284747513714618, + "grad_norm": 2.52675199508667, + "learning_rate": 5.56339855935533e-05, + "loss": 2.078, + "step": 248500 + }, + { + "epoch": 0.9303429098249255, + "grad_norm": 2.9500951766967773, + "learning_rate": 5.54881710242247e-05, + "loss": 2.1206, + "step": 249000 + }, + { + "epoch": 0.9322110682783893, + "grad_norm": 2.5412566661834717, + "learning_rate": 5.5342309194722885e-05, + "loss": 2.1395, + "step": 249500 + }, + { + "epoch": 0.934079226731853, + "grad_norm": 2.3108468055725098, + "learning_rate": 5.519640136110478e-05, + "loss": 2.1498, + "step": 250000 + }, + { + "epoch": 0.9359473851853166, + "grad_norm": 2.373042345046997, + "learning_rate": 5.505044877982351e-05, + "loss": 2.1532, + "step": 250500 + }, + { + "epoch": 0.9378155436387804, + "grad_norm": 2.997445821762085, + "learning_rate": 5.490474474242996e-05, + "loss": 2.1451, + "step": 251000 + }, + { + "epoch": 0.939683702092244, + "grad_norm": 2.837625741958618, + "learning_rate": 5.4758706519924406e-05, + "loss": 2.1425, + "step": 251500 + }, + { + "epoch": 0.9415518605457077, + "grad_norm": 2.954401731491089, + "learning_rate": 5.461262731886816e-05, + "loss": 2.1568, + "step": 252000 + }, + { + "epoch": 0.9434200189991715, + "grad_norm": 3.2825334072113037, + "learning_rate": 5.446650839719003e-05, + "loss": 2.15, + "step": 252500 + }, + { + "epoch": 0.9452881774526352, + "grad_norm": 3.196861505508423, + "learning_rate": 5.4320643365477844e-05, + "loss": 2.1278, + "step": 253000 + }, + { + "epoch": 0.9471563359060989, + "grad_norm": 2.7488534450531006, + "learning_rate": 5.417444885085084e-05, + "loss": 2.1859, + "step": 253500 + }, + { + "epoch": 0.9490244943595626, + "grad_norm": 2.5847301483154297, + "learning_rate": 5.4028218388879116e-05, + "loss": 2.1445, + "step": 254000 + }, + { + "epoch": 0.9508926528130263, + "grad_norm": 3.6500895023345947, + "learning_rate": 5.388195323879396e-05, + "loss": 2.1439, + "step": 254500 + }, + { + "epoch": 0.95276081126649, + "grad_norm": 2.848147392272949, + "learning_rate": 5.373594728980722e-05, + "loss": 2.1709, + "step": 255000 + }, + { + "epoch": 0.9546289697199537, + "grad_norm": 2.592301368713379, + "learning_rate": 5.35899092980915e-05, + "loss": 2.1306, + "step": 255500 + }, + { + "epoch": 0.9564971281734174, + "grad_norm": 1.9539679288864136, + "learning_rate": 5.344354776311128e-05, + "loss": 2.115, + "step": 256000 + }, + { + "epoch": 0.9583652866268811, + "grad_norm": 3.211258888244629, + "learning_rate": 5.329715657477968e-05, + "loss": 2.166, + "step": 256500 + }, + { + "epoch": 0.9602334450803448, + "grad_norm": 2.754812240600586, + "learning_rate": 5.31507369937121e-05, + "loss": 2.1639, + "step": 257000 + }, + { + "epoch": 0.9621016035338086, + "grad_norm": 2.349533796310425, + "learning_rate": 5.300458320043379e-05, + "loss": 2.155, + "step": 257500 + }, + { + "epoch": 0.9639697619872722, + "grad_norm": 3.3088858127593994, + "learning_rate": 5.285811066719044e-05, + "loss": 2.1429, + "step": 258000 + }, + { + "epoch": 0.9658379204407359, + "grad_norm": 3.420562505722046, + "learning_rate": 5.2711613521958034e-05, + "loss": 2.133, + "step": 258500 + }, + { + "epoch": 0.9677060788941997, + "grad_norm": 2.4579176902770996, + "learning_rate": 5.256509302626437e-05, + "loss": 2.1483, + "step": 259000 + }, + { + "epoch": 0.9695742373476633, + "grad_norm": 3.574404239654541, + "learning_rate": 5.241855044183839e-05, + "loss": 2.1599, + "step": 259500 + }, + { + "epoch": 0.971442395801127, + "grad_norm": 2.763312816619873, + "learning_rate": 5.227198703059918e-05, + "loss": 2.1175, + "step": 260000 + }, + { + "epoch": 0.9733105542545908, + "grad_norm": 3.4662206172943115, + "learning_rate": 5.2125404054645224e-05, + "loss": 2.1439, + "step": 260500 + }, + { + "epoch": 0.9751787127080545, + "grad_norm": 2.4736666679382324, + "learning_rate": 5.197880277624344e-05, + "loss": 2.166, + "step": 261000 + }, + { + "epoch": 0.9770468711615182, + "grad_norm": 2.448014974594116, + "learning_rate": 5.1832184457818365e-05, + "loss": 2.1184, + "step": 261500 + }, + { + "epoch": 0.9789150296149819, + "grad_norm": 2.605496644973755, + "learning_rate": 5.168584364503971e-05, + "loss": 2.0694, + "step": 262000 + }, + { + "epoch": 0.9807831880684456, + "grad_norm": 2.6576755046844482, + "learning_rate": 5.153919506218703e-05, + "loss": 2.1525, + "step": 262500 + }, + { + "epoch": 0.9826513465219093, + "grad_norm": 3.0602567195892334, + "learning_rate": 5.139253322489586e-05, + "loss": 2.12, + "step": 263000 + }, + { + "epoch": 0.984519504975373, + "grad_norm": 2.233271598815918, + "learning_rate": 5.124585939611224e-05, + "loss": 2.124, + "step": 263500 + }, + { + "epoch": 0.9863876634288367, + "grad_norm": 3.0819501876831055, + "learning_rate": 5.109946821786733e-05, + "loss": 2.1361, + "step": 264000 + }, + { + "epoch": 0.9882558218823004, + "grad_norm": 2.7308757305145264, + "learning_rate": 5.0952774213009e-05, + "loss": 2.1196, + "step": 264500 + }, + { + "epoch": 0.9901239803357641, + "grad_norm": 2.309229612350464, + "learning_rate": 5.080607200354588e-05, + "loss": 2.071, + "step": 265000 + }, + { + "epoch": 0.9919921387892279, + "grad_norm": 3.331204652786255, + "learning_rate": 5.065965627716091e-05, + "loss": 2.0675, + "step": 265500 + }, + { + "epoch": 0.9938602972426915, + "grad_norm": 3.6821019649505615, + "learning_rate": 5.051294145852407e-05, + "loss": 2.1329, + "step": 266000 + }, + { + "epoch": 0.9957284556961552, + "grad_norm": 1.9205609560012817, + "learning_rate": 5.036622222280509e-05, + "loss": 2.1563, + "step": 266500 + }, + { + "epoch": 0.997596614149619, + "grad_norm": 3.6985223293304443, + "learning_rate": 5.021949983344428e-05, + "loss": 2.139, + "step": 267000 + }, + { + "epoch": 0.9994647726030826, + "grad_norm": 3.8483798503875732, + "learning_rate": 5.007277555390912e-05, + "loss": 2.1531, + "step": 267500 + }, + { + "epoch": 1.0013329310565464, + "grad_norm": 2.758868932723999, + "learning_rate": 4.992605064768335e-05, + "loss": 2.0257, + "step": 268000 + }, + { + "epoch": 1.00320108951001, + "grad_norm": 2.7047057151794434, + "learning_rate": 4.9779619825319616e-05, + "loss": 1.9918, + "step": 268500 + }, + { + "epoch": 1.0050692479634737, + "grad_norm": 3.4775989055633545, + "learning_rate": 4.963289745111303e-05, + "loss": 1.9841, + "step": 269000 + }, + { + "epoch": 1.0069374064169374, + "grad_norm": 3.1174392700195312, + "learning_rate": 4.9486178238129e-05, + "loss": 1.9998, + "step": 269500 + }, + { + "epoch": 1.008805564870401, + "grad_norm": 3.418029546737671, + "learning_rate": 4.933946344980765e-05, + "loss": 2.0305, + "step": 270000 + }, + { + "epoch": 1.010673723323865, + "grad_norm": 4.21517276763916, + "learning_rate": 4.919275434955098e-05, + "loss": 1.9349, + "step": 270500 + }, + { + "epoch": 1.0125418817773286, + "grad_norm": 3.2260196208953857, + "learning_rate": 4.904605220071203e-05, + "loss": 1.9659, + "step": 271000 + }, + { + "epoch": 1.0144100402307923, + "grad_norm": 2.354206085205078, + "learning_rate": 4.889935826658396e-05, + "loss": 1.9459, + "step": 271500 + }, + { + "epoch": 1.016278198684256, + "grad_norm": 2.399245262145996, + "learning_rate": 4.8752967169003024e-05, + "loss": 1.9669, + "step": 272000 + }, + { + "epoch": 1.0181463571377196, + "grad_norm": 2.836991786956787, + "learning_rate": 4.8606293431139685e-05, + "loss": 1.9754, + "step": 272500 + }, + { + "epoch": 1.0200145155911835, + "grad_norm": 2.369506597518921, + "learning_rate": 4.845963169487281e-05, + "loss": 1.9748, + "step": 273000 + }, + { + "epoch": 1.0218826740446472, + "grad_norm": 4.3176140785217285, + "learning_rate": 4.831298322314752e-05, + "loss": 1.9874, + "step": 273500 + }, + { + "epoch": 1.0237508324981108, + "grad_norm": 2.473726749420166, + "learning_rate": 4.8166349278794803e-05, + "loss": 1.9784, + "step": 274000 + }, + { + "epoch": 1.0256189909515745, + "grad_norm": 3.3185558319091797, + "learning_rate": 4.8019731124520506e-05, + "loss": 2.0007, + "step": 274500 + }, + { + "epoch": 1.0274871494050382, + "grad_norm": 3.276498317718506, + "learning_rate": 4.787313002289445e-05, + "loss": 1.9758, + "step": 275000 + }, + { + "epoch": 1.029355307858502, + "grad_norm": 3.0989725589752197, + "learning_rate": 4.772654723633967e-05, + "loss": 2.0042, + "step": 275500 + }, + { + "epoch": 1.0312234663119657, + "grad_norm": 2.4186153411865234, + "learning_rate": 4.7580277133162835e-05, + "loss": 2.0053, + "step": 276000 + }, + { + "epoch": 1.0330916247654294, + "grad_norm": 2.4179837703704834, + "learning_rate": 4.74340277836311e-05, + "loss": 1.9908, + "step": 276500 + }, + { + "epoch": 1.034959783218893, + "grad_norm": 3.3896212577819824, + "learning_rate": 4.728750742427794e-05, + "loss": 1.9604, + "step": 277000 + }, + { + "epoch": 1.0368279416723567, + "grad_norm": 2.6385319232940674, + "learning_rate": 4.714101042295578e-05, + "loss": 1.9896, + "step": 277500 + }, + { + "epoch": 1.0386961001258204, + "grad_norm": 3.6427805423736572, + "learning_rate": 4.6994538041191235e-05, + "loss": 2.0044, + "step": 278000 + }, + { + "epoch": 1.0405642585792843, + "grad_norm": 3.0906810760498047, + "learning_rate": 4.684809154029888e-05, + "loss": 2.0074, + "step": 278500 + }, + { + "epoch": 1.042432417032748, + "grad_norm": 3.357675313949585, + "learning_rate": 4.67019649921625e-05, + "loss": 2.0337, + "step": 279000 + }, + { + "epoch": 1.0443005754862116, + "grad_norm": 3.163966655731201, + "learning_rate": 4.655557397799212e-05, + "loss": 1.9936, + "step": 279500 + }, + { + "epoch": 1.0461687339396752, + "grad_norm": 2.073416233062744, + "learning_rate": 4.640921262473603e-05, + "loss": 1.9917, + "step": 280000 + }, + { + "epoch": 1.048036892393139, + "grad_norm": 4.012736797332764, + "learning_rate": 4.626288219275275e-05, + "loss": 1.9811, + "step": 280500 + }, + { + "epoch": 1.0499050508466028, + "grad_norm": 3.065397262573242, + "learning_rate": 4.611658394213446e-05, + "loss": 2.0052, + "step": 281000 + }, + { + "epoch": 1.0517732093000665, + "grad_norm": 3.3266775608062744, + "learning_rate": 4.597061162810362e-05, + "loss": 1.997, + "step": 281500 + }, + { + "epoch": 1.0536413677535301, + "grad_norm": 2.940035820007324, + "learning_rate": 4.582438144871442e-05, + "loss": 1.9267, + "step": 282000 + }, + { + "epoch": 1.0555095262069938, + "grad_norm": 3.5627119541168213, + "learning_rate": 4.567818722674258e-05, + "loss": 1.973, + "step": 282500 + }, + { + "epoch": 1.0573776846604575, + "grad_norm": 2.702580213546753, + "learning_rate": 4.553203022110738e-05, + "loss": 1.9818, + "step": 283000 + }, + { + "epoch": 1.0592458431139213, + "grad_norm": 3.027751922607422, + "learning_rate": 4.538591169040759e-05, + "loss": 2.0195, + "step": 283500 + }, + { + "epoch": 1.061114001567385, + "grad_norm": 2.598694086074829, + "learning_rate": 4.5239832892910685e-05, + "loss": 1.9988, + "step": 284000 + }, + { + "epoch": 1.0629821600208487, + "grad_norm": 2.5287024974823, + "learning_rate": 4.5093795086541985e-05, + "loss": 1.9794, + "step": 284500 + }, + { + "epoch": 1.0648503184743123, + "grad_norm": 2.937054395675659, + "learning_rate": 4.494779952887383e-05, + "loss": 1.9804, + "step": 285000 + }, + { + "epoch": 1.066718476927776, + "grad_norm": 2.625366687774658, + "learning_rate": 4.48021393369639e-05, + "loss": 2.002, + "step": 285500 + }, + { + "epoch": 1.0685866353812399, + "grad_norm": 2.97308349609375, + "learning_rate": 4.465623195716817e-05, + "loss": 1.974, + "step": 286000 + }, + { + "epoch": 1.0704547938347035, + "grad_norm": 2.940298080444336, + "learning_rate": 4.4510370594051275e-05, + "loss": 1.9722, + "step": 286500 + }, + { + "epoch": 1.0723229522881672, + "grad_norm": 2.5476973056793213, + "learning_rate": 4.436455650366615e-05, + "loss": 2.0061, + "step": 287000 + }, + { + "epoch": 1.0741911107416309, + "grad_norm": 3.88171124458313, + "learning_rate": 4.4218790941658633e-05, + "loss": 1.9859, + "step": 287500 + }, + { + "epoch": 1.0760592691950945, + "grad_norm": 2.958958864212036, + "learning_rate": 4.407307516325668e-05, + "loss": 1.9929, + "step": 288000 + }, + { + "epoch": 1.0779274276485582, + "grad_norm": 3.2626969814300537, + "learning_rate": 4.3927410423259555e-05, + "loss": 2.0427, + "step": 288500 + }, + { + "epoch": 1.079795586102022, + "grad_norm": 2.726310968399048, + "learning_rate": 4.378208914789977e-05, + "loss": 1.9826, + "step": 289000 + }, + { + "epoch": 1.0816637445554858, + "grad_norm": 3.683236598968506, + "learning_rate": 4.36365301389968e-05, + "loss": 2.006, + "step": 289500 + }, + { + "epoch": 1.0835319030089494, + "grad_norm": 3.4819111824035645, + "learning_rate": 4.349102592770976e-05, + "loss": 1.9865, + "step": 290000 + }, + { + "epoch": 1.085400061462413, + "grad_norm": 3.417532444000244, + "learning_rate": 4.334557776701607e-05, + "loss": 1.9988, + "step": 290500 + }, + { + "epoch": 1.0872682199158767, + "grad_norm": 2.9879865646362305, + "learning_rate": 4.3200477633104895e-05, + "loss": 1.9888, + "step": 291000 + }, + { + "epoch": 1.0891363783693406, + "grad_norm": 2.8864903450012207, + "learning_rate": 4.305514521222923e-05, + "loss": 1.9602, + "step": 291500 + }, + { + "epoch": 1.0910045368228043, + "grad_norm": 3.8783183097839355, + "learning_rate": 4.290987259543744e-05, + "loss": 2.0115, + "step": 292000 + }, + { + "epoch": 1.092872695276268, + "grad_norm": 3.2339043617248535, + "learning_rate": 4.2764661033712623e-05, + "loss": 2.016, + "step": 292500 + }, + { + "epoch": 1.0947408537297316, + "grad_norm": 3.942629337310791, + "learning_rate": 4.261951177751206e-05, + "loss": 1.9975, + "step": 293000 + }, + { + "epoch": 1.0966090121831953, + "grad_norm": 5.084557056427002, + "learning_rate": 4.2474426076756546e-05, + "loss": 1.9484, + "step": 293500 + }, + { + "epoch": 1.098477170636659, + "grad_norm": 3.621943473815918, + "learning_rate": 4.2329405180819554e-05, + "loss": 1.9364, + "step": 294000 + }, + { + "epoch": 1.1003453290901228, + "grad_norm": 3.5090487003326416, + "learning_rate": 4.2184450338516527e-05, + "loss": 2.0112, + "step": 294500 + }, + { + "epoch": 1.1022134875435865, + "grad_norm": 4.1997246742248535, + "learning_rate": 4.204014221253661e-05, + "loss": 1.9631, + "step": 295000 + }, + { + "epoch": 1.1040816459970502, + "grad_norm": 3.7712690830230713, + "learning_rate": 4.189532294497906e-05, + "loss": 1.9428, + "step": 295500 + }, + { + "epoch": 1.1059498044505138, + "grad_norm": 4.392169952392578, + "learning_rate": 4.175057346905878e-05, + "loss": 2.0024, + "step": 296000 + }, + { + "epoch": 1.1078179629039775, + "grad_norm": 3.103431463241577, + "learning_rate": 4.160589503125397e-05, + "loss": 1.9671, + "step": 296500 + }, + { + "epoch": 1.1096861213574414, + "grad_norm": 2.2490739822387695, + "learning_rate": 4.1461288877431045e-05, + "loss": 1.9978, + "step": 297000 + }, + { + "epoch": 1.111554279810905, + "grad_norm": 3.9997470378875732, + "learning_rate": 4.1317045243873654e-05, + "loss": 1.9756, + "step": 297500 + }, + { + "epoch": 1.1134224382643687, + "grad_norm": 3.8243391513824463, + "learning_rate": 4.117258724232387e-05, + "loss": 1.9927, + "step": 298000 + }, + { + "epoch": 1.1152905967178324, + "grad_norm": 3.207801342010498, + "learning_rate": 4.102820525609035e-05, + "loss": 1.9807, + "step": 298500 + }, + { + "epoch": 1.117158755171296, + "grad_norm": 2.981112480163574, + "learning_rate": 4.08839005284867e-05, + "loss": 1.9757, + "step": 299000 + }, + { + "epoch": 1.11902691362476, + "grad_norm": 2.8603618144989014, + "learning_rate": 4.0739674302161204e-05, + "loss": 1.9882, + "step": 299500 + }, + { + "epoch": 1.1208950720782236, + "grad_norm": 3.422062635421753, + "learning_rate": 4.059552781908619e-05, + "loss": 1.9883, + "step": 300000 + }, + { + "epoch": 1.1227632305316873, + "grad_norm": 3.2499775886535645, + "learning_rate": 4.045146232054726e-05, + "loss": 1.9715, + "step": 300500 + }, + { + "epoch": 1.124631388985151, + "grad_norm": 3.5448482036590576, + "learning_rate": 4.030776693079458e-05, + "loss": 1.9895, + "step": 301000 + }, + { + "epoch": 1.1264995474386146, + "grad_norm": 3.52693510055542, + "learning_rate": 4.016386695421753e-05, + "loss": 1.9936, + "step": 301500 + }, + { + "epoch": 1.1283677058920785, + "grad_norm": 3.247986078262329, + "learning_rate": 4.002005167932884e-05, + "loss": 1.9916, + "step": 302000 + }, + { + "epoch": 1.1302358643455421, + "grad_norm": 3.287041425704956, + "learning_rate": 3.987632234456198e-05, + "loss": 1.971, + "step": 302500 + }, + { + "epoch": 1.1321040227990058, + "grad_norm": 2.758507251739502, + "learning_rate": 3.9732680187610403e-05, + "loss": 2.0091, + "step": 303000 + }, + { + "epoch": 1.1339721812524695, + "grad_norm": 2.9558610916137695, + "learning_rate": 3.958912644541679e-05, + "loss": 2.0046, + "step": 303500 + }, + { + "epoch": 1.1358403397059331, + "grad_norm": 3.0163705348968506, + "learning_rate": 3.944566235416254e-05, + "loss": 1.9902, + "step": 304000 + }, + { + "epoch": 1.1377084981593968, + "grad_norm": 2.4738314151763916, + "learning_rate": 3.9302289149256985e-05, + "loss": 1.969, + "step": 304500 + }, + { + "epoch": 1.1395766566128607, + "grad_norm": 3.352306604385376, + "learning_rate": 3.915929453473775e-05, + "loss": 1.9639, + "step": 305000 + }, + { + "epoch": 1.1414448150663243, + "grad_norm": 3.9805781841278076, + "learning_rate": 3.9016106617675985e-05, + "loss": 1.9703, + "step": 305500 + }, + { + "epoch": 1.143312973519788, + "grad_norm": 2.410222291946411, + "learning_rate": 3.8873013285987326e-05, + "loss": 1.9836, + "step": 306000 + }, + { + "epoch": 1.1451811319732517, + "grad_norm": 3.830815076828003, + "learning_rate": 3.873030167047204e-05, + "loss": 1.9474, + "step": 306500 + }, + { + "epoch": 1.1470492904267153, + "grad_norm": 3.884229898452759, + "learning_rate": 3.858740101002805e-05, + "loss": 1.9912, + "step": 307000 + }, + { + "epoch": 1.1489174488801792, + "grad_norm": 3.097529172897339, + "learning_rate": 3.8444598626660855e-05, + "loss": 1.9851, + "step": 307500 + }, + { + "epoch": 1.1507856073336429, + "grad_norm": 3.3618969917297363, + "learning_rate": 3.8301895750081664e-05, + "loss": 1.9897, + "step": 308000 + }, + { + "epoch": 1.1526537657871065, + "grad_norm": 2.846202850341797, + "learning_rate": 3.8159293609144794e-05, + "loss": 1.9649, + "step": 308500 + }, + { + "epoch": 1.1545219242405702, + "grad_norm": 3.3975071907043457, + "learning_rate": 3.801679343183709e-05, + "loss": 1.9611, + "step": 309000 + }, + { + "epoch": 1.1563900826940339, + "grad_norm": 3.390746831893921, + "learning_rate": 3.787468113544101e-05, + "loss": 1.9809, + "step": 309500 + }, + { + "epoch": 1.1582582411474975, + "grad_norm": 3.883208990097046, + "learning_rate": 3.773238835577244e-05, + "loss": 1.9741, + "step": 310000 + }, + { + "epoch": 1.1601263996009614, + "grad_norm": 2.655240535736084, + "learning_rate": 3.7590201215933385e-05, + "loss": 1.9929, + "step": 310500 + }, + { + "epoch": 1.161994558054425, + "grad_norm": 3.561328649520874, + "learning_rate": 3.7448120940337014e-05, + "loss": 1.9941, + "step": 311000 + }, + { + "epoch": 1.1638627165078888, + "grad_norm": 4.378994464874268, + "learning_rate": 3.7306148752476284e-05, + "loss": 1.9692, + "step": 311500 + }, + { + "epoch": 1.1657308749613524, + "grad_norm": 2.515988826751709, + "learning_rate": 3.716428587491332e-05, + "loss": 1.9721, + "step": 312000 + }, + { + "epoch": 1.1675990334148163, + "grad_norm": 2.2535147666931152, + "learning_rate": 3.702253352926898e-05, + "loss": 1.9904, + "step": 312500 + }, + { + "epoch": 1.16946719186828, + "grad_norm": 3.65279483795166, + "learning_rate": 3.688117610505848e-05, + "loss": 1.8969, + "step": 313000 + }, + { + "epoch": 1.1713353503217436, + "grad_norm": 3.5840914249420166, + "learning_rate": 3.6739648257134945e-05, + "loss": 1.9981, + "step": 313500 + }, + { + "epoch": 1.1732035087752073, + "grad_norm": 4.6728973388671875, + "learning_rate": 3.659823459780314e-05, + "loss": 2.0034, + "step": 314000 + }, + { + "epoch": 1.175071667228671, + "grad_norm": 3.8465287685394287, + "learning_rate": 3.6456936344815585e-05, + "loss": 1.9575, + "step": 314500 + }, + { + "epoch": 1.1769398256821346, + "grad_norm": 3.005547046661377, + "learning_rate": 3.631603696099265e-05, + "loss": 1.9799, + "step": 315000 + }, + { + "epoch": 1.1788079841355985, + "grad_norm": 3.0555107593536377, + "learning_rate": 3.617497293307507e-05, + "loss": 1.9681, + "step": 315500 + }, + { + "epoch": 1.1806761425890622, + "grad_norm": 3.1861069202423096, + "learning_rate": 3.6034027956326125e-05, + "loss": 2.0004, + "step": 316000 + }, + { + "epoch": 1.1825443010425258, + "grad_norm": 3.5906646251678467, + "learning_rate": 3.589320324446236e-05, + "loss": 1.984, + "step": 316500 + }, + { + "epoch": 1.1844124594959895, + "grad_norm": 3.118577480316162, + "learning_rate": 3.5752500010164694e-05, + "loss": 2.0166, + "step": 317000 + }, + { + "epoch": 1.1862806179494532, + "grad_norm": 3.639019727706909, + "learning_rate": 3.561220050290951e-05, + "loss": 1.9152, + "step": 317500 + }, + { + "epoch": 1.188148776402917, + "grad_norm": 2.516979455947876, + "learning_rate": 3.547174360858504e-05, + "loss": 1.9838, + "step": 318000 + }, + { + "epoch": 1.1900169348563807, + "grad_norm": 4.030247688293457, + "learning_rate": 3.5331411821133284e-05, + "loss": 1.9957, + "step": 318500 + }, + { + "epoch": 1.1918850933098444, + "grad_norm": 2.944655656814575, + "learning_rate": 3.519120634899048e-05, + "loss": 1.9557, + "step": 319000 + }, + { + "epoch": 1.193753251763308, + "grad_norm": 2.9035158157348633, + "learning_rate": 3.505112839950505e-05, + "loss": 1.9852, + "step": 319500 + }, + { + "epoch": 1.1956214102167717, + "grad_norm": 4.2154364585876465, + "learning_rate": 3.491117917892734e-05, + "loss": 1.9863, + "step": 320000 + }, + { + "epoch": 1.1974895686702354, + "grad_norm": 3.7261621952056885, + "learning_rate": 3.4771359892399204e-05, + "loss": 1.9478, + "step": 320500 + }, + { + "epoch": 1.1993577271236993, + "grad_norm": 4.7101240158081055, + "learning_rate": 3.463195098856492e-05, + "loss": 1.9688, + "step": 321000 + }, + { + "epoch": 1.201225885577163, + "grad_norm": 3.4447665214538574, + "learning_rate": 3.44923949151937e-05, + "loss": 1.9768, + "step": 321500 + }, + { + "epoch": 1.2030940440306266, + "grad_norm": 2.6960058212280273, + "learning_rate": 3.4352972382140294e-05, + "loss": 1.9639, + "step": 322000 + }, + { + "epoch": 1.2049622024840903, + "grad_norm": 3.2135891914367676, + "learning_rate": 3.421368459001103e-05, + "loss": 2.0298, + "step": 322500 + }, + { + "epoch": 1.206830360937554, + "grad_norm": 3.953632116317749, + "learning_rate": 3.4074532738252e-05, + "loss": 2.0028, + "step": 323000 + }, + { + "epoch": 1.2086985193910178, + "grad_norm": 3.091557025909424, + "learning_rate": 3.393551802513865e-05, + "loss": 1.9353, + "step": 323500 + }, + { + "epoch": 1.2105666778444815, + "grad_norm": 3.2774996757507324, + "learning_rate": 3.379664164776548e-05, + "loss": 1.9976, + "step": 324000 + }, + { + "epoch": 1.2124348362979451, + "grad_norm": 4.057534694671631, + "learning_rate": 3.365790480203579e-05, + "loss": 1.9577, + "step": 324500 + }, + { + "epoch": 1.2143029947514088, + "grad_norm": 3.725080728530884, + "learning_rate": 3.351958573365166e-05, + "loss": 1.9619, + "step": 325000 + }, + { + "epoch": 1.2161711532048725, + "grad_norm": 2.542310953140259, + "learning_rate": 3.338140801561512e-05, + "loss": 1.9413, + "step": 325500 + }, + { + "epoch": 1.2180393116583361, + "grad_norm": 3.8798625469207764, + "learning_rate": 3.324309635334674e-05, + "loss": 1.9272, + "step": 326000 + }, + { + "epoch": 1.2199074701118, + "grad_norm": 2.8388006687164307, + "learning_rate": 3.310492898945492e-05, + "loss": 1.9717, + "step": 326500 + }, + { + "epoch": 1.2217756285652637, + "grad_norm": 3.845374822616577, + "learning_rate": 3.296690711373742e-05, + "loss": 1.9995, + "step": 327000 + }, + { + "epoch": 1.2236437870187273, + "grad_norm": 3.3350958824157715, + "learning_rate": 3.282903191473914e-05, + "loss": 1.9505, + "step": 327500 + }, + { + "epoch": 1.225511945472191, + "grad_norm": 3.514188289642334, + "learning_rate": 3.2691304579741944e-05, + "loss": 1.9493, + "step": 328000 + }, + { + "epoch": 1.2273801039256549, + "grad_norm": 4.140675067901611, + "learning_rate": 3.255372629475436e-05, + "loss": 1.9381, + "step": 328500 + }, + { + "epoch": 1.2292482623791186, + "grad_norm": 3.2821719646453857, + "learning_rate": 3.241629824450141e-05, + "loss": 1.9647, + "step": 329000 + }, + { + "epoch": 1.2311164208325822, + "grad_norm": 3.671809434890747, + "learning_rate": 3.227929601377734e-05, + "loss": 1.948, + "step": 329500 + }, + { + "epoch": 1.2329845792860459, + "grad_norm": 4.461349010467529, + "learning_rate": 3.214244577120278e-05, + "loss": 1.9533, + "step": 330000 + }, + { + "epoch": 1.2348527377395095, + "grad_norm": 4.116054058074951, + "learning_rate": 3.200547490304101e-05, + "loss": 1.9278, + "step": 330500 + }, + { + "epoch": 1.2367208961929732, + "grad_norm": 3.0734941959381104, + "learning_rate": 3.1868658990759734e-05, + "loss": 1.9038, + "step": 331000 + }, + { + "epoch": 1.238589054646437, + "grad_norm": 4.233485698699951, + "learning_rate": 3.173199921251894e-05, + "loss": 1.9466, + "step": 331500 + }, + { + "epoch": 1.2404572130999008, + "grad_norm": 3.6610071659088135, + "learning_rate": 3.159549674513415e-05, + "loss": 1.9437, + "step": 332000 + }, + { + "epoch": 1.2423253715533644, + "grad_norm": 3.757662773132324, + "learning_rate": 3.145915276406623e-05, + "loss": 1.9695, + "step": 332500 + }, + { + "epoch": 1.244193530006828, + "grad_norm": 4.0608062744140625, + "learning_rate": 3.1322968443411296e-05, + "loss": 1.9398, + "step": 333000 + }, + { + "epoch": 1.2460616884602917, + "grad_norm": 3.5959203243255615, + "learning_rate": 3.118694495589054e-05, + "loss": 1.9154, + "step": 333500 + }, + { + "epoch": 1.2479298469137556, + "grad_norm": 4.01427698135376, + "learning_rate": 3.105135503334797e-05, + "loss": 1.9268, + "step": 334000 + }, + { + "epoch": 1.2497980053672193, + "grad_norm": 4.18043851852417, + "learning_rate": 3.091565639719372e-05, + "loss": 1.9349, + "step": 334500 + }, + { + "epoch": 1.251666163820683, + "grad_norm": 3.132768154144287, + "learning_rate": 3.0780122101651435e-05, + "loss": 1.9476, + "step": 335000 + }, + { + "epoch": 1.2535343222741466, + "grad_norm": 2.99275803565979, + "learning_rate": 3.0644753313844755e-05, + "loss": 1.9625, + "step": 335500 + }, + { + "epoch": 1.2554024807276103, + "grad_norm": 3.58479380607605, + "learning_rate": 3.0509551199472118e-05, + "loss": 1.9545, + "step": 336000 + }, + { + "epoch": 1.257270639181074, + "grad_norm": 3.13480544090271, + "learning_rate": 3.0374786823074896e-05, + "loss": 1.9398, + "step": 336500 + }, + { + "epoch": 1.2591387976345378, + "grad_norm": 3.130760431289673, + "learning_rate": 3.0239921207753986e-05, + "loss": 1.9582, + "step": 337000 + }, + { + "epoch": 1.2610069560880015, + "grad_norm": 3.4282748699188232, + "learning_rate": 3.0105225751989453e-05, + "loss": 1.9285, + "step": 337500 + }, + { + "epoch": 1.2628751145414652, + "grad_norm": 3.996558666229248, + "learning_rate": 2.9970701615681463e-05, + "loss": 1.9397, + "step": 338000 + }, + { + "epoch": 1.2647432729949288, + "grad_norm": 3.9144933223724365, + "learning_rate": 2.9836349957254927e-05, + "loss": 1.9361, + "step": 338500 + }, + { + "epoch": 1.2666114314483927, + "grad_norm": 2.7201411724090576, + "learning_rate": 2.9702171933649482e-05, + "loss": 1.9221, + "step": 339000 + }, + { + "epoch": 1.2684795899018564, + "grad_norm": 3.485480785369873, + "learning_rate": 2.956843653156831e-05, + "loss": 1.951, + "step": 339500 + }, + { + "epoch": 1.27034774835532, + "grad_norm": 4.514249324798584, + "learning_rate": 2.943460888939414e-05, + "loss": 1.9556, + "step": 340000 + }, + { + "epoch": 1.2722159068087837, + "grad_norm": 3.043680429458618, + "learning_rate": 2.930095834154558e-05, + "loss": 1.9673, + "step": 340500 + }, + { + "epoch": 1.2740840652622474, + "grad_norm": 2.636143207550049, + "learning_rate": 2.9167486038924823e-05, + "loss": 1.9492, + "step": 341000 + }, + { + "epoch": 1.275952223715711, + "grad_norm": 3.6190054416656494, + "learning_rate": 2.9034193130899155e-05, + "loss": 1.9648, + "step": 341500 + }, + { + "epoch": 1.2778203821691747, + "grad_norm": 4.245516777038574, + "learning_rate": 2.890108076529099e-05, + "loss": 1.9589, + "step": 342000 + }, + { + "epoch": 1.2796885406226386, + "grad_norm": 3.619927406311035, + "learning_rate": 2.876841576763556e-05, + "loss": 1.9439, + "step": 342500 + }, + { + "epoch": 1.2815566990761023, + "grad_norm": 3.657912015914917, + "learning_rate": 2.863566755729298e-05, + "loss": 1.9564, + "step": 343000 + }, + { + "epoch": 1.283424857529566, + "grad_norm": 3.4643499851226807, + "learning_rate": 2.8503103321182943e-05, + "loss": 1.9754, + "step": 343500 + }, + { + "epoch": 1.2852930159830296, + "grad_norm": 4.774941444396973, + "learning_rate": 2.8370724200853072e-05, + "loss": 1.9406, + "step": 344000 + }, + { + "epoch": 1.2871611744364935, + "grad_norm": 3.5722765922546387, + "learning_rate": 2.8238531336256975e-05, + "loss": 1.9708, + "step": 344500 + }, + { + "epoch": 1.2890293328899571, + "grad_norm": 3.9576704502105713, + "learning_rate": 2.8106525865744272e-05, + "loss": 1.9503, + "step": 345000 + }, + { + "epoch": 1.2908974913434208, + "grad_norm": 4.773796558380127, + "learning_rate": 2.7974972371021873e-05, + "loss": 1.967, + "step": 345500 + }, + { + "epoch": 1.2927656497968845, + "grad_norm": 3.749734401702881, + "learning_rate": 2.784334471679681e-05, + "loss": 1.9484, + "step": 346000 + }, + { + "epoch": 1.2946338082503481, + "grad_norm": 4.330195903778076, + "learning_rate": 2.7711907859717524e-05, + "loss": 1.9094, + "step": 346500 + }, + { + "epoch": 1.2965019667038118, + "grad_norm": 3.0685718059539795, + "learning_rate": 2.758066293162346e-05, + "loss": 1.9195, + "step": 347000 + }, + { + "epoch": 1.2983701251572755, + "grad_norm": 3.8571877479553223, + "learning_rate": 2.7449611062701342e-05, + "loss": 1.9457, + "step": 347500 + }, + { + "epoch": 1.3002382836107393, + "grad_norm": 3.673949718475342, + "learning_rate": 2.731875338147545e-05, + "loss": 1.9046, + "step": 348000 + }, + { + "epoch": 1.302106442064203, + "grad_norm": 3.5845327377319336, + "learning_rate": 2.7188091014797774e-05, + "loss": 1.9871, + "step": 348500 + }, + { + "epoch": 1.3039746005176667, + "grad_norm": 5.045246124267578, + "learning_rate": 2.7057885822898532e-05, + "loss": 1.9445, + "step": 349000 + }, + { + "epoch": 1.3058427589711303, + "grad_norm": 4.416993141174316, + "learning_rate": 2.692761706288961e-05, + "loss": 1.9242, + "step": 349500 + }, + { + "epoch": 1.3077109174245942, + "grad_norm": 5.05975341796875, + "learning_rate": 2.6797546985612997e-05, + "loss": 1.9729, + "step": 350000 + }, + { + "epoch": 1.3095790758780579, + "grad_norm": 3.4689128398895264, + "learning_rate": 2.6667676711138423e-05, + "loss": 1.9479, + "step": 350500 + }, + { + "epoch": 1.3114472343315215, + "grad_norm": 3.177008628845215, + "learning_rate": 2.6538266495259985e-05, + "loss": 1.9456, + "step": 351000 + }, + { + "epoch": 1.3133153927849852, + "grad_norm": 3.6939172744750977, + "learning_rate": 2.6408798774518146e-05, + "loss": 1.934, + "step": 351500 + }, + { + "epoch": 1.3151835512384489, + "grad_norm": 4.592978477478027, + "learning_rate": 2.6279534204197788e-05, + "loss": 1.8931, + "step": 352000 + }, + { + "epoch": 1.3170517096919125, + "grad_norm": 4.249555587768555, + "learning_rate": 2.6150473897432166e-05, + "loss": 1.9352, + "step": 352500 + }, + { + "epoch": 1.3189198681453764, + "grad_norm": 3.4636592864990234, + "learning_rate": 2.6021876469757334e-05, + "loss": 1.9227, + "step": 353000 + }, + { + "epoch": 1.32078802659884, + "grad_norm": 3.9055769443511963, + "learning_rate": 2.5893227608380464e-05, + "loss": 2.0114, + "step": 353500 + }, + { + "epoch": 1.3226561850523038, + "grad_norm": 3.659078359603882, + "learning_rate": 2.576478633715232e-05, + "loss": 1.9675, + "step": 354000 + }, + { + "epoch": 1.3245243435057674, + "grad_norm": 4.109720230102539, + "learning_rate": 2.563655376211658e-05, + "loss": 1.9515, + "step": 354500 + }, + { + "epoch": 1.3263925019592313, + "grad_norm": 3.4679160118103027, + "learning_rate": 2.550853098751974e-05, + "loss": 1.965, + "step": 355000 + }, + { + "epoch": 1.328260660412695, + "grad_norm": 3.3445444107055664, + "learning_rate": 2.538097452833215e-05, + "loss": 1.9422, + "step": 355500 + }, + { + "epoch": 1.3301288188661586, + "grad_norm": 4.475471496582031, + "learning_rate": 2.5253374235012317e-05, + "loss": 1.9533, + "step": 356000 + }, + { + "epoch": 1.3319969773196223, + "grad_norm": 3.064134359359741, + "learning_rate": 2.5125987041797306e-05, + "loss": 1.9263, + "step": 356500 + }, + { + "epoch": 1.333865135773086, + "grad_norm": 3.313082218170166, + "learning_rate": 2.4998814045653785e-05, + "loss": 1.8802, + "step": 357000 + }, + { + "epoch": 1.3357332942265496, + "grad_norm": 5.206328392028809, + "learning_rate": 2.4872110041523282e-05, + "loss": 1.8967, + "step": 357500 + }, + { + "epoch": 1.3376014526800133, + "grad_norm": 4.334334373474121, + "learning_rate": 2.4745368289174596e-05, + "loss": 1.9429, + "step": 358000 + }, + { + "epoch": 1.3394696111334772, + "grad_norm": 5.680240154266357, + "learning_rate": 2.4618844011511794e-05, + "loss": 1.9209, + "step": 358500 + }, + { + "epoch": 1.3413377695869408, + "grad_norm": 3.261059284210205, + "learning_rate": 2.449253829807073e-05, + "loss": 1.9251, + "step": 359000 + }, + { + "epoch": 1.3432059280404045, + "grad_norm": 3.2310187816619873, + "learning_rate": 2.4366704188693773e-05, + "loss": 1.9056, + "step": 359500 + }, + { + "epoch": 1.3450740864938682, + "grad_norm": 4.145471096038818, + "learning_rate": 2.424083842220842e-05, + "loss": 1.926, + "step": 360000 + }, + { + "epoch": 1.346942244947332, + "grad_norm": 4.704455852508545, + "learning_rate": 2.411519447505653e-05, + "loss": 1.9485, + "step": 360500 + }, + { + "epoch": 1.3488104034007957, + "grad_norm": 3.9618282318115234, + "learning_rate": 2.3989773429193175e-05, + "loss": 1.9304, + "step": 361000 + }, + { + "epoch": 1.3506785618542594, + "grad_norm": 3.921598434448242, + "learning_rate": 2.3864576364654012e-05, + "loss": 1.91, + "step": 361500 + }, + { + "epoch": 1.352546720307723, + "grad_norm": 4.026153087615967, + "learning_rate": 2.3739604359545953e-05, + "loss": 1.9588, + "step": 362000 + }, + { + "epoch": 1.3544148787611867, + "grad_norm": 3.6452534198760986, + "learning_rate": 2.3615107755379164e-05, + "loss": 1.9613, + "step": 362500 + }, + { + "epoch": 1.3562830372146504, + "grad_norm": 3.757392406463623, + "learning_rate": 2.349058864020204e-05, + "loss": 1.9386, + "step": 363000 + }, + { + "epoch": 1.358151195668114, + "grad_norm": 4.3105902671813965, + "learning_rate": 2.3366297804968707e-05, + "loss": 1.9171, + "step": 363500 + }, + { + "epoch": 1.360019354121578, + "grad_norm": 4.3953938484191895, + "learning_rate": 2.3242236319982296e-05, + "loss": 1.9274, + "step": 364000 + }, + { + "epoch": 1.3618875125750416, + "grad_norm": 3.9918718338012695, + "learning_rate": 2.3118652685036857e-05, + "loss": 1.9505, + "step": 364500 + }, + { + "epoch": 1.3637556710285053, + "grad_norm": 4.170524597167969, + "learning_rate": 2.2995052639511584e-05, + "loss": 1.9666, + "step": 365000 + }, + { + "epoch": 1.365623829481969, + "grad_norm": 2.33520245552063, + "learning_rate": 2.2871685141129013e-05, + "loss": 1.8909, + "step": 365500 + }, + { + "epoch": 1.3674919879354328, + "grad_norm": 3.8575286865234375, + "learning_rate": 2.2748551252241096e-05, + "loss": 1.9036, + "step": 366000 + }, + { + "epoch": 1.3693601463888965, + "grad_norm": 3.738067150115967, + "learning_rate": 2.262589759672201e-05, + "loss": 1.9242, + "step": 366500 + }, + { + "epoch": 1.3712283048423601, + "grad_norm": 3.2097079753875732, + "learning_rate": 2.2503233633312364e-05, + "loss": 1.9669, + "step": 367000 + }, + { + "epoch": 1.3730964632958238, + "grad_norm": 4.111919403076172, + "learning_rate": 2.2380806452236224e-05, + "loss": 1.9115, + "step": 367500 + }, + { + "epoch": 1.3749646217492875, + "grad_norm": 3.6487059593200684, + "learning_rate": 2.2258617107748202e-05, + "loss": 1.9221, + "step": 368000 + }, + { + "epoch": 1.3768327802027511, + "grad_norm": 3.9140658378601074, + "learning_rate": 2.213666665205488e-05, + "loss": 1.9077, + "step": 368500 + }, + { + "epoch": 1.378700938656215, + "grad_norm": 4.236271858215332, + "learning_rate": 2.2015199316183162e-05, + "loss": 1.9248, + "step": 369000 + }, + { + "epoch": 1.3805690971096787, + "grad_norm": 3.9722940921783447, + "learning_rate": 2.189372930344269e-05, + "loss": 1.9075, + "step": 369500 + }, + { + "epoch": 1.3824372555631423, + "grad_norm": 3.9439289569854736, + "learning_rate": 2.1772501321647675e-05, + "loss": 1.9325, + "step": 370000 + }, + { + "epoch": 1.384305414016606, + "grad_norm": 3.183210611343384, + "learning_rate": 2.1651516414726137e-05, + "loss": 1.9372, + "step": 370500 + }, + { + "epoch": 1.38617357247007, + "grad_norm": 4.380889892578125, + "learning_rate": 2.1530775624512915e-05, + "loss": 1.9119, + "step": 371000 + }, + { + "epoch": 1.3880417309235336, + "grad_norm": 3.137747049331665, + "learning_rate": 2.1410520736652044e-05, + "loss": 1.8852, + "step": 371500 + }, + { + "epoch": 1.3899098893769972, + "grad_norm": 4.502001762390137, + "learning_rate": 2.129027080352e-05, + "loss": 1.9157, + "step": 372000 + }, + { + "epoch": 1.3917780478304609, + "grad_norm": 3.3394224643707275, + "learning_rate": 2.1170268097883096e-05, + "loss": 1.9329, + "step": 372500 + }, + { + "epoch": 1.3936462062839245, + "grad_norm": 3.0865299701690674, + "learning_rate": 2.1050513653118137e-05, + "loss": 1.9178, + "step": 373000 + }, + { + "epoch": 1.3955143647373882, + "grad_norm": 4.535000324249268, + "learning_rate": 2.0931247261291493e-05, + "loss": 1.9163, + "step": 373500 + }, + { + "epoch": 1.3973825231908519, + "grad_norm": 3.5877630710601807, + "learning_rate": 2.0811991928172553e-05, + "loss": 1.9437, + "step": 374000 + }, + { + "epoch": 1.3992506816443158, + "grad_norm": 4.446563243865967, + "learning_rate": 2.0692987941141717e-05, + "loss": 1.9458, + "step": 374500 + }, + { + "epoch": 1.4011188400977794, + "grad_norm": 3.427525758743286, + "learning_rate": 2.0574236324975526e-05, + "loss": 1.9163, + "step": 375000 + }, + { + "epoch": 1.402986998551243, + "grad_norm": 4.324997901916504, + "learning_rate": 2.0455974845157404e-05, + "loss": 1.9447, + "step": 375500 + }, + { + "epoch": 1.4048551570047068, + "grad_norm": 4.460984706878662, + "learning_rate": 2.0337730526503722e-05, + "loss": 1.8936, + "step": 376000 + }, + { + "epoch": 1.4067233154581706, + "grad_norm": 3.0335512161254883, + "learning_rate": 2.0219741637935503e-05, + "loss": 1.9274, + "step": 376500 + }, + { + "epoch": 1.4085914739116343, + "grad_norm": 3.983215808868408, + "learning_rate": 2.010200919548798e-05, + "loss": 1.9456, + "step": 377000 + }, + { + "epoch": 1.410459632365098, + "grad_norm": 4.645228385925293, + "learning_rate": 1.9984534212988126e-05, + "loss": 1.8914, + "step": 377500 + }, + { + "epoch": 1.4123277908185616, + "grad_norm": 4.4612250328063965, + "learning_rate": 1.986755187644178e-05, + "loss": 1.9379, + "step": 378000 + }, + { + "epoch": 1.4141959492720253, + "grad_norm": 3.9466419219970703, + "learning_rate": 1.9750594326473332e-05, + "loss": 1.9053, + "step": 378500 + }, + { + "epoch": 1.416064107725489, + "grad_norm": 3.384223461151123, + "learning_rate": 1.9633897262584083e-05, + "loss": 1.9777, + "step": 379000 + }, + { + "epoch": 1.4179322661789528, + "grad_norm": 3.591265916824341, + "learning_rate": 1.9517461689685075e-05, + "loss": 1.9357, + "step": 379500 + }, + { + "epoch": 1.4198004246324165, + "grad_norm": 4.8993730545043945, + "learning_rate": 1.9401520693960035e-05, + "loss": 1.9063, + "step": 380000 + }, + { + "epoch": 1.4216685830858802, + "grad_norm": 4.398604869842529, + "learning_rate": 1.9285610580773773e-05, + "loss": 1.8615, + "step": 380500 + }, + { + "epoch": 1.4235367415393438, + "grad_norm": 3.6538774967193604, + "learning_rate": 1.916996495777159e-05, + "loss": 1.9166, + "step": 381000 + }, + { + "epoch": 1.4254048999928077, + "grad_norm": 3.730799436569214, + "learning_rate": 1.905458482081028e-05, + "loss": 1.8853, + "step": 381500 + }, + { + "epoch": 1.4272730584462714, + "grad_norm": 5.199082851409912, + "learning_rate": 1.8939701124169172e-05, + "loss": 1.8736, + "step": 382000 + }, + { + "epoch": 1.429141216899735, + "grad_norm": 4.507551670074463, + "learning_rate": 1.8824854401777008e-05, + "loss": 1.9045, + "step": 382500 + }, + { + "epoch": 1.4310093753531987, + "grad_norm": 2.917692184448242, + "learning_rate": 1.8710276137269065e-05, + "loss": 1.8737, + "step": 383000 + }, + { + "epoch": 1.4328775338066624, + "grad_norm": 4.9208221435546875, + "learning_rate": 1.8595967317310803e-05, + "loss": 1.8852, + "step": 383500 + }, + { + "epoch": 1.434745692260126, + "grad_norm": 4.914313793182373, + "learning_rate": 1.8481928926247323e-05, + "loss": 1.9188, + "step": 384000 + }, + { + "epoch": 1.4366138507135897, + "grad_norm": 4.2889556884765625, + "learning_rate": 1.836838920853576e-05, + "loss": 1.9626, + "step": 384500 + }, + { + "epoch": 1.4384820091670536, + "grad_norm": 4.040252208709717, + "learning_rate": 1.8254894073216665e-05, + "loss": 1.9157, + "step": 385000 + }, + { + "epoch": 1.4403501676205173, + "grad_norm": 4.800929546356201, + "learning_rate": 1.8141672303869356e-05, + "loss": 1.8893, + "step": 385500 + }, + { + "epoch": 1.442218326073981, + "grad_norm": 3.5540807247161865, + "learning_rate": 1.8028724875478063e-05, + "loss": 1.9504, + "step": 386000 + }, + { + "epoch": 1.4440864845274446, + "grad_norm": 3.3006908893585205, + "learning_rate": 1.791627782948606e-05, + "loss": 1.9409, + "step": 386500 + }, + { + "epoch": 1.4459546429809085, + "grad_norm": 2.976499080657959, + "learning_rate": 1.7803881444967192e-05, + "loss": 1.9083, + "step": 387000 + }, + { + "epoch": 1.4478228014343721, + "grad_norm": 4.687767505645752, + "learning_rate": 1.7691762310215786e-05, + "loss": 1.9419, + "step": 387500 + }, + { + "epoch": 1.4496909598878358, + "grad_norm": 4.436933517456055, + "learning_rate": 1.7579921390721e-05, + "loss": 1.9205, + "step": 388000 + }, + { + "epoch": 1.4515591183412995, + "grad_norm": 4.451811790466309, + "learning_rate": 1.7468582493799596e-05, + "loss": 1.9, + "step": 388500 + }, + { + "epoch": 1.4534272767947631, + "grad_norm": 4.564020156860352, + "learning_rate": 1.7357300330458897e-05, + "loss": 1.8913, + "step": 389000 + }, + { + "epoch": 1.4552954352482268, + "grad_norm": 3.211652994155884, + "learning_rate": 1.724629926252035e-05, + "loss": 1.8884, + "step": 389500 + }, + { + "epoch": 1.4571635937016905, + "grad_norm": 4.224535942077637, + "learning_rate": 1.7135580245845107e-05, + "loss": 1.9185, + "step": 390000 + }, + { + "epoch": 1.4590317521551543, + "grad_norm": 3.9640257358551025, + "learning_rate": 1.7025364822818328e-05, + "loss": 1.9193, + "step": 390500 + }, + { + "epoch": 1.460899910608618, + "grad_norm": 3.1013686656951904, + "learning_rate": 1.6915212197670978e-05, + "loss": 1.9274, + "step": 391000 + }, + { + "epoch": 1.4627680690620817, + "grad_norm": 5.020761966705322, + "learning_rate": 1.68053444748701e-05, + "loss": 1.8856, + "step": 391500 + }, + { + "epoch": 1.4646362275155453, + "grad_norm": 3.306040048599243, + "learning_rate": 1.6695762600517374e-05, + "loss": 1.9403, + "step": 392000 + }, + { + "epoch": 1.4665043859690092, + "grad_norm": 4.234299182891846, + "learning_rate": 1.658668582157294e-05, + "loss": 1.8777, + "step": 392500 + }, + { + "epoch": 1.468372544422473, + "grad_norm": 6.068370342254639, + "learning_rate": 1.6477677896163034e-05, + "loss": 1.8937, + "step": 393000 + }, + { + "epoch": 1.4702407028759366, + "grad_norm": 4.372175216674805, + "learning_rate": 1.636895864082966e-05, + "loss": 1.9034, + "step": 393500 + }, + { + "epoch": 1.4721088613294002, + "grad_norm": 4.099493980407715, + "learning_rate": 1.6260528991784696e-05, + "loss": 1.9204, + "step": 394000 + }, + { + "epoch": 1.4739770197828639, + "grad_norm": 3.7667877674102783, + "learning_rate": 1.6152389882746138e-05, + "loss": 1.9014, + "step": 394500 + }, + { + "epoch": 1.4758451782363275, + "grad_norm": 2.797348976135254, + "learning_rate": 1.60447576486997e-05, + "loss": 1.9077, + "step": 395000 + }, + { + "epoch": 1.4777133366897914, + "grad_norm": 4.806083679199219, + "learning_rate": 1.593720182508714e-05, + "loss": 1.9239, + "step": 395500 + }, + { + "epoch": 1.479581495143255, + "grad_norm": 4.35167121887207, + "learning_rate": 1.58299393257415e-05, + "loss": 1.9147, + "step": 396000 + }, + { + "epoch": 1.4814496535967188, + "grad_norm": 7.256587982177734, + "learning_rate": 1.5722971074330122e-05, + "loss": 1.9101, + "step": 396500 + }, + { + "epoch": 1.4833178120501824, + "grad_norm": 4.269795894622803, + "learning_rate": 1.5616511042961456e-05, + "loss": 1.9253, + "step": 397000 + }, + { + "epoch": 1.4851859705036463, + "grad_norm": 3.5930633544921875, + "learning_rate": 1.551013345518685e-05, + "loss": 1.9399, + "step": 397500 + }, + { + "epoch": 1.48705412895711, + "grad_norm": 4.802802085876465, + "learning_rate": 1.5404052869284143e-05, + "loss": 1.924, + "step": 398000 + }, + { + "epoch": 1.4889222874105736, + "grad_norm": 5.457955360412598, + "learning_rate": 1.5298270198742908e-05, + "loss": 1.925, + "step": 398500 + }, + { + "epoch": 1.4907904458640373, + "grad_norm": 4.350592613220215, + "learning_rate": 1.5192997023342925e-05, + "loss": 1.9841, + "step": 399000 + }, + { + "epoch": 1.492658604317501, + "grad_norm": 3.5578579902648926, + "learning_rate": 1.5087812313349553e-05, + "loss": 1.8914, + "step": 399500 + }, + { + "epoch": 1.4945267627709646, + "grad_norm": 4.802867412567139, + "learning_rate": 1.4982928241953386e-05, + "loss": 1.8969, + "step": 400000 + }, + { + "epoch": 1.4963949212244283, + "grad_norm": 4.002582550048828, + "learning_rate": 1.4878345712340435e-05, + "loss": 1.904, + "step": 400500 + }, + { + "epoch": 1.4982630796778922, + "grad_norm": 4.3025665283203125, + "learning_rate": 1.4774273882839745e-05, + "loss": 1.916, + "step": 401000 + }, + { + "epoch": 1.5001312381313558, + "grad_norm": 4.821669101715088, + "learning_rate": 1.4670296528381727e-05, + "loss": 1.8837, + "step": 401500 + }, + { + "epoch": 1.5019993965848195, + "grad_norm": 3.655703067779541, + "learning_rate": 1.456662340786592e-05, + "loss": 1.95, + "step": 402000 + }, + { + "epoch": 1.5038675550382834, + "grad_norm": 3.852405548095703, + "learning_rate": 1.4463255414050487e-05, + "loss": 1.8723, + "step": 402500 + }, + { + "epoch": 1.505735713491747, + "grad_norm": 4.878715515136719, + "learning_rate": 1.4360193437066122e-05, + "loss": 1.8876, + "step": 403000 + }, + { + "epoch": 1.5076038719452107, + "grad_norm": 4.768284320831299, + "learning_rate": 1.4257643567674483e-05, + "loss": 1.9061, + "step": 403500 + }, + { + "epoch": 1.5094720303986744, + "grad_norm": 4.845045566558838, + "learning_rate": 1.4155195667736094e-05, + "loss": 1.8932, + "step": 404000 + }, + { + "epoch": 1.511340188852138, + "grad_norm": 3.8661012649536133, + "learning_rate": 1.4053056437417239e-05, + "loss": 1.9518, + "step": 404500 + }, + { + "epoch": 1.5132083473056017, + "grad_norm": 4.624420166015625, + "learning_rate": 1.3951226756267382e-05, + "loss": 1.8403, + "step": 405000 + }, + { + "epoch": 1.5150765057590654, + "grad_norm": 3.6633214950561523, + "learning_rate": 1.3849910229293806e-05, + "loss": 1.8943, + "step": 405500 + }, + { + "epoch": 1.516944664212529, + "grad_norm": 5.2839155197143555, + "learning_rate": 1.3748701650989005e-05, + "loss": 1.8692, + "step": 406000 + }, + { + "epoch": 1.518812822665993, + "grad_norm": 3.8412556648254395, + "learning_rate": 1.3647805242737227e-05, + "loss": 1.8699, + "step": 406500 + }, + { + "epoch": 1.5206809811194566, + "grad_norm": 3.3254265785217285, + "learning_rate": 1.3547221873385652e-05, + "loss": 1.8909, + "step": 407000 + }, + { + "epoch": 1.5225491395729203, + "grad_norm": 3.2033207416534424, + "learning_rate": 1.3446952409085728e-05, + "loss": 1.8986, + "step": 407500 + }, + { + "epoch": 1.5244172980263841, + "grad_norm": 4.760767459869385, + "learning_rate": 1.334719730796591e-05, + "loss": 1.8756, + "step": 408000 + }, + { + "epoch": 1.5262854564798478, + "grad_norm": 4.965844631195068, + "learning_rate": 1.3247557609288142e-05, + "loss": 1.8743, + "step": 408500 + }, + { + "epoch": 1.5281536149333115, + "grad_norm": 4.014163494110107, + "learning_rate": 1.314823439615473e-05, + "loss": 1.9219, + "step": 409000 + }, + { + "epoch": 1.5300217733867751, + "grad_norm": 4.178042888641357, + "learning_rate": 1.3049228523865536e-05, + "loss": 1.881, + "step": 409500 + }, + { + "epoch": 1.5318899318402388, + "grad_norm": 4.607501983642578, + "learning_rate": 1.2950737902223226e-05, + "loss": 1.9469, + "step": 410000 + }, + { + "epoch": 1.5337580902937025, + "grad_norm": 4.652303695678711, + "learning_rate": 1.2852368627651334e-05, + "loss": 1.8881, + "step": 410500 + }, + { + "epoch": 1.5356262487471661, + "grad_norm": 4.992543697357178, + "learning_rate": 1.2754319241706458e-05, + "loss": 1.9569, + "step": 411000 + }, + { + "epoch": 1.5374944072006298, + "grad_norm": 3.5058271884918213, + "learning_rate": 1.2656590588719214e-05, + "loss": 1.9032, + "step": 411500 + }, + { + "epoch": 1.5393625656540937, + "grad_norm": 3.973353147506714, + "learning_rate": 1.2559183510258338e-05, + "loss": 1.8669, + "step": 412000 + }, + { + "epoch": 1.5412307241075573, + "grad_norm": 4.776645660400391, + "learning_rate": 1.2462292692129003e-05, + "loss": 1.8993, + "step": 412500 + }, + { + "epoch": 1.543098882561021, + "grad_norm": 4.160543441772461, + "learning_rate": 1.2365530629011917e-05, + "loss": 1.9269, + "step": 413000 + }, + { + "epoch": 1.544967041014485, + "grad_norm": 4.14699125289917, + "learning_rate": 1.226909264681978e-05, + "loss": 1.9139, + "step": 413500 + }, + { + "epoch": 1.5468351994679486, + "grad_norm": 4.639766693115234, + "learning_rate": 1.2172979576006998e-05, + "loss": 1.8844, + "step": 414000 + }, + { + "epoch": 1.5487033579214122, + "grad_norm": 3.771737575531006, + "learning_rate": 1.207719224423004e-05, + "loss": 1.8961, + "step": 414500 + }, + { + "epoch": 1.550571516374876, + "grad_norm": 4.165931701660156, + "learning_rate": 1.1981922071418567e-05, + "loss": 1.891, + "step": 415000 + }, + { + "epoch": 1.5524396748283396, + "grad_norm": 5.3882341384887695, + "learning_rate": 1.1886788033865165e-05, + "loss": 1.8854, + "step": 415500 + }, + { + "epoch": 1.5543078332818032, + "grad_norm": 4.879900932312012, + "learning_rate": 1.1791982199822898e-05, + "loss": 1.8817, + "step": 416000 + }, + { + "epoch": 1.5561759917352669, + "grad_norm": 4.769500732421875, + "learning_rate": 1.169750538569126e-05, + "loss": 1.9078, + "step": 416500 + }, + { + "epoch": 1.5580441501887305, + "grad_norm": 5.184789657592773, + "learning_rate": 1.1603546369284646e-05, + "loss": 1.864, + "step": 417000 + }, + { + "epoch": 1.5599123086421944, + "grad_norm": 3.5462260246276855, + "learning_rate": 1.1509729370737072e-05, + "loss": 1.9012, + "step": 417500 + }, + { + "epoch": 1.561780467095658, + "grad_norm": 4.478038311004639, + "learning_rate": 1.1416243822658057e-05, + "loss": 1.8541, + "step": 418000 + }, + { + "epoch": 1.563648625549122, + "grad_norm": 4.2772650718688965, + "learning_rate": 1.1323090530077756e-05, + "loss": 1.9176, + "step": 418500 + }, + { + "epoch": 1.5655167840025856, + "grad_norm": 4.45164155960083, + "learning_rate": 1.123045560271172e-05, + "loss": 1.9191, + "step": 419000 + }, + { + "epoch": 1.5673849424560493, + "grad_norm": 4.31321382522583, + "learning_rate": 1.1137968556258127e-05, + "loss": 1.9104, + "step": 419500 + }, + { + "epoch": 1.569253100909513, + "grad_norm": 3.313171625137329, + "learning_rate": 1.1045816161609301e-05, + "loss": 1.8969, + "step": 420000 + }, + { + "epoch": 1.5711212593629766, + "grad_norm": 5.630086898803711, + "learning_rate": 1.0953999212315213e-05, + "loss": 1.8921, + "step": 420500 + }, + { + "epoch": 1.5729894178164403, + "grad_norm": 4.993584632873535, + "learning_rate": 1.0862518499037283e-05, + "loss": 1.8845, + "step": 421000 + }, + { + "epoch": 1.574857576269904, + "grad_norm": 5.677700996398926, + "learning_rate": 1.077155676004855e-05, + "loss": 1.8988, + "step": 421500 + }, + { + "epoch": 1.5767257347233676, + "grad_norm": 4.58486795425415, + "learning_rate": 1.068075020279995e-05, + "loss": 1.9101, + "step": 422000 + }, + { + "epoch": 1.5785938931768315, + "grad_norm": 4.042180061340332, + "learning_rate": 1.0590282234591004e-05, + "loss": 1.9224, + "step": 422500 + }, + { + "epoch": 1.5804620516302952, + "grad_norm": 3.4549098014831543, + "learning_rate": 1.0500153634466675e-05, + "loss": 1.8885, + "step": 423000 + }, + { + "epoch": 1.5823302100837588, + "grad_norm": 4.782561302185059, + "learning_rate": 1.0410544415482986e-05, + "loss": 1.9126, + "step": 423500 + }, + { + "epoch": 1.5841983685372227, + "grad_norm": 4.326170921325684, + "learning_rate": 1.0321096194361922e-05, + "loss": 1.8519, + "step": 424000 + }, + { + "epoch": 1.5860665269906864, + "grad_norm": 4.411458492279053, + "learning_rate": 1.0231989659361606e-05, + "loss": 1.8756, + "step": 424500 + }, + { + "epoch": 1.58793468544415, + "grad_norm": 4.059584140777588, + "learning_rate": 1.0143225577803328e-05, + "loss": 1.897, + "step": 425000 + }, + { + "epoch": 1.5898028438976137, + "grad_norm": 4.62555456161499, + "learning_rate": 1.0054981212748877e-05, + "loss": 1.9044, + "step": 425500 + }, + { + "epoch": 1.5916710023510774, + "grad_norm": 3.3062992095947266, + "learning_rate": 9.966903639519581e-06, + "loss": 1.8671, + "step": 426000 + }, + { + "epoch": 1.593539160804541, + "grad_norm": 3.750192880630493, + "learning_rate": 9.879170802462034e-06, + "loss": 1.9024, + "step": 426500 + }, + { + "epoch": 1.5954073192580047, + "grad_norm": 3.6934866905212402, + "learning_rate": 9.791783457068221e-06, + "loss": 1.8972, + "step": 427000 + }, + { + "epoch": 1.5972754777114684, + "grad_norm": 4.577314376831055, + "learning_rate": 9.704916092006999e-06, + "loss": 1.9391, + "step": 427500 + }, + { + "epoch": 1.5991436361649323, + "grad_norm": 4.8952226638793945, + "learning_rate": 9.618221289776025e-06, + "loss": 1.8756, + "step": 428000 + }, + { + "epoch": 1.601011794618396, + "grad_norm": 5.817446231842041, + "learning_rate": 9.531874226317888e-06, + "loss": 1.8756, + "step": 428500 + }, + { + "epoch": 1.6028799530718596, + "grad_norm": 3.9412033557891846, + "learning_rate": 9.445875645191288e-06, + "loss": 1.912, + "step": 429000 + }, + { + "epoch": 1.6047481115253235, + "grad_norm": 4.50702428817749, + "learning_rate": 9.360397236655304e-06, + "loss": 1.8652, + "step": 429500 + }, + { + "epoch": 1.6066162699787871, + "grad_norm": 4.587414741516113, + "learning_rate": 9.27509713820291e-06, + "loss": 1.9097, + "step": 430000 + }, + { + "epoch": 1.6084844284322508, + "grad_norm": 6.312617301940918, + "learning_rate": 9.190147733261234e-06, + "loss": 1.8736, + "step": 430500 + }, + { + "epoch": 1.6103525868857145, + "grad_norm": 5.86572790145874, + "learning_rate": 9.105549753353348e-06, + "loss": 1.8866, + "step": 431000 + }, + { + "epoch": 1.6122207453391781, + "grad_norm": 4.819661617279053, + "learning_rate": 9.021303926976055e-06, + "loss": 1.8648, + "step": 431500 + }, + { + "epoch": 1.6140889037926418, + "grad_norm": 4.977511882781982, + "learning_rate": 8.937578412834564e-06, + "loss": 1.8504, + "step": 432000 + }, + { + "epoch": 1.6159570622461055, + "grad_norm": 3.8270606994628906, + "learning_rate": 8.85403835895094e-06, + "loss": 1.9031, + "step": 432500 + }, + { + "epoch": 1.6178252206995691, + "grad_norm": 3.582000255584717, + "learning_rate": 8.770852624432785e-06, + "loss": 1.9016, + "step": 433000 + }, + { + "epoch": 1.619693379153033, + "grad_norm": 4.828258037567139, + "learning_rate": 8.688021925615658e-06, + "loss": 1.9003, + "step": 433500 + }, + { + "epoch": 1.6215615376064967, + "grad_norm": 4.899356842041016, + "learning_rate": 8.60571157016748e-06, + "loss": 1.902, + "step": 434000 + }, + { + "epoch": 1.6234296960599606, + "grad_norm": 3.5516891479492188, + "learning_rate": 8.523592365898686e-06, + "loss": 1.8574, + "step": 434500 + }, + { + "epoch": 1.6252978545134242, + "grad_norm": 4.53317928314209, + "learning_rate": 8.441830326558064e-06, + "loss": 1.8844, + "step": 435000 + }, + { + "epoch": 1.627166012966888, + "grad_norm": 6.883234977722168, + "learning_rate": 8.360426156221358e-06, + "loss": 1.859, + "step": 435500 + }, + { + "epoch": 1.6290341714203516, + "grad_norm": 5.441802024841309, + "learning_rate": 8.279542288766052e-06, + "loss": 1.9012, + "step": 436000 + }, + { + "epoch": 1.6309023298738152, + "grad_norm": 3.1804521083831787, + "learning_rate": 8.198855237101328e-06, + "loss": 1.8847, + "step": 436500 + }, + { + "epoch": 1.632770488327279, + "grad_norm": 4.132668972015381, + "learning_rate": 8.118528146766863e-06, + "loss": 1.8517, + "step": 437000 + }, + { + "epoch": 1.6346386467807426, + "grad_norm": 4.795321464538574, + "learning_rate": 8.038561709481684e-06, + "loss": 1.9175, + "step": 437500 + }, + { + "epoch": 1.6365068052342062, + "grad_norm": 4.67226505279541, + "learning_rate": 7.959115462975215e-06, + "loss": 1.857, + "step": 438000 + }, + { + "epoch": 1.63837496368767, + "grad_norm": 5.205322742462158, + "learning_rate": 7.879871669780554e-06, + "loss": 1.8824, + "step": 438500 + }, + { + "epoch": 1.6402431221411338, + "grad_norm": 5.369668960571289, + "learning_rate": 7.800990584772722e-06, + "loss": 1.876, + "step": 439000 + }, + { + "epoch": 1.6421112805945974, + "grad_norm": 4.469278335571289, + "learning_rate": 7.722472887218802e-06, + "loss": 1.8871, + "step": 439500 + }, + { + "epoch": 1.6439794390480613, + "grad_norm": 4.810849189758301, + "learning_rate": 7.644319253256577e-06, + "loss": 1.892, + "step": 440000 + }, + { + "epoch": 1.645847597501525, + "grad_norm": 5.1172027587890625, + "learning_rate": 7.5666855692307025e-06, + "loss": 1.9003, + "step": 440500 + }, + { + "epoch": 1.6477157559549886, + "grad_norm": 5.264705181121826, + "learning_rate": 7.48926134684001e-06, + "loss": 1.866, + "step": 441000 + }, + { + "epoch": 1.6495839144084523, + "grad_norm": 3.7140793800354004, + "learning_rate": 7.41220319629074e-06, + "loss": 1.8958, + "step": 441500 + }, + { + "epoch": 1.651452072861916, + "grad_norm": 4.509251117706299, + "learning_rate": 7.335511781152121e-06, + "loss": 1.8784, + "step": 442000 + }, + { + "epoch": 1.6533202313153796, + "grad_norm": 4.2154388427734375, + "learning_rate": 7.259340042775581e-06, + "loss": 1.8476, + "step": 442500 + }, + { + "epoch": 1.6551883897688433, + "grad_norm": 6.030950546264648, + "learning_rate": 7.183383339768157e-06, + "loss": 1.9157, + "step": 443000 + }, + { + "epoch": 1.657056548222307, + "grad_norm": 4.760791301727295, + "learning_rate": 7.107795342603074e-06, + "loss": 1.8709, + "step": 443500 + }, + { + "epoch": 1.6589247066757709, + "grad_norm": 4.554337978363037, + "learning_rate": 7.032576702189675e-06, + "loss": 1.8865, + "step": 444000 + }, + { + "epoch": 1.6607928651292345, + "grad_norm": 5.714734077453613, + "learning_rate": 6.9578773938351495e-06, + "loss": 1.8687, + "step": 444500 + }, + { + "epoch": 1.6626610235826982, + "grad_norm": 4.749231338500977, + "learning_rate": 6.883398664985902e-06, + "loss": 1.8953, + "step": 445000 + }, + { + "epoch": 1.664529182036162, + "grad_norm": 2.8103106021881104, + "learning_rate": 6.809291225230813e-06, + "loss": 1.8854, + "step": 445500 + }, + { + "epoch": 1.6663973404896257, + "grad_norm": 6.017327308654785, + "learning_rate": 6.735555712729713e-06, + "loss": 1.8829, + "step": 446000 + }, + { + "epoch": 1.6682654989430894, + "grad_norm": 5.306553363800049, + "learning_rate": 6.662339116102778e-06, + "loss": 1.8542, + "step": 446500 + }, + { + "epoch": 1.670133657396553, + "grad_norm": 5.078936576843262, + "learning_rate": 6.5893486127564465e-06, + "loss": 1.9077, + "step": 447000 + }, + { + "epoch": 1.6720018158500167, + "grad_norm": 5.262309551239014, + "learning_rate": 6.516731930651387e-06, + "loss": 1.8863, + "step": 447500 + }, + { + "epoch": 1.6738699743034804, + "grad_norm": 5.343240261077881, + "learning_rate": 6.444489695110101e-06, + "loss": 1.8784, + "step": 448000 + }, + { + "epoch": 1.675738132756944, + "grad_norm": 4.112715244293213, + "learning_rate": 6.372622528230676e-06, + "loss": 1.8559, + "step": 448500 + }, + { + "epoch": 1.6776062912104077, + "grad_norm": 3.1489148139953613, + "learning_rate": 6.301273656494144e-06, + "loss": 1.8633, + "step": 449000 + }, + { + "epoch": 1.6794744496638716, + "grad_norm": 5.503724575042725, + "learning_rate": 6.230157727089419e-06, + "loss": 1.8898, + "step": 449500 + }, + { + "epoch": 1.6813426081173353, + "grad_norm": 4.443988800048828, + "learning_rate": 6.159418712018961e-06, + "loss": 1.881, + "step": 450000 + }, + { + "epoch": 1.6832107665707992, + "grad_norm": 3.3895161151885986, + "learning_rate": 6.089057220436195e-06, + "loss": 1.8802, + "step": 450500 + }, + { + "epoch": 1.6850789250242628, + "grad_norm": 4.960055828094482, + "learning_rate": 6.0192134471937224e-06, + "loss": 1.8593, + "step": 451000 + }, + { + "epoch": 1.6869470834777265, + "grad_norm": 4.596670150756836, + "learning_rate": 5.949608058974171e-06, + "loss": 1.8924, + "step": 451500 + }, + { + "epoch": 1.6888152419311901, + "grad_norm": 3.810817003250122, + "learning_rate": 5.8803820009804165e-06, + "loss": 1.8412, + "step": 452000 + }, + { + "epoch": 1.6906834003846538, + "grad_norm": 6.2422380447387695, + "learning_rate": 5.8115358693374035e-06, + "loss": 1.875, + "step": 452500 + }, + { + "epoch": 1.6925515588381175, + "grad_norm": 4.921154499053955, + "learning_rate": 5.7432068079726676e-06, + "loss": 1.8729, + "step": 453000 + }, + { + "epoch": 1.6944197172915811, + "grad_norm": 5.331964015960693, + "learning_rate": 5.675121541510353e-06, + "loss": 1.8726, + "step": 453500 + }, + { + "epoch": 1.6962878757450448, + "grad_norm": 4.561686038970947, + "learning_rate": 5.607417968953904e-06, + "loss": 1.8597, + "step": 454000 + }, + { + "epoch": 1.6981560341985087, + "grad_norm": 5.06734037399292, + "learning_rate": 5.5400966733176905e-06, + "loss": 1.8741, + "step": 454500 + }, + { + "epoch": 1.7000241926519724, + "grad_norm": 6.29988956451416, + "learning_rate": 5.473291728727564e-06, + "loss": 1.9034, + "step": 455000 + }, + { + "epoch": 1.701892351105436, + "grad_norm": 5.206850051879883, + "learning_rate": 5.406735955363129e-06, + "loss": 1.8556, + "step": 455500 + }, + { + "epoch": 1.7037605095589, + "grad_norm": 3.8202433586120605, + "learning_rate": 5.340564187047786e-06, + "loss": 1.8677, + "step": 456000 + }, + { + "epoch": 1.7056286680123636, + "grad_norm": 3.6107611656188965, + "learning_rate": 5.2747769936051125e-06, + "loss": 1.8593, + "step": 456500 + }, + { + "epoch": 1.7074968264658272, + "grad_norm": 4.204036235809326, + "learning_rate": 5.20937494154699e-06, + "loss": 1.8571, + "step": 457000 + }, + { + "epoch": 1.709364984919291, + "grad_norm": 5.234120845794678, + "learning_rate": 5.1444882414578675e-06, + "loss": 1.8433, + "step": 457500 + }, + { + "epoch": 1.7112331433727546, + "grad_norm": 3.4716298580169678, + "learning_rate": 5.079857385347997e-06, + "loss": 1.8765, + "step": 458000 + }, + { + "epoch": 1.7131013018262182, + "grad_norm": 5.14175271987915, + "learning_rate": 5.015613349129866e-06, + "loss": 1.9206, + "step": 458500 + }, + { + "epoch": 1.714969460279682, + "grad_norm": 4.21678352355957, + "learning_rate": 4.951756686026798e-06, + "loss": 1.8835, + "step": 459000 + }, + { + "epoch": 1.7168376187331456, + "grad_norm": 3.8663065433502197, + "learning_rate": 4.888414495895577e-06, + "loss": 1.8974, + "step": 459500 + }, + { + "epoch": 1.7187057771866094, + "grad_norm": 4.44641637802124, + "learning_rate": 4.825333447862485e-06, + "loss": 1.8963, + "step": 460000 + }, + { + "epoch": 1.720573935640073, + "grad_norm": 4.290149211883545, + "learning_rate": 4.762641411497825e-06, + "loss": 1.8818, + "step": 460500 + }, + { + "epoch": 1.722442094093537, + "grad_norm": 3.1460719108581543, + "learning_rate": 4.700338926660225e-06, + "loss": 1.8916, + "step": 461000 + }, + { + "epoch": 1.7243102525470007, + "grad_norm": 3.602639675140381, + "learning_rate": 4.63842652985379e-06, + "loss": 1.8656, + "step": 461500 + }, + { + "epoch": 1.7261784110004643, + "grad_norm": 4.454497337341309, + "learning_rate": 4.577027407582085e-06, + "loss": 1.8377, + "step": 462000 + }, + { + "epoch": 1.728046569453928, + "grad_norm": 4.91801118850708, + "learning_rate": 4.5158960000806275e-06, + "loss": 1.8708, + "step": 462500 + }, + { + "epoch": 1.7299147279073916, + "grad_norm": 5.951587200164795, + "learning_rate": 4.45515626889988e-06, + "loss": 1.8598, + "step": 463000 + }, + { + "epoch": 1.7317828863608553, + "grad_norm": 3.9829583168029785, + "learning_rate": 4.394808737086631e-06, + "loss": 1.8637, + "step": 463500 + }, + { + "epoch": 1.733651044814319, + "grad_norm": 4.84136962890625, + "learning_rate": 4.334973441658552e-06, + "loss": 1.849, + "step": 464000 + }, + { + "epoch": 1.7355192032677826, + "grad_norm": 5.9698991775512695, + "learning_rate": 4.275411077223152e-06, + "loss": 1.8716, + "step": 464500 + }, + { + "epoch": 1.7373873617212465, + "grad_norm": 6.253756046295166, + "learning_rate": 4.216242459991293e-06, + "loss": 1.877, + "step": 465000 + }, + { + "epoch": 1.7392555201747102, + "grad_norm": 4.6036152839660645, + "learning_rate": 4.157468099480438e-06, + "loss": 1.8532, + "step": 465500 + }, + { + "epoch": 1.7411236786281739, + "grad_norm": 4.482430934906006, + "learning_rate": 4.099204866700346e-06, + "loss": 1.858, + "step": 466000 + }, + { + "epoch": 1.7429918370816377, + "grad_norm": 4.4797749519348145, + "learning_rate": 4.041219743568814e-06, + "loss": 1.8436, + "step": 466500 + }, + { + "epoch": 1.7448599955351014, + "grad_norm": 5.49769926071167, + "learning_rate": 3.983630384327791e-06, + "loss": 1.8767, + "step": 467000 + }, + { + "epoch": 1.746728153988565, + "grad_norm": 5.328680038452148, + "learning_rate": 3.9264372848953125e-06, + "loss": 1.8929, + "step": 467500 + }, + { + "epoch": 1.7485963124420287, + "grad_norm": 3.2703754901885986, + "learning_rate": 3.869640937777136e-06, + "loss": 1.7657, + "step": 468000 + }, + { + "epoch": 1.7504644708954924, + "grad_norm": 4.710208892822266, + "learning_rate": 3.813241832062481e-06, + "loss": 1.868, + "step": 468500 + }, + { + "epoch": 1.752332629348956, + "grad_norm": 3.9908735752105713, + "learning_rate": 3.7572404534197746e-06, + "loss": 1.9306, + "step": 469000 + }, + { + "epoch": 1.7542007878024197, + "grad_norm": 5.898683071136475, + "learning_rate": 3.701637284092546e-06, + "loss": 1.8756, + "step": 469500 + }, + { + "epoch": 1.7560689462558834, + "grad_norm": 5.575063705444336, + "learning_rate": 3.6465428136502942e-06, + "loss": 1.8415, + "step": 470000 + }, + { + "epoch": 1.7579371047093473, + "grad_norm": 3.8220248222351074, + "learning_rate": 3.591736697164866e-06, + "loss": 1.8549, + "step": 470500 + }, + { + "epoch": 1.759805263162811, + "grad_norm": 4.483773708343506, + "learning_rate": 3.5373302151939625e-06, + "loss": 1.8414, + "step": 471000 + }, + { + "epoch": 1.7616734216162746, + "grad_norm": 5.593682289123535, + "learning_rate": 3.4833238362470044e-06, + "loss": 1.8729, + "step": 471500 + }, + { + "epoch": 1.7635415800697385, + "grad_norm": 3.2169010639190674, + "learning_rate": 3.4298248369353582e-06, + "loss": 1.8556, + "step": 472000 + }, + { + "epoch": 1.7654097385232022, + "grad_norm": 5.516305923461914, + "learning_rate": 3.3766192532610986e-06, + "loss": 1.8855, + "step": 472500 + }, + { + "epoch": 1.7672778969766658, + "grad_norm": 5.06584358215332, + "learning_rate": 3.3239203637443983e-06, + "loss": 1.8967, + "step": 473000 + }, + { + "epoch": 1.7691460554301295, + "grad_norm": 4.666677474975586, + "learning_rate": 3.271517404347946e-06, + "loss": 1.8351, + "step": 473500 + }, + { + "epoch": 1.7710142138835931, + "grad_norm": 5.4451823234558105, + "learning_rate": 3.2195168369637765e-06, + "loss": 1.8405, + "step": 474000 + }, + { + "epoch": 1.7728823723370568, + "grad_norm": 4.598884582519531, + "learning_rate": 3.1679191093832883e-06, + "loss": 1.8774, + "step": 474500 + }, + { + "epoch": 1.7747505307905205, + "grad_norm": 5.018040657043457, + "learning_rate": 3.1167246659289217e-06, + "loss": 1.8544, + "step": 475000 + }, + { + "epoch": 1.7766186892439841, + "grad_norm": 5.349071502685547, + "learning_rate": 3.065933947450339e-06, + "loss": 1.8779, + "step": 475500 + }, + { + "epoch": 1.778486847697448, + "grad_norm": 4.253110408782959, + "learning_rate": 3.015547391320589e-06, + "loss": 1.8161, + "step": 476000 + }, + { + "epoch": 1.7803550061509117, + "grad_norm": 3.6783599853515625, + "learning_rate": 2.9655654314323655e-06, + "loss": 1.8395, + "step": 476500 + }, + { + "epoch": 1.7822231646043756, + "grad_norm": 4.650113582611084, + "learning_rate": 2.916185998547194e-06, + "loss": 1.8573, + "step": 477000 + }, + { + "epoch": 1.7840913230578392, + "grad_norm": 4.785963535308838, + "learning_rate": 2.8670128962200117e-06, + "loss": 1.839, + "step": 477500 + }, + { + "epoch": 1.785959481511303, + "grad_norm": 4.258472442626953, + "learning_rate": 2.818245669206393e-06, + "loss": 1.8937, + "step": 478000 + }, + { + "epoch": 1.7878276399647666, + "grad_norm": 5.702148914337158, + "learning_rate": 2.7698847374545255e-06, + "loss": 1.8767, + "step": 478500 + }, + { + "epoch": 1.7896957984182302, + "grad_norm": 5.909474849700928, + "learning_rate": 2.7219305174139067e-06, + "loss": 1.8927, + "step": 479000 + }, + { + "epoch": 1.791563956871694, + "grad_norm": 4.348086357116699, + "learning_rate": 2.6743834220317286e-06, + "loss": 1.8478, + "step": 479500 + }, + { + "epoch": 1.7934321153251576, + "grad_norm": 4.148903846740723, + "learning_rate": 2.62724386074929e-06, + "loss": 1.855, + "step": 480000 + }, + { + "epoch": 1.7953002737786212, + "grad_norm": 4.32988977432251, + "learning_rate": 2.580512239498528e-06, + "loss": 1.8551, + "step": 480500 + }, + { + "epoch": 1.7971684322320851, + "grad_norm": 4.866036415100098, + "learning_rate": 2.534188960698475e-06, + "loss": 1.8938, + "step": 481000 + }, + { + "epoch": 1.7990365906855488, + "grad_norm": 4.053302764892578, + "learning_rate": 2.4883658441394673e-06, + "loss": 1.8759, + "step": 481500 + }, + { + "epoch": 1.8009047491390124, + "grad_norm": 5.242681980133057, + "learning_rate": 2.4428596247633885e-06, + "loss": 1.8914, + "step": 482000 + }, + { + "epoch": 1.8027729075924763, + "grad_norm": 5.018854141235352, + "learning_rate": 2.3977629332031404e-06, + "loss": 1.8592, + "step": 482500 + }, + { + "epoch": 1.80464106604594, + "grad_norm": 4.828859329223633, + "learning_rate": 2.3530761577989e-06, + "loss": 1.8676, + "step": 483000 + }, + { + "epoch": 1.8065092244994037, + "grad_norm": 3.3137731552124023, + "learning_rate": 2.3088878265754845e-06, + "loss": 1.8182, + "step": 483500 + }, + { + "epoch": 1.8083773829528673, + "grad_norm": 6.416788101196289, + "learning_rate": 2.2650212126383242e-06, + "loss": 1.8656, + "step": 484000 + }, + { + "epoch": 1.810245541406331, + "grad_norm": 4.340769290924072, + "learning_rate": 2.2215656579332167e-06, + "loss": 1.9075, + "step": 484500 + }, + { + "epoch": 1.8121136998597946, + "grad_norm": 4.634076118469238, + "learning_rate": 2.17852153666806e-06, + "loss": 1.8799, + "step": 485000 + }, + { + "epoch": 1.8139818583132583, + "grad_norm": 4.349535942077637, + "learning_rate": 2.1359740729170296e-06, + "loss": 1.8522, + "step": 485500 + }, + { + "epoch": 1.815850016766722, + "grad_norm": 4.439642429351807, + "learning_rate": 2.0937531022739987e-06, + "loss": 1.8578, + "step": 486000 + }, + { + "epoch": 1.8177181752201859, + "grad_norm": 4.639336585998535, + "learning_rate": 2.051944665700545e-06, + "loss": 1.883, + "step": 486500 + }, + { + "epoch": 1.8195863336736495, + "grad_norm": 4.625245571136475, + "learning_rate": 2.010549123220773e-06, + "loss": 1.8886, + "step": 487000 + }, + { + "epoch": 1.8214544921271132, + "grad_norm": 4.0239667892456055, + "learning_rate": 1.9696483832278845e-06, + "loss": 1.8653, + "step": 487500 + }, + { + "epoch": 1.823322650580577, + "grad_norm": 4.363647937774658, + "learning_rate": 1.92907886722582e-06, + "loss": 1.8718, + "step": 488000 + }, + { + "epoch": 1.8251908090340407, + "grad_norm": 4.025300025939941, + "learning_rate": 1.8889233033491493e-06, + "loss": 1.8352, + "step": 488500 + }, + { + "epoch": 1.8270589674875044, + "grad_norm": 6.883707046508789, + "learning_rate": 1.8491820373886358e-06, + "loss": 1.9056, + "step": 489000 + }, + { + "epoch": 1.828927125940968, + "grad_norm": 5.169373512268066, + "learning_rate": 1.8098554115674292e-06, + "loss": 1.8994, + "step": 489500 + }, + { + "epoch": 1.8307952843944317, + "grad_norm": 5.691972255706787, + "learning_rate": 1.7710985840431572e-06, + "loss": 1.8602, + "step": 490000 + }, + { + "epoch": 1.8326634428478954, + "grad_norm": 4.719027042388916, + "learning_rate": 1.7326005889664986e-06, + "loss": 1.8645, + "step": 490500 + }, + { + "epoch": 1.834531601301359, + "grad_norm": 5.3066816329956055, + "learning_rate": 1.6945182379445534e-06, + "loss": 1.879, + "step": 491000 + }, + { + "epoch": 1.8363997597548227, + "grad_norm": 5.338113307952881, + "learning_rate": 1.6568518589150705e-06, + "loss": 1.8811, + "step": 491500 + }, + { + "epoch": 1.8382679182082866, + "grad_norm": 3.351616382598877, + "learning_rate": 1.61960177623377e-06, + "loss": 1.8459, + "step": 492000 + }, + { + "epoch": 1.8401360766617503, + "grad_norm": 5.075439929962158, + "learning_rate": 1.5827683106715008e-06, + "loss": 1.8515, + "step": 492500 + }, + { + "epoch": 1.8420042351152142, + "grad_norm": 4.089956283569336, + "learning_rate": 1.5463517794115367e-06, + "loss": 1.8624, + "step": 493000 + }, + { + "epoch": 1.8438723935686778, + "grad_norm": 6.492163181304932, + "learning_rate": 1.5103524960467908e-06, + "loss": 1.8245, + "step": 493500 + }, + { + "epoch": 1.8457405520221415, + "grad_norm": 6.452279567718506, + "learning_rate": 1.4748415171010387e-06, + "loss": 1.8406, + "step": 494000 + }, + { + "epoch": 1.8476087104756052, + "grad_norm": 3.7838053703308105, + "learning_rate": 1.4396768198986554e-06, + "loss": 1.8508, + "step": 494500 + }, + { + "epoch": 1.8494768689290688, + "grad_norm": 3.706258535385132, + "learning_rate": 1.4049302891993631e-06, + "loss": 1.8484, + "step": 495000 + }, + { + "epoch": 1.8513450273825325, + "grad_norm": 4.734787940979004, + "learning_rate": 1.3706022242152227e-06, + "loss": 1.8616, + "step": 495500 + }, + { + "epoch": 1.8532131858359961, + "grad_norm": 5.525266170501709, + "learning_rate": 1.336760321043634e-06, + "loss": 1.8696, + "step": 496000 + }, + { + "epoch": 1.8550813442894598, + "grad_norm": 3.555717706680298, + "learning_rate": 1.3032692323137307e-06, + "loss": 1.8539, + "step": 496500 + }, + { + "epoch": 1.8569495027429237, + "grad_norm": 4.906459331512451, + "learning_rate": 1.2701974847307452e-06, + "loss": 1.8555, + "step": 497000 + }, + { + "epoch": 1.8588176611963874, + "grad_norm": 5.703590393066406, + "learning_rate": 1.2375453630847134e-06, + "loss": 1.8088, + "step": 497500 + }, + { + "epoch": 1.860685819649851, + "grad_norm": 4.265283107757568, + "learning_rate": 1.2053771937288626e-06, + "loss": 1.8823, + "step": 498000 + }, + { + "epoch": 1.862553978103315, + "grad_norm": 4.899601936340332, + "learning_rate": 1.1735643232264836e-06, + "loss": 1.8687, + "step": 498500 + }, + { + "epoch": 1.8644221365567786, + "grad_norm": 4.975470542907715, + "learning_rate": 1.1422342758236281e-06, + "loss": 1.871, + "step": 499000 + }, + { + "epoch": 1.8662902950102422, + "grad_norm": 4.806349754333496, + "learning_rate": 1.1112617500700973e-06, + "loss": 1.8244, + "step": 499500 + }, + { + "epoch": 1.868158453463706, + "grad_norm": 5.105782508850098, + "learning_rate": 1.0807102188935214e-06, + "loss": 1.8867, + "step": 500000 + }, + { + "epoch": 1.8700266119171696, + "grad_norm": 5.97845458984375, + "learning_rate": 1.050579945381669e-06, + "loss": 1.8339, + "step": 500500 + }, + { + "epoch": 1.8718947703706332, + "grad_norm": 4.778586387634277, + "learning_rate": 1.0208711889947376e-06, + "loss": 1.8423, + "step": 501000 + }, + { + "epoch": 1.873762928824097, + "grad_norm": 4.4693169593811035, + "learning_rate": 9.915842055631286e-07, + "loss": 1.8629, + "step": 501500 + }, + { + "epoch": 1.8756310872775606, + "grad_norm": 5.0336222648620605, + "learning_rate": 9.62719247285221e-07, + "loss": 1.8386, + "step": 502000 + }, + { + "epoch": 1.8774992457310244, + "grad_norm": 4.51587438583374, + "learning_rate": 9.342765627252504e-07, + "loss": 1.8566, + "step": 502500 + }, + { + "epoch": 1.879367404184488, + "grad_norm": 4.207951068878174, + "learning_rate": 9.062563968110948e-07, + "loss": 1.8517, + "step": 503000 + }, + { + "epoch": 1.8812355626379518, + "grad_norm": 3.8609273433685303, + "learning_rate": 8.787137635712206e-07, + "loss": 1.8727, + "step": 503500 + }, + { + "epoch": 1.8831037210914157, + "grad_norm": 4.1626877784729, + "learning_rate": 8.515385089467198e-07, + "loss": 1.89, + "step": 504000 + }, + { + "epoch": 1.8849718795448793, + "grad_norm": 3.9561331272125244, + "learning_rate": 8.247864854485199e-07, + "loss": 1.8863, + "step": 504500 + }, + { + "epoch": 1.886840037998343, + "grad_norm": 4.846907138824463, + "learning_rate": 7.98457923445789e-07, + "loss": 1.8208, + "step": 505000 + }, + { + "epoch": 1.8887081964518067, + "grad_norm": 4.7613911628723145, + "learning_rate": 7.726044364189499e-07, + "loss": 1.8515, + "step": 505500 + }, + { + "epoch": 1.8905763549052703, + "grad_norm": 5.021259307861328, + "learning_rate": 7.47122625883645e-07, + "loss": 1.8398, + "step": 506000 + }, + { + "epoch": 1.892444513358734, + "grad_norm": 6.04338264465332, + "learning_rate": 7.220649456289641e-07, + "loss": 1.8433, + "step": 506500 + }, + { + "epoch": 1.8943126718121976, + "grad_norm": 4.8739094734191895, + "learning_rate": 6.974316114336077e-07, + "loss": 1.8352, + "step": 507000 + }, + { + "epoch": 1.8961808302656613, + "grad_norm": 4.441490650177002, + "learning_rate": 6.732708291258827e-07, + "loss": 1.8887, + "step": 507500 + }, + { + "epoch": 1.8980489887191252, + "grad_norm": 3.811279058456421, + "learning_rate": 6.494859700278133e-07, + "loss": 1.8689, + "step": 508000 + }, + { + "epoch": 1.8999171471725889, + "grad_norm": 2.8529744148254395, + "learning_rate": 6.26126081986883e-07, + "loss": 1.9027, + "step": 508500 + }, + { + "epoch": 1.9017853056260527, + "grad_norm": 4.631827354431152, + "learning_rate": 6.031913661616207e-07, + "loss": 1.848, + "step": 509000 + }, + { + "epoch": 1.9036534640795164, + "grad_norm": 3.616713762283325, + "learning_rate": 5.807266140930689e-07, + "loss": 1.8911, + "step": 509500 + }, + { + "epoch": 1.90552162253298, + "grad_norm": 5.187899112701416, + "learning_rate": 5.586419802097898e-07, + "loss": 1.8309, + "step": 510000 + }, + { + "epoch": 1.9073897809864437, + "grad_norm": 5.249440670013428, + "learning_rate": 5.369830996666103e-07, + "loss": 1.8542, + "step": 510500 + }, + { + "epoch": 1.9092579394399074, + "grad_norm": 5.117617607116699, + "learning_rate": 5.157501589742042e-07, + "loss": 1.8459, + "step": 511000 + }, + { + "epoch": 1.911126097893371, + "grad_norm": 5.904655456542969, + "learning_rate": 4.949433409753679e-07, + "loss": 1.8495, + "step": 511500 + }, + { + "epoch": 1.9129942563468347, + "grad_norm": 6.1428632736206055, + "learning_rate": 4.7460316030914495e-07, + "loss": 1.8274, + "step": 512000 + }, + { + "epoch": 1.9148624148002984, + "grad_norm": 4.737666130065918, + "learning_rate": 4.546482684189279e-07, + "loss": 1.8814, + "step": 512500 + }, + { + "epoch": 1.9167305732537623, + "grad_norm": 5.555963516235352, + "learning_rate": 4.351200253877141e-07, + "loss": 1.8644, + "step": 513000 + }, + { + "epoch": 1.918598731707226, + "grad_norm": 4.281107425689697, + "learning_rate": 4.160185993786592e-07, + "loss": 1.8685, + "step": 513500 + }, + { + "epoch": 1.9204668901606896, + "grad_norm": 4.849224090576172, + "learning_rate": 3.973441548794699e-07, + "loss": 1.8921, + "step": 514000 + }, + { + "epoch": 1.9223350486141535, + "grad_norm": 5.799472332000732, + "learning_rate": 3.791329209122674e-07, + "loss": 1.8326, + "step": 514500 + }, + { + "epoch": 1.9242032070676172, + "grad_norm": 5.754580020904541, + "learning_rate": 3.613120634338663e-07, + "loss": 1.8677, + "step": 515000 + }, + { + "epoch": 1.9260713655210808, + "grad_norm": 4.404658317565918, + "learning_rate": 3.4391865855858406e-07, + "loss": 1.8637, + "step": 515500 + }, + { + "epoch": 1.9279395239745445, + "grad_norm": 4.911507606506348, + "learning_rate": 3.2695285606589856e-07, + "loss": 1.85, + "step": 516000 + }, + { + "epoch": 1.9298076824280082, + "grad_norm": 4.071664333343506, + "learning_rate": 3.1044745117284056e-07, + "loss": 1.8303, + "step": 516500 + }, + { + "epoch": 1.9316758408814718, + "grad_norm": 5.3374223709106445, + "learning_rate": 2.9433643213220284e-07, + "loss": 1.8384, + "step": 517000 + }, + { + "epoch": 1.9335439993349355, + "grad_norm": 5.541077613830566, + "learning_rate": 2.7865344244054625e-07, + "loss": 1.8562, + "step": 517500 + }, + { + "epoch": 1.9354121577883991, + "grad_norm": 4.992559432983398, + "learning_rate": 2.6339861714849144e-07, + "loss": 1.8563, + "step": 518000 + }, + { + "epoch": 1.937280316241863, + "grad_norm": 3.9907846450805664, + "learning_rate": 2.486013131539955e-07, + "loss": 1.8736, + "step": 518500 + }, + { + "epoch": 1.9391484746953267, + "grad_norm": 3.9517438411712646, + "learning_rate": 2.3420235009178893e-07, + "loss": 1.859, + "step": 519000 + }, + { + "epoch": 1.9410166331487904, + "grad_norm": 4.987946510314941, + "learning_rate": 2.2023193420994125e-07, + "loss": 1.8258, + "step": 519500 + }, + { + "epoch": 1.9428847916022542, + "grad_norm": 4.550879955291748, + "learning_rate": 2.0669018581160883e-07, + "loss": 1.8678, + "step": 520000 + }, + { + "epoch": 1.944752950055718, + "grad_norm": 3.339261293411255, + "learning_rate": 1.936030194349736e-07, + "loss": 1.8278, + "step": 520500 + }, + { + "epoch": 1.9466211085091816, + "grad_norm": 5.5620951652526855, + "learning_rate": 1.8091809424235495e-07, + "loss": 1.8996, + "step": 521000 + }, + { + "epoch": 1.9484892669626452, + "grad_norm": 3.614462375640869, + "learning_rate": 1.6866217507570114e-07, + "loss": 1.8478, + "step": 521500 + }, + { + "epoch": 1.950357425416109, + "grad_norm": 4.48366117477417, + "learning_rate": 1.5683536747416184e-07, + "loss": 1.8555, + "step": 522000 + }, + { + "epoch": 1.9522255838695726, + "grad_norm": 5.737336158752441, + "learning_rate": 1.454601400492306e-07, + "loss": 1.8463, + "step": 522500 + }, + { + "epoch": 1.9540937423230362, + "grad_norm": 3.779061794281006, + "learning_rate": 1.3449099869505266e-07, + "loss": 1.8293, + "step": 523000 + }, + { + "epoch": 1.9559619007765, + "grad_norm": 5.098133087158203, + "learning_rate": 1.239512631635298e-07, + "loss": 1.8594, + "step": 523500 + }, + { + "epoch": 1.9578300592299638, + "grad_norm": 4.416299343109131, + "learning_rate": 1.1384102421526654e-07, + "loss": 1.8593, + "step": 524000 + }, + { + "epoch": 1.9596982176834274, + "grad_norm": 3.656932830810547, + "learning_rate": 1.0417930144245858e-07, + "loss": 1.836, + "step": 524500 + }, + { + "epoch": 1.9615663761368913, + "grad_norm": 5.132260322570801, + "learning_rate": 9.492745373296808e-08, + "loss": 1.8943, + "step": 525000 + }, + { + "epoch": 1.963434534590355, + "grad_norm": 4.663350582122803, + "learning_rate": 8.61053525388622e-08, + "loss": 1.8534, + "step": 525500 + }, + { + "epoch": 1.9653026930438187, + "grad_norm": 6.682803153991699, + "learning_rate": 7.77130738297216e-08, + "loss": 1.8735, + "step": 526000 + }, + { + "epoch": 1.9671708514972823, + "grad_norm": 6.555516719818115, + "learning_rate": 6.976618556056025e-08, + "loss": 1.88, + "step": 526500 + }, + { + "epoch": 1.969039009950746, + "grad_norm": 5.245980739593506, + "learning_rate": 6.223290493156397e-08, + "loss": 1.8565, + "step": 527000 + }, + { + "epoch": 1.9709071684042097, + "grad_norm": 3.9505879878997803, + "learning_rate": 5.512965235983658e-08, + "loss": 1.8449, + "step": 527500 + }, + { + "epoch": 1.9727753268576733, + "grad_norm": 6.470322132110596, + "learning_rate": 4.8456489013481986e-08, + "loss": 1.8588, + "step": 528000 + }, + { + "epoch": 1.974643485311137, + "grad_norm": 5.629650592803955, + "learning_rate": 4.221347235697226e-08, + "loss": 1.8839, + "step": 528500 + }, + { + "epoch": 1.9765116437646009, + "grad_norm": 3.961327075958252, + "learning_rate": 3.6411852409129475e-08, + "loss": 1.8824, + "step": 529000 + }, + { + "epoch": 1.9783798022180645, + "grad_norm": 4.475338935852051, + "learning_rate": 3.1028426160295554e-08, + "loss": 1.8725, + "step": 529500 + }, + { + "epoch": 1.9802479606715282, + "grad_norm": 6.577774524688721, + "learning_rate": 2.607529667921771e-08, + "loss": 1.8575, + "step": 530000 + }, + { + "epoch": 1.982116119124992, + "grad_norm": 6.510643005371094, + "learning_rate": 2.1552506618677248e-08, + "loss": 1.8503, + "step": 530500 + }, + { + "epoch": 1.9839842775784557, + "grad_norm": 4.923540115356445, + "learning_rate": 1.746785020741437e-08, + "loss": 1.8607, + "step": 531000 + }, + { + "epoch": 1.9858524360319194, + "grad_norm": 4.267704486846924, + "learning_rate": 1.3804991262938994e-08, + "loss": 1.8248, + "step": 531500 + }, + { + "epoch": 1.987720594485383, + "grad_norm": 5.18399715423584, + "learning_rate": 1.0572577402029326e-08, + "loss": 1.8468, + "step": 532000 + }, + { + "epoch": 1.9895887529388467, + "grad_norm": 4.5753045082092285, + "learning_rate": 7.770636459902836e-09, + "loss": 1.8354, + "step": 532500 + }, + { + "epoch": 1.9914569113923104, + "grad_norm": 4.492304801940918, + "learning_rate": 5.403505802398234e-09, + "loss": 1.8668, + "step": 533000 + }, + { + "epoch": 1.993325069845774, + "grad_norm": 6.207240104675293, + "learning_rate": 3.461718322739227e-09, + "loss": 1.8532, + "step": 533500 + }, + { + "epoch": 1.9951932282992377, + "grad_norm": 6.569146156311035, + "learning_rate": 1.9504649954538156e-09, + "loss": 1.8313, + "step": 534000 + }, + { + "epoch": 1.9970613867527016, + "grad_norm": 3.274258852005005, + "learning_rate": 8.69758834370904e-10, + "loss": 1.9104, + "step": 534500 + }, + { + "epoch": 1.9989295452061653, + "grad_norm": 4.226444721221924, + "learning_rate": 2.2047974543304427e-10, + "loss": 1.8681, + "step": 535000 + } + ], + "logging_steps": 500, + "max_steps": 535286, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.4321334103279616e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}