diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5649 @@ +{ + "best_metric": 3.6955573558807373, + "best_model_checkpoint": "checkpoints/test_1M_1-2025-02-12-12-32/checkpoint-20000", + "epoch": 1.5829046299960428, + "eval_steps": 10000, + "global_step": 20000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0019786307874950534, + "grad_norm": 254.82342529296875, + "learning_rate": 2.499208537244918e-07, + "loss": 5.7705, + "step": 25 + }, + { + "epoch": 0.003957261574990107, + "grad_norm": 153.19989013671875, + "learning_rate": 2.498384096875041e-07, + "loss": 5.6747, + "step": 50 + }, + { + "epoch": 0.00593589236248516, + "grad_norm": 224.5292510986328, + "learning_rate": 2.4975596565051644e-07, + "loss": 5.6201, + "step": 75 + }, + { + "epoch": 0.007914523149980214, + "grad_norm": 175.854248046875, + "learning_rate": 2.4967352161352873e-07, + "loss": 5.6974, + "step": 100 + }, + { + "epoch": 0.009893153937475268, + "grad_norm": 163.52769470214844, + "learning_rate": 2.49591077576541e-07, + "loss": 5.5417, + "step": 125 + }, + { + "epoch": 0.01187178472497032, + "grad_norm": 254.2264862060547, + "learning_rate": 2.4950863353955335e-07, + "loss": 5.8201, + "step": 150 + }, + { + "epoch": 0.013850415512465374, + "grad_norm": 175.30279541015625, + "learning_rate": 2.4942618950256564e-07, + "loss": 5.5302, + "step": 175 + }, + { + "epoch": 0.015829046299960427, + "grad_norm": 300.1286315917969, + "learning_rate": 2.4934374546557797e-07, + "loss": 5.6572, + "step": 200 + }, + { + "epoch": 0.01780767708745548, + "grad_norm": 201.56961059570312, + "learning_rate": 2.4926130142859026e-07, + "loss": 5.2914, + "step": 225 + }, + { + "epoch": 0.019786307874950535, + "grad_norm": 245.64854431152344, + "learning_rate": 2.491788573916026e-07, + "loss": 5.4478, + "step": 250 + }, + { + "epoch": 0.02176493866244559, + "grad_norm": 239.78257751464844, + "learning_rate": 2.490964133546149e-07, + "loss": 5.4161, + "step": 275 + }, + { + "epoch": 0.02374356944994064, + "grad_norm": 150.18310546875, + "learning_rate": 2.4901396931762717e-07, + "loss": 5.4978, + "step": 300 + }, + { + "epoch": 0.025722200237435693, + "grad_norm": 172.03607177734375, + "learning_rate": 2.489315252806395e-07, + "loss": 5.4105, + "step": 325 + }, + { + "epoch": 0.027700831024930747, + "grad_norm": 343.2570495605469, + "learning_rate": 2.488490812436518e-07, + "loss": 5.5195, + "step": 350 + }, + { + "epoch": 0.0296794618124258, + "grad_norm": 329.7228698730469, + "learning_rate": 2.4876663720666413e-07, + "loss": 5.4494, + "step": 375 + }, + { + "epoch": 0.031658092599920855, + "grad_norm": 174.63136291503906, + "learning_rate": 2.486841931696764e-07, + "loss": 5.2864, + "step": 400 + }, + { + "epoch": 0.033636723387415905, + "grad_norm": 356.6216125488281, + "learning_rate": 2.486017491326887e-07, + "loss": 5.4376, + "step": 425 + }, + { + "epoch": 0.03561535417491096, + "grad_norm": 166.16783142089844, + "learning_rate": 2.4851930509570104e-07, + "loss": 5.3141, + "step": 450 + }, + { + "epoch": 0.03759398496240601, + "grad_norm": 220.06170654296875, + "learning_rate": 2.484368610587133e-07, + "loss": 5.5457, + "step": 475 + }, + { + "epoch": 0.03957261574990107, + "grad_norm": 154.55517578125, + "learning_rate": 2.483544170217256e-07, + "loss": 5.1264, + "step": 500 + }, + { + "epoch": 0.04155124653739612, + "grad_norm": 184.18443298339844, + "learning_rate": 2.4827197298473794e-07, + "loss": 5.3702, + "step": 525 + }, + { + "epoch": 0.04352987732489118, + "grad_norm": 128.84693908691406, + "learning_rate": 2.4818952894775023e-07, + "loss": 5.0207, + "step": 550 + }, + { + "epoch": 0.04550850811238623, + "grad_norm": 196.2894287109375, + "learning_rate": 2.4810708491076257e-07, + "loss": 5.315, + "step": 575 + }, + { + "epoch": 0.04748713889988128, + "grad_norm": 200.00257873535156, + "learning_rate": 2.4802464087377485e-07, + "loss": 5.2215, + "step": 600 + }, + { + "epoch": 0.049465769687376336, + "grad_norm": 271.8963928222656, + "learning_rate": 2.479421968367872e-07, + "loss": 5.3286, + "step": 625 + }, + { + "epoch": 0.051444400474871387, + "grad_norm": 181.56686401367188, + "learning_rate": 2.478597527997995e-07, + "loss": 4.9967, + "step": 650 + }, + { + "epoch": 0.053423031262366444, + "grad_norm": 242.8925323486328, + "learning_rate": 2.477773087628118e-07, + "loss": 5.1984, + "step": 675 + }, + { + "epoch": 0.055401662049861494, + "grad_norm": 210.05746459960938, + "learning_rate": 2.476948647258241e-07, + "loss": 5.0975, + "step": 700 + }, + { + "epoch": 0.05738029283735655, + "grad_norm": 181.1220245361328, + "learning_rate": 2.476124206888364e-07, + "loss": 5.0036, + "step": 725 + }, + { + "epoch": 0.0593589236248516, + "grad_norm": 166.00709533691406, + "learning_rate": 2.475299766518487e-07, + "loss": 5.3082, + "step": 750 + }, + { + "epoch": 0.06133755441234666, + "grad_norm": 151.4649200439453, + "learning_rate": 2.47447532614861e-07, + "loss": 5.1391, + "step": 775 + }, + { + "epoch": 0.06331618519984171, + "grad_norm": 149.88165283203125, + "learning_rate": 2.4736508857787335e-07, + "loss": 5.0783, + "step": 800 + }, + { + "epoch": 0.06529481598733676, + "grad_norm": 172.47061157226562, + "learning_rate": 2.4728264454088563e-07, + "loss": 4.9624, + "step": 825 + }, + { + "epoch": 0.06727344677483181, + "grad_norm": 298.1490478515625, + "learning_rate": 2.4720020050389797e-07, + "loss": 5.1937, + "step": 850 + }, + { + "epoch": 0.06925207756232687, + "grad_norm": 164.37867736816406, + "learning_rate": 2.4711775646691025e-07, + "loss": 5.1792, + "step": 875 + }, + { + "epoch": 0.07123070834982193, + "grad_norm": 216.8033905029297, + "learning_rate": 2.4703531242992254e-07, + "loss": 5.152, + "step": 900 + }, + { + "epoch": 0.07320933913731698, + "grad_norm": 211.95762634277344, + "learning_rate": 2.469528683929349e-07, + "loss": 4.9146, + "step": 925 + }, + { + "epoch": 0.07518796992481203, + "grad_norm": 257.61968994140625, + "learning_rate": 2.4687042435594716e-07, + "loss": 5.095, + "step": 950 + }, + { + "epoch": 0.07716660071230709, + "grad_norm": 179.43719482421875, + "learning_rate": 2.467879803189595e-07, + "loss": 5.0316, + "step": 975 + }, + { + "epoch": 0.07914523149980214, + "grad_norm": 180.3157958984375, + "learning_rate": 2.467055362819718e-07, + "loss": 4.9441, + "step": 1000 + }, + { + "epoch": 0.08112386228729719, + "grad_norm": 162.77447509765625, + "learning_rate": 2.4662309224498407e-07, + "loss": 4.9724, + "step": 1025 + }, + { + "epoch": 0.08310249307479224, + "grad_norm": 123.65939331054688, + "learning_rate": 2.465406482079964e-07, + "loss": 5.2271, + "step": 1050 + }, + { + "epoch": 0.08508112386228729, + "grad_norm": 163.114990234375, + "learning_rate": 2.464582041710087e-07, + "loss": 4.9724, + "step": 1075 + }, + { + "epoch": 0.08705975464978236, + "grad_norm": 204.76400756835938, + "learning_rate": 2.46375760134021e-07, + "loss": 4.8724, + "step": 1100 + }, + { + "epoch": 0.0890383854372774, + "grad_norm": 307.8963623046875, + "learning_rate": 2.462933160970333e-07, + "loss": 4.9256, + "step": 1125 + }, + { + "epoch": 0.09101701622477246, + "grad_norm": 133.03707885742188, + "learning_rate": 2.462108720600456e-07, + "loss": 4.8792, + "step": 1150 + }, + { + "epoch": 0.09299564701226751, + "grad_norm": 161.41697692871094, + "learning_rate": 2.4612842802305794e-07, + "loss": 5.054, + "step": 1175 + }, + { + "epoch": 0.09497427779976256, + "grad_norm": 135.36228942871094, + "learning_rate": 2.460459839860702e-07, + "loss": 4.8655, + "step": 1200 + }, + { + "epoch": 0.09695290858725762, + "grad_norm": 179.60646057128906, + "learning_rate": 2.4596353994908256e-07, + "loss": 4.7832, + "step": 1225 + }, + { + "epoch": 0.09893153937475267, + "grad_norm": 335.71380615234375, + "learning_rate": 2.4588109591209485e-07, + "loss": 4.9979, + "step": 1250 + }, + { + "epoch": 0.10091017016224772, + "grad_norm": 149.5147247314453, + "learning_rate": 2.457986518751072e-07, + "loss": 4.714, + "step": 1275 + }, + { + "epoch": 0.10288880094974277, + "grad_norm": 154.0236358642578, + "learning_rate": 2.4571620783811947e-07, + "loss": 4.8015, + "step": 1300 + }, + { + "epoch": 0.10486743173723784, + "grad_norm": 450.5319519042969, + "learning_rate": 2.456337638011318e-07, + "loss": 4.6914, + "step": 1325 + }, + { + "epoch": 0.10684606252473289, + "grad_norm": 195.87863159179688, + "learning_rate": 2.455513197641441e-07, + "loss": 5.0124, + "step": 1350 + }, + { + "epoch": 0.10882469331222794, + "grad_norm": 198.12225341796875, + "learning_rate": 2.454688757271564e-07, + "loss": 4.5305, + "step": 1375 + }, + { + "epoch": 0.11080332409972299, + "grad_norm": 161.57623291015625, + "learning_rate": 2.453864316901687e-07, + "loss": 4.7806, + "step": 1400 + }, + { + "epoch": 0.11278195488721804, + "grad_norm": 187.8081817626953, + "learning_rate": 2.45303987653181e-07, + "loss": 4.9401, + "step": 1425 + }, + { + "epoch": 0.1147605856747131, + "grad_norm": 160.1893768310547, + "learning_rate": 2.4522154361619334e-07, + "loss": 4.8119, + "step": 1450 + }, + { + "epoch": 0.11673921646220815, + "grad_norm": 181.8563995361328, + "learning_rate": 2.4513909957920563e-07, + "loss": 4.7979, + "step": 1475 + }, + { + "epoch": 0.1187178472497032, + "grad_norm": 184.80641174316406, + "learning_rate": 2.4505665554221796e-07, + "loss": 4.8448, + "step": 1500 + }, + { + "epoch": 0.12069647803719825, + "grad_norm": 151.4502410888672, + "learning_rate": 2.4497421150523025e-07, + "loss": 4.7101, + "step": 1525 + }, + { + "epoch": 0.12267510882469332, + "grad_norm": 163.2119598388672, + "learning_rate": 2.4489176746824253e-07, + "loss": 4.8802, + "step": 1550 + }, + { + "epoch": 0.12465373961218837, + "grad_norm": 147.33741760253906, + "learning_rate": 2.4480932343125487e-07, + "loss": 4.6433, + "step": 1575 + }, + { + "epoch": 0.12663237039968342, + "grad_norm": 145.84716796875, + "learning_rate": 2.4472687939426716e-07, + "loss": 4.4118, + "step": 1600 + }, + { + "epoch": 0.12861100118717847, + "grad_norm": 111.55641174316406, + "learning_rate": 2.4464443535727944e-07, + "loss": 4.819, + "step": 1625 + }, + { + "epoch": 0.13058963197467352, + "grad_norm": 145.68092346191406, + "learning_rate": 2.445619913202918e-07, + "loss": 4.7752, + "step": 1650 + }, + { + "epoch": 0.13256826276216857, + "grad_norm": 274.0830078125, + "learning_rate": 2.4447954728330407e-07, + "loss": 4.8566, + "step": 1675 + }, + { + "epoch": 0.13454689354966362, + "grad_norm": 141.83982849121094, + "learning_rate": 2.4439710324631635e-07, + "loss": 4.6643, + "step": 1700 + }, + { + "epoch": 0.1365255243371587, + "grad_norm": 182.46160888671875, + "learning_rate": 2.443146592093287e-07, + "loss": 4.731, + "step": 1725 + }, + { + "epoch": 0.13850415512465375, + "grad_norm": 200.28773498535156, + "learning_rate": 2.44232215172341e-07, + "loss": 4.5525, + "step": 1750 + }, + { + "epoch": 0.1404827859121488, + "grad_norm": 163.7792510986328, + "learning_rate": 2.441497711353533e-07, + "loss": 4.8076, + "step": 1775 + }, + { + "epoch": 0.14246141669964385, + "grad_norm": 422.9642639160156, + "learning_rate": 2.440673270983656e-07, + "loss": 4.7045, + "step": 1800 + }, + { + "epoch": 0.1444400474871389, + "grad_norm": 187.99957275390625, + "learning_rate": 2.4398488306137794e-07, + "loss": 4.6615, + "step": 1825 + }, + { + "epoch": 0.14641867827463395, + "grad_norm": 144.52732849121094, + "learning_rate": 2.439024390243902e-07, + "loss": 4.7912, + "step": 1850 + }, + { + "epoch": 0.148397309062129, + "grad_norm": 192.0771026611328, + "learning_rate": 2.4381999498740256e-07, + "loss": 4.7916, + "step": 1875 + }, + { + "epoch": 0.15037593984962405, + "grad_norm": 148.06878662109375, + "learning_rate": 2.4373755095041484e-07, + "loss": 4.7782, + "step": 1900 + }, + { + "epoch": 0.1523545706371191, + "grad_norm": 131.4456329345703, + "learning_rate": 2.436551069134272e-07, + "loss": 4.579, + "step": 1925 + }, + { + "epoch": 0.15433320142461418, + "grad_norm": 141.84681701660156, + "learning_rate": 2.4357266287643947e-07, + "loss": 4.5776, + "step": 1950 + }, + { + "epoch": 0.15631183221210923, + "grad_norm": 122.31990051269531, + "learning_rate": 2.4349021883945175e-07, + "loss": 4.5185, + "step": 1975 + }, + { + "epoch": 0.15829046299960428, + "grad_norm": 229.08372497558594, + "learning_rate": 2.434077748024641e-07, + "loss": 4.6352, + "step": 2000 + }, + { + "epoch": 0.16026909378709933, + "grad_norm": 136.54153442382812, + "learning_rate": 2.433253307654764e-07, + "loss": 4.5512, + "step": 2025 + }, + { + "epoch": 0.16224772457459438, + "grad_norm": 237.05514526367188, + "learning_rate": 2.432428867284887e-07, + "loss": 4.7146, + "step": 2050 + }, + { + "epoch": 0.16422635536208943, + "grad_norm": 149.2750244140625, + "learning_rate": 2.43160442691501e-07, + "loss": 4.6935, + "step": 2075 + }, + { + "epoch": 0.16620498614958448, + "grad_norm": 149.77297973632812, + "learning_rate": 2.4307799865451334e-07, + "loss": 4.8223, + "step": 2100 + }, + { + "epoch": 0.16818361693707953, + "grad_norm": 235.3883056640625, + "learning_rate": 2.429955546175256e-07, + "loss": 4.6266, + "step": 2125 + }, + { + "epoch": 0.17016224772457458, + "grad_norm": 137.77316284179688, + "learning_rate": 2.429131105805379e-07, + "loss": 4.8543, + "step": 2150 + }, + { + "epoch": 0.17214087851206966, + "grad_norm": 143.8935089111328, + "learning_rate": 2.4283066654355025e-07, + "loss": 4.651, + "step": 2175 + }, + { + "epoch": 0.1741195092995647, + "grad_norm": 191.43856811523438, + "learning_rate": 2.4274822250656253e-07, + "loss": 4.4166, + "step": 2200 + }, + { + "epoch": 0.17609814008705976, + "grad_norm": 135.82838439941406, + "learning_rate": 2.426657784695748e-07, + "loss": 4.7078, + "step": 2225 + }, + { + "epoch": 0.1780767708745548, + "grad_norm": 114.28646087646484, + "learning_rate": 2.4258333443258715e-07, + "loss": 4.5316, + "step": 2250 + }, + { + "epoch": 0.18005540166204986, + "grad_norm": 237.41001892089844, + "learning_rate": 2.4250089039559944e-07, + "loss": 4.4699, + "step": 2275 + }, + { + "epoch": 0.1820340324495449, + "grad_norm": 124.57892608642578, + "learning_rate": 2.424184463586117e-07, + "loss": 4.5101, + "step": 2300 + }, + { + "epoch": 0.18401266323703996, + "grad_norm": 147.15554809570312, + "learning_rate": 2.4233600232162406e-07, + "loss": 4.5974, + "step": 2325 + }, + { + "epoch": 0.18599129402453501, + "grad_norm": 166.0609588623047, + "learning_rate": 2.4225355828463635e-07, + "loss": 4.5105, + "step": 2350 + }, + { + "epoch": 0.18796992481203006, + "grad_norm": 188.97705078125, + "learning_rate": 2.421711142476487e-07, + "loss": 4.587, + "step": 2375 + }, + { + "epoch": 0.18994855559952512, + "grad_norm": 243.09271240234375, + "learning_rate": 2.4208867021066097e-07, + "loss": 4.7686, + "step": 2400 + }, + { + "epoch": 0.1919271863870202, + "grad_norm": 127.40078735351562, + "learning_rate": 2.420062261736733e-07, + "loss": 4.4476, + "step": 2425 + }, + { + "epoch": 0.19390581717451524, + "grad_norm": 253.8776092529297, + "learning_rate": 2.419237821366856e-07, + "loss": 4.5478, + "step": 2450 + }, + { + "epoch": 0.1958844479620103, + "grad_norm": 123.27115631103516, + "learning_rate": 2.4184133809969793e-07, + "loss": 4.3502, + "step": 2475 + }, + { + "epoch": 0.19786307874950534, + "grad_norm": 138.00375366210938, + "learning_rate": 2.417588940627102e-07, + "loss": 4.3534, + "step": 2500 + }, + { + "epoch": 0.1998417095370004, + "grad_norm": 115.53954315185547, + "learning_rate": 2.4167645002572256e-07, + "loss": 4.7066, + "step": 2525 + }, + { + "epoch": 0.20182034032449545, + "grad_norm": 180.38809204101562, + "learning_rate": 2.4159400598873484e-07, + "loss": 4.6605, + "step": 2550 + }, + { + "epoch": 0.2037989711119905, + "grad_norm": 129.8457489013672, + "learning_rate": 2.415115619517472e-07, + "loss": 4.3849, + "step": 2575 + }, + { + "epoch": 0.20577760189948555, + "grad_norm": 156.64404296875, + "learning_rate": 2.4142911791475946e-07, + "loss": 4.3434, + "step": 2600 + }, + { + "epoch": 0.2077562326869806, + "grad_norm": 162.81320190429688, + "learning_rate": 2.4134667387777175e-07, + "loss": 4.5466, + "step": 2625 + }, + { + "epoch": 0.20973486347447567, + "grad_norm": 128.7244873046875, + "learning_rate": 2.412642298407841e-07, + "loss": 4.5358, + "step": 2650 + }, + { + "epoch": 0.21171349426197072, + "grad_norm": 217.59042358398438, + "learning_rate": 2.4118178580379637e-07, + "loss": 4.5235, + "step": 2675 + }, + { + "epoch": 0.21369212504946578, + "grad_norm": 144.84365844726562, + "learning_rate": 2.410993417668087e-07, + "loss": 4.3811, + "step": 2700 + }, + { + "epoch": 0.21567075583696083, + "grad_norm": 146.22451782226562, + "learning_rate": 2.41016897729821e-07, + "loss": 4.3797, + "step": 2725 + }, + { + "epoch": 0.21764938662445588, + "grad_norm": 198.39772033691406, + "learning_rate": 2.409344536928333e-07, + "loss": 4.4303, + "step": 2750 + }, + { + "epoch": 0.21962801741195093, + "grad_norm": 158.10592651367188, + "learning_rate": 2.408520096558456e-07, + "loss": 4.3633, + "step": 2775 + }, + { + "epoch": 0.22160664819944598, + "grad_norm": 166.79954528808594, + "learning_rate": 2.407695656188579e-07, + "loss": 4.5392, + "step": 2800 + }, + { + "epoch": 0.22358527898694103, + "grad_norm": 207.30593872070312, + "learning_rate": 2.406871215818702e-07, + "loss": 4.5003, + "step": 2825 + }, + { + "epoch": 0.22556390977443608, + "grad_norm": 128.81883239746094, + "learning_rate": 2.4060467754488253e-07, + "loss": 4.5416, + "step": 2850 + }, + { + "epoch": 0.22754254056193116, + "grad_norm": 181.48960876464844, + "learning_rate": 2.405222335078948e-07, + "loss": 4.1725, + "step": 2875 + }, + { + "epoch": 0.2295211713494262, + "grad_norm": 179.47384643554688, + "learning_rate": 2.4043978947090715e-07, + "loss": 4.5229, + "step": 2900 + }, + { + "epoch": 0.23149980213692126, + "grad_norm": 144.242919921875, + "learning_rate": 2.4035734543391943e-07, + "loss": 4.3295, + "step": 2925 + }, + { + "epoch": 0.2334784329244163, + "grad_norm": 177.61968994140625, + "learning_rate": 2.402749013969317e-07, + "loss": 4.4266, + "step": 2950 + }, + { + "epoch": 0.23545706371191136, + "grad_norm": 143.8682861328125, + "learning_rate": 2.4019245735994406e-07, + "loss": 4.2341, + "step": 2975 + }, + { + "epoch": 0.2374356944994064, + "grad_norm": 128.8461151123047, + "learning_rate": 2.4011001332295634e-07, + "loss": 4.3676, + "step": 3000 + }, + { + "epoch": 0.23941432528690146, + "grad_norm": 160.70687866210938, + "learning_rate": 2.400275692859687e-07, + "loss": 4.3945, + "step": 3025 + }, + { + "epoch": 0.2413929560743965, + "grad_norm": 157.65855407714844, + "learning_rate": 2.3994512524898097e-07, + "loss": 4.4967, + "step": 3050 + }, + { + "epoch": 0.24337158686189156, + "grad_norm": 125.79988861083984, + "learning_rate": 2.398626812119933e-07, + "loss": 4.279, + "step": 3075 + }, + { + "epoch": 0.24535021764938664, + "grad_norm": 168.8534698486328, + "learning_rate": 2.397802371750056e-07, + "loss": 4.4813, + "step": 3100 + }, + { + "epoch": 0.2473288484368817, + "grad_norm": 120.4126968383789, + "learning_rate": 2.3969779313801793e-07, + "loss": 4.1997, + "step": 3125 + }, + { + "epoch": 0.24930747922437674, + "grad_norm": 115.56365203857422, + "learning_rate": 2.396153491010302e-07, + "loss": 4.4076, + "step": 3150 + }, + { + "epoch": 0.2512861100118718, + "grad_norm": 152.89859008789062, + "learning_rate": 2.3953290506404255e-07, + "loss": 4.2893, + "step": 3175 + }, + { + "epoch": 0.25326474079936684, + "grad_norm": 177.6272735595703, + "learning_rate": 2.3945046102705484e-07, + "loss": 4.4892, + "step": 3200 + }, + { + "epoch": 0.2552433715868619, + "grad_norm": 131.46661376953125, + "learning_rate": 2.393680169900671e-07, + "loss": 4.2702, + "step": 3225 + }, + { + "epoch": 0.25722200237435694, + "grad_norm": 101.60210418701172, + "learning_rate": 2.3928557295307946e-07, + "loss": 4.2209, + "step": 3250 + }, + { + "epoch": 0.259200633161852, + "grad_norm": 199.7799835205078, + "learning_rate": 2.3920312891609174e-07, + "loss": 4.1502, + "step": 3275 + }, + { + "epoch": 0.26117926394934704, + "grad_norm": 163.44424438476562, + "learning_rate": 2.391206848791041e-07, + "loss": 4.3423, + "step": 3300 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 148.59519958496094, + "learning_rate": 2.3903824084211637e-07, + "loss": 4.4833, + "step": 3325 + }, + { + "epoch": 0.26513652552433714, + "grad_norm": 129.75927734375, + "learning_rate": 2.3895579680512865e-07, + "loss": 4.4745, + "step": 3350 + }, + { + "epoch": 0.2671151563118322, + "grad_norm": 126.6795654296875, + "learning_rate": 2.38873352768141e-07, + "loss": 4.3964, + "step": 3375 + }, + { + "epoch": 0.26909378709932724, + "grad_norm": 157.1032257080078, + "learning_rate": 2.387909087311533e-07, + "loss": 4.3419, + "step": 3400 + }, + { + "epoch": 0.2710724178868223, + "grad_norm": 142.79139709472656, + "learning_rate": 2.3870846469416556e-07, + "loss": 4.2243, + "step": 3425 + }, + { + "epoch": 0.2730510486743174, + "grad_norm": 137.3797607421875, + "learning_rate": 2.386260206571779e-07, + "loss": 4.1661, + "step": 3450 + }, + { + "epoch": 0.27502967946181245, + "grad_norm": 148.77401733398438, + "learning_rate": 2.385435766201902e-07, + "loss": 4.483, + "step": 3475 + }, + { + "epoch": 0.2770083102493075, + "grad_norm": 124.54267120361328, + "learning_rate": 2.384611325832025e-07, + "loss": 4.369, + "step": 3500 + }, + { + "epoch": 0.27898694103680255, + "grad_norm": 113.43370056152344, + "learning_rate": 2.383786885462148e-07, + "loss": 4.1491, + "step": 3525 + }, + { + "epoch": 0.2809655718242976, + "grad_norm": 155.67677307128906, + "learning_rate": 2.3829624450922712e-07, + "loss": 4.3403, + "step": 3550 + }, + { + "epoch": 0.28294420261179265, + "grad_norm": 201.27784729003906, + "learning_rate": 2.3821380047223943e-07, + "loss": 4.3563, + "step": 3575 + }, + { + "epoch": 0.2849228333992877, + "grad_norm": 104.74275970458984, + "learning_rate": 2.3813135643525174e-07, + "loss": 4.2706, + "step": 3600 + }, + { + "epoch": 0.28690146418678275, + "grad_norm": 133.6251678466797, + "learning_rate": 2.3804891239826405e-07, + "loss": 4.3637, + "step": 3625 + }, + { + "epoch": 0.2888800949742778, + "grad_norm": 102.35352325439453, + "learning_rate": 2.3796646836127634e-07, + "loss": 4.2585, + "step": 3650 + }, + { + "epoch": 0.29085872576177285, + "grad_norm": 156.72654724121094, + "learning_rate": 2.3788402432428868e-07, + "loss": 4.3448, + "step": 3675 + }, + { + "epoch": 0.2928373565492679, + "grad_norm": 121.19142150878906, + "learning_rate": 2.3780158028730096e-07, + "loss": 4.1475, + "step": 3700 + }, + { + "epoch": 0.29481598733676295, + "grad_norm": 138.72952270507812, + "learning_rate": 2.3771913625031327e-07, + "loss": 4.2475, + "step": 3725 + }, + { + "epoch": 0.296794618124258, + "grad_norm": 314.35113525390625, + "learning_rate": 2.3763669221332559e-07, + "loss": 4.2643, + "step": 3750 + }, + { + "epoch": 0.29877324891175305, + "grad_norm": 131.71240234375, + "learning_rate": 2.375542481763379e-07, + "loss": 4.2741, + "step": 3775 + }, + { + "epoch": 0.3007518796992481, + "grad_norm": 193.2744598388672, + "learning_rate": 2.374718041393502e-07, + "loss": 4.2314, + "step": 3800 + }, + { + "epoch": 0.30273051048674315, + "grad_norm": 146.98760986328125, + "learning_rate": 2.3738936010236252e-07, + "loss": 4.5421, + "step": 3825 + }, + { + "epoch": 0.3047091412742382, + "grad_norm": 106.49159240722656, + "learning_rate": 2.373069160653748e-07, + "loss": 4.0922, + "step": 3850 + }, + { + "epoch": 0.30668777206173325, + "grad_norm": 128.12686157226562, + "learning_rate": 2.3722447202838712e-07, + "loss": 4.3171, + "step": 3875 + }, + { + "epoch": 0.30866640284922836, + "grad_norm": 165.8458251953125, + "learning_rate": 2.3714202799139943e-07, + "loss": 4.1937, + "step": 3900 + }, + { + "epoch": 0.3106450336367234, + "grad_norm": 129.49652099609375, + "learning_rate": 2.3705958395441171e-07, + "loss": 4.2486, + "step": 3925 + }, + { + "epoch": 0.31262366442421846, + "grad_norm": 113.08882141113281, + "learning_rate": 2.3697713991742405e-07, + "loss": 3.9533, + "step": 3950 + }, + { + "epoch": 0.3146022952117135, + "grad_norm": 116.51021575927734, + "learning_rate": 2.3689469588043634e-07, + "loss": 4.2525, + "step": 3975 + }, + { + "epoch": 0.31658092599920856, + "grad_norm": 95.54279327392578, + "learning_rate": 2.3681225184344867e-07, + "loss": 4.1355, + "step": 4000 + }, + { + "epoch": 0.3185595567867036, + "grad_norm": 123.10621643066406, + "learning_rate": 2.3672980780646096e-07, + "loss": 4.3705, + "step": 4025 + }, + { + "epoch": 0.32053818757419866, + "grad_norm": 142.11273193359375, + "learning_rate": 2.3664736376947327e-07, + "loss": 4.2712, + "step": 4050 + }, + { + "epoch": 0.3225168183616937, + "grad_norm": 162.17141723632812, + "learning_rate": 2.3656491973248558e-07, + "loss": 4.1127, + "step": 4075 + }, + { + "epoch": 0.32449544914918876, + "grad_norm": 160.26893615722656, + "learning_rate": 2.364824756954979e-07, + "loss": 4.2687, + "step": 4100 + }, + { + "epoch": 0.3264740799366838, + "grad_norm": 134.65093994140625, + "learning_rate": 2.3640003165851018e-07, + "loss": 4.3567, + "step": 4125 + }, + { + "epoch": 0.32845271072417886, + "grad_norm": 178.23516845703125, + "learning_rate": 2.3631758762152252e-07, + "loss": 4.097, + "step": 4150 + }, + { + "epoch": 0.3304313415116739, + "grad_norm": 151.1556396484375, + "learning_rate": 2.362351435845348e-07, + "loss": 4.1602, + "step": 4175 + }, + { + "epoch": 0.33240997229916897, + "grad_norm": 154.64442443847656, + "learning_rate": 2.3615269954754711e-07, + "loss": 4.2365, + "step": 4200 + }, + { + "epoch": 0.334388603086664, + "grad_norm": 226.6827850341797, + "learning_rate": 2.3607025551055943e-07, + "loss": 4.3196, + "step": 4225 + }, + { + "epoch": 0.33636723387415907, + "grad_norm": 172.67916870117188, + "learning_rate": 2.359878114735717e-07, + "loss": 4.4476, + "step": 4250 + }, + { + "epoch": 0.3383458646616541, + "grad_norm": 124.78984069824219, + "learning_rate": 2.3590536743658405e-07, + "loss": 4.4006, + "step": 4275 + }, + { + "epoch": 0.34032449544914917, + "grad_norm": 156.81365966796875, + "learning_rate": 2.3582292339959633e-07, + "loss": 4.3914, + "step": 4300 + }, + { + "epoch": 0.3423031262366442, + "grad_norm": 116.53181457519531, + "learning_rate": 2.3574047936260865e-07, + "loss": 4.2846, + "step": 4325 + }, + { + "epoch": 0.3442817570241393, + "grad_norm": 146.16543579101562, + "learning_rate": 2.3565803532562096e-07, + "loss": 4.1371, + "step": 4350 + }, + { + "epoch": 0.3462603878116344, + "grad_norm": 213.07974243164062, + "learning_rate": 2.3557559128863327e-07, + "loss": 4.2294, + "step": 4375 + }, + { + "epoch": 0.3482390185991294, + "grad_norm": 99.38206481933594, + "learning_rate": 2.3549314725164558e-07, + "loss": 4.1726, + "step": 4400 + }, + { + "epoch": 0.3502176493866245, + "grad_norm": 162.97059631347656, + "learning_rate": 2.354107032146579e-07, + "loss": 4.0507, + "step": 4425 + }, + { + "epoch": 0.3521962801741195, + "grad_norm": 132.77474975585938, + "learning_rate": 2.3532825917767018e-07, + "loss": 4.0016, + "step": 4450 + }, + { + "epoch": 0.3541749109616146, + "grad_norm": 126.9658203125, + "learning_rate": 2.3524581514068252e-07, + "loss": 4.2731, + "step": 4475 + }, + { + "epoch": 0.3561535417491096, + "grad_norm": 194.47755432128906, + "learning_rate": 2.351633711036948e-07, + "loss": 4.1119, + "step": 4500 + }, + { + "epoch": 0.3581321725366047, + "grad_norm": 153.6606903076172, + "learning_rate": 2.3508092706670709e-07, + "loss": 4.4556, + "step": 4525 + }, + { + "epoch": 0.3601108033240997, + "grad_norm": 146.66709899902344, + "learning_rate": 2.3499848302971942e-07, + "loss": 4.3314, + "step": 4550 + }, + { + "epoch": 0.3620894341115948, + "grad_norm": 111.01129913330078, + "learning_rate": 2.349160389927317e-07, + "loss": 4.2929, + "step": 4575 + }, + { + "epoch": 0.3640680648990898, + "grad_norm": 137.40582275390625, + "learning_rate": 2.3483359495574405e-07, + "loss": 4.2198, + "step": 4600 + }, + { + "epoch": 0.3660466956865849, + "grad_norm": 142.0623779296875, + "learning_rate": 2.3475115091875633e-07, + "loss": 4.2013, + "step": 4625 + }, + { + "epoch": 0.36802532647407993, + "grad_norm": 135.2795867919922, + "learning_rate": 2.3466870688176864e-07, + "loss": 4.231, + "step": 4650 + }, + { + "epoch": 0.370003957261575, + "grad_norm": 127.59281158447266, + "learning_rate": 2.3458626284478096e-07, + "loss": 3.9613, + "step": 4675 + }, + { + "epoch": 0.37198258804907003, + "grad_norm": 132.48663330078125, + "learning_rate": 2.3450381880779327e-07, + "loss": 4.1925, + "step": 4700 + }, + { + "epoch": 0.3739612188365651, + "grad_norm": 135.35409545898438, + "learning_rate": 2.3442137477080555e-07, + "loss": 4.1828, + "step": 4725 + }, + { + "epoch": 0.37593984962406013, + "grad_norm": 107.55503845214844, + "learning_rate": 2.343389307338179e-07, + "loss": 4.2578, + "step": 4750 + }, + { + "epoch": 0.3779184804115552, + "grad_norm": 132.79620361328125, + "learning_rate": 2.3425648669683018e-07, + "loss": 4.0254, + "step": 4775 + }, + { + "epoch": 0.37989711119905023, + "grad_norm": 123.6044692993164, + "learning_rate": 2.341740426598425e-07, + "loss": 3.9981, + "step": 4800 + }, + { + "epoch": 0.38187574198654534, + "grad_norm": 149.656005859375, + "learning_rate": 2.340915986228548e-07, + "loss": 4.1067, + "step": 4825 + }, + { + "epoch": 0.3838543727740404, + "grad_norm": 122.97380065917969, + "learning_rate": 2.3400915458586708e-07, + "loss": 4.2396, + "step": 4850 + }, + { + "epoch": 0.38583300356153544, + "grad_norm": 140.10183715820312, + "learning_rate": 2.3392671054887942e-07, + "loss": 4.1309, + "step": 4875 + }, + { + "epoch": 0.3878116343490305, + "grad_norm": 137.91583251953125, + "learning_rate": 2.338442665118917e-07, + "loss": 4.0575, + "step": 4900 + }, + { + "epoch": 0.38979026513652554, + "grad_norm": 137.72152709960938, + "learning_rate": 2.3376182247490402e-07, + "loss": 4.159, + "step": 4925 + }, + { + "epoch": 0.3917688959240206, + "grad_norm": 84.3819808959961, + "learning_rate": 2.3367937843791633e-07, + "loss": 4.2611, + "step": 4950 + }, + { + "epoch": 0.39374752671151564, + "grad_norm": 200.3111114501953, + "learning_rate": 2.3359693440092864e-07, + "loss": 4.167, + "step": 4975 + }, + { + "epoch": 0.3957261574990107, + "grad_norm": 123.27460479736328, + "learning_rate": 2.3351449036394095e-07, + "loss": 4.2918, + "step": 5000 + }, + { + "epoch": 0.39770478828650574, + "grad_norm": 111.70620727539062, + "learning_rate": 2.3343204632695327e-07, + "loss": 4.2242, + "step": 5025 + }, + { + "epoch": 0.3996834190740008, + "grad_norm": 107.74165344238281, + "learning_rate": 2.3334960228996555e-07, + "loss": 4.3572, + "step": 5050 + }, + { + "epoch": 0.40166204986149584, + "grad_norm": 138.31423950195312, + "learning_rate": 2.332671582529779e-07, + "loss": 4.1759, + "step": 5075 + }, + { + "epoch": 0.4036406806489909, + "grad_norm": 104.73587799072266, + "learning_rate": 2.3318471421599017e-07, + "loss": 4.1695, + "step": 5100 + }, + { + "epoch": 0.40561931143648594, + "grad_norm": 138.1061553955078, + "learning_rate": 2.3310227017900246e-07, + "loss": 4.0986, + "step": 5125 + }, + { + "epoch": 0.407597942223981, + "grad_norm": 148.92279052734375, + "learning_rate": 2.330198261420148e-07, + "loss": 4.3455, + "step": 5150 + }, + { + "epoch": 0.40957657301147604, + "grad_norm": 321.29852294921875, + "learning_rate": 2.3293738210502708e-07, + "loss": 4.1285, + "step": 5175 + }, + { + "epoch": 0.4115552037989711, + "grad_norm": 114.85989379882812, + "learning_rate": 2.3285493806803942e-07, + "loss": 3.9628, + "step": 5200 + }, + { + "epoch": 0.41353383458646614, + "grad_norm": 137.27610778808594, + "learning_rate": 2.327724940310517e-07, + "loss": 4.1521, + "step": 5225 + }, + { + "epoch": 0.4155124653739612, + "grad_norm": 96.02686309814453, + "learning_rate": 2.3269004999406402e-07, + "loss": 4.027, + "step": 5250 + }, + { + "epoch": 0.4174910961614563, + "grad_norm": 213.81649780273438, + "learning_rate": 2.3260760595707633e-07, + "loss": 4.0522, + "step": 5275 + }, + { + "epoch": 0.41946972694895135, + "grad_norm": 160.4125518798828, + "learning_rate": 2.3252516192008864e-07, + "loss": 4.09, + "step": 5300 + }, + { + "epoch": 0.4214483577364464, + "grad_norm": 167.58741760253906, + "learning_rate": 2.3244271788310093e-07, + "loss": 4.1128, + "step": 5325 + }, + { + "epoch": 0.42342698852394145, + "grad_norm": 159.55303955078125, + "learning_rate": 2.3236027384611326e-07, + "loss": 4.0867, + "step": 5350 + }, + { + "epoch": 0.4254056193114365, + "grad_norm": 122.51324462890625, + "learning_rate": 2.3227782980912555e-07, + "loss": 4.2261, + "step": 5375 + }, + { + "epoch": 0.42738425009893155, + "grad_norm": 185.9108428955078, + "learning_rate": 2.3219538577213789e-07, + "loss": 3.9684, + "step": 5400 + }, + { + "epoch": 0.4293628808864266, + "grad_norm": 195.37579345703125, + "learning_rate": 2.3211294173515017e-07, + "loss": 4.0779, + "step": 5425 + }, + { + "epoch": 0.43134151167392165, + "grad_norm": 157.84371948242188, + "learning_rate": 2.3203049769816246e-07, + "loss": 4.1991, + "step": 5450 + }, + { + "epoch": 0.4333201424614167, + "grad_norm": 111.01512908935547, + "learning_rate": 2.319480536611748e-07, + "loss": 3.9962, + "step": 5475 + }, + { + "epoch": 0.43529877324891175, + "grad_norm": 114.49053955078125, + "learning_rate": 2.3186560962418708e-07, + "loss": 3.8972, + "step": 5500 + }, + { + "epoch": 0.4372774040364068, + "grad_norm": 168.17874145507812, + "learning_rate": 2.317831655871994e-07, + "loss": 4.1913, + "step": 5525 + }, + { + "epoch": 0.43925603482390185, + "grad_norm": 140.61912536621094, + "learning_rate": 2.317007215502117e-07, + "loss": 4.1396, + "step": 5550 + }, + { + "epoch": 0.4412346656113969, + "grad_norm": 138.01805114746094, + "learning_rate": 2.3161827751322401e-07, + "loss": 4.1399, + "step": 5575 + }, + { + "epoch": 0.44321329639889195, + "grad_norm": 188.0181427001953, + "learning_rate": 2.3153583347623633e-07, + "loss": 4.0329, + "step": 5600 + }, + { + "epoch": 0.445191927186387, + "grad_norm": 170.8402099609375, + "learning_rate": 2.3145338943924864e-07, + "loss": 4.3414, + "step": 5625 + }, + { + "epoch": 0.44717055797388205, + "grad_norm": 200.65077209472656, + "learning_rate": 2.3137094540226092e-07, + "loss": 4.2154, + "step": 5650 + }, + { + "epoch": 0.4491491887613771, + "grad_norm": 120.18091583251953, + "learning_rate": 2.3128850136527326e-07, + "loss": 4.0372, + "step": 5675 + }, + { + "epoch": 0.45112781954887216, + "grad_norm": 89.9730224609375, + "learning_rate": 2.3120605732828555e-07, + "loss": 4.1059, + "step": 5700 + }, + { + "epoch": 0.45310645033636726, + "grad_norm": 133.7999267578125, + "learning_rate": 2.3112361329129786e-07, + "loss": 4.2035, + "step": 5725 + }, + { + "epoch": 0.4550850811238623, + "grad_norm": 88.3386459350586, + "learning_rate": 2.3104116925431017e-07, + "loss": 4.0566, + "step": 5750 + }, + { + "epoch": 0.45706371191135736, + "grad_norm": 130.95127868652344, + "learning_rate": 2.3095872521732245e-07, + "loss": 4.3084, + "step": 5775 + }, + { + "epoch": 0.4590423426988524, + "grad_norm": 162.55679321289062, + "learning_rate": 2.308762811803348e-07, + "loss": 4.0288, + "step": 5800 + }, + { + "epoch": 0.46102097348634746, + "grad_norm": 104.4178695678711, + "learning_rate": 2.3079383714334708e-07, + "loss": 3.9244, + "step": 5825 + }, + { + "epoch": 0.4629996042738425, + "grad_norm": 235.28123474121094, + "learning_rate": 2.307113931063594e-07, + "loss": 4.1106, + "step": 5850 + }, + { + "epoch": 0.46497823506133756, + "grad_norm": 289.6645812988281, + "learning_rate": 2.306289490693717e-07, + "loss": 4.0457, + "step": 5875 + }, + { + "epoch": 0.4669568658488326, + "grad_norm": 99.97111511230469, + "learning_rate": 2.30546505032384e-07, + "loss": 4.2542, + "step": 5900 + }, + { + "epoch": 0.46893549663632766, + "grad_norm": 260.0950622558594, + "learning_rate": 2.304640609953963e-07, + "loss": 4.1564, + "step": 5925 + }, + { + "epoch": 0.4709141274238227, + "grad_norm": 113.74392700195312, + "learning_rate": 2.3038161695840864e-07, + "loss": 4.0403, + "step": 5950 + }, + { + "epoch": 0.47289275821131777, + "grad_norm": 79.32340240478516, + "learning_rate": 2.3029917292142092e-07, + "loss": 4.0408, + "step": 5975 + }, + { + "epoch": 0.4748713889988128, + "grad_norm": 95.92308807373047, + "learning_rate": 2.3021672888443326e-07, + "loss": 3.9811, + "step": 6000 + }, + { + "epoch": 0.47685001978630787, + "grad_norm": 94.5758285522461, + "learning_rate": 2.3013428484744554e-07, + "loss": 4.1102, + "step": 6025 + }, + { + "epoch": 0.4788286505738029, + "grad_norm": 142.32131958007812, + "learning_rate": 2.3005184081045786e-07, + "loss": 3.989, + "step": 6050 + }, + { + "epoch": 0.48080728136129797, + "grad_norm": 97.84469604492188, + "learning_rate": 2.2996939677347017e-07, + "loss": 3.9512, + "step": 6075 + }, + { + "epoch": 0.482785912148793, + "grad_norm": 94.38491821289062, + "learning_rate": 2.2988695273648245e-07, + "loss": 3.9475, + "step": 6100 + }, + { + "epoch": 0.48476454293628807, + "grad_norm": 124.32872772216797, + "learning_rate": 2.2980450869949476e-07, + "loss": 4.1352, + "step": 6125 + }, + { + "epoch": 0.4867431737237831, + "grad_norm": 196.1511993408203, + "learning_rate": 2.2972206466250708e-07, + "loss": 4.2956, + "step": 6150 + }, + { + "epoch": 0.48872180451127817, + "grad_norm": 144.1227264404297, + "learning_rate": 2.296396206255194e-07, + "loss": 3.9718, + "step": 6175 + }, + { + "epoch": 0.4907004352987733, + "grad_norm": 115.52275085449219, + "learning_rate": 2.295571765885317e-07, + "loss": 3.9135, + "step": 6200 + }, + { + "epoch": 0.4926790660862683, + "grad_norm": 117.71548461914062, + "learning_rate": 2.29474732551544e-07, + "loss": 3.9026, + "step": 6225 + }, + { + "epoch": 0.4946576968737634, + "grad_norm": 135.42698669433594, + "learning_rate": 2.293922885145563e-07, + "loss": 4.0369, + "step": 6250 + }, + { + "epoch": 0.4966363276612584, + "grad_norm": 142.4741973876953, + "learning_rate": 2.2930984447756863e-07, + "loss": 4.3588, + "step": 6275 + }, + { + "epoch": 0.4986149584487535, + "grad_norm": 128.56195068359375, + "learning_rate": 2.2922740044058092e-07, + "loss": 3.9089, + "step": 6300 + }, + { + "epoch": 0.5005935892362485, + "grad_norm": 96.84894561767578, + "learning_rate": 2.2914495640359323e-07, + "loss": 4.1722, + "step": 6325 + }, + { + "epoch": 0.5025722200237436, + "grad_norm": 236.92965698242188, + "learning_rate": 2.2906251236660554e-07, + "loss": 3.9729, + "step": 6350 + }, + { + "epoch": 0.5045508508112386, + "grad_norm": 135.83609008789062, + "learning_rate": 2.2898006832961783e-07, + "loss": 4.0322, + "step": 6375 + }, + { + "epoch": 0.5065294815987337, + "grad_norm": 123.36375427246094, + "learning_rate": 2.2889762429263017e-07, + "loss": 4.0042, + "step": 6400 + }, + { + "epoch": 0.5085081123862287, + "grad_norm": 118.30574035644531, + "learning_rate": 2.2881518025564245e-07, + "loss": 4.1079, + "step": 6425 + }, + { + "epoch": 0.5104867431737238, + "grad_norm": 107.81358337402344, + "learning_rate": 2.2873273621865476e-07, + "loss": 4.1198, + "step": 6450 + }, + { + "epoch": 0.5124653739612188, + "grad_norm": 146.2493438720703, + "learning_rate": 2.2865029218166707e-07, + "loss": 4.0814, + "step": 6475 + }, + { + "epoch": 0.5144440047487139, + "grad_norm": 136.8212890625, + "learning_rate": 2.2856784814467939e-07, + "loss": 4.0562, + "step": 6500 + }, + { + "epoch": 0.5164226355362089, + "grad_norm": 139.30670166015625, + "learning_rate": 2.2848540410769167e-07, + "loss": 4.1199, + "step": 6525 + }, + { + "epoch": 0.518401266323704, + "grad_norm": 194.90414428710938, + "learning_rate": 2.28402960070704e-07, + "loss": 4.0562, + "step": 6550 + }, + { + "epoch": 0.520379897111199, + "grad_norm": 103.54257202148438, + "learning_rate": 2.283205160337163e-07, + "loss": 4.0797, + "step": 6575 + }, + { + "epoch": 0.5223585278986941, + "grad_norm": 101.63102722167969, + "learning_rate": 2.2823807199672863e-07, + "loss": 4.0591, + "step": 6600 + }, + { + "epoch": 0.5243371586861891, + "grad_norm": 104.28479766845703, + "learning_rate": 2.2815562795974092e-07, + "loss": 3.8991, + "step": 6625 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 166.01107788085938, + "learning_rate": 2.2807318392275323e-07, + "loss": 4.1011, + "step": 6650 + }, + { + "epoch": 0.5282944202611792, + "grad_norm": 154.64959716796875, + "learning_rate": 2.2799073988576554e-07, + "loss": 3.9283, + "step": 6675 + }, + { + "epoch": 0.5302730510486743, + "grad_norm": 96.0099868774414, + "learning_rate": 2.2790829584877782e-07, + "loss": 3.8247, + "step": 6700 + }, + { + "epoch": 0.5322516818361693, + "grad_norm": 120.90514373779297, + "learning_rate": 2.2782585181179014e-07, + "loss": 4.0629, + "step": 6725 + }, + { + "epoch": 0.5342303126236644, + "grad_norm": 106.48863983154297, + "learning_rate": 2.2774340777480245e-07, + "loss": 4.0127, + "step": 6750 + }, + { + "epoch": 0.5362089434111594, + "grad_norm": 113.17047882080078, + "learning_rate": 2.2766096373781476e-07, + "loss": 4.03, + "step": 6775 + }, + { + "epoch": 0.5381875741986545, + "grad_norm": 130.6500701904297, + "learning_rate": 2.2757851970082707e-07, + "loss": 4.0192, + "step": 6800 + }, + { + "epoch": 0.5401662049861495, + "grad_norm": 142.3747100830078, + "learning_rate": 2.2749607566383938e-07, + "loss": 4.1507, + "step": 6825 + }, + { + "epoch": 0.5421448357736446, + "grad_norm": 125.88548278808594, + "learning_rate": 2.2741363162685167e-07, + "loss": 4.2026, + "step": 6850 + }, + { + "epoch": 0.5441234665611397, + "grad_norm": 156.44570922851562, + "learning_rate": 2.27331187589864e-07, + "loss": 4.1063, + "step": 6875 + }, + { + "epoch": 0.5461020973486348, + "grad_norm": 150.82635498046875, + "learning_rate": 2.272487435528763e-07, + "loss": 4.0477, + "step": 6900 + }, + { + "epoch": 0.5480807281361298, + "grad_norm": 170.67994689941406, + "learning_rate": 2.271662995158886e-07, + "loss": 4.1398, + "step": 6925 + }, + { + "epoch": 0.5500593589236249, + "grad_norm": 114.224609375, + "learning_rate": 2.2708385547890091e-07, + "loss": 4.0906, + "step": 6950 + }, + { + "epoch": 0.55203798971112, + "grad_norm": 135.5966033935547, + "learning_rate": 2.2700141144191323e-07, + "loss": 3.872, + "step": 6975 + }, + { + "epoch": 0.554016620498615, + "grad_norm": 120.73974609375, + "learning_rate": 2.2691896740492554e-07, + "loss": 3.9762, + "step": 7000 + }, + { + "epoch": 0.55599525128611, + "grad_norm": 107.66891479492188, + "learning_rate": 2.2683652336793782e-07, + "loss": 4.0551, + "step": 7025 + }, + { + "epoch": 0.5579738820736051, + "grad_norm": 107.60162353515625, + "learning_rate": 2.2675407933095013e-07, + "loss": 3.973, + "step": 7050 + }, + { + "epoch": 0.5599525128611001, + "grad_norm": 118.88258361816406, + "learning_rate": 2.2667163529396245e-07, + "loss": 3.9864, + "step": 7075 + }, + { + "epoch": 0.5619311436485952, + "grad_norm": 148.85667419433594, + "learning_rate": 2.2658919125697476e-07, + "loss": 3.9409, + "step": 7100 + }, + { + "epoch": 0.5639097744360902, + "grad_norm": 148.57321166992188, + "learning_rate": 2.2650674721998704e-07, + "loss": 3.9611, + "step": 7125 + }, + { + "epoch": 0.5658884052235853, + "grad_norm": 172.39999389648438, + "learning_rate": 2.2642430318299938e-07, + "loss": 3.97, + "step": 7150 + }, + { + "epoch": 0.5678670360110804, + "grad_norm": 120.57051086425781, + "learning_rate": 2.2634185914601167e-07, + "loss": 3.9352, + "step": 7175 + }, + { + "epoch": 0.5698456667985754, + "grad_norm": 143.2531280517578, + "learning_rate": 2.26259415109024e-07, + "loss": 3.9686, + "step": 7200 + }, + { + "epoch": 0.5718242975860705, + "grad_norm": 123.57396697998047, + "learning_rate": 2.261769710720363e-07, + "loss": 4.0855, + "step": 7225 + }, + { + "epoch": 0.5738029283735655, + "grad_norm": 115.12631225585938, + "learning_rate": 2.260945270350486e-07, + "loss": 3.9754, + "step": 7250 + }, + { + "epoch": 0.5757815591610606, + "grad_norm": 114.95091247558594, + "learning_rate": 2.260120829980609e-07, + "loss": 3.8981, + "step": 7275 + }, + { + "epoch": 0.5777601899485556, + "grad_norm": 105.46833038330078, + "learning_rate": 2.2592963896107322e-07, + "loss": 3.9452, + "step": 7300 + }, + { + "epoch": 0.5797388207360507, + "grad_norm": 132.89012145996094, + "learning_rate": 2.258471949240855e-07, + "loss": 4.0808, + "step": 7325 + }, + { + "epoch": 0.5817174515235457, + "grad_norm": 143.6460418701172, + "learning_rate": 2.2576475088709782e-07, + "loss": 4.0108, + "step": 7350 + }, + { + "epoch": 0.5836960823110408, + "grad_norm": 130.83352661132812, + "learning_rate": 2.2568230685011013e-07, + "loss": 3.9701, + "step": 7375 + }, + { + "epoch": 0.5856747130985358, + "grad_norm": 111.10405731201172, + "learning_rate": 2.2559986281312244e-07, + "loss": 4.3162, + "step": 7400 + }, + { + "epoch": 0.5876533438860309, + "grad_norm": 163.31959533691406, + "learning_rate": 2.2551741877613476e-07, + "loss": 3.9096, + "step": 7425 + }, + { + "epoch": 0.5896319746735259, + "grad_norm": 134.72927856445312, + "learning_rate": 2.2543497473914704e-07, + "loss": 3.8888, + "step": 7450 + }, + { + "epoch": 0.591610605461021, + "grad_norm": 124.26619720458984, + "learning_rate": 2.2535253070215938e-07, + "loss": 4.0683, + "step": 7475 + }, + { + "epoch": 0.593589236248516, + "grad_norm": 106.8174057006836, + "learning_rate": 2.2527008666517166e-07, + "loss": 4.0547, + "step": 7500 + }, + { + "epoch": 0.5955678670360111, + "grad_norm": 108.11019897460938, + "learning_rate": 2.2518764262818398e-07, + "loss": 3.9728, + "step": 7525 + }, + { + "epoch": 0.5975464978235061, + "grad_norm": 117.44151306152344, + "learning_rate": 2.251051985911963e-07, + "loss": 4.0569, + "step": 7550 + }, + { + "epoch": 0.5995251286110012, + "grad_norm": 106.18008422851562, + "learning_rate": 2.250227545542086e-07, + "loss": 3.9042, + "step": 7575 + }, + { + "epoch": 0.6015037593984962, + "grad_norm": 88.67406463623047, + "learning_rate": 2.249403105172209e-07, + "loss": 4.0719, + "step": 7600 + }, + { + "epoch": 0.6034823901859913, + "grad_norm": 111.12770080566406, + "learning_rate": 2.248578664802332e-07, + "loss": 3.9749, + "step": 7625 + }, + { + "epoch": 0.6054610209734863, + "grad_norm": 119.26530456542969, + "learning_rate": 2.247754224432455e-07, + "loss": 3.9832, + "step": 7650 + }, + { + "epoch": 0.6074396517609814, + "grad_norm": 157.9289093017578, + "learning_rate": 2.2469297840625782e-07, + "loss": 3.9538, + "step": 7675 + }, + { + "epoch": 0.6094182825484764, + "grad_norm": 122.70995330810547, + "learning_rate": 2.2461053436927013e-07, + "loss": 3.8497, + "step": 7700 + }, + { + "epoch": 0.6113969133359715, + "grad_norm": 142.41835021972656, + "learning_rate": 2.2452809033228242e-07, + "loss": 3.9172, + "step": 7725 + }, + { + "epoch": 0.6133755441234665, + "grad_norm": 128.31825256347656, + "learning_rate": 2.2444564629529475e-07, + "loss": 3.7326, + "step": 7750 + }, + { + "epoch": 0.6153541749109616, + "grad_norm": 142.67408752441406, + "learning_rate": 2.2436320225830704e-07, + "loss": 3.8782, + "step": 7775 + }, + { + "epoch": 0.6173328056984567, + "grad_norm": 145.0731658935547, + "learning_rate": 2.2428075822131938e-07, + "loss": 4.024, + "step": 7800 + }, + { + "epoch": 0.6193114364859518, + "grad_norm": 187.09068298339844, + "learning_rate": 2.2419831418433166e-07, + "loss": 3.8939, + "step": 7825 + }, + { + "epoch": 0.6212900672734468, + "grad_norm": 122.93965148925781, + "learning_rate": 2.2411587014734397e-07, + "loss": 4.0373, + "step": 7850 + }, + { + "epoch": 0.6232686980609419, + "grad_norm": 152.1845245361328, + "learning_rate": 2.2403342611035628e-07, + "loss": 4.1168, + "step": 7875 + }, + { + "epoch": 0.6252473288484369, + "grad_norm": 100.07666778564453, + "learning_rate": 2.239509820733686e-07, + "loss": 3.9718, + "step": 7900 + }, + { + "epoch": 0.627225959635932, + "grad_norm": 130.85479736328125, + "learning_rate": 2.2386853803638088e-07, + "loss": 4.0301, + "step": 7925 + }, + { + "epoch": 0.629204590423427, + "grad_norm": 123.073974609375, + "learning_rate": 2.237860939993932e-07, + "loss": 4.0104, + "step": 7950 + }, + { + "epoch": 0.6311832212109221, + "grad_norm": 168.19808959960938, + "learning_rate": 2.237036499624055e-07, + "loss": 3.9421, + "step": 7975 + }, + { + "epoch": 0.6331618519984171, + "grad_norm": 118.69593811035156, + "learning_rate": 2.2362120592541782e-07, + "loss": 3.8238, + "step": 8000 + }, + { + "epoch": 0.6351404827859122, + "grad_norm": 192.9334259033203, + "learning_rate": 2.2353876188843013e-07, + "loss": 3.8227, + "step": 8025 + }, + { + "epoch": 0.6371191135734072, + "grad_norm": 103.11824035644531, + "learning_rate": 2.2345631785144241e-07, + "loss": 3.9359, + "step": 8050 + }, + { + "epoch": 0.6390977443609023, + "grad_norm": 129.3599090576172, + "learning_rate": 2.2337387381445475e-07, + "loss": 4.0562, + "step": 8075 + }, + { + "epoch": 0.6410763751483973, + "grad_norm": 124.06795501708984, + "learning_rate": 2.2329142977746704e-07, + "loss": 4.1502, + "step": 8100 + }, + { + "epoch": 0.6430550059358924, + "grad_norm": 113.18289184570312, + "learning_rate": 2.2320898574047935e-07, + "loss": 4.0059, + "step": 8125 + }, + { + "epoch": 0.6450336367233874, + "grad_norm": 117.89970397949219, + "learning_rate": 2.2312654170349166e-07, + "loss": 4.0162, + "step": 8150 + }, + { + "epoch": 0.6470122675108825, + "grad_norm": 109.6517105102539, + "learning_rate": 2.2304409766650397e-07, + "loss": 3.9979, + "step": 8175 + }, + { + "epoch": 0.6489908982983775, + "grad_norm": 123.35499572753906, + "learning_rate": 2.2296165362951628e-07, + "loss": 3.9273, + "step": 8200 + }, + { + "epoch": 0.6509695290858726, + "grad_norm": 141.97459411621094, + "learning_rate": 2.228792095925286e-07, + "loss": 4.0558, + "step": 8225 + }, + { + "epoch": 0.6529481598733676, + "grad_norm": 159.06973266601562, + "learning_rate": 2.2279676555554088e-07, + "loss": 3.8374, + "step": 8250 + }, + { + "epoch": 0.6549267906608627, + "grad_norm": 120.933837890625, + "learning_rate": 2.227143215185532e-07, + "loss": 3.8412, + "step": 8275 + }, + { + "epoch": 0.6569054214483577, + "grad_norm": 106.266357421875, + "learning_rate": 2.226318774815655e-07, + "loss": 3.8757, + "step": 8300 + }, + { + "epoch": 0.6588840522358528, + "grad_norm": 138.7765655517578, + "learning_rate": 2.225494334445778e-07, + "loss": 4.2284, + "step": 8325 + }, + { + "epoch": 0.6608626830233478, + "grad_norm": 120.76045989990234, + "learning_rate": 2.2246698940759013e-07, + "loss": 3.9127, + "step": 8350 + }, + { + "epoch": 0.6628413138108429, + "grad_norm": 117.31808471679688, + "learning_rate": 2.223845453706024e-07, + "loss": 3.7577, + "step": 8375 + }, + { + "epoch": 0.6648199445983379, + "grad_norm": 108.21405029296875, + "learning_rate": 2.2230210133361475e-07, + "loss": 4.1095, + "step": 8400 + }, + { + "epoch": 0.666798575385833, + "grad_norm": 126.65251159667969, + "learning_rate": 2.2221965729662703e-07, + "loss": 3.9047, + "step": 8425 + }, + { + "epoch": 0.668777206173328, + "grad_norm": 135.06512451171875, + "learning_rate": 2.2213721325963935e-07, + "loss": 3.9267, + "step": 8450 + }, + { + "epoch": 0.6707558369608231, + "grad_norm": 150.37025451660156, + "learning_rate": 2.2205476922265166e-07, + "loss": 3.9662, + "step": 8475 + }, + { + "epoch": 0.6727344677483181, + "grad_norm": 138.01531982421875, + "learning_rate": 2.2197232518566397e-07, + "loss": 3.8107, + "step": 8500 + }, + { + "epoch": 0.6747130985358132, + "grad_norm": 130.35153198242188, + "learning_rate": 2.2188988114867625e-07, + "loss": 3.8068, + "step": 8525 + }, + { + "epoch": 0.6766917293233082, + "grad_norm": 161.9180145263672, + "learning_rate": 2.218074371116886e-07, + "loss": 4.0893, + "step": 8550 + }, + { + "epoch": 0.6786703601108033, + "grad_norm": 165.08409118652344, + "learning_rate": 2.2172499307470088e-07, + "loss": 3.8419, + "step": 8575 + }, + { + "epoch": 0.6806489908982983, + "grad_norm": 153.2915496826172, + "learning_rate": 2.216425490377132e-07, + "loss": 3.9302, + "step": 8600 + }, + { + "epoch": 0.6826276216857934, + "grad_norm": 153.20138549804688, + "learning_rate": 2.215601050007255e-07, + "loss": 3.9947, + "step": 8625 + }, + { + "epoch": 0.6846062524732884, + "grad_norm": 124.32341003417969, + "learning_rate": 2.2147766096373779e-07, + "loss": 3.8241, + "step": 8650 + }, + { + "epoch": 0.6865848832607835, + "grad_norm": 209.813232421875, + "learning_rate": 2.2139521692675012e-07, + "loss": 3.917, + "step": 8675 + }, + { + "epoch": 0.6885635140482786, + "grad_norm": 116.88125610351562, + "learning_rate": 2.213127728897624e-07, + "loss": 3.9474, + "step": 8700 + }, + { + "epoch": 0.6905421448357737, + "grad_norm": 178.58721923828125, + "learning_rate": 2.2123032885277472e-07, + "loss": 3.9247, + "step": 8725 + }, + { + "epoch": 0.6925207756232687, + "grad_norm": 123.67437744140625, + "learning_rate": 2.2114788481578703e-07, + "loss": 3.9548, + "step": 8750 + }, + { + "epoch": 0.6944994064107638, + "grad_norm": 154.60626220703125, + "learning_rate": 2.2106544077879934e-07, + "loss": 3.8239, + "step": 8775 + }, + { + "epoch": 0.6964780371982588, + "grad_norm": 141.65699768066406, + "learning_rate": 2.2098299674181166e-07, + "loss": 3.8458, + "step": 8800 + }, + { + "epoch": 0.6984566679857539, + "grad_norm": 112.9280776977539, + "learning_rate": 2.2090055270482397e-07, + "loss": 3.9648, + "step": 8825 + }, + { + "epoch": 0.700435298773249, + "grad_norm": 154.57643127441406, + "learning_rate": 2.2081810866783625e-07, + "loss": 4.0078, + "step": 8850 + }, + { + "epoch": 0.702413929560744, + "grad_norm": 129.07418823242188, + "learning_rate": 2.207356646308486e-07, + "loss": 4.0276, + "step": 8875 + }, + { + "epoch": 0.704392560348239, + "grad_norm": 113.59859466552734, + "learning_rate": 2.2065322059386088e-07, + "loss": 4.1596, + "step": 8900 + }, + { + "epoch": 0.7063711911357341, + "grad_norm": 136.26283264160156, + "learning_rate": 2.2057077655687316e-07, + "loss": 3.948, + "step": 8925 + }, + { + "epoch": 0.7083498219232292, + "grad_norm": 118.27870178222656, + "learning_rate": 2.204883325198855e-07, + "loss": 3.7611, + "step": 8950 + }, + { + "epoch": 0.7103284527107242, + "grad_norm": 159.56643676757812, + "learning_rate": 2.2040588848289778e-07, + "loss": 4.0215, + "step": 8975 + }, + { + "epoch": 0.7123070834982193, + "grad_norm": 125.84573364257812, + "learning_rate": 2.2032344444591012e-07, + "loss": 4.0046, + "step": 9000 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 148.7548065185547, + "learning_rate": 2.202410004089224e-07, + "loss": 4.0033, + "step": 9025 + }, + { + "epoch": 0.7162643450732094, + "grad_norm": 109.41517639160156, + "learning_rate": 2.2015855637193472e-07, + "loss": 3.9298, + "step": 9050 + }, + { + "epoch": 0.7182429758607044, + "grad_norm": 112.52848815917969, + "learning_rate": 2.2007611233494703e-07, + "loss": 4.1003, + "step": 9075 + }, + { + "epoch": 0.7202216066481995, + "grad_norm": 114.72808074951172, + "learning_rate": 2.1999366829795934e-07, + "loss": 4.1112, + "step": 9100 + }, + { + "epoch": 0.7222002374356945, + "grad_norm": 144.11619567871094, + "learning_rate": 2.1991122426097163e-07, + "loss": 3.764, + "step": 9125 + }, + { + "epoch": 0.7241788682231896, + "grad_norm": 118.64055633544922, + "learning_rate": 2.1982878022398396e-07, + "loss": 3.9262, + "step": 9150 + }, + { + "epoch": 0.7261574990106846, + "grad_norm": 166.79525756835938, + "learning_rate": 2.1974633618699625e-07, + "loss": 4.0518, + "step": 9175 + }, + { + "epoch": 0.7281361297981797, + "grad_norm": 128.2512969970703, + "learning_rate": 2.1966389215000856e-07, + "loss": 3.952, + "step": 9200 + }, + { + "epoch": 0.7301147605856747, + "grad_norm": 143.56414794921875, + "learning_rate": 2.1958144811302087e-07, + "loss": 3.7034, + "step": 9225 + }, + { + "epoch": 0.7320933913731698, + "grad_norm": 120.13394165039062, + "learning_rate": 2.1949900407603316e-07, + "loss": 3.7839, + "step": 9250 + }, + { + "epoch": 0.7340720221606648, + "grad_norm": 148.74070739746094, + "learning_rate": 2.194165600390455e-07, + "loss": 3.8871, + "step": 9275 + }, + { + "epoch": 0.7360506529481599, + "grad_norm": 148.17022705078125, + "learning_rate": 2.1933411600205778e-07, + "loss": 3.7486, + "step": 9300 + }, + { + "epoch": 0.7380292837356549, + "grad_norm": 112.7260513305664, + "learning_rate": 2.192516719650701e-07, + "loss": 3.9436, + "step": 9325 + }, + { + "epoch": 0.74000791452315, + "grad_norm": 131.4718780517578, + "learning_rate": 2.191692279280824e-07, + "loss": 4.1101, + "step": 9350 + }, + { + "epoch": 0.741986545310645, + "grad_norm": 106.73101043701172, + "learning_rate": 2.1908678389109472e-07, + "loss": 3.9285, + "step": 9375 + }, + { + "epoch": 0.7439651760981401, + "grad_norm": 120.58040618896484, + "learning_rate": 2.1900433985410703e-07, + "loss": 3.8471, + "step": 9400 + }, + { + "epoch": 0.7459438068856351, + "grad_norm": 135.69512939453125, + "learning_rate": 2.1892189581711934e-07, + "loss": 3.7629, + "step": 9425 + }, + { + "epoch": 0.7479224376731302, + "grad_norm": 125.78627014160156, + "learning_rate": 2.1883945178013162e-07, + "loss": 4.0646, + "step": 9450 + }, + { + "epoch": 0.7499010684606252, + "grad_norm": 150.2305145263672, + "learning_rate": 2.1875700774314396e-07, + "loss": 3.9361, + "step": 9475 + }, + { + "epoch": 0.7518796992481203, + "grad_norm": 95.4436264038086, + "learning_rate": 2.1867456370615625e-07, + "loss": 3.7688, + "step": 9500 + }, + { + "epoch": 0.7538583300356153, + "grad_norm": 141.27809143066406, + "learning_rate": 2.1859211966916853e-07, + "loss": 4.0217, + "step": 9525 + }, + { + "epoch": 0.7558369608231104, + "grad_norm": 133.8254852294922, + "learning_rate": 2.1850967563218087e-07, + "loss": 4.0683, + "step": 9550 + }, + { + "epoch": 0.7578155916106054, + "grad_norm": 139.919189453125, + "learning_rate": 2.1842723159519316e-07, + "loss": 3.9958, + "step": 9575 + }, + { + "epoch": 0.7597942223981005, + "grad_norm": 173.58946228027344, + "learning_rate": 2.183447875582055e-07, + "loss": 3.9474, + "step": 9600 + }, + { + "epoch": 0.7617728531855956, + "grad_norm": 107.07398223876953, + "learning_rate": 2.1826234352121778e-07, + "loss": 3.8308, + "step": 9625 + }, + { + "epoch": 0.7637514839730907, + "grad_norm": 124.00753784179688, + "learning_rate": 2.181798994842301e-07, + "loss": 3.8218, + "step": 9650 + }, + { + "epoch": 0.7657301147605857, + "grad_norm": 138.23736572265625, + "learning_rate": 2.180974554472424e-07, + "loss": 3.7296, + "step": 9675 + }, + { + "epoch": 0.7677087455480808, + "grad_norm": 128.9496612548828, + "learning_rate": 2.1801501141025471e-07, + "loss": 3.9163, + "step": 9700 + }, + { + "epoch": 0.7696873763355758, + "grad_norm": 108.07875061035156, + "learning_rate": 2.17932567373267e-07, + "loss": 3.9408, + "step": 9725 + }, + { + "epoch": 0.7716660071230709, + "grad_norm": 126.18501281738281, + "learning_rate": 2.1785012333627934e-07, + "loss": 4.1993, + "step": 9750 + }, + { + "epoch": 0.7736446379105659, + "grad_norm": 144.8102264404297, + "learning_rate": 2.1776767929929162e-07, + "loss": 3.877, + "step": 9775 + }, + { + "epoch": 0.775623268698061, + "grad_norm": 118.8504638671875, + "learning_rate": 2.1768523526230396e-07, + "loss": 3.9788, + "step": 9800 + }, + { + "epoch": 0.777601899485556, + "grad_norm": 127.45133209228516, + "learning_rate": 2.1760279122531625e-07, + "loss": 3.8987, + "step": 9825 + }, + { + "epoch": 0.7795805302730511, + "grad_norm": 134.95892333984375, + "learning_rate": 2.1752034718832853e-07, + "loss": 3.8251, + "step": 9850 + }, + { + "epoch": 0.7815591610605461, + "grad_norm": 124.00614929199219, + "learning_rate": 2.1743790315134087e-07, + "loss": 3.6875, + "step": 9875 + }, + { + "epoch": 0.7835377918480412, + "grad_norm": 126.81105041503906, + "learning_rate": 2.1735545911435315e-07, + "loss": 3.7447, + "step": 9900 + }, + { + "epoch": 0.7855164226355362, + "grad_norm": 106.54443359375, + "learning_rate": 2.172730150773655e-07, + "loss": 3.9051, + "step": 9925 + }, + { + "epoch": 0.7874950534230313, + "grad_norm": 156.5098876953125, + "learning_rate": 2.1719057104037778e-07, + "loss": 3.9587, + "step": 9950 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 128.83648681640625, + "learning_rate": 2.171081270033901e-07, + "loss": 4.0755, + "step": 9975 + }, + { + "epoch": 0.7914523149980214, + "grad_norm": 131.54664611816406, + "learning_rate": 2.170256829664024e-07, + "loss": 4.0072, + "step": 10000 + }, + { + "epoch": 0.7914523149980214, + "eval_loss": 3.8681728839874268, + "eval_runtime": 9.5698, + "eval_samples_per_second": 264.165, + "eval_steps_per_second": 33.021, + "step": 10000 + }, + { + "epoch": 0.7934309457855164, + "grad_norm": 130.07545471191406, + "learning_rate": 2.169432389294147e-07, + "loss": 3.7883, + "step": 10025 + }, + { + "epoch": 0.7954095765730115, + "grad_norm": 135.5163116455078, + "learning_rate": 2.16860794892427e-07, + "loss": 3.8028, + "step": 10050 + }, + { + "epoch": 0.7973882073605065, + "grad_norm": 130.33010864257812, + "learning_rate": 2.1677835085543934e-07, + "loss": 3.9204, + "step": 10075 + }, + { + "epoch": 0.7993668381480016, + "grad_norm": 131.47166442871094, + "learning_rate": 2.1669590681845162e-07, + "loss": 3.7961, + "step": 10100 + }, + { + "epoch": 0.8013454689354966, + "grad_norm": 126.78470611572266, + "learning_rate": 2.1661346278146396e-07, + "loss": 3.8249, + "step": 10125 + }, + { + "epoch": 0.8033240997229917, + "grad_norm": 188.20448303222656, + "learning_rate": 2.1653101874447624e-07, + "loss": 3.778, + "step": 10150 + }, + { + "epoch": 0.8053027305104867, + "grad_norm": 137.7311553955078, + "learning_rate": 2.1644857470748853e-07, + "loss": 3.9346, + "step": 10175 + }, + { + "epoch": 0.8072813612979818, + "grad_norm": 89.07192993164062, + "learning_rate": 2.1636613067050087e-07, + "loss": 3.7072, + "step": 10200 + }, + { + "epoch": 0.8092599920854768, + "grad_norm": 96.4891357421875, + "learning_rate": 2.1628368663351315e-07, + "loss": 3.8609, + "step": 10225 + }, + { + "epoch": 0.8112386228729719, + "grad_norm": 144.67372131347656, + "learning_rate": 2.1620124259652546e-07, + "loss": 3.6157, + "step": 10250 + }, + { + "epoch": 0.8132172536604669, + "grad_norm": 115.51387023925781, + "learning_rate": 2.1611879855953778e-07, + "loss": 3.7226, + "step": 10275 + }, + { + "epoch": 0.815195884447962, + "grad_norm": 99.25152587890625, + "learning_rate": 2.160363545225501e-07, + "loss": 3.9583, + "step": 10300 + }, + { + "epoch": 0.817174515235457, + "grad_norm": 129.25799560546875, + "learning_rate": 2.1595391048556237e-07, + "loss": 3.9272, + "step": 10325 + }, + { + "epoch": 0.8191531460229521, + "grad_norm": 142.94081115722656, + "learning_rate": 2.158714664485747e-07, + "loss": 4.0403, + "step": 10350 + }, + { + "epoch": 0.8211317768104471, + "grad_norm": 138.73974609375, + "learning_rate": 2.15789022411587e-07, + "loss": 3.9837, + "step": 10375 + }, + { + "epoch": 0.8231104075979422, + "grad_norm": 128.65940856933594, + "learning_rate": 2.1570657837459933e-07, + "loss": 3.9414, + "step": 10400 + }, + { + "epoch": 0.8250890383854372, + "grad_norm": 98.70417022705078, + "learning_rate": 2.1562413433761162e-07, + "loss": 3.9875, + "step": 10425 + }, + { + "epoch": 0.8270676691729323, + "grad_norm": 95.58749389648438, + "learning_rate": 2.155416903006239e-07, + "loss": 4.0485, + "step": 10450 + }, + { + "epoch": 0.8290462999604273, + "grad_norm": 115.98033905029297, + "learning_rate": 2.1545924626363624e-07, + "loss": 3.9302, + "step": 10475 + }, + { + "epoch": 0.8310249307479224, + "grad_norm": 115.73841094970703, + "learning_rate": 2.1537680222664853e-07, + "loss": 3.8931, + "step": 10500 + }, + { + "epoch": 0.8330035615354174, + "grad_norm": 120.9448013305664, + "learning_rate": 2.1529435818966086e-07, + "loss": 3.9164, + "step": 10525 + }, + { + "epoch": 0.8349821923229126, + "grad_norm": 108.95042419433594, + "learning_rate": 2.1521191415267315e-07, + "loss": 3.8218, + "step": 10550 + }, + { + "epoch": 0.8369608231104076, + "grad_norm": 134.7786407470703, + "learning_rate": 2.1512947011568546e-07, + "loss": 3.8017, + "step": 10575 + }, + { + "epoch": 0.8389394538979027, + "grad_norm": 135.17352294921875, + "learning_rate": 2.1504702607869777e-07, + "loss": 3.7896, + "step": 10600 + }, + { + "epoch": 0.8409180846853977, + "grad_norm": 107.60875701904297, + "learning_rate": 2.1496458204171008e-07, + "loss": 3.8653, + "step": 10625 + }, + { + "epoch": 0.8428967154728928, + "grad_norm": 106.85396575927734, + "learning_rate": 2.1488213800472237e-07, + "loss": 3.7688, + "step": 10650 + }, + { + "epoch": 0.8448753462603878, + "grad_norm": 154.32797241210938, + "learning_rate": 2.147996939677347e-07, + "loss": 3.7928, + "step": 10675 + }, + { + "epoch": 0.8468539770478829, + "grad_norm": 105.45416259765625, + "learning_rate": 2.14717249930747e-07, + "loss": 3.9773, + "step": 10700 + }, + { + "epoch": 0.848832607835378, + "grad_norm": 130.95082092285156, + "learning_rate": 2.1463480589375933e-07, + "loss": 3.9007, + "step": 10725 + }, + { + "epoch": 0.850811238622873, + "grad_norm": 111.79964447021484, + "learning_rate": 2.1455236185677162e-07, + "loss": 3.8503, + "step": 10750 + }, + { + "epoch": 0.852789869410368, + "grad_norm": 111.66275787353516, + "learning_rate": 2.144699178197839e-07, + "loss": 4.1484, + "step": 10775 + }, + { + "epoch": 0.8547685001978631, + "grad_norm": 105.59733581542969, + "learning_rate": 2.1438747378279624e-07, + "loss": 3.8871, + "step": 10800 + }, + { + "epoch": 0.8567471309853582, + "grad_norm": 100.40887451171875, + "learning_rate": 2.1430502974580852e-07, + "loss": 3.8456, + "step": 10825 + }, + { + "epoch": 0.8587257617728532, + "grad_norm": 129.26177978515625, + "learning_rate": 2.1422258570882084e-07, + "loss": 3.673, + "step": 10850 + }, + { + "epoch": 0.8607043925603483, + "grad_norm": 144.7709197998047, + "learning_rate": 2.1414014167183315e-07, + "loss": 3.7465, + "step": 10875 + }, + { + "epoch": 0.8626830233478433, + "grad_norm": 137.7886505126953, + "learning_rate": 2.1405769763484546e-07, + "loss": 3.8944, + "step": 10900 + }, + { + "epoch": 0.8646616541353384, + "grad_norm": 117.49480438232422, + "learning_rate": 2.1397525359785774e-07, + "loss": 3.7995, + "step": 10925 + }, + { + "epoch": 0.8666402849228334, + "grad_norm": 168.4739990234375, + "learning_rate": 2.1389280956087008e-07, + "loss": 4.0448, + "step": 10950 + }, + { + "epoch": 0.8686189157103285, + "grad_norm": 133.91720581054688, + "learning_rate": 2.1381036552388237e-07, + "loss": 3.6778, + "step": 10975 + }, + { + "epoch": 0.8705975464978235, + "grad_norm": 100.49728393554688, + "learning_rate": 2.137279214868947e-07, + "loss": 3.9251, + "step": 11000 + }, + { + "epoch": 0.8725761772853186, + "grad_norm": 189.62327575683594, + "learning_rate": 2.13645477449907e-07, + "loss": 3.8032, + "step": 11025 + }, + { + "epoch": 0.8745548080728136, + "grad_norm": 108.47595977783203, + "learning_rate": 2.135630334129193e-07, + "loss": 3.92, + "step": 11050 + }, + { + "epoch": 0.8765334388603087, + "grad_norm": 150.95584106445312, + "learning_rate": 2.1348058937593161e-07, + "loss": 3.9559, + "step": 11075 + }, + { + "epoch": 0.8785120696478037, + "grad_norm": 126.86693572998047, + "learning_rate": 2.133981453389439e-07, + "loss": 3.9322, + "step": 11100 + }, + { + "epoch": 0.8804907004352988, + "grad_norm": 158.07049560546875, + "learning_rate": 2.1331570130195624e-07, + "loss": 3.7498, + "step": 11125 + }, + { + "epoch": 0.8824693312227938, + "grad_norm": 156.98699951171875, + "learning_rate": 2.1323325726496852e-07, + "loss": 3.9679, + "step": 11150 + }, + { + "epoch": 0.8844479620102889, + "grad_norm": 91.1626968383789, + "learning_rate": 2.1315081322798083e-07, + "loss": 3.9669, + "step": 11175 + }, + { + "epoch": 0.8864265927977839, + "grad_norm": 145.4545135498047, + "learning_rate": 2.1306836919099315e-07, + "loss": 3.9038, + "step": 11200 + }, + { + "epoch": 0.888405223585279, + "grad_norm": 134.78793334960938, + "learning_rate": 2.1298592515400546e-07, + "loss": 3.9122, + "step": 11225 + }, + { + "epoch": 0.890383854372774, + "grad_norm": 132.1155242919922, + "learning_rate": 2.1290348111701774e-07, + "loss": 4.0016, + "step": 11250 + }, + { + "epoch": 0.8923624851602691, + "grad_norm": 116.72430419921875, + "learning_rate": 2.1282103708003008e-07, + "loss": 3.9878, + "step": 11275 + }, + { + "epoch": 0.8943411159477641, + "grad_norm": 117.89616394042969, + "learning_rate": 2.1273859304304237e-07, + "loss": 3.7834, + "step": 11300 + }, + { + "epoch": 0.8963197467352592, + "grad_norm": 151.65277099609375, + "learning_rate": 2.126561490060547e-07, + "loss": 3.9696, + "step": 11325 + }, + { + "epoch": 0.8982983775227542, + "grad_norm": 139.39405822753906, + "learning_rate": 2.12573704969067e-07, + "loss": 3.9585, + "step": 11350 + }, + { + "epoch": 0.9002770083102493, + "grad_norm": 123.25849914550781, + "learning_rate": 2.124912609320793e-07, + "loss": 3.816, + "step": 11375 + }, + { + "epoch": 0.9022556390977443, + "grad_norm": 148.54562377929688, + "learning_rate": 2.124088168950916e-07, + "loss": 3.8433, + "step": 11400 + }, + { + "epoch": 0.9042342698852394, + "grad_norm": 135.60752868652344, + "learning_rate": 2.123263728581039e-07, + "loss": 3.7851, + "step": 11425 + }, + { + "epoch": 0.9062129006727345, + "grad_norm": 129.5711212158203, + "learning_rate": 2.122439288211162e-07, + "loss": 3.7133, + "step": 11450 + }, + { + "epoch": 0.9081915314602296, + "grad_norm": 136.88392639160156, + "learning_rate": 2.1216148478412852e-07, + "loss": 3.9864, + "step": 11475 + }, + { + "epoch": 0.9101701622477246, + "grad_norm": 148.6637725830078, + "learning_rate": 2.1207904074714083e-07, + "loss": 3.6906, + "step": 11500 + }, + { + "epoch": 0.9121487930352197, + "grad_norm": 164.6747283935547, + "learning_rate": 2.1199659671015314e-07, + "loss": 3.8543, + "step": 11525 + }, + { + "epoch": 0.9141274238227147, + "grad_norm": 114.64603424072266, + "learning_rate": 2.1191415267316546e-07, + "loss": 3.6973, + "step": 11550 + }, + { + "epoch": 0.9161060546102098, + "grad_norm": 129.53265380859375, + "learning_rate": 2.1183170863617774e-07, + "loss": 3.5896, + "step": 11575 + }, + { + "epoch": 0.9180846853977048, + "grad_norm": 145.63973999023438, + "learning_rate": 2.1174926459919008e-07, + "loss": 3.963, + "step": 11600 + }, + { + "epoch": 0.9200633161851999, + "grad_norm": 132.06729125976562, + "learning_rate": 2.1166682056220236e-07, + "loss": 3.6703, + "step": 11625 + }, + { + "epoch": 0.9220419469726949, + "grad_norm": 129.76583862304688, + "learning_rate": 2.1158437652521468e-07, + "loss": 3.6965, + "step": 11650 + }, + { + "epoch": 0.92402057776019, + "grad_norm": 115.21681213378906, + "learning_rate": 2.1150193248822699e-07, + "loss": 3.8711, + "step": 11675 + }, + { + "epoch": 0.925999208547685, + "grad_norm": 85.92119598388672, + "learning_rate": 2.114194884512393e-07, + "loss": 3.8377, + "step": 11700 + }, + { + "epoch": 0.9279778393351801, + "grad_norm": 117.11397552490234, + "learning_rate": 2.113370444142516e-07, + "loss": 3.7965, + "step": 11725 + }, + { + "epoch": 0.9299564701226751, + "grad_norm": 113.50032806396484, + "learning_rate": 2.112546003772639e-07, + "loss": 3.7684, + "step": 11750 + }, + { + "epoch": 0.9319351009101702, + "grad_norm": 107.85367584228516, + "learning_rate": 2.111721563402762e-07, + "loss": 3.6988, + "step": 11775 + }, + { + "epoch": 0.9339137316976652, + "grad_norm": 121.42901611328125, + "learning_rate": 2.1108971230328852e-07, + "loss": 3.6745, + "step": 11800 + }, + { + "epoch": 0.9358923624851603, + "grad_norm": 139.971923828125, + "learning_rate": 2.1100726826630083e-07, + "loss": 3.8403, + "step": 11825 + }, + { + "epoch": 0.9378709932726553, + "grad_norm": 126.40741729736328, + "learning_rate": 2.1092482422931312e-07, + "loss": 3.7846, + "step": 11850 + }, + { + "epoch": 0.9398496240601504, + "grad_norm": 110.96858215332031, + "learning_rate": 2.1084238019232545e-07, + "loss": 3.9256, + "step": 11875 + }, + { + "epoch": 0.9418282548476454, + "grad_norm": 107.72772979736328, + "learning_rate": 2.1075993615533774e-07, + "loss": 3.7888, + "step": 11900 + }, + { + "epoch": 0.9438068856351405, + "grad_norm": 94.70952606201172, + "learning_rate": 2.1067749211835008e-07, + "loss": 3.8496, + "step": 11925 + }, + { + "epoch": 0.9457855164226355, + "grad_norm": 178.84500122070312, + "learning_rate": 2.1059504808136236e-07, + "loss": 3.737, + "step": 11950 + }, + { + "epoch": 0.9477641472101306, + "grad_norm": 179.0364227294922, + "learning_rate": 2.1051260404437467e-07, + "loss": 3.7584, + "step": 11975 + }, + { + "epoch": 0.9497427779976256, + "grad_norm": 141.7266845703125, + "learning_rate": 2.1043016000738698e-07, + "loss": 3.7948, + "step": 12000 + }, + { + "epoch": 0.9517214087851207, + "grad_norm": 120.56340026855469, + "learning_rate": 2.1034771597039927e-07, + "loss": 3.8951, + "step": 12025 + }, + { + "epoch": 0.9537000395726157, + "grad_norm": 113.90511322021484, + "learning_rate": 2.1026527193341158e-07, + "loss": 3.8823, + "step": 12050 + }, + { + "epoch": 0.9556786703601108, + "grad_norm": 103.57383728027344, + "learning_rate": 2.101828278964239e-07, + "loss": 3.7853, + "step": 12075 + }, + { + "epoch": 0.9576573011476058, + "grad_norm": 111.0738525390625, + "learning_rate": 2.101003838594362e-07, + "loss": 3.8394, + "step": 12100 + }, + { + "epoch": 0.9596359319351009, + "grad_norm": 130.9629364013672, + "learning_rate": 2.1001793982244852e-07, + "loss": 3.7752, + "step": 12125 + }, + { + "epoch": 0.9616145627225959, + "grad_norm": 98.29790496826172, + "learning_rate": 2.0993549578546083e-07, + "loss": 3.8332, + "step": 12150 + }, + { + "epoch": 0.963593193510091, + "grad_norm": 116.66043853759766, + "learning_rate": 2.098530517484731e-07, + "loss": 3.964, + "step": 12175 + }, + { + "epoch": 0.965571824297586, + "grad_norm": 179.0226287841797, + "learning_rate": 2.0977060771148545e-07, + "loss": 3.8848, + "step": 12200 + }, + { + "epoch": 0.9675504550850811, + "grad_norm": 100.40991973876953, + "learning_rate": 2.0968816367449774e-07, + "loss": 3.8461, + "step": 12225 + }, + { + "epoch": 0.9695290858725761, + "grad_norm": 117.44912719726562, + "learning_rate": 2.0960571963751005e-07, + "loss": 3.723, + "step": 12250 + }, + { + "epoch": 0.9715077166600712, + "grad_norm": 106.5282974243164, + "learning_rate": 2.0952327560052236e-07, + "loss": 3.8557, + "step": 12275 + }, + { + "epoch": 0.9734863474475662, + "grad_norm": 111.7376480102539, + "learning_rate": 2.0944083156353467e-07, + "loss": 3.8505, + "step": 12300 + }, + { + "epoch": 0.9754649782350613, + "grad_norm": 185.3255157470703, + "learning_rate": 2.0935838752654698e-07, + "loss": 3.9339, + "step": 12325 + }, + { + "epoch": 0.9774436090225563, + "grad_norm": 128.0303955078125, + "learning_rate": 2.0927594348955927e-07, + "loss": 3.9595, + "step": 12350 + }, + { + "epoch": 0.9794222398100515, + "grad_norm": 117.01809692382812, + "learning_rate": 2.0919349945257158e-07, + "loss": 3.8385, + "step": 12375 + }, + { + "epoch": 0.9814008705975465, + "grad_norm": 132.46775817871094, + "learning_rate": 2.091110554155839e-07, + "loss": 3.6976, + "step": 12400 + }, + { + "epoch": 0.9833795013850416, + "grad_norm": 125.96062469482422, + "learning_rate": 2.090286113785962e-07, + "loss": 3.9652, + "step": 12425 + }, + { + "epoch": 0.9853581321725366, + "grad_norm": 172.13600158691406, + "learning_rate": 2.089461673416085e-07, + "loss": 3.9026, + "step": 12450 + }, + { + "epoch": 0.9873367629600317, + "grad_norm": 107.95278930664062, + "learning_rate": 2.0886372330462083e-07, + "loss": 3.8457, + "step": 12475 + }, + { + "epoch": 0.9893153937475268, + "grad_norm": 196.08303833007812, + "learning_rate": 2.087812792676331e-07, + "loss": 3.8319, + "step": 12500 + }, + { + "epoch": 0.9912940245350218, + "grad_norm": 119.07249450683594, + "learning_rate": 2.0869883523064545e-07, + "loss": 3.7464, + "step": 12525 + }, + { + "epoch": 0.9932726553225169, + "grad_norm": 118.91383361816406, + "learning_rate": 2.0861639119365773e-07, + "loss": 3.7734, + "step": 12550 + }, + { + "epoch": 0.9952512861100119, + "grad_norm": 122.37627410888672, + "learning_rate": 2.0853394715667005e-07, + "loss": 3.9263, + "step": 12575 + }, + { + "epoch": 0.997229916897507, + "grad_norm": 101.24188232421875, + "learning_rate": 2.0845150311968236e-07, + "loss": 3.6865, + "step": 12600 + }, + { + "epoch": 0.999208547685002, + "grad_norm": 253.74789428710938, + "learning_rate": 2.0836905908269467e-07, + "loss": 3.857, + "step": 12625 + }, + { + "epoch": 1.001187178472497, + "grad_norm": 121.46297454833984, + "learning_rate": 2.0828661504570695e-07, + "loss": 3.9145, + "step": 12650 + }, + { + "epoch": 1.003165809259992, + "grad_norm": 115.41104125976562, + "learning_rate": 2.0820417100871927e-07, + "loss": 3.9441, + "step": 12675 + }, + { + "epoch": 1.0051444400474872, + "grad_norm": 105.03096008300781, + "learning_rate": 2.0812172697173158e-07, + "loss": 3.6926, + "step": 12700 + }, + { + "epoch": 1.007123070834982, + "grad_norm": 98.86603546142578, + "learning_rate": 2.080392829347439e-07, + "loss": 3.5524, + "step": 12725 + }, + { + "epoch": 1.0091017016224773, + "grad_norm": 120.75003814697266, + "learning_rate": 2.079568388977562e-07, + "loss": 3.8494, + "step": 12750 + }, + { + "epoch": 1.0110803324099722, + "grad_norm": 86.03683471679688, + "learning_rate": 2.0787439486076849e-07, + "loss": 3.7635, + "step": 12775 + }, + { + "epoch": 1.0130589631974674, + "grad_norm": 132.28945922851562, + "learning_rate": 2.0779195082378082e-07, + "loss": 4.007, + "step": 12800 + }, + { + "epoch": 1.0150375939849625, + "grad_norm": 129.51051330566406, + "learning_rate": 2.077095067867931e-07, + "loss": 3.6065, + "step": 12825 + }, + { + "epoch": 1.0170162247724575, + "grad_norm": 103.38876342773438, + "learning_rate": 2.0762706274980542e-07, + "loss": 3.7508, + "step": 12850 + }, + { + "epoch": 1.0189948555599526, + "grad_norm": 163.7755584716797, + "learning_rate": 2.0754461871281773e-07, + "loss": 3.7519, + "step": 12875 + }, + { + "epoch": 1.0209734863474476, + "grad_norm": 123.19217681884766, + "learning_rate": 2.0746217467583004e-07, + "loss": 3.8441, + "step": 12900 + }, + { + "epoch": 1.0229521171349427, + "grad_norm": 122.48994445800781, + "learning_rate": 2.0737973063884236e-07, + "loss": 3.8664, + "step": 12925 + }, + { + "epoch": 1.0249307479224377, + "grad_norm": 97.31283569335938, + "learning_rate": 2.0729728660185467e-07, + "loss": 3.9103, + "step": 12950 + }, + { + "epoch": 1.0269093787099328, + "grad_norm": 178.35073852539062, + "learning_rate": 2.0721484256486695e-07, + "loss": 3.8718, + "step": 12975 + }, + { + "epoch": 1.0288880094974278, + "grad_norm": 124.15072631835938, + "learning_rate": 2.0713239852787926e-07, + "loss": 3.7962, + "step": 13000 + }, + { + "epoch": 1.030866640284923, + "grad_norm": 141.29360961914062, + "learning_rate": 2.0704995449089158e-07, + "loss": 3.8207, + "step": 13025 + }, + { + "epoch": 1.0328452710724179, + "grad_norm": 117.48384857177734, + "learning_rate": 2.0696751045390386e-07, + "loss": 4.0525, + "step": 13050 + }, + { + "epoch": 1.034823901859913, + "grad_norm": 124.6396713256836, + "learning_rate": 2.068850664169162e-07, + "loss": 3.8003, + "step": 13075 + }, + { + "epoch": 1.036802532647408, + "grad_norm": 164.6786346435547, + "learning_rate": 2.0680262237992848e-07, + "loss": 3.6338, + "step": 13100 + }, + { + "epoch": 1.0387811634349031, + "grad_norm": 144.25732421875, + "learning_rate": 2.0672017834294082e-07, + "loss": 3.9446, + "step": 13125 + }, + { + "epoch": 1.040759794222398, + "grad_norm": 130.24571228027344, + "learning_rate": 2.066377343059531e-07, + "loss": 3.9012, + "step": 13150 + }, + { + "epoch": 1.0427384250098932, + "grad_norm": 122.11434173583984, + "learning_rate": 2.0655529026896542e-07, + "loss": 3.7799, + "step": 13175 + }, + { + "epoch": 1.0447170557973882, + "grad_norm": 161.62745666503906, + "learning_rate": 2.0647284623197773e-07, + "loss": 3.8881, + "step": 13200 + }, + { + "epoch": 1.0466956865848833, + "grad_norm": 129.13072204589844, + "learning_rate": 2.0639040219499004e-07, + "loss": 3.8295, + "step": 13225 + }, + { + "epoch": 1.0486743173723783, + "grad_norm": 153.86805725097656, + "learning_rate": 2.0630795815800233e-07, + "loss": 3.7328, + "step": 13250 + }, + { + "epoch": 1.0506529481598734, + "grad_norm": 173.9022979736328, + "learning_rate": 2.0622551412101464e-07, + "loss": 3.9866, + "step": 13275 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 104.5372543334961, + "learning_rate": 2.0614307008402695e-07, + "loss": 3.6121, + "step": 13300 + }, + { + "epoch": 1.0546102097348635, + "grad_norm": 115.93688201904297, + "learning_rate": 2.0606062604703926e-07, + "loss": 3.8851, + "step": 13325 + }, + { + "epoch": 1.0565888405223585, + "grad_norm": 140.26748657226562, + "learning_rate": 2.0597818201005157e-07, + "loss": 3.9924, + "step": 13350 + }, + { + "epoch": 1.0585674713098536, + "grad_norm": 122.87477111816406, + "learning_rate": 2.0589573797306386e-07, + "loss": 3.7768, + "step": 13375 + }, + { + "epoch": 1.0605461020973486, + "grad_norm": 105.3402099609375, + "learning_rate": 2.058132939360762e-07, + "loss": 3.6371, + "step": 13400 + }, + { + "epoch": 1.0625247328848437, + "grad_norm": 112.1316146850586, + "learning_rate": 2.0573084989908848e-07, + "loss": 3.8185, + "step": 13425 + }, + { + "epoch": 1.0645033636723387, + "grad_norm": 105.67694854736328, + "learning_rate": 2.056484058621008e-07, + "loss": 3.7202, + "step": 13450 + }, + { + "epoch": 1.0664819944598338, + "grad_norm": 160.9032440185547, + "learning_rate": 2.055659618251131e-07, + "loss": 3.6819, + "step": 13475 + }, + { + "epoch": 1.0684606252473288, + "grad_norm": 197.6964569091797, + "learning_rate": 2.0548351778812542e-07, + "loss": 3.914, + "step": 13500 + }, + { + "epoch": 1.070439256034824, + "grad_norm": 100.68509674072266, + "learning_rate": 2.0540107375113773e-07, + "loss": 3.7281, + "step": 13525 + }, + { + "epoch": 1.0724178868223189, + "grad_norm": 121.31140899658203, + "learning_rate": 2.0531862971415004e-07, + "loss": 3.7413, + "step": 13550 + }, + { + "epoch": 1.074396517609814, + "grad_norm": 176.4757080078125, + "learning_rate": 2.0523618567716232e-07, + "loss": 4.1066, + "step": 13575 + }, + { + "epoch": 1.076375148397309, + "grad_norm": 98.44918823242188, + "learning_rate": 2.0515374164017464e-07, + "loss": 3.6721, + "step": 13600 + }, + { + "epoch": 1.0783537791848041, + "grad_norm": 114.9334487915039, + "learning_rate": 2.0507129760318695e-07, + "loss": 3.7888, + "step": 13625 + }, + { + "epoch": 1.080332409972299, + "grad_norm": 172.9710235595703, + "learning_rate": 2.0498885356619923e-07, + "loss": 3.8811, + "step": 13650 + }, + { + "epoch": 1.0823110407597942, + "grad_norm": 77.68281555175781, + "learning_rate": 2.0490640952921157e-07, + "loss": 3.7865, + "step": 13675 + }, + { + "epoch": 1.0842896715472894, + "grad_norm": 117.03499603271484, + "learning_rate": 2.0482396549222386e-07, + "loss": 3.6545, + "step": 13700 + }, + { + "epoch": 1.0862683023347843, + "grad_norm": 111.88362121582031, + "learning_rate": 2.047415214552362e-07, + "loss": 3.7833, + "step": 13725 + }, + { + "epoch": 1.0882469331222793, + "grad_norm": 177.05654907226562, + "learning_rate": 2.0465907741824848e-07, + "loss": 3.8182, + "step": 13750 + }, + { + "epoch": 1.0902255639097744, + "grad_norm": 110.29389953613281, + "learning_rate": 2.045766333812608e-07, + "loss": 3.571, + "step": 13775 + }, + { + "epoch": 1.0922041946972696, + "grad_norm": 81.29917907714844, + "learning_rate": 2.044941893442731e-07, + "loss": 3.7003, + "step": 13800 + }, + { + "epoch": 1.0941828254847645, + "grad_norm": 130.32589721679688, + "learning_rate": 2.0441174530728541e-07, + "loss": 3.965, + "step": 13825 + }, + { + "epoch": 1.0961614562722597, + "grad_norm": 134.07106018066406, + "learning_rate": 2.043293012702977e-07, + "loss": 4.0457, + "step": 13850 + }, + { + "epoch": 1.0981400870597546, + "grad_norm": 204.80294799804688, + "learning_rate": 2.0424685723331004e-07, + "loss": 3.5721, + "step": 13875 + }, + { + "epoch": 1.1001187178472498, + "grad_norm": 157.00570678710938, + "learning_rate": 2.0416441319632232e-07, + "loss": 3.8119, + "step": 13900 + }, + { + "epoch": 1.1020973486347447, + "grad_norm": 118.97384643554688, + "learning_rate": 2.0408196915933463e-07, + "loss": 3.6172, + "step": 13925 + }, + { + "epoch": 1.10407597942224, + "grad_norm": 147.43740844726562, + "learning_rate": 2.0399952512234695e-07, + "loss": 3.8369, + "step": 13950 + }, + { + "epoch": 1.1060546102097348, + "grad_norm": 116.36746215820312, + "learning_rate": 2.0391708108535923e-07, + "loss": 3.8669, + "step": 13975 + }, + { + "epoch": 1.10803324099723, + "grad_norm": 119.54280090332031, + "learning_rate": 2.0383463704837157e-07, + "loss": 3.8281, + "step": 14000 + }, + { + "epoch": 1.110011871784725, + "grad_norm": 161.6844024658203, + "learning_rate": 2.0375219301138385e-07, + "loss": 3.632, + "step": 14025 + }, + { + "epoch": 1.11199050257222, + "grad_norm": 115.4676284790039, + "learning_rate": 2.0366974897439617e-07, + "loss": 3.6665, + "step": 14050 + }, + { + "epoch": 1.113969133359715, + "grad_norm": 149.53504943847656, + "learning_rate": 2.0358730493740848e-07, + "loss": 3.8808, + "step": 14075 + }, + { + "epoch": 1.1159477641472102, + "grad_norm": 108.68144989013672, + "learning_rate": 2.035048609004208e-07, + "loss": 3.763, + "step": 14100 + }, + { + "epoch": 1.1179263949347051, + "grad_norm": 140.69000244140625, + "learning_rate": 2.034224168634331e-07, + "loss": 3.8483, + "step": 14125 + }, + { + "epoch": 1.1199050257222003, + "grad_norm": 116.24478149414062, + "learning_rate": 2.033399728264454e-07, + "loss": 3.6232, + "step": 14150 + }, + { + "epoch": 1.1218836565096952, + "grad_norm": 151.31192016601562, + "learning_rate": 2.032575287894577e-07, + "loss": 3.9463, + "step": 14175 + }, + { + "epoch": 1.1238622872971904, + "grad_norm": 104.94828796386719, + "learning_rate": 2.0317508475247004e-07, + "loss": 3.8141, + "step": 14200 + }, + { + "epoch": 1.1258409180846853, + "grad_norm": 174.0530548095703, + "learning_rate": 2.0309264071548232e-07, + "loss": 3.7278, + "step": 14225 + }, + { + "epoch": 1.1278195488721805, + "grad_norm": 123.31523895263672, + "learning_rate": 2.030101966784946e-07, + "loss": 3.6594, + "step": 14250 + }, + { + "epoch": 1.1297981796596754, + "grad_norm": 199.5244140625, + "learning_rate": 2.0292775264150694e-07, + "loss": 3.7397, + "step": 14275 + }, + { + "epoch": 1.1317768104471706, + "grad_norm": 121.85034942626953, + "learning_rate": 2.0284530860451923e-07, + "loss": 3.7268, + "step": 14300 + }, + { + "epoch": 1.1337554412346655, + "grad_norm": 100.23025512695312, + "learning_rate": 2.0276286456753157e-07, + "loss": 3.6129, + "step": 14325 + }, + { + "epoch": 1.1357340720221607, + "grad_norm": 194.6273651123047, + "learning_rate": 2.0268042053054385e-07, + "loss": 3.8529, + "step": 14350 + }, + { + "epoch": 1.1377127028096556, + "grad_norm": 86.26454162597656, + "learning_rate": 2.0259797649355616e-07, + "loss": 3.7558, + "step": 14375 + }, + { + "epoch": 1.1396913335971508, + "grad_norm": 94.16887664794922, + "learning_rate": 2.0251553245656847e-07, + "loss": 3.743, + "step": 14400 + }, + { + "epoch": 1.1416699643846457, + "grad_norm": 112.39076232910156, + "learning_rate": 2.0243308841958079e-07, + "loss": 3.6799, + "step": 14425 + }, + { + "epoch": 1.143648595172141, + "grad_norm": 146.2572479248047, + "learning_rate": 2.0235064438259307e-07, + "loss": 3.8062, + "step": 14450 + }, + { + "epoch": 1.1456272259596358, + "grad_norm": 100.57402801513672, + "learning_rate": 2.022682003456054e-07, + "loss": 3.7005, + "step": 14475 + }, + { + "epoch": 1.147605856747131, + "grad_norm": 110.64191436767578, + "learning_rate": 2.021857563086177e-07, + "loss": 4.007, + "step": 14500 + }, + { + "epoch": 1.149584487534626, + "grad_norm": 182.17724609375, + "learning_rate": 2.0210331227163003e-07, + "loss": 3.782, + "step": 14525 + }, + { + "epoch": 1.151563118322121, + "grad_norm": 135.72979736328125, + "learning_rate": 2.0202086823464232e-07, + "loss": 3.6697, + "step": 14550 + }, + { + "epoch": 1.1535417491096163, + "grad_norm": 101.66889953613281, + "learning_rate": 2.019384241976546e-07, + "loss": 3.7594, + "step": 14575 + }, + { + "epoch": 1.1555203798971112, + "grad_norm": 160.48109436035156, + "learning_rate": 2.0185598016066694e-07, + "loss": 3.7643, + "step": 14600 + }, + { + "epoch": 1.1574990106846061, + "grad_norm": 104.34113311767578, + "learning_rate": 2.0177353612367923e-07, + "loss": 3.9202, + "step": 14625 + }, + { + "epoch": 1.1594776414721013, + "grad_norm": 89.21700286865234, + "learning_rate": 2.0169109208669154e-07, + "loss": 3.5803, + "step": 14650 + }, + { + "epoch": 1.1614562722595965, + "grad_norm": 111.87966918945312, + "learning_rate": 2.0160864804970385e-07, + "loss": 3.5052, + "step": 14675 + }, + { + "epoch": 1.1634349030470914, + "grad_norm": 134.26589965820312, + "learning_rate": 2.0152620401271616e-07, + "loss": 3.7542, + "step": 14700 + }, + { + "epoch": 1.1654135338345863, + "grad_norm": 93.41925811767578, + "learning_rate": 2.0144375997572847e-07, + "loss": 3.8865, + "step": 14725 + }, + { + "epoch": 1.1673921646220815, + "grad_norm": 121.87728881835938, + "learning_rate": 2.0136131593874078e-07, + "loss": 3.9852, + "step": 14750 + }, + { + "epoch": 1.1693707954095767, + "grad_norm": 129.56564331054688, + "learning_rate": 2.0127887190175307e-07, + "loss": 3.6025, + "step": 14775 + }, + { + "epoch": 1.1713494261970716, + "grad_norm": 138.36422729492188, + "learning_rate": 2.011964278647654e-07, + "loss": 3.9298, + "step": 14800 + }, + { + "epoch": 1.1733280569845668, + "grad_norm": 115.63604736328125, + "learning_rate": 2.011139838277777e-07, + "loss": 3.811, + "step": 14825 + }, + { + "epoch": 1.1753066877720617, + "grad_norm": 128.9251708984375, + "learning_rate": 2.0103153979078998e-07, + "loss": 4.013, + "step": 14850 + }, + { + "epoch": 1.1772853185595569, + "grad_norm": 115.66397857666016, + "learning_rate": 2.0094909575380232e-07, + "loss": 3.9727, + "step": 14875 + }, + { + "epoch": 1.1792639493470518, + "grad_norm": 105.60704040527344, + "learning_rate": 2.008666517168146e-07, + "loss": 4.0072, + "step": 14900 + }, + { + "epoch": 1.181242580134547, + "grad_norm": 127.380859375, + "learning_rate": 2.0078420767982694e-07, + "loss": 3.763, + "step": 14925 + }, + { + "epoch": 1.183221210922042, + "grad_norm": 87.45874786376953, + "learning_rate": 2.0070176364283922e-07, + "loss": 3.6529, + "step": 14950 + }, + { + "epoch": 1.185199841709537, + "grad_norm": 257.6990966796875, + "learning_rate": 2.0061931960585154e-07, + "loss": 3.8049, + "step": 14975 + }, + { + "epoch": 1.187178472497032, + "grad_norm": 111.80567169189453, + "learning_rate": 2.0053687556886385e-07, + "loss": 3.9666, + "step": 15000 + }, + { + "epoch": 1.1891571032845272, + "grad_norm": 98.00049591064453, + "learning_rate": 2.0045443153187616e-07, + "loss": 3.773, + "step": 15025 + }, + { + "epoch": 1.1911357340720221, + "grad_norm": 117.19742584228516, + "learning_rate": 2.0037198749488844e-07, + "loss": 3.8228, + "step": 15050 + }, + { + "epoch": 1.1931143648595173, + "grad_norm": 99.34817504882812, + "learning_rate": 2.0028954345790078e-07, + "loss": 3.8045, + "step": 15075 + }, + { + "epoch": 1.1950929956470122, + "grad_norm": 193.61734008789062, + "learning_rate": 2.0020709942091307e-07, + "loss": 3.7231, + "step": 15100 + }, + { + "epoch": 1.1970716264345074, + "grad_norm": 110.1533203125, + "learning_rate": 2.001246553839254e-07, + "loss": 3.7013, + "step": 15125 + }, + { + "epoch": 1.1990502572220023, + "grad_norm": 81.41429901123047, + "learning_rate": 2.000422113469377e-07, + "loss": 3.5668, + "step": 15150 + }, + { + "epoch": 1.2010288880094975, + "grad_norm": 103.47528839111328, + "learning_rate": 1.9995976730994998e-07, + "loss": 3.6627, + "step": 15175 + }, + { + "epoch": 1.2030075187969924, + "grad_norm": 179.40489196777344, + "learning_rate": 1.9987732327296231e-07, + "loss": 3.8307, + "step": 15200 + }, + { + "epoch": 1.2049861495844876, + "grad_norm": 108.21305084228516, + "learning_rate": 1.997948792359746e-07, + "loss": 3.7619, + "step": 15225 + }, + { + "epoch": 1.2069647803719825, + "grad_norm": 206.6326904296875, + "learning_rate": 1.997124351989869e-07, + "loss": 3.7036, + "step": 15250 + }, + { + "epoch": 1.2089434111594777, + "grad_norm": 115.525390625, + "learning_rate": 1.9962999116199922e-07, + "loss": 3.823, + "step": 15275 + }, + { + "epoch": 1.2109220419469726, + "grad_norm": 124.59696197509766, + "learning_rate": 1.9954754712501153e-07, + "loss": 3.7983, + "step": 15300 + }, + { + "epoch": 1.2129006727344678, + "grad_norm": 107.92992401123047, + "learning_rate": 1.9946510308802385e-07, + "loss": 3.7677, + "step": 15325 + }, + { + "epoch": 1.2148793035219627, + "grad_norm": 164.7980194091797, + "learning_rate": 1.9938265905103616e-07, + "loss": 3.7692, + "step": 15350 + }, + { + "epoch": 1.2168579343094579, + "grad_norm": 95.48210906982422, + "learning_rate": 1.9930021501404844e-07, + "loss": 3.8568, + "step": 15375 + }, + { + "epoch": 1.2188365650969528, + "grad_norm": 172.18484497070312, + "learning_rate": 1.9921777097706078e-07, + "loss": 3.596, + "step": 15400 + }, + { + "epoch": 1.220815195884448, + "grad_norm": 146.89479064941406, + "learning_rate": 1.9913532694007307e-07, + "loss": 3.7992, + "step": 15425 + }, + { + "epoch": 1.222793826671943, + "grad_norm": 106.06961822509766, + "learning_rate": 1.9905288290308538e-07, + "loss": 3.5689, + "step": 15450 + }, + { + "epoch": 1.224772457459438, + "grad_norm": 124.81307220458984, + "learning_rate": 1.989704388660977e-07, + "loss": 3.8072, + "step": 15475 + }, + { + "epoch": 1.226751088246933, + "grad_norm": 144.33468627929688, + "learning_rate": 1.9888799482910997e-07, + "loss": 3.7889, + "step": 15500 + }, + { + "epoch": 1.2287297190344282, + "grad_norm": 108.416015625, + "learning_rate": 1.988055507921223e-07, + "loss": 3.7345, + "step": 15525 + }, + { + "epoch": 1.2307083498219233, + "grad_norm": 118.40252685546875, + "learning_rate": 1.987231067551346e-07, + "loss": 3.9306, + "step": 15550 + }, + { + "epoch": 1.2326869806094183, + "grad_norm": 121.7845230102539, + "learning_rate": 1.986406627181469e-07, + "loss": 3.6828, + "step": 15575 + }, + { + "epoch": 1.2346656113969132, + "grad_norm": 126.77731323242188, + "learning_rate": 1.9855821868115922e-07, + "loss": 3.8064, + "step": 15600 + }, + { + "epoch": 1.2366442421844084, + "grad_norm": 128.55198669433594, + "learning_rate": 1.9847577464417153e-07, + "loss": 3.952, + "step": 15625 + }, + { + "epoch": 1.2386228729719035, + "grad_norm": 99.97991180419922, + "learning_rate": 1.9839333060718382e-07, + "loss": 3.8026, + "step": 15650 + }, + { + "epoch": 1.2406015037593985, + "grad_norm": 109.74803924560547, + "learning_rate": 1.9831088657019615e-07, + "loss": 3.8499, + "step": 15675 + }, + { + "epoch": 1.2425801345468936, + "grad_norm": 125.961669921875, + "learning_rate": 1.9822844253320844e-07, + "loss": 3.8659, + "step": 15700 + }, + { + "epoch": 1.2445587653343886, + "grad_norm": 106.30962371826172, + "learning_rate": 1.9814599849622078e-07, + "loss": 3.7085, + "step": 15725 + }, + { + "epoch": 1.2465373961218837, + "grad_norm": 101.37333679199219, + "learning_rate": 1.9806355445923306e-07, + "loss": 3.6006, + "step": 15750 + }, + { + "epoch": 1.2485160269093787, + "grad_norm": 117.87637329101562, + "learning_rate": 1.9798111042224537e-07, + "loss": 3.7463, + "step": 15775 + }, + { + "epoch": 1.2504946576968738, + "grad_norm": 158.502685546875, + "learning_rate": 1.9789866638525769e-07, + "loss": 3.7699, + "step": 15800 + }, + { + "epoch": 1.2524732884843688, + "grad_norm": 213.62936401367188, + "learning_rate": 1.9781622234826997e-07, + "loss": 3.9892, + "step": 15825 + }, + { + "epoch": 1.254451919271864, + "grad_norm": 120.2110595703125, + "learning_rate": 1.9773377831128228e-07, + "loss": 3.6972, + "step": 15850 + }, + { + "epoch": 1.2564305500593589, + "grad_norm": 115.94564819335938, + "learning_rate": 1.976513342742946e-07, + "loss": 3.6659, + "step": 15875 + }, + { + "epoch": 1.258409180846854, + "grad_norm": 100.15674591064453, + "learning_rate": 1.975688902373069e-07, + "loss": 3.6585, + "step": 15900 + }, + { + "epoch": 1.260387811634349, + "grad_norm": 126.27003479003906, + "learning_rate": 1.9748644620031922e-07, + "loss": 3.793, + "step": 15925 + }, + { + "epoch": 1.2623664424218441, + "grad_norm": 122.43645477294922, + "learning_rate": 1.9740400216333153e-07, + "loss": 3.7562, + "step": 15950 + }, + { + "epoch": 1.264345073209339, + "grad_norm": 121.08145904541016, + "learning_rate": 1.9732155812634381e-07, + "loss": 3.759, + "step": 15975 + }, + { + "epoch": 1.2663237039968342, + "grad_norm": 116.81194305419922, + "learning_rate": 1.9723911408935615e-07, + "loss": 3.6303, + "step": 16000 + }, + { + "epoch": 1.2683023347843292, + "grad_norm": 107.8984146118164, + "learning_rate": 1.9715667005236844e-07, + "loss": 3.7525, + "step": 16025 + }, + { + "epoch": 1.2702809655718243, + "grad_norm": 114.40152740478516, + "learning_rate": 1.9707422601538075e-07, + "loss": 3.6044, + "step": 16050 + }, + { + "epoch": 1.2722595963593193, + "grad_norm": 128.336181640625, + "learning_rate": 1.9699178197839306e-07, + "loss": 3.6883, + "step": 16075 + }, + { + "epoch": 1.2742382271468145, + "grad_norm": 177.9373321533203, + "learning_rate": 1.9690933794140535e-07, + "loss": 3.7305, + "step": 16100 + }, + { + "epoch": 1.2762168579343094, + "grad_norm": 137.6163787841797, + "learning_rate": 1.9682689390441768e-07, + "loss": 3.4276, + "step": 16125 + }, + { + "epoch": 1.2781954887218046, + "grad_norm": 93.84378814697266, + "learning_rate": 1.9674444986742997e-07, + "loss": 3.954, + "step": 16150 + }, + { + "epoch": 1.2801741195092995, + "grad_norm": 114.7217788696289, + "learning_rate": 1.9666200583044228e-07, + "loss": 3.9032, + "step": 16175 + }, + { + "epoch": 1.2821527502967947, + "grad_norm": 127.64576721191406, + "learning_rate": 1.965795617934546e-07, + "loss": 3.9561, + "step": 16200 + }, + { + "epoch": 1.2841313810842896, + "grad_norm": 147.38531494140625, + "learning_rate": 1.964971177564669e-07, + "loss": 3.7639, + "step": 16225 + }, + { + "epoch": 1.2861100118717848, + "grad_norm": 106.52879333496094, + "learning_rate": 1.964146737194792e-07, + "loss": 3.6835, + "step": 16250 + }, + { + "epoch": 1.2880886426592797, + "grad_norm": 117.00553131103516, + "learning_rate": 1.9633222968249153e-07, + "loss": 3.5749, + "step": 16275 + }, + { + "epoch": 1.2900672734467749, + "grad_norm": 145.50201416015625, + "learning_rate": 1.962497856455038e-07, + "loss": 3.8452, + "step": 16300 + }, + { + "epoch": 1.29204590423427, + "grad_norm": 109.88056945800781, + "learning_rate": 1.9616734160851615e-07, + "loss": 3.7797, + "step": 16325 + }, + { + "epoch": 1.294024535021765, + "grad_norm": 98.50968170166016, + "learning_rate": 1.9608489757152844e-07, + "loss": 3.9423, + "step": 16350 + }, + { + "epoch": 1.29600316580926, + "grad_norm": 137.3463134765625, + "learning_rate": 1.9600245353454075e-07, + "loss": 3.8175, + "step": 16375 + }, + { + "epoch": 1.297981796596755, + "grad_norm": 104.44812774658203, + "learning_rate": 1.9592000949755306e-07, + "loss": 3.8284, + "step": 16400 + }, + { + "epoch": 1.2999604273842502, + "grad_norm": 134.79293823242188, + "learning_rate": 1.9583756546056534e-07, + "loss": 3.8295, + "step": 16425 + }, + { + "epoch": 1.3019390581717452, + "grad_norm": 136.05784606933594, + "learning_rate": 1.9575512142357766e-07, + "loss": 3.9577, + "step": 16450 + }, + { + "epoch": 1.30391768895924, + "grad_norm": 162.8216094970703, + "learning_rate": 1.9567267738658997e-07, + "loss": 3.9843, + "step": 16475 + }, + { + "epoch": 1.3058963197467353, + "grad_norm": 132.25741577148438, + "learning_rate": 1.9559023334960228e-07, + "loss": 3.8045, + "step": 16500 + }, + { + "epoch": 1.3078749505342304, + "grad_norm": 140.72642517089844, + "learning_rate": 1.955077893126146e-07, + "loss": 3.8217, + "step": 16525 + }, + { + "epoch": 1.3098535813217254, + "grad_norm": 119.49310302734375, + "learning_rate": 1.954253452756269e-07, + "loss": 3.6075, + "step": 16550 + }, + { + "epoch": 1.3118322121092203, + "grad_norm": 189.51280212402344, + "learning_rate": 1.953429012386392e-07, + "loss": 3.9516, + "step": 16575 + }, + { + "epoch": 1.3138108428967155, + "grad_norm": 106.88180541992188, + "learning_rate": 1.9526045720165153e-07, + "loss": 3.9774, + "step": 16600 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 136.42471313476562, + "learning_rate": 1.951780131646638e-07, + "loss": 3.7752, + "step": 16625 + }, + { + "epoch": 1.3177681044717056, + "grad_norm": 134.68948364257812, + "learning_rate": 1.9509556912767612e-07, + "loss": 3.6695, + "step": 16650 + }, + { + "epoch": 1.3197467352592005, + "grad_norm": 88.88003540039062, + "learning_rate": 1.9501312509068843e-07, + "loss": 3.9039, + "step": 16675 + }, + { + "epoch": 1.3217253660466957, + "grad_norm": 106.34395599365234, + "learning_rate": 1.9493068105370075e-07, + "loss": 3.6726, + "step": 16700 + }, + { + "epoch": 1.3237039968341908, + "grad_norm": 94.92339324951172, + "learning_rate": 1.9484823701671306e-07, + "loss": 3.7362, + "step": 16725 + }, + { + "epoch": 1.3256826276216858, + "grad_norm": 131.17221069335938, + "learning_rate": 1.9476579297972534e-07, + "loss": 3.6668, + "step": 16750 + }, + { + "epoch": 1.327661258409181, + "grad_norm": 115.50166320800781, + "learning_rate": 1.9468334894273765e-07, + "loss": 3.9881, + "step": 16775 + }, + { + "epoch": 1.3296398891966759, + "grad_norm": 146.72767639160156, + "learning_rate": 1.9460090490574997e-07, + "loss": 3.8375, + "step": 16800 + }, + { + "epoch": 1.331618519984171, + "grad_norm": 125.41651153564453, + "learning_rate": 1.9451846086876228e-07, + "loss": 3.7523, + "step": 16825 + }, + { + "epoch": 1.333597150771666, + "grad_norm": 148.9279327392578, + "learning_rate": 1.9443601683177456e-07, + "loss": 3.7209, + "step": 16850 + }, + { + "epoch": 1.3355757815591611, + "grad_norm": 132.95213317871094, + "learning_rate": 1.943535727947869e-07, + "loss": 3.8043, + "step": 16875 + }, + { + "epoch": 1.337554412346656, + "grad_norm": 121.32286834716797, + "learning_rate": 1.9427112875779919e-07, + "loss": 3.8463, + "step": 16900 + }, + { + "epoch": 1.3395330431341512, + "grad_norm": 104.13847351074219, + "learning_rate": 1.9418868472081152e-07, + "loss": 3.6028, + "step": 16925 + }, + { + "epoch": 1.3415116739216462, + "grad_norm": 116.32896423339844, + "learning_rate": 1.941062406838238e-07, + "loss": 3.6065, + "step": 16950 + }, + { + "epoch": 1.3434903047091413, + "grad_norm": 123.75446319580078, + "learning_rate": 1.9402379664683612e-07, + "loss": 3.8383, + "step": 16975 + }, + { + "epoch": 1.3454689354966363, + "grad_norm": 146.36181640625, + "learning_rate": 1.9394135260984843e-07, + "loss": 3.7784, + "step": 17000 + }, + { + "epoch": 1.3474475662841314, + "grad_norm": 153.75225830078125, + "learning_rate": 1.9385890857286074e-07, + "loss": 3.8691, + "step": 17025 + }, + { + "epoch": 1.3494261970716264, + "grad_norm": 144.79983520507812, + "learning_rate": 1.9377646453587303e-07, + "loss": 3.7902, + "step": 17050 + }, + { + "epoch": 1.3514048278591215, + "grad_norm": 107.72118377685547, + "learning_rate": 1.9369402049888534e-07, + "loss": 3.7424, + "step": 17075 + }, + { + "epoch": 1.3533834586466165, + "grad_norm": 107.33843994140625, + "learning_rate": 1.9361157646189765e-07, + "loss": 3.7423, + "step": 17100 + }, + { + "epoch": 1.3553620894341116, + "grad_norm": 154.79566955566406, + "learning_rate": 1.9352913242490996e-07, + "loss": 3.5984, + "step": 17125 + }, + { + "epoch": 1.3573407202216066, + "grad_norm": 118.29646301269531, + "learning_rate": 1.9344668838792227e-07, + "loss": 3.6184, + "step": 17150 + }, + { + "epoch": 1.3593193510091017, + "grad_norm": 194.39981079101562, + "learning_rate": 1.9336424435093456e-07, + "loss": 3.7566, + "step": 17175 + }, + { + "epoch": 1.361297981796597, + "grad_norm": 126.4017333984375, + "learning_rate": 1.932818003139469e-07, + "loss": 3.7378, + "step": 17200 + }, + { + "epoch": 1.3632766125840918, + "grad_norm": 154.46499633789062, + "learning_rate": 1.9319935627695918e-07, + "loss": 3.575, + "step": 17225 + }, + { + "epoch": 1.3652552433715868, + "grad_norm": 121.02112579345703, + "learning_rate": 1.9311691223997152e-07, + "loss": 3.5395, + "step": 17250 + }, + { + "epoch": 1.367233874159082, + "grad_norm": 112.81676483154297, + "learning_rate": 1.930344682029838e-07, + "loss": 3.7668, + "step": 17275 + }, + { + "epoch": 1.369212504946577, + "grad_norm": 128.57958984375, + "learning_rate": 1.9295202416599612e-07, + "loss": 3.6508, + "step": 17300 + }, + { + "epoch": 1.371191135734072, + "grad_norm": 100.09276580810547, + "learning_rate": 1.9286958012900843e-07, + "loss": 3.568, + "step": 17325 + }, + { + "epoch": 1.373169766521567, + "grad_norm": 102.0973892211914, + "learning_rate": 1.9278713609202074e-07, + "loss": 3.6707, + "step": 17350 + }, + { + "epoch": 1.3751483973090621, + "grad_norm": 180.7505645751953, + "learning_rate": 1.9270469205503303e-07, + "loss": 3.6053, + "step": 17375 + }, + { + "epoch": 1.3771270280965573, + "grad_norm": 141.00286865234375, + "learning_rate": 1.9262224801804534e-07, + "loss": 3.802, + "step": 17400 + }, + { + "epoch": 1.3791056588840522, + "grad_norm": 105.14612579345703, + "learning_rate": 1.9253980398105765e-07, + "loss": 3.8125, + "step": 17425 + }, + { + "epoch": 1.3810842896715472, + "grad_norm": 134.7199249267578, + "learning_rate": 1.9245735994406993e-07, + "loss": 3.6837, + "step": 17450 + }, + { + "epoch": 1.3830629204590423, + "grad_norm": 161.79367065429688, + "learning_rate": 1.9237491590708227e-07, + "loss": 3.4173, + "step": 17475 + }, + { + "epoch": 1.3850415512465375, + "grad_norm": 90.2686996459961, + "learning_rate": 1.9229247187009456e-07, + "loss": 3.7035, + "step": 17500 + }, + { + "epoch": 1.3870201820340324, + "grad_norm": 138.9261016845703, + "learning_rate": 1.922100278331069e-07, + "loss": 3.7375, + "step": 17525 + }, + { + "epoch": 1.3889988128215274, + "grad_norm": 99.18257141113281, + "learning_rate": 1.9212758379611918e-07, + "loss": 3.7228, + "step": 17550 + }, + { + "epoch": 1.3909774436090225, + "grad_norm": 108.61868286132812, + "learning_rate": 1.920451397591315e-07, + "loss": 3.8505, + "step": 17575 + }, + { + "epoch": 1.3929560743965177, + "grad_norm": 89.02510070800781, + "learning_rate": 1.919626957221438e-07, + "loss": 3.7366, + "step": 17600 + }, + { + "epoch": 1.3949347051840126, + "grad_norm": 183.8499755859375, + "learning_rate": 1.9188025168515612e-07, + "loss": 3.7842, + "step": 17625 + }, + { + "epoch": 1.3969133359715078, + "grad_norm": 111.4132080078125, + "learning_rate": 1.917978076481684e-07, + "loss": 4.0444, + "step": 17650 + }, + { + "epoch": 1.3988919667590027, + "grad_norm": 114.52892303466797, + "learning_rate": 1.917153636111807e-07, + "loss": 3.6568, + "step": 17675 + }, + { + "epoch": 1.400870597546498, + "grad_norm": 108.65158081054688, + "learning_rate": 1.9163291957419302e-07, + "loss": 3.7598, + "step": 17700 + }, + { + "epoch": 1.4028492283339928, + "grad_norm": 118.24126434326172, + "learning_rate": 1.9155047553720534e-07, + "loss": 3.5768, + "step": 17725 + }, + { + "epoch": 1.404827859121488, + "grad_norm": 116.43769073486328, + "learning_rate": 1.9146803150021765e-07, + "loss": 3.6195, + "step": 17750 + }, + { + "epoch": 1.406806489908983, + "grad_norm": 96.20083618164062, + "learning_rate": 1.9138558746322993e-07, + "loss": 3.7732, + "step": 17775 + }, + { + "epoch": 1.408785120696478, + "grad_norm": 106.93498229980469, + "learning_rate": 1.9130314342624227e-07, + "loss": 3.5733, + "step": 17800 + }, + { + "epoch": 1.410763751483973, + "grad_norm": 126.78841400146484, + "learning_rate": 1.9122069938925456e-07, + "loss": 3.4865, + "step": 17825 + }, + { + "epoch": 1.4127423822714682, + "grad_norm": 161.895263671875, + "learning_rate": 1.911382553522669e-07, + "loss": 3.7985, + "step": 17850 + }, + { + "epoch": 1.4147210130589631, + "grad_norm": 137.6743927001953, + "learning_rate": 1.9105581131527918e-07, + "loss": 3.5315, + "step": 17875 + }, + { + "epoch": 1.4166996438464583, + "grad_norm": 81.24407196044922, + "learning_rate": 1.909733672782915e-07, + "loss": 3.6007, + "step": 17900 + }, + { + "epoch": 1.4186782746339532, + "grad_norm": 109.42854309082031, + "learning_rate": 1.908909232413038e-07, + "loss": 3.5264, + "step": 17925 + }, + { + "epoch": 1.4206569054214484, + "grad_norm": 113.56609344482422, + "learning_rate": 1.9080847920431611e-07, + "loss": 3.7304, + "step": 17950 + }, + { + "epoch": 1.4226355362089433, + "grad_norm": 148.14793395996094, + "learning_rate": 1.907260351673284e-07, + "loss": 3.6923, + "step": 17975 + }, + { + "epoch": 1.4246141669964385, + "grad_norm": 141.70896911621094, + "learning_rate": 1.906435911303407e-07, + "loss": 3.813, + "step": 18000 + }, + { + "epoch": 1.4265927977839334, + "grad_norm": 107.8005142211914, + "learning_rate": 1.9056114709335302e-07, + "loss": 3.8025, + "step": 18025 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 119.71678924560547, + "learning_rate": 1.904787030563653e-07, + "loss": 3.6632, + "step": 18050 + }, + { + "epoch": 1.4305500593589235, + "grad_norm": 89.46141815185547, + "learning_rate": 1.9039625901937765e-07, + "loss": 3.5954, + "step": 18075 + }, + { + "epoch": 1.4325286901464187, + "grad_norm": 94.6341552734375, + "learning_rate": 1.9031381498238993e-07, + "loss": 3.7922, + "step": 18100 + }, + { + "epoch": 1.4345073209339136, + "grad_norm": 119.52442932128906, + "learning_rate": 1.9023137094540227e-07, + "loss": 3.8677, + "step": 18125 + }, + { + "epoch": 1.4364859517214088, + "grad_norm": 85.59278869628906, + "learning_rate": 1.9014892690841455e-07, + "loss": 3.7679, + "step": 18150 + }, + { + "epoch": 1.438464582508904, + "grad_norm": 140.89190673828125, + "learning_rate": 1.9006648287142687e-07, + "loss": 3.7617, + "step": 18175 + }, + { + "epoch": 1.440443213296399, + "grad_norm": 114.29212188720703, + "learning_rate": 1.8998403883443918e-07, + "loss": 3.9376, + "step": 18200 + }, + { + "epoch": 1.4424218440838938, + "grad_norm": 118.00497436523438, + "learning_rate": 1.899015947974515e-07, + "loss": 3.7841, + "step": 18225 + }, + { + "epoch": 1.444400474871389, + "grad_norm": 140.94200134277344, + "learning_rate": 1.8981915076046377e-07, + "loss": 3.694, + "step": 18250 + }, + { + "epoch": 1.4463791056588842, + "grad_norm": 153.66397094726562, + "learning_rate": 1.897367067234761e-07, + "loss": 3.7603, + "step": 18275 + }, + { + "epoch": 1.448357736446379, + "grad_norm": 128.06732177734375, + "learning_rate": 1.896542626864884e-07, + "loss": 3.7799, + "step": 18300 + }, + { + "epoch": 1.450336367233874, + "grad_norm": 112.23291778564453, + "learning_rate": 1.895718186495007e-07, + "loss": 3.7655, + "step": 18325 + }, + { + "epoch": 1.4523149980213692, + "grad_norm": 91.07350158691406, + "learning_rate": 1.8948937461251302e-07, + "loss": 3.643, + "step": 18350 + }, + { + "epoch": 1.4542936288088644, + "grad_norm": 98.65179443359375, + "learning_rate": 1.894069305755253e-07, + "loss": 3.7346, + "step": 18375 + }, + { + "epoch": 1.4562722595963593, + "grad_norm": 118.8671875, + "learning_rate": 1.8932448653853764e-07, + "loss": 3.7361, + "step": 18400 + }, + { + "epoch": 1.4582508903838542, + "grad_norm": 121.17200469970703, + "learning_rate": 1.8924204250154993e-07, + "loss": 3.7823, + "step": 18425 + }, + { + "epoch": 1.4602295211713494, + "grad_norm": 91.53225708007812, + "learning_rate": 1.8915959846456227e-07, + "loss": 3.659, + "step": 18450 + }, + { + "epoch": 1.4622081519588446, + "grad_norm": 140.78236389160156, + "learning_rate": 1.8907715442757455e-07, + "loss": 3.8495, + "step": 18475 + }, + { + "epoch": 1.4641867827463395, + "grad_norm": 136.43206787109375, + "learning_rate": 1.8899471039058686e-07, + "loss": 3.8468, + "step": 18500 + }, + { + "epoch": 1.4661654135338344, + "grad_norm": 106.83531951904297, + "learning_rate": 1.8891226635359917e-07, + "loss": 3.6799, + "step": 18525 + }, + { + "epoch": 1.4681440443213296, + "grad_norm": 132.8029327392578, + "learning_rate": 1.8882982231661149e-07, + "loss": 3.7441, + "step": 18550 + }, + { + "epoch": 1.4701226751088248, + "grad_norm": 125.74601745605469, + "learning_rate": 1.8874737827962377e-07, + "loss": 3.586, + "step": 18575 + }, + { + "epoch": 1.4721013058963197, + "grad_norm": 126.61151123046875, + "learning_rate": 1.886649342426361e-07, + "loss": 3.7686, + "step": 18600 + }, + { + "epoch": 1.4740799366838149, + "grad_norm": 127.62877655029297, + "learning_rate": 1.885824902056484e-07, + "loss": 3.74, + "step": 18625 + }, + { + "epoch": 1.4760585674713098, + "grad_norm": 134.3148193359375, + "learning_rate": 1.8850004616866068e-07, + "loss": 3.8815, + "step": 18650 + }, + { + "epoch": 1.478037198258805, + "grad_norm": 167.0948028564453, + "learning_rate": 1.8841760213167302e-07, + "loss": 3.7258, + "step": 18675 + }, + { + "epoch": 1.4800158290463, + "grad_norm": 103.17122650146484, + "learning_rate": 1.883351580946853e-07, + "loss": 3.7211, + "step": 18700 + }, + { + "epoch": 1.481994459833795, + "grad_norm": 98.9283218383789, + "learning_rate": 1.8825271405769764e-07, + "loss": 3.6401, + "step": 18725 + }, + { + "epoch": 1.48397309062129, + "grad_norm": 110.53363800048828, + "learning_rate": 1.8817027002070993e-07, + "loss": 3.84, + "step": 18750 + }, + { + "epoch": 1.4859517214087852, + "grad_norm": 115.60739135742188, + "learning_rate": 1.8808782598372224e-07, + "loss": 3.9043, + "step": 18775 + }, + { + "epoch": 1.4879303521962801, + "grad_norm": 130.4183807373047, + "learning_rate": 1.8800538194673455e-07, + "loss": 3.6778, + "step": 18800 + }, + { + "epoch": 1.4899089829837753, + "grad_norm": 94.30120086669922, + "learning_rate": 1.8792293790974686e-07, + "loss": 3.5853, + "step": 18825 + }, + { + "epoch": 1.4918876137712702, + "grad_norm": 102.75646209716797, + "learning_rate": 1.8784049387275917e-07, + "loss": 4.0326, + "step": 18850 + }, + { + "epoch": 1.4938662445587654, + "grad_norm": 153.1327362060547, + "learning_rate": 1.8775804983577148e-07, + "loss": 3.8785, + "step": 18875 + }, + { + "epoch": 1.4958448753462603, + "grad_norm": 100.03012084960938, + "learning_rate": 1.8767560579878377e-07, + "loss": 3.692, + "step": 18900 + }, + { + "epoch": 1.4978235061337555, + "grad_norm": 128.64657592773438, + "learning_rate": 1.8759316176179608e-07, + "loss": 3.7799, + "step": 18925 + }, + { + "epoch": 1.4998021369212504, + "grad_norm": 99.55155944824219, + "learning_rate": 1.875107177248084e-07, + "loss": 3.7112, + "step": 18950 + }, + { + "epoch": 1.5017807677087456, + "grad_norm": 171.94178771972656, + "learning_rate": 1.8742827368782068e-07, + "loss": 3.7663, + "step": 18975 + }, + { + "epoch": 1.5037593984962405, + "grad_norm": 107.23114013671875, + "learning_rate": 1.8734582965083302e-07, + "loss": 3.6127, + "step": 19000 + }, + { + "epoch": 1.5057380292837357, + "grad_norm": 101.01669311523438, + "learning_rate": 1.872633856138453e-07, + "loss": 3.5948, + "step": 19025 + }, + { + "epoch": 1.5077166600712308, + "grad_norm": 158.38211059570312, + "learning_rate": 1.8718094157685764e-07, + "loss": 3.5552, + "step": 19050 + }, + { + "epoch": 1.5096952908587258, + "grad_norm": 117.05270385742188, + "learning_rate": 1.8709849753986992e-07, + "loss": 3.7912, + "step": 19075 + }, + { + "epoch": 1.5116739216462207, + "grad_norm": 117.87310028076172, + "learning_rate": 1.8701605350288224e-07, + "loss": 3.7512, + "step": 19100 + }, + { + "epoch": 1.5136525524337159, + "grad_norm": 106.46678161621094, + "learning_rate": 1.8693360946589455e-07, + "loss": 3.7715, + "step": 19125 + }, + { + "epoch": 1.515631183221211, + "grad_norm": 107.43925476074219, + "learning_rate": 1.8685116542890686e-07, + "loss": 3.7395, + "step": 19150 + }, + { + "epoch": 1.517609814008706, + "grad_norm": 101.0313491821289, + "learning_rate": 1.8676872139191914e-07, + "loss": 3.6996, + "step": 19175 + }, + { + "epoch": 1.519588444796201, + "grad_norm": 179.96051025390625, + "learning_rate": 1.8668627735493148e-07, + "loss": 3.6133, + "step": 19200 + }, + { + "epoch": 1.521567075583696, + "grad_norm": 90.7691879272461, + "learning_rate": 1.8660383331794377e-07, + "loss": 3.6053, + "step": 19225 + }, + { + "epoch": 1.5235457063711912, + "grad_norm": 137.12637329101562, + "learning_rate": 1.8652138928095605e-07, + "loss": 3.6739, + "step": 19250 + }, + { + "epoch": 1.5255243371586862, + "grad_norm": 138.26014709472656, + "learning_rate": 1.864389452439684e-07, + "loss": 3.8328, + "step": 19275 + }, + { + "epoch": 1.5275029679461811, + "grad_norm": 94.83419799804688, + "learning_rate": 1.8635650120698068e-07, + "loss": 3.7555, + "step": 19300 + }, + { + "epoch": 1.5294815987336763, + "grad_norm": 105.08883666992188, + "learning_rate": 1.8627405716999301e-07, + "loss": 3.8474, + "step": 19325 + }, + { + "epoch": 1.5314602295211714, + "grad_norm": 124.04432678222656, + "learning_rate": 1.861916131330053e-07, + "loss": 3.5361, + "step": 19350 + }, + { + "epoch": 1.5334388603086664, + "grad_norm": 97.83289337158203, + "learning_rate": 1.861091690960176e-07, + "loss": 3.6283, + "step": 19375 + }, + { + "epoch": 1.5354174910961613, + "grad_norm": 110.26960754394531, + "learning_rate": 1.8602672505902992e-07, + "loss": 3.7872, + "step": 19400 + }, + { + "epoch": 1.5373961218836565, + "grad_norm": 115.99392700195312, + "learning_rate": 1.8594428102204223e-07, + "loss": 3.6191, + "step": 19425 + }, + { + "epoch": 1.5393747526711516, + "grad_norm": 105.29808044433594, + "learning_rate": 1.8586183698505454e-07, + "loss": 3.6837, + "step": 19450 + }, + { + "epoch": 1.5413533834586466, + "grad_norm": 100.55680847167969, + "learning_rate": 1.8577939294806686e-07, + "loss": 3.7439, + "step": 19475 + }, + { + "epoch": 1.5433320142461415, + "grad_norm": 117.83526611328125, + "learning_rate": 1.8569694891107914e-07, + "loss": 3.5938, + "step": 19500 + }, + { + "epoch": 1.5453106450336367, + "grad_norm": 84.95865631103516, + "learning_rate": 1.8561450487409148e-07, + "loss": 3.7425, + "step": 19525 + }, + { + "epoch": 1.5472892758211318, + "grad_norm": 128.07754516601562, + "learning_rate": 1.8553206083710376e-07, + "loss": 3.7379, + "step": 19550 + }, + { + "epoch": 1.5492679066086268, + "grad_norm": 103.81727600097656, + "learning_rate": 1.8544961680011605e-07, + "loss": 3.8121, + "step": 19575 + }, + { + "epoch": 1.5512465373961217, + "grad_norm": 113.71387481689453, + "learning_rate": 1.853671727631284e-07, + "loss": 3.5854, + "step": 19600 + }, + { + "epoch": 1.5532251681836169, + "grad_norm": 106.94216918945312, + "learning_rate": 1.8528472872614067e-07, + "loss": 3.7723, + "step": 19625 + }, + { + "epoch": 1.555203798971112, + "grad_norm": 113.77094268798828, + "learning_rate": 1.85202284689153e-07, + "loss": 3.7218, + "step": 19650 + }, + { + "epoch": 1.557182429758607, + "grad_norm": 102.69156646728516, + "learning_rate": 1.851198406521653e-07, + "loss": 3.5478, + "step": 19675 + }, + { + "epoch": 1.5591610605461022, + "grad_norm": 146.14686584472656, + "learning_rate": 1.850373966151776e-07, + "loss": 3.5888, + "step": 19700 + }, + { + "epoch": 1.5611396913335973, + "grad_norm": 117.55293273925781, + "learning_rate": 1.8495495257818992e-07, + "loss": 3.5802, + "step": 19725 + }, + { + "epoch": 1.5631183221210923, + "grad_norm": 116.557861328125, + "learning_rate": 1.8487250854120223e-07, + "loss": 3.6588, + "step": 19750 + }, + { + "epoch": 1.5650969529085872, + "grad_norm": 143.797607421875, + "learning_rate": 1.8479006450421452e-07, + "loss": 3.7888, + "step": 19775 + }, + { + "epoch": 1.5670755836960824, + "grad_norm": 90.89989471435547, + "learning_rate": 1.8470762046722685e-07, + "loss": 3.7565, + "step": 19800 + }, + { + "epoch": 1.5690542144835775, + "grad_norm": 93.96920776367188, + "learning_rate": 1.8462517643023914e-07, + "loss": 3.7576, + "step": 19825 + }, + { + "epoch": 1.5710328452710725, + "grad_norm": 146.9014129638672, + "learning_rate": 1.8454273239325148e-07, + "loss": 3.7507, + "step": 19850 + }, + { + "epoch": 1.5730114760585674, + "grad_norm": 119.38358306884766, + "learning_rate": 1.8446028835626376e-07, + "loss": 3.6588, + "step": 19875 + }, + { + "epoch": 1.5749901068460626, + "grad_norm": 113.38998413085938, + "learning_rate": 1.8437784431927605e-07, + "loss": 3.7866, + "step": 19900 + }, + { + "epoch": 1.5769687376335577, + "grad_norm": 103.051513671875, + "learning_rate": 1.8429540028228839e-07, + "loss": 3.6443, + "step": 19925 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 94.77304077148438, + "learning_rate": 1.8421295624530067e-07, + "loss": 3.6966, + "step": 19950 + }, + { + "epoch": 1.5809259992085476, + "grad_norm": 167.68310546875, + "learning_rate": 1.8413051220831298e-07, + "loss": 3.684, + "step": 19975 + }, + { + "epoch": 1.5829046299960428, + "grad_norm": 96.78482055664062, + "learning_rate": 1.840480681713253e-07, + "loss": 3.733, + "step": 20000 + }, + { + "epoch": 1.5829046299960428, + "eval_loss": 3.6955573558807373, + "eval_runtime": 9.5434, + "eval_samples_per_second": 264.894, + "eval_steps_per_second": 33.112, + "step": 20000 + } + ], + "logging_steps": 25, + "max_steps": 75810, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 71534592000000.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}