{ "best_metric": 3.8681728839874268, "best_model_checkpoint": "checkpoints/test_1M_1-2025-02-12-12-32/checkpoint-10000", "epoch": 0.7914523149980214, "eval_steps": 10000, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0019786307874950534, "grad_norm": 254.82342529296875, "learning_rate": 2.499208537244918e-07, "loss": 5.7705, "step": 25 }, { "epoch": 0.003957261574990107, "grad_norm": 153.19989013671875, "learning_rate": 2.498384096875041e-07, "loss": 5.6747, "step": 50 }, { "epoch": 0.00593589236248516, "grad_norm": 224.5292510986328, "learning_rate": 2.4975596565051644e-07, "loss": 5.6201, "step": 75 }, { "epoch": 0.007914523149980214, "grad_norm": 175.854248046875, "learning_rate": 2.4967352161352873e-07, "loss": 5.6974, "step": 100 }, { "epoch": 0.009893153937475268, "grad_norm": 163.52769470214844, "learning_rate": 2.49591077576541e-07, "loss": 5.5417, "step": 125 }, { "epoch": 0.01187178472497032, "grad_norm": 254.2264862060547, "learning_rate": 2.4950863353955335e-07, "loss": 5.8201, "step": 150 }, { "epoch": 0.013850415512465374, "grad_norm": 175.30279541015625, "learning_rate": 2.4942618950256564e-07, "loss": 5.5302, "step": 175 }, { "epoch": 0.015829046299960427, "grad_norm": 300.1286315917969, "learning_rate": 2.4934374546557797e-07, "loss": 5.6572, "step": 200 }, { "epoch": 0.01780767708745548, "grad_norm": 201.56961059570312, "learning_rate": 2.4926130142859026e-07, "loss": 5.2914, "step": 225 }, { "epoch": 0.019786307874950535, "grad_norm": 245.64854431152344, "learning_rate": 2.491788573916026e-07, "loss": 5.4478, "step": 250 }, { "epoch": 0.02176493866244559, "grad_norm": 239.78257751464844, "learning_rate": 2.490964133546149e-07, "loss": 5.4161, "step": 275 }, { "epoch": 0.02374356944994064, "grad_norm": 150.18310546875, "learning_rate": 2.4901396931762717e-07, "loss": 5.4978, "step": 300 }, { "epoch": 0.025722200237435693, "grad_norm": 172.03607177734375, "learning_rate": 2.489315252806395e-07, "loss": 5.4105, "step": 325 }, { "epoch": 0.027700831024930747, "grad_norm": 343.2570495605469, "learning_rate": 2.488490812436518e-07, "loss": 5.5195, "step": 350 }, { "epoch": 0.0296794618124258, "grad_norm": 329.7228698730469, "learning_rate": 2.4876663720666413e-07, "loss": 5.4494, "step": 375 }, { "epoch": 0.031658092599920855, "grad_norm": 174.63136291503906, "learning_rate": 2.486841931696764e-07, "loss": 5.2864, "step": 400 }, { "epoch": 0.033636723387415905, "grad_norm": 356.6216125488281, "learning_rate": 2.486017491326887e-07, "loss": 5.4376, "step": 425 }, { "epoch": 0.03561535417491096, "grad_norm": 166.16783142089844, "learning_rate": 2.4851930509570104e-07, "loss": 5.3141, "step": 450 }, { "epoch": 0.03759398496240601, "grad_norm": 220.06170654296875, "learning_rate": 2.484368610587133e-07, "loss": 5.5457, "step": 475 }, { "epoch": 0.03957261574990107, "grad_norm": 154.55517578125, "learning_rate": 2.483544170217256e-07, "loss": 5.1264, "step": 500 }, { "epoch": 0.04155124653739612, "grad_norm": 184.18443298339844, "learning_rate": 2.4827197298473794e-07, "loss": 5.3702, "step": 525 }, { "epoch": 0.04352987732489118, "grad_norm": 128.84693908691406, "learning_rate": 2.4818952894775023e-07, "loss": 5.0207, "step": 550 }, { "epoch": 0.04550850811238623, "grad_norm": 196.2894287109375, "learning_rate": 2.4810708491076257e-07, "loss": 5.315, "step": 575 }, { "epoch": 0.04748713889988128, "grad_norm": 200.00257873535156, "learning_rate": 2.4802464087377485e-07, "loss": 5.2215, "step": 600 }, { "epoch": 0.049465769687376336, "grad_norm": 271.8963928222656, "learning_rate": 2.479421968367872e-07, "loss": 5.3286, "step": 625 }, { "epoch": 0.051444400474871387, "grad_norm": 181.56686401367188, "learning_rate": 2.478597527997995e-07, "loss": 4.9967, "step": 650 }, { "epoch": 0.053423031262366444, "grad_norm": 242.8925323486328, "learning_rate": 2.477773087628118e-07, "loss": 5.1984, "step": 675 }, { "epoch": 0.055401662049861494, "grad_norm": 210.05746459960938, "learning_rate": 2.476948647258241e-07, "loss": 5.0975, "step": 700 }, { "epoch": 0.05738029283735655, "grad_norm": 181.1220245361328, "learning_rate": 2.476124206888364e-07, "loss": 5.0036, "step": 725 }, { "epoch": 0.0593589236248516, "grad_norm": 166.00709533691406, "learning_rate": 2.475299766518487e-07, "loss": 5.3082, "step": 750 }, { "epoch": 0.06133755441234666, "grad_norm": 151.4649200439453, "learning_rate": 2.47447532614861e-07, "loss": 5.1391, "step": 775 }, { "epoch": 0.06331618519984171, "grad_norm": 149.88165283203125, "learning_rate": 2.4736508857787335e-07, "loss": 5.0783, "step": 800 }, { "epoch": 0.06529481598733676, "grad_norm": 172.47061157226562, "learning_rate": 2.4728264454088563e-07, "loss": 4.9624, "step": 825 }, { "epoch": 0.06727344677483181, "grad_norm": 298.1490478515625, "learning_rate": 2.4720020050389797e-07, "loss": 5.1937, "step": 850 }, { "epoch": 0.06925207756232687, "grad_norm": 164.37867736816406, "learning_rate": 2.4711775646691025e-07, "loss": 5.1792, "step": 875 }, { "epoch": 0.07123070834982193, "grad_norm": 216.8033905029297, "learning_rate": 2.4703531242992254e-07, "loss": 5.152, "step": 900 }, { "epoch": 0.07320933913731698, "grad_norm": 211.95762634277344, "learning_rate": 2.469528683929349e-07, "loss": 4.9146, "step": 925 }, { "epoch": 0.07518796992481203, "grad_norm": 257.61968994140625, "learning_rate": 2.4687042435594716e-07, "loss": 5.095, "step": 950 }, { "epoch": 0.07716660071230709, "grad_norm": 179.43719482421875, "learning_rate": 2.467879803189595e-07, "loss": 5.0316, "step": 975 }, { "epoch": 0.07914523149980214, "grad_norm": 180.3157958984375, "learning_rate": 2.467055362819718e-07, "loss": 4.9441, "step": 1000 }, { "epoch": 0.08112386228729719, "grad_norm": 162.77447509765625, "learning_rate": 2.4662309224498407e-07, "loss": 4.9724, "step": 1025 }, { "epoch": 0.08310249307479224, "grad_norm": 123.65939331054688, "learning_rate": 2.465406482079964e-07, "loss": 5.2271, "step": 1050 }, { "epoch": 0.08508112386228729, "grad_norm": 163.114990234375, "learning_rate": 2.464582041710087e-07, "loss": 4.9724, "step": 1075 }, { "epoch": 0.08705975464978236, "grad_norm": 204.76400756835938, "learning_rate": 2.46375760134021e-07, "loss": 4.8724, "step": 1100 }, { "epoch": 0.0890383854372774, "grad_norm": 307.8963623046875, "learning_rate": 2.462933160970333e-07, "loss": 4.9256, "step": 1125 }, { "epoch": 0.09101701622477246, "grad_norm": 133.03707885742188, "learning_rate": 2.462108720600456e-07, "loss": 4.8792, "step": 1150 }, { "epoch": 0.09299564701226751, "grad_norm": 161.41697692871094, "learning_rate": 2.4612842802305794e-07, "loss": 5.054, "step": 1175 }, { "epoch": 0.09497427779976256, "grad_norm": 135.36228942871094, "learning_rate": 2.460459839860702e-07, "loss": 4.8655, "step": 1200 }, { "epoch": 0.09695290858725762, "grad_norm": 179.60646057128906, "learning_rate": 2.4596353994908256e-07, "loss": 4.7832, "step": 1225 }, { "epoch": 0.09893153937475267, "grad_norm": 335.71380615234375, "learning_rate": 2.4588109591209485e-07, "loss": 4.9979, "step": 1250 }, { "epoch": 0.10091017016224772, "grad_norm": 149.5147247314453, "learning_rate": 2.457986518751072e-07, "loss": 4.714, "step": 1275 }, { "epoch": 0.10288880094974277, "grad_norm": 154.0236358642578, "learning_rate": 2.4571620783811947e-07, "loss": 4.8015, "step": 1300 }, { "epoch": 0.10486743173723784, "grad_norm": 450.5319519042969, "learning_rate": 2.456337638011318e-07, "loss": 4.6914, "step": 1325 }, { "epoch": 0.10684606252473289, "grad_norm": 195.87863159179688, "learning_rate": 2.455513197641441e-07, "loss": 5.0124, "step": 1350 }, { "epoch": 0.10882469331222794, "grad_norm": 198.12225341796875, "learning_rate": 2.454688757271564e-07, "loss": 4.5305, "step": 1375 }, { "epoch": 0.11080332409972299, "grad_norm": 161.57623291015625, "learning_rate": 2.453864316901687e-07, "loss": 4.7806, "step": 1400 }, { "epoch": 0.11278195488721804, "grad_norm": 187.8081817626953, "learning_rate": 2.45303987653181e-07, "loss": 4.9401, "step": 1425 }, { "epoch": 0.1147605856747131, "grad_norm": 160.1893768310547, "learning_rate": 2.4522154361619334e-07, "loss": 4.8119, "step": 1450 }, { "epoch": 0.11673921646220815, "grad_norm": 181.8563995361328, "learning_rate": 2.4513909957920563e-07, "loss": 4.7979, "step": 1475 }, { "epoch": 0.1187178472497032, "grad_norm": 184.80641174316406, "learning_rate": 2.4505665554221796e-07, "loss": 4.8448, "step": 1500 }, { "epoch": 0.12069647803719825, "grad_norm": 151.4502410888672, "learning_rate": 2.4497421150523025e-07, "loss": 4.7101, "step": 1525 }, { "epoch": 0.12267510882469332, "grad_norm": 163.2119598388672, "learning_rate": 2.4489176746824253e-07, "loss": 4.8802, "step": 1550 }, { "epoch": 0.12465373961218837, "grad_norm": 147.33741760253906, "learning_rate": 2.4480932343125487e-07, "loss": 4.6433, "step": 1575 }, { "epoch": 0.12663237039968342, "grad_norm": 145.84716796875, "learning_rate": 2.4472687939426716e-07, "loss": 4.4118, "step": 1600 }, { "epoch": 0.12861100118717847, "grad_norm": 111.55641174316406, "learning_rate": 2.4464443535727944e-07, "loss": 4.819, "step": 1625 }, { "epoch": 0.13058963197467352, "grad_norm": 145.68092346191406, "learning_rate": 2.445619913202918e-07, "loss": 4.7752, "step": 1650 }, { "epoch": 0.13256826276216857, "grad_norm": 274.0830078125, "learning_rate": 2.4447954728330407e-07, "loss": 4.8566, "step": 1675 }, { "epoch": 0.13454689354966362, "grad_norm": 141.83982849121094, "learning_rate": 2.4439710324631635e-07, "loss": 4.6643, "step": 1700 }, { "epoch": 0.1365255243371587, "grad_norm": 182.46160888671875, "learning_rate": 2.443146592093287e-07, "loss": 4.731, "step": 1725 }, { "epoch": 0.13850415512465375, "grad_norm": 200.28773498535156, "learning_rate": 2.44232215172341e-07, "loss": 4.5525, "step": 1750 }, { "epoch": 0.1404827859121488, "grad_norm": 163.7792510986328, "learning_rate": 2.441497711353533e-07, "loss": 4.8076, "step": 1775 }, { "epoch": 0.14246141669964385, "grad_norm": 422.9642639160156, "learning_rate": 2.440673270983656e-07, "loss": 4.7045, "step": 1800 }, { "epoch": 0.1444400474871389, "grad_norm": 187.99957275390625, "learning_rate": 2.4398488306137794e-07, "loss": 4.6615, "step": 1825 }, { "epoch": 0.14641867827463395, "grad_norm": 144.52732849121094, "learning_rate": 2.439024390243902e-07, "loss": 4.7912, "step": 1850 }, { "epoch": 0.148397309062129, "grad_norm": 192.0771026611328, "learning_rate": 2.4381999498740256e-07, "loss": 4.7916, "step": 1875 }, { "epoch": 0.15037593984962405, "grad_norm": 148.06878662109375, "learning_rate": 2.4373755095041484e-07, "loss": 4.7782, "step": 1900 }, { "epoch": 0.1523545706371191, "grad_norm": 131.4456329345703, "learning_rate": 2.436551069134272e-07, "loss": 4.579, "step": 1925 }, { "epoch": 0.15433320142461418, "grad_norm": 141.84681701660156, "learning_rate": 2.4357266287643947e-07, "loss": 4.5776, "step": 1950 }, { "epoch": 0.15631183221210923, "grad_norm": 122.31990051269531, "learning_rate": 2.4349021883945175e-07, "loss": 4.5185, "step": 1975 }, { "epoch": 0.15829046299960428, "grad_norm": 229.08372497558594, "learning_rate": 2.434077748024641e-07, "loss": 4.6352, "step": 2000 }, { "epoch": 0.16026909378709933, "grad_norm": 136.54153442382812, "learning_rate": 2.433253307654764e-07, "loss": 4.5512, "step": 2025 }, { "epoch": 0.16224772457459438, "grad_norm": 237.05514526367188, "learning_rate": 2.432428867284887e-07, "loss": 4.7146, "step": 2050 }, { "epoch": 0.16422635536208943, "grad_norm": 149.2750244140625, "learning_rate": 2.43160442691501e-07, "loss": 4.6935, "step": 2075 }, { "epoch": 0.16620498614958448, "grad_norm": 149.77297973632812, "learning_rate": 2.4307799865451334e-07, "loss": 4.8223, "step": 2100 }, { "epoch": 0.16818361693707953, "grad_norm": 235.3883056640625, "learning_rate": 2.429955546175256e-07, "loss": 4.6266, "step": 2125 }, { "epoch": 0.17016224772457458, "grad_norm": 137.77316284179688, "learning_rate": 2.429131105805379e-07, "loss": 4.8543, "step": 2150 }, { "epoch": 0.17214087851206966, "grad_norm": 143.8935089111328, "learning_rate": 2.4283066654355025e-07, "loss": 4.651, "step": 2175 }, { "epoch": 0.1741195092995647, "grad_norm": 191.43856811523438, "learning_rate": 2.4274822250656253e-07, "loss": 4.4166, "step": 2200 }, { "epoch": 0.17609814008705976, "grad_norm": 135.82838439941406, "learning_rate": 2.426657784695748e-07, "loss": 4.7078, "step": 2225 }, { "epoch": 0.1780767708745548, "grad_norm": 114.28646087646484, "learning_rate": 2.4258333443258715e-07, "loss": 4.5316, "step": 2250 }, { "epoch": 0.18005540166204986, "grad_norm": 237.41001892089844, "learning_rate": 2.4250089039559944e-07, "loss": 4.4699, "step": 2275 }, { "epoch": 0.1820340324495449, "grad_norm": 124.57892608642578, "learning_rate": 2.424184463586117e-07, "loss": 4.5101, "step": 2300 }, { "epoch": 0.18401266323703996, "grad_norm": 147.15554809570312, "learning_rate": 2.4233600232162406e-07, "loss": 4.5974, "step": 2325 }, { "epoch": 0.18599129402453501, "grad_norm": 166.0609588623047, "learning_rate": 2.4225355828463635e-07, "loss": 4.5105, "step": 2350 }, { "epoch": 0.18796992481203006, "grad_norm": 188.97705078125, "learning_rate": 2.421711142476487e-07, "loss": 4.587, "step": 2375 }, { "epoch": 0.18994855559952512, "grad_norm": 243.09271240234375, "learning_rate": 2.4208867021066097e-07, "loss": 4.7686, "step": 2400 }, { "epoch": 0.1919271863870202, "grad_norm": 127.40078735351562, "learning_rate": 2.420062261736733e-07, "loss": 4.4476, "step": 2425 }, { "epoch": 0.19390581717451524, "grad_norm": 253.8776092529297, "learning_rate": 2.419237821366856e-07, "loss": 4.5478, "step": 2450 }, { "epoch": 0.1958844479620103, "grad_norm": 123.27115631103516, "learning_rate": 2.4184133809969793e-07, "loss": 4.3502, "step": 2475 }, { "epoch": 0.19786307874950534, "grad_norm": 138.00375366210938, "learning_rate": 2.417588940627102e-07, "loss": 4.3534, "step": 2500 }, { "epoch": 0.1998417095370004, "grad_norm": 115.53954315185547, "learning_rate": 2.4167645002572256e-07, "loss": 4.7066, "step": 2525 }, { "epoch": 0.20182034032449545, "grad_norm": 180.38809204101562, "learning_rate": 2.4159400598873484e-07, "loss": 4.6605, "step": 2550 }, { "epoch": 0.2037989711119905, "grad_norm": 129.8457489013672, "learning_rate": 2.415115619517472e-07, "loss": 4.3849, "step": 2575 }, { "epoch": 0.20577760189948555, "grad_norm": 156.64404296875, "learning_rate": 2.4142911791475946e-07, "loss": 4.3434, "step": 2600 }, { "epoch": 0.2077562326869806, "grad_norm": 162.81320190429688, "learning_rate": 2.4134667387777175e-07, "loss": 4.5466, "step": 2625 }, { "epoch": 0.20973486347447567, "grad_norm": 128.7244873046875, "learning_rate": 2.412642298407841e-07, "loss": 4.5358, "step": 2650 }, { "epoch": 0.21171349426197072, "grad_norm": 217.59042358398438, "learning_rate": 2.4118178580379637e-07, "loss": 4.5235, "step": 2675 }, { "epoch": 0.21369212504946578, "grad_norm": 144.84365844726562, "learning_rate": 2.410993417668087e-07, "loss": 4.3811, "step": 2700 }, { "epoch": 0.21567075583696083, "grad_norm": 146.22451782226562, "learning_rate": 2.41016897729821e-07, "loss": 4.3797, "step": 2725 }, { "epoch": 0.21764938662445588, "grad_norm": 198.39772033691406, "learning_rate": 2.409344536928333e-07, "loss": 4.4303, "step": 2750 }, { "epoch": 0.21962801741195093, "grad_norm": 158.10592651367188, "learning_rate": 2.408520096558456e-07, "loss": 4.3633, "step": 2775 }, { "epoch": 0.22160664819944598, "grad_norm": 166.79954528808594, "learning_rate": 2.407695656188579e-07, "loss": 4.5392, "step": 2800 }, { "epoch": 0.22358527898694103, "grad_norm": 207.30593872070312, "learning_rate": 2.406871215818702e-07, "loss": 4.5003, "step": 2825 }, { "epoch": 0.22556390977443608, "grad_norm": 128.81883239746094, "learning_rate": 2.4060467754488253e-07, "loss": 4.5416, "step": 2850 }, { "epoch": 0.22754254056193116, "grad_norm": 181.48960876464844, "learning_rate": 2.405222335078948e-07, "loss": 4.1725, "step": 2875 }, { "epoch": 0.2295211713494262, "grad_norm": 179.47384643554688, "learning_rate": 2.4043978947090715e-07, "loss": 4.5229, "step": 2900 }, { "epoch": 0.23149980213692126, "grad_norm": 144.242919921875, "learning_rate": 2.4035734543391943e-07, "loss": 4.3295, "step": 2925 }, { "epoch": 0.2334784329244163, "grad_norm": 177.61968994140625, "learning_rate": 2.402749013969317e-07, "loss": 4.4266, "step": 2950 }, { "epoch": 0.23545706371191136, "grad_norm": 143.8682861328125, "learning_rate": 2.4019245735994406e-07, "loss": 4.2341, "step": 2975 }, { "epoch": 0.2374356944994064, "grad_norm": 128.8461151123047, "learning_rate": 2.4011001332295634e-07, "loss": 4.3676, "step": 3000 }, { "epoch": 0.23941432528690146, "grad_norm": 160.70687866210938, "learning_rate": 2.400275692859687e-07, "loss": 4.3945, "step": 3025 }, { "epoch": 0.2413929560743965, "grad_norm": 157.65855407714844, "learning_rate": 2.3994512524898097e-07, "loss": 4.4967, "step": 3050 }, { "epoch": 0.24337158686189156, "grad_norm": 125.79988861083984, "learning_rate": 2.398626812119933e-07, "loss": 4.279, "step": 3075 }, { "epoch": 0.24535021764938664, "grad_norm": 168.8534698486328, "learning_rate": 2.397802371750056e-07, "loss": 4.4813, "step": 3100 }, { "epoch": 0.2473288484368817, "grad_norm": 120.4126968383789, "learning_rate": 2.3969779313801793e-07, "loss": 4.1997, "step": 3125 }, { "epoch": 0.24930747922437674, "grad_norm": 115.56365203857422, "learning_rate": 2.396153491010302e-07, "loss": 4.4076, "step": 3150 }, { "epoch": 0.2512861100118718, "grad_norm": 152.89859008789062, "learning_rate": 2.3953290506404255e-07, "loss": 4.2893, "step": 3175 }, { "epoch": 0.25326474079936684, "grad_norm": 177.6272735595703, "learning_rate": 2.3945046102705484e-07, "loss": 4.4892, "step": 3200 }, { "epoch": 0.2552433715868619, "grad_norm": 131.46661376953125, "learning_rate": 2.393680169900671e-07, "loss": 4.2702, "step": 3225 }, { "epoch": 0.25722200237435694, "grad_norm": 101.60210418701172, "learning_rate": 2.3928557295307946e-07, "loss": 4.2209, "step": 3250 }, { "epoch": 0.259200633161852, "grad_norm": 199.7799835205078, "learning_rate": 2.3920312891609174e-07, "loss": 4.1502, "step": 3275 }, { "epoch": 0.26117926394934704, "grad_norm": 163.44424438476562, "learning_rate": 2.391206848791041e-07, "loss": 4.3423, "step": 3300 }, { "epoch": 0.2631578947368421, "grad_norm": 148.59519958496094, "learning_rate": 2.3903824084211637e-07, "loss": 4.4833, "step": 3325 }, { "epoch": 0.26513652552433714, "grad_norm": 129.75927734375, "learning_rate": 2.3895579680512865e-07, "loss": 4.4745, "step": 3350 }, { "epoch": 0.2671151563118322, "grad_norm": 126.6795654296875, "learning_rate": 2.38873352768141e-07, "loss": 4.3964, "step": 3375 }, { "epoch": 0.26909378709932724, "grad_norm": 157.1032257080078, "learning_rate": 2.387909087311533e-07, "loss": 4.3419, "step": 3400 }, { "epoch": 0.2710724178868223, "grad_norm": 142.79139709472656, "learning_rate": 2.3870846469416556e-07, "loss": 4.2243, "step": 3425 }, { "epoch": 0.2730510486743174, "grad_norm": 137.3797607421875, "learning_rate": 2.386260206571779e-07, "loss": 4.1661, "step": 3450 }, { "epoch": 0.27502967946181245, "grad_norm": 148.77401733398438, "learning_rate": 2.385435766201902e-07, "loss": 4.483, "step": 3475 }, { "epoch": 0.2770083102493075, "grad_norm": 124.54267120361328, "learning_rate": 2.384611325832025e-07, "loss": 4.369, "step": 3500 }, { "epoch": 0.27898694103680255, "grad_norm": 113.43370056152344, "learning_rate": 2.383786885462148e-07, "loss": 4.1491, "step": 3525 }, { "epoch": 0.2809655718242976, "grad_norm": 155.67677307128906, "learning_rate": 2.3829624450922712e-07, "loss": 4.3403, "step": 3550 }, { "epoch": 0.28294420261179265, "grad_norm": 201.27784729003906, "learning_rate": 2.3821380047223943e-07, "loss": 4.3563, "step": 3575 }, { "epoch": 0.2849228333992877, "grad_norm": 104.74275970458984, "learning_rate": 2.3813135643525174e-07, "loss": 4.2706, "step": 3600 }, { "epoch": 0.28690146418678275, "grad_norm": 133.6251678466797, "learning_rate": 2.3804891239826405e-07, "loss": 4.3637, "step": 3625 }, { "epoch": 0.2888800949742778, "grad_norm": 102.35352325439453, "learning_rate": 2.3796646836127634e-07, "loss": 4.2585, "step": 3650 }, { "epoch": 0.29085872576177285, "grad_norm": 156.72654724121094, "learning_rate": 2.3788402432428868e-07, "loss": 4.3448, "step": 3675 }, { "epoch": 0.2928373565492679, "grad_norm": 121.19142150878906, "learning_rate": 2.3780158028730096e-07, "loss": 4.1475, "step": 3700 }, { "epoch": 0.29481598733676295, "grad_norm": 138.72952270507812, "learning_rate": 2.3771913625031327e-07, "loss": 4.2475, "step": 3725 }, { "epoch": 0.296794618124258, "grad_norm": 314.35113525390625, "learning_rate": 2.3763669221332559e-07, "loss": 4.2643, "step": 3750 }, { "epoch": 0.29877324891175305, "grad_norm": 131.71240234375, "learning_rate": 2.375542481763379e-07, "loss": 4.2741, "step": 3775 }, { "epoch": 0.3007518796992481, "grad_norm": 193.2744598388672, "learning_rate": 2.374718041393502e-07, "loss": 4.2314, "step": 3800 }, { "epoch": 0.30273051048674315, "grad_norm": 146.98760986328125, "learning_rate": 2.3738936010236252e-07, "loss": 4.5421, "step": 3825 }, { "epoch": 0.3047091412742382, "grad_norm": 106.49159240722656, "learning_rate": 2.373069160653748e-07, "loss": 4.0922, "step": 3850 }, { "epoch": 0.30668777206173325, "grad_norm": 128.12686157226562, "learning_rate": 2.3722447202838712e-07, "loss": 4.3171, "step": 3875 }, { "epoch": 0.30866640284922836, "grad_norm": 165.8458251953125, "learning_rate": 2.3714202799139943e-07, "loss": 4.1937, "step": 3900 }, { "epoch": 0.3106450336367234, "grad_norm": 129.49652099609375, "learning_rate": 2.3705958395441171e-07, "loss": 4.2486, "step": 3925 }, { "epoch": 0.31262366442421846, "grad_norm": 113.08882141113281, "learning_rate": 2.3697713991742405e-07, "loss": 3.9533, "step": 3950 }, { "epoch": 0.3146022952117135, "grad_norm": 116.51021575927734, "learning_rate": 2.3689469588043634e-07, "loss": 4.2525, "step": 3975 }, { "epoch": 0.31658092599920856, "grad_norm": 95.54279327392578, "learning_rate": 2.3681225184344867e-07, "loss": 4.1355, "step": 4000 }, { "epoch": 0.3185595567867036, "grad_norm": 123.10621643066406, "learning_rate": 2.3672980780646096e-07, "loss": 4.3705, "step": 4025 }, { "epoch": 0.32053818757419866, "grad_norm": 142.11273193359375, "learning_rate": 2.3664736376947327e-07, "loss": 4.2712, "step": 4050 }, { "epoch": 0.3225168183616937, "grad_norm": 162.17141723632812, "learning_rate": 2.3656491973248558e-07, "loss": 4.1127, "step": 4075 }, { "epoch": 0.32449544914918876, "grad_norm": 160.26893615722656, "learning_rate": 2.364824756954979e-07, "loss": 4.2687, "step": 4100 }, { "epoch": 0.3264740799366838, "grad_norm": 134.65093994140625, "learning_rate": 2.3640003165851018e-07, "loss": 4.3567, "step": 4125 }, { "epoch": 0.32845271072417886, "grad_norm": 178.23516845703125, "learning_rate": 2.3631758762152252e-07, "loss": 4.097, "step": 4150 }, { "epoch": 0.3304313415116739, "grad_norm": 151.1556396484375, "learning_rate": 2.362351435845348e-07, "loss": 4.1602, "step": 4175 }, { "epoch": 0.33240997229916897, "grad_norm": 154.64442443847656, "learning_rate": 2.3615269954754711e-07, "loss": 4.2365, "step": 4200 }, { "epoch": 0.334388603086664, "grad_norm": 226.6827850341797, "learning_rate": 2.3607025551055943e-07, "loss": 4.3196, "step": 4225 }, { "epoch": 0.33636723387415907, "grad_norm": 172.67916870117188, "learning_rate": 2.359878114735717e-07, "loss": 4.4476, "step": 4250 }, { "epoch": 0.3383458646616541, "grad_norm": 124.78984069824219, "learning_rate": 2.3590536743658405e-07, "loss": 4.4006, "step": 4275 }, { "epoch": 0.34032449544914917, "grad_norm": 156.81365966796875, "learning_rate": 2.3582292339959633e-07, "loss": 4.3914, "step": 4300 }, { "epoch": 0.3423031262366442, "grad_norm": 116.53181457519531, "learning_rate": 2.3574047936260865e-07, "loss": 4.2846, "step": 4325 }, { "epoch": 0.3442817570241393, "grad_norm": 146.16543579101562, "learning_rate": 2.3565803532562096e-07, "loss": 4.1371, "step": 4350 }, { "epoch": 0.3462603878116344, "grad_norm": 213.07974243164062, "learning_rate": 2.3557559128863327e-07, "loss": 4.2294, "step": 4375 }, { "epoch": 0.3482390185991294, "grad_norm": 99.38206481933594, "learning_rate": 2.3549314725164558e-07, "loss": 4.1726, "step": 4400 }, { "epoch": 0.3502176493866245, "grad_norm": 162.97059631347656, "learning_rate": 2.354107032146579e-07, "loss": 4.0507, "step": 4425 }, { "epoch": 0.3521962801741195, "grad_norm": 132.77474975585938, "learning_rate": 2.3532825917767018e-07, "loss": 4.0016, "step": 4450 }, { "epoch": 0.3541749109616146, "grad_norm": 126.9658203125, "learning_rate": 2.3524581514068252e-07, "loss": 4.2731, "step": 4475 }, { "epoch": 0.3561535417491096, "grad_norm": 194.47755432128906, "learning_rate": 2.351633711036948e-07, "loss": 4.1119, "step": 4500 }, { "epoch": 0.3581321725366047, "grad_norm": 153.6606903076172, "learning_rate": 2.3508092706670709e-07, "loss": 4.4556, "step": 4525 }, { "epoch": 0.3601108033240997, "grad_norm": 146.66709899902344, "learning_rate": 2.3499848302971942e-07, "loss": 4.3314, "step": 4550 }, { "epoch": 0.3620894341115948, "grad_norm": 111.01129913330078, "learning_rate": 2.349160389927317e-07, "loss": 4.2929, "step": 4575 }, { "epoch": 0.3640680648990898, "grad_norm": 137.40582275390625, "learning_rate": 2.3483359495574405e-07, "loss": 4.2198, "step": 4600 }, { "epoch": 0.3660466956865849, "grad_norm": 142.0623779296875, "learning_rate": 2.3475115091875633e-07, "loss": 4.2013, "step": 4625 }, { "epoch": 0.36802532647407993, "grad_norm": 135.2795867919922, "learning_rate": 2.3466870688176864e-07, "loss": 4.231, "step": 4650 }, { "epoch": 0.370003957261575, "grad_norm": 127.59281158447266, "learning_rate": 2.3458626284478096e-07, "loss": 3.9613, "step": 4675 }, { "epoch": 0.37198258804907003, "grad_norm": 132.48663330078125, "learning_rate": 2.3450381880779327e-07, "loss": 4.1925, "step": 4700 }, { "epoch": 0.3739612188365651, "grad_norm": 135.35409545898438, "learning_rate": 2.3442137477080555e-07, "loss": 4.1828, "step": 4725 }, { "epoch": 0.37593984962406013, "grad_norm": 107.55503845214844, "learning_rate": 2.343389307338179e-07, "loss": 4.2578, "step": 4750 }, { "epoch": 0.3779184804115552, "grad_norm": 132.79620361328125, "learning_rate": 2.3425648669683018e-07, "loss": 4.0254, "step": 4775 }, { "epoch": 0.37989711119905023, "grad_norm": 123.6044692993164, "learning_rate": 2.341740426598425e-07, "loss": 3.9981, "step": 4800 }, { "epoch": 0.38187574198654534, "grad_norm": 149.656005859375, "learning_rate": 2.340915986228548e-07, "loss": 4.1067, "step": 4825 }, { "epoch": 0.3838543727740404, "grad_norm": 122.97380065917969, "learning_rate": 2.3400915458586708e-07, "loss": 4.2396, "step": 4850 }, { "epoch": 0.38583300356153544, "grad_norm": 140.10183715820312, "learning_rate": 2.3392671054887942e-07, "loss": 4.1309, "step": 4875 }, { "epoch": 0.3878116343490305, "grad_norm": 137.91583251953125, "learning_rate": 2.338442665118917e-07, "loss": 4.0575, "step": 4900 }, { "epoch": 0.38979026513652554, "grad_norm": 137.72152709960938, "learning_rate": 2.3376182247490402e-07, "loss": 4.159, "step": 4925 }, { "epoch": 0.3917688959240206, "grad_norm": 84.3819808959961, "learning_rate": 2.3367937843791633e-07, "loss": 4.2611, "step": 4950 }, { "epoch": 0.39374752671151564, "grad_norm": 200.3111114501953, "learning_rate": 2.3359693440092864e-07, "loss": 4.167, "step": 4975 }, { "epoch": 0.3957261574990107, "grad_norm": 123.27460479736328, "learning_rate": 2.3351449036394095e-07, "loss": 4.2918, "step": 5000 }, { "epoch": 0.39770478828650574, "grad_norm": 111.70620727539062, "learning_rate": 2.3343204632695327e-07, "loss": 4.2242, "step": 5025 }, { "epoch": 0.3996834190740008, "grad_norm": 107.74165344238281, "learning_rate": 2.3334960228996555e-07, "loss": 4.3572, "step": 5050 }, { "epoch": 0.40166204986149584, "grad_norm": 138.31423950195312, "learning_rate": 2.332671582529779e-07, "loss": 4.1759, "step": 5075 }, { "epoch": 0.4036406806489909, "grad_norm": 104.73587799072266, "learning_rate": 2.3318471421599017e-07, "loss": 4.1695, "step": 5100 }, { "epoch": 0.40561931143648594, "grad_norm": 138.1061553955078, "learning_rate": 2.3310227017900246e-07, "loss": 4.0986, "step": 5125 }, { "epoch": 0.407597942223981, "grad_norm": 148.92279052734375, "learning_rate": 2.330198261420148e-07, "loss": 4.3455, "step": 5150 }, { "epoch": 0.40957657301147604, "grad_norm": 321.29852294921875, "learning_rate": 2.3293738210502708e-07, "loss": 4.1285, "step": 5175 }, { "epoch": 0.4115552037989711, "grad_norm": 114.85989379882812, "learning_rate": 2.3285493806803942e-07, "loss": 3.9628, "step": 5200 }, { "epoch": 0.41353383458646614, "grad_norm": 137.27610778808594, "learning_rate": 2.327724940310517e-07, "loss": 4.1521, "step": 5225 }, { "epoch": 0.4155124653739612, "grad_norm": 96.02686309814453, "learning_rate": 2.3269004999406402e-07, "loss": 4.027, "step": 5250 }, { "epoch": 0.4174910961614563, "grad_norm": 213.81649780273438, "learning_rate": 2.3260760595707633e-07, "loss": 4.0522, "step": 5275 }, { "epoch": 0.41946972694895135, "grad_norm": 160.4125518798828, "learning_rate": 2.3252516192008864e-07, "loss": 4.09, "step": 5300 }, { "epoch": 0.4214483577364464, "grad_norm": 167.58741760253906, "learning_rate": 2.3244271788310093e-07, "loss": 4.1128, "step": 5325 }, { "epoch": 0.42342698852394145, "grad_norm": 159.55303955078125, "learning_rate": 2.3236027384611326e-07, "loss": 4.0867, "step": 5350 }, { "epoch": 0.4254056193114365, "grad_norm": 122.51324462890625, "learning_rate": 2.3227782980912555e-07, "loss": 4.2261, "step": 5375 }, { "epoch": 0.42738425009893155, "grad_norm": 185.9108428955078, "learning_rate": 2.3219538577213789e-07, "loss": 3.9684, "step": 5400 }, { "epoch": 0.4293628808864266, "grad_norm": 195.37579345703125, "learning_rate": 2.3211294173515017e-07, "loss": 4.0779, "step": 5425 }, { "epoch": 0.43134151167392165, "grad_norm": 157.84371948242188, "learning_rate": 2.3203049769816246e-07, "loss": 4.1991, "step": 5450 }, { "epoch": 0.4333201424614167, "grad_norm": 111.01512908935547, "learning_rate": 2.319480536611748e-07, "loss": 3.9962, "step": 5475 }, { "epoch": 0.43529877324891175, "grad_norm": 114.49053955078125, "learning_rate": 2.3186560962418708e-07, "loss": 3.8972, "step": 5500 }, { "epoch": 0.4372774040364068, "grad_norm": 168.17874145507812, "learning_rate": 2.317831655871994e-07, "loss": 4.1913, "step": 5525 }, { "epoch": 0.43925603482390185, "grad_norm": 140.61912536621094, "learning_rate": 2.317007215502117e-07, "loss": 4.1396, "step": 5550 }, { "epoch": 0.4412346656113969, "grad_norm": 138.01805114746094, "learning_rate": 2.3161827751322401e-07, "loss": 4.1399, "step": 5575 }, { "epoch": 0.44321329639889195, "grad_norm": 188.0181427001953, "learning_rate": 2.3153583347623633e-07, "loss": 4.0329, "step": 5600 }, { "epoch": 0.445191927186387, "grad_norm": 170.8402099609375, "learning_rate": 2.3145338943924864e-07, "loss": 4.3414, "step": 5625 }, { "epoch": 0.44717055797388205, "grad_norm": 200.65077209472656, "learning_rate": 2.3137094540226092e-07, "loss": 4.2154, "step": 5650 }, { "epoch": 0.4491491887613771, "grad_norm": 120.18091583251953, "learning_rate": 2.3128850136527326e-07, "loss": 4.0372, "step": 5675 }, { "epoch": 0.45112781954887216, "grad_norm": 89.9730224609375, "learning_rate": 2.3120605732828555e-07, "loss": 4.1059, "step": 5700 }, { "epoch": 0.45310645033636726, "grad_norm": 133.7999267578125, "learning_rate": 2.3112361329129786e-07, "loss": 4.2035, "step": 5725 }, { "epoch": 0.4550850811238623, "grad_norm": 88.3386459350586, "learning_rate": 2.3104116925431017e-07, "loss": 4.0566, "step": 5750 }, { "epoch": 0.45706371191135736, "grad_norm": 130.95127868652344, "learning_rate": 2.3095872521732245e-07, "loss": 4.3084, "step": 5775 }, { "epoch": 0.4590423426988524, "grad_norm": 162.55679321289062, "learning_rate": 2.308762811803348e-07, "loss": 4.0288, "step": 5800 }, { "epoch": 0.46102097348634746, "grad_norm": 104.4178695678711, "learning_rate": 2.3079383714334708e-07, "loss": 3.9244, "step": 5825 }, { "epoch": 0.4629996042738425, "grad_norm": 235.28123474121094, "learning_rate": 2.307113931063594e-07, "loss": 4.1106, "step": 5850 }, { "epoch": 0.46497823506133756, "grad_norm": 289.6645812988281, "learning_rate": 2.306289490693717e-07, "loss": 4.0457, "step": 5875 }, { "epoch": 0.4669568658488326, "grad_norm": 99.97111511230469, "learning_rate": 2.30546505032384e-07, "loss": 4.2542, "step": 5900 }, { "epoch": 0.46893549663632766, "grad_norm": 260.0950622558594, "learning_rate": 2.304640609953963e-07, "loss": 4.1564, "step": 5925 }, { "epoch": 0.4709141274238227, "grad_norm": 113.74392700195312, "learning_rate": 2.3038161695840864e-07, "loss": 4.0403, "step": 5950 }, { "epoch": 0.47289275821131777, "grad_norm": 79.32340240478516, "learning_rate": 2.3029917292142092e-07, "loss": 4.0408, "step": 5975 }, { "epoch": 0.4748713889988128, "grad_norm": 95.92308807373047, "learning_rate": 2.3021672888443326e-07, "loss": 3.9811, "step": 6000 }, { "epoch": 0.47685001978630787, "grad_norm": 94.5758285522461, "learning_rate": 2.3013428484744554e-07, "loss": 4.1102, "step": 6025 }, { "epoch": 0.4788286505738029, "grad_norm": 142.32131958007812, "learning_rate": 2.3005184081045786e-07, "loss": 3.989, "step": 6050 }, { "epoch": 0.48080728136129797, "grad_norm": 97.84469604492188, "learning_rate": 2.2996939677347017e-07, "loss": 3.9512, "step": 6075 }, { "epoch": 0.482785912148793, "grad_norm": 94.38491821289062, "learning_rate": 2.2988695273648245e-07, "loss": 3.9475, "step": 6100 }, { "epoch": 0.48476454293628807, "grad_norm": 124.32872772216797, "learning_rate": 2.2980450869949476e-07, "loss": 4.1352, "step": 6125 }, { "epoch": 0.4867431737237831, "grad_norm": 196.1511993408203, "learning_rate": 2.2972206466250708e-07, "loss": 4.2956, "step": 6150 }, { "epoch": 0.48872180451127817, "grad_norm": 144.1227264404297, "learning_rate": 2.296396206255194e-07, "loss": 3.9718, "step": 6175 }, { "epoch": 0.4907004352987733, "grad_norm": 115.52275085449219, "learning_rate": 2.295571765885317e-07, "loss": 3.9135, "step": 6200 }, { "epoch": 0.4926790660862683, "grad_norm": 117.71548461914062, "learning_rate": 2.29474732551544e-07, "loss": 3.9026, "step": 6225 }, { "epoch": 0.4946576968737634, "grad_norm": 135.42698669433594, "learning_rate": 2.293922885145563e-07, "loss": 4.0369, "step": 6250 }, { "epoch": 0.4966363276612584, "grad_norm": 142.4741973876953, "learning_rate": 2.2930984447756863e-07, "loss": 4.3588, "step": 6275 }, { "epoch": 0.4986149584487535, "grad_norm": 128.56195068359375, "learning_rate": 2.2922740044058092e-07, "loss": 3.9089, "step": 6300 }, { "epoch": 0.5005935892362485, "grad_norm": 96.84894561767578, "learning_rate": 2.2914495640359323e-07, "loss": 4.1722, "step": 6325 }, { "epoch": 0.5025722200237436, "grad_norm": 236.92965698242188, "learning_rate": 2.2906251236660554e-07, "loss": 3.9729, "step": 6350 }, { "epoch": 0.5045508508112386, "grad_norm": 135.83609008789062, "learning_rate": 2.2898006832961783e-07, "loss": 4.0322, "step": 6375 }, { "epoch": 0.5065294815987337, "grad_norm": 123.36375427246094, "learning_rate": 2.2889762429263017e-07, "loss": 4.0042, "step": 6400 }, { "epoch": 0.5085081123862287, "grad_norm": 118.30574035644531, "learning_rate": 2.2881518025564245e-07, "loss": 4.1079, "step": 6425 }, { "epoch": 0.5104867431737238, "grad_norm": 107.81358337402344, "learning_rate": 2.2873273621865476e-07, "loss": 4.1198, "step": 6450 }, { "epoch": 0.5124653739612188, "grad_norm": 146.2493438720703, "learning_rate": 2.2865029218166707e-07, "loss": 4.0814, "step": 6475 }, { "epoch": 0.5144440047487139, "grad_norm": 136.8212890625, "learning_rate": 2.2856784814467939e-07, "loss": 4.0562, "step": 6500 }, { "epoch": 0.5164226355362089, "grad_norm": 139.30670166015625, "learning_rate": 2.2848540410769167e-07, "loss": 4.1199, "step": 6525 }, { "epoch": 0.518401266323704, "grad_norm": 194.90414428710938, "learning_rate": 2.28402960070704e-07, "loss": 4.0562, "step": 6550 }, { "epoch": 0.520379897111199, "grad_norm": 103.54257202148438, "learning_rate": 2.283205160337163e-07, "loss": 4.0797, "step": 6575 }, { "epoch": 0.5223585278986941, "grad_norm": 101.63102722167969, "learning_rate": 2.2823807199672863e-07, "loss": 4.0591, "step": 6600 }, { "epoch": 0.5243371586861891, "grad_norm": 104.28479766845703, "learning_rate": 2.2815562795974092e-07, "loss": 3.8991, "step": 6625 }, { "epoch": 0.5263157894736842, "grad_norm": 166.01107788085938, "learning_rate": 2.2807318392275323e-07, "loss": 4.1011, "step": 6650 }, { "epoch": 0.5282944202611792, "grad_norm": 154.64959716796875, "learning_rate": 2.2799073988576554e-07, "loss": 3.9283, "step": 6675 }, { "epoch": 0.5302730510486743, "grad_norm": 96.0099868774414, "learning_rate": 2.2790829584877782e-07, "loss": 3.8247, "step": 6700 }, { "epoch": 0.5322516818361693, "grad_norm": 120.90514373779297, "learning_rate": 2.2782585181179014e-07, "loss": 4.0629, "step": 6725 }, { "epoch": 0.5342303126236644, "grad_norm": 106.48863983154297, "learning_rate": 2.2774340777480245e-07, "loss": 4.0127, "step": 6750 }, { "epoch": 0.5362089434111594, "grad_norm": 113.17047882080078, "learning_rate": 2.2766096373781476e-07, "loss": 4.03, "step": 6775 }, { "epoch": 0.5381875741986545, "grad_norm": 130.6500701904297, "learning_rate": 2.2757851970082707e-07, "loss": 4.0192, "step": 6800 }, { "epoch": 0.5401662049861495, "grad_norm": 142.3747100830078, "learning_rate": 2.2749607566383938e-07, "loss": 4.1507, "step": 6825 }, { "epoch": 0.5421448357736446, "grad_norm": 125.88548278808594, "learning_rate": 2.2741363162685167e-07, "loss": 4.2026, "step": 6850 }, { "epoch": 0.5441234665611397, "grad_norm": 156.44570922851562, "learning_rate": 2.27331187589864e-07, "loss": 4.1063, "step": 6875 }, { "epoch": 0.5461020973486348, "grad_norm": 150.82635498046875, "learning_rate": 2.272487435528763e-07, "loss": 4.0477, "step": 6900 }, { "epoch": 0.5480807281361298, "grad_norm": 170.67994689941406, "learning_rate": 2.271662995158886e-07, "loss": 4.1398, "step": 6925 }, { "epoch": 0.5500593589236249, "grad_norm": 114.224609375, "learning_rate": 2.2708385547890091e-07, "loss": 4.0906, "step": 6950 }, { "epoch": 0.55203798971112, "grad_norm": 135.5966033935547, "learning_rate": 2.2700141144191323e-07, "loss": 3.872, "step": 6975 }, { "epoch": 0.554016620498615, "grad_norm": 120.73974609375, "learning_rate": 2.2691896740492554e-07, "loss": 3.9762, "step": 7000 }, { "epoch": 0.55599525128611, "grad_norm": 107.66891479492188, "learning_rate": 2.2683652336793782e-07, "loss": 4.0551, "step": 7025 }, { "epoch": 0.5579738820736051, "grad_norm": 107.60162353515625, "learning_rate": 2.2675407933095013e-07, "loss": 3.973, "step": 7050 }, { "epoch": 0.5599525128611001, "grad_norm": 118.88258361816406, "learning_rate": 2.2667163529396245e-07, "loss": 3.9864, "step": 7075 }, { "epoch": 0.5619311436485952, "grad_norm": 148.85667419433594, "learning_rate": 2.2658919125697476e-07, "loss": 3.9409, "step": 7100 }, { "epoch": 0.5639097744360902, "grad_norm": 148.57321166992188, "learning_rate": 2.2650674721998704e-07, "loss": 3.9611, "step": 7125 }, { "epoch": 0.5658884052235853, "grad_norm": 172.39999389648438, "learning_rate": 2.2642430318299938e-07, "loss": 3.97, "step": 7150 }, { "epoch": 0.5678670360110804, "grad_norm": 120.57051086425781, "learning_rate": 2.2634185914601167e-07, "loss": 3.9352, "step": 7175 }, { "epoch": 0.5698456667985754, "grad_norm": 143.2531280517578, "learning_rate": 2.26259415109024e-07, "loss": 3.9686, "step": 7200 }, { "epoch": 0.5718242975860705, "grad_norm": 123.57396697998047, "learning_rate": 2.261769710720363e-07, "loss": 4.0855, "step": 7225 }, { "epoch": 0.5738029283735655, "grad_norm": 115.12631225585938, "learning_rate": 2.260945270350486e-07, "loss": 3.9754, "step": 7250 }, { "epoch": 0.5757815591610606, "grad_norm": 114.95091247558594, "learning_rate": 2.260120829980609e-07, "loss": 3.8981, "step": 7275 }, { "epoch": 0.5777601899485556, "grad_norm": 105.46833038330078, "learning_rate": 2.2592963896107322e-07, "loss": 3.9452, "step": 7300 }, { "epoch": 0.5797388207360507, "grad_norm": 132.89012145996094, "learning_rate": 2.258471949240855e-07, "loss": 4.0808, "step": 7325 }, { "epoch": 0.5817174515235457, "grad_norm": 143.6460418701172, "learning_rate": 2.2576475088709782e-07, "loss": 4.0108, "step": 7350 }, { "epoch": 0.5836960823110408, "grad_norm": 130.83352661132812, "learning_rate": 2.2568230685011013e-07, "loss": 3.9701, "step": 7375 }, { "epoch": 0.5856747130985358, "grad_norm": 111.10405731201172, "learning_rate": 2.2559986281312244e-07, "loss": 4.3162, "step": 7400 }, { "epoch": 0.5876533438860309, "grad_norm": 163.31959533691406, "learning_rate": 2.2551741877613476e-07, "loss": 3.9096, "step": 7425 }, { "epoch": 0.5896319746735259, "grad_norm": 134.72927856445312, "learning_rate": 2.2543497473914704e-07, "loss": 3.8888, "step": 7450 }, { "epoch": 0.591610605461021, "grad_norm": 124.26619720458984, "learning_rate": 2.2535253070215938e-07, "loss": 4.0683, "step": 7475 }, { "epoch": 0.593589236248516, "grad_norm": 106.8174057006836, "learning_rate": 2.2527008666517166e-07, "loss": 4.0547, "step": 7500 }, { "epoch": 0.5955678670360111, "grad_norm": 108.11019897460938, "learning_rate": 2.2518764262818398e-07, "loss": 3.9728, "step": 7525 }, { "epoch": 0.5975464978235061, "grad_norm": 117.44151306152344, "learning_rate": 2.251051985911963e-07, "loss": 4.0569, "step": 7550 }, { "epoch": 0.5995251286110012, "grad_norm": 106.18008422851562, "learning_rate": 2.250227545542086e-07, "loss": 3.9042, "step": 7575 }, { "epoch": 0.6015037593984962, "grad_norm": 88.67406463623047, "learning_rate": 2.249403105172209e-07, "loss": 4.0719, "step": 7600 }, { "epoch": 0.6034823901859913, "grad_norm": 111.12770080566406, "learning_rate": 2.248578664802332e-07, "loss": 3.9749, "step": 7625 }, { "epoch": 0.6054610209734863, "grad_norm": 119.26530456542969, "learning_rate": 2.247754224432455e-07, "loss": 3.9832, "step": 7650 }, { "epoch": 0.6074396517609814, "grad_norm": 157.9289093017578, "learning_rate": 2.2469297840625782e-07, "loss": 3.9538, "step": 7675 }, { "epoch": 0.6094182825484764, "grad_norm": 122.70995330810547, "learning_rate": 2.2461053436927013e-07, "loss": 3.8497, "step": 7700 }, { "epoch": 0.6113969133359715, "grad_norm": 142.41835021972656, "learning_rate": 2.2452809033228242e-07, "loss": 3.9172, "step": 7725 }, { "epoch": 0.6133755441234665, "grad_norm": 128.31825256347656, "learning_rate": 2.2444564629529475e-07, "loss": 3.7326, "step": 7750 }, { "epoch": 0.6153541749109616, "grad_norm": 142.67408752441406, "learning_rate": 2.2436320225830704e-07, "loss": 3.8782, "step": 7775 }, { "epoch": 0.6173328056984567, "grad_norm": 145.0731658935547, "learning_rate": 2.2428075822131938e-07, "loss": 4.024, "step": 7800 }, { "epoch": 0.6193114364859518, "grad_norm": 187.09068298339844, "learning_rate": 2.2419831418433166e-07, "loss": 3.8939, "step": 7825 }, { "epoch": 0.6212900672734468, "grad_norm": 122.93965148925781, "learning_rate": 2.2411587014734397e-07, "loss": 4.0373, "step": 7850 }, { "epoch": 0.6232686980609419, "grad_norm": 152.1845245361328, "learning_rate": 2.2403342611035628e-07, "loss": 4.1168, "step": 7875 }, { "epoch": 0.6252473288484369, "grad_norm": 100.07666778564453, "learning_rate": 2.239509820733686e-07, "loss": 3.9718, "step": 7900 }, { "epoch": 0.627225959635932, "grad_norm": 130.85479736328125, "learning_rate": 2.2386853803638088e-07, "loss": 4.0301, "step": 7925 }, { "epoch": 0.629204590423427, "grad_norm": 123.073974609375, "learning_rate": 2.237860939993932e-07, "loss": 4.0104, "step": 7950 }, { "epoch": 0.6311832212109221, "grad_norm": 168.19808959960938, "learning_rate": 2.237036499624055e-07, "loss": 3.9421, "step": 7975 }, { "epoch": 0.6331618519984171, "grad_norm": 118.69593811035156, "learning_rate": 2.2362120592541782e-07, "loss": 3.8238, "step": 8000 }, { "epoch": 0.6351404827859122, "grad_norm": 192.9334259033203, "learning_rate": 2.2353876188843013e-07, "loss": 3.8227, "step": 8025 }, { "epoch": 0.6371191135734072, "grad_norm": 103.11824035644531, "learning_rate": 2.2345631785144241e-07, "loss": 3.9359, "step": 8050 }, { "epoch": 0.6390977443609023, "grad_norm": 129.3599090576172, "learning_rate": 2.2337387381445475e-07, "loss": 4.0562, "step": 8075 }, { "epoch": 0.6410763751483973, "grad_norm": 124.06795501708984, "learning_rate": 2.2329142977746704e-07, "loss": 4.1502, "step": 8100 }, { "epoch": 0.6430550059358924, "grad_norm": 113.18289184570312, "learning_rate": 2.2320898574047935e-07, "loss": 4.0059, "step": 8125 }, { "epoch": 0.6450336367233874, "grad_norm": 117.89970397949219, "learning_rate": 2.2312654170349166e-07, "loss": 4.0162, "step": 8150 }, { "epoch": 0.6470122675108825, "grad_norm": 109.6517105102539, "learning_rate": 2.2304409766650397e-07, "loss": 3.9979, "step": 8175 }, { "epoch": 0.6489908982983775, "grad_norm": 123.35499572753906, "learning_rate": 2.2296165362951628e-07, "loss": 3.9273, "step": 8200 }, { "epoch": 0.6509695290858726, "grad_norm": 141.97459411621094, "learning_rate": 2.228792095925286e-07, "loss": 4.0558, "step": 8225 }, { "epoch": 0.6529481598733676, "grad_norm": 159.06973266601562, "learning_rate": 2.2279676555554088e-07, "loss": 3.8374, "step": 8250 }, { "epoch": 0.6549267906608627, "grad_norm": 120.933837890625, "learning_rate": 2.227143215185532e-07, "loss": 3.8412, "step": 8275 }, { "epoch": 0.6569054214483577, "grad_norm": 106.266357421875, "learning_rate": 2.226318774815655e-07, "loss": 3.8757, "step": 8300 }, { "epoch": 0.6588840522358528, "grad_norm": 138.7765655517578, "learning_rate": 2.225494334445778e-07, "loss": 4.2284, "step": 8325 }, { "epoch": 0.6608626830233478, "grad_norm": 120.76045989990234, "learning_rate": 2.2246698940759013e-07, "loss": 3.9127, "step": 8350 }, { "epoch": 0.6628413138108429, "grad_norm": 117.31808471679688, "learning_rate": 2.223845453706024e-07, "loss": 3.7577, "step": 8375 }, { "epoch": 0.6648199445983379, "grad_norm": 108.21405029296875, "learning_rate": 2.2230210133361475e-07, "loss": 4.1095, "step": 8400 }, { "epoch": 0.666798575385833, "grad_norm": 126.65251159667969, "learning_rate": 2.2221965729662703e-07, "loss": 3.9047, "step": 8425 }, { "epoch": 0.668777206173328, "grad_norm": 135.06512451171875, "learning_rate": 2.2213721325963935e-07, "loss": 3.9267, "step": 8450 }, { "epoch": 0.6707558369608231, "grad_norm": 150.37025451660156, "learning_rate": 2.2205476922265166e-07, "loss": 3.9662, "step": 8475 }, { "epoch": 0.6727344677483181, "grad_norm": 138.01531982421875, "learning_rate": 2.2197232518566397e-07, "loss": 3.8107, "step": 8500 }, { "epoch": 0.6747130985358132, "grad_norm": 130.35153198242188, "learning_rate": 2.2188988114867625e-07, "loss": 3.8068, "step": 8525 }, { "epoch": 0.6766917293233082, "grad_norm": 161.9180145263672, "learning_rate": 2.218074371116886e-07, "loss": 4.0893, "step": 8550 }, { "epoch": 0.6786703601108033, "grad_norm": 165.08409118652344, "learning_rate": 2.2172499307470088e-07, "loss": 3.8419, "step": 8575 }, { "epoch": 0.6806489908982983, "grad_norm": 153.2915496826172, "learning_rate": 2.216425490377132e-07, "loss": 3.9302, "step": 8600 }, { "epoch": 0.6826276216857934, "grad_norm": 153.20138549804688, "learning_rate": 2.215601050007255e-07, "loss": 3.9947, "step": 8625 }, { "epoch": 0.6846062524732884, "grad_norm": 124.32341003417969, "learning_rate": 2.2147766096373779e-07, "loss": 3.8241, "step": 8650 }, { "epoch": 0.6865848832607835, "grad_norm": 209.813232421875, "learning_rate": 2.2139521692675012e-07, "loss": 3.917, "step": 8675 }, { "epoch": 0.6885635140482786, "grad_norm": 116.88125610351562, "learning_rate": 2.213127728897624e-07, "loss": 3.9474, "step": 8700 }, { "epoch": 0.6905421448357737, "grad_norm": 178.58721923828125, "learning_rate": 2.2123032885277472e-07, "loss": 3.9247, "step": 8725 }, { "epoch": 0.6925207756232687, "grad_norm": 123.67437744140625, "learning_rate": 2.2114788481578703e-07, "loss": 3.9548, "step": 8750 }, { "epoch": 0.6944994064107638, "grad_norm": 154.60626220703125, "learning_rate": 2.2106544077879934e-07, "loss": 3.8239, "step": 8775 }, { "epoch": 0.6964780371982588, "grad_norm": 141.65699768066406, "learning_rate": 2.2098299674181166e-07, "loss": 3.8458, "step": 8800 }, { "epoch": 0.6984566679857539, "grad_norm": 112.9280776977539, "learning_rate": 2.2090055270482397e-07, "loss": 3.9648, "step": 8825 }, { "epoch": 0.700435298773249, "grad_norm": 154.57643127441406, "learning_rate": 2.2081810866783625e-07, "loss": 4.0078, "step": 8850 }, { "epoch": 0.702413929560744, "grad_norm": 129.07418823242188, "learning_rate": 2.207356646308486e-07, "loss": 4.0276, "step": 8875 }, { "epoch": 0.704392560348239, "grad_norm": 113.59859466552734, "learning_rate": 2.2065322059386088e-07, "loss": 4.1596, "step": 8900 }, { "epoch": 0.7063711911357341, "grad_norm": 136.26283264160156, "learning_rate": 2.2057077655687316e-07, "loss": 3.948, "step": 8925 }, { "epoch": 0.7083498219232292, "grad_norm": 118.27870178222656, "learning_rate": 2.204883325198855e-07, "loss": 3.7611, "step": 8950 }, { "epoch": 0.7103284527107242, "grad_norm": 159.56643676757812, "learning_rate": 2.2040588848289778e-07, "loss": 4.0215, "step": 8975 }, { "epoch": 0.7123070834982193, "grad_norm": 125.84573364257812, "learning_rate": 2.2032344444591012e-07, "loss": 4.0046, "step": 9000 }, { "epoch": 0.7142857142857143, "grad_norm": 148.7548065185547, "learning_rate": 2.202410004089224e-07, "loss": 4.0033, "step": 9025 }, { "epoch": 0.7162643450732094, "grad_norm": 109.41517639160156, "learning_rate": 2.2015855637193472e-07, "loss": 3.9298, "step": 9050 }, { "epoch": 0.7182429758607044, "grad_norm": 112.52848815917969, "learning_rate": 2.2007611233494703e-07, "loss": 4.1003, "step": 9075 }, { "epoch": 0.7202216066481995, "grad_norm": 114.72808074951172, "learning_rate": 2.1999366829795934e-07, "loss": 4.1112, "step": 9100 }, { "epoch": 0.7222002374356945, "grad_norm": 144.11619567871094, "learning_rate": 2.1991122426097163e-07, "loss": 3.764, "step": 9125 }, { "epoch": 0.7241788682231896, "grad_norm": 118.64055633544922, "learning_rate": 2.1982878022398396e-07, "loss": 3.9262, "step": 9150 }, { "epoch": 0.7261574990106846, "grad_norm": 166.79525756835938, "learning_rate": 2.1974633618699625e-07, "loss": 4.0518, "step": 9175 }, { "epoch": 0.7281361297981797, "grad_norm": 128.2512969970703, "learning_rate": 2.1966389215000856e-07, "loss": 3.952, "step": 9200 }, { "epoch": 0.7301147605856747, "grad_norm": 143.56414794921875, "learning_rate": 2.1958144811302087e-07, "loss": 3.7034, "step": 9225 }, { "epoch": 0.7320933913731698, "grad_norm": 120.13394165039062, "learning_rate": 2.1949900407603316e-07, "loss": 3.7839, "step": 9250 }, { "epoch": 0.7340720221606648, "grad_norm": 148.74070739746094, "learning_rate": 2.194165600390455e-07, "loss": 3.8871, "step": 9275 }, { "epoch": 0.7360506529481599, "grad_norm": 148.17022705078125, "learning_rate": 2.1933411600205778e-07, "loss": 3.7486, "step": 9300 }, { "epoch": 0.7380292837356549, "grad_norm": 112.7260513305664, "learning_rate": 2.192516719650701e-07, "loss": 3.9436, "step": 9325 }, { "epoch": 0.74000791452315, "grad_norm": 131.4718780517578, "learning_rate": 2.191692279280824e-07, "loss": 4.1101, "step": 9350 }, { "epoch": 0.741986545310645, "grad_norm": 106.73101043701172, "learning_rate": 2.1908678389109472e-07, "loss": 3.9285, "step": 9375 }, { "epoch": 0.7439651760981401, "grad_norm": 120.58040618896484, "learning_rate": 2.1900433985410703e-07, "loss": 3.8471, "step": 9400 }, { "epoch": 0.7459438068856351, "grad_norm": 135.69512939453125, "learning_rate": 2.1892189581711934e-07, "loss": 3.7629, "step": 9425 }, { "epoch": 0.7479224376731302, "grad_norm": 125.78627014160156, "learning_rate": 2.1883945178013162e-07, "loss": 4.0646, "step": 9450 }, { "epoch": 0.7499010684606252, "grad_norm": 150.2305145263672, "learning_rate": 2.1875700774314396e-07, "loss": 3.9361, "step": 9475 }, { "epoch": 0.7518796992481203, "grad_norm": 95.4436264038086, "learning_rate": 2.1867456370615625e-07, "loss": 3.7688, "step": 9500 }, { "epoch": 0.7538583300356153, "grad_norm": 141.27809143066406, "learning_rate": 2.1859211966916853e-07, "loss": 4.0217, "step": 9525 }, { "epoch": 0.7558369608231104, "grad_norm": 133.8254852294922, "learning_rate": 2.1850967563218087e-07, "loss": 4.0683, "step": 9550 }, { "epoch": 0.7578155916106054, "grad_norm": 139.919189453125, "learning_rate": 2.1842723159519316e-07, "loss": 3.9958, "step": 9575 }, { "epoch": 0.7597942223981005, "grad_norm": 173.58946228027344, "learning_rate": 2.183447875582055e-07, "loss": 3.9474, "step": 9600 }, { "epoch": 0.7617728531855956, "grad_norm": 107.07398223876953, "learning_rate": 2.1826234352121778e-07, "loss": 3.8308, "step": 9625 }, { "epoch": 0.7637514839730907, "grad_norm": 124.00753784179688, "learning_rate": 2.181798994842301e-07, "loss": 3.8218, "step": 9650 }, { "epoch": 0.7657301147605857, "grad_norm": 138.23736572265625, "learning_rate": 2.180974554472424e-07, "loss": 3.7296, "step": 9675 }, { "epoch": 0.7677087455480808, "grad_norm": 128.9496612548828, "learning_rate": 2.1801501141025471e-07, "loss": 3.9163, "step": 9700 }, { "epoch": 0.7696873763355758, "grad_norm": 108.07875061035156, "learning_rate": 2.17932567373267e-07, "loss": 3.9408, "step": 9725 }, { "epoch": 0.7716660071230709, "grad_norm": 126.18501281738281, "learning_rate": 2.1785012333627934e-07, "loss": 4.1993, "step": 9750 }, { "epoch": 0.7736446379105659, "grad_norm": 144.8102264404297, "learning_rate": 2.1776767929929162e-07, "loss": 3.877, "step": 9775 }, { "epoch": 0.775623268698061, "grad_norm": 118.8504638671875, "learning_rate": 2.1768523526230396e-07, "loss": 3.9788, "step": 9800 }, { "epoch": 0.777601899485556, "grad_norm": 127.45133209228516, "learning_rate": 2.1760279122531625e-07, "loss": 3.8987, "step": 9825 }, { "epoch": 0.7795805302730511, "grad_norm": 134.95892333984375, "learning_rate": 2.1752034718832853e-07, "loss": 3.8251, "step": 9850 }, { "epoch": 0.7815591610605461, "grad_norm": 124.00614929199219, "learning_rate": 2.1743790315134087e-07, "loss": 3.6875, "step": 9875 }, { "epoch": 0.7835377918480412, "grad_norm": 126.81105041503906, "learning_rate": 2.1735545911435315e-07, "loss": 3.7447, "step": 9900 }, { "epoch": 0.7855164226355362, "grad_norm": 106.54443359375, "learning_rate": 2.172730150773655e-07, "loss": 3.9051, "step": 9925 }, { "epoch": 0.7874950534230313, "grad_norm": 156.5098876953125, "learning_rate": 2.1719057104037778e-07, "loss": 3.9587, "step": 9950 }, { "epoch": 0.7894736842105263, "grad_norm": 128.83648681640625, "learning_rate": 2.171081270033901e-07, "loss": 4.0755, "step": 9975 }, { "epoch": 0.7914523149980214, "grad_norm": 131.54664611816406, "learning_rate": 2.170256829664024e-07, "loss": 4.0072, "step": 10000 }, { "epoch": 0.7914523149980214, "eval_loss": 3.8681728839874268, "eval_runtime": 9.5698, "eval_samples_per_second": 264.165, "eval_steps_per_second": 33.021, "step": 10000 } ], "logging_steps": 25, "max_steps": 75810, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 35767296000000.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }