{ |
|
"best_metric": 3.8681728839874268, |
|
"best_model_checkpoint": "checkpoints/test_1M_1-2025-02-12-12-32/checkpoint-10000", |
|
"epoch": 0.7914523149980214, |
|
"eval_steps": 10000, |
|
"global_step": 10000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0019786307874950534, |
|
"grad_norm": 254.82342529296875, |
|
"learning_rate": 2.499208537244918e-07, |
|
"loss": 5.7705, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.003957261574990107, |
|
"grad_norm": 153.19989013671875, |
|
"learning_rate": 2.498384096875041e-07, |
|
"loss": 5.6747, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.00593589236248516, |
|
"grad_norm": 224.5292510986328, |
|
"learning_rate": 2.4975596565051644e-07, |
|
"loss": 5.6201, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.007914523149980214, |
|
"grad_norm": 175.854248046875, |
|
"learning_rate": 2.4967352161352873e-07, |
|
"loss": 5.6974, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.009893153937475268, |
|
"grad_norm": 163.52769470214844, |
|
"learning_rate": 2.49591077576541e-07, |
|
"loss": 5.5417, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.01187178472497032, |
|
"grad_norm": 254.2264862060547, |
|
"learning_rate": 2.4950863353955335e-07, |
|
"loss": 5.8201, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.013850415512465374, |
|
"grad_norm": 175.30279541015625, |
|
"learning_rate": 2.4942618950256564e-07, |
|
"loss": 5.5302, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.015829046299960427, |
|
"grad_norm": 300.1286315917969, |
|
"learning_rate": 2.4934374546557797e-07, |
|
"loss": 5.6572, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.01780767708745548, |
|
"grad_norm": 201.56961059570312, |
|
"learning_rate": 2.4926130142859026e-07, |
|
"loss": 5.2914, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.019786307874950535, |
|
"grad_norm": 245.64854431152344, |
|
"learning_rate": 2.491788573916026e-07, |
|
"loss": 5.4478, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.02176493866244559, |
|
"grad_norm": 239.78257751464844, |
|
"learning_rate": 2.490964133546149e-07, |
|
"loss": 5.4161, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.02374356944994064, |
|
"grad_norm": 150.18310546875, |
|
"learning_rate": 2.4901396931762717e-07, |
|
"loss": 5.4978, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.025722200237435693, |
|
"grad_norm": 172.03607177734375, |
|
"learning_rate": 2.489315252806395e-07, |
|
"loss": 5.4105, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.027700831024930747, |
|
"grad_norm": 343.2570495605469, |
|
"learning_rate": 2.488490812436518e-07, |
|
"loss": 5.5195, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.0296794618124258, |
|
"grad_norm": 329.7228698730469, |
|
"learning_rate": 2.4876663720666413e-07, |
|
"loss": 5.4494, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.031658092599920855, |
|
"grad_norm": 174.63136291503906, |
|
"learning_rate": 2.486841931696764e-07, |
|
"loss": 5.2864, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.033636723387415905, |
|
"grad_norm": 356.6216125488281, |
|
"learning_rate": 2.486017491326887e-07, |
|
"loss": 5.4376, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.03561535417491096, |
|
"grad_norm": 166.16783142089844, |
|
"learning_rate": 2.4851930509570104e-07, |
|
"loss": 5.3141, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.03759398496240601, |
|
"grad_norm": 220.06170654296875, |
|
"learning_rate": 2.484368610587133e-07, |
|
"loss": 5.5457, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.03957261574990107, |
|
"grad_norm": 154.55517578125, |
|
"learning_rate": 2.483544170217256e-07, |
|
"loss": 5.1264, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.04155124653739612, |
|
"grad_norm": 184.18443298339844, |
|
"learning_rate": 2.4827197298473794e-07, |
|
"loss": 5.3702, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.04352987732489118, |
|
"grad_norm": 128.84693908691406, |
|
"learning_rate": 2.4818952894775023e-07, |
|
"loss": 5.0207, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.04550850811238623, |
|
"grad_norm": 196.2894287109375, |
|
"learning_rate": 2.4810708491076257e-07, |
|
"loss": 5.315, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.04748713889988128, |
|
"grad_norm": 200.00257873535156, |
|
"learning_rate": 2.4802464087377485e-07, |
|
"loss": 5.2215, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.049465769687376336, |
|
"grad_norm": 271.8963928222656, |
|
"learning_rate": 2.479421968367872e-07, |
|
"loss": 5.3286, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.051444400474871387, |
|
"grad_norm": 181.56686401367188, |
|
"learning_rate": 2.478597527997995e-07, |
|
"loss": 4.9967, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.053423031262366444, |
|
"grad_norm": 242.8925323486328, |
|
"learning_rate": 2.477773087628118e-07, |
|
"loss": 5.1984, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.055401662049861494, |
|
"grad_norm": 210.05746459960938, |
|
"learning_rate": 2.476948647258241e-07, |
|
"loss": 5.0975, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.05738029283735655, |
|
"grad_norm": 181.1220245361328, |
|
"learning_rate": 2.476124206888364e-07, |
|
"loss": 5.0036, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.0593589236248516, |
|
"grad_norm": 166.00709533691406, |
|
"learning_rate": 2.475299766518487e-07, |
|
"loss": 5.3082, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.06133755441234666, |
|
"grad_norm": 151.4649200439453, |
|
"learning_rate": 2.47447532614861e-07, |
|
"loss": 5.1391, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.06331618519984171, |
|
"grad_norm": 149.88165283203125, |
|
"learning_rate": 2.4736508857787335e-07, |
|
"loss": 5.0783, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.06529481598733676, |
|
"grad_norm": 172.47061157226562, |
|
"learning_rate": 2.4728264454088563e-07, |
|
"loss": 4.9624, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.06727344677483181, |
|
"grad_norm": 298.1490478515625, |
|
"learning_rate": 2.4720020050389797e-07, |
|
"loss": 5.1937, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.06925207756232687, |
|
"grad_norm": 164.37867736816406, |
|
"learning_rate": 2.4711775646691025e-07, |
|
"loss": 5.1792, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.07123070834982193, |
|
"grad_norm": 216.8033905029297, |
|
"learning_rate": 2.4703531242992254e-07, |
|
"loss": 5.152, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.07320933913731698, |
|
"grad_norm": 211.95762634277344, |
|
"learning_rate": 2.469528683929349e-07, |
|
"loss": 4.9146, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.07518796992481203, |
|
"grad_norm": 257.61968994140625, |
|
"learning_rate": 2.4687042435594716e-07, |
|
"loss": 5.095, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.07716660071230709, |
|
"grad_norm": 179.43719482421875, |
|
"learning_rate": 2.467879803189595e-07, |
|
"loss": 5.0316, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.07914523149980214, |
|
"grad_norm": 180.3157958984375, |
|
"learning_rate": 2.467055362819718e-07, |
|
"loss": 4.9441, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.08112386228729719, |
|
"grad_norm": 162.77447509765625, |
|
"learning_rate": 2.4662309224498407e-07, |
|
"loss": 4.9724, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.08310249307479224, |
|
"grad_norm": 123.65939331054688, |
|
"learning_rate": 2.465406482079964e-07, |
|
"loss": 5.2271, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.08508112386228729, |
|
"grad_norm": 163.114990234375, |
|
"learning_rate": 2.464582041710087e-07, |
|
"loss": 4.9724, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.08705975464978236, |
|
"grad_norm": 204.76400756835938, |
|
"learning_rate": 2.46375760134021e-07, |
|
"loss": 4.8724, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.0890383854372774, |
|
"grad_norm": 307.8963623046875, |
|
"learning_rate": 2.462933160970333e-07, |
|
"loss": 4.9256, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.09101701622477246, |
|
"grad_norm": 133.03707885742188, |
|
"learning_rate": 2.462108720600456e-07, |
|
"loss": 4.8792, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.09299564701226751, |
|
"grad_norm": 161.41697692871094, |
|
"learning_rate": 2.4612842802305794e-07, |
|
"loss": 5.054, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.09497427779976256, |
|
"grad_norm": 135.36228942871094, |
|
"learning_rate": 2.460459839860702e-07, |
|
"loss": 4.8655, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.09695290858725762, |
|
"grad_norm": 179.60646057128906, |
|
"learning_rate": 2.4596353994908256e-07, |
|
"loss": 4.7832, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.09893153937475267, |
|
"grad_norm": 335.71380615234375, |
|
"learning_rate": 2.4588109591209485e-07, |
|
"loss": 4.9979, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.10091017016224772, |
|
"grad_norm": 149.5147247314453, |
|
"learning_rate": 2.457986518751072e-07, |
|
"loss": 4.714, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.10288880094974277, |
|
"grad_norm": 154.0236358642578, |
|
"learning_rate": 2.4571620783811947e-07, |
|
"loss": 4.8015, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.10486743173723784, |
|
"grad_norm": 450.5319519042969, |
|
"learning_rate": 2.456337638011318e-07, |
|
"loss": 4.6914, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.10684606252473289, |
|
"grad_norm": 195.87863159179688, |
|
"learning_rate": 2.455513197641441e-07, |
|
"loss": 5.0124, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.10882469331222794, |
|
"grad_norm": 198.12225341796875, |
|
"learning_rate": 2.454688757271564e-07, |
|
"loss": 4.5305, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.11080332409972299, |
|
"grad_norm": 161.57623291015625, |
|
"learning_rate": 2.453864316901687e-07, |
|
"loss": 4.7806, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.11278195488721804, |
|
"grad_norm": 187.8081817626953, |
|
"learning_rate": 2.45303987653181e-07, |
|
"loss": 4.9401, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.1147605856747131, |
|
"grad_norm": 160.1893768310547, |
|
"learning_rate": 2.4522154361619334e-07, |
|
"loss": 4.8119, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.11673921646220815, |
|
"grad_norm": 181.8563995361328, |
|
"learning_rate": 2.4513909957920563e-07, |
|
"loss": 4.7979, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.1187178472497032, |
|
"grad_norm": 184.80641174316406, |
|
"learning_rate": 2.4505665554221796e-07, |
|
"loss": 4.8448, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.12069647803719825, |
|
"grad_norm": 151.4502410888672, |
|
"learning_rate": 2.4497421150523025e-07, |
|
"loss": 4.7101, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.12267510882469332, |
|
"grad_norm": 163.2119598388672, |
|
"learning_rate": 2.4489176746824253e-07, |
|
"loss": 4.8802, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.12465373961218837, |
|
"grad_norm": 147.33741760253906, |
|
"learning_rate": 2.4480932343125487e-07, |
|
"loss": 4.6433, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.12663237039968342, |
|
"grad_norm": 145.84716796875, |
|
"learning_rate": 2.4472687939426716e-07, |
|
"loss": 4.4118, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.12861100118717847, |
|
"grad_norm": 111.55641174316406, |
|
"learning_rate": 2.4464443535727944e-07, |
|
"loss": 4.819, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.13058963197467352, |
|
"grad_norm": 145.68092346191406, |
|
"learning_rate": 2.445619913202918e-07, |
|
"loss": 4.7752, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.13256826276216857, |
|
"grad_norm": 274.0830078125, |
|
"learning_rate": 2.4447954728330407e-07, |
|
"loss": 4.8566, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.13454689354966362, |
|
"grad_norm": 141.83982849121094, |
|
"learning_rate": 2.4439710324631635e-07, |
|
"loss": 4.6643, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.1365255243371587, |
|
"grad_norm": 182.46160888671875, |
|
"learning_rate": 2.443146592093287e-07, |
|
"loss": 4.731, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.13850415512465375, |
|
"grad_norm": 200.28773498535156, |
|
"learning_rate": 2.44232215172341e-07, |
|
"loss": 4.5525, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.1404827859121488, |
|
"grad_norm": 163.7792510986328, |
|
"learning_rate": 2.441497711353533e-07, |
|
"loss": 4.8076, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.14246141669964385, |
|
"grad_norm": 422.9642639160156, |
|
"learning_rate": 2.440673270983656e-07, |
|
"loss": 4.7045, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.1444400474871389, |
|
"grad_norm": 187.99957275390625, |
|
"learning_rate": 2.4398488306137794e-07, |
|
"loss": 4.6615, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.14641867827463395, |
|
"grad_norm": 144.52732849121094, |
|
"learning_rate": 2.439024390243902e-07, |
|
"loss": 4.7912, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.148397309062129, |
|
"grad_norm": 192.0771026611328, |
|
"learning_rate": 2.4381999498740256e-07, |
|
"loss": 4.7916, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.15037593984962405, |
|
"grad_norm": 148.06878662109375, |
|
"learning_rate": 2.4373755095041484e-07, |
|
"loss": 4.7782, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.1523545706371191, |
|
"grad_norm": 131.4456329345703, |
|
"learning_rate": 2.436551069134272e-07, |
|
"loss": 4.579, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.15433320142461418, |
|
"grad_norm": 141.84681701660156, |
|
"learning_rate": 2.4357266287643947e-07, |
|
"loss": 4.5776, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.15631183221210923, |
|
"grad_norm": 122.31990051269531, |
|
"learning_rate": 2.4349021883945175e-07, |
|
"loss": 4.5185, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.15829046299960428, |
|
"grad_norm": 229.08372497558594, |
|
"learning_rate": 2.434077748024641e-07, |
|
"loss": 4.6352, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.16026909378709933, |
|
"grad_norm": 136.54153442382812, |
|
"learning_rate": 2.433253307654764e-07, |
|
"loss": 4.5512, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.16224772457459438, |
|
"grad_norm": 237.05514526367188, |
|
"learning_rate": 2.432428867284887e-07, |
|
"loss": 4.7146, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.16422635536208943, |
|
"grad_norm": 149.2750244140625, |
|
"learning_rate": 2.43160442691501e-07, |
|
"loss": 4.6935, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.16620498614958448, |
|
"grad_norm": 149.77297973632812, |
|
"learning_rate": 2.4307799865451334e-07, |
|
"loss": 4.8223, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.16818361693707953, |
|
"grad_norm": 235.3883056640625, |
|
"learning_rate": 2.429955546175256e-07, |
|
"loss": 4.6266, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.17016224772457458, |
|
"grad_norm": 137.77316284179688, |
|
"learning_rate": 2.429131105805379e-07, |
|
"loss": 4.8543, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.17214087851206966, |
|
"grad_norm": 143.8935089111328, |
|
"learning_rate": 2.4283066654355025e-07, |
|
"loss": 4.651, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.1741195092995647, |
|
"grad_norm": 191.43856811523438, |
|
"learning_rate": 2.4274822250656253e-07, |
|
"loss": 4.4166, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.17609814008705976, |
|
"grad_norm": 135.82838439941406, |
|
"learning_rate": 2.426657784695748e-07, |
|
"loss": 4.7078, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.1780767708745548, |
|
"grad_norm": 114.28646087646484, |
|
"learning_rate": 2.4258333443258715e-07, |
|
"loss": 4.5316, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.18005540166204986, |
|
"grad_norm": 237.41001892089844, |
|
"learning_rate": 2.4250089039559944e-07, |
|
"loss": 4.4699, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.1820340324495449, |
|
"grad_norm": 124.57892608642578, |
|
"learning_rate": 2.424184463586117e-07, |
|
"loss": 4.5101, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.18401266323703996, |
|
"grad_norm": 147.15554809570312, |
|
"learning_rate": 2.4233600232162406e-07, |
|
"loss": 4.5974, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.18599129402453501, |
|
"grad_norm": 166.0609588623047, |
|
"learning_rate": 2.4225355828463635e-07, |
|
"loss": 4.5105, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.18796992481203006, |
|
"grad_norm": 188.97705078125, |
|
"learning_rate": 2.421711142476487e-07, |
|
"loss": 4.587, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.18994855559952512, |
|
"grad_norm": 243.09271240234375, |
|
"learning_rate": 2.4208867021066097e-07, |
|
"loss": 4.7686, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.1919271863870202, |
|
"grad_norm": 127.40078735351562, |
|
"learning_rate": 2.420062261736733e-07, |
|
"loss": 4.4476, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.19390581717451524, |
|
"grad_norm": 253.8776092529297, |
|
"learning_rate": 2.419237821366856e-07, |
|
"loss": 4.5478, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.1958844479620103, |
|
"grad_norm": 123.27115631103516, |
|
"learning_rate": 2.4184133809969793e-07, |
|
"loss": 4.3502, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.19786307874950534, |
|
"grad_norm": 138.00375366210938, |
|
"learning_rate": 2.417588940627102e-07, |
|
"loss": 4.3534, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.1998417095370004, |
|
"grad_norm": 115.53954315185547, |
|
"learning_rate": 2.4167645002572256e-07, |
|
"loss": 4.7066, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.20182034032449545, |
|
"grad_norm": 180.38809204101562, |
|
"learning_rate": 2.4159400598873484e-07, |
|
"loss": 4.6605, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.2037989711119905, |
|
"grad_norm": 129.8457489013672, |
|
"learning_rate": 2.415115619517472e-07, |
|
"loss": 4.3849, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.20577760189948555, |
|
"grad_norm": 156.64404296875, |
|
"learning_rate": 2.4142911791475946e-07, |
|
"loss": 4.3434, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.2077562326869806, |
|
"grad_norm": 162.81320190429688, |
|
"learning_rate": 2.4134667387777175e-07, |
|
"loss": 4.5466, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.20973486347447567, |
|
"grad_norm": 128.7244873046875, |
|
"learning_rate": 2.412642298407841e-07, |
|
"loss": 4.5358, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.21171349426197072, |
|
"grad_norm": 217.59042358398438, |
|
"learning_rate": 2.4118178580379637e-07, |
|
"loss": 4.5235, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.21369212504946578, |
|
"grad_norm": 144.84365844726562, |
|
"learning_rate": 2.410993417668087e-07, |
|
"loss": 4.3811, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.21567075583696083, |
|
"grad_norm": 146.22451782226562, |
|
"learning_rate": 2.41016897729821e-07, |
|
"loss": 4.3797, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 0.21764938662445588, |
|
"grad_norm": 198.39772033691406, |
|
"learning_rate": 2.409344536928333e-07, |
|
"loss": 4.4303, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.21962801741195093, |
|
"grad_norm": 158.10592651367188, |
|
"learning_rate": 2.408520096558456e-07, |
|
"loss": 4.3633, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.22160664819944598, |
|
"grad_norm": 166.79954528808594, |
|
"learning_rate": 2.407695656188579e-07, |
|
"loss": 4.5392, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.22358527898694103, |
|
"grad_norm": 207.30593872070312, |
|
"learning_rate": 2.406871215818702e-07, |
|
"loss": 4.5003, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 0.22556390977443608, |
|
"grad_norm": 128.81883239746094, |
|
"learning_rate": 2.4060467754488253e-07, |
|
"loss": 4.5416, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.22754254056193116, |
|
"grad_norm": 181.48960876464844, |
|
"learning_rate": 2.405222335078948e-07, |
|
"loss": 4.1725, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 0.2295211713494262, |
|
"grad_norm": 179.47384643554688, |
|
"learning_rate": 2.4043978947090715e-07, |
|
"loss": 4.5229, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.23149980213692126, |
|
"grad_norm": 144.242919921875, |
|
"learning_rate": 2.4035734543391943e-07, |
|
"loss": 4.3295, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 0.2334784329244163, |
|
"grad_norm": 177.61968994140625, |
|
"learning_rate": 2.402749013969317e-07, |
|
"loss": 4.4266, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.23545706371191136, |
|
"grad_norm": 143.8682861328125, |
|
"learning_rate": 2.4019245735994406e-07, |
|
"loss": 4.2341, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 0.2374356944994064, |
|
"grad_norm": 128.8461151123047, |
|
"learning_rate": 2.4011001332295634e-07, |
|
"loss": 4.3676, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.23941432528690146, |
|
"grad_norm": 160.70687866210938, |
|
"learning_rate": 2.400275692859687e-07, |
|
"loss": 4.3945, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 0.2413929560743965, |
|
"grad_norm": 157.65855407714844, |
|
"learning_rate": 2.3994512524898097e-07, |
|
"loss": 4.4967, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.24337158686189156, |
|
"grad_norm": 125.79988861083984, |
|
"learning_rate": 2.398626812119933e-07, |
|
"loss": 4.279, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 0.24535021764938664, |
|
"grad_norm": 168.8534698486328, |
|
"learning_rate": 2.397802371750056e-07, |
|
"loss": 4.4813, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.2473288484368817, |
|
"grad_norm": 120.4126968383789, |
|
"learning_rate": 2.3969779313801793e-07, |
|
"loss": 4.1997, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 0.24930747922437674, |
|
"grad_norm": 115.56365203857422, |
|
"learning_rate": 2.396153491010302e-07, |
|
"loss": 4.4076, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.2512861100118718, |
|
"grad_norm": 152.89859008789062, |
|
"learning_rate": 2.3953290506404255e-07, |
|
"loss": 4.2893, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 0.25326474079936684, |
|
"grad_norm": 177.6272735595703, |
|
"learning_rate": 2.3945046102705484e-07, |
|
"loss": 4.4892, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.2552433715868619, |
|
"grad_norm": 131.46661376953125, |
|
"learning_rate": 2.393680169900671e-07, |
|
"loss": 4.2702, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 0.25722200237435694, |
|
"grad_norm": 101.60210418701172, |
|
"learning_rate": 2.3928557295307946e-07, |
|
"loss": 4.2209, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.259200633161852, |
|
"grad_norm": 199.7799835205078, |
|
"learning_rate": 2.3920312891609174e-07, |
|
"loss": 4.1502, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 0.26117926394934704, |
|
"grad_norm": 163.44424438476562, |
|
"learning_rate": 2.391206848791041e-07, |
|
"loss": 4.3423, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.2631578947368421, |
|
"grad_norm": 148.59519958496094, |
|
"learning_rate": 2.3903824084211637e-07, |
|
"loss": 4.4833, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 0.26513652552433714, |
|
"grad_norm": 129.75927734375, |
|
"learning_rate": 2.3895579680512865e-07, |
|
"loss": 4.4745, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.2671151563118322, |
|
"grad_norm": 126.6795654296875, |
|
"learning_rate": 2.38873352768141e-07, |
|
"loss": 4.3964, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 0.26909378709932724, |
|
"grad_norm": 157.1032257080078, |
|
"learning_rate": 2.387909087311533e-07, |
|
"loss": 4.3419, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.2710724178868223, |
|
"grad_norm": 142.79139709472656, |
|
"learning_rate": 2.3870846469416556e-07, |
|
"loss": 4.2243, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 0.2730510486743174, |
|
"grad_norm": 137.3797607421875, |
|
"learning_rate": 2.386260206571779e-07, |
|
"loss": 4.1661, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.27502967946181245, |
|
"grad_norm": 148.77401733398438, |
|
"learning_rate": 2.385435766201902e-07, |
|
"loss": 4.483, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 0.2770083102493075, |
|
"grad_norm": 124.54267120361328, |
|
"learning_rate": 2.384611325832025e-07, |
|
"loss": 4.369, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.27898694103680255, |
|
"grad_norm": 113.43370056152344, |
|
"learning_rate": 2.383786885462148e-07, |
|
"loss": 4.1491, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 0.2809655718242976, |
|
"grad_norm": 155.67677307128906, |
|
"learning_rate": 2.3829624450922712e-07, |
|
"loss": 4.3403, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.28294420261179265, |
|
"grad_norm": 201.27784729003906, |
|
"learning_rate": 2.3821380047223943e-07, |
|
"loss": 4.3563, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 0.2849228333992877, |
|
"grad_norm": 104.74275970458984, |
|
"learning_rate": 2.3813135643525174e-07, |
|
"loss": 4.2706, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.28690146418678275, |
|
"grad_norm": 133.6251678466797, |
|
"learning_rate": 2.3804891239826405e-07, |
|
"loss": 4.3637, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 0.2888800949742778, |
|
"grad_norm": 102.35352325439453, |
|
"learning_rate": 2.3796646836127634e-07, |
|
"loss": 4.2585, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.29085872576177285, |
|
"grad_norm": 156.72654724121094, |
|
"learning_rate": 2.3788402432428868e-07, |
|
"loss": 4.3448, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 0.2928373565492679, |
|
"grad_norm": 121.19142150878906, |
|
"learning_rate": 2.3780158028730096e-07, |
|
"loss": 4.1475, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.29481598733676295, |
|
"grad_norm": 138.72952270507812, |
|
"learning_rate": 2.3771913625031327e-07, |
|
"loss": 4.2475, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 0.296794618124258, |
|
"grad_norm": 314.35113525390625, |
|
"learning_rate": 2.3763669221332559e-07, |
|
"loss": 4.2643, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.29877324891175305, |
|
"grad_norm": 131.71240234375, |
|
"learning_rate": 2.375542481763379e-07, |
|
"loss": 4.2741, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 0.3007518796992481, |
|
"grad_norm": 193.2744598388672, |
|
"learning_rate": 2.374718041393502e-07, |
|
"loss": 4.2314, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.30273051048674315, |
|
"grad_norm": 146.98760986328125, |
|
"learning_rate": 2.3738936010236252e-07, |
|
"loss": 4.5421, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 0.3047091412742382, |
|
"grad_norm": 106.49159240722656, |
|
"learning_rate": 2.373069160653748e-07, |
|
"loss": 4.0922, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.30668777206173325, |
|
"grad_norm": 128.12686157226562, |
|
"learning_rate": 2.3722447202838712e-07, |
|
"loss": 4.3171, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 0.30866640284922836, |
|
"grad_norm": 165.8458251953125, |
|
"learning_rate": 2.3714202799139943e-07, |
|
"loss": 4.1937, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.3106450336367234, |
|
"grad_norm": 129.49652099609375, |
|
"learning_rate": 2.3705958395441171e-07, |
|
"loss": 4.2486, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 0.31262366442421846, |
|
"grad_norm": 113.08882141113281, |
|
"learning_rate": 2.3697713991742405e-07, |
|
"loss": 3.9533, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.3146022952117135, |
|
"grad_norm": 116.51021575927734, |
|
"learning_rate": 2.3689469588043634e-07, |
|
"loss": 4.2525, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 0.31658092599920856, |
|
"grad_norm": 95.54279327392578, |
|
"learning_rate": 2.3681225184344867e-07, |
|
"loss": 4.1355, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.3185595567867036, |
|
"grad_norm": 123.10621643066406, |
|
"learning_rate": 2.3672980780646096e-07, |
|
"loss": 4.3705, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 0.32053818757419866, |
|
"grad_norm": 142.11273193359375, |
|
"learning_rate": 2.3664736376947327e-07, |
|
"loss": 4.2712, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.3225168183616937, |
|
"grad_norm": 162.17141723632812, |
|
"learning_rate": 2.3656491973248558e-07, |
|
"loss": 4.1127, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 0.32449544914918876, |
|
"grad_norm": 160.26893615722656, |
|
"learning_rate": 2.364824756954979e-07, |
|
"loss": 4.2687, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.3264740799366838, |
|
"grad_norm": 134.65093994140625, |
|
"learning_rate": 2.3640003165851018e-07, |
|
"loss": 4.3567, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 0.32845271072417886, |
|
"grad_norm": 178.23516845703125, |
|
"learning_rate": 2.3631758762152252e-07, |
|
"loss": 4.097, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.3304313415116739, |
|
"grad_norm": 151.1556396484375, |
|
"learning_rate": 2.362351435845348e-07, |
|
"loss": 4.1602, |
|
"step": 4175 |
|
}, |
|
{ |
|
"epoch": 0.33240997229916897, |
|
"grad_norm": 154.64442443847656, |
|
"learning_rate": 2.3615269954754711e-07, |
|
"loss": 4.2365, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.334388603086664, |
|
"grad_norm": 226.6827850341797, |
|
"learning_rate": 2.3607025551055943e-07, |
|
"loss": 4.3196, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 0.33636723387415907, |
|
"grad_norm": 172.67916870117188, |
|
"learning_rate": 2.359878114735717e-07, |
|
"loss": 4.4476, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.3383458646616541, |
|
"grad_norm": 124.78984069824219, |
|
"learning_rate": 2.3590536743658405e-07, |
|
"loss": 4.4006, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 0.34032449544914917, |
|
"grad_norm": 156.81365966796875, |
|
"learning_rate": 2.3582292339959633e-07, |
|
"loss": 4.3914, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.3423031262366442, |
|
"grad_norm": 116.53181457519531, |
|
"learning_rate": 2.3574047936260865e-07, |
|
"loss": 4.2846, |
|
"step": 4325 |
|
}, |
|
{ |
|
"epoch": 0.3442817570241393, |
|
"grad_norm": 146.16543579101562, |
|
"learning_rate": 2.3565803532562096e-07, |
|
"loss": 4.1371, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.3462603878116344, |
|
"grad_norm": 213.07974243164062, |
|
"learning_rate": 2.3557559128863327e-07, |
|
"loss": 4.2294, |
|
"step": 4375 |
|
}, |
|
{ |
|
"epoch": 0.3482390185991294, |
|
"grad_norm": 99.38206481933594, |
|
"learning_rate": 2.3549314725164558e-07, |
|
"loss": 4.1726, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.3502176493866245, |
|
"grad_norm": 162.97059631347656, |
|
"learning_rate": 2.354107032146579e-07, |
|
"loss": 4.0507, |
|
"step": 4425 |
|
}, |
|
{ |
|
"epoch": 0.3521962801741195, |
|
"grad_norm": 132.77474975585938, |
|
"learning_rate": 2.3532825917767018e-07, |
|
"loss": 4.0016, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.3541749109616146, |
|
"grad_norm": 126.9658203125, |
|
"learning_rate": 2.3524581514068252e-07, |
|
"loss": 4.2731, |
|
"step": 4475 |
|
}, |
|
{ |
|
"epoch": 0.3561535417491096, |
|
"grad_norm": 194.47755432128906, |
|
"learning_rate": 2.351633711036948e-07, |
|
"loss": 4.1119, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.3581321725366047, |
|
"grad_norm": 153.6606903076172, |
|
"learning_rate": 2.3508092706670709e-07, |
|
"loss": 4.4556, |
|
"step": 4525 |
|
}, |
|
{ |
|
"epoch": 0.3601108033240997, |
|
"grad_norm": 146.66709899902344, |
|
"learning_rate": 2.3499848302971942e-07, |
|
"loss": 4.3314, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.3620894341115948, |
|
"grad_norm": 111.01129913330078, |
|
"learning_rate": 2.349160389927317e-07, |
|
"loss": 4.2929, |
|
"step": 4575 |
|
}, |
|
{ |
|
"epoch": 0.3640680648990898, |
|
"grad_norm": 137.40582275390625, |
|
"learning_rate": 2.3483359495574405e-07, |
|
"loss": 4.2198, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.3660466956865849, |
|
"grad_norm": 142.0623779296875, |
|
"learning_rate": 2.3475115091875633e-07, |
|
"loss": 4.2013, |
|
"step": 4625 |
|
}, |
|
{ |
|
"epoch": 0.36802532647407993, |
|
"grad_norm": 135.2795867919922, |
|
"learning_rate": 2.3466870688176864e-07, |
|
"loss": 4.231, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.370003957261575, |
|
"grad_norm": 127.59281158447266, |
|
"learning_rate": 2.3458626284478096e-07, |
|
"loss": 3.9613, |
|
"step": 4675 |
|
}, |
|
{ |
|
"epoch": 0.37198258804907003, |
|
"grad_norm": 132.48663330078125, |
|
"learning_rate": 2.3450381880779327e-07, |
|
"loss": 4.1925, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.3739612188365651, |
|
"grad_norm": 135.35409545898438, |
|
"learning_rate": 2.3442137477080555e-07, |
|
"loss": 4.1828, |
|
"step": 4725 |
|
}, |
|
{ |
|
"epoch": 0.37593984962406013, |
|
"grad_norm": 107.55503845214844, |
|
"learning_rate": 2.343389307338179e-07, |
|
"loss": 4.2578, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.3779184804115552, |
|
"grad_norm": 132.79620361328125, |
|
"learning_rate": 2.3425648669683018e-07, |
|
"loss": 4.0254, |
|
"step": 4775 |
|
}, |
|
{ |
|
"epoch": 0.37989711119905023, |
|
"grad_norm": 123.6044692993164, |
|
"learning_rate": 2.341740426598425e-07, |
|
"loss": 3.9981, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.38187574198654534, |
|
"grad_norm": 149.656005859375, |
|
"learning_rate": 2.340915986228548e-07, |
|
"loss": 4.1067, |
|
"step": 4825 |
|
}, |
|
{ |
|
"epoch": 0.3838543727740404, |
|
"grad_norm": 122.97380065917969, |
|
"learning_rate": 2.3400915458586708e-07, |
|
"loss": 4.2396, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.38583300356153544, |
|
"grad_norm": 140.10183715820312, |
|
"learning_rate": 2.3392671054887942e-07, |
|
"loss": 4.1309, |
|
"step": 4875 |
|
}, |
|
{ |
|
"epoch": 0.3878116343490305, |
|
"grad_norm": 137.91583251953125, |
|
"learning_rate": 2.338442665118917e-07, |
|
"loss": 4.0575, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.38979026513652554, |
|
"grad_norm": 137.72152709960938, |
|
"learning_rate": 2.3376182247490402e-07, |
|
"loss": 4.159, |
|
"step": 4925 |
|
}, |
|
{ |
|
"epoch": 0.3917688959240206, |
|
"grad_norm": 84.3819808959961, |
|
"learning_rate": 2.3367937843791633e-07, |
|
"loss": 4.2611, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.39374752671151564, |
|
"grad_norm": 200.3111114501953, |
|
"learning_rate": 2.3359693440092864e-07, |
|
"loss": 4.167, |
|
"step": 4975 |
|
}, |
|
{ |
|
"epoch": 0.3957261574990107, |
|
"grad_norm": 123.27460479736328, |
|
"learning_rate": 2.3351449036394095e-07, |
|
"loss": 4.2918, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.39770478828650574, |
|
"grad_norm": 111.70620727539062, |
|
"learning_rate": 2.3343204632695327e-07, |
|
"loss": 4.2242, |
|
"step": 5025 |
|
}, |
|
{ |
|
"epoch": 0.3996834190740008, |
|
"grad_norm": 107.74165344238281, |
|
"learning_rate": 2.3334960228996555e-07, |
|
"loss": 4.3572, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.40166204986149584, |
|
"grad_norm": 138.31423950195312, |
|
"learning_rate": 2.332671582529779e-07, |
|
"loss": 4.1759, |
|
"step": 5075 |
|
}, |
|
{ |
|
"epoch": 0.4036406806489909, |
|
"grad_norm": 104.73587799072266, |
|
"learning_rate": 2.3318471421599017e-07, |
|
"loss": 4.1695, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.40561931143648594, |
|
"grad_norm": 138.1061553955078, |
|
"learning_rate": 2.3310227017900246e-07, |
|
"loss": 4.0986, |
|
"step": 5125 |
|
}, |
|
{ |
|
"epoch": 0.407597942223981, |
|
"grad_norm": 148.92279052734375, |
|
"learning_rate": 2.330198261420148e-07, |
|
"loss": 4.3455, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.40957657301147604, |
|
"grad_norm": 321.29852294921875, |
|
"learning_rate": 2.3293738210502708e-07, |
|
"loss": 4.1285, |
|
"step": 5175 |
|
}, |
|
{ |
|
"epoch": 0.4115552037989711, |
|
"grad_norm": 114.85989379882812, |
|
"learning_rate": 2.3285493806803942e-07, |
|
"loss": 3.9628, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.41353383458646614, |
|
"grad_norm": 137.27610778808594, |
|
"learning_rate": 2.327724940310517e-07, |
|
"loss": 4.1521, |
|
"step": 5225 |
|
}, |
|
{ |
|
"epoch": 0.4155124653739612, |
|
"grad_norm": 96.02686309814453, |
|
"learning_rate": 2.3269004999406402e-07, |
|
"loss": 4.027, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.4174910961614563, |
|
"grad_norm": 213.81649780273438, |
|
"learning_rate": 2.3260760595707633e-07, |
|
"loss": 4.0522, |
|
"step": 5275 |
|
}, |
|
{ |
|
"epoch": 0.41946972694895135, |
|
"grad_norm": 160.4125518798828, |
|
"learning_rate": 2.3252516192008864e-07, |
|
"loss": 4.09, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.4214483577364464, |
|
"grad_norm": 167.58741760253906, |
|
"learning_rate": 2.3244271788310093e-07, |
|
"loss": 4.1128, |
|
"step": 5325 |
|
}, |
|
{ |
|
"epoch": 0.42342698852394145, |
|
"grad_norm": 159.55303955078125, |
|
"learning_rate": 2.3236027384611326e-07, |
|
"loss": 4.0867, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.4254056193114365, |
|
"grad_norm": 122.51324462890625, |
|
"learning_rate": 2.3227782980912555e-07, |
|
"loss": 4.2261, |
|
"step": 5375 |
|
}, |
|
{ |
|
"epoch": 0.42738425009893155, |
|
"grad_norm": 185.9108428955078, |
|
"learning_rate": 2.3219538577213789e-07, |
|
"loss": 3.9684, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.4293628808864266, |
|
"grad_norm": 195.37579345703125, |
|
"learning_rate": 2.3211294173515017e-07, |
|
"loss": 4.0779, |
|
"step": 5425 |
|
}, |
|
{ |
|
"epoch": 0.43134151167392165, |
|
"grad_norm": 157.84371948242188, |
|
"learning_rate": 2.3203049769816246e-07, |
|
"loss": 4.1991, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.4333201424614167, |
|
"grad_norm": 111.01512908935547, |
|
"learning_rate": 2.319480536611748e-07, |
|
"loss": 3.9962, |
|
"step": 5475 |
|
}, |
|
{ |
|
"epoch": 0.43529877324891175, |
|
"grad_norm": 114.49053955078125, |
|
"learning_rate": 2.3186560962418708e-07, |
|
"loss": 3.8972, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.4372774040364068, |
|
"grad_norm": 168.17874145507812, |
|
"learning_rate": 2.317831655871994e-07, |
|
"loss": 4.1913, |
|
"step": 5525 |
|
}, |
|
{ |
|
"epoch": 0.43925603482390185, |
|
"grad_norm": 140.61912536621094, |
|
"learning_rate": 2.317007215502117e-07, |
|
"loss": 4.1396, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.4412346656113969, |
|
"grad_norm": 138.01805114746094, |
|
"learning_rate": 2.3161827751322401e-07, |
|
"loss": 4.1399, |
|
"step": 5575 |
|
}, |
|
{ |
|
"epoch": 0.44321329639889195, |
|
"grad_norm": 188.0181427001953, |
|
"learning_rate": 2.3153583347623633e-07, |
|
"loss": 4.0329, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.445191927186387, |
|
"grad_norm": 170.8402099609375, |
|
"learning_rate": 2.3145338943924864e-07, |
|
"loss": 4.3414, |
|
"step": 5625 |
|
}, |
|
{ |
|
"epoch": 0.44717055797388205, |
|
"grad_norm": 200.65077209472656, |
|
"learning_rate": 2.3137094540226092e-07, |
|
"loss": 4.2154, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.4491491887613771, |
|
"grad_norm": 120.18091583251953, |
|
"learning_rate": 2.3128850136527326e-07, |
|
"loss": 4.0372, |
|
"step": 5675 |
|
}, |
|
{ |
|
"epoch": 0.45112781954887216, |
|
"grad_norm": 89.9730224609375, |
|
"learning_rate": 2.3120605732828555e-07, |
|
"loss": 4.1059, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.45310645033636726, |
|
"grad_norm": 133.7999267578125, |
|
"learning_rate": 2.3112361329129786e-07, |
|
"loss": 4.2035, |
|
"step": 5725 |
|
}, |
|
{ |
|
"epoch": 0.4550850811238623, |
|
"grad_norm": 88.3386459350586, |
|
"learning_rate": 2.3104116925431017e-07, |
|
"loss": 4.0566, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.45706371191135736, |
|
"grad_norm": 130.95127868652344, |
|
"learning_rate": 2.3095872521732245e-07, |
|
"loss": 4.3084, |
|
"step": 5775 |
|
}, |
|
{ |
|
"epoch": 0.4590423426988524, |
|
"grad_norm": 162.55679321289062, |
|
"learning_rate": 2.308762811803348e-07, |
|
"loss": 4.0288, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.46102097348634746, |
|
"grad_norm": 104.4178695678711, |
|
"learning_rate": 2.3079383714334708e-07, |
|
"loss": 3.9244, |
|
"step": 5825 |
|
}, |
|
{ |
|
"epoch": 0.4629996042738425, |
|
"grad_norm": 235.28123474121094, |
|
"learning_rate": 2.307113931063594e-07, |
|
"loss": 4.1106, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.46497823506133756, |
|
"grad_norm": 289.6645812988281, |
|
"learning_rate": 2.306289490693717e-07, |
|
"loss": 4.0457, |
|
"step": 5875 |
|
}, |
|
{ |
|
"epoch": 0.4669568658488326, |
|
"grad_norm": 99.97111511230469, |
|
"learning_rate": 2.30546505032384e-07, |
|
"loss": 4.2542, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.46893549663632766, |
|
"grad_norm": 260.0950622558594, |
|
"learning_rate": 2.304640609953963e-07, |
|
"loss": 4.1564, |
|
"step": 5925 |
|
}, |
|
{ |
|
"epoch": 0.4709141274238227, |
|
"grad_norm": 113.74392700195312, |
|
"learning_rate": 2.3038161695840864e-07, |
|
"loss": 4.0403, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 0.47289275821131777, |
|
"grad_norm": 79.32340240478516, |
|
"learning_rate": 2.3029917292142092e-07, |
|
"loss": 4.0408, |
|
"step": 5975 |
|
}, |
|
{ |
|
"epoch": 0.4748713889988128, |
|
"grad_norm": 95.92308807373047, |
|
"learning_rate": 2.3021672888443326e-07, |
|
"loss": 3.9811, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.47685001978630787, |
|
"grad_norm": 94.5758285522461, |
|
"learning_rate": 2.3013428484744554e-07, |
|
"loss": 4.1102, |
|
"step": 6025 |
|
}, |
|
{ |
|
"epoch": 0.4788286505738029, |
|
"grad_norm": 142.32131958007812, |
|
"learning_rate": 2.3005184081045786e-07, |
|
"loss": 3.989, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 0.48080728136129797, |
|
"grad_norm": 97.84469604492188, |
|
"learning_rate": 2.2996939677347017e-07, |
|
"loss": 3.9512, |
|
"step": 6075 |
|
}, |
|
{ |
|
"epoch": 0.482785912148793, |
|
"grad_norm": 94.38491821289062, |
|
"learning_rate": 2.2988695273648245e-07, |
|
"loss": 3.9475, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.48476454293628807, |
|
"grad_norm": 124.32872772216797, |
|
"learning_rate": 2.2980450869949476e-07, |
|
"loss": 4.1352, |
|
"step": 6125 |
|
}, |
|
{ |
|
"epoch": 0.4867431737237831, |
|
"grad_norm": 196.1511993408203, |
|
"learning_rate": 2.2972206466250708e-07, |
|
"loss": 4.2956, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 0.48872180451127817, |
|
"grad_norm": 144.1227264404297, |
|
"learning_rate": 2.296396206255194e-07, |
|
"loss": 3.9718, |
|
"step": 6175 |
|
}, |
|
{ |
|
"epoch": 0.4907004352987733, |
|
"grad_norm": 115.52275085449219, |
|
"learning_rate": 2.295571765885317e-07, |
|
"loss": 3.9135, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.4926790660862683, |
|
"grad_norm": 117.71548461914062, |
|
"learning_rate": 2.29474732551544e-07, |
|
"loss": 3.9026, |
|
"step": 6225 |
|
}, |
|
{ |
|
"epoch": 0.4946576968737634, |
|
"grad_norm": 135.42698669433594, |
|
"learning_rate": 2.293922885145563e-07, |
|
"loss": 4.0369, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 0.4966363276612584, |
|
"grad_norm": 142.4741973876953, |
|
"learning_rate": 2.2930984447756863e-07, |
|
"loss": 4.3588, |
|
"step": 6275 |
|
}, |
|
{ |
|
"epoch": 0.4986149584487535, |
|
"grad_norm": 128.56195068359375, |
|
"learning_rate": 2.2922740044058092e-07, |
|
"loss": 3.9089, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.5005935892362485, |
|
"grad_norm": 96.84894561767578, |
|
"learning_rate": 2.2914495640359323e-07, |
|
"loss": 4.1722, |
|
"step": 6325 |
|
}, |
|
{ |
|
"epoch": 0.5025722200237436, |
|
"grad_norm": 236.92965698242188, |
|
"learning_rate": 2.2906251236660554e-07, |
|
"loss": 3.9729, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 0.5045508508112386, |
|
"grad_norm": 135.83609008789062, |
|
"learning_rate": 2.2898006832961783e-07, |
|
"loss": 4.0322, |
|
"step": 6375 |
|
}, |
|
{ |
|
"epoch": 0.5065294815987337, |
|
"grad_norm": 123.36375427246094, |
|
"learning_rate": 2.2889762429263017e-07, |
|
"loss": 4.0042, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.5085081123862287, |
|
"grad_norm": 118.30574035644531, |
|
"learning_rate": 2.2881518025564245e-07, |
|
"loss": 4.1079, |
|
"step": 6425 |
|
}, |
|
{ |
|
"epoch": 0.5104867431737238, |
|
"grad_norm": 107.81358337402344, |
|
"learning_rate": 2.2873273621865476e-07, |
|
"loss": 4.1198, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 0.5124653739612188, |
|
"grad_norm": 146.2493438720703, |
|
"learning_rate": 2.2865029218166707e-07, |
|
"loss": 4.0814, |
|
"step": 6475 |
|
}, |
|
{ |
|
"epoch": 0.5144440047487139, |
|
"grad_norm": 136.8212890625, |
|
"learning_rate": 2.2856784814467939e-07, |
|
"loss": 4.0562, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.5164226355362089, |
|
"grad_norm": 139.30670166015625, |
|
"learning_rate": 2.2848540410769167e-07, |
|
"loss": 4.1199, |
|
"step": 6525 |
|
}, |
|
{ |
|
"epoch": 0.518401266323704, |
|
"grad_norm": 194.90414428710938, |
|
"learning_rate": 2.28402960070704e-07, |
|
"loss": 4.0562, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 0.520379897111199, |
|
"grad_norm": 103.54257202148438, |
|
"learning_rate": 2.283205160337163e-07, |
|
"loss": 4.0797, |
|
"step": 6575 |
|
}, |
|
{ |
|
"epoch": 0.5223585278986941, |
|
"grad_norm": 101.63102722167969, |
|
"learning_rate": 2.2823807199672863e-07, |
|
"loss": 4.0591, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.5243371586861891, |
|
"grad_norm": 104.28479766845703, |
|
"learning_rate": 2.2815562795974092e-07, |
|
"loss": 3.8991, |
|
"step": 6625 |
|
}, |
|
{ |
|
"epoch": 0.5263157894736842, |
|
"grad_norm": 166.01107788085938, |
|
"learning_rate": 2.2807318392275323e-07, |
|
"loss": 4.1011, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 0.5282944202611792, |
|
"grad_norm": 154.64959716796875, |
|
"learning_rate": 2.2799073988576554e-07, |
|
"loss": 3.9283, |
|
"step": 6675 |
|
}, |
|
{ |
|
"epoch": 0.5302730510486743, |
|
"grad_norm": 96.0099868774414, |
|
"learning_rate": 2.2790829584877782e-07, |
|
"loss": 3.8247, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.5322516818361693, |
|
"grad_norm": 120.90514373779297, |
|
"learning_rate": 2.2782585181179014e-07, |
|
"loss": 4.0629, |
|
"step": 6725 |
|
}, |
|
{ |
|
"epoch": 0.5342303126236644, |
|
"grad_norm": 106.48863983154297, |
|
"learning_rate": 2.2774340777480245e-07, |
|
"loss": 4.0127, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 0.5362089434111594, |
|
"grad_norm": 113.17047882080078, |
|
"learning_rate": 2.2766096373781476e-07, |
|
"loss": 4.03, |
|
"step": 6775 |
|
}, |
|
{ |
|
"epoch": 0.5381875741986545, |
|
"grad_norm": 130.6500701904297, |
|
"learning_rate": 2.2757851970082707e-07, |
|
"loss": 4.0192, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.5401662049861495, |
|
"grad_norm": 142.3747100830078, |
|
"learning_rate": 2.2749607566383938e-07, |
|
"loss": 4.1507, |
|
"step": 6825 |
|
}, |
|
{ |
|
"epoch": 0.5421448357736446, |
|
"grad_norm": 125.88548278808594, |
|
"learning_rate": 2.2741363162685167e-07, |
|
"loss": 4.2026, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 0.5441234665611397, |
|
"grad_norm": 156.44570922851562, |
|
"learning_rate": 2.27331187589864e-07, |
|
"loss": 4.1063, |
|
"step": 6875 |
|
}, |
|
{ |
|
"epoch": 0.5461020973486348, |
|
"grad_norm": 150.82635498046875, |
|
"learning_rate": 2.272487435528763e-07, |
|
"loss": 4.0477, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.5480807281361298, |
|
"grad_norm": 170.67994689941406, |
|
"learning_rate": 2.271662995158886e-07, |
|
"loss": 4.1398, |
|
"step": 6925 |
|
}, |
|
{ |
|
"epoch": 0.5500593589236249, |
|
"grad_norm": 114.224609375, |
|
"learning_rate": 2.2708385547890091e-07, |
|
"loss": 4.0906, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 0.55203798971112, |
|
"grad_norm": 135.5966033935547, |
|
"learning_rate": 2.2700141144191323e-07, |
|
"loss": 3.872, |
|
"step": 6975 |
|
}, |
|
{ |
|
"epoch": 0.554016620498615, |
|
"grad_norm": 120.73974609375, |
|
"learning_rate": 2.2691896740492554e-07, |
|
"loss": 3.9762, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.55599525128611, |
|
"grad_norm": 107.66891479492188, |
|
"learning_rate": 2.2683652336793782e-07, |
|
"loss": 4.0551, |
|
"step": 7025 |
|
}, |
|
{ |
|
"epoch": 0.5579738820736051, |
|
"grad_norm": 107.60162353515625, |
|
"learning_rate": 2.2675407933095013e-07, |
|
"loss": 3.973, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 0.5599525128611001, |
|
"grad_norm": 118.88258361816406, |
|
"learning_rate": 2.2667163529396245e-07, |
|
"loss": 3.9864, |
|
"step": 7075 |
|
}, |
|
{ |
|
"epoch": 0.5619311436485952, |
|
"grad_norm": 148.85667419433594, |
|
"learning_rate": 2.2658919125697476e-07, |
|
"loss": 3.9409, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.5639097744360902, |
|
"grad_norm": 148.57321166992188, |
|
"learning_rate": 2.2650674721998704e-07, |
|
"loss": 3.9611, |
|
"step": 7125 |
|
}, |
|
{ |
|
"epoch": 0.5658884052235853, |
|
"grad_norm": 172.39999389648438, |
|
"learning_rate": 2.2642430318299938e-07, |
|
"loss": 3.97, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 0.5678670360110804, |
|
"grad_norm": 120.57051086425781, |
|
"learning_rate": 2.2634185914601167e-07, |
|
"loss": 3.9352, |
|
"step": 7175 |
|
}, |
|
{ |
|
"epoch": 0.5698456667985754, |
|
"grad_norm": 143.2531280517578, |
|
"learning_rate": 2.26259415109024e-07, |
|
"loss": 3.9686, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.5718242975860705, |
|
"grad_norm": 123.57396697998047, |
|
"learning_rate": 2.261769710720363e-07, |
|
"loss": 4.0855, |
|
"step": 7225 |
|
}, |
|
{ |
|
"epoch": 0.5738029283735655, |
|
"grad_norm": 115.12631225585938, |
|
"learning_rate": 2.260945270350486e-07, |
|
"loss": 3.9754, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 0.5757815591610606, |
|
"grad_norm": 114.95091247558594, |
|
"learning_rate": 2.260120829980609e-07, |
|
"loss": 3.8981, |
|
"step": 7275 |
|
}, |
|
{ |
|
"epoch": 0.5777601899485556, |
|
"grad_norm": 105.46833038330078, |
|
"learning_rate": 2.2592963896107322e-07, |
|
"loss": 3.9452, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.5797388207360507, |
|
"grad_norm": 132.89012145996094, |
|
"learning_rate": 2.258471949240855e-07, |
|
"loss": 4.0808, |
|
"step": 7325 |
|
}, |
|
{ |
|
"epoch": 0.5817174515235457, |
|
"grad_norm": 143.6460418701172, |
|
"learning_rate": 2.2576475088709782e-07, |
|
"loss": 4.0108, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 0.5836960823110408, |
|
"grad_norm": 130.83352661132812, |
|
"learning_rate": 2.2568230685011013e-07, |
|
"loss": 3.9701, |
|
"step": 7375 |
|
}, |
|
{ |
|
"epoch": 0.5856747130985358, |
|
"grad_norm": 111.10405731201172, |
|
"learning_rate": 2.2559986281312244e-07, |
|
"loss": 4.3162, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.5876533438860309, |
|
"grad_norm": 163.31959533691406, |
|
"learning_rate": 2.2551741877613476e-07, |
|
"loss": 3.9096, |
|
"step": 7425 |
|
}, |
|
{ |
|
"epoch": 0.5896319746735259, |
|
"grad_norm": 134.72927856445312, |
|
"learning_rate": 2.2543497473914704e-07, |
|
"loss": 3.8888, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 0.591610605461021, |
|
"grad_norm": 124.26619720458984, |
|
"learning_rate": 2.2535253070215938e-07, |
|
"loss": 4.0683, |
|
"step": 7475 |
|
}, |
|
{ |
|
"epoch": 0.593589236248516, |
|
"grad_norm": 106.8174057006836, |
|
"learning_rate": 2.2527008666517166e-07, |
|
"loss": 4.0547, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.5955678670360111, |
|
"grad_norm": 108.11019897460938, |
|
"learning_rate": 2.2518764262818398e-07, |
|
"loss": 3.9728, |
|
"step": 7525 |
|
}, |
|
{ |
|
"epoch": 0.5975464978235061, |
|
"grad_norm": 117.44151306152344, |
|
"learning_rate": 2.251051985911963e-07, |
|
"loss": 4.0569, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 0.5995251286110012, |
|
"grad_norm": 106.18008422851562, |
|
"learning_rate": 2.250227545542086e-07, |
|
"loss": 3.9042, |
|
"step": 7575 |
|
}, |
|
{ |
|
"epoch": 0.6015037593984962, |
|
"grad_norm": 88.67406463623047, |
|
"learning_rate": 2.249403105172209e-07, |
|
"loss": 4.0719, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.6034823901859913, |
|
"grad_norm": 111.12770080566406, |
|
"learning_rate": 2.248578664802332e-07, |
|
"loss": 3.9749, |
|
"step": 7625 |
|
}, |
|
{ |
|
"epoch": 0.6054610209734863, |
|
"grad_norm": 119.26530456542969, |
|
"learning_rate": 2.247754224432455e-07, |
|
"loss": 3.9832, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 0.6074396517609814, |
|
"grad_norm": 157.9289093017578, |
|
"learning_rate": 2.2469297840625782e-07, |
|
"loss": 3.9538, |
|
"step": 7675 |
|
}, |
|
{ |
|
"epoch": 0.6094182825484764, |
|
"grad_norm": 122.70995330810547, |
|
"learning_rate": 2.2461053436927013e-07, |
|
"loss": 3.8497, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.6113969133359715, |
|
"grad_norm": 142.41835021972656, |
|
"learning_rate": 2.2452809033228242e-07, |
|
"loss": 3.9172, |
|
"step": 7725 |
|
}, |
|
{ |
|
"epoch": 0.6133755441234665, |
|
"grad_norm": 128.31825256347656, |
|
"learning_rate": 2.2444564629529475e-07, |
|
"loss": 3.7326, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 0.6153541749109616, |
|
"grad_norm": 142.67408752441406, |
|
"learning_rate": 2.2436320225830704e-07, |
|
"loss": 3.8782, |
|
"step": 7775 |
|
}, |
|
{ |
|
"epoch": 0.6173328056984567, |
|
"grad_norm": 145.0731658935547, |
|
"learning_rate": 2.2428075822131938e-07, |
|
"loss": 4.024, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.6193114364859518, |
|
"grad_norm": 187.09068298339844, |
|
"learning_rate": 2.2419831418433166e-07, |
|
"loss": 3.8939, |
|
"step": 7825 |
|
}, |
|
{ |
|
"epoch": 0.6212900672734468, |
|
"grad_norm": 122.93965148925781, |
|
"learning_rate": 2.2411587014734397e-07, |
|
"loss": 4.0373, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 0.6232686980609419, |
|
"grad_norm": 152.1845245361328, |
|
"learning_rate": 2.2403342611035628e-07, |
|
"loss": 4.1168, |
|
"step": 7875 |
|
}, |
|
{ |
|
"epoch": 0.6252473288484369, |
|
"grad_norm": 100.07666778564453, |
|
"learning_rate": 2.239509820733686e-07, |
|
"loss": 3.9718, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.627225959635932, |
|
"grad_norm": 130.85479736328125, |
|
"learning_rate": 2.2386853803638088e-07, |
|
"loss": 4.0301, |
|
"step": 7925 |
|
}, |
|
{ |
|
"epoch": 0.629204590423427, |
|
"grad_norm": 123.073974609375, |
|
"learning_rate": 2.237860939993932e-07, |
|
"loss": 4.0104, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 0.6311832212109221, |
|
"grad_norm": 168.19808959960938, |
|
"learning_rate": 2.237036499624055e-07, |
|
"loss": 3.9421, |
|
"step": 7975 |
|
}, |
|
{ |
|
"epoch": 0.6331618519984171, |
|
"grad_norm": 118.69593811035156, |
|
"learning_rate": 2.2362120592541782e-07, |
|
"loss": 3.8238, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.6351404827859122, |
|
"grad_norm": 192.9334259033203, |
|
"learning_rate": 2.2353876188843013e-07, |
|
"loss": 3.8227, |
|
"step": 8025 |
|
}, |
|
{ |
|
"epoch": 0.6371191135734072, |
|
"grad_norm": 103.11824035644531, |
|
"learning_rate": 2.2345631785144241e-07, |
|
"loss": 3.9359, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 0.6390977443609023, |
|
"grad_norm": 129.3599090576172, |
|
"learning_rate": 2.2337387381445475e-07, |
|
"loss": 4.0562, |
|
"step": 8075 |
|
}, |
|
{ |
|
"epoch": 0.6410763751483973, |
|
"grad_norm": 124.06795501708984, |
|
"learning_rate": 2.2329142977746704e-07, |
|
"loss": 4.1502, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.6430550059358924, |
|
"grad_norm": 113.18289184570312, |
|
"learning_rate": 2.2320898574047935e-07, |
|
"loss": 4.0059, |
|
"step": 8125 |
|
}, |
|
{ |
|
"epoch": 0.6450336367233874, |
|
"grad_norm": 117.89970397949219, |
|
"learning_rate": 2.2312654170349166e-07, |
|
"loss": 4.0162, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 0.6470122675108825, |
|
"grad_norm": 109.6517105102539, |
|
"learning_rate": 2.2304409766650397e-07, |
|
"loss": 3.9979, |
|
"step": 8175 |
|
}, |
|
{ |
|
"epoch": 0.6489908982983775, |
|
"grad_norm": 123.35499572753906, |
|
"learning_rate": 2.2296165362951628e-07, |
|
"loss": 3.9273, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.6509695290858726, |
|
"grad_norm": 141.97459411621094, |
|
"learning_rate": 2.228792095925286e-07, |
|
"loss": 4.0558, |
|
"step": 8225 |
|
}, |
|
{ |
|
"epoch": 0.6529481598733676, |
|
"grad_norm": 159.06973266601562, |
|
"learning_rate": 2.2279676555554088e-07, |
|
"loss": 3.8374, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 0.6549267906608627, |
|
"grad_norm": 120.933837890625, |
|
"learning_rate": 2.227143215185532e-07, |
|
"loss": 3.8412, |
|
"step": 8275 |
|
}, |
|
{ |
|
"epoch": 0.6569054214483577, |
|
"grad_norm": 106.266357421875, |
|
"learning_rate": 2.226318774815655e-07, |
|
"loss": 3.8757, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.6588840522358528, |
|
"grad_norm": 138.7765655517578, |
|
"learning_rate": 2.225494334445778e-07, |
|
"loss": 4.2284, |
|
"step": 8325 |
|
}, |
|
{ |
|
"epoch": 0.6608626830233478, |
|
"grad_norm": 120.76045989990234, |
|
"learning_rate": 2.2246698940759013e-07, |
|
"loss": 3.9127, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 0.6628413138108429, |
|
"grad_norm": 117.31808471679688, |
|
"learning_rate": 2.223845453706024e-07, |
|
"loss": 3.7577, |
|
"step": 8375 |
|
}, |
|
{ |
|
"epoch": 0.6648199445983379, |
|
"grad_norm": 108.21405029296875, |
|
"learning_rate": 2.2230210133361475e-07, |
|
"loss": 4.1095, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.666798575385833, |
|
"grad_norm": 126.65251159667969, |
|
"learning_rate": 2.2221965729662703e-07, |
|
"loss": 3.9047, |
|
"step": 8425 |
|
}, |
|
{ |
|
"epoch": 0.668777206173328, |
|
"grad_norm": 135.06512451171875, |
|
"learning_rate": 2.2213721325963935e-07, |
|
"loss": 3.9267, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 0.6707558369608231, |
|
"grad_norm": 150.37025451660156, |
|
"learning_rate": 2.2205476922265166e-07, |
|
"loss": 3.9662, |
|
"step": 8475 |
|
}, |
|
{ |
|
"epoch": 0.6727344677483181, |
|
"grad_norm": 138.01531982421875, |
|
"learning_rate": 2.2197232518566397e-07, |
|
"loss": 3.8107, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.6747130985358132, |
|
"grad_norm": 130.35153198242188, |
|
"learning_rate": 2.2188988114867625e-07, |
|
"loss": 3.8068, |
|
"step": 8525 |
|
}, |
|
{ |
|
"epoch": 0.6766917293233082, |
|
"grad_norm": 161.9180145263672, |
|
"learning_rate": 2.218074371116886e-07, |
|
"loss": 4.0893, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 0.6786703601108033, |
|
"grad_norm": 165.08409118652344, |
|
"learning_rate": 2.2172499307470088e-07, |
|
"loss": 3.8419, |
|
"step": 8575 |
|
}, |
|
{ |
|
"epoch": 0.6806489908982983, |
|
"grad_norm": 153.2915496826172, |
|
"learning_rate": 2.216425490377132e-07, |
|
"loss": 3.9302, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.6826276216857934, |
|
"grad_norm": 153.20138549804688, |
|
"learning_rate": 2.215601050007255e-07, |
|
"loss": 3.9947, |
|
"step": 8625 |
|
}, |
|
{ |
|
"epoch": 0.6846062524732884, |
|
"grad_norm": 124.32341003417969, |
|
"learning_rate": 2.2147766096373779e-07, |
|
"loss": 3.8241, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 0.6865848832607835, |
|
"grad_norm": 209.813232421875, |
|
"learning_rate": 2.2139521692675012e-07, |
|
"loss": 3.917, |
|
"step": 8675 |
|
}, |
|
{ |
|
"epoch": 0.6885635140482786, |
|
"grad_norm": 116.88125610351562, |
|
"learning_rate": 2.213127728897624e-07, |
|
"loss": 3.9474, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.6905421448357737, |
|
"grad_norm": 178.58721923828125, |
|
"learning_rate": 2.2123032885277472e-07, |
|
"loss": 3.9247, |
|
"step": 8725 |
|
}, |
|
{ |
|
"epoch": 0.6925207756232687, |
|
"grad_norm": 123.67437744140625, |
|
"learning_rate": 2.2114788481578703e-07, |
|
"loss": 3.9548, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 0.6944994064107638, |
|
"grad_norm": 154.60626220703125, |
|
"learning_rate": 2.2106544077879934e-07, |
|
"loss": 3.8239, |
|
"step": 8775 |
|
}, |
|
{ |
|
"epoch": 0.6964780371982588, |
|
"grad_norm": 141.65699768066406, |
|
"learning_rate": 2.2098299674181166e-07, |
|
"loss": 3.8458, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.6984566679857539, |
|
"grad_norm": 112.9280776977539, |
|
"learning_rate": 2.2090055270482397e-07, |
|
"loss": 3.9648, |
|
"step": 8825 |
|
}, |
|
{ |
|
"epoch": 0.700435298773249, |
|
"grad_norm": 154.57643127441406, |
|
"learning_rate": 2.2081810866783625e-07, |
|
"loss": 4.0078, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 0.702413929560744, |
|
"grad_norm": 129.07418823242188, |
|
"learning_rate": 2.207356646308486e-07, |
|
"loss": 4.0276, |
|
"step": 8875 |
|
}, |
|
{ |
|
"epoch": 0.704392560348239, |
|
"grad_norm": 113.59859466552734, |
|
"learning_rate": 2.2065322059386088e-07, |
|
"loss": 4.1596, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.7063711911357341, |
|
"grad_norm": 136.26283264160156, |
|
"learning_rate": 2.2057077655687316e-07, |
|
"loss": 3.948, |
|
"step": 8925 |
|
}, |
|
{ |
|
"epoch": 0.7083498219232292, |
|
"grad_norm": 118.27870178222656, |
|
"learning_rate": 2.204883325198855e-07, |
|
"loss": 3.7611, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 0.7103284527107242, |
|
"grad_norm": 159.56643676757812, |
|
"learning_rate": 2.2040588848289778e-07, |
|
"loss": 4.0215, |
|
"step": 8975 |
|
}, |
|
{ |
|
"epoch": 0.7123070834982193, |
|
"grad_norm": 125.84573364257812, |
|
"learning_rate": 2.2032344444591012e-07, |
|
"loss": 4.0046, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 148.7548065185547, |
|
"learning_rate": 2.202410004089224e-07, |
|
"loss": 4.0033, |
|
"step": 9025 |
|
}, |
|
{ |
|
"epoch": 0.7162643450732094, |
|
"grad_norm": 109.41517639160156, |
|
"learning_rate": 2.2015855637193472e-07, |
|
"loss": 3.9298, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 0.7182429758607044, |
|
"grad_norm": 112.52848815917969, |
|
"learning_rate": 2.2007611233494703e-07, |
|
"loss": 4.1003, |
|
"step": 9075 |
|
}, |
|
{ |
|
"epoch": 0.7202216066481995, |
|
"grad_norm": 114.72808074951172, |
|
"learning_rate": 2.1999366829795934e-07, |
|
"loss": 4.1112, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.7222002374356945, |
|
"grad_norm": 144.11619567871094, |
|
"learning_rate": 2.1991122426097163e-07, |
|
"loss": 3.764, |
|
"step": 9125 |
|
}, |
|
{ |
|
"epoch": 0.7241788682231896, |
|
"grad_norm": 118.64055633544922, |
|
"learning_rate": 2.1982878022398396e-07, |
|
"loss": 3.9262, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 0.7261574990106846, |
|
"grad_norm": 166.79525756835938, |
|
"learning_rate": 2.1974633618699625e-07, |
|
"loss": 4.0518, |
|
"step": 9175 |
|
}, |
|
{ |
|
"epoch": 0.7281361297981797, |
|
"grad_norm": 128.2512969970703, |
|
"learning_rate": 2.1966389215000856e-07, |
|
"loss": 3.952, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.7301147605856747, |
|
"grad_norm": 143.56414794921875, |
|
"learning_rate": 2.1958144811302087e-07, |
|
"loss": 3.7034, |
|
"step": 9225 |
|
}, |
|
{ |
|
"epoch": 0.7320933913731698, |
|
"grad_norm": 120.13394165039062, |
|
"learning_rate": 2.1949900407603316e-07, |
|
"loss": 3.7839, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 0.7340720221606648, |
|
"grad_norm": 148.74070739746094, |
|
"learning_rate": 2.194165600390455e-07, |
|
"loss": 3.8871, |
|
"step": 9275 |
|
}, |
|
{ |
|
"epoch": 0.7360506529481599, |
|
"grad_norm": 148.17022705078125, |
|
"learning_rate": 2.1933411600205778e-07, |
|
"loss": 3.7486, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.7380292837356549, |
|
"grad_norm": 112.7260513305664, |
|
"learning_rate": 2.192516719650701e-07, |
|
"loss": 3.9436, |
|
"step": 9325 |
|
}, |
|
{ |
|
"epoch": 0.74000791452315, |
|
"grad_norm": 131.4718780517578, |
|
"learning_rate": 2.191692279280824e-07, |
|
"loss": 4.1101, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 0.741986545310645, |
|
"grad_norm": 106.73101043701172, |
|
"learning_rate": 2.1908678389109472e-07, |
|
"loss": 3.9285, |
|
"step": 9375 |
|
}, |
|
{ |
|
"epoch": 0.7439651760981401, |
|
"grad_norm": 120.58040618896484, |
|
"learning_rate": 2.1900433985410703e-07, |
|
"loss": 3.8471, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.7459438068856351, |
|
"grad_norm": 135.69512939453125, |
|
"learning_rate": 2.1892189581711934e-07, |
|
"loss": 3.7629, |
|
"step": 9425 |
|
}, |
|
{ |
|
"epoch": 0.7479224376731302, |
|
"grad_norm": 125.78627014160156, |
|
"learning_rate": 2.1883945178013162e-07, |
|
"loss": 4.0646, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 0.7499010684606252, |
|
"grad_norm": 150.2305145263672, |
|
"learning_rate": 2.1875700774314396e-07, |
|
"loss": 3.9361, |
|
"step": 9475 |
|
}, |
|
{ |
|
"epoch": 0.7518796992481203, |
|
"grad_norm": 95.4436264038086, |
|
"learning_rate": 2.1867456370615625e-07, |
|
"loss": 3.7688, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.7538583300356153, |
|
"grad_norm": 141.27809143066406, |
|
"learning_rate": 2.1859211966916853e-07, |
|
"loss": 4.0217, |
|
"step": 9525 |
|
}, |
|
{ |
|
"epoch": 0.7558369608231104, |
|
"grad_norm": 133.8254852294922, |
|
"learning_rate": 2.1850967563218087e-07, |
|
"loss": 4.0683, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 0.7578155916106054, |
|
"grad_norm": 139.919189453125, |
|
"learning_rate": 2.1842723159519316e-07, |
|
"loss": 3.9958, |
|
"step": 9575 |
|
}, |
|
{ |
|
"epoch": 0.7597942223981005, |
|
"grad_norm": 173.58946228027344, |
|
"learning_rate": 2.183447875582055e-07, |
|
"loss": 3.9474, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.7617728531855956, |
|
"grad_norm": 107.07398223876953, |
|
"learning_rate": 2.1826234352121778e-07, |
|
"loss": 3.8308, |
|
"step": 9625 |
|
}, |
|
{ |
|
"epoch": 0.7637514839730907, |
|
"grad_norm": 124.00753784179688, |
|
"learning_rate": 2.181798994842301e-07, |
|
"loss": 3.8218, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 0.7657301147605857, |
|
"grad_norm": 138.23736572265625, |
|
"learning_rate": 2.180974554472424e-07, |
|
"loss": 3.7296, |
|
"step": 9675 |
|
}, |
|
{ |
|
"epoch": 0.7677087455480808, |
|
"grad_norm": 128.9496612548828, |
|
"learning_rate": 2.1801501141025471e-07, |
|
"loss": 3.9163, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.7696873763355758, |
|
"grad_norm": 108.07875061035156, |
|
"learning_rate": 2.17932567373267e-07, |
|
"loss": 3.9408, |
|
"step": 9725 |
|
}, |
|
{ |
|
"epoch": 0.7716660071230709, |
|
"grad_norm": 126.18501281738281, |
|
"learning_rate": 2.1785012333627934e-07, |
|
"loss": 4.1993, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 0.7736446379105659, |
|
"grad_norm": 144.8102264404297, |
|
"learning_rate": 2.1776767929929162e-07, |
|
"loss": 3.877, |
|
"step": 9775 |
|
}, |
|
{ |
|
"epoch": 0.775623268698061, |
|
"grad_norm": 118.8504638671875, |
|
"learning_rate": 2.1768523526230396e-07, |
|
"loss": 3.9788, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.777601899485556, |
|
"grad_norm": 127.45133209228516, |
|
"learning_rate": 2.1760279122531625e-07, |
|
"loss": 3.8987, |
|
"step": 9825 |
|
}, |
|
{ |
|
"epoch": 0.7795805302730511, |
|
"grad_norm": 134.95892333984375, |
|
"learning_rate": 2.1752034718832853e-07, |
|
"loss": 3.8251, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 0.7815591610605461, |
|
"grad_norm": 124.00614929199219, |
|
"learning_rate": 2.1743790315134087e-07, |
|
"loss": 3.6875, |
|
"step": 9875 |
|
}, |
|
{ |
|
"epoch": 0.7835377918480412, |
|
"grad_norm": 126.81105041503906, |
|
"learning_rate": 2.1735545911435315e-07, |
|
"loss": 3.7447, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.7855164226355362, |
|
"grad_norm": 106.54443359375, |
|
"learning_rate": 2.172730150773655e-07, |
|
"loss": 3.9051, |
|
"step": 9925 |
|
}, |
|
{ |
|
"epoch": 0.7874950534230313, |
|
"grad_norm": 156.5098876953125, |
|
"learning_rate": 2.1719057104037778e-07, |
|
"loss": 3.9587, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 0.7894736842105263, |
|
"grad_norm": 128.83648681640625, |
|
"learning_rate": 2.171081270033901e-07, |
|
"loss": 4.0755, |
|
"step": 9975 |
|
}, |
|
{ |
|
"epoch": 0.7914523149980214, |
|
"grad_norm": 131.54664611816406, |
|
"learning_rate": 2.170256829664024e-07, |
|
"loss": 4.0072, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.7914523149980214, |
|
"eval_loss": 3.8681728839874268, |
|
"eval_runtime": 9.5698, |
|
"eval_samples_per_second": 264.165, |
|
"eval_steps_per_second": 33.021, |
|
"step": 10000 |
|
} |
|
  ],
  "logging_steps": 25,
  "max_steps": 75810,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 6,
  "save_steps": 10000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 35767296000000.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}