{
"best_metric": 3.8681728839874268,
"best_model_checkpoint": "checkpoints/test_1M_1-2025-02-12-12-32/checkpoint-10000",
"epoch": 0.7914523149980214,
"eval_steps": 10000,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0019786307874950534,
"grad_norm": 254.82342529296875,
"learning_rate": 2.499208537244918e-07,
"loss": 5.7705,
"step": 25
},
{
"epoch": 0.003957261574990107,
"grad_norm": 153.19989013671875,
"learning_rate": 2.498384096875041e-07,
"loss": 5.6747,
"step": 50
},
{
"epoch": 0.00593589236248516,
"grad_norm": 224.5292510986328,
"learning_rate": 2.4975596565051644e-07,
"loss": 5.6201,
"step": 75
},
{
"epoch": 0.007914523149980214,
"grad_norm": 175.854248046875,
"learning_rate": 2.4967352161352873e-07,
"loss": 5.6974,
"step": 100
},
{
"epoch": 0.009893153937475268,
"grad_norm": 163.52769470214844,
"learning_rate": 2.49591077576541e-07,
"loss": 5.5417,
"step": 125
},
{
"epoch": 0.01187178472497032,
"grad_norm": 254.2264862060547,
"learning_rate": 2.4950863353955335e-07,
"loss": 5.8201,
"step": 150
},
{
"epoch": 0.013850415512465374,
"grad_norm": 175.30279541015625,
"learning_rate": 2.4942618950256564e-07,
"loss": 5.5302,
"step": 175
},
{
"epoch": 0.015829046299960427,
"grad_norm": 300.1286315917969,
"learning_rate": 2.4934374546557797e-07,
"loss": 5.6572,
"step": 200
},
{
"epoch": 0.01780767708745548,
"grad_norm": 201.56961059570312,
"learning_rate": 2.4926130142859026e-07,
"loss": 5.2914,
"step": 225
},
{
"epoch": 0.019786307874950535,
"grad_norm": 245.64854431152344,
"learning_rate": 2.491788573916026e-07,
"loss": 5.4478,
"step": 250
},
{
"epoch": 0.02176493866244559,
"grad_norm": 239.78257751464844,
"learning_rate": 2.490964133546149e-07,
"loss": 5.4161,
"step": 275
},
{
"epoch": 0.02374356944994064,
"grad_norm": 150.18310546875,
"learning_rate": 2.4901396931762717e-07,
"loss": 5.4978,
"step": 300
},
{
"epoch": 0.025722200237435693,
"grad_norm": 172.03607177734375,
"learning_rate": 2.489315252806395e-07,
"loss": 5.4105,
"step": 325
},
{
"epoch": 0.027700831024930747,
"grad_norm": 343.2570495605469,
"learning_rate": 2.488490812436518e-07,
"loss": 5.5195,
"step": 350
},
{
"epoch": 0.0296794618124258,
"grad_norm": 329.7228698730469,
"learning_rate": 2.4876663720666413e-07,
"loss": 5.4494,
"step": 375
},
{
"epoch": 0.031658092599920855,
"grad_norm": 174.63136291503906,
"learning_rate": 2.486841931696764e-07,
"loss": 5.2864,
"step": 400
},
{
"epoch": 0.033636723387415905,
"grad_norm": 356.6216125488281,
"learning_rate": 2.486017491326887e-07,
"loss": 5.4376,
"step": 425
},
{
"epoch": 0.03561535417491096,
"grad_norm": 166.16783142089844,
"learning_rate": 2.4851930509570104e-07,
"loss": 5.3141,
"step": 450
},
{
"epoch": 0.03759398496240601,
"grad_norm": 220.06170654296875,
"learning_rate": 2.484368610587133e-07,
"loss": 5.5457,
"step": 475
},
{
"epoch": 0.03957261574990107,
"grad_norm": 154.55517578125,
"learning_rate": 2.483544170217256e-07,
"loss": 5.1264,
"step": 500
},
{
"epoch": 0.04155124653739612,
"grad_norm": 184.18443298339844,
"learning_rate": 2.4827197298473794e-07,
"loss": 5.3702,
"step": 525
},
{
"epoch": 0.04352987732489118,
"grad_norm": 128.84693908691406,
"learning_rate": 2.4818952894775023e-07,
"loss": 5.0207,
"step": 550
},
{
"epoch": 0.04550850811238623,
"grad_norm": 196.2894287109375,
"learning_rate": 2.4810708491076257e-07,
"loss": 5.315,
"step": 575
},
{
"epoch": 0.04748713889988128,
"grad_norm": 200.00257873535156,
"learning_rate": 2.4802464087377485e-07,
"loss": 5.2215,
"step": 600
},
{
"epoch": 0.049465769687376336,
"grad_norm": 271.8963928222656,
"learning_rate": 2.479421968367872e-07,
"loss": 5.3286,
"step": 625
},
{
"epoch": 0.051444400474871387,
"grad_norm": 181.56686401367188,
"learning_rate": 2.478597527997995e-07,
"loss": 4.9967,
"step": 650
},
{
"epoch": 0.053423031262366444,
"grad_norm": 242.8925323486328,
"learning_rate": 2.477773087628118e-07,
"loss": 5.1984,
"step": 675
},
{
"epoch": 0.055401662049861494,
"grad_norm": 210.05746459960938,
"learning_rate": 2.476948647258241e-07,
"loss": 5.0975,
"step": 700
},
{
"epoch": 0.05738029283735655,
"grad_norm": 181.1220245361328,
"learning_rate": 2.476124206888364e-07,
"loss": 5.0036,
"step": 725
},
{
"epoch": 0.0593589236248516,
"grad_norm": 166.00709533691406,
"learning_rate": 2.475299766518487e-07,
"loss": 5.3082,
"step": 750
},
{
"epoch": 0.06133755441234666,
"grad_norm": 151.4649200439453,
"learning_rate": 2.47447532614861e-07,
"loss": 5.1391,
"step": 775
},
{
"epoch": 0.06331618519984171,
"grad_norm": 149.88165283203125,
"learning_rate": 2.4736508857787335e-07,
"loss": 5.0783,
"step": 800
},
{
"epoch": 0.06529481598733676,
"grad_norm": 172.47061157226562,
"learning_rate": 2.4728264454088563e-07,
"loss": 4.9624,
"step": 825
},
{
"epoch": 0.06727344677483181,
"grad_norm": 298.1490478515625,
"learning_rate": 2.4720020050389797e-07,
"loss": 5.1937,
"step": 850
},
{
"epoch": 0.06925207756232687,
"grad_norm": 164.37867736816406,
"learning_rate": 2.4711775646691025e-07,
"loss": 5.1792,
"step": 875
},
{
"epoch": 0.07123070834982193,
"grad_norm": 216.8033905029297,
"learning_rate": 2.4703531242992254e-07,
"loss": 5.152,
"step": 900
},
{
"epoch": 0.07320933913731698,
"grad_norm": 211.95762634277344,
"learning_rate": 2.469528683929349e-07,
"loss": 4.9146,
"step": 925
},
{
"epoch": 0.07518796992481203,
"grad_norm": 257.61968994140625,
"learning_rate": 2.4687042435594716e-07,
"loss": 5.095,
"step": 950
},
{
"epoch": 0.07716660071230709,
"grad_norm": 179.43719482421875,
"learning_rate": 2.467879803189595e-07,
"loss": 5.0316,
"step": 975
},
{
"epoch": 0.07914523149980214,
"grad_norm": 180.3157958984375,
"learning_rate": 2.467055362819718e-07,
"loss": 4.9441,
"step": 1000
},
{
"epoch": 0.08112386228729719,
"grad_norm": 162.77447509765625,
"learning_rate": 2.4662309224498407e-07,
"loss": 4.9724,
"step": 1025
},
{
"epoch": 0.08310249307479224,
"grad_norm": 123.65939331054688,
"learning_rate": 2.465406482079964e-07,
"loss": 5.2271,
"step": 1050
},
{
"epoch": 0.08508112386228729,
"grad_norm": 163.114990234375,
"learning_rate": 2.464582041710087e-07,
"loss": 4.9724,
"step": 1075
},
{
"epoch": 0.08705975464978236,
"grad_norm": 204.76400756835938,
"learning_rate": 2.46375760134021e-07,
"loss": 4.8724,
"step": 1100
},
{
"epoch": 0.0890383854372774,
"grad_norm": 307.8963623046875,
"learning_rate": 2.462933160970333e-07,
"loss": 4.9256,
"step": 1125
},
{
"epoch": 0.09101701622477246,
"grad_norm": 133.03707885742188,
"learning_rate": 2.462108720600456e-07,
"loss": 4.8792,
"step": 1150
},
{
"epoch": 0.09299564701226751,
"grad_norm": 161.41697692871094,
"learning_rate": 2.4612842802305794e-07,
"loss": 5.054,
"step": 1175
},
{
"epoch": 0.09497427779976256,
"grad_norm": 135.36228942871094,
"learning_rate": 2.460459839860702e-07,
"loss": 4.8655,
"step": 1200
},
{
"epoch": 0.09695290858725762,
"grad_norm": 179.60646057128906,
"learning_rate": 2.4596353994908256e-07,
"loss": 4.7832,
"step": 1225
},
{
"epoch": 0.09893153937475267,
"grad_norm": 335.71380615234375,
"learning_rate": 2.4588109591209485e-07,
"loss": 4.9979,
"step": 1250
},
{
"epoch": 0.10091017016224772,
"grad_norm": 149.5147247314453,
"learning_rate": 2.457986518751072e-07,
"loss": 4.714,
"step": 1275
},
{
"epoch": 0.10288880094974277,
"grad_norm": 154.0236358642578,
"learning_rate": 2.4571620783811947e-07,
"loss": 4.8015,
"step": 1300
},
{
"epoch": 0.10486743173723784,
"grad_norm": 450.5319519042969,
"learning_rate": 2.456337638011318e-07,
"loss": 4.6914,
"step": 1325
},
{
"epoch": 0.10684606252473289,
"grad_norm": 195.87863159179688,
"learning_rate": 2.455513197641441e-07,
"loss": 5.0124,
"step": 1350
},
{
"epoch": 0.10882469331222794,
"grad_norm": 198.12225341796875,
"learning_rate": 2.454688757271564e-07,
"loss": 4.5305,
"step": 1375
},
{
"epoch": 0.11080332409972299,
"grad_norm": 161.57623291015625,
"learning_rate": 2.453864316901687e-07,
"loss": 4.7806,
"step": 1400
},
{
"epoch": 0.11278195488721804,
"grad_norm": 187.8081817626953,
"learning_rate": 2.45303987653181e-07,
"loss": 4.9401,
"step": 1425
},
{
"epoch": 0.1147605856747131,
"grad_norm": 160.1893768310547,
"learning_rate": 2.4522154361619334e-07,
"loss": 4.8119,
"step": 1450
},
{
"epoch": 0.11673921646220815,
"grad_norm": 181.8563995361328,
"learning_rate": 2.4513909957920563e-07,
"loss": 4.7979,
"step": 1475
},
{
"epoch": 0.1187178472497032,
"grad_norm": 184.80641174316406,
"learning_rate": 2.4505665554221796e-07,
"loss": 4.8448,
"step": 1500
},
{
"epoch": 0.12069647803719825,
"grad_norm": 151.4502410888672,
"learning_rate": 2.4497421150523025e-07,
"loss": 4.7101,
"step": 1525
},
{
"epoch": 0.12267510882469332,
"grad_norm": 163.2119598388672,
"learning_rate": 2.4489176746824253e-07,
"loss": 4.8802,
"step": 1550
},
{
"epoch": 0.12465373961218837,
"grad_norm": 147.33741760253906,
"learning_rate": 2.4480932343125487e-07,
"loss": 4.6433,
"step": 1575
},
{
"epoch": 0.12663237039968342,
"grad_norm": 145.84716796875,
"learning_rate": 2.4472687939426716e-07,
"loss": 4.4118,
"step": 1600
},
{
"epoch": 0.12861100118717847,
"grad_norm": 111.55641174316406,
"learning_rate": 2.4464443535727944e-07,
"loss": 4.819,
"step": 1625
},
{
"epoch": 0.13058963197467352,
"grad_norm": 145.68092346191406,
"learning_rate": 2.445619913202918e-07,
"loss": 4.7752,
"step": 1650
},
{
"epoch": 0.13256826276216857,
"grad_norm": 274.0830078125,
"learning_rate": 2.4447954728330407e-07,
"loss": 4.8566,
"step": 1675
},
{
"epoch": 0.13454689354966362,
"grad_norm": 141.83982849121094,
"learning_rate": 2.4439710324631635e-07,
"loss": 4.6643,
"step": 1700
},
{
"epoch": 0.1365255243371587,
"grad_norm": 182.46160888671875,
"learning_rate": 2.443146592093287e-07,
"loss": 4.731,
"step": 1725
},
{
"epoch": 0.13850415512465375,
"grad_norm": 200.28773498535156,
"learning_rate": 2.44232215172341e-07,
"loss": 4.5525,
"step": 1750
},
{
"epoch": 0.1404827859121488,
"grad_norm": 163.7792510986328,
"learning_rate": 2.441497711353533e-07,
"loss": 4.8076,
"step": 1775
},
{
"epoch": 0.14246141669964385,
"grad_norm": 422.9642639160156,
"learning_rate": 2.440673270983656e-07,
"loss": 4.7045,
"step": 1800
},
{
"epoch": 0.1444400474871389,
"grad_norm": 187.99957275390625,
"learning_rate": 2.4398488306137794e-07,
"loss": 4.6615,
"step": 1825
},
{
"epoch": 0.14641867827463395,
"grad_norm": 144.52732849121094,
"learning_rate": 2.439024390243902e-07,
"loss": 4.7912,
"step": 1850
},
{
"epoch": 0.148397309062129,
"grad_norm": 192.0771026611328,
"learning_rate": 2.4381999498740256e-07,
"loss": 4.7916,
"step": 1875
},
{
"epoch": 0.15037593984962405,
"grad_norm": 148.06878662109375,
"learning_rate": 2.4373755095041484e-07,
"loss": 4.7782,
"step": 1900
},
{
"epoch": 0.1523545706371191,
"grad_norm": 131.4456329345703,
"learning_rate": 2.436551069134272e-07,
"loss": 4.579,
"step": 1925
},
{
"epoch": 0.15433320142461418,
"grad_norm": 141.84681701660156,
"learning_rate": 2.4357266287643947e-07,
"loss": 4.5776,
"step": 1950
},
{
"epoch": 0.15631183221210923,
"grad_norm": 122.31990051269531,
"learning_rate": 2.4349021883945175e-07,
"loss": 4.5185,
"step": 1975
},
{
"epoch": 0.15829046299960428,
"grad_norm": 229.08372497558594,
"learning_rate": 2.434077748024641e-07,
"loss": 4.6352,
"step": 2000
},
{
"epoch": 0.16026909378709933,
"grad_norm": 136.54153442382812,
"learning_rate": 2.433253307654764e-07,
"loss": 4.5512,
"step": 2025
},
{
"epoch": 0.16224772457459438,
"grad_norm": 237.05514526367188,
"learning_rate": 2.432428867284887e-07,
"loss": 4.7146,
"step": 2050
},
{
"epoch": 0.16422635536208943,
"grad_norm": 149.2750244140625,
"learning_rate": 2.43160442691501e-07,
"loss": 4.6935,
"step": 2075
},
{
"epoch": 0.16620498614958448,
"grad_norm": 149.77297973632812,
"learning_rate": 2.4307799865451334e-07,
"loss": 4.8223,
"step": 2100
},
{
"epoch": 0.16818361693707953,
"grad_norm": 235.3883056640625,
"learning_rate": 2.429955546175256e-07,
"loss": 4.6266,
"step": 2125
},
{
"epoch": 0.17016224772457458,
"grad_norm": 137.77316284179688,
"learning_rate": 2.429131105805379e-07,
"loss": 4.8543,
"step": 2150
},
{
"epoch": 0.17214087851206966,
"grad_norm": 143.8935089111328,
"learning_rate": 2.4283066654355025e-07,
"loss": 4.651,
"step": 2175
},
{
"epoch": 0.1741195092995647,
"grad_norm": 191.43856811523438,
"learning_rate": 2.4274822250656253e-07,
"loss": 4.4166,
"step": 2200
},
{
"epoch": 0.17609814008705976,
"grad_norm": 135.82838439941406,
"learning_rate": 2.426657784695748e-07,
"loss": 4.7078,
"step": 2225
},
{
"epoch": 0.1780767708745548,
"grad_norm": 114.28646087646484,
"learning_rate": 2.4258333443258715e-07,
"loss": 4.5316,
"step": 2250
},
{
"epoch": 0.18005540166204986,
"grad_norm": 237.41001892089844,
"learning_rate": 2.4250089039559944e-07,
"loss": 4.4699,
"step": 2275
},
{
"epoch": 0.1820340324495449,
"grad_norm": 124.57892608642578,
"learning_rate": 2.424184463586117e-07,
"loss": 4.5101,
"step": 2300
},
{
"epoch": 0.18401266323703996,
"grad_norm": 147.15554809570312,
"learning_rate": 2.4233600232162406e-07,
"loss": 4.5974,
"step": 2325
},
{
"epoch": 0.18599129402453501,
"grad_norm": 166.0609588623047,
"learning_rate": 2.4225355828463635e-07,
"loss": 4.5105,
"step": 2350
},
{
"epoch": 0.18796992481203006,
"grad_norm": 188.97705078125,
"learning_rate": 2.421711142476487e-07,
"loss": 4.587,
"step": 2375
},
{
"epoch": 0.18994855559952512,
"grad_norm": 243.09271240234375,
"learning_rate": 2.4208867021066097e-07,
"loss": 4.7686,
"step": 2400
},
{
"epoch": 0.1919271863870202,
"grad_norm": 127.40078735351562,
"learning_rate": 2.420062261736733e-07,
"loss": 4.4476,
"step": 2425
},
{
"epoch": 0.19390581717451524,
"grad_norm": 253.8776092529297,
"learning_rate": 2.419237821366856e-07,
"loss": 4.5478,
"step": 2450
},
{
"epoch": 0.1958844479620103,
"grad_norm": 123.27115631103516,
"learning_rate": 2.4184133809969793e-07,
"loss": 4.3502,
"step": 2475
},
{
"epoch": 0.19786307874950534,
"grad_norm": 138.00375366210938,
"learning_rate": 2.417588940627102e-07,
"loss": 4.3534,
"step": 2500
},
{
"epoch": 0.1998417095370004,
"grad_norm": 115.53954315185547,
"learning_rate": 2.4167645002572256e-07,
"loss": 4.7066,
"step": 2525
},
{
"epoch": 0.20182034032449545,
"grad_norm": 180.38809204101562,
"learning_rate": 2.4159400598873484e-07,
"loss": 4.6605,
"step": 2550
},
{
"epoch": 0.2037989711119905,
"grad_norm": 129.8457489013672,
"learning_rate": 2.415115619517472e-07,
"loss": 4.3849,
"step": 2575
},
{
"epoch": 0.20577760189948555,
"grad_norm": 156.64404296875,
"learning_rate": 2.4142911791475946e-07,
"loss": 4.3434,
"step": 2600
},
{
"epoch": 0.2077562326869806,
"grad_norm": 162.81320190429688,
"learning_rate": 2.4134667387777175e-07,
"loss": 4.5466,
"step": 2625
},
{
"epoch": 0.20973486347447567,
"grad_norm": 128.7244873046875,
"learning_rate": 2.412642298407841e-07,
"loss": 4.5358,
"step": 2650
},
{
"epoch": 0.21171349426197072,
"grad_norm": 217.59042358398438,
"learning_rate": 2.4118178580379637e-07,
"loss": 4.5235,
"step": 2675
},
{
"epoch": 0.21369212504946578,
"grad_norm": 144.84365844726562,
"learning_rate": 2.410993417668087e-07,
"loss": 4.3811,
"step": 2700
},
{
"epoch": 0.21567075583696083,
"grad_norm": 146.22451782226562,
"learning_rate": 2.41016897729821e-07,
"loss": 4.3797,
"step": 2725
},
{
"epoch": 0.21764938662445588,
"grad_norm": 198.39772033691406,
"learning_rate": 2.409344536928333e-07,
"loss": 4.4303,
"step": 2750
},
{
"epoch": 0.21962801741195093,
"grad_norm": 158.10592651367188,
"learning_rate": 2.408520096558456e-07,
"loss": 4.3633,
"step": 2775
},
{
"epoch": 0.22160664819944598,
"grad_norm": 166.79954528808594,
"learning_rate": 2.407695656188579e-07,
"loss": 4.5392,
"step": 2800
},
{
"epoch": 0.22358527898694103,
"grad_norm": 207.30593872070312,
"learning_rate": 2.406871215818702e-07,
"loss": 4.5003,
"step": 2825
},
{
"epoch": 0.22556390977443608,
"grad_norm": 128.81883239746094,
"learning_rate": 2.4060467754488253e-07,
"loss": 4.5416,
"step": 2850
},
{
"epoch": 0.22754254056193116,
"grad_norm": 181.48960876464844,
"learning_rate": 2.405222335078948e-07,
"loss": 4.1725,
"step": 2875
},
{
"epoch": 0.2295211713494262,
"grad_norm": 179.47384643554688,
"learning_rate": 2.4043978947090715e-07,
"loss": 4.5229,
"step": 2900
},
{
"epoch": 0.23149980213692126,
"grad_norm": 144.242919921875,
"learning_rate": 2.4035734543391943e-07,
"loss": 4.3295,
"step": 2925
},
{
"epoch": 0.2334784329244163,
"grad_norm": 177.61968994140625,
"learning_rate": 2.402749013969317e-07,
"loss": 4.4266,
"step": 2950
},
{
"epoch": 0.23545706371191136,
"grad_norm": 143.8682861328125,
"learning_rate": 2.4019245735994406e-07,
"loss": 4.2341,
"step": 2975
},
{
"epoch": 0.2374356944994064,
"grad_norm": 128.8461151123047,
"learning_rate": 2.4011001332295634e-07,
"loss": 4.3676,
"step": 3000
},
{
"epoch": 0.23941432528690146,
"grad_norm": 160.70687866210938,
"learning_rate": 2.400275692859687e-07,
"loss": 4.3945,
"step": 3025
},
{
"epoch": 0.2413929560743965,
"grad_norm": 157.65855407714844,
"learning_rate": 2.3994512524898097e-07,
"loss": 4.4967,
"step": 3050
},
{
"epoch": 0.24337158686189156,
"grad_norm": 125.79988861083984,
"learning_rate": 2.398626812119933e-07,
"loss": 4.279,
"step": 3075
},
{
"epoch": 0.24535021764938664,
"grad_norm": 168.8534698486328,
"learning_rate": 2.397802371750056e-07,
"loss": 4.4813,
"step": 3100
},
{
"epoch": 0.2473288484368817,
"grad_norm": 120.4126968383789,
"learning_rate": 2.3969779313801793e-07,
"loss": 4.1997,
"step": 3125
},
{
"epoch": 0.24930747922437674,
"grad_norm": 115.56365203857422,
"learning_rate": 2.396153491010302e-07,
"loss": 4.4076,
"step": 3150
},
{
"epoch": 0.2512861100118718,
"grad_norm": 152.89859008789062,
"learning_rate": 2.3953290506404255e-07,
"loss": 4.2893,
"step": 3175
},
{
"epoch": 0.25326474079936684,
"grad_norm": 177.6272735595703,
"learning_rate": 2.3945046102705484e-07,
"loss": 4.4892,
"step": 3200
},
{
"epoch": 0.2552433715868619,
"grad_norm": 131.46661376953125,
"learning_rate": 2.393680169900671e-07,
"loss": 4.2702,
"step": 3225
},
{
"epoch": 0.25722200237435694,
"grad_norm": 101.60210418701172,
"learning_rate": 2.3928557295307946e-07,
"loss": 4.2209,
"step": 3250
},
{
"epoch": 0.259200633161852,
"grad_norm": 199.7799835205078,
"learning_rate": 2.3920312891609174e-07,
"loss": 4.1502,
"step": 3275
},
{
"epoch": 0.26117926394934704,
"grad_norm": 163.44424438476562,
"learning_rate": 2.391206848791041e-07,
"loss": 4.3423,
"step": 3300
},
{
"epoch": 0.2631578947368421,
"grad_norm": 148.59519958496094,
"learning_rate": 2.3903824084211637e-07,
"loss": 4.4833,
"step": 3325
},
{
"epoch": 0.26513652552433714,
"grad_norm": 129.75927734375,
"learning_rate": 2.3895579680512865e-07,
"loss": 4.4745,
"step": 3350
},
{
"epoch": 0.2671151563118322,
"grad_norm": 126.6795654296875,
"learning_rate": 2.38873352768141e-07,
"loss": 4.3964,
"step": 3375
},
{
"epoch": 0.26909378709932724,
"grad_norm": 157.1032257080078,
"learning_rate": 2.387909087311533e-07,
"loss": 4.3419,
"step": 3400
},
{
"epoch": 0.2710724178868223,
"grad_norm": 142.79139709472656,
"learning_rate": 2.3870846469416556e-07,
"loss": 4.2243,
"step": 3425
},
{
"epoch": 0.2730510486743174,
"grad_norm": 137.3797607421875,
"learning_rate": 2.386260206571779e-07,
"loss": 4.1661,
"step": 3450
},
{
"epoch": 0.27502967946181245,
"grad_norm": 148.77401733398438,
"learning_rate": 2.385435766201902e-07,
"loss": 4.483,
"step": 3475
},
{
"epoch": 0.2770083102493075,
"grad_norm": 124.54267120361328,
"learning_rate": 2.384611325832025e-07,
"loss": 4.369,
"step": 3500
},
{
"epoch": 0.27898694103680255,
"grad_norm": 113.43370056152344,
"learning_rate": 2.383786885462148e-07,
"loss": 4.1491,
"step": 3525
},
{
"epoch": 0.2809655718242976,
"grad_norm": 155.67677307128906,
"learning_rate": 2.3829624450922712e-07,
"loss": 4.3403,
"step": 3550
},
{
"epoch": 0.28294420261179265,
"grad_norm": 201.27784729003906,
"learning_rate": 2.3821380047223943e-07,
"loss": 4.3563,
"step": 3575
},
{
"epoch": 0.2849228333992877,
"grad_norm": 104.74275970458984,
"learning_rate": 2.3813135643525174e-07,
"loss": 4.2706,
"step": 3600
},
{
"epoch": 0.28690146418678275,
"grad_norm": 133.6251678466797,
"learning_rate": 2.3804891239826405e-07,
"loss": 4.3637,
"step": 3625
},
{
"epoch": 0.2888800949742778,
"grad_norm": 102.35352325439453,
"learning_rate": 2.3796646836127634e-07,
"loss": 4.2585,
"step": 3650
},
{
"epoch": 0.29085872576177285,
"grad_norm": 156.72654724121094,
"learning_rate": 2.3788402432428868e-07,
"loss": 4.3448,
"step": 3675
},
{
"epoch": 0.2928373565492679,
"grad_norm": 121.19142150878906,
"learning_rate": 2.3780158028730096e-07,
"loss": 4.1475,
"step": 3700
},
{
"epoch": 0.29481598733676295,
"grad_norm": 138.72952270507812,
"learning_rate": 2.3771913625031327e-07,
"loss": 4.2475,
"step": 3725
},
{
"epoch": 0.296794618124258,
"grad_norm": 314.35113525390625,
"learning_rate": 2.3763669221332559e-07,
"loss": 4.2643,
"step": 3750
},
{
"epoch": 0.29877324891175305,
"grad_norm": 131.71240234375,
"learning_rate": 2.375542481763379e-07,
"loss": 4.2741,
"step": 3775
},
{
"epoch": 0.3007518796992481,
"grad_norm": 193.2744598388672,
"learning_rate": 2.374718041393502e-07,
"loss": 4.2314,
"step": 3800
},
{
"epoch": 0.30273051048674315,
"grad_norm": 146.98760986328125,
"learning_rate": 2.3738936010236252e-07,
"loss": 4.5421,
"step": 3825
},
{
"epoch": 0.3047091412742382,
"grad_norm": 106.49159240722656,
"learning_rate": 2.373069160653748e-07,
"loss": 4.0922,
"step": 3850
},
{
"epoch": 0.30668777206173325,
"grad_norm": 128.12686157226562,
"learning_rate": 2.3722447202838712e-07,
"loss": 4.3171,
"step": 3875
},
{
"epoch": 0.30866640284922836,
"grad_norm": 165.8458251953125,
"learning_rate": 2.3714202799139943e-07,
"loss": 4.1937,
"step": 3900
},
{
"epoch": 0.3106450336367234,
"grad_norm": 129.49652099609375,
"learning_rate": 2.3705958395441171e-07,
"loss": 4.2486,
"step": 3925
},
{
"epoch": 0.31262366442421846,
"grad_norm": 113.08882141113281,
"learning_rate": 2.3697713991742405e-07,
"loss": 3.9533,
"step": 3950
},
{
"epoch": 0.3146022952117135,
"grad_norm": 116.51021575927734,
"learning_rate": 2.3689469588043634e-07,
"loss": 4.2525,
"step": 3975
},
{
"epoch": 0.31658092599920856,
"grad_norm": 95.54279327392578,
"learning_rate": 2.3681225184344867e-07,
"loss": 4.1355,
"step": 4000
},
{
"epoch": 0.3185595567867036,
"grad_norm": 123.10621643066406,
"learning_rate": 2.3672980780646096e-07,
"loss": 4.3705,
"step": 4025
},
{
"epoch": 0.32053818757419866,
"grad_norm": 142.11273193359375,
"learning_rate": 2.3664736376947327e-07,
"loss": 4.2712,
"step": 4050
},
{
"epoch": 0.3225168183616937,
"grad_norm": 162.17141723632812,
"learning_rate": 2.3656491973248558e-07,
"loss": 4.1127,
"step": 4075
},
{
"epoch": 0.32449544914918876,
"grad_norm": 160.26893615722656,
"learning_rate": 2.364824756954979e-07,
"loss": 4.2687,
"step": 4100
},
{
"epoch": 0.3264740799366838,
"grad_norm": 134.65093994140625,
"learning_rate": 2.3640003165851018e-07,
"loss": 4.3567,
"step": 4125
},
{
"epoch": 0.32845271072417886,
"grad_norm": 178.23516845703125,
"learning_rate": 2.3631758762152252e-07,
"loss": 4.097,
"step": 4150
},
{
"epoch": 0.3304313415116739,
"grad_norm": 151.1556396484375,
"learning_rate": 2.362351435845348e-07,
"loss": 4.1602,
"step": 4175
},
{
"epoch": 0.33240997229916897,
"grad_norm": 154.64442443847656,
"learning_rate": 2.3615269954754711e-07,
"loss": 4.2365,
"step": 4200
},
{
"epoch": 0.334388603086664,
"grad_norm": 226.6827850341797,
"learning_rate": 2.3607025551055943e-07,
"loss": 4.3196,
"step": 4225
},
{
"epoch": 0.33636723387415907,
"grad_norm": 172.67916870117188,
"learning_rate": 2.359878114735717e-07,
"loss": 4.4476,
"step": 4250
},
{
"epoch": 0.3383458646616541,
"grad_norm": 124.78984069824219,
"learning_rate": 2.3590536743658405e-07,
"loss": 4.4006,
"step": 4275
},
{
"epoch": 0.34032449544914917,
"grad_norm": 156.81365966796875,
"learning_rate": 2.3582292339959633e-07,
"loss": 4.3914,
"step": 4300
},
{
"epoch": 0.3423031262366442,
"grad_norm": 116.53181457519531,
"learning_rate": 2.3574047936260865e-07,
"loss": 4.2846,
"step": 4325
},
{
"epoch": 0.3442817570241393,
"grad_norm": 146.16543579101562,
"learning_rate": 2.3565803532562096e-07,
"loss": 4.1371,
"step": 4350
},
{
"epoch": 0.3462603878116344,
"grad_norm": 213.07974243164062,
"learning_rate": 2.3557559128863327e-07,
"loss": 4.2294,
"step": 4375
},
{
"epoch": 0.3482390185991294,
"grad_norm": 99.38206481933594,
"learning_rate": 2.3549314725164558e-07,
"loss": 4.1726,
"step": 4400
},
{
"epoch": 0.3502176493866245,
"grad_norm": 162.97059631347656,
"learning_rate": 2.354107032146579e-07,
"loss": 4.0507,
"step": 4425
},
{
"epoch": 0.3521962801741195,
"grad_norm": 132.77474975585938,
"learning_rate": 2.3532825917767018e-07,
"loss": 4.0016,
"step": 4450
},
{
"epoch": 0.3541749109616146,
"grad_norm": 126.9658203125,
"learning_rate": 2.3524581514068252e-07,
"loss": 4.2731,
"step": 4475
},
{
"epoch": 0.3561535417491096,
"grad_norm": 194.47755432128906,
"learning_rate": 2.351633711036948e-07,
"loss": 4.1119,
"step": 4500
},
{
"epoch": 0.3581321725366047,
"grad_norm": 153.6606903076172,
"learning_rate": 2.3508092706670709e-07,
"loss": 4.4556,
"step": 4525
},
{
"epoch": 0.3601108033240997,
"grad_norm": 146.66709899902344,
"learning_rate": 2.3499848302971942e-07,
"loss": 4.3314,
"step": 4550
},
{
"epoch": 0.3620894341115948,
"grad_norm": 111.01129913330078,
"learning_rate": 2.349160389927317e-07,
"loss": 4.2929,
"step": 4575
},
{
"epoch": 0.3640680648990898,
"grad_norm": 137.40582275390625,
"learning_rate": 2.3483359495574405e-07,
"loss": 4.2198,
"step": 4600
},
{
"epoch": 0.3660466956865849,
"grad_norm": 142.0623779296875,
"learning_rate": 2.3475115091875633e-07,
"loss": 4.2013,
"step": 4625
},
{
"epoch": 0.36802532647407993,
"grad_norm": 135.2795867919922,
"learning_rate": 2.3466870688176864e-07,
"loss": 4.231,
"step": 4650
},
{
"epoch": 0.370003957261575,
"grad_norm": 127.59281158447266,
"learning_rate": 2.3458626284478096e-07,
"loss": 3.9613,
"step": 4675
},
{
"epoch": 0.37198258804907003,
"grad_norm": 132.48663330078125,
"learning_rate": 2.3450381880779327e-07,
"loss": 4.1925,
"step": 4700
},
{
"epoch": 0.3739612188365651,
"grad_norm": 135.35409545898438,
"learning_rate": 2.3442137477080555e-07,
"loss": 4.1828,
"step": 4725
},
{
"epoch": 0.37593984962406013,
"grad_norm": 107.55503845214844,
"learning_rate": 2.343389307338179e-07,
"loss": 4.2578,
"step": 4750
},
{
"epoch": 0.3779184804115552,
"grad_norm": 132.79620361328125,
"learning_rate": 2.3425648669683018e-07,
"loss": 4.0254,
"step": 4775
},
{
"epoch": 0.37989711119905023,
"grad_norm": 123.6044692993164,
"learning_rate": 2.341740426598425e-07,
"loss": 3.9981,
"step": 4800
},
{
"epoch": 0.38187574198654534,
"grad_norm": 149.656005859375,
"learning_rate": 2.340915986228548e-07,
"loss": 4.1067,
"step": 4825
},
{
"epoch": 0.3838543727740404,
"grad_norm": 122.97380065917969,
"learning_rate": 2.3400915458586708e-07,
"loss": 4.2396,
"step": 4850
},
{
"epoch": 0.38583300356153544,
"grad_norm": 140.10183715820312,
"learning_rate": 2.3392671054887942e-07,
"loss": 4.1309,
"step": 4875
},
{
"epoch": 0.3878116343490305,
"grad_norm": 137.91583251953125,
"learning_rate": 2.338442665118917e-07,
"loss": 4.0575,
"step": 4900
},
{
"epoch": 0.38979026513652554,
"grad_norm": 137.72152709960938,
"learning_rate": 2.3376182247490402e-07,
"loss": 4.159,
"step": 4925
},
{
"epoch": 0.3917688959240206,
"grad_norm": 84.3819808959961,
"learning_rate": 2.3367937843791633e-07,
"loss": 4.2611,
"step": 4950
},
{
"epoch": 0.39374752671151564,
"grad_norm": 200.3111114501953,
"learning_rate": 2.3359693440092864e-07,
"loss": 4.167,
"step": 4975
},
{
"epoch": 0.3957261574990107,
"grad_norm": 123.27460479736328,
"learning_rate": 2.3351449036394095e-07,
"loss": 4.2918,
"step": 5000
},
{
"epoch": 0.39770478828650574,
"grad_norm": 111.70620727539062,
"learning_rate": 2.3343204632695327e-07,
"loss": 4.2242,
"step": 5025
},
{
"epoch": 0.3996834190740008,
"grad_norm": 107.74165344238281,
"learning_rate": 2.3334960228996555e-07,
"loss": 4.3572,
"step": 5050
},
{
"epoch": 0.40166204986149584,
"grad_norm": 138.31423950195312,
"learning_rate": 2.332671582529779e-07,
"loss": 4.1759,
"step": 5075
},
{
"epoch": 0.4036406806489909,
"grad_norm": 104.73587799072266,
"learning_rate": 2.3318471421599017e-07,
"loss": 4.1695,
"step": 5100
},
{
"epoch": 0.40561931143648594,
"grad_norm": 138.1061553955078,
"learning_rate": 2.3310227017900246e-07,
"loss": 4.0986,
"step": 5125
},
{
"epoch": 0.407597942223981,
"grad_norm": 148.92279052734375,
"learning_rate": 2.330198261420148e-07,
"loss": 4.3455,
"step": 5150
},
{
"epoch": 0.40957657301147604,
"grad_norm": 321.29852294921875,
"learning_rate": 2.3293738210502708e-07,
"loss": 4.1285,
"step": 5175
},
{
"epoch": 0.4115552037989711,
"grad_norm": 114.85989379882812,
"learning_rate": 2.3285493806803942e-07,
"loss": 3.9628,
"step": 5200
},
{
"epoch": 0.41353383458646614,
"grad_norm": 137.27610778808594,
"learning_rate": 2.327724940310517e-07,
"loss": 4.1521,
"step": 5225
},
{
"epoch": 0.4155124653739612,
"grad_norm": 96.02686309814453,
"learning_rate": 2.3269004999406402e-07,
"loss": 4.027,
"step": 5250
},
{
"epoch": 0.4174910961614563,
"grad_norm": 213.81649780273438,
"learning_rate": 2.3260760595707633e-07,
"loss": 4.0522,
"step": 5275
},
{
"epoch": 0.41946972694895135,
"grad_norm": 160.4125518798828,
"learning_rate": 2.3252516192008864e-07,
"loss": 4.09,
"step": 5300
},
{
"epoch": 0.4214483577364464,
"grad_norm": 167.58741760253906,
"learning_rate": 2.3244271788310093e-07,
"loss": 4.1128,
"step": 5325
},
{
"epoch": 0.42342698852394145,
"grad_norm": 159.55303955078125,
"learning_rate": 2.3236027384611326e-07,
"loss": 4.0867,
"step": 5350
},
{
"epoch": 0.4254056193114365,
"grad_norm": 122.51324462890625,
"learning_rate": 2.3227782980912555e-07,
"loss": 4.2261,
"step": 5375
},
{
"epoch": 0.42738425009893155,
"grad_norm": 185.9108428955078,
"learning_rate": 2.3219538577213789e-07,
"loss": 3.9684,
"step": 5400
},
{
"epoch": 0.4293628808864266,
"grad_norm": 195.37579345703125,
"learning_rate": 2.3211294173515017e-07,
"loss": 4.0779,
"step": 5425
},
{
"epoch": 0.43134151167392165,
"grad_norm": 157.84371948242188,
"learning_rate": 2.3203049769816246e-07,
"loss": 4.1991,
"step": 5450
},
{
"epoch": 0.4333201424614167,
"grad_norm": 111.01512908935547,
"learning_rate": 2.319480536611748e-07,
"loss": 3.9962,
"step": 5475
},
{
"epoch": 0.43529877324891175,
"grad_norm": 114.49053955078125,
"learning_rate": 2.3186560962418708e-07,
"loss": 3.8972,
"step": 5500
},
{
"epoch": 0.4372774040364068,
"grad_norm": 168.17874145507812,
"learning_rate": 2.317831655871994e-07,
"loss": 4.1913,
"step": 5525
},
{
"epoch": 0.43925603482390185,
"grad_norm": 140.61912536621094,
"learning_rate": 2.317007215502117e-07,
"loss": 4.1396,
"step": 5550
},
{
"epoch": 0.4412346656113969,
"grad_norm": 138.01805114746094,
"learning_rate": 2.3161827751322401e-07,
"loss": 4.1399,
"step": 5575
},
{
"epoch": 0.44321329639889195,
"grad_norm": 188.0181427001953,
"learning_rate": 2.3153583347623633e-07,
"loss": 4.0329,
"step": 5600
},
{
"epoch": 0.445191927186387,
"grad_norm": 170.8402099609375,
"learning_rate": 2.3145338943924864e-07,
"loss": 4.3414,
"step": 5625
},
{
"epoch": 0.44717055797388205,
"grad_norm": 200.65077209472656,
"learning_rate": 2.3137094540226092e-07,
"loss": 4.2154,
"step": 5650
},
{
"epoch": 0.4491491887613771,
"grad_norm": 120.18091583251953,
"learning_rate": 2.3128850136527326e-07,
"loss": 4.0372,
"step": 5675
},
{
"epoch": 0.45112781954887216,
"grad_norm": 89.9730224609375,
"learning_rate": 2.3120605732828555e-07,
"loss": 4.1059,
"step": 5700
},
{
"epoch": 0.45310645033636726,
"grad_norm": 133.7999267578125,
"learning_rate": 2.3112361329129786e-07,
"loss": 4.2035,
"step": 5725
},
{
"epoch": 0.4550850811238623,
"grad_norm": 88.3386459350586,
"learning_rate": 2.3104116925431017e-07,
"loss": 4.0566,
"step": 5750
},
{
"epoch": 0.45706371191135736,
"grad_norm": 130.95127868652344,
"learning_rate": 2.3095872521732245e-07,
"loss": 4.3084,
"step": 5775
},
{
"epoch": 0.4590423426988524,
"grad_norm": 162.55679321289062,
"learning_rate": 2.308762811803348e-07,
"loss": 4.0288,
"step": 5800
},
{
"epoch": 0.46102097348634746,
"grad_norm": 104.4178695678711,
"learning_rate": 2.3079383714334708e-07,
"loss": 3.9244,
"step": 5825
},
{
"epoch": 0.4629996042738425,
"grad_norm": 235.28123474121094,
"learning_rate": 2.307113931063594e-07,
"loss": 4.1106,
"step": 5850
},
{
"epoch": 0.46497823506133756,
"grad_norm": 289.6645812988281,
"learning_rate": 2.306289490693717e-07,
"loss": 4.0457,
"step": 5875
},
{
"epoch": 0.4669568658488326,
"grad_norm": 99.97111511230469,
"learning_rate": 2.30546505032384e-07,
"loss": 4.2542,
"step": 5900
},
{
"epoch": 0.46893549663632766,
"grad_norm": 260.0950622558594,
"learning_rate": 2.304640609953963e-07,
"loss": 4.1564,
"step": 5925
},
{
"epoch": 0.4709141274238227,
"grad_norm": 113.74392700195312,
"learning_rate": 2.3038161695840864e-07,
"loss": 4.0403,
"step": 5950
},
{
"epoch": 0.47289275821131777,
"grad_norm": 79.32340240478516,
"learning_rate": 2.3029917292142092e-07,
"loss": 4.0408,
"step": 5975
},
{
"epoch": 0.4748713889988128,
"grad_norm": 95.92308807373047,
"learning_rate": 2.3021672888443326e-07,
"loss": 3.9811,
"step": 6000
},
{
"epoch": 0.47685001978630787,
"grad_norm": 94.5758285522461,
"learning_rate": 2.3013428484744554e-07,
"loss": 4.1102,
"step": 6025
},
{
"epoch": 0.4788286505738029,
"grad_norm": 142.32131958007812,
"learning_rate": 2.3005184081045786e-07,
"loss": 3.989,
"step": 6050
},
{
"epoch": 0.48080728136129797,
"grad_norm": 97.84469604492188,
"learning_rate": 2.2996939677347017e-07,
"loss": 3.9512,
"step": 6075
},
{
"epoch": 0.482785912148793,
"grad_norm": 94.38491821289062,
"learning_rate": 2.2988695273648245e-07,
"loss": 3.9475,
"step": 6100
},
{
"epoch": 0.48476454293628807,
"grad_norm": 124.32872772216797,
"learning_rate": 2.2980450869949476e-07,
"loss": 4.1352,
"step": 6125
},
{
"epoch": 0.4867431737237831,
"grad_norm": 196.1511993408203,
"learning_rate": 2.2972206466250708e-07,
"loss": 4.2956,
"step": 6150
},
{
"epoch": 0.48872180451127817,
"grad_norm": 144.1227264404297,
"learning_rate": 2.296396206255194e-07,
"loss": 3.9718,
"step": 6175
},
{
"epoch": 0.4907004352987733,
"grad_norm": 115.52275085449219,
"learning_rate": 2.295571765885317e-07,
"loss": 3.9135,
"step": 6200
},
{
"epoch": 0.4926790660862683,
"grad_norm": 117.71548461914062,
"learning_rate": 2.29474732551544e-07,
"loss": 3.9026,
"step": 6225
},
{
"epoch": 0.4946576968737634,
"grad_norm": 135.42698669433594,
"learning_rate": 2.293922885145563e-07,
"loss": 4.0369,
"step": 6250
},
{
"epoch": 0.4966363276612584,
"grad_norm": 142.4741973876953,
"learning_rate": 2.2930984447756863e-07,
"loss": 4.3588,
"step": 6275
},
{
"epoch": 0.4986149584487535,
"grad_norm": 128.56195068359375,
"learning_rate": 2.2922740044058092e-07,
"loss": 3.9089,
"step": 6300
},
{
"epoch": 0.5005935892362485,
"grad_norm": 96.84894561767578,
"learning_rate": 2.2914495640359323e-07,
"loss": 4.1722,
"step": 6325
},
{
"epoch": 0.5025722200237436,
"grad_norm": 236.92965698242188,
"learning_rate": 2.2906251236660554e-07,
"loss": 3.9729,
"step": 6350
},
{
"epoch": 0.5045508508112386,
"grad_norm": 135.83609008789062,
"learning_rate": 2.2898006832961783e-07,
"loss": 4.0322,
"step": 6375
},
{
"epoch": 0.5065294815987337,
"grad_norm": 123.36375427246094,
"learning_rate": 2.2889762429263017e-07,
"loss": 4.0042,
"step": 6400
},
{
"epoch": 0.5085081123862287,
"grad_norm": 118.30574035644531,
"learning_rate": 2.2881518025564245e-07,
"loss": 4.1079,
"step": 6425
},
{
"epoch": 0.5104867431737238,
"grad_norm": 107.81358337402344,
"learning_rate": 2.2873273621865476e-07,
"loss": 4.1198,
"step": 6450
},
{
"epoch": 0.5124653739612188,
"grad_norm": 146.2493438720703,
"learning_rate": 2.2865029218166707e-07,
"loss": 4.0814,
"step": 6475
},
{
"epoch": 0.5144440047487139,
"grad_norm": 136.8212890625,
"learning_rate": 2.2856784814467939e-07,
"loss": 4.0562,
"step": 6500
},
{
"epoch": 0.5164226355362089,
"grad_norm": 139.30670166015625,
"learning_rate": 2.2848540410769167e-07,
"loss": 4.1199,
"step": 6525
},
{
"epoch": 0.518401266323704,
"grad_norm": 194.90414428710938,
"learning_rate": 2.28402960070704e-07,
"loss": 4.0562,
"step": 6550
},
{
"epoch": 0.520379897111199,
"grad_norm": 103.54257202148438,
"learning_rate": 2.283205160337163e-07,
"loss": 4.0797,
"step": 6575
},
{
"epoch": 0.5223585278986941,
"grad_norm": 101.63102722167969,
"learning_rate": 2.2823807199672863e-07,
"loss": 4.0591,
"step": 6600
},
{
"epoch": 0.5243371586861891,
"grad_norm": 104.28479766845703,
"learning_rate": 2.2815562795974092e-07,
"loss": 3.8991,
"step": 6625
},
{
"epoch": 0.5263157894736842,
"grad_norm": 166.01107788085938,
"learning_rate": 2.2807318392275323e-07,
"loss": 4.1011,
"step": 6650
},
{
"epoch": 0.5282944202611792,
"grad_norm": 154.64959716796875,
"learning_rate": 2.2799073988576554e-07,
"loss": 3.9283,
"step": 6675
},
{
"epoch": 0.5302730510486743,
"grad_norm": 96.0099868774414,
"learning_rate": 2.2790829584877782e-07,
"loss": 3.8247,
"step": 6700
},
{
"epoch": 0.5322516818361693,
"grad_norm": 120.90514373779297,
"learning_rate": 2.2782585181179014e-07,
"loss": 4.0629,
"step": 6725
},
{
"epoch": 0.5342303126236644,
"grad_norm": 106.48863983154297,
"learning_rate": 2.2774340777480245e-07,
"loss": 4.0127,
"step": 6750
},
{
"epoch": 0.5362089434111594,
"grad_norm": 113.17047882080078,
"learning_rate": 2.2766096373781476e-07,
"loss": 4.03,
"step": 6775
},
{
"epoch": 0.5381875741986545,
"grad_norm": 130.6500701904297,
"learning_rate": 2.2757851970082707e-07,
"loss": 4.0192,
"step": 6800
},
{
"epoch": 0.5401662049861495,
"grad_norm": 142.3747100830078,
"learning_rate": 2.2749607566383938e-07,
"loss": 4.1507,
"step": 6825
},
{
"epoch": 0.5421448357736446,
"grad_norm": 125.88548278808594,
"learning_rate": 2.2741363162685167e-07,
"loss": 4.2026,
"step": 6850
},
{
"epoch": 0.5441234665611397,
"grad_norm": 156.44570922851562,
"learning_rate": 2.27331187589864e-07,
"loss": 4.1063,
"step": 6875
},
{
"epoch": 0.5461020973486348,
"grad_norm": 150.82635498046875,
"learning_rate": 2.272487435528763e-07,
"loss": 4.0477,
"step": 6900
},
{
"epoch": 0.5480807281361298,
"grad_norm": 170.67994689941406,
"learning_rate": 2.271662995158886e-07,
"loss": 4.1398,
"step": 6925
},
{
"epoch": 0.5500593589236249,
"grad_norm": 114.224609375,
"learning_rate": 2.2708385547890091e-07,
"loss": 4.0906,
"step": 6950
},
{
"epoch": 0.55203798971112,
"grad_norm": 135.5966033935547,
"learning_rate": 2.2700141144191323e-07,
"loss": 3.872,
"step": 6975
},
{
"epoch": 0.554016620498615,
"grad_norm": 120.73974609375,
"learning_rate": 2.2691896740492554e-07,
"loss": 3.9762,
"step": 7000
},
{
"epoch": 0.55599525128611,
"grad_norm": 107.66891479492188,
"learning_rate": 2.2683652336793782e-07,
"loss": 4.0551,
"step": 7025
},
{
"epoch": 0.5579738820736051,
"grad_norm": 107.60162353515625,
"learning_rate": 2.2675407933095013e-07,
"loss": 3.973,
"step": 7050
},
{
"epoch": 0.5599525128611001,
"grad_norm": 118.88258361816406,
"learning_rate": 2.2667163529396245e-07,
"loss": 3.9864,
"step": 7075
},
{
"epoch": 0.5619311436485952,
"grad_norm": 148.85667419433594,
"learning_rate": 2.2658919125697476e-07,
"loss": 3.9409,
"step": 7100
},
{
"epoch": 0.5639097744360902,
"grad_norm": 148.57321166992188,
"learning_rate": 2.2650674721998704e-07,
"loss": 3.9611,
"step": 7125
},
{
"epoch": 0.5658884052235853,
"grad_norm": 172.39999389648438,
"learning_rate": 2.2642430318299938e-07,
"loss": 3.97,
"step": 7150
},
{
"epoch": 0.5678670360110804,
"grad_norm": 120.57051086425781,
"learning_rate": 2.2634185914601167e-07,
"loss": 3.9352,
"step": 7175
},
{
"epoch": 0.5698456667985754,
"grad_norm": 143.2531280517578,
"learning_rate": 2.26259415109024e-07,
"loss": 3.9686,
"step": 7200
},
{
"epoch": 0.5718242975860705,
"grad_norm": 123.57396697998047,
"learning_rate": 2.261769710720363e-07,
"loss": 4.0855,
"step": 7225
},
{
"epoch": 0.5738029283735655,
"grad_norm": 115.12631225585938,
"learning_rate": 2.260945270350486e-07,
"loss": 3.9754,
"step": 7250
},
{
"epoch": 0.5757815591610606,
"grad_norm": 114.95091247558594,
"learning_rate": 2.260120829980609e-07,
"loss": 3.8981,
"step": 7275
},
{
"epoch": 0.5777601899485556,
"grad_norm": 105.46833038330078,
"learning_rate": 2.2592963896107322e-07,
"loss": 3.9452,
"step": 7300
},
{
"epoch": 0.5797388207360507,
"grad_norm": 132.89012145996094,
"learning_rate": 2.258471949240855e-07,
"loss": 4.0808,
"step": 7325
},
{
"epoch": 0.5817174515235457,
"grad_norm": 143.6460418701172,
"learning_rate": 2.2576475088709782e-07,
"loss": 4.0108,
"step": 7350
},
{
"epoch": 0.5836960823110408,
"grad_norm": 130.83352661132812,
"learning_rate": 2.2568230685011013e-07,
"loss": 3.9701,
"step": 7375
},
{
"epoch": 0.5856747130985358,
"grad_norm": 111.10405731201172,
"learning_rate": 2.2559986281312244e-07,
"loss": 4.3162,
"step": 7400
},
{
"epoch": 0.5876533438860309,
"grad_norm": 163.31959533691406,
"learning_rate": 2.2551741877613476e-07,
"loss": 3.9096,
"step": 7425
},
{
"epoch": 0.5896319746735259,
"grad_norm": 134.72927856445312,
"learning_rate": 2.2543497473914704e-07,
"loss": 3.8888,
"step": 7450
},
{
"epoch": 0.591610605461021,
"grad_norm": 124.26619720458984,
"learning_rate": 2.2535253070215938e-07,
"loss": 4.0683,
"step": 7475
},
{
"epoch": 0.593589236248516,
"grad_norm": 106.8174057006836,
"learning_rate": 2.2527008666517166e-07,
"loss": 4.0547,
"step": 7500
},
{
"epoch": 0.5955678670360111,
"grad_norm": 108.11019897460938,
"learning_rate": 2.2518764262818398e-07,
"loss": 3.9728,
"step": 7525
},
{
"epoch": 0.5975464978235061,
"grad_norm": 117.44151306152344,
"learning_rate": 2.251051985911963e-07,
"loss": 4.0569,
"step": 7550
},
{
"epoch": 0.5995251286110012,
"grad_norm": 106.18008422851562,
"learning_rate": 2.250227545542086e-07,
"loss": 3.9042,
"step": 7575
},
{
"epoch": 0.6015037593984962,
"grad_norm": 88.67406463623047,
"learning_rate": 2.249403105172209e-07,
"loss": 4.0719,
"step": 7600
},
{
"epoch": 0.6034823901859913,
"grad_norm": 111.12770080566406,
"learning_rate": 2.248578664802332e-07,
"loss": 3.9749,
"step": 7625
},
{
"epoch": 0.6054610209734863,
"grad_norm": 119.26530456542969,
"learning_rate": 2.247754224432455e-07,
"loss": 3.9832,
"step": 7650
},
{
"epoch": 0.6074396517609814,
"grad_norm": 157.9289093017578,
"learning_rate": 2.2469297840625782e-07,
"loss": 3.9538,
"step": 7675
},
{
"epoch": 0.6094182825484764,
"grad_norm": 122.70995330810547,
"learning_rate": 2.2461053436927013e-07,
"loss": 3.8497,
"step": 7700
},
{
"epoch": 0.6113969133359715,
"grad_norm": 142.41835021972656,
"learning_rate": 2.2452809033228242e-07,
"loss": 3.9172,
"step": 7725
},
{
"epoch": 0.6133755441234665,
"grad_norm": 128.31825256347656,
"learning_rate": 2.2444564629529475e-07,
"loss": 3.7326,
"step": 7750
},
{
"epoch": 0.6153541749109616,
"grad_norm": 142.67408752441406,
"learning_rate": 2.2436320225830704e-07,
"loss": 3.8782,
"step": 7775
},
{
"epoch": 0.6173328056984567,
"grad_norm": 145.0731658935547,
"learning_rate": 2.2428075822131938e-07,
"loss": 4.024,
"step": 7800
},
{
"epoch": 0.6193114364859518,
"grad_norm": 187.09068298339844,
"learning_rate": 2.2419831418433166e-07,
"loss": 3.8939,
"step": 7825
},
{
"epoch": 0.6212900672734468,
"grad_norm": 122.93965148925781,
"learning_rate": 2.2411587014734397e-07,
"loss": 4.0373,
"step": 7850
},
{
"epoch": 0.6232686980609419,
"grad_norm": 152.1845245361328,
"learning_rate": 2.2403342611035628e-07,
"loss": 4.1168,
"step": 7875
},
{
"epoch": 0.6252473288484369,
"grad_norm": 100.07666778564453,
"learning_rate": 2.239509820733686e-07,
"loss": 3.9718,
"step": 7900
},
{
"epoch": 0.627225959635932,
"grad_norm": 130.85479736328125,
"learning_rate": 2.2386853803638088e-07,
"loss": 4.0301,
"step": 7925
},
{
"epoch": 0.629204590423427,
"grad_norm": 123.073974609375,
"learning_rate": 2.237860939993932e-07,
"loss": 4.0104,
"step": 7950
},
{
"epoch": 0.6311832212109221,
"grad_norm": 168.19808959960938,
"learning_rate": 2.237036499624055e-07,
"loss": 3.9421,
"step": 7975
},
{
"epoch": 0.6331618519984171,
"grad_norm": 118.69593811035156,
"learning_rate": 2.2362120592541782e-07,
"loss": 3.8238,
"step": 8000
},
{
"epoch": 0.6351404827859122,
"grad_norm": 192.9334259033203,
"learning_rate": 2.2353876188843013e-07,
"loss": 3.8227,
"step": 8025
},
{
"epoch": 0.6371191135734072,
"grad_norm": 103.11824035644531,
"learning_rate": 2.2345631785144241e-07,
"loss": 3.9359,
"step": 8050
},
{
"epoch": 0.6390977443609023,
"grad_norm": 129.3599090576172,
"learning_rate": 2.2337387381445475e-07,
"loss": 4.0562,
"step": 8075
},
{
"epoch": 0.6410763751483973,
"grad_norm": 124.06795501708984,
"learning_rate": 2.2329142977746704e-07,
"loss": 4.1502,
"step": 8100
},
{
"epoch": 0.6430550059358924,
"grad_norm": 113.18289184570312,
"learning_rate": 2.2320898574047935e-07,
"loss": 4.0059,
"step": 8125
},
{
"epoch": 0.6450336367233874,
"grad_norm": 117.89970397949219,
"learning_rate": 2.2312654170349166e-07,
"loss": 4.0162,
"step": 8150
},
{
"epoch": 0.6470122675108825,
"grad_norm": 109.6517105102539,
"learning_rate": 2.2304409766650397e-07,
"loss": 3.9979,
"step": 8175
},
{
"epoch": 0.6489908982983775,
"grad_norm": 123.35499572753906,
"learning_rate": 2.2296165362951628e-07,
"loss": 3.9273,
"step": 8200
},
{
"epoch": 0.6509695290858726,
"grad_norm": 141.97459411621094,
"learning_rate": 2.228792095925286e-07,
"loss": 4.0558,
"step": 8225
},
{
"epoch": 0.6529481598733676,
"grad_norm": 159.06973266601562,
"learning_rate": 2.2279676555554088e-07,
"loss": 3.8374,
"step": 8250
},
{
"epoch": 0.6549267906608627,
"grad_norm": 120.933837890625,
"learning_rate": 2.227143215185532e-07,
"loss": 3.8412,
"step": 8275
},
{
"epoch": 0.6569054214483577,
"grad_norm": 106.266357421875,
"learning_rate": 2.226318774815655e-07,
"loss": 3.8757,
"step": 8300
},
{
"epoch": 0.6588840522358528,
"grad_norm": 138.7765655517578,
"learning_rate": 2.225494334445778e-07,
"loss": 4.2284,
"step": 8325
},
{
"epoch": 0.6608626830233478,
"grad_norm": 120.76045989990234,
"learning_rate": 2.2246698940759013e-07,
"loss": 3.9127,
"step": 8350
},
{
"epoch": 0.6628413138108429,
"grad_norm": 117.31808471679688,
"learning_rate": 2.223845453706024e-07,
"loss": 3.7577,
"step": 8375
},
{
"epoch": 0.6648199445983379,
"grad_norm": 108.21405029296875,
"learning_rate": 2.2230210133361475e-07,
"loss": 4.1095,
"step": 8400
},
{
"epoch": 0.666798575385833,
"grad_norm": 126.65251159667969,
"learning_rate": 2.2221965729662703e-07,
"loss": 3.9047,
"step": 8425
},
{
"epoch": 0.668777206173328,
"grad_norm": 135.06512451171875,
"learning_rate": 2.2213721325963935e-07,
"loss": 3.9267,
"step": 8450
},
{
"epoch": 0.6707558369608231,
"grad_norm": 150.37025451660156,
"learning_rate": 2.2205476922265166e-07,
"loss": 3.9662,
"step": 8475
},
{
"epoch": 0.6727344677483181,
"grad_norm": 138.01531982421875,
"learning_rate": 2.2197232518566397e-07,
"loss": 3.8107,
"step": 8500
},
{
"epoch": 0.6747130985358132,
"grad_norm": 130.35153198242188,
"learning_rate": 2.2188988114867625e-07,
"loss": 3.8068,
"step": 8525
},
{
"epoch": 0.6766917293233082,
"grad_norm": 161.9180145263672,
"learning_rate": 2.218074371116886e-07,
"loss": 4.0893,
"step": 8550
},
{
"epoch": 0.6786703601108033,
"grad_norm": 165.08409118652344,
"learning_rate": 2.2172499307470088e-07,
"loss": 3.8419,
"step": 8575
},
{
"epoch": 0.6806489908982983,
"grad_norm": 153.2915496826172,
"learning_rate": 2.216425490377132e-07,
"loss": 3.9302,
"step": 8600
},
{
"epoch": 0.6826276216857934,
"grad_norm": 153.20138549804688,
"learning_rate": 2.215601050007255e-07,
"loss": 3.9947,
"step": 8625
},
{
"epoch": 0.6846062524732884,
"grad_norm": 124.32341003417969,
"learning_rate": 2.2147766096373779e-07,
"loss": 3.8241,
"step": 8650
},
{
"epoch": 0.6865848832607835,
"grad_norm": 209.813232421875,
"learning_rate": 2.2139521692675012e-07,
"loss": 3.917,
"step": 8675
},
{
"epoch": 0.6885635140482786,
"grad_norm": 116.88125610351562,
"learning_rate": 2.213127728897624e-07,
"loss": 3.9474,
"step": 8700
},
{
"epoch": 0.6905421448357737,
"grad_norm": 178.58721923828125,
"learning_rate": 2.2123032885277472e-07,
"loss": 3.9247,
"step": 8725
},
{
"epoch": 0.6925207756232687,
"grad_norm": 123.67437744140625,
"learning_rate": 2.2114788481578703e-07,
"loss": 3.9548,
"step": 8750
},
{
"epoch": 0.6944994064107638,
"grad_norm": 154.60626220703125,
"learning_rate": 2.2106544077879934e-07,
"loss": 3.8239,
"step": 8775
},
{
"epoch": 0.6964780371982588,
"grad_norm": 141.65699768066406,
"learning_rate": 2.2098299674181166e-07,
"loss": 3.8458,
"step": 8800
},
{
"epoch": 0.6984566679857539,
"grad_norm": 112.9280776977539,
"learning_rate": 2.2090055270482397e-07,
"loss": 3.9648,
"step": 8825
},
{
"epoch": 0.700435298773249,
"grad_norm": 154.57643127441406,
"learning_rate": 2.2081810866783625e-07,
"loss": 4.0078,
"step": 8850
},
{
"epoch": 0.702413929560744,
"grad_norm": 129.07418823242188,
"learning_rate": 2.207356646308486e-07,
"loss": 4.0276,
"step": 8875
},
{
"epoch": 0.704392560348239,
"grad_norm": 113.59859466552734,
"learning_rate": 2.2065322059386088e-07,
"loss": 4.1596,
"step": 8900
},
{
"epoch": 0.7063711911357341,
"grad_norm": 136.26283264160156,
"learning_rate": 2.2057077655687316e-07,
"loss": 3.948,
"step": 8925
},
{
"epoch": 0.7083498219232292,
"grad_norm": 118.27870178222656,
"learning_rate": 2.204883325198855e-07,
"loss": 3.7611,
"step": 8950
},
{
"epoch": 0.7103284527107242,
"grad_norm": 159.56643676757812,
"learning_rate": 2.2040588848289778e-07,
"loss": 4.0215,
"step": 8975
},
{
"epoch": 0.7123070834982193,
"grad_norm": 125.84573364257812,
"learning_rate": 2.2032344444591012e-07,
"loss": 4.0046,
"step": 9000
},
{
"epoch": 0.7142857142857143,
"grad_norm": 148.7548065185547,
"learning_rate": 2.202410004089224e-07,
"loss": 4.0033,
"step": 9025
},
{
"epoch": 0.7162643450732094,
"grad_norm": 109.41517639160156,
"learning_rate": 2.2015855637193472e-07,
"loss": 3.9298,
"step": 9050
},
{
"epoch": 0.7182429758607044,
"grad_norm": 112.52848815917969,
"learning_rate": 2.2007611233494703e-07,
"loss": 4.1003,
"step": 9075
},
{
"epoch": 0.7202216066481995,
"grad_norm": 114.72808074951172,
"learning_rate": 2.1999366829795934e-07,
"loss": 4.1112,
"step": 9100
},
{
"epoch": 0.7222002374356945,
"grad_norm": 144.11619567871094,
"learning_rate": 2.1991122426097163e-07,
"loss": 3.764,
"step": 9125
},
{
"epoch": 0.7241788682231896,
"grad_norm": 118.64055633544922,
"learning_rate": 2.1982878022398396e-07,
"loss": 3.9262,
"step": 9150
},
{
"epoch": 0.7261574990106846,
"grad_norm": 166.79525756835938,
"learning_rate": 2.1974633618699625e-07,
"loss": 4.0518,
"step": 9175
},
{
"epoch": 0.7281361297981797,
"grad_norm": 128.2512969970703,
"learning_rate": 2.1966389215000856e-07,
"loss": 3.952,
"step": 9200
},
{
"epoch": 0.7301147605856747,
"grad_norm": 143.56414794921875,
"learning_rate": 2.1958144811302087e-07,
"loss": 3.7034,
"step": 9225
},
{
"epoch": 0.7320933913731698,
"grad_norm": 120.13394165039062,
"learning_rate": 2.1949900407603316e-07,
"loss": 3.7839,
"step": 9250
},
{
"epoch": 0.7340720221606648,
"grad_norm": 148.74070739746094,
"learning_rate": 2.194165600390455e-07,
"loss": 3.8871,
"step": 9275
},
{
"epoch": 0.7360506529481599,
"grad_norm": 148.17022705078125,
"learning_rate": 2.1933411600205778e-07,
"loss": 3.7486,
"step": 9300
},
{
"epoch": 0.7380292837356549,
"grad_norm": 112.7260513305664,
"learning_rate": 2.192516719650701e-07,
"loss": 3.9436,
"step": 9325
},
{
"epoch": 0.74000791452315,
"grad_norm": 131.4718780517578,
"learning_rate": 2.191692279280824e-07,
"loss": 4.1101,
"step": 9350
},
{
"epoch": 0.741986545310645,
"grad_norm": 106.73101043701172,
"learning_rate": 2.1908678389109472e-07,
"loss": 3.9285,
"step": 9375
},
{
"epoch": 0.7439651760981401,
"grad_norm": 120.58040618896484,
"learning_rate": 2.1900433985410703e-07,
"loss": 3.8471,
"step": 9400
},
{
"epoch": 0.7459438068856351,
"grad_norm": 135.69512939453125,
"learning_rate": 2.1892189581711934e-07,
"loss": 3.7629,
"step": 9425
},
{
"epoch": 0.7479224376731302,
"grad_norm": 125.78627014160156,
"learning_rate": 2.1883945178013162e-07,
"loss": 4.0646,
"step": 9450
},
{
"epoch": 0.7499010684606252,
"grad_norm": 150.2305145263672,
"learning_rate": 2.1875700774314396e-07,
"loss": 3.9361,
"step": 9475
},
{
"epoch": 0.7518796992481203,
"grad_norm": 95.4436264038086,
"learning_rate": 2.1867456370615625e-07,
"loss": 3.7688,
"step": 9500
},
{
"epoch": 0.7538583300356153,
"grad_norm": 141.27809143066406,
"learning_rate": 2.1859211966916853e-07,
"loss": 4.0217,
"step": 9525
},
{
"epoch": 0.7558369608231104,
"grad_norm": 133.8254852294922,
"learning_rate": 2.1850967563218087e-07,
"loss": 4.0683,
"step": 9550
},
{
"epoch": 0.7578155916106054,
"grad_norm": 139.919189453125,
"learning_rate": 2.1842723159519316e-07,
"loss": 3.9958,
"step": 9575
},
{
"epoch": 0.7597942223981005,
"grad_norm": 173.58946228027344,
"learning_rate": 2.183447875582055e-07,
"loss": 3.9474,
"step": 9600
},
{
"epoch": 0.7617728531855956,
"grad_norm": 107.07398223876953,
"learning_rate": 2.1826234352121778e-07,
"loss": 3.8308,
"step": 9625
},
{
"epoch": 0.7637514839730907,
"grad_norm": 124.00753784179688,
"learning_rate": 2.181798994842301e-07,
"loss": 3.8218,
"step": 9650
},
{
"epoch": 0.7657301147605857,
"grad_norm": 138.23736572265625,
"learning_rate": 2.180974554472424e-07,
"loss": 3.7296,
"step": 9675
},
{
"epoch": 0.7677087455480808,
"grad_norm": 128.9496612548828,
"learning_rate": 2.1801501141025471e-07,
"loss": 3.9163,
"step": 9700
},
{
"epoch": 0.7696873763355758,
"grad_norm": 108.07875061035156,
"learning_rate": 2.17932567373267e-07,
"loss": 3.9408,
"step": 9725
},
{
"epoch": 0.7716660071230709,
"grad_norm": 126.18501281738281,
"learning_rate": 2.1785012333627934e-07,
"loss": 4.1993,
"step": 9750
},
{
"epoch": 0.7736446379105659,
"grad_norm": 144.8102264404297,
"learning_rate": 2.1776767929929162e-07,
"loss": 3.877,
"step": 9775
},
{
"epoch": 0.775623268698061,
"grad_norm": 118.8504638671875,
"learning_rate": 2.1768523526230396e-07,
"loss": 3.9788,
"step": 9800
},
{
"epoch": 0.777601899485556,
"grad_norm": 127.45133209228516,
"learning_rate": 2.1760279122531625e-07,
"loss": 3.8987,
"step": 9825
},
{
"epoch": 0.7795805302730511,
"grad_norm": 134.95892333984375,
"learning_rate": 2.1752034718832853e-07,
"loss": 3.8251,
"step": 9850
},
{
"epoch": 0.7815591610605461,
"grad_norm": 124.00614929199219,
"learning_rate": 2.1743790315134087e-07,
"loss": 3.6875,
"step": 9875
},
{
"epoch": 0.7835377918480412,
"grad_norm": 126.81105041503906,
"learning_rate": 2.1735545911435315e-07,
"loss": 3.7447,
"step": 9900
},
{
"epoch": 0.7855164226355362,
"grad_norm": 106.54443359375,
"learning_rate": 2.172730150773655e-07,
"loss": 3.9051,
"step": 9925
},
{
"epoch": 0.7874950534230313,
"grad_norm": 156.5098876953125,
"learning_rate": 2.1719057104037778e-07,
"loss": 3.9587,
"step": 9950
},
{
"epoch": 0.7894736842105263,
"grad_norm": 128.83648681640625,
"learning_rate": 2.171081270033901e-07,
"loss": 4.0755,
"step": 9975
},
{
"epoch": 0.7914523149980214,
"grad_norm": 131.54664611816406,
"learning_rate": 2.170256829664024e-07,
"loss": 4.0072,
"step": 10000
},
{
"epoch": 0.7914523149980214,
"eval_loss": 3.8681728839874268,
"eval_runtime": 9.5698,
"eval_samples_per_second": 264.165,
"eval_steps_per_second": 33.021,
"step": 10000
}
],
"logging_steps": 25,
"max_steps": 75810,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 35767296000000.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}