{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9998197093715069,
"eval_steps": 500,
"global_step": 3466,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014423250279450475,
"grad_norm": 23.09968734754774,
"learning_rate": 2.3054755043227666e-07,
"loss": 12.1657,
"step": 5
},
{
"epoch": 0.002884650055890095,
"grad_norm": 23.350567085111635,
"learning_rate": 5.187319884726226e-07,
"loss": 12.1499,
"step": 10
},
{
"epoch": 0.004326975083835142,
"grad_norm": 22.840877913954497,
"learning_rate": 8.069164265129684e-07,
"loss": 12.0857,
"step": 15
},
{
"epoch": 0.00576930011178019,
"grad_norm": 21.40321138460624,
"learning_rate": 1.0951008645533142e-06,
"loss": 11.8028,
"step": 20
},
{
"epoch": 0.007211625139725237,
"grad_norm": 18.192353108517974,
"learning_rate": 1.3832853025936602e-06,
"loss": 11.3384,
"step": 25
},
{
"epoch": 0.008653950167670284,
"grad_norm": 18.559232783911973,
"learning_rate": 1.6714697406340058e-06,
"loss": 10.3127,
"step": 30
},
{
"epoch": 0.010096275195615331,
"grad_norm": 37.79150391064707,
"learning_rate": 1.959654178674352e-06,
"loss": 9.0664,
"step": 35
},
{
"epoch": 0.01153860022356038,
"grad_norm": 33.772043740311254,
"learning_rate": 2.247838616714698e-06,
"loss": 7.409,
"step": 40
},
{
"epoch": 0.012980925251505427,
"grad_norm": 23.04632172544007,
"learning_rate": 2.5360230547550434e-06,
"loss": 6.3338,
"step": 45
},
{
"epoch": 0.014423250279450473,
"grad_norm": 25.32559722397877,
"learning_rate": 2.8242074927953894e-06,
"loss": 4.4908,
"step": 50
},
{
"epoch": 0.015865575307395522,
"grad_norm": 9.143968031022688,
"learning_rate": 3.1123919308357354e-06,
"loss": 3.2978,
"step": 55
},
{
"epoch": 0.01730790033534057,
"grad_norm": 2.3359297684099745,
"learning_rate": 3.400576368876081e-06,
"loss": 2.6887,
"step": 60
},
{
"epoch": 0.018750225363285616,
"grad_norm": 1.5235792893524585,
"learning_rate": 3.6887608069164266e-06,
"loss": 2.6051,
"step": 65
},
{
"epoch": 0.020192550391230663,
"grad_norm": 1.6452371227737381,
"learning_rate": 3.976945244956772e-06,
"loss": 2.5288,
"step": 70
},
{
"epoch": 0.021634875419175713,
"grad_norm": 2.3877151673363133,
"learning_rate": 4.265129682997119e-06,
"loss": 2.4368,
"step": 75
},
{
"epoch": 0.02307720044712076,
"grad_norm": 3.5448230000902283,
"learning_rate": 4.553314121037464e-06,
"loss": 2.2394,
"step": 80
},
{
"epoch": 0.024519525475065806,
"grad_norm": 3.998099329525319,
"learning_rate": 4.84149855907781e-06,
"loss": 2.0687,
"step": 85
},
{
"epoch": 0.025961850503010853,
"grad_norm": 5.3900301279889025,
"learning_rate": 5.129682997118156e-06,
"loss": 2.0427,
"step": 90
},
{
"epoch": 0.0274041755309559,
"grad_norm": 7.2317244995568215,
"learning_rate": 5.417867435158502e-06,
"loss": 1.9167,
"step": 95
},
{
"epoch": 0.028846500558900947,
"grad_norm": 4.190169407947923,
"learning_rate": 5.706051873198848e-06,
"loss": 1.8528,
"step": 100
},
{
"epoch": 0.030288825586845997,
"grad_norm": 5.165106554451897,
"learning_rate": 5.994236311239193e-06,
"loss": 1.8751,
"step": 105
},
{
"epoch": 0.031731150614791044,
"grad_norm": 3.2421300129897426,
"learning_rate": 6.2824207492795395e-06,
"loss": 1.7973,
"step": 110
},
{
"epoch": 0.03317347564273609,
"grad_norm": 4.460292781455887,
"learning_rate": 6.570605187319885e-06,
"loss": 1.6292,
"step": 115
},
{
"epoch": 0.03461580067068114,
"grad_norm": 4.913131259117871,
"learning_rate": 6.8587896253602315e-06,
"loss": 1.655,
"step": 120
},
{
"epoch": 0.03605812569862619,
"grad_norm": 4.1881116653103945,
"learning_rate": 7.146974063400577e-06,
"loss": 1.664,
"step": 125
},
{
"epoch": 0.03750045072657123,
"grad_norm": 5.723431293294362,
"learning_rate": 7.4351585014409235e-06,
"loss": 1.6202,
"step": 130
},
{
"epoch": 0.03894277575451628,
"grad_norm": 4.909602119186479,
"learning_rate": 7.723342939481268e-06,
"loss": 1.5486,
"step": 135
},
{
"epoch": 0.040385100782461325,
"grad_norm": 5.928676345818394,
"learning_rate": 8.011527377521614e-06,
"loss": 1.4965,
"step": 140
},
{
"epoch": 0.041827425810406375,
"grad_norm": 5.5830317263384845,
"learning_rate": 8.299711815561961e-06,
"loss": 1.4195,
"step": 145
},
{
"epoch": 0.043269750838351426,
"grad_norm": 5.587820490379444,
"learning_rate": 8.587896253602305e-06,
"loss": 1.3894,
"step": 150
},
{
"epoch": 0.04471207586629647,
"grad_norm": 3.5851612990900836,
"learning_rate": 8.876080691642652e-06,
"loss": 1.4654,
"step": 155
},
{
"epoch": 0.04615440089424152,
"grad_norm": 4.792344497245253,
"learning_rate": 9.164265129682998e-06,
"loss": 1.3801,
"step": 160
},
{
"epoch": 0.04759672592218656,
"grad_norm": 3.5644574463856387,
"learning_rate": 9.452449567723344e-06,
"loss": 1.3527,
"step": 165
},
{
"epoch": 0.04903905095013161,
"grad_norm": 4.245088356022904,
"learning_rate": 9.740634005763689e-06,
"loss": 1.3465,
"step": 170
},
{
"epoch": 0.050481375978076656,
"grad_norm": 4.623244884122231,
"learning_rate": 1.0028818443804036e-05,
"loss": 1.3647,
"step": 175
},
{
"epoch": 0.05192370100602171,
"grad_norm": 3.5591972450196043,
"learning_rate": 1.031700288184438e-05,
"loss": 1.261,
"step": 180
},
{
"epoch": 0.05336602603396676,
"grad_norm": 3.6288737317693243,
"learning_rate": 1.0605187319884726e-05,
"loss": 1.2178,
"step": 185
},
{
"epoch": 0.0548083510619118,
"grad_norm": 5.472679192029011,
"learning_rate": 1.0893371757925073e-05,
"loss": 1.2372,
"step": 190
},
{
"epoch": 0.05625067608985685,
"grad_norm": 2.987171924181164,
"learning_rate": 1.1181556195965419e-05,
"loss": 1.1878,
"step": 195
},
{
"epoch": 0.057693001117801894,
"grad_norm": 3.633711033064426,
"learning_rate": 1.1469740634005764e-05,
"loss": 1.1895,
"step": 200
},
{
"epoch": 0.059135326145746944,
"grad_norm": 3.9402926571067978,
"learning_rate": 1.175792507204611e-05,
"loss": 1.1368,
"step": 205
},
{
"epoch": 0.060577651173691995,
"grad_norm": 3.527134311033913,
"learning_rate": 1.2046109510086457e-05,
"loss": 1.1306,
"step": 210
},
{
"epoch": 0.06201997620163704,
"grad_norm": 3.679407663475352,
"learning_rate": 1.2334293948126803e-05,
"loss": 1.0846,
"step": 215
},
{
"epoch": 0.06346230122958209,
"grad_norm": 3.1104059182965047,
"learning_rate": 1.2622478386167147e-05,
"loss": 1.1201,
"step": 220
},
{
"epoch": 0.06490462625752713,
"grad_norm": 4.203869282005421,
"learning_rate": 1.2910662824207494e-05,
"loss": 1.0694,
"step": 225
},
{
"epoch": 0.06634695128547217,
"grad_norm": 3.936128919901792,
"learning_rate": 1.319884726224784e-05,
"loss": 1.0191,
"step": 230
},
{
"epoch": 0.06778927631341723,
"grad_norm": 2.2362445033305804,
"learning_rate": 1.3487031700288185e-05,
"loss": 0.9774,
"step": 235
},
{
"epoch": 0.06923160134136228,
"grad_norm": 2.757438827888907,
"learning_rate": 1.377521613832853e-05,
"loss": 1.0124,
"step": 240
},
{
"epoch": 0.07067392636930732,
"grad_norm": 3.4599226565163783,
"learning_rate": 1.4063400576368878e-05,
"loss": 0.9295,
"step": 245
},
{
"epoch": 0.07211625139725238,
"grad_norm": 2.0262096895794963,
"learning_rate": 1.4351585014409224e-05,
"loss": 0.9118,
"step": 250
},
{
"epoch": 0.07355857642519742,
"grad_norm": 2.487400868386021,
"learning_rate": 1.4639769452449568e-05,
"loss": 0.9409,
"step": 255
},
{
"epoch": 0.07500090145314246,
"grad_norm": 1.9303088742335475,
"learning_rate": 1.4927953890489915e-05,
"loss": 0.9211,
"step": 260
},
{
"epoch": 0.0764432264810875,
"grad_norm": 2.175412817851971,
"learning_rate": 1.521613832853026e-05,
"loss": 0.9168,
"step": 265
},
{
"epoch": 0.07788555150903256,
"grad_norm": 2.5796504124225033,
"learning_rate": 1.5504322766570608e-05,
"loss": 0.9527,
"step": 270
},
{
"epoch": 0.0793278765369776,
"grad_norm": 1.9788435183920994,
"learning_rate": 1.5792507204610953e-05,
"loss": 0.8426,
"step": 275
},
{
"epoch": 0.08077020156492265,
"grad_norm": 2.003074548053739,
"learning_rate": 1.60806916426513e-05,
"loss": 0.8527,
"step": 280
},
{
"epoch": 0.08221252659286771,
"grad_norm": 2.1994335722383602,
"learning_rate": 1.6368876080691644e-05,
"loss": 0.8072,
"step": 285
},
{
"epoch": 0.08365485162081275,
"grad_norm": 1.726445070641134,
"learning_rate": 1.665706051873199e-05,
"loss": 0.8163,
"step": 290
},
{
"epoch": 0.0850971766487578,
"grad_norm": 2.350691118327581,
"learning_rate": 1.6945244956772336e-05,
"loss": 0.7651,
"step": 295
},
{
"epoch": 0.08653950167670285,
"grad_norm": 2.6639655167915115,
"learning_rate": 1.723342939481268e-05,
"loss": 0.7535,
"step": 300
},
{
"epoch": 0.0879818267046479,
"grad_norm": 1.3919563172463725,
"learning_rate": 1.7521613832853027e-05,
"loss": 0.785,
"step": 305
},
{
"epoch": 0.08942415173259294,
"grad_norm": 1.2944766289360783,
"learning_rate": 1.7809798270893372e-05,
"loss": 0.7111,
"step": 310
},
{
"epoch": 0.09086647676053798,
"grad_norm": 1.4798988266070112,
"learning_rate": 1.8097982708933718e-05,
"loss": 0.7293,
"step": 315
},
{
"epoch": 0.09230880178848304,
"grad_norm": 1.1830162313483426,
"learning_rate": 1.8386167146974067e-05,
"loss": 0.7231,
"step": 320
},
{
"epoch": 0.09375112681642808,
"grad_norm": 1.5568610974134778,
"learning_rate": 1.867435158501441e-05,
"loss": 0.7445,
"step": 325
},
{
"epoch": 0.09519345184437313,
"grad_norm": 1.1492164494899182,
"learning_rate": 1.8962536023054755e-05,
"loss": 0.6959,
"step": 330
},
{
"epoch": 0.09663577687231818,
"grad_norm": 1.0978857201097723,
"learning_rate": 1.9250720461095104e-05,
"loss": 0.7057,
"step": 335
},
{
"epoch": 0.09807810190026323,
"grad_norm": 1.0096489653703298,
"learning_rate": 1.953890489913545e-05,
"loss": 0.6772,
"step": 340
},
{
"epoch": 0.09952042692820827,
"grad_norm": 1.1232844613521993,
"learning_rate": 1.9827089337175795e-05,
"loss": 0.7246,
"step": 345
},
{
"epoch": 0.10096275195615331,
"grad_norm": 1.02243795388932,
"learning_rate": 1.9999979709215212e-05,
"loss": 0.7024,
"step": 350
},
{
"epoch": 0.10240507698409837,
"grad_norm": 1.1367801539352143,
"learning_rate": 1.9999751438831965e-05,
"loss": 0.6489,
"step": 355
},
{
"epoch": 0.10384740201204341,
"grad_norm": 1.1572043181625398,
"learning_rate": 1.9999269540393507e-05,
"loss": 0.6489,
"step": 360
},
{
"epoch": 0.10528972703998846,
"grad_norm": 1.0269240416486167,
"learning_rate": 1.9998534026122433e-05,
"loss": 0.6782,
"step": 365
},
{
"epoch": 0.10673205206793351,
"grad_norm": 0.9511160065038861,
"learning_rate": 1.9997544914673915e-05,
"loss": 0.6312,
"step": 370
},
{
"epoch": 0.10817437709587856,
"grad_norm": 1.1374311508874984,
"learning_rate": 1.999630223113522e-05,
"loss": 0.6628,
"step": 375
},
{
"epoch": 0.1096167021238236,
"grad_norm": 1.450941328478541,
"learning_rate": 1.9994806007025068e-05,
"loss": 0.6389,
"step": 380
},
{
"epoch": 0.11105902715176866,
"grad_norm": 0.8046806001901237,
"learning_rate": 1.9993056280292845e-05,
"loss": 0.6482,
"step": 385
},
{
"epoch": 0.1125013521797137,
"grad_norm": 0.8216403494158578,
"learning_rate": 1.999105309531763e-05,
"loss": 0.6078,
"step": 390
},
{
"epoch": 0.11394367720765874,
"grad_norm": 0.8600864577290717,
"learning_rate": 1.9988796502907083e-05,
"loss": 0.63,
"step": 395
},
{
"epoch": 0.11538600223560379,
"grad_norm": 0.798579467879802,
"learning_rate": 1.9986286560296134e-05,
"loss": 0.6109,
"step": 400
},
{
"epoch": 0.11682832726354885,
"grad_norm": 0.7668970837973854,
"learning_rate": 1.998352333114556e-05,
"loss": 0.5857,
"step": 405
},
{
"epoch": 0.11827065229149389,
"grad_norm": 1.0143366745206854,
"learning_rate": 1.998050688554034e-05,
"loss": 0.6176,
"step": 410
},
{
"epoch": 0.11971297731943893,
"grad_norm": 0.7114180483975799,
"learning_rate": 1.9977237299987903e-05,
"loss": 0.62,
"step": 415
},
{
"epoch": 0.12115530234738399,
"grad_norm": 0.8179413343809848,
"learning_rate": 1.997371465741617e-05,
"loss": 0.6205,
"step": 420
},
{
"epoch": 0.12259762737532903,
"grad_norm": 0.6435940720725398,
"learning_rate": 1.996993904717146e-05,
"loss": 0.5878,
"step": 425
},
{
"epoch": 0.12403995240327408,
"grad_norm": 0.9102246188273324,
"learning_rate": 1.9965910565016223e-05,
"loss": 0.6021,
"step": 430
},
{
"epoch": 0.12548227743121912,
"grad_norm": 0.6153476600060466,
"learning_rate": 1.9961629313126608e-05,
"loss": 0.5674,
"step": 435
},
{
"epoch": 0.12692460245916418,
"grad_norm": 0.5823753109992822,
"learning_rate": 1.9957095400089875e-05,
"loss": 0.5819,
"step": 440
},
{
"epoch": 0.12836692748710923,
"grad_norm": 0.6280650049871973,
"learning_rate": 1.9952308940901634e-05,
"loss": 0.6357,
"step": 445
},
{
"epoch": 0.12980925251505426,
"grad_norm": 1.12163730124818,
"learning_rate": 1.9947270056962934e-05,
"loss": 0.5659,
"step": 450
},
{
"epoch": 0.13125157754299932,
"grad_norm": 0.8453741002711367,
"learning_rate": 1.994197887607719e-05,
"loss": 0.5423,
"step": 455
},
{
"epoch": 0.13269390257094435,
"grad_norm": 0.6945577095672939,
"learning_rate": 1.993643553244693e-05,
"loss": 0.6118,
"step": 460
},
{
"epoch": 0.1341362275988894,
"grad_norm": 0.6080087347638511,
"learning_rate": 1.993064016667039e-05,
"loss": 0.5912,
"step": 465
},
{
"epoch": 0.13557855262683446,
"grad_norm": 0.5072027520003524,
"learning_rate": 1.992459292573796e-05,
"loss": 0.6086,
"step": 470
},
{
"epoch": 0.1370208776547795,
"grad_norm": 0.5194397753829619,
"learning_rate": 1.991829396302845e-05,
"loss": 0.5554,
"step": 475
},
{
"epoch": 0.13846320268272455,
"grad_norm": 0.6531400636419847,
"learning_rate": 1.9911743438305203e-05,
"loss": 0.5738,
"step": 480
},
{
"epoch": 0.1399055277106696,
"grad_norm": 0.8007993447245763,
"learning_rate": 1.990494151771202e-05,
"loss": 0.5698,
"step": 485
},
{
"epoch": 0.14134785273861464,
"grad_norm": 0.7192330669398362,
"learning_rate": 1.989788837376899e-05,
"loss": 0.5629,
"step": 490
},
{
"epoch": 0.1427901777665597,
"grad_norm": 0.688440868686088,
"learning_rate": 1.989058418536807e-05,
"loss": 0.5734,
"step": 495
},
{
"epoch": 0.14423250279450475,
"grad_norm": 1.001172764554856,
"learning_rate": 1.988302913776858e-05,
"loss": 0.5745,
"step": 500
},
{
"epoch": 0.14423250279450475,
"eval_loss": 0.568706750869751,
"eval_runtime": 161.3667,
"eval_samples_per_second": 11.161,
"eval_steps_per_second": 2.795,
"step": 500
},
{
"epoch": 0.14567482782244978,
"grad_norm": 1.0515733209433527,
"learning_rate": 1.9875223422592485e-05,
"loss": 0.5704,
"step": 505
},
{
"epoch": 0.14711715285039484,
"grad_norm": 1.0276945765068186,
"learning_rate": 1.986716723781954e-05,
"loss": 0.6123,
"step": 510
},
{
"epoch": 0.1485594778783399,
"grad_norm": 0.8043743845845657,
"learning_rate": 1.985886078778227e-05,
"loss": 0.5437,
"step": 515
},
{
"epoch": 0.15000180290628493,
"grad_norm": 0.6535595881064415,
"learning_rate": 1.9850304283160793e-05,
"loss": 0.5527,
"step": 520
},
{
"epoch": 0.15144412793422998,
"grad_norm": 0.7357564272936004,
"learning_rate": 1.9841497940977464e-05,
"loss": 0.5432,
"step": 525
},
{
"epoch": 0.152886452962175,
"grad_norm": 0.7287222676647807,
"learning_rate": 1.983244198459138e-05,
"loss": 0.5811,
"step": 530
},
{
"epoch": 0.15432877799012007,
"grad_norm": 0.5697752505815841,
"learning_rate": 1.982313664369271e-05,
"loss": 0.5627,
"step": 535
},
{
"epoch": 0.15577110301806513,
"grad_norm": 0.5170616797914624,
"learning_rate": 1.981358215429687e-05,
"loss": 0.5592,
"step": 540
},
{
"epoch": 0.15721342804601016,
"grad_norm": 0.619913426569597,
"learning_rate": 1.9803778758738543e-05,
"loss": 0.5435,
"step": 545
},
{
"epoch": 0.1586557530739552,
"grad_norm": 0.9727823301261521,
"learning_rate": 1.9793726705665524e-05,
"loss": 0.5889,
"step": 550
},
{
"epoch": 0.16009807810190027,
"grad_norm": 0.6044688838902901,
"learning_rate": 1.9783426250032412e-05,
"loss": 0.5678,
"step": 555
},
{
"epoch": 0.1615404031298453,
"grad_norm": 0.46024598144245266,
"learning_rate": 1.9772877653094165e-05,
"loss": 0.5639,
"step": 560
},
{
"epoch": 0.16298272815779036,
"grad_norm": 0.45100341602786603,
"learning_rate": 1.9762081182399434e-05,
"loss": 0.5717,
"step": 565
},
{
"epoch": 0.16442505318573541,
"grad_norm": 0.5540308655652189,
"learning_rate": 1.9751037111783818e-05,
"loss": 0.5623,
"step": 570
},
{
"epoch": 0.16586737821368044,
"grad_norm": 0.43976603899998645,
"learning_rate": 1.9739745721362897e-05,
"loss": 0.5319,
"step": 575
},
{
"epoch": 0.1673097032416255,
"grad_norm": 0.4612500025708451,
"learning_rate": 1.9728207297525125e-05,
"loss": 0.5653,
"step": 580
},
{
"epoch": 0.16875202826957056,
"grad_norm": 0.5752333041985558,
"learning_rate": 1.9716422132924572e-05,
"loss": 0.567,
"step": 585
},
{
"epoch": 0.1701943532975156,
"grad_norm": 0.5369943570453672,
"learning_rate": 1.9704390526473515e-05,
"loss": 0.5609,
"step": 590
},
{
"epoch": 0.17163667832546065,
"grad_norm": 0.5164720235053389,
"learning_rate": 1.9692112783334826e-05,
"loss": 0.5415,
"step": 595
},
{
"epoch": 0.1730790033534057,
"grad_norm": 0.7665382521888024,
"learning_rate": 1.967958921491426e-05,
"loss": 0.5671,
"step": 600
},
{
"epoch": 0.17452132838135073,
"grad_norm": 0.6256340257615823,
"learning_rate": 1.966682013885255e-05,
"loss": 0.5533,
"step": 605
},
{
"epoch": 0.1759636534092958,
"grad_norm": 0.4893424331522886,
"learning_rate": 1.9653805879017323e-05,
"loss": 0.5589,
"step": 610
},
{
"epoch": 0.17740597843724082,
"grad_norm": 0.4930248858437027,
"learning_rate": 1.964054676549494e-05,
"loss": 0.5418,
"step": 615
},
{
"epoch": 0.17884830346518588,
"grad_norm": 0.45814407628412845,
"learning_rate": 1.9627043134582068e-05,
"loss": 0.5195,
"step": 620
},
{
"epoch": 0.18029062849313093,
"grad_norm": 0.5315704703868885,
"learning_rate": 1.9613295328777187e-05,
"loss": 0.5095,
"step": 625
},
{
"epoch": 0.18173295352107596,
"grad_norm": 0.43146076740416167,
"learning_rate": 1.959930369677189e-05,
"loss": 0.4929,
"step": 630
},
{
"epoch": 0.18317527854902102,
"grad_norm": 0.4627882494650573,
"learning_rate": 1.958506859344204e-05,
"loss": 0.5141,
"step": 635
},
{
"epoch": 0.18461760357696608,
"grad_norm": 0.621672972720691,
"learning_rate": 1.9570590379838767e-05,
"loss": 0.5486,
"step": 640
},
{
"epoch": 0.1860599286049111,
"grad_norm": 0.5063460018719447,
"learning_rate": 1.9555869423179316e-05,
"loss": 0.5497,
"step": 645
},
{
"epoch": 0.18750225363285616,
"grad_norm": 0.48895947210824475,
"learning_rate": 1.9540906096837727e-05,
"loss": 0.5465,
"step": 650
},
{
"epoch": 0.18894457866080122,
"grad_norm": 0.47357663586358684,
"learning_rate": 1.9525700780335372e-05,
"loss": 0.529,
"step": 655
},
{
"epoch": 0.19038690368874625,
"grad_norm": 0.43786638884850015,
"learning_rate": 1.951025385933132e-05,
"loss": 0.522,
"step": 660
},
{
"epoch": 0.1918292287166913,
"grad_norm": 0.5828551791972233,
"learning_rate": 1.9494565725612565e-05,
"loss": 0.5334,
"step": 665
},
{
"epoch": 0.19327155374463637,
"grad_norm": 0.4669699168406431,
"learning_rate": 1.9478636777084077e-05,
"loss": 0.4846,
"step": 670
},
{
"epoch": 0.1947138787725814,
"grad_norm": 0.5626195687859905,
"learning_rate": 1.946246741775873e-05,
"loss": 0.556,
"step": 675
},
{
"epoch": 0.19615620380052645,
"grad_norm": 0.5482755680769119,
"learning_rate": 1.9446058057747025e-05,
"loss": 0.4561,
"step": 680
},
{
"epoch": 0.1975985288284715,
"grad_norm": 0.4878018831010534,
"learning_rate": 1.9429409113246715e-05,
"loss": 0.526,
"step": 685
},
{
"epoch": 0.19904085385641654,
"grad_norm": 0.7436357434374212,
"learning_rate": 1.9412521006532245e-05,
"loss": 0.5088,
"step": 690
},
{
"epoch": 0.2004831788843616,
"grad_norm": 0.45530676409796045,
"learning_rate": 1.939539416594402e-05,
"loss": 0.5214,
"step": 695
},
{
"epoch": 0.20192550391230663,
"grad_norm": 0.6302948823981896,
"learning_rate": 1.937802902587757e-05,
"loss": 0.5591,
"step": 700
},
{
"epoch": 0.20336782894025168,
"grad_norm": 0.4921513503843826,
"learning_rate": 1.936042602677251e-05,
"loss": 0.5288,
"step": 705
},
{
"epoch": 0.20481015396819674,
"grad_norm": 0.5421091687931597,
"learning_rate": 1.934258561510138e-05,
"loss": 0.5151,
"step": 710
},
{
"epoch": 0.20625247899614177,
"grad_norm": 0.7576428493111558,
"learning_rate": 1.932450824335832e-05,
"loss": 0.477,
"step": 715
},
{
"epoch": 0.20769480402408683,
"grad_norm": 0.424961853700426,
"learning_rate": 1.9306194370047592e-05,
"loss": 0.5342,
"step": 720
},
{
"epoch": 0.20913712905203188,
"grad_norm": 0.49906945581307455,
"learning_rate": 1.9287644459671948e-05,
"loss": 0.5334,
"step": 725
},
{
"epoch": 0.2105794540799769,
"grad_norm": 0.46177937508565325,
"learning_rate": 1.926885898272085e-05,
"loss": 0.4989,
"step": 730
},
{
"epoch": 0.21202177910792197,
"grad_norm": 0.4920606306275181,
"learning_rate": 1.9249838415658543e-05,
"loss": 0.5448,
"step": 735
},
{
"epoch": 0.21346410413586703,
"grad_norm": 0.4191101613829332,
"learning_rate": 1.9230583240911954e-05,
"loss": 0.4694,
"step": 740
},
{
"epoch": 0.21490642916381206,
"grad_norm": 0.48817506876963557,
"learning_rate": 1.9211093946858484e-05,
"loss": 0.5173,
"step": 745
},
{
"epoch": 0.21634875419175711,
"grad_norm": 0.5126984233381934,
"learning_rate": 1.919137102781359e-05,
"loss": 0.5074,
"step": 750
},
{
"epoch": 0.21779107921970217,
"grad_norm": 0.5334260917924061,
"learning_rate": 1.9171414984018266e-05,
"loss": 0.4917,
"step": 755
},
{
"epoch": 0.2192334042476472,
"grad_norm": 0.5501541841297073,
"learning_rate": 1.915122632162635e-05,
"loss": 0.5152,
"step": 760
},
{
"epoch": 0.22067572927559226,
"grad_norm": 0.4359723210170646,
"learning_rate": 1.913080555269169e-05,
"loss": 0.5215,
"step": 765
},
{
"epoch": 0.22211805430353732,
"grad_norm": 0.5662077360043514,
"learning_rate": 1.911015319515515e-05,
"loss": 0.5253,
"step": 770
},
{
"epoch": 0.22356037933148235,
"grad_norm": 0.4764077159702808,
"learning_rate": 1.908926977283148e-05,
"loss": 0.5066,
"step": 775
},
{
"epoch": 0.2250027043594274,
"grad_norm": 0.5639009005172965,
"learning_rate": 1.9068155815396018e-05,
"loss": 0.474,
"step": 780
},
{
"epoch": 0.22644502938737243,
"grad_norm": 0.6776509031874417,
"learning_rate": 1.904681185837128e-05,
"loss": 0.5025,
"step": 785
},
{
"epoch": 0.2278873544153175,
"grad_norm": 0.3940863617407268,
"learning_rate": 1.9025238443113346e-05,
"loss": 0.4781,
"step": 790
},
{
"epoch": 0.22932967944326255,
"grad_norm": 0.5731371374463607,
"learning_rate": 1.9003436116798156e-05,
"loss": 0.5325,
"step": 795
},
{
"epoch": 0.23077200447120758,
"grad_norm": 0.44630504407580995,
"learning_rate": 1.898140543240762e-05,
"loss": 0.5094,
"step": 800
},
{
"epoch": 0.23221432949915263,
"grad_norm": 0.5013841323056458,
"learning_rate": 1.8959146948715582e-05,
"loss": 0.5123,
"step": 805
},
{
"epoch": 0.2336566545270977,
"grad_norm": 0.6517172353158069,
"learning_rate": 1.8936661230273677e-05,
"loss": 0.4944,
"step": 810
},
{
"epoch": 0.23509897955504272,
"grad_norm": 0.5321704297258375,
"learning_rate": 1.8913948847396978e-05,
"loss": 0.5111,
"step": 815
},
{
"epoch": 0.23654130458298778,
"grad_norm": 0.5733385459091142,
"learning_rate": 1.8891010376149554e-05,
"loss": 0.5255,
"step": 820
},
{
"epoch": 0.23798362961093283,
"grad_norm": 0.6439828549708082,
"learning_rate": 1.8867846398329856e-05,
"loss": 0.5224,
"step": 825
},
{
"epoch": 0.23942595463887786,
"grad_norm": 0.526933741666615,
"learning_rate": 1.884445750145595e-05,
"loss": 0.4987,
"step": 830
},
{
"epoch": 0.24086827966682292,
"grad_norm": 0.4358091890203275,
"learning_rate": 1.882084427875062e-05,
"loss": 0.5151,
"step": 835
},
{
"epoch": 0.24231060469476798,
"grad_norm": 0.42052312366605993,
"learning_rate": 1.8797007329126336e-05,
"loss": 0.5292,
"step": 840
},
{
"epoch": 0.243752929722713,
"grad_norm": 0.5162254671712243,
"learning_rate": 1.8772947257170034e-05,
"loss": 0.4701,
"step": 845
},
{
"epoch": 0.24519525475065806,
"grad_norm": 0.41421320556868774,
"learning_rate": 1.8748664673127814e-05,
"loss": 0.4869,
"step": 850
},
{
"epoch": 0.2466375797786031,
"grad_norm": 0.44489422959937447,
"learning_rate": 1.872416019288944e-05,
"loss": 0.5107,
"step": 855
},
{
"epoch": 0.24807990480654815,
"grad_norm": 0.5131502882549939,
"learning_rate": 1.8699434437972726e-05,
"loss": 0.5002,
"step": 860
},
{
"epoch": 0.2495222298344932,
"grad_norm": 0.4410628046298298,
"learning_rate": 1.8674488035507776e-05,
"loss": 0.5033,
"step": 865
},
{
"epoch": 0.25096455486243824,
"grad_norm": 0.424822720640458,
"learning_rate": 1.864932161822107e-05,
"loss": 0.459,
"step": 870
},
{
"epoch": 0.2524068798903833,
"grad_norm": 0.546763650924181,
"learning_rate": 1.8623935824419416e-05,
"loss": 0.4782,
"step": 875
},
{
"epoch": 0.25384920491832835,
"grad_norm": 0.571446149303962,
"learning_rate": 1.859833129797378e-05,
"loss": 0.4971,
"step": 880
},
{
"epoch": 0.2552915299462734,
"grad_norm": 0.3881051890411508,
"learning_rate": 1.857250868830292e-05,
"loss": 0.4645,
"step": 885
},
{
"epoch": 0.25673385497421847,
"grad_norm": 0.4365270093969844,
"learning_rate": 1.8546468650356947e-05,
"loss": 0.4999,
"step": 890
},
{
"epoch": 0.25817618000216347,
"grad_norm": 0.39922925876114046,
"learning_rate": 1.852021184460069e-05,
"loss": 0.4607,
"step": 895
},
{
"epoch": 0.2596185050301085,
"grad_norm": 0.4385372209974039,
"learning_rate": 1.849373893699697e-05,
"loss": 0.5032,
"step": 900
},
{
"epoch": 0.2610608300580536,
"grad_norm": 0.4289486219739114,
"learning_rate": 1.8467050598989677e-05,
"loss": 0.5003,
"step": 905
},
{
"epoch": 0.26250315508599864,
"grad_norm": 0.4045886984758963,
"learning_rate": 1.8440147507486765e-05,
"loss": 0.4644,
"step": 910
},
{
"epoch": 0.2639454801139437,
"grad_norm": 0.43637212820672877,
"learning_rate": 1.8413030344843064e-05,
"loss": 0.5057,
"step": 915
},
{
"epoch": 0.2653878051418887,
"grad_norm": 0.468355616591299,
"learning_rate": 1.838569979884301e-05,
"loss": 0.4967,
"step": 920
},
{
"epoch": 0.26683013016983376,
"grad_norm": 0.4257178939942325,
"learning_rate": 1.835815656268314e-05,
"loss": 0.4848,
"step": 925
},
{
"epoch": 0.2682724551977788,
"grad_norm": 0.6504232751090008,
"learning_rate": 1.8330401334954567e-05,
"loss": 0.4958,
"step": 930
},
{
"epoch": 0.26971478022572387,
"grad_norm": 0.4492644770064815,
"learning_rate": 1.8302434819625234e-05,
"loss": 0.4868,
"step": 935
},
{
"epoch": 0.27115710525366893,
"grad_norm": 0.37095796426726924,
"learning_rate": 1.8274257726022054e-05,
"loss": 0.4472,
"step": 940
},
{
"epoch": 0.272599430281614,
"grad_norm": 0.4070852473871566,
"learning_rate": 1.824587076881294e-05,
"loss": 0.4686,
"step": 945
},
{
"epoch": 0.274041755309559,
"grad_norm": 0.44023807834971757,
"learning_rate": 1.821727466798867e-05,
"loss": 0.471,
"step": 950
},
{
"epoch": 0.27548408033750404,
"grad_norm": 0.5209872184391927,
"learning_rate": 1.8188470148844602e-05,
"loss": 0.4962,
"step": 955
},
{
"epoch": 0.2769264053654491,
"grad_norm": 0.41685090109899176,
"learning_rate": 1.8159457941962325e-05,
"loss": 0.475,
"step": 960
},
{
"epoch": 0.27836873039339416,
"grad_norm": 0.5171250899115861,
"learning_rate": 1.8130238783191087e-05,
"loss": 0.5163,
"step": 965
},
{
"epoch": 0.2798110554213392,
"grad_norm": 0.47139497814149867,
"learning_rate": 1.810081341362915e-05,
"loss": 0.4641,
"step": 970
},
{
"epoch": 0.2812533804492843,
"grad_norm": 0.3879518437836758,
"learning_rate": 1.8071182579604986e-05,
"loss": 0.4777,
"step": 975
},
{
"epoch": 0.2826957054772293,
"grad_norm": 0.455341690737865,
"learning_rate": 1.804134703265836e-05,
"loss": 0.5271,
"step": 980
},
{
"epoch": 0.28413803050517433,
"grad_norm": 0.39108612071221016,
"learning_rate": 1.8011307529521255e-05,
"loss": 0.4645,
"step": 985
},
{
"epoch": 0.2855803555331194,
"grad_norm": 0.3865948965496386,
"learning_rate": 1.7981064832098687e-05,
"loss": 0.4578,
"step": 990
},
{
"epoch": 0.28702268056106445,
"grad_norm": 0.40375523747783393,
"learning_rate": 1.7950619707449374e-05,
"loss": 0.4923,
"step": 995
},
{
"epoch": 0.2884650055890095,
"grad_norm": 0.3376017909117174,
"learning_rate": 1.7919972927766288e-05,
"loss": 0.4658,
"step": 1000
},
{
"epoch": 0.2884650055890095,
"eval_loss": 0.4833250343799591,
"eval_runtime": 142.0125,
"eval_samples_per_second": 12.682,
"eval_steps_per_second": 3.176,
"step": 1000
},
{
"epoch": 0.2899073306169545,
"grad_norm": 0.47138251586932034,
"learning_rate": 1.7889125270357053e-05,
"loss": 0.4851,
"step": 1005
},
{
"epoch": 0.29134965564489956,
"grad_norm": 0.522686359505293,
"learning_rate": 1.7858077517624265e-05,
"loss": 0.4788,
"step": 1010
},
{
"epoch": 0.2927919806728446,
"grad_norm": 0.6355398882354177,
"learning_rate": 1.7826830457045608e-05,
"loss": 0.4525,
"step": 1015
},
{
"epoch": 0.2942343057007897,
"grad_norm": 0.44577505392395406,
"learning_rate": 1.7795384881153896e-05,
"loss": 0.4614,
"step": 1020
},
{
"epoch": 0.29567663072873474,
"grad_norm": 0.454859759409631,
"learning_rate": 1.7763741587516983e-05,
"loss": 0.5021,
"step": 1025
},
{
"epoch": 0.2971189557566798,
"grad_norm": 0.6161570485074761,
"learning_rate": 1.7731901378717523e-05,
"loss": 0.4903,
"step": 1030
},
{
"epoch": 0.2985612807846248,
"grad_norm": 0.43940664169854093,
"learning_rate": 1.769986506233261e-05,
"loss": 0.4819,
"step": 1035
},
{
"epoch": 0.30000360581256985,
"grad_norm": 0.4426640967510136,
"learning_rate": 1.7667633450913307e-05,
"loss": 0.4579,
"step": 1040
},
{
"epoch": 0.3014459308405149,
"grad_norm": 0.5064920131450599,
"learning_rate": 1.763520736196402e-05,
"loss": 0.5066,
"step": 1045
},
{
"epoch": 0.30288825586845997,
"grad_norm": 0.3628170152752897,
"learning_rate": 1.7602587617921785e-05,
"loss": 0.423,
"step": 1050
},
{
"epoch": 0.304330580896405,
"grad_norm": 0.4756441564342862,
"learning_rate": 1.7569775046135388e-05,
"loss": 0.5278,
"step": 1055
},
{
"epoch": 0.30577290592435,
"grad_norm": 0.40932967287449395,
"learning_rate": 1.753677047884439e-05,
"loss": 0.4565,
"step": 1060
},
{
"epoch": 0.3072152309522951,
"grad_norm": 0.4148447936276441,
"learning_rate": 1.7503574753158022e-05,
"loss": 0.4819,
"step": 1065
},
{
"epoch": 0.30865755598024014,
"grad_norm": 0.3868133979093347,
"learning_rate": 1.747018871103395e-05,
"loss": 0.4707,
"step": 1070
},
{
"epoch": 0.3100998810081852,
"grad_norm": 0.39630255989567886,
"learning_rate": 1.743661319925691e-05,
"loss": 0.4387,
"step": 1075
},
{
"epoch": 0.31154220603613025,
"grad_norm": 0.4233553435649959,
"learning_rate": 1.7402849069417246e-05,
"loss": 0.465,
"step": 1080
},
{
"epoch": 0.3129845310640753,
"grad_norm": 0.37304393376464795,
"learning_rate": 1.7368897177889307e-05,
"loss": 0.4854,
"step": 1085
},
{
"epoch": 0.3144268560920203,
"grad_norm": 0.41669096423193014,
"learning_rate": 1.7334758385809715e-05,
"loss": 0.4369,
"step": 1090
},
{
"epoch": 0.31586918111996537,
"grad_norm": 0.3950040493214593,
"learning_rate": 1.7300433559055533e-05,
"loss": 0.4488,
"step": 1095
},
{
"epoch": 0.3173115061479104,
"grad_norm": 0.4206456914262744,
"learning_rate": 1.7265923568222315e-05,
"loss": 0.4608,
"step": 1100
},
{
"epoch": 0.3187538311758555,
"grad_norm": 0.5459001712618055,
"learning_rate": 1.7231229288602e-05,
"loss": 0.4419,
"step": 1105
},
{
"epoch": 0.32019615620380054,
"grad_norm": 0.4002983479690819,
"learning_rate": 1.7196351600160725e-05,
"loss": 0.4575,
"step": 1110
},
{
"epoch": 0.3216384812317456,
"grad_norm": 0.5400371185813517,
"learning_rate": 1.716129138751651e-05,
"loss": 0.4402,
"step": 1115
},
{
"epoch": 0.3230808062596906,
"grad_norm": 0.4526337203461876,
"learning_rate": 1.712604953991681e-05,
"loss": 0.4923,
"step": 1120
},
{
"epoch": 0.32452313128763566,
"grad_norm": 0.3924148895626424,
"learning_rate": 1.709062695121597e-05,
"loss": 0.4734,
"step": 1125
},
{
"epoch": 0.3259654563155807,
"grad_norm": 0.45730078891879783,
"learning_rate": 1.7055024519852554e-05,
"loss": 0.4935,
"step": 1130
},
{
"epoch": 0.32740778134352577,
"grad_norm": 0.41765126413107173,
"learning_rate": 1.7019243148826547e-05,
"loss": 0.4778,
"step": 1135
},
{
"epoch": 0.32885010637147083,
"grad_norm": 0.48822731606676767,
"learning_rate": 1.6983283745676464e-05,
"loss": 0.4786,
"step": 1140
},
{
"epoch": 0.33029243139941583,
"grad_norm": 0.47444702764857977,
"learning_rate": 1.6947147222456318e-05,
"loss": 0.4732,
"step": 1145
},
{
"epoch": 0.3317347564273609,
"grad_norm": 0.36819652961308474,
"learning_rate": 1.6910834495712504e-05,
"loss": 0.49,
"step": 1150
},
{
"epoch": 0.33317708145530595,
"grad_norm": 0.3963647053897705,
"learning_rate": 1.6874346486460543e-05,
"loss": 0.4599,
"step": 1155
},
{
"epoch": 0.334619406483251,
"grad_norm": 0.3557684139157355,
"learning_rate": 1.6837684120161723e-05,
"loss": 0.4603,
"step": 1160
},
{
"epoch": 0.33606173151119606,
"grad_norm": 0.42399774345522806,
"learning_rate": 1.680084832669962e-05,
"loss": 0.4322,
"step": 1165
},
{
"epoch": 0.3375040565391411,
"grad_norm": 0.4013586249486658,
"learning_rate": 1.6763840040356522e-05,
"loss": 0.4398,
"step": 1170
},
{
"epoch": 0.3389463815670861,
"grad_norm": 0.44604773948712173,
"learning_rate": 1.6726660199789733e-05,
"loss": 0.4265,
"step": 1175
},
{
"epoch": 0.3403887065950312,
"grad_norm": 0.39551679284847074,
"learning_rate": 1.6689309748007753e-05,
"loss": 0.4418,
"step": 1180
},
{
"epoch": 0.34183103162297623,
"grad_norm": 0.451264115692116,
"learning_rate": 1.6651789632346377e-05,
"loss": 0.4483,
"step": 1185
},
{
"epoch": 0.3432733566509213,
"grad_norm": 0.4689614820007113,
"learning_rate": 1.6614100804444657e-05,
"loss": 0.467,
"step": 1190
},
{
"epoch": 0.34471568167886635,
"grad_norm": 0.3841720473679624,
"learning_rate": 1.6576244220220763e-05,
"loss": 0.4313,
"step": 1195
},
{
"epoch": 0.3461580067068114,
"grad_norm": 0.4091561009628973,
"learning_rate": 1.6538220839847745e-05,
"loss": 0.434,
"step": 1200
},
{
"epoch": 0.3476003317347564,
"grad_norm": 0.4473483816905544,
"learning_rate": 1.6500031627729178e-05,
"loss": 0.4446,
"step": 1205
},
{
"epoch": 0.34904265676270146,
"grad_norm": 0.4800983187244669,
"learning_rate": 1.6461677552474698e-05,
"loss": 0.4691,
"step": 1210
},
{
"epoch": 0.3504849817906465,
"grad_norm": 0.388554374886088,
"learning_rate": 1.642315958687543e-05,
"loss": 0.4517,
"step": 1215
},
{
"epoch": 0.3519273068185916,
"grad_norm": 0.4804591032499286,
"learning_rate": 1.6384478707879337e-05,
"loss": 0.4736,
"step": 1220
},
{
"epoch": 0.35336963184653664,
"grad_norm": 0.4242345257393015,
"learning_rate": 1.6345635896566415e-05,
"loss": 0.4453,
"step": 1225
},
{
"epoch": 0.35481195687448164,
"grad_norm": 0.5125929278365619,
"learning_rate": 1.6306632138123814e-05,
"loss": 0.4894,
"step": 1230
},
{
"epoch": 0.3562542819024267,
"grad_norm": 0.4135575305051168,
"learning_rate": 1.626746842182087e-05,
"loss": 0.4516,
"step": 1235
},
{
"epoch": 0.35769660693037175,
"grad_norm": 0.49733207897305337,
"learning_rate": 1.6228145740983986e-05,
"loss": 0.4676,
"step": 1240
},
{
"epoch": 0.3591389319583168,
"grad_norm": 0.405324125927312,
"learning_rate": 1.618866509297147e-05,
"loss": 0.4539,
"step": 1245
},
{
"epoch": 0.36058125698626187,
"grad_norm": 0.43290260214899146,
"learning_rate": 1.61490274791482e-05,
"loss": 0.43,
"step": 1250
},
{
"epoch": 0.3620235820142069,
"grad_norm": 0.3648124960837181,
"learning_rate": 1.6109233904860258e-05,
"loss": 0.4516,
"step": 1255
},
{
"epoch": 0.3634659070421519,
"grad_norm": 0.43358315460862995,
"learning_rate": 1.606928537940942e-05,
"loss": 0.4565,
"step": 1260
},
{
"epoch": 0.364908232070097,
"grad_norm": 0.5070316730676355,
"learning_rate": 1.602918291602755e-05,
"loss": 0.4547,
"step": 1265
},
{
"epoch": 0.36635055709804204,
"grad_norm": 0.4556281361017855,
"learning_rate": 1.5988927531850913e-05,
"loss": 0.4631,
"step": 1270
},
{
"epoch": 0.3677928821259871,
"grad_norm": 0.4210598158384229,
"learning_rate": 1.5948520247894363e-05,
"loss": 0.4595,
"step": 1275
},
{
"epoch": 0.36923520715393215,
"grad_norm": 0.4325982920205171,
"learning_rate": 1.590796208902546e-05,
"loss": 0.4698,
"step": 1280
},
{
"epoch": 0.3706775321818772,
"grad_norm": 0.4263624320016057,
"learning_rate": 1.5867254083938472e-05,
"loss": 0.4371,
"step": 1285
},
{
"epoch": 0.3721198572098222,
"grad_norm": 0.4792938379196713,
"learning_rate": 1.582639726512828e-05,
"loss": 0.4464,
"step": 1290
},
{
"epoch": 0.37356218223776727,
"grad_norm": 0.43544663382731996,
"learning_rate": 1.5785392668864186e-05,
"loss": 0.4658,
"step": 1295
},
{
"epoch": 0.37500450726571233,
"grad_norm": 0.38089232775082726,
"learning_rate": 1.5744241335163642e-05,
"loss": 0.4492,
"step": 1300
},
{
"epoch": 0.3764468322936574,
"grad_norm": 0.3692067776356917,
"learning_rate": 1.570294430776587e-05,
"loss": 0.4402,
"step": 1305
},
{
"epoch": 0.37788915732160244,
"grad_norm": 0.43939772643420716,
"learning_rate": 1.5661502634105376e-05,
"loss": 0.4413,
"step": 1310
},
{
"epoch": 0.37933148234954744,
"grad_norm": 0.39362265905546057,
"learning_rate": 1.5619917365285394e-05,
"loss": 0.4314,
"step": 1315
},
{
"epoch": 0.3807738073774925,
"grad_norm": 0.41565735116305985,
"learning_rate": 1.557818955605123e-05,
"loss": 0.4564,
"step": 1320
},
{
"epoch": 0.38221613240543756,
"grad_norm": 0.3633587329212366,
"learning_rate": 1.55363202647635e-05,
"loss": 0.4568,
"step": 1325
},
{
"epoch": 0.3836584574333826,
"grad_norm": 0.43886686943718484,
"learning_rate": 1.5494310553371292e-05,
"loss": 0.4408,
"step": 1330
},
{
"epoch": 0.3851007824613277,
"grad_norm": 0.44313421551297705,
"learning_rate": 1.545216148738523e-05,
"loss": 0.4728,
"step": 1335
},
{
"epoch": 0.38654310748927273,
"grad_norm": 0.43446763871019,
"learning_rate": 1.5409874135850453e-05,
"loss": 0.4413,
"step": 1340
},
{
"epoch": 0.38798543251721773,
"grad_norm": 0.5046802087731463,
"learning_rate": 1.5367449571319486e-05,
"loss": 0.451,
"step": 1345
},
{
"epoch": 0.3894277575451628,
"grad_norm": 0.4176799699807321,
"learning_rate": 1.5324888869825062e-05,
"loss": 0.4575,
"step": 1350
},
{
"epoch": 0.39087008257310785,
"grad_norm": 0.4357723650429465,
"learning_rate": 1.5282193110852806e-05,
"loss": 0.4628,
"step": 1355
},
{
"epoch": 0.3923124076010529,
"grad_norm": 0.47847755269517595,
"learning_rate": 1.5239363377313864e-05,
"loss": 0.4426,
"step": 1360
},
{
"epoch": 0.39375473262899796,
"grad_norm": 0.42951183292967315,
"learning_rate": 1.5196400755517445e-05,
"loss": 0.4173,
"step": 1365
},
{
"epoch": 0.395197057656943,
"grad_norm": 0.3712834304196652,
"learning_rate": 1.5153306335143247e-05,
"loss": 0.4185,
"step": 1370
},
{
"epoch": 0.396639382684888,
"grad_norm": 0.40028893775485,
"learning_rate": 1.5110081209213849e-05,
"loss": 0.4404,
"step": 1375
},
{
"epoch": 0.3980817077128331,
"grad_norm": 0.3524439650077371,
"learning_rate": 1.5066726474066962e-05,
"loss": 0.436,
"step": 1380
},
{
"epoch": 0.39952403274077813,
"grad_norm": 0.41796871469443936,
"learning_rate": 1.5023243229327631e-05,
"loss": 0.4465,
"step": 1385
},
{
"epoch": 0.4009663577687232,
"grad_norm": 0.39648024648913516,
"learning_rate": 1.4979632577880355e-05,
"loss": 0.4599,
"step": 1390
},
{
"epoch": 0.40240868279666825,
"grad_norm": 0.4177593581987727,
"learning_rate": 1.4935895625841095e-05,
"loss": 0.4341,
"step": 1395
},
{
"epoch": 0.40385100782461325,
"grad_norm": 0.39474357091689116,
"learning_rate": 1.4892033482529233e-05,
"loss": 0.4251,
"step": 1400
},
{
"epoch": 0.4052933328525583,
"grad_norm": 0.3925865645135851,
"learning_rate": 1.484804726043943e-05,
"loss": 0.4188,
"step": 1405
},
{
"epoch": 0.40673565788050337,
"grad_norm": 0.43881341912306815,
"learning_rate": 1.480393807521342e-05,
"loss": 0.4626,
"step": 1410
},
{
"epoch": 0.4081779829084484,
"grad_norm": 0.38784235208087897,
"learning_rate": 1.4759707045611694e-05,
"loss": 0.4356,
"step": 1415
},
{
"epoch": 0.4096203079363935,
"grad_norm": 0.4652349082201273,
"learning_rate": 1.4715355293485134e-05,
"loss": 0.4429,
"step": 1420
},
{
"epoch": 0.41106263296433854,
"grad_norm": 0.5020179396910893,
"learning_rate": 1.4670883943746575e-05,
"loss": 0.4424,
"step": 1425
},
{
"epoch": 0.41250495799228354,
"grad_norm": 0.46646941577755224,
"learning_rate": 1.4626294124342237e-05,
"loss": 0.4473,
"step": 1430
},
{
"epoch": 0.4139472830202286,
"grad_norm": 0.3715580720003536,
"learning_rate": 1.4581586966223156e-05,
"loss": 0.457,
"step": 1435
},
{
"epoch": 0.41538960804817365,
"grad_norm": 0.3913149158851186,
"learning_rate": 1.453676360331647e-05,
"loss": 0.4232,
"step": 1440
},
{
"epoch": 0.4168319330761187,
"grad_norm": 0.3755928140913827,
"learning_rate": 1.4491825172496675e-05,
"loss": 0.4376,
"step": 1445
},
{
"epoch": 0.41827425810406377,
"grad_norm": 0.4632236851893659,
"learning_rate": 1.4446772813556784e-05,
"loss": 0.4547,
"step": 1450
},
{
"epoch": 0.4197165831320088,
"grad_norm": 0.3622221987812085,
"learning_rate": 1.4401607669179415e-05,
"loss": 0.4189,
"step": 1455
},
{
"epoch": 0.4211589081599538,
"grad_norm": 0.4427510263617938,
"learning_rate": 1.4356330884907823e-05,
"loss": 0.4307,
"step": 1460
},
{
"epoch": 0.4226012331878989,
"grad_norm": 0.40821656664051026,
"learning_rate": 1.4310943609116815e-05,
"loss": 0.4416,
"step": 1465
},
{
"epoch": 0.42404355821584394,
"grad_norm": 0.45484460030870416,
"learning_rate": 1.4265446992983661e-05,
"loss": 0.449,
"step": 1470
},
{
"epoch": 0.425485883243789,
"grad_norm": 0.38430976618751717,
"learning_rate": 1.4219842190458865e-05,
"loss": 0.4445,
"step": 1475
},
{
"epoch": 0.42692820827173406,
"grad_norm": 0.40624625230940725,
"learning_rate": 1.4174130358236924e-05,
"loss": 0.4734,
"step": 1480
},
{
"epoch": 0.42837053329967906,
"grad_norm": 0.38501281348072397,
"learning_rate": 1.4128312655726957e-05,
"loss": 0.4407,
"step": 1485
},
{
"epoch": 0.4298128583276241,
"grad_norm": 0.5552503619067779,
"learning_rate": 1.4082390245023337e-05,
"loss": 0.4559,
"step": 1490
},
{
"epoch": 0.43125518335556917,
"grad_norm": 0.41269951819834144,
"learning_rate": 1.4036364290876176e-05,
"loss": 0.4407,
"step": 1495
},
{
"epoch": 0.43269750838351423,
"grad_norm": 0.4132538908060478,
"learning_rate": 1.3990235960661824e-05,
"loss": 0.4439,
"step": 1500
},
{
"epoch": 0.43269750838351423,
"eval_loss": 0.43445292115211487,
"eval_runtime": 142.5412,
"eval_samples_per_second": 12.635,
"eval_steps_per_second": 3.164,
"step": 1500
},
{
"epoch": 0.4341398334114593,
"grad_norm": 0.42757706099004156,
"learning_rate": 1.3944006424353229e-05,
"loss": 0.4247,
"step": 1505
},
{
"epoch": 0.43558215843940434,
"grad_norm": 0.36759037583277737,
"learning_rate": 1.389767685449027e-05,
"loss": 0.4306,
"step": 1510
},
{
"epoch": 0.43702448346734935,
"grad_norm": 0.42042330760151675,
"learning_rate": 1.3851248426150026e-05,
"loss": 0.4244,
"step": 1515
},
{
"epoch": 0.4384668084952944,
"grad_norm": 0.38414415773611094,
"learning_rate": 1.380472231691697e-05,
"loss": 0.4377,
"step": 1520
},
{
"epoch": 0.43990913352323946,
"grad_norm": 0.4303765304251248,
"learning_rate": 1.375809970685309e-05,
"loss": 0.4574,
"step": 1525
},
{
"epoch": 0.4413514585511845,
"grad_norm": 0.39045631524439356,
"learning_rate": 1.3711381778467972e-05,
"loss": 0.4487,
"step": 1530
},
{
"epoch": 0.4427937835791296,
"grad_norm": 0.409923537347395,
"learning_rate": 1.36645697166888e-05,
"loss": 0.4155,
"step": 1535
},
{
"epoch": 0.44423610860707463,
"grad_norm": 0.4590281734742793,
"learning_rate": 1.3617664708830304e-05,
"loss": 0.4211,
"step": 1540
},
{
"epoch": 0.44567843363501963,
"grad_norm": 0.4340206380764746,
"learning_rate": 1.3570667944564651e-05,
"loss": 0.43,
"step": 1545
},
{
"epoch": 0.4471207586629647,
"grad_norm": 0.3867702108735739,
"learning_rate": 1.3523580615891258e-05,
"loss": 0.4367,
"step": 1550
},
{
"epoch": 0.44856308369090975,
"grad_norm": 0.45493644595260835,
"learning_rate": 1.347640391710657e-05,
"loss": 0.4336,
"step": 1555
},
{
"epoch": 0.4500054087188548,
"grad_norm": 0.41557484865468924,
"learning_rate": 1.3429139044773768e-05,
"loss": 0.4128,
"step": 1560
},
{
"epoch": 0.45144773374679986,
"grad_norm": 0.41564130897863455,
"learning_rate": 1.3381787197692413e-05,
"loss": 0.3957,
"step": 1565
},
{
"epoch": 0.45289005877474486,
"grad_norm": 0.4011264197640641,
"learning_rate": 1.3334349576868046e-05,
"loss": 0.442,
"step": 1570
},
{
"epoch": 0.4543323838026899,
"grad_norm": 0.4825855614290229,
"learning_rate": 1.3286827385481726e-05,
"loss": 0.4058,
"step": 1575
},
{
"epoch": 0.455774708830635,
"grad_norm": 0.3921023793032671,
"learning_rate": 1.3239221828859509e-05,
"loss": 0.3884,
"step": 1580
},
{
"epoch": 0.45721703385858004,
"grad_norm": 0.40627991293028837,
"learning_rate": 1.3191534114441883e-05,
"loss": 0.4333,
"step": 1585
},
{
"epoch": 0.4586593588865251,
"grad_norm": 0.43891554498901797,
"learning_rate": 1.3143765451753137e-05,
"loss": 0.4166,
"step": 1590
},
{
"epoch": 0.46010168391447015,
"grad_norm": 0.39830311047980305,
"learning_rate": 1.3095917052370686e-05,
"loss": 0.4235,
"step": 1595
},
{
"epoch": 0.46154400894241515,
"grad_norm": 0.3980453207285396,
"learning_rate": 1.3047990129894348e-05,
"loss": 0.4001,
"step": 1600
},
{
"epoch": 0.4629863339703602,
"grad_norm": 0.4136578166461488,
"learning_rate": 1.299998589991555e-05,
"loss": 0.4076,
"step": 1605
},
{
"epoch": 0.46442865899830527,
"grad_norm": 0.4343208402620231,
"learning_rate": 1.2951905579986506e-05,
"loss": 0.4384,
"step": 1610
},
{
"epoch": 0.4658709840262503,
"grad_norm": 0.45578762184210947,
"learning_rate": 1.290375038958933e-05,
"loss": 0.4048,
"step": 1615
},
{
"epoch": 0.4673133090541954,
"grad_norm": 0.46943412662551365,
"learning_rate": 1.285552155010511e-05,
"loss": 0.401,
"step": 1620
},
{
"epoch": 0.46875563408214044,
"grad_norm": 0.40848878753251544,
"learning_rate": 1.2807220284782926e-05,
"loss": 0.4461,
"step": 1625
},
{
"epoch": 0.47019795911008544,
"grad_norm": 0.3921726292273481,
"learning_rate": 1.2758847818708832e-05,
"loss": 0.4205,
"step": 1630
},
{
"epoch": 0.4716402841380305,
"grad_norm": 0.45781513572784016,
"learning_rate": 1.2710405378774768e-05,
"loss": 0.4423,
"step": 1635
},
{
"epoch": 0.47308260916597555,
"grad_norm": 0.45862261759553535,
"learning_rate": 1.2661894193647458e-05,
"loss": 0.4,
"step": 1640
},
{
"epoch": 0.4745249341939206,
"grad_norm": 0.3527899534786595,
"learning_rate": 1.261331549373724e-05,
"loss": 0.3998,
"step": 1645
},
{
"epoch": 0.47596725922186567,
"grad_norm": 0.36297450328540837,
"learning_rate": 1.2564670511166865e-05,
"loss": 0.4206,
"step": 1650
},
{
"epoch": 0.47740958424981067,
"grad_norm": 0.4030716124087903,
"learning_rate": 1.2515960479740224e-05,
"loss": 0.4047,
"step": 1655
},
{
"epoch": 0.4788519092777557,
"grad_norm": 0.41175543047417906,
"learning_rate": 1.246718663491108e-05,
"loss": 0.4345,
"step": 1660
},
{
"epoch": 0.4802942343057008,
"grad_norm": 0.3574092930784039,
"learning_rate": 1.2418350213751728e-05,
"loss": 0.4081,
"step": 1665
},
{
"epoch": 0.48173655933364584,
"grad_norm": 0.3954039812545518,
"learning_rate": 1.2369452454921604e-05,
"loss": 0.4159,
"step": 1670
},
{
"epoch": 0.4831788843615909,
"grad_norm": 0.4497181497561506,
"learning_rate": 1.2320494598635886e-05,
"loss": 0.4052,
"step": 1675
},
{
"epoch": 0.48462120938953596,
"grad_norm": 0.44655082111096045,
"learning_rate": 1.2271477886634023e-05,
"loss": 0.4123,
"step": 1680
},
{
"epoch": 0.48606353441748096,
"grad_norm": 0.40423139543908587,
"learning_rate": 1.2222403562148252e-05,
"loss": 0.4152,
"step": 1685
},
{
"epoch": 0.487505859445426,
"grad_norm": 0.36806086858378434,
"learning_rate": 1.2173272869872062e-05,
"loss": 0.4252,
"step": 1690
},
{
"epoch": 0.4889481844733711,
"grad_norm": 0.41722654899253564,
"learning_rate": 1.2124087055928617e-05,
"loss": 0.3879,
"step": 1695
},
{
"epoch": 0.49039050950131613,
"grad_norm": 0.4329150355333478,
"learning_rate": 1.207484736783916e-05,
"loss": 0.3849,
"step": 1700
},
{
"epoch": 0.4918328345292612,
"grad_norm": 0.4710085788902766,
"learning_rate": 1.2025555054491367e-05,
"loss": 0.4303,
"step": 1705
},
{
"epoch": 0.4932751595572062,
"grad_norm": 0.443066548358196,
"learning_rate": 1.1976211366107668e-05,
"loss": 0.4198,
"step": 1710
},
{
"epoch": 0.49471748458515125,
"grad_norm": 0.3338656609348242,
"learning_rate": 1.1926817554213548e-05,
"loss": 0.3911,
"step": 1715
},
{
"epoch": 0.4961598096130963,
"grad_norm": 0.38270258610415053,
"learning_rate": 1.1877374871605786e-05,
"loss": 0.4068,
"step": 1720
},
{
"epoch": 0.49760213464104136,
"grad_norm": 0.40504870451767916,
"learning_rate": 1.18278845723207e-05,
"loss": 0.4117,
"step": 1725
},
{
"epoch": 0.4990444596689864,
"grad_norm": 0.4346348228563321,
"learning_rate": 1.1778347911602329e-05,
"loss": 0.4104,
"step": 1730
},
{
"epoch": 0.5004867846969314,
"grad_norm": 0.4075021793881479,
"learning_rate": 1.1728766145870587e-05,
"loss": 0.4229,
"step": 1735
},
{
"epoch": 0.5019291097248765,
"grad_norm": 0.418017099187981,
"learning_rate": 1.167914053268942e-05,
"loss": 0.407,
"step": 1740
},
{
"epoch": 0.5033714347528215,
"grad_norm": 0.39895813955242926,
"learning_rate": 1.1629472330734888e-05,
"loss": 0.3978,
"step": 1745
},
{
"epoch": 0.5048137597807666,
"grad_norm": 0.40383289208967305,
"learning_rate": 1.1579762799763249e-05,
"loss": 0.4175,
"step": 1750
},
{
"epoch": 0.5062560848087116,
"grad_norm": 0.5225560587862472,
"learning_rate": 1.1530013200579008e-05,
"loss": 0.4131,
"step": 1755
},
{
"epoch": 0.5076984098366567,
"grad_norm": 0.4004897787727647,
"learning_rate": 1.1480224795002943e-05,
"loss": 0.3888,
"step": 1760
},
{
"epoch": 0.5091407348646018,
"grad_norm": 0.4248175503521806,
"learning_rate": 1.1430398845840085e-05,
"loss": 0.4324,
"step": 1765
},
{
"epoch": 0.5105830598925468,
"grad_norm": 0.43829908182981264,
"learning_rate": 1.1380536616847706e-05,
"loss": 0.4079,
"step": 1770
},
{
"epoch": 0.5120253849204919,
"grad_norm": 0.43570794658905476,
"learning_rate": 1.1330639372703258e-05,
"loss": 0.4045,
"step": 1775
},
{
"epoch": 0.5134677099484369,
"grad_norm": 0.43500914045447153,
"learning_rate": 1.12807083789723e-05,
"loss": 0.419,
"step": 1780
},
{
"epoch": 0.5149100349763819,
"grad_norm": 0.41351142363579385,
"learning_rate": 1.123074490207639e-05,
"loss": 0.3986,
"step": 1785
},
{
"epoch": 0.5163523600043269,
"grad_norm": 0.37789765808010595,
"learning_rate": 1.1180750209260972e-05,
"loss": 0.4016,
"step": 1790
},
{
"epoch": 0.517794685032272,
"grad_norm": 0.4013962679722207,
"learning_rate": 1.1130725568563241e-05,
"loss": 0.4081,
"step": 1795
},
{
"epoch": 0.519237010060217,
"grad_norm": 0.38374761554210224,
"learning_rate": 1.1080672248779964e-05,
"loss": 0.4061,
"step": 1800
},
{
"epoch": 0.5206793350881621,
"grad_norm": 0.44182386119487255,
"learning_rate": 1.1030591519435316e-05,
"loss": 0.3916,
"step": 1805
},
{
"epoch": 0.5221216601161072,
"grad_norm": 0.44971294735945117,
"learning_rate": 1.0980484650748666e-05,
"loss": 0.3996,
"step": 1810
},
{
"epoch": 0.5235639851440522,
"grad_norm": 0.35276497806950113,
"learning_rate": 1.0930352913602371e-05,
"loss": 0.3732,
"step": 1815
},
{
"epoch": 0.5250063101719973,
"grad_norm": 0.42340138266599786,
"learning_rate": 1.0880197579509532e-05,
"loss": 0.4222,
"step": 1820
},
{
"epoch": 0.5264486351999423,
"grad_norm": 0.39078797688993877,
"learning_rate": 1.0830019920581753e-05,
"loss": 0.4136,
"step": 1825
},
{
"epoch": 0.5278909602278874,
"grad_norm": 0.4130289272161752,
"learning_rate": 1.0779821209496876e-05,
"loss": 0.4192,
"step": 1830
},
{
"epoch": 0.5293332852558325,
"grad_norm": 0.41541974485384586,
"learning_rate": 1.0729602719466692e-05,
"loss": 0.4031,
"step": 1835
},
{
"epoch": 0.5307756102837774,
"grad_norm": 0.44049659174573497,
"learning_rate": 1.067936572420466e-05,
"loss": 0.4069,
"step": 1840
},
{
"epoch": 0.5322179353117225,
"grad_norm": 0.44056632399340595,
"learning_rate": 1.0629111497893591e-05,
"loss": 0.3964,
"step": 1845
},
{
"epoch": 0.5336602603396675,
"grad_norm": 0.40575645379756525,
"learning_rate": 1.0578841315153333e-05,
"loss": 0.3953,
"step": 1850
},
{
"epoch": 0.5351025853676126,
"grad_norm": 0.37056517023195357,
"learning_rate": 1.0528556451008447e-05,
"loss": 0.4058,
"step": 1855
},
{
"epoch": 0.5365449103955576,
"grad_norm": 0.38961078802000476,
"learning_rate": 1.0478258180855869e-05,
"loss": 0.3783,
"step": 1860
},
{
"epoch": 0.5379872354235027,
"grad_norm": 0.4278326171242378,
"learning_rate": 1.0427947780432547e-05,
"loss": 0.4025,
"step": 1865
},
{
"epoch": 0.5394295604514477,
"grad_norm": 0.4487192036382051,
"learning_rate": 1.0377626525783101e-05,
"loss": 0.3933,
"step": 1870
},
{
"epoch": 0.5408718854793928,
"grad_norm": 0.5348996401888022,
"learning_rate": 1.0327295693227454e-05,
"loss": 0.447,
"step": 1875
},
{
"epoch": 0.5423142105073379,
"grad_norm": 0.527197311781129,
"learning_rate": 1.0276956559328455e-05,
"loss": 0.3949,
"step": 1880
},
{
"epoch": 0.5437565355352829,
"grad_norm": 0.41151058505508553,
"learning_rate": 1.0226610400859498e-05,
"loss": 0.4051,
"step": 1885
},
{
"epoch": 0.545198860563228,
"grad_norm": 0.37166405264306773,
"learning_rate": 1.0176258494772153e-05,
"loss": 0.3991,
"step": 1890
},
{
"epoch": 0.5466411855911729,
"grad_norm": 0.4167614980577364,
"learning_rate": 1.0125902118163762e-05,
"loss": 0.4086,
"step": 1895
},
{
"epoch": 0.548083510619118,
"grad_norm": 0.4002106455641225,
"learning_rate": 1.007554254824506e-05,
"loss": 0.4006,
"step": 1900
},
{
"epoch": 0.549525835647063,
"grad_norm": 0.38648887792017217,
"learning_rate": 1.0025181062307774e-05,
"loss": 0.4009,
"step": 1905
},
{
"epoch": 0.5509681606750081,
"grad_norm": 0.4402653770907521,
"learning_rate": 9.974818937692228e-06,
"loss": 0.3909,
"step": 1910
},
{
"epoch": 0.5524104857029531,
"grad_norm": 0.39402192655503426,
"learning_rate": 9.92445745175494e-06,
"loss": 0.3793,
"step": 1915
},
{
"epoch": 0.5538528107308982,
"grad_norm": 0.36447042674734037,
"learning_rate": 9.874097881836241e-06,
"loss": 0.3856,
"step": 1920
},
{
"epoch": 0.5552951357588433,
"grad_norm": 0.38084863196798785,
"learning_rate": 9.823741505227852e-06,
"loss": 0.3821,
"step": 1925
},
{
"epoch": 0.5567374607867883,
"grad_norm": 0.3689396281200298,
"learning_rate": 9.773389599140504e-06,
"loss": 0.3888,
"step": 1930
},
{
"epoch": 0.5581797858147334,
"grad_norm": 0.42447241183482853,
"learning_rate": 9.72304344067155e-06,
"loss": 0.4018,
"step": 1935
},
{
"epoch": 0.5596221108426784,
"grad_norm": 0.34840166562757835,
"learning_rate": 9.672704306772547e-06,
"loss": 0.381,
"step": 1940
},
{
"epoch": 0.5610644358706235,
"grad_norm": 0.3824007554962182,
"learning_rate": 9.6223734742169e-06,
"loss": 0.405,
"step": 1945
},
{
"epoch": 0.5625067608985685,
"grad_norm": 0.40567921647837246,
"learning_rate": 9.572052219567455e-06,
"loss": 0.3886,
"step": 1950
},
{
"epoch": 0.5639490859265135,
"grad_norm": 0.4496361442646002,
"learning_rate": 9.521741819144135e-06,
"loss": 0.3926,
"step": 1955
},
{
"epoch": 0.5653914109544586,
"grad_norm": 0.3771274201963948,
"learning_rate": 9.471443548991557e-06,
"loss": 0.4009,
"step": 1960
},
{
"epoch": 0.5668337359824036,
"grad_norm": 0.3832741322922619,
"learning_rate": 9.421158684846669e-06,
"loss": 0.3926,
"step": 1965
},
{
"epoch": 0.5682760610103487,
"grad_norm": 0.41676932794244004,
"learning_rate": 9.370888502106414e-06,
"loss": 0.4194,
"step": 1970
},
{
"epoch": 0.5697183860382937,
"grad_norm": 0.4465176481054024,
"learning_rate": 9.320634275795342e-06,
"loss": 0.3885,
"step": 1975
},
{
"epoch": 0.5711607110662388,
"grad_norm": 0.41454265589275485,
"learning_rate": 9.270397280533311e-06,
"loss": 0.4041,
"step": 1980
},
{
"epoch": 0.5726030360941838,
"grad_norm": 0.37529076026198815,
"learning_rate": 9.220178790503125e-06,
"loss": 0.3784,
"step": 1985
},
{
"epoch": 0.5740453611221289,
"grad_norm": 0.4006407856625201,
"learning_rate": 9.169980079418248e-06,
"loss": 0.3742,
"step": 1990
},
{
"epoch": 0.575487686150074,
"grad_norm": 0.4075785016746068,
"learning_rate": 9.119802420490473e-06,
"loss": 0.4184,
"step": 1995
},
{
"epoch": 0.576930011178019,
"grad_norm": 0.3892341916180056,
"learning_rate": 9.06964708639763e-06,
"loss": 0.3865,
"step": 2000
},
{
"epoch": 0.576930011178019,
"eval_loss": 0.3948507606983185,
"eval_runtime": 142.1685,
"eval_samples_per_second": 12.668,
"eval_steps_per_second": 3.172,
"step": 2000
},
{
"epoch": 0.5783723362059641,
"grad_norm": 0.4476758638692534,
"learning_rate": 9.019515349251337e-06,
"loss": 0.4076,
"step": 2005
},
{
"epoch": 0.579814661233909,
"grad_norm": 0.38084358148704506,
"learning_rate": 8.969408480564684e-06,
"loss": 0.3951,
"step": 2010
},
{
"epoch": 0.5812569862618541,
"grad_norm": 0.3946160859854508,
"learning_rate": 8.919327751220038e-06,
"loss": 0.3737,
"step": 2015
},
{
"epoch": 0.5826993112897991,
"grad_norm": 0.4376591903476801,
"learning_rate": 8.86927443143676e-06,
"loss": 0.3993,
"step": 2020
},
{
"epoch": 0.5841416363177442,
"grad_norm": 0.4220093736158996,
"learning_rate": 8.819249790739033e-06,
"loss": 0.3896,
"step": 2025
},
{
"epoch": 0.5855839613456892,
"grad_norm": 0.37781362600911217,
"learning_rate": 8.769255097923617e-06,
"loss": 0.358,
"step": 2030
},
{
"epoch": 0.5870262863736343,
"grad_norm": 0.37752543573320735,
"learning_rate": 8.719291621027703e-06,
"loss": 0.4016,
"step": 2035
},
{
"epoch": 0.5884686114015794,
"grad_norm": 0.4195162100656966,
"learning_rate": 8.669360627296745e-06,
"loss": 0.3755,
"step": 2040
},
{
"epoch": 0.5899109364295244,
"grad_norm": 0.40866907101120203,
"learning_rate": 8.619463383152296e-06,
"loss": 0.3964,
"step": 2045
},
{
"epoch": 0.5913532614574695,
"grad_norm": 0.4194072279329464,
"learning_rate": 8.56960115415992e-06,
"loss": 0.3853,
"step": 2050
},
{
"epoch": 0.5927955864854145,
"grad_norm": 0.503872591140977,
"learning_rate": 8.519775204997063e-06,
"loss": 0.4161,
"step": 2055
},
{
"epoch": 0.5942379115133596,
"grad_norm": 0.4656959686074043,
"learning_rate": 8.469986799420993e-06,
"loss": 0.4207,
"step": 2060
},
{
"epoch": 0.5956802365413045,
"grad_norm": 0.4068362162842934,
"learning_rate": 8.420237200236753e-06,
"loss": 0.3717,
"step": 2065
},
{
"epoch": 0.5971225615692496,
"grad_norm": 0.4469993385978865,
"learning_rate": 8.370527669265114e-06,
"loss": 0.4039,
"step": 2070
},
{
"epoch": 0.5985648865971946,
"grad_norm": 0.43643202324029334,
"learning_rate": 8.320859467310582e-06,
"loss": 0.3749,
"step": 2075
},
{
"epoch": 0.6000072116251397,
"grad_norm": 0.5297689595825736,
"learning_rate": 8.271233854129413e-06,
"loss": 0.376,
"step": 2080
},
{
"epoch": 0.6014495366530848,
"grad_norm": 0.489056954944045,
"learning_rate": 8.221652088397675e-06,
"loss": 0.3933,
"step": 2085
},
{
"epoch": 0.6028918616810298,
"grad_norm": 0.37378771704976776,
"learning_rate": 8.172115427679304e-06,
"loss": 0.3945,
"step": 2090
},
{
"epoch": 0.6043341867089749,
"grad_norm": 0.4235226777306445,
"learning_rate": 8.122625128394216e-06,
"loss": 0.3826,
"step": 2095
},
{
"epoch": 0.6057765117369199,
"grad_norm": 0.4021066843708137,
"learning_rate": 8.073182445786455e-06,
"loss": 0.3642,
"step": 2100
},
{
"epoch": 0.607218836764865,
"grad_norm": 0.3735730097404964,
"learning_rate": 8.023788633892334e-06,
"loss": 0.3725,
"step": 2105
},
{
"epoch": 0.60866116179281,
"grad_norm": 0.42115686535849983,
"learning_rate": 7.974444945508637e-06,
"loss": 0.3876,
"step": 2110
},
{
"epoch": 0.6101034868207551,
"grad_norm": 0.42268328106794184,
"learning_rate": 7.925152632160841e-06,
"loss": 0.4042,
"step": 2115
},
{
"epoch": 0.6115458118487,
"grad_norm": 0.4303350707681742,
"learning_rate": 7.875912944071386e-06,
"loss": 0.3718,
"step": 2120
},
{
"epoch": 0.6129881368766451,
"grad_norm": 0.41179372110756424,
"learning_rate": 7.826727130127942e-06,
"loss": 0.3844,
"step": 2125
},
{
"epoch": 0.6144304619045902,
"grad_norm": 0.3763060638976918,
"learning_rate": 7.77759643785175e-06,
"loss": 0.378,
"step": 2130
},
{
"epoch": 0.6158727869325352,
"grad_norm": 0.40647467863126857,
"learning_rate": 7.72852211336598e-06,
"loss": 0.3633,
"step": 2135
},
{
"epoch": 0.6173151119604803,
"grad_norm": 0.4427513530880047,
"learning_rate": 7.679505401364116e-06,
"loss": 0.3728,
"step": 2140
},
{
"epoch": 0.6187574369884253,
"grad_norm": 0.40218277177425543,
"learning_rate": 7.630547545078398e-06,
"loss": 0.3936,
"step": 2145
},
{
"epoch": 0.6201997620163704,
"grad_norm": 0.40266373448906506,
"learning_rate": 7.581649786248276e-06,
"loss": 0.3956,
"step": 2150
},
{
"epoch": 0.6216420870443155,
"grad_norm": 0.4101360200980578,
"learning_rate": 7.532813365088921e-06,
"loss": 0.3935,
"step": 2155
},
{
"epoch": 0.6230844120722605,
"grad_norm": 0.4360450388421823,
"learning_rate": 7.484039520259781e-06,
"loss": 0.393,
"step": 2160
},
{
"epoch": 0.6245267371002056,
"grad_norm": 0.3984091507351705,
"learning_rate": 7.435329488833137e-06,
"loss": 0.3857,
"step": 2165
},
{
"epoch": 0.6259690621281506,
"grad_norm": 0.4057039326760462,
"learning_rate": 7.38668450626276e-06,
"loss": 0.4013,
"step": 2170
},
{
"epoch": 0.6274113871560957,
"grad_norm": 0.39301356289008293,
"learning_rate": 7.338105806352542e-06,
"loss": 0.3613,
"step": 2175
},
{
"epoch": 0.6288537121840406,
"grad_norm": 0.4031222004525292,
"learning_rate": 7.289594621225236e-06,
"loss": 0.3775,
"step": 2180
},
{
"epoch": 0.6302960372119857,
"grad_norm": 0.42389618462152223,
"learning_rate": 7.241152181291173e-06,
"loss": 0.3842,
"step": 2185
},
{
"epoch": 0.6317383622399307,
"grad_norm": 0.4222447939654566,
"learning_rate": 7.192779715217075e-06,
"loss": 0.3747,
"step": 2190
},
{
"epoch": 0.6331806872678758,
"grad_norm": 0.3616433078805121,
"learning_rate": 7.144478449894894e-06,
"loss": 0.3619,
"step": 2195
},
{
"epoch": 0.6346230122958209,
"grad_norm": 0.40315108612725287,
"learning_rate": 7.096249610410671e-06,
"loss": 0.383,
"step": 2200
},
{
"epoch": 0.6360653373237659,
"grad_norm": 0.39550949033278987,
"learning_rate": 7.0480944200134975e-06,
"loss": 0.3993,
"step": 2205
},
{
"epoch": 0.637507662351711,
"grad_norm": 0.4061605042450912,
"learning_rate": 7.00001410008445e-06,
"loss": 0.3667,
"step": 2210
},
{
"epoch": 0.638949987379656,
"grad_norm": 0.399669288075527,
"learning_rate": 6.952009870105654e-06,
"loss": 0.387,
"step": 2215
},
{
"epoch": 0.6403923124076011,
"grad_norm": 0.4188823149502449,
"learning_rate": 6.904082947629317e-06,
"loss": 0.3814,
"step": 2220
},
{
"epoch": 0.6418346374355461,
"grad_norm": 0.3729926900968089,
"learning_rate": 6.856234548246866e-06,
"loss": 0.3647,
"step": 2225
},
{
"epoch": 0.6432769624634912,
"grad_norm": 0.3995200969127714,
"learning_rate": 6.808465885558122e-06,
"loss": 0.3778,
"step": 2230
},
{
"epoch": 0.6447192874914361,
"grad_norm": 0.4182365028017815,
"learning_rate": 6.760778171140492e-06,
"loss": 0.4071,
"step": 2235
},
{
"epoch": 0.6461616125193812,
"grad_norm": 0.419641094415173,
"learning_rate": 6.713172614518278e-06,
"loss": 0.3838,
"step": 2240
},
{
"epoch": 0.6476039375473263,
"grad_norm": 0.455639932664125,
"learning_rate": 6.665650423131953e-06,
"loss": 0.3864,
"step": 2245
},
{
"epoch": 0.6490462625752713,
"grad_norm": 0.42278667120966895,
"learning_rate": 6.618212802307589e-06,
"loss": 0.396,
"step": 2250
},
{
"epoch": 0.6504885876032164,
"grad_norm": 0.44585454789944867,
"learning_rate": 6.570860955226234e-06,
"loss": 0.3811,
"step": 2255
},
{
"epoch": 0.6519309126311614,
"grad_norm": 0.3966025625438823,
"learning_rate": 6.5235960828934305e-06,
"loss": 0.3732,
"step": 2260
},
{
"epoch": 0.6533732376591065,
"grad_norm": 0.40489868259557904,
"learning_rate": 6.476419384108745e-06,
"loss": 0.3567,
"step": 2265
},
{
"epoch": 0.6548155626870515,
"grad_norm": 0.39366736678335024,
"learning_rate": 6.429332055435349e-06,
"loss": 0.3623,
"step": 2270
},
{
"epoch": 0.6562578877149966,
"grad_norm": 0.42529750592620424,
"learning_rate": 6.382335291169698e-06,
"loss": 0.3676,
"step": 2275
},
{
"epoch": 0.6577002127429417,
"grad_norm": 0.44036040562921713,
"learning_rate": 6.335430283311206e-06,
"loss": 0.3889,
"step": 2280
},
{
"epoch": 0.6591425377708867,
"grad_norm": 0.3787593063841428,
"learning_rate": 6.288618221532031e-06,
"loss": 0.386,
"step": 2285
},
{
"epoch": 0.6605848627988317,
"grad_norm": 0.4169592811397764,
"learning_rate": 6.241900293146915e-06,
"loss": 0.3752,
"step": 2290
},
{
"epoch": 0.6620271878267767,
"grad_norm": 0.4047539500558757,
"learning_rate": 6.195277683083033e-06,
"loss": 0.3658,
"step": 2295
},
{
"epoch": 0.6634695128547218,
"grad_norm": 0.3845249122797127,
"learning_rate": 6.148751573849976e-06,
"loss": 0.3563,
"step": 2300
},
{
"epoch": 0.6649118378826668,
"grad_norm": 0.4633041975142693,
"learning_rate": 6.102323145509732e-06,
"loss": 0.3852,
"step": 2305
},
{
"epoch": 0.6663541629106119,
"grad_norm": 0.3985148240515743,
"learning_rate": 6.055993575646775e-06,
"loss": 0.3915,
"step": 2310
},
{
"epoch": 0.667796487938557,
"grad_norm": 0.40716397694215495,
"learning_rate": 6.00976403933818e-06,
"loss": 0.3605,
"step": 2315
},
{
"epoch": 0.669238812966502,
"grad_norm": 0.38795576025941675,
"learning_rate": 5.963635709123825e-06,
"loss": 0.37,
"step": 2320
},
{
"epoch": 0.6706811379944471,
"grad_norm": 0.4110632294347015,
"learning_rate": 5.91760975497667e-06,
"loss": 0.3853,
"step": 2325
},
{
"epoch": 0.6721234630223921,
"grad_norm": 0.3969166036791085,
"learning_rate": 5.871687344273045e-06,
"loss": 0.3672,
"step": 2330
},
{
"epoch": 0.6735657880503372,
"grad_norm": 0.41207993758304634,
"learning_rate": 5.8258696417630825e-06,
"loss": 0.3547,
"step": 2335
},
{
"epoch": 0.6750081130782822,
"grad_norm": 0.3680867654775724,
"learning_rate": 5.780157809541134e-06,
"loss": 0.3625,
"step": 2340
},
{
"epoch": 0.6764504381062273,
"grad_norm": 0.4267438085961488,
"learning_rate": 5.734553007016345e-06,
"loss": 0.3999,
"step": 2345
},
{
"epoch": 0.6778927631341722,
"grad_norm": 0.3986326036374569,
"learning_rate": 5.68905639088319e-06,
"loss": 0.3303,
"step": 2350
},
{
"epoch": 0.6793350881621173,
"grad_norm": 0.42614206231420926,
"learning_rate": 5.643669115092183e-06,
"loss": 0.3589,
"step": 2355
},
{
"epoch": 0.6807774131900624,
"grad_norm": 0.3776847045804154,
"learning_rate": 5.598392330820586e-06,
"loss": 0.3609,
"step": 2360
},
{
"epoch": 0.6822197382180074,
"grad_norm": 0.41271036973705766,
"learning_rate": 5.553227186443215e-06,
"loss": 0.3615,
"step": 2365
},
{
"epoch": 0.6836620632459525,
"grad_norm": 0.38781546784387094,
"learning_rate": 5.508174827503328e-06,
"loss": 0.3433,
"step": 2370
},
{
"epoch": 0.6851043882738975,
"grad_norm": 0.39550012764434234,
"learning_rate": 5.46323639668353e-06,
"loss": 0.3691,
"step": 2375
},
{
"epoch": 0.6865467133018426,
"grad_norm": 0.4203725670836375,
"learning_rate": 5.4184130337768485e-06,
"loss": 0.3882,
"step": 2380
},
{
"epoch": 0.6879890383297876,
"grad_norm": 0.41719368579398214,
"learning_rate": 5.373705875657766e-06,
"loss": 0.3678,
"step": 2385
},
{
"epoch": 0.6894313633577327,
"grad_norm": 0.408418654280754,
"learning_rate": 5.329116056253429e-06,
"loss": 0.3788,
"step": 2390
},
{
"epoch": 0.6908736883856778,
"grad_norm": 0.4432414502444195,
"learning_rate": 5.284644706514868e-06,
"loss": 0.3733,
"step": 2395
},
{
"epoch": 0.6923160134136228,
"grad_norm": 0.43523682450545426,
"learning_rate": 5.240292954388306e-06,
"loss": 0.3716,
"step": 2400
},
{
"epoch": 0.6937583384415678,
"grad_norm": 0.4389694994462393,
"learning_rate": 5.1960619247865815e-06,
"loss": 0.3655,
"step": 2405
},
{
"epoch": 0.6952006634695128,
"grad_norm": 0.3932614135155125,
"learning_rate": 5.15195273956057e-06,
"loss": 0.3971,
"step": 2410
},
{
"epoch": 0.6966429884974579,
"grad_norm": 0.38979362609767165,
"learning_rate": 5.107966517470771e-06,
"loss": 0.3724,
"step": 2415
},
{
"epoch": 0.6980853135254029,
"grad_norm": 0.4209080852390916,
"learning_rate": 5.064104374158909e-06,
"loss": 0.3911,
"step": 2420
},
{
"epoch": 0.699527638553348,
"grad_norm": 0.45055904805315533,
"learning_rate": 5.0203674221196485e-06,
"loss": 0.3633,
"step": 2425
},
{
"epoch": 0.700969963581293,
"grad_norm": 0.3868393099197903,
"learning_rate": 4.9767567706723706e-06,
"loss": 0.3515,
"step": 2430
},
{
"epoch": 0.7024122886092381,
"grad_norm": 0.41826804531316264,
"learning_rate": 4.933273525933041e-06,
"loss": 0.3519,
"step": 2435
},
{
"epoch": 0.7038546136371832,
"grad_norm": 0.45957339946847975,
"learning_rate": 4.889918790786153e-06,
"loss": 0.3807,
"step": 2440
},
{
"epoch": 0.7052969386651282,
"grad_norm": 0.4540538141436769,
"learning_rate": 4.846693664856754e-06,
"loss": 0.3465,
"step": 2445
},
{
"epoch": 0.7067392636930733,
"grad_norm": 0.47813500195150954,
"learning_rate": 4.803599244482558e-06,
"loss": 0.376,
"step": 2450
},
{
"epoch": 0.7081815887210183,
"grad_norm": 0.3925519413949624,
"learning_rate": 4.760636622686136e-06,
"loss": 0.3404,
"step": 2455
},
{
"epoch": 0.7096239137489633,
"grad_norm": 0.4289528139780234,
"learning_rate": 4.717806889147196e-06,
"loss": 0.3627,
"step": 2460
},
{
"epoch": 0.7110662387769083,
"grad_norm": 0.41215198190870284,
"learning_rate": 4.675111130174939e-06,
"loss": 0.3716,
"step": 2465
},
{
"epoch": 0.7125085638048534,
"grad_norm": 0.4403007485651443,
"learning_rate": 4.632550428680515e-06,
"loss": 0.3765,
"step": 2470
},
{
"epoch": 0.7139508888327984,
"grad_norm": 0.4311724864201015,
"learning_rate": 4.590125864149551e-06,
"loss": 0.3743,
"step": 2475
},
{
"epoch": 0.7153932138607435,
"grad_norm": 0.46098384046435353,
"learning_rate": 4.547838512614773e-06,
"loss": 0.3505,
"step": 2480
},
{
"epoch": 0.7168355388886886,
"grad_norm": 0.40338840945222365,
"learning_rate": 4.505689446628712e-06,
"loss": 0.3691,
"step": 2485
},
{
"epoch": 0.7182778639166336,
"grad_norm": 0.40824551867501546,
"learning_rate": 4.4636797352365035e-06,
"loss": 0.3585,
"step": 2490
},
{
"epoch": 0.7197201889445787,
"grad_norm": 0.4297027171998161,
"learning_rate": 4.421810443948774e-06,
"loss": 0.3705,
"step": 2495
},
{
"epoch": 0.7211625139725237,
"grad_norm": 0.40341531049143703,
"learning_rate": 4.38008263471461e-06,
"loss": 0.3815,
"step": 2500
},
{
"epoch": 0.7211625139725237,
"eval_loss": 0.37222930788993835,
"eval_runtime": 142.2441,
"eval_samples_per_second": 12.661,
"eval_steps_per_second": 3.171,
"step": 2500
},
{
"epoch": 0.7226048390004688,
"grad_norm": 0.4407059294927956,
"learning_rate": 4.338497365894628e-06,
"loss": 0.3661,
"step": 2505
},
{
"epoch": 0.7240471640284138,
"grad_norm": 0.43213340820969415,
"learning_rate": 4.297055692234133e-06,
"loss": 0.3548,
"step": 2510
},
{
"epoch": 0.7254894890563589,
"grad_norm": 0.40790860794488015,
"learning_rate": 4.25575866483636e-06,
"loss": 0.3693,
"step": 2515
},
{
"epoch": 0.7269318140843039,
"grad_norm": 0.39452605394978774,
"learning_rate": 4.214607331135817e-06,
"loss": 0.3629,
"step": 2520
},
{
"epoch": 0.7283741391122489,
"grad_norm": 0.4535519104968178,
"learning_rate": 4.173602734871723e-06,
"loss": 0.3631,
"step": 2525
},
{
"epoch": 0.729816464140194,
"grad_norm": 0.4215165521407461,
"learning_rate": 4.132745916061528e-06,
"loss": 0.3623,
"step": 2530
},
{
"epoch": 0.731258789168139,
"grad_norm": 0.4369337778893739,
"learning_rate": 4.09203791097454e-06,
"loss": 0.3799,
"step": 2535
},
{
"epoch": 0.7327011141960841,
"grad_norm": 0.4218365082776104,
"learning_rate": 4.051479752105642e-06,
"loss": 0.3281,
"step": 2540
},
{
"epoch": 0.7341434392240291,
"grad_norm": 0.39141469492573994,
"learning_rate": 4.01107246814909e-06,
"loss": 0.3779,
"step": 2545
},
{
"epoch": 0.7355857642519742,
"grad_norm": 0.4361183098017262,
"learning_rate": 3.970817083972451e-06,
"loss": 0.3677,
"step": 2550
},
{
"epoch": 0.7370280892799193,
"grad_norm": 0.4212489079522315,
"learning_rate": 3.930714620590582e-06,
"loss": 0.3697,
"step": 2555
},
{
"epoch": 0.7384704143078643,
"grad_norm": 0.42629366346781794,
"learning_rate": 3.890766095139744e-06,
"loss": 0.336,
"step": 2560
},
{
"epoch": 0.7399127393358094,
"grad_norm": 0.39167597840940843,
"learning_rate": 3.850972520851804e-06,
"loss": 0.3297,
"step": 2565
},
{
"epoch": 0.7413550643637544,
"grad_norm": 0.4233310284348778,
"learning_rate": 3.8113349070285344e-06,
"loss": 0.3613,
"step": 2570
},
{
"epoch": 0.7427973893916994,
"grad_norm": 0.4263022461531563,
"learning_rate": 3.771854259016019e-06,
"loss": 0.3529,
"step": 2575
},
{
"epoch": 0.7442397144196444,
"grad_norm": 0.3973240159937157,
"learning_rate": 3.7325315781791337e-06,
"loss": 0.3661,
"step": 2580
},
{
"epoch": 0.7456820394475895,
"grad_norm": 0.39734045764738396,
"learning_rate": 3.693367861876188e-06,
"loss": 0.3815,
"step": 2585
},
{
"epoch": 0.7471243644755345,
"grad_norm": 0.4473118684590064,
"learning_rate": 3.6543641034335873e-06,
"loss": 0.3488,
"step": 2590
},
{
"epoch": 0.7485666895034796,
"grad_norm": 0.4071557714101167,
"learning_rate": 3.615521292120663e-06,
"loss": 0.36,
"step": 2595
},
{
"epoch": 0.7500090145314247,
"grad_norm": 0.4149969887621353,
"learning_rate": 3.5768404131245695e-06,
"loss": 0.3619,
"step": 2600
},
{
"epoch": 0.7514513395593697,
"grad_norm": 0.41064754239264667,
"learning_rate": 3.5383224475253043e-06,
"loss": 0.3623,
"step": 2605
},
{
"epoch": 0.7528936645873148,
"grad_norm": 0.48731666991216727,
"learning_rate": 3.4999683722708265e-06,
"loss": 0.3824,
"step": 2610
},
{
"epoch": 0.7543359896152598,
"grad_norm": 0.42149841198530297,
"learning_rate": 3.4617791601522565e-06,
"loss": 0.3658,
"step": 2615
},
{
"epoch": 0.7557783146432049,
"grad_norm": 0.3936949177789515,
"learning_rate": 3.423755779779243e-06,
"loss": 0.3308,
"step": 2620
},
{
"epoch": 0.7572206396711499,
"grad_norm": 0.43489944362821054,
"learning_rate": 3.3858991955553455e-06,
"loss": 0.3815,
"step": 2625
},
{
"epoch": 0.7586629646990949,
"grad_norm": 0.3921717289554429,
"learning_rate": 3.348210367653625e-06,
"loss": 0.3531,
"step": 2630
},
{
"epoch": 0.76010528972704,
"grad_norm": 0.44238912615157533,
"learning_rate": 3.3106902519922523e-06,
"loss": 0.3696,
"step": 2635
},
{
"epoch": 0.761547614754985,
"grad_norm": 0.4536027992384981,
"learning_rate": 3.27333980021027e-06,
"loss": 0.37,
"step": 2640
},
{
"epoch": 0.7629899397829301,
"grad_norm": 0.4564191707678332,
"learning_rate": 3.236159959643482e-06,
"loss": 0.3819,
"step": 2645
},
{
"epoch": 0.7644322648108751,
"grad_norm": 0.5326593840798252,
"learning_rate": 3.1991516733003813e-06,
"loss": 0.3758,
"step": 2650
},
{
"epoch": 0.7658745898388202,
"grad_norm": 0.43321441818668444,
"learning_rate": 3.1623158798382813e-06,
"loss": 0.3783,
"step": 2655
},
{
"epoch": 0.7673169148667652,
"grad_norm": 0.4454237213343821,
"learning_rate": 3.125653513539456e-06,
"loss": 0.3607,
"step": 2660
},
{
"epoch": 0.7687592398947103,
"grad_norm": 0.4107211963202732,
"learning_rate": 3.089165504287499e-06,
"loss": 0.3482,
"step": 2665
},
{
"epoch": 0.7702015649226553,
"grad_norm": 0.3789782102911423,
"learning_rate": 3.052852777543687e-06,
"loss": 0.3543,
"step": 2670
},
{
"epoch": 0.7716438899506004,
"grad_norm": 0.4079189291227377,
"learning_rate": 3.0167162543235384e-06,
"loss": 0.3276,
"step": 2675
},
{
"epoch": 0.7730862149785455,
"grad_norm": 0.4472943997084153,
"learning_rate": 2.9807568511734564e-06,
"loss": 0.3825,
"step": 2680
},
{
"epoch": 0.7745285400064905,
"grad_norm": 0.430008379042804,
"learning_rate": 2.944975480147445e-06,
"loss": 0.3595,
"step": 2685
},
{
"epoch": 0.7759708650344355,
"grad_norm": 0.4401700574196651,
"learning_rate": 2.909373048784032e-06,
"loss": 0.3779,
"step": 2690
},
{
"epoch": 0.7774131900623805,
"grad_norm": 0.4208383654033427,
"learning_rate": 2.873950460083191e-06,
"loss": 0.3749,
"step": 2695
},
{
"epoch": 0.7788555150903256,
"grad_norm": 0.4174074736046765,
"learning_rate": 2.8387086124834952e-06,
"loss": 0.374,
"step": 2700
},
{
"epoch": 0.7802978401182706,
"grad_norm": 0.42868575004589055,
"learning_rate": 2.8036483998392784e-06,
"loss": 0.3564,
"step": 2705
},
{
"epoch": 0.7817401651462157,
"grad_norm": 0.3985935455753018,
"learning_rate": 2.768770711398001e-06,
"loss": 0.3667,
"step": 2710
},
{
"epoch": 0.7831824901741608,
"grad_norm": 0.40569605016983845,
"learning_rate": 2.734076431777688e-06,
"loss": 0.3506,
"step": 2715
},
{
"epoch": 0.7846248152021058,
"grad_norm": 0.39328145893392497,
"learning_rate": 2.6995664409444665e-06,
"loss": 0.3464,
"step": 2720
},
{
"epoch": 0.7860671402300509,
"grad_norm": 0.4528233880552543,
"learning_rate": 2.6652416141902913e-06,
"loss": 0.3605,
"step": 2725
},
{
"epoch": 0.7875094652579959,
"grad_norm": 0.4480705994704807,
"learning_rate": 2.631102822110695e-06,
"loss": 0.3726,
"step": 2730
},
{
"epoch": 0.788951790285941,
"grad_norm": 0.4574022134374259,
"learning_rate": 2.597150930582757e-06,
"loss": 0.359,
"step": 2735
},
{
"epoch": 0.790394115313886,
"grad_norm": 0.4078128321456425,
"learning_rate": 2.563386800743094e-06,
"loss": 0.3413,
"step": 2740
},
{
"epoch": 0.791836440341831,
"grad_norm": 0.44464864656256,
"learning_rate": 2.5298112889660544e-06,
"loss": 0.3587,
"step": 2745
},
{
"epoch": 0.793278765369776,
"grad_norm": 0.3890963843751233,
"learning_rate": 2.4964252468419802e-06,
"loss": 0.344,
"step": 2750
},
{
"epoch": 0.7947210903977211,
"grad_norm": 0.42348428672207705,
"learning_rate": 2.463229521155611e-06,
"loss": 0.3835,
"step": 2755
},
{
"epoch": 0.7961634154256662,
"grad_norm": 0.4244981524719468,
"learning_rate": 2.430224953864617e-06,
"loss": 0.3908,
"step": 2760
},
{
"epoch": 0.7976057404536112,
"grad_norm": 0.4461589097043871,
"learning_rate": 2.397412382078219e-06,
"loss": 0.3493,
"step": 2765
},
{
"epoch": 0.7990480654815563,
"grad_norm": 0.4226119316706504,
"learning_rate": 2.364792638035982e-06,
"loss": 0.3549,
"step": 2770
},
{
"epoch": 0.8004903905095013,
"grad_norm": 0.43426124883547124,
"learning_rate": 2.3323665490866964e-06,
"loss": 0.3578,
"step": 2775
},
{
"epoch": 0.8019327155374464,
"grad_norm": 0.42274869171496543,
"learning_rate": 2.300134937667391e-06,
"loss": 0.3805,
"step": 2780
},
{
"epoch": 0.8033750405653914,
"grad_norm": 0.4841781161829471,
"learning_rate": 2.2680986212824786e-06,
"loss": 0.3499,
"step": 2785
},
{
"epoch": 0.8048173655933365,
"grad_norm": 0.428134320224768,
"learning_rate": 2.2362584124830167e-06,
"loss": 0.3684,
"step": 2790
},
{
"epoch": 0.8062596906212816,
"grad_norm": 0.4117804314200649,
"learning_rate": 2.204615118846107e-06,
"loss": 0.3869,
"step": 2795
},
{
"epoch": 0.8077020156492265,
"grad_norm": 0.41413616917927765,
"learning_rate": 2.1731695429543974e-06,
"loss": 0.338,
"step": 2800
},
{
"epoch": 0.8091443406771716,
"grad_norm": 0.4360068588380961,
"learning_rate": 2.141922482375737e-06,
"loss": 0.3665,
"step": 2805
},
{
"epoch": 0.8105866657051166,
"grad_norm": 0.4334830193418244,
"learning_rate": 2.1108747296429477e-06,
"loss": 0.3721,
"step": 2810
},
{
"epoch": 0.8120289907330617,
"grad_norm": 0.507519342034383,
"learning_rate": 2.080027072233718e-06,
"loss": 0.3646,
"step": 2815
},
{
"epoch": 0.8134713157610067,
"grad_norm": 0.42834185576130923,
"learning_rate": 2.049380292550629e-06,
"loss": 0.3633,
"step": 2820
},
{
"epoch": 0.8149136407889518,
"grad_norm": 0.453195030964312,
"learning_rate": 2.018935167901316e-06,
"loss": 0.3539,
"step": 2825
},
{
"epoch": 0.8163559658168968,
"grad_norm": 0.4103347116873249,
"learning_rate": 1.9886924704787482e-06,
"loss": 0.3457,
"step": 2830
},
{
"epoch": 0.8177982908448419,
"grad_norm": 0.4081898260751316,
"learning_rate": 1.9586529673416433e-06,
"loss": 0.347,
"step": 2835
},
{
"epoch": 0.819240615872787,
"grad_norm": 0.40268175350554464,
"learning_rate": 1.928817420395018e-06,
"loss": 0.3772,
"step": 2840
},
{
"epoch": 0.820682940900732,
"grad_norm": 0.43775696767862726,
"learning_rate": 1.8991865863708547e-06,
"loss": 0.3718,
"step": 2845
},
{
"epoch": 0.8221252659286771,
"grad_norm": 0.43895036356232614,
"learning_rate": 1.8697612168089152e-06,
"loss": 0.3648,
"step": 2850
},
{
"epoch": 0.823567590956622,
"grad_norm": 0.40821144604675824,
"learning_rate": 1.8405420580376755e-06,
"loss": 0.3422,
"step": 2855
},
{
"epoch": 0.8250099159845671,
"grad_norm": 0.4577535204704979,
"learning_rate": 1.811529851155398e-06,
"loss": 0.3511,
"step": 2860
},
{
"epoch": 0.8264522410125121,
"grad_norm": 0.40698416625428246,
"learning_rate": 1.7827253320113347e-06,
"loss": 0.3521,
"step": 2865
},
{
"epoch": 0.8278945660404572,
"grad_norm": 0.48745985212369625,
"learning_rate": 1.7541292311870616e-06,
"loss": 0.3727,
"step": 2870
},
{
"epoch": 0.8293368910684022,
"grad_norm": 0.4152788200688241,
"learning_rate": 1.7257422739779495e-06,
"loss": 0.3406,
"step": 2875
},
{
"epoch": 0.8307792160963473,
"grad_norm": 0.42357457834820555,
"learning_rate": 1.6975651803747716e-06,
"loss": 0.3614,
"step": 2880
},
{
"epoch": 0.8322215411242924,
"grad_norm": 0.4290601435620992,
"learning_rate": 1.6695986650454355e-06,
"loss": 0.349,
"step": 2885
},
{
"epoch": 0.8336638661522374,
"grad_norm": 0.40830671063358515,
"learning_rate": 1.6418434373168623e-06,
"loss": 0.3592,
"step": 2890
},
{
"epoch": 0.8351061911801825,
"grad_norm": 0.4097799963554095,
"learning_rate": 1.614300201156994e-06,
"loss": 0.3359,
"step": 2895
},
{
"epoch": 0.8365485162081275,
"grad_norm": 0.43204146744095845,
"learning_rate": 1.5869696551569346e-06,
"loss": 0.3596,
"step": 2900
},
{
"epoch": 0.8379908412360726,
"grad_norm": 0.46076233886580875,
"learning_rate": 1.5598524925132396e-06,
"loss": 0.3609,
"step": 2905
},
{
"epoch": 0.8394331662640176,
"grad_norm": 0.4286297255981423,
"learning_rate": 1.5329494010103263e-06,
"loss": 0.3607,
"step": 2910
},
{
"epoch": 0.8408754912919626,
"grad_norm": 0.3956440167259478,
"learning_rate": 1.5062610630030317e-06,
"loss": 0.316,
"step": 2915
},
{
"epoch": 0.8423178163199077,
"grad_norm": 0.41432843943606673,
"learning_rate": 1.4797881553993099e-06,
"loss": 0.3589,
"step": 2920
},
{
"epoch": 0.8437601413478527,
"grad_norm": 0.397270661772685,
"learning_rate": 1.4535313496430558e-06,
"loss": 0.3519,
"step": 2925
},
{
"epoch": 0.8452024663757978,
"grad_norm": 0.41857285751070505,
"learning_rate": 1.4274913116970846e-06,
"loss": 0.3401,
"step": 2930
},
{
"epoch": 0.8466447914037428,
"grad_norm": 0.3941031419777465,
"learning_rate": 1.4016687020262231e-06,
"loss": 0.3504,
"step": 2935
},
{
"epoch": 0.8480871164316879,
"grad_norm": 0.428688446592497,
"learning_rate": 1.3760641755805848e-06,
"loss": 0.3614,
"step": 2940
},
{
"epoch": 0.8495294414596329,
"grad_norm": 0.4097211469034453,
"learning_rate": 1.3506783817789337e-06,
"loss": 0.3384,
"step": 2945
},
{
"epoch": 0.850971766487578,
"grad_norm": 0.44047116848231305,
"learning_rate": 1.3255119644922266e-06,
"loss": 0.3638,
"step": 2950
},
{
"epoch": 0.852414091515523,
"grad_norm": 0.3994464624403052,
"learning_rate": 1.300565562027276e-06,
"loss": 0.3447,
"step": 2955
},
{
"epoch": 0.8538564165434681,
"grad_norm": 0.44495457947302897,
"learning_rate": 1.2758398071105626e-06,
"loss": 0.3546,
"step": 2960
},
{
"epoch": 0.8552987415714132,
"grad_norm": 0.4147516297268767,
"learning_rate": 1.2513353268721907e-06,
"loss": 0.3421,
"step": 2965
},
{
"epoch": 0.8567410665993581,
"grad_norm": 0.422646250463158,
"learning_rate": 1.2270527428299684e-06,
"loss": 0.3579,
"step": 2970
},
{
"epoch": 0.8581833916273032,
"grad_norm": 0.4189403344854125,
"learning_rate": 1.2029926708736673e-06,
"loss": 0.3425,
"step": 2975
},
{
"epoch": 0.8596257166552482,
"grad_norm": 0.41547910036939945,
"learning_rate": 1.179155721249381e-06,
"loss": 0.3376,
"step": 2980
},
{
"epoch": 0.8610680416831933,
"grad_norm": 0.42428858195226893,
"learning_rate": 1.1555424985440522e-06,
"loss": 0.3554,
"step": 2985
},
{
"epoch": 0.8625103667111383,
"grad_norm": 0.4425537282272965,
"learning_rate": 1.1321536016701473e-06,
"loss": 0.351,
"step": 2990
},
{
"epoch": 0.8639526917390834,
"grad_norm": 0.4161228925911087,
"learning_rate": 1.1089896238504461e-06,
"loss": 0.336,
"step": 2995
},
{
"epoch": 0.8653950167670285,
"grad_norm": 0.37656047979276985,
"learning_rate": 1.086051152603026e-06,
"loss": 0.3509,
"step": 3000
},
{
"epoch": 0.8653950167670285,
"eval_loss": 0.3611552119255066,
"eval_runtime": 142.3229,
"eval_samples_per_second": 12.654,
"eval_steps_per_second": 3.169,
"step": 3000
},
{
"epoch": 0.8668373417949735,
"grad_norm": 0.4463172354545017,
"learning_rate": 1.0633387697263254e-06,
"loss": 0.35,
"step": 3005
},
{
"epoch": 0.8682796668229186,
"grad_norm": 0.43074983850708387,
"learning_rate": 1.0408530512844196e-06,
"loss": 0.3613,
"step": 3010
},
{
"epoch": 0.8697219918508636,
"grad_norm": 0.39354733454334206,
"learning_rate": 1.0185945675923813e-06,
"loss": 0.3727,
"step": 3015
},
{
"epoch": 0.8711643168788087,
"grad_norm": 0.44960602091132634,
"learning_rate": 9.965638832018432e-07,
"loss": 0.372,
"step": 3020
},
{
"epoch": 0.8726066419067536,
"grad_norm": 0.42518881330063735,
"learning_rate": 9.747615568866553e-07,
"loss": 0.3516,
"step": 3025
},
{
"epoch": 0.8740489669346987,
"grad_norm": 0.44741688383815076,
"learning_rate": 9.531881416287203e-07,
"loss": 0.3562,
"step": 3030
},
{
"epoch": 0.8754912919626437,
"grad_norm": 0.4331522299966881,
"learning_rate": 9.318441846039828e-07,
"loss": 0.3548,
"step": 3035
},
{
"epoch": 0.8769336169905888,
"grad_norm": 0.506237893255727,
"learning_rate": 9.107302271685226e-07,
"loss": 0.3412,
"step": 3040
},
{
"epoch": 0.8783759420185339,
"grad_norm": 0.4658754493753741,
"learning_rate": 8.898468048448528e-07,
"loss": 0.3336,
"step": 3045
},
{
"epoch": 0.8798182670464789,
"grad_norm": 0.438225563597408,
"learning_rate": 8.691944473083114e-07,
"loss": 0.3422,
"step": 3050
},
{
"epoch": 0.881260592074424,
"grad_norm": 0.4170714809613398,
"learning_rate": 8.487736783736533e-07,
"loss": 0.3621,
"step": 3055
},
{
"epoch": 0.882702917102369,
"grad_norm": 0.4590349478238853,
"learning_rate": 8.285850159817388e-07,
"loss": 0.3791,
"step": 3060
},
{
"epoch": 0.8841452421303141,
"grad_norm": 0.4332258091307991,
"learning_rate": 8.086289721864127e-07,
"loss": 0.3404,
"step": 3065
},
{
"epoch": 0.8855875671582591,
"grad_norm": 0.4452410333427778,
"learning_rate": 7.889060531415193e-07,
"loss": 0.3541,
"step": 3070
},
{
"epoch": 0.8870298921862042,
"grad_norm": 0.42507300447077245,
"learning_rate": 7.694167590880475e-07,
"loss": 0.3549,
"step": 3075
},
{
"epoch": 0.8884722172141493,
"grad_norm": 0.4227403053651907,
"learning_rate": 7.501615843414623e-07,
"loss": 0.3264,
"step": 3080
},
{
"epoch": 0.8899145422420942,
"grad_norm": 0.4131961662824003,
"learning_rate": 7.311410172791522e-07,
"loss": 0.3369,
"step": 3085
},
{
"epoch": 0.8913568672700393,
"grad_norm": 0.39579591570866374,
"learning_rate": 7.123555403280558e-07,
"loss": 0.3483,
"step": 3090
},
{
"epoch": 0.8927991922979843,
"grad_norm": 0.42292696994848605,
"learning_rate": 6.938056299524099e-07,
"loss": 0.3398,
"step": 3095
},
{
"epoch": 0.8942415173259294,
"grad_norm": 0.38022938922831223,
"learning_rate": 6.754917566416796e-07,
"loss": 0.3469,
"step": 3100
},
{
"epoch": 0.8956838423538744,
"grad_norm": 0.4849805496701068,
"learning_rate": 6.574143848986226e-07,
"loss": 0.3618,
"step": 3105
},
{
"epoch": 0.8971261673818195,
"grad_norm": 0.44465461522642474,
"learning_rate": 6.395739732274919e-07,
"loss": 0.3642,
"step": 3110
},
{
"epoch": 0.8985684924097646,
"grad_norm": 0.44656695164750837,
"learning_rate": 6.219709741224322e-07,
"loss": 0.3563,
"step": 3115
},
{
"epoch": 0.9000108174377096,
"grad_norm": 0.4269116876807273,
"learning_rate": 6.046058340559824e-07,
"loss": 0.3431,
"step": 3120
},
{
"epoch": 0.9014531424656547,
"grad_norm": 0.4086865891433274,
"learning_rate": 5.874789934677583e-07,
"loss": 0.3505,
"step": 3125
},
{
"epoch": 0.9028954674935997,
"grad_norm": 0.4404444466800333,
"learning_rate": 5.705908867532862e-07,
"loss": 0.3407,
"step": 3130
},
{
"epoch": 0.9043377925215448,
"grad_norm": 0.45999537115175176,
"learning_rate": 5.53941942252979e-07,
"loss": 0.37,
"step": 3135
},
{
"epoch": 0.9057801175494897,
"grad_norm": 0.4242568290280731,
"learning_rate": 5.375325822412747e-07,
"loss": 0.3316,
"step": 3140
},
{
"epoch": 0.9072224425774348,
"grad_norm": 0.4753028820261241,
"learning_rate": 5.213632229159227e-07,
"loss": 0.3785,
"step": 3145
},
{
"epoch": 0.9086647676053798,
"grad_norm": 0.4699691806857396,
"learning_rate": 5.054342743874386e-07,
"loss": 0.3617,
"step": 3150
},
{
"epoch": 0.9101070926333249,
"grad_norm": 0.4352496762130561,
"learning_rate": 4.897461406686821e-07,
"loss": 0.3359,
"step": 3155
},
{
"epoch": 0.91154941766127,
"grad_norm": 0.4316421343515809,
"learning_rate": 4.742992196646301e-07,
"loss": 0.3376,
"step": 3160
},
{
"epoch": 0.912991742689215,
"grad_norm": 0.4001287994073788,
"learning_rate": 4.590939031622743e-07,
"loss": 0.3351,
"step": 3165
},
{
"epoch": 0.9144340677171601,
"grad_norm": 0.4363788326973079,
"learning_rate": 4.4413057682068606e-07,
"loss": 0.3473,
"step": 3170
},
{
"epoch": 0.9158763927451051,
"grad_norm": 0.44176842953481193,
"learning_rate": 4.2940962016123524e-07,
"loss": 0.3332,
"step": 3175
},
{
"epoch": 0.9173187177730502,
"grad_norm": 0.43914474716543256,
"learning_rate": 4.149314065579624e-07,
"loss": 0.3383,
"step": 3180
},
{
"epoch": 0.9187610428009952,
"grad_norm": 0.4540079519566383,
"learning_rate": 4.0069630322811303e-07,
"loss": 0.3786,
"step": 3185
},
{
"epoch": 0.9202033678289403,
"grad_norm": 0.4612868459187327,
"learning_rate": 3.867046712228162e-07,
"loss": 0.3625,
"step": 3190
},
{
"epoch": 0.9216456928568852,
"grad_norm": 0.40372545279617805,
"learning_rate": 3.729568654179361e-07,
"loss": 0.3308,
"step": 3195
},
{
"epoch": 0.9230880178848303,
"grad_norm": 0.4204476032972304,
"learning_rate": 3.5945323450506387e-07,
"loss": 0.3346,
"step": 3200
},
{
"epoch": 0.9245303429127754,
"grad_norm": 0.45260198781122246,
"learning_rate": 3.4619412098267693e-07,
"loss": 0.3795,
"step": 3205
},
{
"epoch": 0.9259726679407204,
"grad_norm": 0.42527213346553855,
"learning_rate": 3.331798611474535e-07,
"loss": 0.3421,
"step": 3210
},
{
"epoch": 0.9274149929686655,
"grad_norm": 0.414984415520749,
"learning_rate": 3.204107850857374e-07,
"loss": 0.3291,
"step": 3215
},
{
"epoch": 0.9288573179966105,
"grad_norm": 0.4549260227056393,
"learning_rate": 3.0788721666517365e-07,
"loss": 0.3486,
"step": 3220
},
{
"epoch": 0.9302996430245556,
"grad_norm": 0.4443023622951338,
"learning_rate": 2.9560947352648697e-07,
"loss": 0.3756,
"step": 3225
},
{
"epoch": 0.9317419680525006,
"grad_norm": 0.4250192102717841,
"learning_rate": 2.8357786707542854e-07,
"loss": 0.3525,
"step": 3230
},
{
"epoch": 0.9331842930804457,
"grad_norm": 0.41194820669384097,
"learning_rate": 2.71792702474879e-07,
"loss": 0.3562,
"step": 3235
},
{
"epoch": 0.9346266181083908,
"grad_norm": 0.42277936484045997,
"learning_rate": 2.602542786371065e-07,
"loss": 0.3609,
"step": 3240
},
{
"epoch": 0.9360689431363358,
"grad_norm": 0.402522590339594,
"learning_rate": 2.489628882161832e-07,
"loss": 0.3323,
"step": 3245
},
{
"epoch": 0.9375112681642809,
"grad_norm": 0.42468823176649917,
"learning_rate": 2.3791881760056756e-07,
"loss": 0.3705,
"step": 3250
},
{
"epoch": 0.9389535931922258,
"grad_norm": 0.42563197511583134,
"learning_rate": 2.2712234690583813e-07,
"loss": 0.3635,
"step": 3255
},
{
"epoch": 0.9403959182201709,
"grad_norm": 0.4452148892270775,
"learning_rate": 2.1657374996758795e-07,
"loss": 0.3478,
"step": 3260
},
{
"epoch": 0.9418382432481159,
"grad_norm": 0.4539015567282992,
"learning_rate": 2.0627329433447917e-07,
"loss": 0.3736,
"step": 3265
},
{
"epoch": 0.943280568276061,
"grad_norm": 0.40270803503237657,
"learning_rate": 1.9622124126145837e-07,
"loss": 0.3378,
"step": 3270
},
{
"epoch": 0.944722893304006,
"grad_norm": 0.4075396549757293,
"learning_rate": 1.864178457031318e-07,
"loss": 0.3562,
"step": 3275
},
{
"epoch": 0.9461652183319511,
"grad_norm": 0.43266062909072267,
"learning_rate": 1.768633563072919e-07,
"loss": 0.3451,
"step": 3280
},
{
"epoch": 0.9476075433598962,
"grad_norm": 0.418621662939926,
"learning_rate": 1.6755801540862092e-07,
"loss": 0.334,
"step": 3285
},
{
"epoch": 0.9490498683878412,
"grad_norm": 0.4221481289163581,
"learning_rate": 1.5850205902253613e-07,
"loss": 0.3536,
"step": 3290
},
{
"epoch": 0.9504921934157863,
"grad_norm": 0.40400229300396406,
"learning_rate": 1.4969571683920768e-07,
"loss": 0.3636,
"step": 3295
},
{
"epoch": 0.9519345184437313,
"grad_norm": 0.4142859171614361,
"learning_rate": 1.411392122177302e-07,
"loss": 0.3302,
"step": 3300
},
{
"epoch": 0.9533768434716764,
"grad_norm": 0.4259634616965583,
"learning_rate": 1.3283276218046259e-07,
"loss": 0.3674,
"step": 3305
},
{
"epoch": 0.9548191684996213,
"grad_norm": 0.41429097541392035,
"learning_rate": 1.2477657740751714e-07,
"loss": 0.3483,
"step": 3310
},
{
"epoch": 0.9562614935275664,
"grad_norm": 0.42353387168902784,
"learning_rate": 1.169708622314214e-07,
"loss": 0.3608,
"step": 3315
},
{
"epoch": 0.9577038185555115,
"grad_norm": 0.42693212185785107,
"learning_rate": 1.0941581463193129e-07,
"loss": 0.3452,
"step": 3320
},
{
"epoch": 0.9591461435834565,
"grad_norm": 0.4328702433520352,
"learning_rate": 1.021116262310129e-07,
"loss": 0.3413,
"step": 3325
},
{
"epoch": 0.9605884686114016,
"grad_norm": 0.41956255025855793,
"learning_rate": 9.505848228798076e-08,
"loss": 0.3604,
"step": 3330
},
{
"epoch": 0.9620307936393466,
"grad_norm": 0.4209071869524921,
"learning_rate": 8.825656169480056e-08,
"loss": 0.3384,
"step": 3335
},
{
"epoch": 0.9634731186672917,
"grad_norm": 0.4118105753397592,
"learning_rate": 8.170603697154944e-08,
"loss": 0.3338,
"step": 3340
},
{
"epoch": 0.9649154436952367,
"grad_norm": 0.43817584876124205,
"learning_rate": 7.540707426204163e-08,
"loss": 0.3281,
"step": 3345
},
{
"epoch": 0.9663577687231818,
"grad_norm": 0.3903217050033041,
"learning_rate": 6.935983332961305e-08,
"loss": 0.3308,
"step": 3350
},
{
"epoch": 0.9678000937511269,
"grad_norm": 0.41905865354117233,
"learning_rate": 6.356446755307444e-08,
"loss": 0.3509,
"step": 3355
},
{
"epoch": 0.9692424187790719,
"grad_norm": 0.41394321455611666,
"learning_rate": 5.802112392281123e-08,
"loss": 0.3377,
"step": 3360
},
{
"epoch": 0.9706847438070169,
"grad_norm": 0.4316304666724342,
"learning_rate": 5.272994303706758e-08,
"loss": 0.3592,
"step": 3365
},
{
"epoch": 0.9721270688349619,
"grad_norm": 0.45454272140307556,
"learning_rate": 4.769105909836924e-08,
"loss": 0.3485,
"step": 3370
},
{
"epoch": 0.973569393862907,
"grad_norm": 0.43202485000084534,
"learning_rate": 4.2904599910127406e-08,
"loss": 0.3538,
"step": 3375
},
{
"epoch": 0.975011718890852,
"grad_norm": 0.44712558770756466,
"learning_rate": 3.837068687339351e-08,
"loss": 0.367,
"step": 3380
},
{
"epoch": 0.9764540439187971,
"grad_norm": 0.423193248701901,
"learning_rate": 3.408943498377726e-08,
"loss": 0.3351,
"step": 3385
},
{
"epoch": 0.9778963689467421,
"grad_norm": 0.47037763666404425,
"learning_rate": 3.006095282854116e-08,
"loss": 0.3966,
"step": 3390
},
{
"epoch": 0.9793386939746872,
"grad_norm": 0.4314080592872779,
"learning_rate": 2.628534258383164e-08,
"loss": 0.357,
"step": 3395
},
{
"epoch": 0.9807810190026323,
"grad_norm": 0.45121239415975073,
"learning_rate": 2.2762700012097795e-08,
"loss": 0.3564,
"step": 3400
},
{
"epoch": 0.9822233440305773,
"grad_norm": 0.4226505971917229,
"learning_rate": 1.9493114459659956e-08,
"loss": 0.3625,
"step": 3405
},
{
"epoch": 0.9836656690585224,
"grad_norm": 0.4197713049001792,
"learning_rate": 1.6476668854440435e-08,
"loss": 0.3526,
"step": 3410
},
{
"epoch": 0.9851079940864674,
"grad_norm": 0.4575738762031232,
"learning_rate": 1.3713439703865183e-08,
"loss": 0.3762,
"step": 3415
},
{
"epoch": 0.9865503191144124,
"grad_norm": 0.4574906098764045,
"learning_rate": 1.120349709291868e-08,
"loss": 0.3634,
"step": 3420
},
{
"epoch": 0.9879926441423574,
"grad_norm": 0.43088006927461175,
"learning_rate": 8.946904682370917e-09,
"loss": 0.3675,
"step": 3425
},
{
"epoch": 0.9894349691703025,
"grad_norm": 0.4103449101623024,
"learning_rate": 6.943719707158681e-09,
"loss": 0.3496,
"step": 3430
},
{
"epoch": 0.9908772941982475,
"grad_norm": 0.40469613082222705,
"learning_rate": 5.193992974935613e-09,
"loss": 0.369,
"step": 3435
},
{
"epoch": 0.9923196192261926,
"grad_norm": 0.46076258755412675,
"learning_rate": 3.697768864782125e-09,
"loss": 0.3588,
"step": 3440
},
{
"epoch": 0.9937619442541377,
"grad_norm": 0.4334341619233562,
"learning_rate": 2.4550853260851826e-09,
"loss": 0.3345,
"step": 3445
},
{
"epoch": 0.9952042692820827,
"grad_norm": 0.44568439209243566,
"learning_rate": 1.4659738775679721e-09,
"loss": 0.3459,
"step": 3450
},
{
"epoch": 0.9966465943100278,
"grad_norm": 0.45951543969711284,
"learning_rate": 7.30459606494982e-10,
"loss": 0.3791,
"step": 3455
},
{
"epoch": 0.9980889193379728,
"grad_norm": 0.4459520568434071,
"learning_rate": 2.4856116803695375e-10,
"loss": 0.3525,
"step": 3460
},
{
"epoch": 0.9995312443659179,
"grad_norm": 0.4581327568157757,
"learning_rate": 2.0290784791265893e-11,
"loss": 0.3492,
"step": 3465
},
{
"epoch": 0.9998197093715069,
"step": 3466,
"total_flos": 4977616761913344.0,
"train_loss": 0.6325558101381102,
"train_runtime": 63848.9812,
"train_samples_per_second": 3.475,
"train_steps_per_second": 0.054
}
],
"logging_steps": 5,
"max_steps": 3466,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4977616761913344.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}