|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9998197093715069, |
|
"eval_steps": 500, |
|
"global_step": 3466, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0014423250279450475, |
|
"grad_norm": 23.09968734754774, |
|
"learning_rate": 2.3054755043227666e-07, |
|
"loss": 12.1657, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.002884650055890095, |
|
"grad_norm": 23.350567085111635, |
|
"learning_rate": 5.187319884726226e-07, |
|
"loss": 12.1499, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.004326975083835142, |
|
"grad_norm": 22.840877913954497, |
|
"learning_rate": 8.069164265129684e-07, |
|
"loss": 12.0857, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.00576930011178019, |
|
"grad_norm": 21.40321138460624, |
|
"learning_rate": 1.0951008645533142e-06, |
|
"loss": 11.8028, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.007211625139725237, |
|
"grad_norm": 18.192353108517974, |
|
"learning_rate": 1.3832853025936602e-06, |
|
"loss": 11.3384, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.008653950167670284, |
|
"grad_norm": 18.559232783911973, |
|
"learning_rate": 1.6714697406340058e-06, |
|
"loss": 10.3127, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.010096275195615331, |
|
"grad_norm": 37.79150391064707, |
|
"learning_rate": 1.959654178674352e-06, |
|
"loss": 9.0664, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.01153860022356038, |
|
"grad_norm": 33.772043740311254, |
|
"learning_rate": 2.247838616714698e-06, |
|
"loss": 7.409, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.012980925251505427, |
|
"grad_norm": 23.04632172544007, |
|
"learning_rate": 2.5360230547550434e-06, |
|
"loss": 6.3338, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.014423250279450473, |
|
"grad_norm": 25.32559722397877, |
|
"learning_rate": 2.8242074927953894e-06, |
|
"loss": 4.4908, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.015865575307395522, |
|
"grad_norm": 9.143968031022688, |
|
"learning_rate": 3.1123919308357354e-06, |
|
"loss": 3.2978, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.01730790033534057, |
|
"grad_norm": 2.3359297684099745, |
|
"learning_rate": 3.400576368876081e-06, |
|
"loss": 2.6887, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.018750225363285616, |
|
"grad_norm": 1.5235792893524585, |
|
"learning_rate": 3.6887608069164266e-06, |
|
"loss": 2.6051, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.020192550391230663, |
|
"grad_norm": 1.6452371227737381, |
|
"learning_rate": 3.976945244956772e-06, |
|
"loss": 2.5288, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.021634875419175713, |
|
"grad_norm": 2.3877151673363133, |
|
"learning_rate": 4.265129682997119e-06, |
|
"loss": 2.4368, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.02307720044712076, |
|
"grad_norm": 3.5448230000902283, |
|
"learning_rate": 4.553314121037464e-06, |
|
"loss": 2.2394, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.024519525475065806, |
|
"grad_norm": 3.998099329525319, |
|
"learning_rate": 4.84149855907781e-06, |
|
"loss": 2.0687, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.025961850503010853, |
|
"grad_norm": 5.3900301279889025, |
|
"learning_rate": 5.129682997118156e-06, |
|
"loss": 2.0427, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0274041755309559, |
|
"grad_norm": 7.2317244995568215, |
|
"learning_rate": 5.417867435158502e-06, |
|
"loss": 1.9167, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.028846500558900947, |
|
"grad_norm": 4.190169407947923, |
|
"learning_rate": 5.706051873198848e-06, |
|
"loss": 1.8528, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.030288825586845997, |
|
"grad_norm": 5.165106554451897, |
|
"learning_rate": 5.994236311239193e-06, |
|
"loss": 1.8751, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.031731150614791044, |
|
"grad_norm": 3.2421300129897426, |
|
"learning_rate": 6.2824207492795395e-06, |
|
"loss": 1.7973, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.03317347564273609, |
|
"grad_norm": 4.460292781455887, |
|
"learning_rate": 6.570605187319885e-06, |
|
"loss": 1.6292, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.03461580067068114, |
|
"grad_norm": 4.913131259117871, |
|
"learning_rate": 6.8587896253602315e-06, |
|
"loss": 1.655, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.03605812569862619, |
|
"grad_norm": 4.1881116653103945, |
|
"learning_rate": 7.146974063400577e-06, |
|
"loss": 1.664, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.03750045072657123, |
|
"grad_norm": 5.723431293294362, |
|
"learning_rate": 7.4351585014409235e-06, |
|
"loss": 1.6202, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.03894277575451628, |
|
"grad_norm": 4.909602119186479, |
|
"learning_rate": 7.723342939481268e-06, |
|
"loss": 1.5486, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.040385100782461325, |
|
"grad_norm": 5.928676345818394, |
|
"learning_rate": 8.011527377521614e-06, |
|
"loss": 1.4965, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.041827425810406375, |
|
"grad_norm": 5.5830317263384845, |
|
"learning_rate": 8.299711815561961e-06, |
|
"loss": 1.4195, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.043269750838351426, |
|
"grad_norm": 5.587820490379444, |
|
"learning_rate": 8.587896253602305e-06, |
|
"loss": 1.3894, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.04471207586629647, |
|
"grad_norm": 3.5851612990900836, |
|
"learning_rate": 8.876080691642652e-06, |
|
"loss": 1.4654, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.04615440089424152, |
|
"grad_norm": 4.792344497245253, |
|
"learning_rate": 9.164265129682998e-06, |
|
"loss": 1.3801, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.04759672592218656, |
|
"grad_norm": 3.5644574463856387, |
|
"learning_rate": 9.452449567723344e-06, |
|
"loss": 1.3527, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.04903905095013161, |
|
"grad_norm": 4.245088356022904, |
|
"learning_rate": 9.740634005763689e-06, |
|
"loss": 1.3465, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.050481375978076656, |
|
"grad_norm": 4.623244884122231, |
|
"learning_rate": 1.0028818443804036e-05, |
|
"loss": 1.3647, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.05192370100602171, |
|
"grad_norm": 3.5591972450196043, |
|
"learning_rate": 1.031700288184438e-05, |
|
"loss": 1.261, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.05336602603396676, |
|
"grad_norm": 3.6288737317693243, |
|
"learning_rate": 1.0605187319884726e-05, |
|
"loss": 1.2178, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.0548083510619118, |
|
"grad_norm": 5.472679192029011, |
|
"learning_rate": 1.0893371757925073e-05, |
|
"loss": 1.2372, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.05625067608985685, |
|
"grad_norm": 2.987171924181164, |
|
"learning_rate": 1.1181556195965419e-05, |
|
"loss": 1.1878, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.057693001117801894, |
|
"grad_norm": 3.633711033064426, |
|
"learning_rate": 1.1469740634005764e-05, |
|
"loss": 1.1895, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.059135326145746944, |
|
"grad_norm": 3.9402926571067978, |
|
"learning_rate": 1.175792507204611e-05, |
|
"loss": 1.1368, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.060577651173691995, |
|
"grad_norm": 3.527134311033913, |
|
"learning_rate": 1.2046109510086457e-05, |
|
"loss": 1.1306, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.06201997620163704, |
|
"grad_norm": 3.679407663475352, |
|
"learning_rate": 1.2334293948126803e-05, |
|
"loss": 1.0846, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.06346230122958209, |
|
"grad_norm": 3.1104059182965047, |
|
"learning_rate": 1.2622478386167147e-05, |
|
"loss": 1.1201, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.06490462625752713, |
|
"grad_norm": 4.203869282005421, |
|
"learning_rate": 1.2910662824207494e-05, |
|
"loss": 1.0694, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.06634695128547217, |
|
"grad_norm": 3.936128919901792, |
|
"learning_rate": 1.319884726224784e-05, |
|
"loss": 1.0191, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.06778927631341723, |
|
"grad_norm": 2.2362445033305804, |
|
"learning_rate": 1.3487031700288185e-05, |
|
"loss": 0.9774, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.06923160134136228, |
|
"grad_norm": 2.757438827888907, |
|
"learning_rate": 1.377521613832853e-05, |
|
"loss": 1.0124, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.07067392636930732, |
|
"grad_norm": 3.4599226565163783, |
|
"learning_rate": 1.4063400576368878e-05, |
|
"loss": 0.9295, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.07211625139725238, |
|
"grad_norm": 2.0262096895794963, |
|
"learning_rate": 1.4351585014409224e-05, |
|
"loss": 0.9118, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.07355857642519742, |
|
"grad_norm": 2.487400868386021, |
|
"learning_rate": 1.4639769452449568e-05, |
|
"loss": 0.9409, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.07500090145314246, |
|
"grad_norm": 1.9303088742335475, |
|
"learning_rate": 1.4927953890489915e-05, |
|
"loss": 0.9211, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.0764432264810875, |
|
"grad_norm": 2.175412817851971, |
|
"learning_rate": 1.521613832853026e-05, |
|
"loss": 0.9168, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.07788555150903256, |
|
"grad_norm": 2.5796504124225033, |
|
"learning_rate": 1.5504322766570608e-05, |
|
"loss": 0.9527, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.0793278765369776, |
|
"grad_norm": 1.9788435183920994, |
|
"learning_rate": 1.5792507204610953e-05, |
|
"loss": 0.8426, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.08077020156492265, |
|
"grad_norm": 2.003074548053739, |
|
"learning_rate": 1.60806916426513e-05, |
|
"loss": 0.8527, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.08221252659286771, |
|
"grad_norm": 2.1994335722383602, |
|
"learning_rate": 1.6368876080691644e-05, |
|
"loss": 0.8072, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.08365485162081275, |
|
"grad_norm": 1.726445070641134, |
|
"learning_rate": 1.665706051873199e-05, |
|
"loss": 0.8163, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.0850971766487578, |
|
"grad_norm": 2.350691118327581, |
|
"learning_rate": 1.6945244956772336e-05, |
|
"loss": 0.7651, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.08653950167670285, |
|
"grad_norm": 2.6639655167915115, |
|
"learning_rate": 1.723342939481268e-05, |
|
"loss": 0.7535, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0879818267046479, |
|
"grad_norm": 1.3919563172463725, |
|
"learning_rate": 1.7521613832853027e-05, |
|
"loss": 0.785, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.08942415173259294, |
|
"grad_norm": 1.2944766289360783, |
|
"learning_rate": 1.7809798270893372e-05, |
|
"loss": 0.7111, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.09086647676053798, |
|
"grad_norm": 1.4798988266070112, |
|
"learning_rate": 1.8097982708933718e-05, |
|
"loss": 0.7293, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.09230880178848304, |
|
"grad_norm": 1.1830162313483426, |
|
"learning_rate": 1.8386167146974067e-05, |
|
"loss": 0.7231, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.09375112681642808, |
|
"grad_norm": 1.5568610974134778, |
|
"learning_rate": 1.867435158501441e-05, |
|
"loss": 0.7445, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.09519345184437313, |
|
"grad_norm": 1.1492164494899182, |
|
"learning_rate": 1.8962536023054755e-05, |
|
"loss": 0.6959, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.09663577687231818, |
|
"grad_norm": 1.0978857201097723, |
|
"learning_rate": 1.9250720461095104e-05, |
|
"loss": 0.7057, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.09807810190026323, |
|
"grad_norm": 1.0096489653703298, |
|
"learning_rate": 1.953890489913545e-05, |
|
"loss": 0.6772, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.09952042692820827, |
|
"grad_norm": 1.1232844613521993, |
|
"learning_rate": 1.9827089337175795e-05, |
|
"loss": 0.7246, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.10096275195615331, |
|
"grad_norm": 1.02243795388932, |
|
"learning_rate": 1.9999979709215212e-05, |
|
"loss": 0.7024, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.10240507698409837, |
|
"grad_norm": 1.1367801539352143, |
|
"learning_rate": 1.9999751438831965e-05, |
|
"loss": 0.6489, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.10384740201204341, |
|
"grad_norm": 1.1572043181625398, |
|
"learning_rate": 1.9999269540393507e-05, |
|
"loss": 0.6489, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.10528972703998846, |
|
"grad_norm": 1.0269240416486167, |
|
"learning_rate": 1.9998534026122433e-05, |
|
"loss": 0.6782, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.10673205206793351, |
|
"grad_norm": 0.9511160065038861, |
|
"learning_rate": 1.9997544914673915e-05, |
|
"loss": 0.6312, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.10817437709587856, |
|
"grad_norm": 1.1374311508874984, |
|
"learning_rate": 1.999630223113522e-05, |
|
"loss": 0.6628, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.1096167021238236, |
|
"grad_norm": 1.450941328478541, |
|
"learning_rate": 1.9994806007025068e-05, |
|
"loss": 0.6389, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.11105902715176866, |
|
"grad_norm": 0.8046806001901237, |
|
"learning_rate": 1.9993056280292845e-05, |
|
"loss": 0.6482, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.1125013521797137, |
|
"grad_norm": 0.8216403494158578, |
|
"learning_rate": 1.999105309531763e-05, |
|
"loss": 0.6078, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.11394367720765874, |
|
"grad_norm": 0.8600864577290717, |
|
"learning_rate": 1.9988796502907083e-05, |
|
"loss": 0.63, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.11538600223560379, |
|
"grad_norm": 0.798579467879802, |
|
"learning_rate": 1.9986286560296134e-05, |
|
"loss": 0.6109, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.11682832726354885, |
|
"grad_norm": 0.7668970837973854, |
|
"learning_rate": 1.998352333114556e-05, |
|
"loss": 0.5857, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.11827065229149389, |
|
"grad_norm": 1.0143366745206854, |
|
"learning_rate": 1.998050688554034e-05, |
|
"loss": 0.6176, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.11971297731943893, |
|
"grad_norm": 0.7114180483975799, |
|
"learning_rate": 1.9977237299987903e-05, |
|
"loss": 0.62, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.12115530234738399, |
|
"grad_norm": 0.8179413343809848, |
|
"learning_rate": 1.997371465741617e-05, |
|
"loss": 0.6205, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.12259762737532903, |
|
"grad_norm": 0.6435940720725398, |
|
"learning_rate": 1.996993904717146e-05, |
|
"loss": 0.5878, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.12403995240327408, |
|
"grad_norm": 0.9102246188273324, |
|
"learning_rate": 1.9965910565016223e-05, |
|
"loss": 0.6021, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.12548227743121912, |
|
"grad_norm": 0.6153476600060466, |
|
"learning_rate": 1.9961629313126608e-05, |
|
"loss": 0.5674, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.12692460245916418, |
|
"grad_norm": 0.5823753109992822, |
|
"learning_rate": 1.9957095400089875e-05, |
|
"loss": 0.5819, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.12836692748710923, |
|
"grad_norm": 0.6280650049871973, |
|
"learning_rate": 1.9952308940901634e-05, |
|
"loss": 0.6357, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.12980925251505426, |
|
"grad_norm": 1.12163730124818, |
|
"learning_rate": 1.9947270056962934e-05, |
|
"loss": 0.5659, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.13125157754299932, |
|
"grad_norm": 0.8453741002711367, |
|
"learning_rate": 1.994197887607719e-05, |
|
"loss": 0.5423, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.13269390257094435, |
|
"grad_norm": 0.6945577095672939, |
|
"learning_rate": 1.993643553244693e-05, |
|
"loss": 0.6118, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.1341362275988894, |
|
"grad_norm": 0.6080087347638511, |
|
"learning_rate": 1.993064016667039e-05, |
|
"loss": 0.5912, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.13557855262683446, |
|
"grad_norm": 0.5072027520003524, |
|
"learning_rate": 1.992459292573796e-05, |
|
"loss": 0.6086, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.1370208776547795, |
|
"grad_norm": 0.5194397753829619, |
|
"learning_rate": 1.991829396302845e-05, |
|
"loss": 0.5554, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.13846320268272455, |
|
"grad_norm": 0.6531400636419847, |
|
"learning_rate": 1.9911743438305203e-05, |
|
"loss": 0.5738, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.1399055277106696, |
|
"grad_norm": 0.8007993447245763, |
|
"learning_rate": 1.990494151771202e-05, |
|
"loss": 0.5698, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.14134785273861464, |
|
"grad_norm": 0.7192330669398362, |
|
"learning_rate": 1.989788837376899e-05, |
|
"loss": 0.5629, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.1427901777665597, |
|
"grad_norm": 0.688440868686088, |
|
"learning_rate": 1.989058418536807e-05, |
|
"loss": 0.5734, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.14423250279450475, |
|
"grad_norm": 1.001172764554856, |
|
"learning_rate": 1.988302913776858e-05, |
|
"loss": 0.5745, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.14423250279450475, |
|
"eval_loss": 0.568706750869751, |
|
"eval_runtime": 161.3667, |
|
"eval_samples_per_second": 11.161, |
|
"eval_steps_per_second": 2.795, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.14567482782244978, |
|
"grad_norm": 1.0515733209433527, |
|
"learning_rate": 1.9875223422592485e-05, |
|
"loss": 0.5704, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.14711715285039484, |
|
"grad_norm": 1.0276945765068186, |
|
"learning_rate": 1.986716723781954e-05, |
|
"loss": 0.6123, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.1485594778783399, |
|
"grad_norm": 0.8043743845845657, |
|
"learning_rate": 1.985886078778227e-05, |
|
"loss": 0.5437, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.15000180290628493, |
|
"grad_norm": 0.6535595881064415, |
|
"learning_rate": 1.9850304283160793e-05, |
|
"loss": 0.5527, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.15144412793422998, |
|
"grad_norm": 0.7357564272936004, |
|
"learning_rate": 1.9841497940977464e-05, |
|
"loss": 0.5432, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.152886452962175, |
|
"grad_norm": 0.7287222676647807, |
|
"learning_rate": 1.983244198459138e-05, |
|
"loss": 0.5811, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.15432877799012007, |
|
"grad_norm": 0.5697752505815841, |
|
"learning_rate": 1.982313664369271e-05, |
|
"loss": 0.5627, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.15577110301806513, |
|
"grad_norm": 0.5170616797914624, |
|
"learning_rate": 1.981358215429687e-05, |
|
"loss": 0.5592, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.15721342804601016, |
|
"grad_norm": 0.619913426569597, |
|
"learning_rate": 1.9803778758738543e-05, |
|
"loss": 0.5435, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.1586557530739552, |
|
"grad_norm": 0.9727823301261521, |
|
"learning_rate": 1.9793726705665524e-05, |
|
"loss": 0.5889, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.16009807810190027, |
|
"grad_norm": 0.6044688838902901, |
|
"learning_rate": 1.9783426250032412e-05, |
|
"loss": 0.5678, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.1615404031298453, |
|
"grad_norm": 0.46024598144245266, |
|
"learning_rate": 1.9772877653094165e-05, |
|
"loss": 0.5639, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.16298272815779036, |
|
"grad_norm": 0.45100341602786603, |
|
"learning_rate": 1.9762081182399434e-05, |
|
"loss": 0.5717, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.16442505318573541, |
|
"grad_norm": 0.5540308655652189, |
|
"learning_rate": 1.9751037111783818e-05, |
|
"loss": 0.5623, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.16586737821368044, |
|
"grad_norm": 0.43976603899998645, |
|
"learning_rate": 1.9739745721362897e-05, |
|
"loss": 0.5319, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.1673097032416255, |
|
"grad_norm": 0.4612500025708451, |
|
"learning_rate": 1.9728207297525125e-05, |
|
"loss": 0.5653, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.16875202826957056, |
|
"grad_norm": 0.5752333041985558, |
|
"learning_rate": 1.9716422132924572e-05, |
|
"loss": 0.567, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.1701943532975156, |
|
"grad_norm": 0.5369943570453672, |
|
"learning_rate": 1.9704390526473515e-05, |
|
"loss": 0.5609, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.17163667832546065, |
|
"grad_norm": 0.5164720235053389, |
|
"learning_rate": 1.9692112783334826e-05, |
|
"loss": 0.5415, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.1730790033534057, |
|
"grad_norm": 0.7665382521888024, |
|
"learning_rate": 1.967958921491426e-05, |
|
"loss": 0.5671, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.17452132838135073, |
|
"grad_norm": 0.6256340257615823, |
|
"learning_rate": 1.966682013885255e-05, |
|
"loss": 0.5533, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.1759636534092958, |
|
"grad_norm": 0.4893424331522886, |
|
"learning_rate": 1.9653805879017323e-05, |
|
"loss": 0.5589, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.17740597843724082, |
|
"grad_norm": 0.4930248858437027, |
|
"learning_rate": 1.964054676549494e-05, |
|
"loss": 0.5418, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.17884830346518588, |
|
"grad_norm": 0.45814407628412845, |
|
"learning_rate": 1.9627043134582068e-05, |
|
"loss": 0.5195, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.18029062849313093, |
|
"grad_norm": 0.5315704703868885, |
|
"learning_rate": 1.9613295328777187e-05, |
|
"loss": 0.5095, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.18173295352107596, |
|
"grad_norm": 0.43146076740416167, |
|
"learning_rate": 1.959930369677189e-05, |
|
"loss": 0.4929, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.18317527854902102, |
|
"grad_norm": 0.4627882494650573, |
|
"learning_rate": 1.958506859344204e-05, |
|
"loss": 0.5141, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.18461760357696608, |
|
"grad_norm": 0.621672972720691, |
|
"learning_rate": 1.9570590379838767e-05, |
|
"loss": 0.5486, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.1860599286049111, |
|
"grad_norm": 0.5063460018719447, |
|
"learning_rate": 1.9555869423179316e-05, |
|
"loss": 0.5497, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.18750225363285616, |
|
"grad_norm": 0.48895947210824475, |
|
"learning_rate": 1.9540906096837727e-05, |
|
"loss": 0.5465, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.18894457866080122, |
|
"grad_norm": 0.47357663586358684, |
|
"learning_rate": 1.9525700780335372e-05, |
|
"loss": 0.529, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.19038690368874625, |
|
"grad_norm": 0.43786638884850015, |
|
"learning_rate": 1.951025385933132e-05, |
|
"loss": 0.522, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.1918292287166913, |
|
"grad_norm": 0.5828551791972233, |
|
"learning_rate": 1.9494565725612565e-05, |
|
"loss": 0.5334, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.19327155374463637, |
|
"grad_norm": 0.4669699168406431, |
|
"learning_rate": 1.9478636777084077e-05, |
|
"loss": 0.4846, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.1947138787725814, |
|
"grad_norm": 0.5626195687859905, |
|
"learning_rate": 1.946246741775873e-05, |
|
"loss": 0.556, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.19615620380052645, |
|
"grad_norm": 0.5482755680769119, |
|
"learning_rate": 1.9446058057747025e-05, |
|
"loss": 0.4561, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.1975985288284715, |
|
"grad_norm": 0.4878018831010534, |
|
"learning_rate": 1.9429409113246715e-05, |
|
"loss": 0.526, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.19904085385641654, |
|
"grad_norm": 0.7436357434374212, |
|
"learning_rate": 1.9412521006532245e-05, |
|
"loss": 0.5088, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.2004831788843616, |
|
"grad_norm": 0.45530676409796045, |
|
"learning_rate": 1.939539416594402e-05, |
|
"loss": 0.5214, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.20192550391230663, |
|
"grad_norm": 0.6302948823981896, |
|
"learning_rate": 1.937802902587757e-05, |
|
"loss": 0.5591, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.20336782894025168, |
|
"grad_norm": 0.4921513503843826, |
|
"learning_rate": 1.936042602677251e-05, |
|
"loss": 0.5288, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.20481015396819674, |
|
"grad_norm": 0.5421091687931597, |
|
"learning_rate": 1.934258561510138e-05, |
|
"loss": 0.5151, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.20625247899614177, |
|
"grad_norm": 0.7576428493111558, |
|
"learning_rate": 1.932450824335832e-05, |
|
"loss": 0.477, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.20769480402408683, |
|
"grad_norm": 0.424961853700426, |
|
"learning_rate": 1.9306194370047592e-05, |
|
"loss": 0.5342, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.20913712905203188, |
|
"grad_norm": 0.49906945581307455, |
|
"learning_rate": 1.9287644459671948e-05, |
|
"loss": 0.5334, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.2105794540799769, |
|
"grad_norm": 0.46177937508565325, |
|
"learning_rate": 1.926885898272085e-05, |
|
"loss": 0.4989, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.21202177910792197, |
|
"grad_norm": 0.4920606306275181, |
|
"learning_rate": 1.9249838415658543e-05, |
|
"loss": 0.5448, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.21346410413586703, |
|
"grad_norm": 0.4191101613829332, |
|
"learning_rate": 1.9230583240911954e-05, |
|
"loss": 0.4694, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.21490642916381206, |
|
"grad_norm": 0.48817506876963557, |
|
"learning_rate": 1.9211093946858484e-05, |
|
"loss": 0.5173, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.21634875419175711, |
|
"grad_norm": 0.5126984233381934, |
|
"learning_rate": 1.919137102781359e-05, |
|
"loss": 0.5074, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.21779107921970217, |
|
"grad_norm": 0.5334260917924061, |
|
"learning_rate": 1.9171414984018266e-05, |
|
"loss": 0.4917, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.2192334042476472, |
|
"grad_norm": 0.5501541841297073, |
|
"learning_rate": 1.915122632162635e-05, |
|
"loss": 0.5152, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.22067572927559226, |
|
"grad_norm": 0.4359723210170646, |
|
"learning_rate": 1.913080555269169e-05, |
|
"loss": 0.5215, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.22211805430353732, |
|
"grad_norm": 0.5662077360043514, |
|
"learning_rate": 1.911015319515515e-05, |
|
"loss": 0.5253, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.22356037933148235, |
|
"grad_norm": 0.4764077159702808, |
|
"learning_rate": 1.908926977283148e-05, |
|
"loss": 0.5066, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.2250027043594274, |
|
"grad_norm": 0.5639009005172965, |
|
"learning_rate": 1.9068155815396018e-05, |
|
"loss": 0.474, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.22644502938737243, |
|
"grad_norm": 0.6776509031874417, |
|
"learning_rate": 1.904681185837128e-05, |
|
"loss": 0.5025, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.2278873544153175, |
|
"grad_norm": 0.3940863617407268, |
|
"learning_rate": 1.9025238443113346e-05, |
|
"loss": 0.4781, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.22932967944326255, |
|
"grad_norm": 0.5731371374463607, |
|
"learning_rate": 1.9003436116798156e-05, |
|
"loss": 0.5325, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.23077200447120758, |
|
"grad_norm": 0.44630504407580995, |
|
"learning_rate": 1.898140543240762e-05, |
|
"loss": 0.5094, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.23221432949915263, |
|
"grad_norm": 0.5013841323056458, |
|
"learning_rate": 1.8959146948715582e-05, |
|
"loss": 0.5123, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.2336566545270977, |
|
"grad_norm": 0.6517172353158069, |
|
"learning_rate": 1.8936661230273677e-05, |
|
"loss": 0.4944, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.23509897955504272, |
|
"grad_norm": 0.5321704297258375, |
|
"learning_rate": 1.8913948847396978e-05, |
|
"loss": 0.5111, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.23654130458298778, |
|
"grad_norm": 0.5733385459091142, |
|
"learning_rate": 1.8891010376149554e-05, |
|
"loss": 0.5255, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.23798362961093283, |
|
"grad_norm": 0.6439828549708082, |
|
"learning_rate": 1.8867846398329856e-05, |
|
"loss": 0.5224, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.23942595463887786, |
|
"grad_norm": 0.526933741666615, |
|
"learning_rate": 1.884445750145595e-05, |
|
"loss": 0.4987, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.24086827966682292, |
|
"grad_norm": 0.4358091890203275, |
|
"learning_rate": 1.882084427875062e-05, |
|
"loss": 0.5151, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.24231060469476798, |
|
"grad_norm": 0.42052312366605993, |
|
"learning_rate": 1.8797007329126336e-05, |
|
"loss": 0.5292, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.243752929722713, |
|
"grad_norm": 0.5162254671712243, |
|
"learning_rate": 1.8772947257170034e-05, |
|
"loss": 0.4701, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.24519525475065806, |
|
"grad_norm": 0.41421320556868774, |
|
"learning_rate": 1.8748664673127814e-05, |
|
"loss": 0.4869, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.2466375797786031, |
|
"grad_norm": 0.44489422959937447, |
|
"learning_rate": 1.872416019288944e-05, |
|
"loss": 0.5107, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.24807990480654815, |
|
"grad_norm": 0.5131502882549939, |
|
"learning_rate": 1.8699434437972726e-05, |
|
"loss": 0.5002, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.2495222298344932, |
|
"grad_norm": 0.4410628046298298, |
|
"learning_rate": 1.8674488035507776e-05, |
|
"loss": 0.5033, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.25096455486243824, |
|
"grad_norm": 0.424822720640458, |
|
"learning_rate": 1.864932161822107e-05, |
|
"loss": 0.459, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.2524068798903833, |
|
"grad_norm": 0.546763650924181, |
|
"learning_rate": 1.8623935824419416e-05, |
|
"loss": 0.4782, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.25384920491832835, |
|
"grad_norm": 0.571446149303962, |
|
"learning_rate": 1.859833129797378e-05, |
|
"loss": 0.4971, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.2552915299462734, |
|
"grad_norm": 0.3881051890411508, |
|
"learning_rate": 1.857250868830292e-05, |
|
"loss": 0.4645, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.25673385497421847, |
|
"grad_norm": 0.4365270093969844, |
|
"learning_rate": 1.8546468650356947e-05, |
|
"loss": 0.4999, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.25817618000216347, |
|
"grad_norm": 0.39922925876114046, |
|
"learning_rate": 1.852021184460069e-05, |
|
"loss": 0.4607, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.2596185050301085, |
|
"grad_norm": 0.4385372209974039, |
|
"learning_rate": 1.849373893699697e-05, |
|
"loss": 0.5032, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.2610608300580536, |
|
"grad_norm": 0.4289486219739114, |
|
"learning_rate": 1.8467050598989677e-05, |
|
"loss": 0.5003, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.26250315508599864, |
|
"grad_norm": 0.4045886984758963, |
|
"learning_rate": 1.8440147507486765e-05, |
|
"loss": 0.4644, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.2639454801139437, |
|
"grad_norm": 0.43637212820672877, |
|
"learning_rate": 1.8413030344843064e-05, |
|
"loss": 0.5057, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.2653878051418887, |
|
"grad_norm": 0.468355616591299, |
|
"learning_rate": 1.838569979884301e-05, |
|
"loss": 0.4967, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.26683013016983376, |
|
"grad_norm": 0.4257178939942325, |
|
"learning_rate": 1.835815656268314e-05, |
|
"loss": 0.4848, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.2682724551977788, |
|
"grad_norm": 0.6504232751090008, |
|
"learning_rate": 1.8330401334954567e-05, |
|
"loss": 0.4958, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.26971478022572387, |
|
"grad_norm": 0.4492644770064815, |
|
"learning_rate": 1.8302434819625234e-05, |
|
"loss": 0.4868, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.27115710525366893, |
|
"grad_norm": 0.37095796426726924, |
|
"learning_rate": 1.8274257726022054e-05, |
|
"loss": 0.4472, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.272599430281614, |
|
"grad_norm": 0.4070852473871566, |
|
"learning_rate": 1.824587076881294e-05, |
|
"loss": 0.4686, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.274041755309559, |
|
"grad_norm": 0.44023807834971757, |
|
"learning_rate": 1.821727466798867e-05, |
|
"loss": 0.471, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.27548408033750404, |
|
"grad_norm": 0.5209872184391927, |
|
"learning_rate": 1.8188470148844602e-05, |
|
"loss": 0.4962, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.2769264053654491, |
|
"grad_norm": 0.41685090109899176, |
|
"learning_rate": 1.8159457941962325e-05, |
|
"loss": 0.475, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.27836873039339416, |
|
"grad_norm": 0.5171250899115861, |
|
"learning_rate": 1.8130238783191087e-05, |
|
"loss": 0.5163, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.2798110554213392, |
|
"grad_norm": 0.47139497814149867, |
|
"learning_rate": 1.810081341362915e-05, |
|
"loss": 0.4641, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.2812533804492843, |
|
"grad_norm": 0.3879518437836758, |
|
"learning_rate": 1.8071182579604986e-05, |
|
"loss": 0.4777, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.2826957054772293, |
|
"grad_norm": 0.455341690737865, |
|
"learning_rate": 1.804134703265836e-05, |
|
"loss": 0.5271, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.28413803050517433, |
|
"grad_norm": 0.39108612071221016, |
|
"learning_rate": 1.8011307529521255e-05, |
|
"loss": 0.4645, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.2855803555331194, |
|
"grad_norm": 0.3865948965496386, |
|
"learning_rate": 1.7981064832098687e-05, |
|
"loss": 0.4578, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.28702268056106445, |
|
"grad_norm": 0.40375523747783393, |
|
"learning_rate": 1.7950619707449374e-05, |
|
"loss": 0.4923, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.2884650055890095, |
|
"grad_norm": 0.3376017909117174, |
|
"learning_rate": 1.7919972927766288e-05, |
|
"loss": 0.4658, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2884650055890095, |
|
"eval_loss": 0.4833250343799591, |
|
"eval_runtime": 142.0125, |
|
"eval_samples_per_second": 12.682, |
|
"eval_steps_per_second": 3.176, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2899073306169545, |
|
"grad_norm": 0.47138251586932034, |
|
"learning_rate": 1.7889125270357053e-05, |
|
"loss": 0.4851, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.29134965564489956, |
|
"grad_norm": 0.522686359505293, |
|
"learning_rate": 1.7858077517624265e-05, |
|
"loss": 0.4788, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.2927919806728446, |
|
"grad_norm": 0.6355398882354177, |
|
"learning_rate": 1.7826830457045608e-05, |
|
"loss": 0.4525, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.2942343057007897, |
|
"grad_norm": 0.44577505392395406, |
|
"learning_rate": 1.7795384881153896e-05, |
|
"loss": 0.4614, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.29567663072873474, |
|
"grad_norm": 0.454859759409631, |
|
"learning_rate": 1.7763741587516983e-05, |
|
"loss": 0.5021, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.2971189557566798, |
|
"grad_norm": 0.6161570485074761, |
|
"learning_rate": 1.7731901378717523e-05, |
|
"loss": 0.4903, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.2985612807846248, |
|
"grad_norm": 0.43940664169854093, |
|
"learning_rate": 1.769986506233261e-05, |
|
"loss": 0.4819, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.30000360581256985, |
|
"grad_norm": 0.4426640967510136, |
|
"learning_rate": 1.7667633450913307e-05, |
|
"loss": 0.4579, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.3014459308405149, |
|
"grad_norm": 0.5064920131450599, |
|
"learning_rate": 1.763520736196402e-05, |
|
"loss": 0.5066, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.30288825586845997, |
|
"grad_norm": 0.3628170152752897, |
|
"learning_rate": 1.7602587617921785e-05, |
|
"loss": 0.423, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.304330580896405, |
|
"grad_norm": 0.4756441564342862, |
|
"learning_rate": 1.7569775046135388e-05, |
|
"loss": 0.5278, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.30577290592435, |
|
"grad_norm": 0.40932967287449395, |
|
"learning_rate": 1.753677047884439e-05, |
|
"loss": 0.4565, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.3072152309522951, |
|
"grad_norm": 0.4148447936276441, |
|
"learning_rate": 1.7503574753158022e-05, |
|
"loss": 0.4819, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.30865755598024014, |
|
"grad_norm": 0.3868133979093347, |
|
"learning_rate": 1.747018871103395e-05, |
|
"loss": 0.4707, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.3100998810081852, |
|
"grad_norm": 0.39630255989567886, |
|
"learning_rate": 1.743661319925691e-05, |
|
"loss": 0.4387, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.31154220603613025, |
|
"grad_norm": 0.4233553435649959, |
|
"learning_rate": 1.7402849069417246e-05, |
|
"loss": 0.465, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.3129845310640753, |
|
"grad_norm": 0.37304393376464795, |
|
"learning_rate": 1.7368897177889307e-05, |
|
"loss": 0.4854, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.3144268560920203, |
|
"grad_norm": 0.41669096423193014, |
|
"learning_rate": 1.7334758385809715e-05, |
|
"loss": 0.4369, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.31586918111996537, |
|
"grad_norm": 0.3950040493214593, |
|
"learning_rate": 1.7300433559055533e-05, |
|
"loss": 0.4488, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.3173115061479104, |
|
"grad_norm": 0.4206456914262744, |
|
"learning_rate": 1.7265923568222315e-05, |
|
"loss": 0.4608, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.3187538311758555, |
|
"grad_norm": 0.5459001712618055, |
|
"learning_rate": 1.7231229288602e-05, |
|
"loss": 0.4419, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.32019615620380054, |
|
"grad_norm": 0.4002983479690819, |
|
"learning_rate": 1.7196351600160725e-05, |
|
"loss": 0.4575, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.3216384812317456, |
|
"grad_norm": 0.5400371185813517, |
|
"learning_rate": 1.716129138751651e-05, |
|
"loss": 0.4402, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.3230808062596906, |
|
"grad_norm": 0.4526337203461876, |
|
"learning_rate": 1.712604953991681e-05, |
|
"loss": 0.4923, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.32452313128763566, |
|
"grad_norm": 0.3924148895626424, |
|
"learning_rate": 1.709062695121597e-05, |
|
"loss": 0.4734, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.3259654563155807, |
|
"grad_norm": 0.45730078891879783, |
|
"learning_rate": 1.7055024519852554e-05, |
|
"loss": 0.4935, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.32740778134352577, |
|
"grad_norm": 0.41765126413107173, |
|
"learning_rate": 1.7019243148826547e-05, |
|
"loss": 0.4778, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.32885010637147083, |
|
"grad_norm": 0.48822731606676767, |
|
"learning_rate": 1.6983283745676464e-05, |
|
"loss": 0.4786, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.33029243139941583, |
|
"grad_norm": 0.47444702764857977, |
|
"learning_rate": 1.6947147222456318e-05, |
|
"loss": 0.4732, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.3317347564273609, |
|
"grad_norm": 0.36819652961308474, |
|
"learning_rate": 1.6910834495712504e-05, |
|
"loss": 0.49, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.33317708145530595, |
|
"grad_norm": 0.3963647053897705, |
|
"learning_rate": 1.6874346486460543e-05, |
|
"loss": 0.4599, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.334619406483251, |
|
"grad_norm": 0.3557684139157355, |
|
"learning_rate": 1.6837684120161723e-05, |
|
"loss": 0.4603, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.33606173151119606, |
|
"grad_norm": 0.42399774345522806, |
|
"learning_rate": 1.680084832669962e-05, |
|
"loss": 0.4322, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.3375040565391411, |
|
"grad_norm": 0.4013586249486658, |
|
"learning_rate": 1.6763840040356522e-05, |
|
"loss": 0.4398, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.3389463815670861, |
|
"grad_norm": 0.44604773948712173, |
|
"learning_rate": 1.6726660199789733e-05, |
|
"loss": 0.4265, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.3403887065950312, |
|
"grad_norm": 0.39551679284847074, |
|
"learning_rate": 1.6689309748007753e-05, |
|
"loss": 0.4418, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.34183103162297623, |
|
"grad_norm": 0.451264115692116, |
|
"learning_rate": 1.6651789632346377e-05, |
|
"loss": 0.4483, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.3432733566509213, |
|
"grad_norm": 0.4689614820007113, |
|
"learning_rate": 1.6614100804444657e-05, |
|
"loss": 0.467, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.34471568167886635, |
|
"grad_norm": 0.3841720473679624, |
|
"learning_rate": 1.6576244220220763e-05, |
|
"loss": 0.4313, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.3461580067068114, |
|
"grad_norm": 0.4091561009628973, |
|
"learning_rate": 1.6538220839847745e-05, |
|
"loss": 0.434, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.3476003317347564, |
|
"grad_norm": 0.4473483816905544, |
|
"learning_rate": 1.6500031627729178e-05, |
|
"loss": 0.4446, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.34904265676270146, |
|
"grad_norm": 0.4800983187244669, |
|
"learning_rate": 1.6461677552474698e-05, |
|
"loss": 0.4691, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.3504849817906465, |
|
"grad_norm": 0.388554374886088, |
|
"learning_rate": 1.642315958687543e-05, |
|
"loss": 0.4517, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.3519273068185916, |
|
"grad_norm": 0.4804591032499286, |
|
"learning_rate": 1.6384478707879337e-05, |
|
"loss": 0.4736, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.35336963184653664, |
|
"grad_norm": 0.4242345257393015, |
|
"learning_rate": 1.6345635896566415e-05, |
|
"loss": 0.4453, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.35481195687448164, |
|
"grad_norm": 0.5125929278365619, |
|
"learning_rate": 1.6306632138123814e-05, |
|
"loss": 0.4894, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.3562542819024267, |
|
"grad_norm": 0.4135575305051168, |
|
"learning_rate": 1.626746842182087e-05, |
|
"loss": 0.4516, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.35769660693037175, |
|
"grad_norm": 0.49733207897305337, |
|
"learning_rate": 1.6228145740983986e-05, |
|
"loss": 0.4676, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.3591389319583168, |
|
"grad_norm": 0.405324125927312, |
|
"learning_rate": 1.618866509297147e-05, |
|
"loss": 0.4539, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.36058125698626187, |
|
"grad_norm": 0.43290260214899146, |
|
"learning_rate": 1.61490274791482e-05, |
|
"loss": 0.43, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.3620235820142069, |
|
"grad_norm": 0.3648124960837181, |
|
"learning_rate": 1.6109233904860258e-05, |
|
"loss": 0.4516, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.3634659070421519, |
|
"grad_norm": 0.43358315460862995, |
|
"learning_rate": 1.606928537940942e-05, |
|
"loss": 0.4565, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.364908232070097, |
|
"grad_norm": 0.5070316730676355, |
|
"learning_rate": 1.602918291602755e-05, |
|
"loss": 0.4547, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.36635055709804204, |
|
"grad_norm": 0.4556281361017855, |
|
"learning_rate": 1.5988927531850913e-05, |
|
"loss": 0.4631, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.3677928821259871, |
|
"grad_norm": 0.4210598158384229, |
|
"learning_rate": 1.5948520247894363e-05, |
|
"loss": 0.4595, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.36923520715393215, |
|
"grad_norm": 0.4325982920205171, |
|
"learning_rate": 1.590796208902546e-05, |
|
"loss": 0.4698, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.3706775321818772, |
|
"grad_norm": 0.4263624320016057, |
|
"learning_rate": 1.5867254083938472e-05, |
|
"loss": 0.4371, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.3721198572098222, |
|
"grad_norm": 0.4792938379196713, |
|
"learning_rate": 1.582639726512828e-05, |
|
"loss": 0.4464, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.37356218223776727, |
|
"grad_norm": 0.43544663382731996, |
|
"learning_rate": 1.5785392668864186e-05, |
|
"loss": 0.4658, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.37500450726571233, |
|
"grad_norm": 0.38089232775082726, |
|
"learning_rate": 1.5744241335163642e-05, |
|
"loss": 0.4492, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.3764468322936574, |
|
"grad_norm": 0.3692067776356917, |
|
"learning_rate": 1.570294430776587e-05, |
|
"loss": 0.4402, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.37788915732160244, |
|
"grad_norm": 0.43939772643420716, |
|
"learning_rate": 1.5661502634105376e-05, |
|
"loss": 0.4413, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.37933148234954744, |
|
"grad_norm": 0.39362265905546057, |
|
"learning_rate": 1.5619917365285394e-05, |
|
"loss": 0.4314, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.3807738073774925, |
|
"grad_norm": 0.41565735116305985, |
|
"learning_rate": 1.557818955605123e-05, |
|
"loss": 0.4564, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.38221613240543756, |
|
"grad_norm": 0.3633587329212366, |
|
"learning_rate": 1.55363202647635e-05, |
|
"loss": 0.4568, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.3836584574333826, |
|
"grad_norm": 0.43886686943718484, |
|
"learning_rate": 1.5494310553371292e-05, |
|
"loss": 0.4408, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.3851007824613277, |
|
"grad_norm": 0.44313421551297705, |
|
"learning_rate": 1.545216148738523e-05, |
|
"loss": 0.4728, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.38654310748927273, |
|
"grad_norm": 0.43446763871019, |
|
"learning_rate": 1.5409874135850453e-05, |
|
"loss": 0.4413, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.38798543251721773, |
|
"grad_norm": 0.5046802087731463, |
|
"learning_rate": 1.5367449571319486e-05, |
|
"loss": 0.451, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.3894277575451628, |
|
"grad_norm": 0.4176799699807321, |
|
"learning_rate": 1.5324888869825062e-05, |
|
"loss": 0.4575, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.39087008257310785, |
|
"grad_norm": 0.4357723650429465, |
|
"learning_rate": 1.5282193110852806e-05, |
|
"loss": 0.4628, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.3923124076010529, |
|
"grad_norm": 0.47847755269517595, |
|
"learning_rate": 1.5239363377313864e-05, |
|
"loss": 0.4426, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.39375473262899796, |
|
"grad_norm": 0.42951183292967315, |
|
"learning_rate": 1.5196400755517445e-05, |
|
"loss": 0.4173, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.395197057656943, |
|
"grad_norm": 0.3712834304196652, |
|
"learning_rate": 1.5153306335143247e-05, |
|
"loss": 0.4185, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.396639382684888, |
|
"grad_norm": 0.40028893775485, |
|
"learning_rate": 1.5110081209213849e-05, |
|
"loss": 0.4404, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.3980817077128331, |
|
"grad_norm": 0.3524439650077371, |
|
"learning_rate": 1.5066726474066962e-05, |
|
"loss": 0.436, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.39952403274077813, |
|
"grad_norm": 0.41796871469443936, |
|
"learning_rate": 1.5023243229327631e-05, |
|
"loss": 0.4465, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.4009663577687232, |
|
"grad_norm": 0.39648024648913516, |
|
"learning_rate": 1.4979632577880355e-05, |
|
"loss": 0.4599, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.40240868279666825, |
|
"grad_norm": 0.4177593581987727, |
|
"learning_rate": 1.4935895625841095e-05, |
|
"loss": 0.4341, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.40385100782461325, |
|
"grad_norm": 0.39474357091689116, |
|
"learning_rate": 1.4892033482529233e-05, |
|
"loss": 0.4251, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.4052933328525583, |
|
"grad_norm": 0.3925865645135851, |
|
"learning_rate": 1.484804726043943e-05, |
|
"loss": 0.4188, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.40673565788050337, |
|
"grad_norm": 0.43881341912306815, |
|
"learning_rate": 1.480393807521342e-05, |
|
"loss": 0.4626, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.4081779829084484, |
|
"grad_norm": 0.38784235208087897, |
|
"learning_rate": 1.4759707045611694e-05, |
|
"loss": 0.4356, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.4096203079363935, |
|
"grad_norm": 0.4652349082201273, |
|
"learning_rate": 1.4715355293485134e-05, |
|
"loss": 0.4429, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.41106263296433854, |
|
"grad_norm": 0.5020179396910893, |
|
"learning_rate": 1.4670883943746575e-05, |
|
"loss": 0.4424, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.41250495799228354, |
|
"grad_norm": 0.46646941577755224, |
|
"learning_rate": 1.4626294124342237e-05, |
|
"loss": 0.4473, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.4139472830202286, |
|
"grad_norm": 0.3715580720003536, |
|
"learning_rate": 1.4581586966223156e-05, |
|
"loss": 0.457, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.41538960804817365, |
|
"grad_norm": 0.3913149158851186, |
|
"learning_rate": 1.453676360331647e-05, |
|
"loss": 0.4232, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.4168319330761187, |
|
"grad_norm": 0.3755928140913827, |
|
"learning_rate": 1.4491825172496675e-05, |
|
"loss": 0.4376, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.41827425810406377, |
|
"grad_norm": 0.4632236851893659, |
|
"learning_rate": 1.4446772813556784e-05, |
|
"loss": 0.4547, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.4197165831320088, |
|
"grad_norm": 0.3622221987812085, |
|
"learning_rate": 1.4401607669179415e-05, |
|
"loss": 0.4189, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.4211589081599538, |
|
"grad_norm": 0.4427510263617938, |
|
"learning_rate": 1.4356330884907823e-05, |
|
"loss": 0.4307, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.4226012331878989, |
|
"grad_norm": 0.40821656664051026, |
|
"learning_rate": 1.4310943609116815e-05, |
|
"loss": 0.4416, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.42404355821584394, |
|
"grad_norm": 0.45484460030870416, |
|
"learning_rate": 1.4265446992983661e-05, |
|
"loss": 0.449, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.425485883243789, |
|
"grad_norm": 0.38430976618751717, |
|
"learning_rate": 1.4219842190458865e-05, |
|
"loss": 0.4445, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.42692820827173406, |
|
"grad_norm": 0.40624625230940725, |
|
"learning_rate": 1.4174130358236924e-05, |
|
"loss": 0.4734, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.42837053329967906, |
|
"grad_norm": 0.38501281348072397, |
|
"learning_rate": 1.4128312655726957e-05, |
|
"loss": 0.4407, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.4298128583276241, |
|
"grad_norm": 0.5552503619067779, |
|
"learning_rate": 1.4082390245023337e-05, |
|
"loss": 0.4559, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.43125518335556917, |
|
"grad_norm": 0.41269951819834144, |
|
"learning_rate": 1.4036364290876176e-05, |
|
"loss": 0.4407, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.43269750838351423, |
|
"grad_norm": 0.4132538908060478, |
|
"learning_rate": 1.3990235960661824e-05, |
|
"loss": 0.4439, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.43269750838351423, |
|
"eval_loss": 0.43445292115211487, |
|
"eval_runtime": 142.5412, |
|
"eval_samples_per_second": 12.635, |
|
"eval_steps_per_second": 3.164, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.4341398334114593, |
|
"grad_norm": 0.42757706099004156, |
|
"learning_rate": 1.3944006424353229e-05, |
|
"loss": 0.4247, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.43558215843940434, |
|
"grad_norm": 0.36759037583277737, |
|
"learning_rate": 1.389767685449027e-05, |
|
"loss": 0.4306, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.43702448346734935, |
|
"grad_norm": 0.42042330760151675, |
|
"learning_rate": 1.3851248426150026e-05, |
|
"loss": 0.4244, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.4384668084952944, |
|
"grad_norm": 0.38414415773611094, |
|
"learning_rate": 1.380472231691697e-05, |
|
"loss": 0.4377, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.43990913352323946, |
|
"grad_norm": 0.4303765304251248, |
|
"learning_rate": 1.375809970685309e-05, |
|
"loss": 0.4574, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.4413514585511845, |
|
"grad_norm": 0.39045631524439356, |
|
"learning_rate": 1.3711381778467972e-05, |
|
"loss": 0.4487, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.4427937835791296, |
|
"grad_norm": 0.409923537347395, |
|
"learning_rate": 1.36645697166888e-05, |
|
"loss": 0.4155, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.44423610860707463, |
|
"grad_norm": 0.4590281734742793, |
|
"learning_rate": 1.3617664708830304e-05, |
|
"loss": 0.4211, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.44567843363501963, |
|
"grad_norm": 0.4340206380764746, |
|
"learning_rate": 1.3570667944564651e-05, |
|
"loss": 0.43, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.4471207586629647, |
|
"grad_norm": 0.3867702108735739, |
|
"learning_rate": 1.3523580615891258e-05, |
|
"loss": 0.4367, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.44856308369090975, |
|
"grad_norm": 0.45493644595260835, |
|
"learning_rate": 1.347640391710657e-05, |
|
"loss": 0.4336, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.4500054087188548, |
|
"grad_norm": 0.41557484865468924, |
|
"learning_rate": 1.3429139044773768e-05, |
|
"loss": 0.4128, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.45144773374679986, |
|
"grad_norm": 0.41564130897863455, |
|
"learning_rate": 1.3381787197692413e-05, |
|
"loss": 0.3957, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.45289005877474486, |
|
"grad_norm": 0.4011264197640641, |
|
"learning_rate": 1.3334349576868046e-05, |
|
"loss": 0.442, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.4543323838026899, |
|
"grad_norm": 0.4825855614290229, |
|
"learning_rate": 1.3286827385481726e-05, |
|
"loss": 0.4058, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.455774708830635, |
|
"grad_norm": 0.3921023793032671, |
|
"learning_rate": 1.3239221828859509e-05, |
|
"loss": 0.3884, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.45721703385858004, |
|
"grad_norm": 0.40627991293028837, |
|
"learning_rate": 1.3191534114441883e-05, |
|
"loss": 0.4333, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.4586593588865251, |
|
"grad_norm": 0.43891554498901797, |
|
"learning_rate": 1.3143765451753137e-05, |
|
"loss": 0.4166, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.46010168391447015, |
|
"grad_norm": 0.39830311047980305, |
|
"learning_rate": 1.3095917052370686e-05, |
|
"loss": 0.4235, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.46154400894241515, |
|
"grad_norm": 0.3980453207285396, |
|
"learning_rate": 1.3047990129894348e-05, |
|
"loss": 0.4001, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.4629863339703602, |
|
"grad_norm": 0.4136578166461488, |
|
"learning_rate": 1.299998589991555e-05, |
|
"loss": 0.4076, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.46442865899830527, |
|
"grad_norm": 0.4343208402620231, |
|
"learning_rate": 1.2951905579986506e-05, |
|
"loss": 0.4384, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.4658709840262503, |
|
"grad_norm": 0.45578762184210947, |
|
"learning_rate": 1.290375038958933e-05, |
|
"loss": 0.4048, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.4673133090541954, |
|
"grad_norm": 0.46943412662551365, |
|
"learning_rate": 1.285552155010511e-05, |
|
"loss": 0.401, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.46875563408214044, |
|
"grad_norm": 0.40848878753251544, |
|
"learning_rate": 1.2807220284782926e-05, |
|
"loss": 0.4461, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.47019795911008544, |
|
"grad_norm": 0.3921726292273481, |
|
"learning_rate": 1.2758847818708832e-05, |
|
"loss": 0.4205, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.4716402841380305, |
|
"grad_norm": 0.45781513572784016, |
|
"learning_rate": 1.2710405378774768e-05, |
|
"loss": 0.4423, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.47308260916597555, |
|
"grad_norm": 0.45862261759553535, |
|
"learning_rate": 1.2661894193647458e-05, |
|
"loss": 0.4, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.4745249341939206, |
|
"grad_norm": 0.3527899534786595, |
|
"learning_rate": 1.261331549373724e-05, |
|
"loss": 0.3998, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.47596725922186567, |
|
"grad_norm": 0.36297450328540837, |
|
"learning_rate": 1.2564670511166865e-05, |
|
"loss": 0.4206, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.47740958424981067, |
|
"grad_norm": 0.4030716124087903, |
|
"learning_rate": 1.2515960479740224e-05, |
|
"loss": 0.4047, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.4788519092777557, |
|
"grad_norm": 0.41175543047417906, |
|
"learning_rate": 1.246718663491108e-05, |
|
"loss": 0.4345, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.4802942343057008, |
|
"grad_norm": 0.3574092930784039, |
|
"learning_rate": 1.2418350213751728e-05, |
|
"loss": 0.4081, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.48173655933364584, |
|
"grad_norm": 0.3954039812545518, |
|
"learning_rate": 1.2369452454921604e-05, |
|
"loss": 0.4159, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.4831788843615909, |
|
"grad_norm": 0.4497181497561506, |
|
"learning_rate": 1.2320494598635886e-05, |
|
"loss": 0.4052, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.48462120938953596, |
|
"grad_norm": 0.44655082111096045, |
|
"learning_rate": 1.2271477886634023e-05, |
|
"loss": 0.4123, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.48606353441748096, |
|
"grad_norm": 0.40423139543908587, |
|
"learning_rate": 1.2222403562148252e-05, |
|
"loss": 0.4152, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.487505859445426, |
|
"grad_norm": 0.36806086858378434, |
|
"learning_rate": 1.2173272869872062e-05, |
|
"loss": 0.4252, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.4889481844733711, |
|
"grad_norm": 0.41722654899253564, |
|
"learning_rate": 1.2124087055928617e-05, |
|
"loss": 0.3879, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.49039050950131613, |
|
"grad_norm": 0.4329150355333478, |
|
"learning_rate": 1.207484736783916e-05, |
|
"loss": 0.3849, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.4918328345292612, |
|
"grad_norm": 0.4710085788902766, |
|
"learning_rate": 1.2025555054491367e-05, |
|
"loss": 0.4303, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.4932751595572062, |
|
"grad_norm": 0.443066548358196, |
|
"learning_rate": 1.1976211366107668e-05, |
|
"loss": 0.4198, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.49471748458515125, |
|
"grad_norm": 0.3338656609348242, |
|
"learning_rate": 1.1926817554213548e-05, |
|
"loss": 0.3911, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.4961598096130963, |
|
"grad_norm": 0.38270258610415053, |
|
"learning_rate": 1.1877374871605786e-05, |
|
"loss": 0.4068, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.49760213464104136, |
|
"grad_norm": 0.40504870451767916, |
|
"learning_rate": 1.18278845723207e-05, |
|
"loss": 0.4117, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.4990444596689864, |
|
"grad_norm": 0.4346348228563321, |
|
"learning_rate": 1.1778347911602329e-05, |
|
"loss": 0.4104, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.5004867846969314, |
|
"grad_norm": 0.4075021793881479, |
|
"learning_rate": 1.1728766145870587e-05, |
|
"loss": 0.4229, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.5019291097248765, |
|
"grad_norm": 0.418017099187981, |
|
"learning_rate": 1.167914053268942e-05, |
|
"loss": 0.407, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.5033714347528215, |
|
"grad_norm": 0.39895813955242926, |
|
"learning_rate": 1.1629472330734888e-05, |
|
"loss": 0.3978, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.5048137597807666, |
|
"grad_norm": 0.40383289208967305, |
|
"learning_rate": 1.1579762799763249e-05, |
|
"loss": 0.4175, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.5062560848087116, |
|
"grad_norm": 0.5225560587862472, |
|
"learning_rate": 1.1530013200579008e-05, |
|
"loss": 0.4131, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.5076984098366567, |
|
"grad_norm": 0.4004897787727647, |
|
"learning_rate": 1.1480224795002943e-05, |
|
"loss": 0.3888, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.5091407348646018, |
|
"grad_norm": 0.4248175503521806, |
|
"learning_rate": 1.1430398845840085e-05, |
|
"loss": 0.4324, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.5105830598925468, |
|
"grad_norm": 0.43829908182981264, |
|
"learning_rate": 1.1380536616847706e-05, |
|
"loss": 0.4079, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.5120253849204919, |
|
"grad_norm": 0.43570794658905476, |
|
"learning_rate": 1.1330639372703258e-05, |
|
"loss": 0.4045, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.5134677099484369, |
|
"grad_norm": 0.43500914045447153, |
|
"learning_rate": 1.12807083789723e-05, |
|
"loss": 0.419, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.5149100349763819, |
|
"grad_norm": 0.41351142363579385, |
|
"learning_rate": 1.123074490207639e-05, |
|
"loss": 0.3986, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.5163523600043269, |
|
"grad_norm": 0.37789765808010595, |
|
"learning_rate": 1.1180750209260972e-05, |
|
"loss": 0.4016, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.517794685032272, |
|
"grad_norm": 0.4013962679722207, |
|
"learning_rate": 1.1130725568563241e-05, |
|
"loss": 0.4081, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.519237010060217, |
|
"grad_norm": 0.38374761554210224, |
|
"learning_rate": 1.1080672248779964e-05, |
|
"loss": 0.4061, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5206793350881621, |
|
"grad_norm": 0.44182386119487255, |
|
"learning_rate": 1.1030591519435316e-05, |
|
"loss": 0.3916, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.5221216601161072, |
|
"grad_norm": 0.44971294735945117, |
|
"learning_rate": 1.0980484650748666e-05, |
|
"loss": 0.3996, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.5235639851440522, |
|
"grad_norm": 0.35276497806950113, |
|
"learning_rate": 1.0930352913602371e-05, |
|
"loss": 0.3732, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.5250063101719973, |
|
"grad_norm": 0.42340138266599786, |
|
"learning_rate": 1.0880197579509532e-05, |
|
"loss": 0.4222, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.5264486351999423, |
|
"grad_norm": 0.39078797688993877, |
|
"learning_rate": 1.0830019920581753e-05, |
|
"loss": 0.4136, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.5278909602278874, |
|
"grad_norm": 0.4130289272161752, |
|
"learning_rate": 1.0779821209496876e-05, |
|
"loss": 0.4192, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.5293332852558325, |
|
"grad_norm": 0.41541974485384586, |
|
"learning_rate": 1.0729602719466692e-05, |
|
"loss": 0.4031, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.5307756102837774, |
|
"grad_norm": 0.44049659174573497, |
|
"learning_rate": 1.067936572420466e-05, |
|
"loss": 0.4069, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.5322179353117225, |
|
"grad_norm": 0.44056632399340595, |
|
"learning_rate": 1.0629111497893591e-05, |
|
"loss": 0.3964, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.5336602603396675, |
|
"grad_norm": 0.40575645379756525, |
|
"learning_rate": 1.0578841315153333e-05, |
|
"loss": 0.3953, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.5351025853676126, |
|
"grad_norm": 0.37056517023195357, |
|
"learning_rate": 1.0528556451008447e-05, |
|
"loss": 0.4058, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.5365449103955576, |
|
"grad_norm": 0.38961078802000476, |
|
"learning_rate": 1.0478258180855869e-05, |
|
"loss": 0.3783, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.5379872354235027, |
|
"grad_norm": 0.4278326171242378, |
|
"learning_rate": 1.0427947780432547e-05, |
|
"loss": 0.4025, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.5394295604514477, |
|
"grad_norm": 0.4487192036382051, |
|
"learning_rate": 1.0377626525783101e-05, |
|
"loss": 0.3933, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.5408718854793928, |
|
"grad_norm": 0.5348996401888022, |
|
"learning_rate": 1.0327295693227454e-05, |
|
"loss": 0.447, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.5423142105073379, |
|
"grad_norm": 0.527197311781129, |
|
"learning_rate": 1.0276956559328455e-05, |
|
"loss": 0.3949, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.5437565355352829, |
|
"grad_norm": 0.41151058505508553, |
|
"learning_rate": 1.0226610400859498e-05, |
|
"loss": 0.4051, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.545198860563228, |
|
"grad_norm": 0.37166405264306773, |
|
"learning_rate": 1.0176258494772153e-05, |
|
"loss": 0.3991, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.5466411855911729, |
|
"grad_norm": 0.4167614980577364, |
|
"learning_rate": 1.0125902118163762e-05, |
|
"loss": 0.4086, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.548083510619118, |
|
"grad_norm": 0.4002106455641225, |
|
"learning_rate": 1.007554254824506e-05, |
|
"loss": 0.4006, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.549525835647063, |
|
"grad_norm": 0.38648887792017217, |
|
"learning_rate": 1.0025181062307774e-05, |
|
"loss": 0.4009, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.5509681606750081, |
|
"grad_norm": 0.4402653770907521, |
|
"learning_rate": 9.974818937692228e-06, |
|
"loss": 0.3909, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.5524104857029531, |
|
"grad_norm": 0.39402192655503426, |
|
"learning_rate": 9.92445745175494e-06, |
|
"loss": 0.3793, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.5538528107308982, |
|
"grad_norm": 0.36447042674734037, |
|
"learning_rate": 9.874097881836241e-06, |
|
"loss": 0.3856, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.5552951357588433, |
|
"grad_norm": 0.38084863196798785, |
|
"learning_rate": 9.823741505227852e-06, |
|
"loss": 0.3821, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.5567374607867883, |
|
"grad_norm": 0.3689396281200298, |
|
"learning_rate": 9.773389599140504e-06, |
|
"loss": 0.3888, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.5581797858147334, |
|
"grad_norm": 0.42447241183482853, |
|
"learning_rate": 9.72304344067155e-06, |
|
"loss": 0.4018, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.5596221108426784, |
|
"grad_norm": 0.34840166562757835, |
|
"learning_rate": 9.672704306772547e-06, |
|
"loss": 0.381, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.5610644358706235, |
|
"grad_norm": 0.3824007554962182, |
|
"learning_rate": 9.6223734742169e-06, |
|
"loss": 0.405, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.5625067608985685, |
|
"grad_norm": 0.40567921647837246, |
|
"learning_rate": 9.572052219567455e-06, |
|
"loss": 0.3886, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.5639490859265135, |
|
"grad_norm": 0.4496361442646002, |
|
"learning_rate": 9.521741819144135e-06, |
|
"loss": 0.3926, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.5653914109544586, |
|
"grad_norm": 0.3771274201963948, |
|
"learning_rate": 9.471443548991557e-06, |
|
"loss": 0.4009, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.5668337359824036, |
|
"grad_norm": 0.3832741322922619, |
|
"learning_rate": 9.421158684846669e-06, |
|
"loss": 0.3926, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.5682760610103487, |
|
"grad_norm": 0.41676932794244004, |
|
"learning_rate": 9.370888502106414e-06, |
|
"loss": 0.4194, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.5697183860382937, |
|
"grad_norm": 0.4465176481054024, |
|
"learning_rate": 9.320634275795342e-06, |
|
"loss": 0.3885, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.5711607110662388, |
|
"grad_norm": 0.41454265589275485, |
|
"learning_rate": 9.270397280533311e-06, |
|
"loss": 0.4041, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.5726030360941838, |
|
"grad_norm": 0.37529076026198815, |
|
"learning_rate": 9.220178790503125e-06, |
|
"loss": 0.3784, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.5740453611221289, |
|
"grad_norm": 0.4006407856625201, |
|
"learning_rate": 9.169980079418248e-06, |
|
"loss": 0.3742, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.575487686150074, |
|
"grad_norm": 0.4075785016746068, |
|
"learning_rate": 9.119802420490473e-06, |
|
"loss": 0.4184, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.576930011178019, |
|
"grad_norm": 0.3892341916180056, |
|
"learning_rate": 9.06964708639763e-06, |
|
"loss": 0.3865, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.576930011178019, |
|
"eval_loss": 0.3948507606983185, |
|
"eval_runtime": 142.1685, |
|
"eval_samples_per_second": 12.668, |
|
"eval_steps_per_second": 3.172, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5783723362059641, |
|
"grad_norm": 0.4476758638692534, |
|
"learning_rate": 9.019515349251337e-06, |
|
"loss": 0.4076, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.579814661233909, |
|
"grad_norm": 0.38084358148704506, |
|
"learning_rate": 8.969408480564684e-06, |
|
"loss": 0.3951, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.5812569862618541, |
|
"grad_norm": 0.3946160859854508, |
|
"learning_rate": 8.919327751220038e-06, |
|
"loss": 0.3737, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.5826993112897991, |
|
"grad_norm": 0.4376591903476801, |
|
"learning_rate": 8.86927443143676e-06, |
|
"loss": 0.3993, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.5841416363177442, |
|
"grad_norm": 0.4220093736158996, |
|
"learning_rate": 8.819249790739033e-06, |
|
"loss": 0.3896, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.5855839613456892, |
|
"grad_norm": 0.37781362600911217, |
|
"learning_rate": 8.769255097923617e-06, |
|
"loss": 0.358, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.5870262863736343, |
|
"grad_norm": 0.37752543573320735, |
|
"learning_rate": 8.719291621027703e-06, |
|
"loss": 0.4016, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.5884686114015794, |
|
"grad_norm": 0.4195162100656966, |
|
"learning_rate": 8.669360627296745e-06, |
|
"loss": 0.3755, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.5899109364295244, |
|
"grad_norm": 0.40866907101120203, |
|
"learning_rate": 8.619463383152296e-06, |
|
"loss": 0.3964, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.5913532614574695, |
|
"grad_norm": 0.4194072279329464, |
|
"learning_rate": 8.56960115415992e-06, |
|
"loss": 0.3853, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.5927955864854145, |
|
"grad_norm": 0.503872591140977, |
|
"learning_rate": 8.519775204997063e-06, |
|
"loss": 0.4161, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.5942379115133596, |
|
"grad_norm": 0.4656959686074043, |
|
"learning_rate": 8.469986799420993e-06, |
|
"loss": 0.4207, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.5956802365413045, |
|
"grad_norm": 0.4068362162842934, |
|
"learning_rate": 8.420237200236753e-06, |
|
"loss": 0.3717, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.5971225615692496, |
|
"grad_norm": 0.4469993385978865, |
|
"learning_rate": 8.370527669265114e-06, |
|
"loss": 0.4039, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.5985648865971946, |
|
"grad_norm": 0.43643202324029334, |
|
"learning_rate": 8.320859467310582e-06, |
|
"loss": 0.3749, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.6000072116251397, |
|
"grad_norm": 0.5297689595825736, |
|
"learning_rate": 8.271233854129413e-06, |
|
"loss": 0.376, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.6014495366530848, |
|
"grad_norm": 0.489056954944045, |
|
"learning_rate": 8.221652088397675e-06, |
|
"loss": 0.3933, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.6028918616810298, |
|
"grad_norm": 0.37378771704976776, |
|
"learning_rate": 8.172115427679304e-06, |
|
"loss": 0.3945, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.6043341867089749, |
|
"grad_norm": 0.4235226777306445, |
|
"learning_rate": 8.122625128394216e-06, |
|
"loss": 0.3826, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.6057765117369199, |
|
"grad_norm": 0.4021066843708137, |
|
"learning_rate": 8.073182445786455e-06, |
|
"loss": 0.3642, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.607218836764865, |
|
"grad_norm": 0.3735730097404964, |
|
"learning_rate": 8.023788633892334e-06, |
|
"loss": 0.3725, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.60866116179281, |
|
"grad_norm": 0.42115686535849983, |
|
"learning_rate": 7.974444945508637e-06, |
|
"loss": 0.3876, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.6101034868207551, |
|
"grad_norm": 0.42268328106794184, |
|
"learning_rate": 7.925152632160841e-06, |
|
"loss": 0.4042, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.6115458118487, |
|
"grad_norm": 0.4303350707681742, |
|
"learning_rate": 7.875912944071386e-06, |
|
"loss": 0.3718, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.6129881368766451, |
|
"grad_norm": 0.41179372110756424, |
|
"learning_rate": 7.826727130127942e-06, |
|
"loss": 0.3844, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.6144304619045902, |
|
"grad_norm": 0.3763060638976918, |
|
"learning_rate": 7.77759643785175e-06, |
|
"loss": 0.378, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.6158727869325352, |
|
"grad_norm": 0.40647467863126857, |
|
"learning_rate": 7.72852211336598e-06, |
|
"loss": 0.3633, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.6173151119604803, |
|
"grad_norm": 0.4427513530880047, |
|
"learning_rate": 7.679505401364116e-06, |
|
"loss": 0.3728, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.6187574369884253, |
|
"grad_norm": 0.40218277177425543, |
|
"learning_rate": 7.630547545078398e-06, |
|
"loss": 0.3936, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.6201997620163704, |
|
"grad_norm": 0.40266373448906506, |
|
"learning_rate": 7.581649786248276e-06, |
|
"loss": 0.3956, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.6216420870443155, |
|
"grad_norm": 0.4101360200980578, |
|
"learning_rate": 7.532813365088921e-06, |
|
"loss": 0.3935, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.6230844120722605, |
|
"grad_norm": 0.4360450388421823, |
|
"learning_rate": 7.484039520259781e-06, |
|
"loss": 0.393, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.6245267371002056, |
|
"grad_norm": 0.3984091507351705, |
|
"learning_rate": 7.435329488833137e-06, |
|
"loss": 0.3857, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 0.6259690621281506, |
|
"grad_norm": 0.4057039326760462, |
|
"learning_rate": 7.38668450626276e-06, |
|
"loss": 0.4013, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.6274113871560957, |
|
"grad_norm": 0.39301356289008293, |
|
"learning_rate": 7.338105806352542e-06, |
|
"loss": 0.3613, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.6288537121840406, |
|
"grad_norm": 0.4031222004525292, |
|
"learning_rate": 7.289594621225236e-06, |
|
"loss": 0.3775, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.6302960372119857, |
|
"grad_norm": 0.42389618462152223, |
|
"learning_rate": 7.241152181291173e-06, |
|
"loss": 0.3842, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.6317383622399307, |
|
"grad_norm": 0.4222447939654566, |
|
"learning_rate": 7.192779715217075e-06, |
|
"loss": 0.3747, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.6331806872678758, |
|
"grad_norm": 0.3616433078805121, |
|
"learning_rate": 7.144478449894894e-06, |
|
"loss": 0.3619, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 0.6346230122958209, |
|
"grad_norm": 0.40315108612725287, |
|
"learning_rate": 7.096249610410671e-06, |
|
"loss": 0.383, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.6360653373237659, |
|
"grad_norm": 0.39550949033278987, |
|
"learning_rate": 7.0480944200134975e-06, |
|
"loss": 0.3993, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.637507662351711, |
|
"grad_norm": 0.4061605042450912, |
|
"learning_rate": 7.00001410008445e-06, |
|
"loss": 0.3667, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.638949987379656, |
|
"grad_norm": 0.399669288075527, |
|
"learning_rate": 6.952009870105654e-06, |
|
"loss": 0.387, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 0.6403923124076011, |
|
"grad_norm": 0.4188823149502449, |
|
"learning_rate": 6.904082947629317e-06, |
|
"loss": 0.3814, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.6418346374355461, |
|
"grad_norm": 0.3729926900968089, |
|
"learning_rate": 6.856234548246866e-06, |
|
"loss": 0.3647, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.6432769624634912, |
|
"grad_norm": 0.3995200969127714, |
|
"learning_rate": 6.808465885558122e-06, |
|
"loss": 0.3778, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.6447192874914361, |
|
"grad_norm": 0.4182365028017815, |
|
"learning_rate": 6.760778171140492e-06, |
|
"loss": 0.4071, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.6461616125193812, |
|
"grad_norm": 0.419641094415173, |
|
"learning_rate": 6.713172614518278e-06, |
|
"loss": 0.3838, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.6476039375473263, |
|
"grad_norm": 0.455639932664125, |
|
"learning_rate": 6.665650423131953e-06, |
|
"loss": 0.3864, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 0.6490462625752713, |
|
"grad_norm": 0.42278667120966895, |
|
"learning_rate": 6.618212802307589e-06, |
|
"loss": 0.396, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.6504885876032164, |
|
"grad_norm": 0.44585454789944867, |
|
"learning_rate": 6.570860955226234e-06, |
|
"loss": 0.3811, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 0.6519309126311614, |
|
"grad_norm": 0.3966025625438823, |
|
"learning_rate": 6.5235960828934305e-06, |
|
"loss": 0.3732, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.6533732376591065, |
|
"grad_norm": 0.40489868259557904, |
|
"learning_rate": 6.476419384108745e-06, |
|
"loss": 0.3567, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.6548155626870515, |
|
"grad_norm": 0.39366736678335024, |
|
"learning_rate": 6.429332055435349e-06, |
|
"loss": 0.3623, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.6562578877149966, |
|
"grad_norm": 0.42529750592620424, |
|
"learning_rate": 6.382335291169698e-06, |
|
"loss": 0.3676, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.6577002127429417, |
|
"grad_norm": 0.44036040562921713, |
|
"learning_rate": 6.335430283311206e-06, |
|
"loss": 0.3889, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.6591425377708867, |
|
"grad_norm": 0.3787593063841428, |
|
"learning_rate": 6.288618221532031e-06, |
|
"loss": 0.386, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 0.6605848627988317, |
|
"grad_norm": 0.4169592811397764, |
|
"learning_rate": 6.241900293146915e-06, |
|
"loss": 0.3752, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.6620271878267767, |
|
"grad_norm": 0.4047539500558757, |
|
"learning_rate": 6.195277683083033e-06, |
|
"loss": 0.3658, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.6634695128547218, |
|
"grad_norm": 0.3845249122797127, |
|
"learning_rate": 6.148751573849976e-06, |
|
"loss": 0.3563, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.6649118378826668, |
|
"grad_norm": 0.4633041975142693, |
|
"learning_rate": 6.102323145509732e-06, |
|
"loss": 0.3852, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 0.6663541629106119, |
|
"grad_norm": 0.3985148240515743, |
|
"learning_rate": 6.055993575646775e-06, |
|
"loss": 0.3915, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.667796487938557, |
|
"grad_norm": 0.40716397694215495, |
|
"learning_rate": 6.00976403933818e-06, |
|
"loss": 0.3605, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 0.669238812966502, |
|
"grad_norm": 0.38795576025941675, |
|
"learning_rate": 5.963635709123825e-06, |
|
"loss": 0.37, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.6706811379944471, |
|
"grad_norm": 0.4110632294347015, |
|
"learning_rate": 5.91760975497667e-06, |
|
"loss": 0.3853, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.6721234630223921, |
|
"grad_norm": 0.3969166036791085, |
|
"learning_rate": 5.871687344273045e-06, |
|
"loss": 0.3672, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.6735657880503372, |
|
"grad_norm": 0.41207993758304634, |
|
"learning_rate": 5.8258696417630825e-06, |
|
"loss": 0.3547, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 0.6750081130782822, |
|
"grad_norm": 0.3680867654775724, |
|
"learning_rate": 5.780157809541134e-06, |
|
"loss": 0.3625, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.6764504381062273, |
|
"grad_norm": 0.4267438085961488, |
|
"learning_rate": 5.734553007016345e-06, |
|
"loss": 0.3999, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.6778927631341722, |
|
"grad_norm": 0.3986326036374569, |
|
"learning_rate": 5.68905639088319e-06, |
|
"loss": 0.3303, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.6793350881621173, |
|
"grad_norm": 0.42614206231420926, |
|
"learning_rate": 5.643669115092183e-06, |
|
"loss": 0.3589, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.6807774131900624, |
|
"grad_norm": 0.3776847045804154, |
|
"learning_rate": 5.598392330820586e-06, |
|
"loss": 0.3609, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.6822197382180074, |
|
"grad_norm": 0.41271036973705766, |
|
"learning_rate": 5.553227186443215e-06, |
|
"loss": 0.3615, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 0.6836620632459525, |
|
"grad_norm": 0.38781546784387094, |
|
"learning_rate": 5.508174827503328e-06, |
|
"loss": 0.3433, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.6851043882738975, |
|
"grad_norm": 0.39550012764434234, |
|
"learning_rate": 5.46323639668353e-06, |
|
"loss": 0.3691, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.6865467133018426, |
|
"grad_norm": 0.4203725670836375, |
|
"learning_rate": 5.4184130337768485e-06, |
|
"loss": 0.3882, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.6879890383297876, |
|
"grad_norm": 0.41719368579398214, |
|
"learning_rate": 5.373705875657766e-06, |
|
"loss": 0.3678, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 0.6894313633577327, |
|
"grad_norm": 0.408418654280754, |
|
"learning_rate": 5.329116056253429e-06, |
|
"loss": 0.3788, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.6908736883856778, |
|
"grad_norm": 0.4432414502444195, |
|
"learning_rate": 5.284644706514868e-06, |
|
"loss": 0.3733, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 0.6923160134136228, |
|
"grad_norm": 0.43523682450545426, |
|
"learning_rate": 5.240292954388306e-06, |
|
"loss": 0.3716, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.6937583384415678, |
|
"grad_norm": 0.4389694994462393, |
|
"learning_rate": 5.1960619247865815e-06, |
|
"loss": 0.3655, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 0.6952006634695128, |
|
"grad_norm": 0.3932614135155125, |
|
"learning_rate": 5.15195273956057e-06, |
|
"loss": 0.3971, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.6966429884974579, |
|
"grad_norm": 0.38979362609767165, |
|
"learning_rate": 5.107966517470771e-06, |
|
"loss": 0.3724, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 0.6980853135254029, |
|
"grad_norm": 0.4209080852390916, |
|
"learning_rate": 5.064104374158909e-06, |
|
"loss": 0.3911, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.699527638553348, |
|
"grad_norm": 0.45055904805315533, |
|
"learning_rate": 5.0203674221196485e-06, |
|
"loss": 0.3633, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.700969963581293, |
|
"grad_norm": 0.3868393099197903, |
|
"learning_rate": 4.9767567706723706e-06, |
|
"loss": 0.3515, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.7024122886092381, |
|
"grad_norm": 0.41826804531316264, |
|
"learning_rate": 4.933273525933041e-06, |
|
"loss": 0.3519, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 0.7038546136371832, |
|
"grad_norm": 0.45957339946847975, |
|
"learning_rate": 4.889918790786153e-06, |
|
"loss": 0.3807, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.7052969386651282, |
|
"grad_norm": 0.4540538141436769, |
|
"learning_rate": 4.846693664856754e-06, |
|
"loss": 0.3465, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 0.7067392636930733, |
|
"grad_norm": 0.47813500195150954, |
|
"learning_rate": 4.803599244482558e-06, |
|
"loss": 0.376, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.7081815887210183, |
|
"grad_norm": 0.3925519413949624, |
|
"learning_rate": 4.760636622686136e-06, |
|
"loss": 0.3404, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 0.7096239137489633, |
|
"grad_norm": 0.4289528139780234, |
|
"learning_rate": 4.717806889147196e-06, |
|
"loss": 0.3627, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.7110662387769083, |
|
"grad_norm": 0.41215198190870284, |
|
"learning_rate": 4.675111130174939e-06, |
|
"loss": 0.3716, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 0.7125085638048534, |
|
"grad_norm": 0.4403007485651443, |
|
"learning_rate": 4.632550428680515e-06, |
|
"loss": 0.3765, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.7139508888327984, |
|
"grad_norm": 0.4311724864201015, |
|
"learning_rate": 4.590125864149551e-06, |
|
"loss": 0.3743, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.7153932138607435, |
|
"grad_norm": 0.46098384046435353, |
|
"learning_rate": 4.547838512614773e-06, |
|
"loss": 0.3505, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.7168355388886886, |
|
"grad_norm": 0.40338840945222365, |
|
"learning_rate": 4.505689446628712e-06, |
|
"loss": 0.3691, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 0.7182778639166336, |
|
"grad_norm": 0.40824551867501546, |
|
"learning_rate": 4.4636797352365035e-06, |
|
"loss": 0.3585, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.7197201889445787, |
|
"grad_norm": 0.4297027171998161, |
|
"learning_rate": 4.421810443948774e-06, |
|
"loss": 0.3705, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 0.7211625139725237, |
|
"grad_norm": 0.40341531049143703, |
|
"learning_rate": 4.38008263471461e-06, |
|
"loss": 0.3815, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.7211625139725237, |
|
"eval_loss": 0.37222930788993835, |
|
"eval_runtime": 142.2441, |
|
"eval_samples_per_second": 12.661, |
|
"eval_steps_per_second": 3.171, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.7226048390004688, |
|
"grad_norm": 0.4407059294927956, |
|
"learning_rate": 4.338497365894628e-06, |
|
"loss": 0.3661, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 0.7240471640284138, |
|
"grad_norm": 0.43213340820969415, |
|
"learning_rate": 4.297055692234133e-06, |
|
"loss": 0.3548, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.7254894890563589, |
|
"grad_norm": 0.40790860794488015, |
|
"learning_rate": 4.25575866483636e-06, |
|
"loss": 0.3693, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 0.7269318140843039, |
|
"grad_norm": 0.39452605394978774, |
|
"learning_rate": 4.214607331135817e-06, |
|
"loss": 0.3629, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.7283741391122489, |
|
"grad_norm": 0.4535519104968178, |
|
"learning_rate": 4.173602734871723e-06, |
|
"loss": 0.3631, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.729816464140194, |
|
"grad_norm": 0.4215165521407461, |
|
"learning_rate": 4.132745916061528e-06, |
|
"loss": 0.3623, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.731258789168139, |
|
"grad_norm": 0.4369337778893739, |
|
"learning_rate": 4.09203791097454e-06, |
|
"loss": 0.3799, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 0.7327011141960841, |
|
"grad_norm": 0.4218365082776104, |
|
"learning_rate": 4.051479752105642e-06, |
|
"loss": 0.3281, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.7341434392240291, |
|
"grad_norm": 0.39141469492573994, |
|
"learning_rate": 4.01107246814909e-06, |
|
"loss": 0.3779, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 0.7355857642519742, |
|
"grad_norm": 0.4361183098017262, |
|
"learning_rate": 3.970817083972451e-06, |
|
"loss": 0.3677, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.7370280892799193, |
|
"grad_norm": 0.4212489079522315, |
|
"learning_rate": 3.930714620590582e-06, |
|
"loss": 0.3697, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 0.7384704143078643, |
|
"grad_norm": 0.42629366346781794, |
|
"learning_rate": 3.890766095139744e-06, |
|
"loss": 0.336, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.7399127393358094, |
|
"grad_norm": 0.39167597840940843, |
|
"learning_rate": 3.850972520851804e-06, |
|
"loss": 0.3297, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 0.7413550643637544, |
|
"grad_norm": 0.4233310284348778, |
|
"learning_rate": 3.8113349070285344e-06, |
|
"loss": 0.3613, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.7427973893916994, |
|
"grad_norm": 0.4263022461531563, |
|
"learning_rate": 3.771854259016019e-06, |
|
"loss": 0.3529, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.7442397144196444, |
|
"grad_norm": 0.3973240159937157, |
|
"learning_rate": 3.7325315781791337e-06, |
|
"loss": 0.3661, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.7456820394475895, |
|
"grad_norm": 0.39734045764738396, |
|
"learning_rate": 3.693367861876188e-06, |
|
"loss": 0.3815, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 0.7471243644755345, |
|
"grad_norm": 0.4473118684590064, |
|
"learning_rate": 3.6543641034335873e-06, |
|
"loss": 0.3488, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.7485666895034796, |
|
"grad_norm": 0.4071557714101167, |
|
"learning_rate": 3.615521292120663e-06, |
|
"loss": 0.36, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 0.7500090145314247, |
|
"grad_norm": 0.4149969887621353, |
|
"learning_rate": 3.5768404131245695e-06, |
|
"loss": 0.3619, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.7514513395593697, |
|
"grad_norm": 0.41064754239264667, |
|
"learning_rate": 3.5383224475253043e-06, |
|
"loss": 0.3623, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 0.7528936645873148, |
|
"grad_norm": 0.48731666991216727, |
|
"learning_rate": 3.4999683722708265e-06, |
|
"loss": 0.3824, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.7543359896152598, |
|
"grad_norm": 0.42149841198530297, |
|
"learning_rate": 3.4617791601522565e-06, |
|
"loss": 0.3658, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 0.7557783146432049, |
|
"grad_norm": 0.3936949177789515, |
|
"learning_rate": 3.423755779779243e-06, |
|
"loss": 0.3308, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.7572206396711499, |
|
"grad_norm": 0.43489944362821054, |
|
"learning_rate": 3.3858991955553455e-06, |
|
"loss": 0.3815, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.7586629646990949, |
|
"grad_norm": 0.3921717289554429, |
|
"learning_rate": 3.348210367653625e-06, |
|
"loss": 0.3531, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.76010528972704, |
|
"grad_norm": 0.44238912615157533, |
|
"learning_rate": 3.3106902519922523e-06, |
|
"loss": 0.3696, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 0.761547614754985, |
|
"grad_norm": 0.4536027992384981, |
|
"learning_rate": 3.27333980021027e-06, |
|
"loss": 0.37, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.7629899397829301, |
|
"grad_norm": 0.4564191707678332, |
|
"learning_rate": 3.236159959643482e-06, |
|
"loss": 0.3819, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 0.7644322648108751, |
|
"grad_norm": 0.5326593840798252, |
|
"learning_rate": 3.1991516733003813e-06, |
|
"loss": 0.3758, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.7658745898388202, |
|
"grad_norm": 0.43321441818668444, |
|
"learning_rate": 3.1623158798382813e-06, |
|
"loss": 0.3783, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 0.7673169148667652, |
|
"grad_norm": 0.4454237213343821, |
|
"learning_rate": 3.125653513539456e-06, |
|
"loss": 0.3607, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.7687592398947103, |
|
"grad_norm": 0.4107211963202732, |
|
"learning_rate": 3.089165504287499e-06, |
|
"loss": 0.3482, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 0.7702015649226553, |
|
"grad_norm": 0.3789782102911423, |
|
"learning_rate": 3.052852777543687e-06, |
|
"loss": 0.3543, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.7716438899506004, |
|
"grad_norm": 0.4079189291227377, |
|
"learning_rate": 3.0167162543235384e-06, |
|
"loss": 0.3276, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.7730862149785455, |
|
"grad_norm": 0.4472943997084153, |
|
"learning_rate": 2.9807568511734564e-06, |
|
"loss": 0.3825, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.7745285400064905, |
|
"grad_norm": 0.430008379042804, |
|
"learning_rate": 2.944975480147445e-06, |
|
"loss": 0.3595, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 0.7759708650344355, |
|
"grad_norm": 0.4401700574196651, |
|
"learning_rate": 2.909373048784032e-06, |
|
"loss": 0.3779, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.7774131900623805, |
|
"grad_norm": 0.4208383654033427, |
|
"learning_rate": 2.873950460083191e-06, |
|
"loss": 0.3749, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 0.7788555150903256, |
|
"grad_norm": 0.4174074736046765, |
|
"learning_rate": 2.8387086124834952e-06, |
|
"loss": 0.374, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.7802978401182706, |
|
"grad_norm": 0.42868575004589055, |
|
"learning_rate": 2.8036483998392784e-06, |
|
"loss": 0.3564, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 0.7817401651462157, |
|
"grad_norm": 0.3985935455753018, |
|
"learning_rate": 2.768770711398001e-06, |
|
"loss": 0.3667, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.7831824901741608, |
|
"grad_norm": 0.40569605016983845, |
|
"learning_rate": 2.734076431777688e-06, |
|
"loss": 0.3506, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 0.7846248152021058, |
|
"grad_norm": 0.39328145893392497, |
|
"learning_rate": 2.6995664409444665e-06, |
|
"loss": 0.3464, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.7860671402300509, |
|
"grad_norm": 0.4528233880552543, |
|
"learning_rate": 2.6652416141902913e-06, |
|
"loss": 0.3605, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 0.7875094652579959, |
|
"grad_norm": 0.4480705994704807, |
|
"learning_rate": 2.631102822110695e-06, |
|
"loss": 0.3726, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.788951790285941, |
|
"grad_norm": 0.4574022134374259, |
|
"learning_rate": 2.597150930582757e-06, |
|
"loss": 0.359, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 0.790394115313886, |
|
"grad_norm": 0.4078128321456425, |
|
"learning_rate": 2.563386800743094e-06, |
|
"loss": 0.3413, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.791836440341831, |
|
"grad_norm": 0.44464864656256, |
|
"learning_rate": 2.5298112889660544e-06, |
|
"loss": 0.3587, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 0.793278765369776, |
|
"grad_norm": 0.3890963843751233, |
|
"learning_rate": 2.4964252468419802e-06, |
|
"loss": 0.344, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.7947210903977211, |
|
"grad_norm": 0.42348428672207705, |
|
"learning_rate": 2.463229521155611e-06, |
|
"loss": 0.3835, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 0.7961634154256662, |
|
"grad_norm": 0.4244981524719468, |
|
"learning_rate": 2.430224953864617e-06, |
|
"loss": 0.3908, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.7976057404536112, |
|
"grad_norm": 0.4461589097043871, |
|
"learning_rate": 2.397412382078219e-06, |
|
"loss": 0.3493, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 0.7990480654815563, |
|
"grad_norm": 0.4226119316706504, |
|
"learning_rate": 2.364792638035982e-06, |
|
"loss": 0.3549, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.8004903905095013, |
|
"grad_norm": 0.43426124883547124, |
|
"learning_rate": 2.3323665490866964e-06, |
|
"loss": 0.3578, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.8019327155374464, |
|
"grad_norm": 0.42274869171496543, |
|
"learning_rate": 2.300134937667391e-06, |
|
"loss": 0.3805, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.8033750405653914, |
|
"grad_norm": 0.4841781161829471, |
|
"learning_rate": 2.2680986212824786e-06, |
|
"loss": 0.3499, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 0.8048173655933365, |
|
"grad_norm": 0.428134320224768, |
|
"learning_rate": 2.2362584124830167e-06, |
|
"loss": 0.3684, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.8062596906212816, |
|
"grad_norm": 0.4117804314200649, |
|
"learning_rate": 2.204615118846107e-06, |
|
"loss": 0.3869, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 0.8077020156492265, |
|
"grad_norm": 0.41413616917927765, |
|
"learning_rate": 2.1731695429543974e-06, |
|
"loss": 0.338, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.8091443406771716, |
|
"grad_norm": 0.4360068588380961, |
|
"learning_rate": 2.141922482375737e-06, |
|
"loss": 0.3665, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 0.8105866657051166, |
|
"grad_norm": 0.4334830193418244, |
|
"learning_rate": 2.1108747296429477e-06, |
|
"loss": 0.3721, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.8120289907330617, |
|
"grad_norm": 0.507519342034383, |
|
"learning_rate": 2.080027072233718e-06, |
|
"loss": 0.3646, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 0.8134713157610067, |
|
"grad_norm": 0.42834185576130923, |
|
"learning_rate": 2.049380292550629e-06, |
|
"loss": 0.3633, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.8149136407889518, |
|
"grad_norm": 0.453195030964312, |
|
"learning_rate": 2.018935167901316e-06, |
|
"loss": 0.3539, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 0.8163559658168968, |
|
"grad_norm": 0.4103347116873249, |
|
"learning_rate": 1.9886924704787482e-06, |
|
"loss": 0.3457, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.8177982908448419, |
|
"grad_norm": 0.4081898260751316, |
|
"learning_rate": 1.9586529673416433e-06, |
|
"loss": 0.347, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 0.819240615872787, |
|
"grad_norm": 0.40268175350554464, |
|
"learning_rate": 1.928817420395018e-06, |
|
"loss": 0.3772, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.820682940900732, |
|
"grad_norm": 0.43775696767862726, |
|
"learning_rate": 1.8991865863708547e-06, |
|
"loss": 0.3718, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 0.8221252659286771, |
|
"grad_norm": 0.43895036356232614, |
|
"learning_rate": 1.8697612168089152e-06, |
|
"loss": 0.3648, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.823567590956622, |
|
"grad_norm": 0.40821144604675824, |
|
"learning_rate": 1.8405420580376755e-06, |
|
"loss": 0.3422, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 0.8250099159845671, |
|
"grad_norm": 0.4577535204704979, |
|
"learning_rate": 1.811529851155398e-06, |
|
"loss": 0.3511, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.8264522410125121, |
|
"grad_norm": 0.40698416625428246, |
|
"learning_rate": 1.7827253320113347e-06, |
|
"loss": 0.3521, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 0.8278945660404572, |
|
"grad_norm": 0.48745985212369625, |
|
"learning_rate": 1.7541292311870616e-06, |
|
"loss": 0.3727, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.8293368910684022, |
|
"grad_norm": 0.4152788200688241, |
|
"learning_rate": 1.7257422739779495e-06, |
|
"loss": 0.3406, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 0.8307792160963473, |
|
"grad_norm": 0.42357457834820555, |
|
"learning_rate": 1.6975651803747716e-06, |
|
"loss": 0.3614, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.8322215411242924, |
|
"grad_norm": 0.4290601435620992, |
|
"learning_rate": 1.6695986650454355e-06, |
|
"loss": 0.349, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 0.8336638661522374, |
|
"grad_norm": 0.40830671063358515, |
|
"learning_rate": 1.6418434373168623e-06, |
|
"loss": 0.3592, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.8351061911801825, |
|
"grad_norm": 0.4097799963554095, |
|
"learning_rate": 1.614300201156994e-06, |
|
"loss": 0.3359, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 0.8365485162081275, |
|
"grad_norm": 0.43204146744095845, |
|
"learning_rate": 1.5869696551569346e-06, |
|
"loss": 0.3596, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.8379908412360726, |
|
"grad_norm": 0.46076233886580875, |
|
"learning_rate": 1.5598524925132396e-06, |
|
"loss": 0.3609, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 0.8394331662640176, |
|
"grad_norm": 0.4286297255981423, |
|
"learning_rate": 1.5329494010103263e-06, |
|
"loss": 0.3607, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.8408754912919626, |
|
"grad_norm": 0.3956440167259478, |
|
"learning_rate": 1.5062610630030317e-06, |
|
"loss": 0.316, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 0.8423178163199077, |
|
"grad_norm": 0.41432843943606673, |
|
"learning_rate": 1.4797881553993099e-06, |
|
"loss": 0.3589, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.8437601413478527, |
|
"grad_norm": 0.397270661772685, |
|
"learning_rate": 1.4535313496430558e-06, |
|
"loss": 0.3519, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 0.8452024663757978, |
|
"grad_norm": 0.41857285751070505, |
|
"learning_rate": 1.4274913116970846e-06, |
|
"loss": 0.3401, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.8466447914037428, |
|
"grad_norm": 0.3941031419777465, |
|
"learning_rate": 1.4016687020262231e-06, |
|
"loss": 0.3504, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 0.8480871164316879, |
|
"grad_norm": 0.428688446592497, |
|
"learning_rate": 1.3760641755805848e-06, |
|
"loss": 0.3614, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.8495294414596329, |
|
"grad_norm": 0.4097211469034453, |
|
"learning_rate": 1.3506783817789337e-06, |
|
"loss": 0.3384, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 0.850971766487578, |
|
"grad_norm": 0.44047116848231305, |
|
"learning_rate": 1.3255119644922266e-06, |
|
"loss": 0.3638, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.852414091515523, |
|
"grad_norm": 0.3994464624403052, |
|
"learning_rate": 1.300565562027276e-06, |
|
"loss": 0.3447, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 0.8538564165434681, |
|
"grad_norm": 0.44495457947302897, |
|
"learning_rate": 1.2758398071105626e-06, |
|
"loss": 0.3546, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.8552987415714132, |
|
"grad_norm": 0.4147516297268767, |
|
"learning_rate": 1.2513353268721907e-06, |
|
"loss": 0.3421, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 0.8567410665993581, |
|
"grad_norm": 0.422646250463158, |
|
"learning_rate": 1.2270527428299684e-06, |
|
"loss": 0.3579, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.8581833916273032, |
|
"grad_norm": 0.4189403344854125, |
|
"learning_rate": 1.2029926708736673e-06, |
|
"loss": 0.3425, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 0.8596257166552482, |
|
"grad_norm": 0.41547910036939945, |
|
"learning_rate": 1.179155721249381e-06, |
|
"loss": 0.3376, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.8610680416831933, |
|
"grad_norm": 0.42428858195226893, |
|
"learning_rate": 1.1555424985440522e-06, |
|
"loss": 0.3554, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 0.8625103667111383, |
|
"grad_norm": 0.4425537282272965, |
|
"learning_rate": 1.1321536016701473e-06, |
|
"loss": 0.351, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.8639526917390834, |
|
"grad_norm": 0.4161228925911087, |
|
"learning_rate": 1.1089896238504461e-06, |
|
"loss": 0.336, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 0.8653950167670285, |
|
"grad_norm": 0.37656047979276985, |
|
"learning_rate": 1.086051152603026e-06, |
|
"loss": 0.3509, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.8653950167670285, |
|
"eval_loss": 0.3611552119255066, |
|
"eval_runtime": 142.3229, |
|
"eval_samples_per_second": 12.654, |
|
"eval_steps_per_second": 3.169, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.8668373417949735, |
|
"grad_norm": 0.4463172354545017, |
|
"learning_rate": 1.0633387697263254e-06, |
|
"loss": 0.35, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 0.8682796668229186, |
|
"grad_norm": 0.43074983850708387, |
|
"learning_rate": 1.0408530512844196e-06, |
|
"loss": 0.3613, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.8697219918508636, |
|
"grad_norm": 0.39354733454334206, |
|
"learning_rate": 1.0185945675923813e-06, |
|
"loss": 0.3727, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 0.8711643168788087, |
|
"grad_norm": 0.44960602091132634, |
|
"learning_rate": 9.965638832018432e-07, |
|
"loss": 0.372, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.8726066419067536, |
|
"grad_norm": 0.42518881330063735, |
|
"learning_rate": 9.747615568866553e-07, |
|
"loss": 0.3516, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 0.8740489669346987, |
|
"grad_norm": 0.44741688383815076, |
|
"learning_rate": 9.531881416287203e-07, |
|
"loss": 0.3562, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.8754912919626437, |
|
"grad_norm": 0.4331522299966881, |
|
"learning_rate": 9.318441846039828e-07, |
|
"loss": 0.3548, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 0.8769336169905888, |
|
"grad_norm": 0.506237893255727, |
|
"learning_rate": 9.107302271685226e-07, |
|
"loss": 0.3412, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.8783759420185339, |
|
"grad_norm": 0.4658754493753741, |
|
"learning_rate": 8.898468048448528e-07, |
|
"loss": 0.3336, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 0.8798182670464789, |
|
"grad_norm": 0.438225563597408, |
|
"learning_rate": 8.691944473083114e-07, |
|
"loss": 0.3422, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.881260592074424, |
|
"grad_norm": 0.4170714809613398, |
|
"learning_rate": 8.487736783736533e-07, |
|
"loss": 0.3621, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 0.882702917102369, |
|
"grad_norm": 0.4590349478238853, |
|
"learning_rate": 8.285850159817388e-07, |
|
"loss": 0.3791, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.8841452421303141, |
|
"grad_norm": 0.4332258091307991, |
|
"learning_rate": 8.086289721864127e-07, |
|
"loss": 0.3404, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 0.8855875671582591, |
|
"grad_norm": 0.4452410333427778, |
|
"learning_rate": 7.889060531415193e-07, |
|
"loss": 0.3541, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.8870298921862042, |
|
"grad_norm": 0.42507300447077245, |
|
"learning_rate": 7.694167590880475e-07, |
|
"loss": 0.3549, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 0.8884722172141493, |
|
"grad_norm": 0.4227403053651907, |
|
"learning_rate": 7.501615843414623e-07, |
|
"loss": 0.3264, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.8899145422420942, |
|
"grad_norm": 0.4131961662824003, |
|
"learning_rate": 7.311410172791522e-07, |
|
"loss": 0.3369, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 0.8913568672700393, |
|
"grad_norm": 0.39579591570866374, |
|
"learning_rate": 7.123555403280558e-07, |
|
"loss": 0.3483, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.8927991922979843, |
|
"grad_norm": 0.42292696994848605, |
|
"learning_rate": 6.938056299524099e-07, |
|
"loss": 0.3398, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 0.8942415173259294, |
|
"grad_norm": 0.38022938922831223, |
|
"learning_rate": 6.754917566416796e-07, |
|
"loss": 0.3469, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.8956838423538744, |
|
"grad_norm": 0.4849805496701068, |
|
"learning_rate": 6.574143848986226e-07, |
|
"loss": 0.3618, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 0.8971261673818195, |
|
"grad_norm": 0.44465461522642474, |
|
"learning_rate": 6.395739732274919e-07, |
|
"loss": 0.3642, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.8985684924097646, |
|
"grad_norm": 0.44656695164750837, |
|
"learning_rate": 6.219709741224322e-07, |
|
"loss": 0.3563, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 0.9000108174377096, |
|
"grad_norm": 0.4269116876807273, |
|
"learning_rate": 6.046058340559824e-07, |
|
"loss": 0.3431, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.9014531424656547, |
|
"grad_norm": 0.4086865891433274, |
|
"learning_rate": 5.874789934677583e-07, |
|
"loss": 0.3505, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 0.9028954674935997, |
|
"grad_norm": 0.4404444466800333, |
|
"learning_rate": 5.705908867532862e-07, |
|
"loss": 0.3407, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.9043377925215448, |
|
"grad_norm": 0.45999537115175176, |
|
"learning_rate": 5.53941942252979e-07, |
|
"loss": 0.37, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 0.9057801175494897, |
|
"grad_norm": 0.4242568290280731, |
|
"learning_rate": 5.375325822412747e-07, |
|
"loss": 0.3316, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.9072224425774348, |
|
"grad_norm": 0.4753028820261241, |
|
"learning_rate": 5.213632229159227e-07, |
|
"loss": 0.3785, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 0.9086647676053798, |
|
"grad_norm": 0.4699691806857396, |
|
"learning_rate": 5.054342743874386e-07, |
|
"loss": 0.3617, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.9101070926333249, |
|
"grad_norm": 0.4352496762130561, |
|
"learning_rate": 4.897461406686821e-07, |
|
"loss": 0.3359, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 0.91154941766127, |
|
"grad_norm": 0.4316421343515809, |
|
"learning_rate": 4.742992196646301e-07, |
|
"loss": 0.3376, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.912991742689215, |
|
"grad_norm": 0.4001287994073788, |
|
"learning_rate": 4.590939031622743e-07, |
|
"loss": 0.3351, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 0.9144340677171601, |
|
"grad_norm": 0.4363788326973079, |
|
"learning_rate": 4.4413057682068606e-07, |
|
"loss": 0.3473, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.9158763927451051, |
|
"grad_norm": 0.44176842953481193, |
|
"learning_rate": 4.2940962016123524e-07, |
|
"loss": 0.3332, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 0.9173187177730502, |
|
"grad_norm": 0.43914474716543256, |
|
"learning_rate": 4.149314065579624e-07, |
|
"loss": 0.3383, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.9187610428009952, |
|
"grad_norm": 0.4540079519566383, |
|
"learning_rate": 4.0069630322811303e-07, |
|
"loss": 0.3786, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 0.9202033678289403, |
|
"grad_norm": 0.4612868459187327, |
|
"learning_rate": 3.867046712228162e-07, |
|
"loss": 0.3625, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.9216456928568852, |
|
"grad_norm": 0.40372545279617805, |
|
"learning_rate": 3.729568654179361e-07, |
|
"loss": 0.3308, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 0.9230880178848303, |
|
"grad_norm": 0.4204476032972304, |
|
"learning_rate": 3.5945323450506387e-07, |
|
"loss": 0.3346, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.9245303429127754, |
|
"grad_norm": 0.45260198781122246, |
|
"learning_rate": 3.4619412098267693e-07, |
|
"loss": 0.3795, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 0.9259726679407204, |
|
"grad_norm": 0.42527213346553855, |
|
"learning_rate": 3.331798611474535e-07, |
|
"loss": 0.3421, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.9274149929686655, |
|
"grad_norm": 0.414984415520749, |
|
"learning_rate": 3.204107850857374e-07, |
|
"loss": 0.3291, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 0.9288573179966105, |
|
"grad_norm": 0.4549260227056393, |
|
"learning_rate": 3.0788721666517365e-07, |
|
"loss": 0.3486, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.9302996430245556, |
|
"grad_norm": 0.4443023622951338, |
|
"learning_rate": 2.9560947352648697e-07, |
|
"loss": 0.3756, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 0.9317419680525006, |
|
"grad_norm": 0.4250192102717841, |
|
"learning_rate": 2.8357786707542854e-07, |
|
"loss": 0.3525, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.9331842930804457, |
|
"grad_norm": 0.41194820669384097, |
|
"learning_rate": 2.71792702474879e-07, |
|
"loss": 0.3562, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 0.9346266181083908, |
|
"grad_norm": 0.42277936484045997, |
|
"learning_rate": 2.602542786371065e-07, |
|
"loss": 0.3609, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.9360689431363358, |
|
"grad_norm": 0.402522590339594, |
|
"learning_rate": 2.489628882161832e-07, |
|
"loss": 0.3323, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 0.9375112681642809, |
|
"grad_norm": 0.42468823176649917, |
|
"learning_rate": 2.3791881760056756e-07, |
|
"loss": 0.3705, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.9389535931922258, |
|
"grad_norm": 0.42563197511583134, |
|
"learning_rate": 2.2712234690583813e-07, |
|
"loss": 0.3635, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 0.9403959182201709, |
|
"grad_norm": 0.4452148892270775, |
|
"learning_rate": 2.1657374996758795e-07, |
|
"loss": 0.3478, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.9418382432481159, |
|
"grad_norm": 0.4539015567282992, |
|
"learning_rate": 2.0627329433447917e-07, |
|
"loss": 0.3736, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 0.943280568276061, |
|
"grad_norm": 0.40270803503237657, |
|
"learning_rate": 1.9622124126145837e-07, |
|
"loss": 0.3378, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.944722893304006, |
|
"grad_norm": 0.4075396549757293, |
|
"learning_rate": 1.864178457031318e-07, |
|
"loss": 0.3562, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 0.9461652183319511, |
|
"grad_norm": 0.43266062909072267, |
|
"learning_rate": 1.768633563072919e-07, |
|
"loss": 0.3451, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.9476075433598962, |
|
"grad_norm": 0.418621662939926, |
|
"learning_rate": 1.6755801540862092e-07, |
|
"loss": 0.334, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 0.9490498683878412, |
|
"grad_norm": 0.4221481289163581, |
|
"learning_rate": 1.5850205902253613e-07, |
|
"loss": 0.3536, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.9504921934157863, |
|
"grad_norm": 0.40400229300396406, |
|
"learning_rate": 1.4969571683920768e-07, |
|
"loss": 0.3636, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 0.9519345184437313, |
|
"grad_norm": 0.4142859171614361, |
|
"learning_rate": 1.411392122177302e-07, |
|
"loss": 0.3302, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.9533768434716764, |
|
"grad_norm": 0.4259634616965583, |
|
"learning_rate": 1.3283276218046259e-07, |
|
"loss": 0.3674, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 0.9548191684996213, |
|
"grad_norm": 0.41429097541392035, |
|
"learning_rate": 1.2477657740751714e-07, |
|
"loss": 0.3483, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.9562614935275664, |
|
"grad_norm": 0.42353387168902784, |
|
"learning_rate": 1.169708622314214e-07, |
|
"loss": 0.3608, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 0.9577038185555115, |
|
"grad_norm": 0.42693212185785107, |
|
"learning_rate": 1.0941581463193129e-07, |
|
"loss": 0.3452, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.9591461435834565, |
|
"grad_norm": 0.4328702433520352, |
|
"learning_rate": 1.021116262310129e-07, |
|
"loss": 0.3413, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 0.9605884686114016, |
|
"grad_norm": 0.41956255025855793, |
|
"learning_rate": 9.505848228798076e-08, |
|
"loss": 0.3604, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.9620307936393466, |
|
"grad_norm": 0.4209071869524921, |
|
"learning_rate": 8.825656169480056e-08, |
|
"loss": 0.3384, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 0.9634731186672917, |
|
"grad_norm": 0.4118105753397592, |
|
"learning_rate": 8.170603697154944e-08, |
|
"loss": 0.3338, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.9649154436952367, |
|
"grad_norm": 0.43817584876124205, |
|
"learning_rate": 7.540707426204163e-08, |
|
"loss": 0.3281, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 0.9663577687231818, |
|
"grad_norm": 0.3903217050033041, |
|
"learning_rate": 6.935983332961305e-08, |
|
"loss": 0.3308, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.9678000937511269, |
|
"grad_norm": 0.41905865354117233, |
|
"learning_rate": 6.356446755307444e-08, |
|
"loss": 0.3509, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 0.9692424187790719, |
|
"grad_norm": 0.41394321455611666, |
|
"learning_rate": 5.802112392281123e-08, |
|
"loss": 0.3377, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.9706847438070169, |
|
"grad_norm": 0.4316304666724342, |
|
"learning_rate": 5.272994303706758e-08, |
|
"loss": 0.3592, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 0.9721270688349619, |
|
"grad_norm": 0.45454272140307556, |
|
"learning_rate": 4.769105909836924e-08, |
|
"loss": 0.3485, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.973569393862907, |
|
"grad_norm": 0.43202485000084534, |
|
"learning_rate": 4.2904599910127406e-08, |
|
"loss": 0.3538, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 0.975011718890852, |
|
"grad_norm": 0.44712558770756466, |
|
"learning_rate": 3.837068687339351e-08, |
|
"loss": 0.367, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.9764540439187971, |
|
"grad_norm": 0.423193248701901, |
|
"learning_rate": 3.408943498377726e-08, |
|
"loss": 0.3351, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 0.9778963689467421, |
|
"grad_norm": 0.47037763666404425, |
|
"learning_rate": 3.006095282854116e-08, |
|
"loss": 0.3966, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.9793386939746872, |
|
"grad_norm": 0.4314080592872779, |
|
"learning_rate": 2.628534258383164e-08, |
|
"loss": 0.357, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 0.9807810190026323, |
|
"grad_norm": 0.45121239415975073, |
|
"learning_rate": 2.2762700012097795e-08, |
|
"loss": 0.3564, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.9822233440305773, |
|
"grad_norm": 0.4226505971917229, |
|
"learning_rate": 1.9493114459659956e-08, |
|
"loss": 0.3625, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 0.9836656690585224, |
|
"grad_norm": 0.4197713049001792, |
|
"learning_rate": 1.6476668854440435e-08, |
|
"loss": 0.3526, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.9851079940864674, |
|
"grad_norm": 0.4575738762031232, |
|
"learning_rate": 1.3713439703865183e-08, |
|
"loss": 0.3762, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 0.9865503191144124, |
|
"grad_norm": 0.4574906098764045, |
|
"learning_rate": 1.120349709291868e-08, |
|
"loss": 0.3634, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.9879926441423574, |
|
"grad_norm": 0.43088006927461175, |
|
"learning_rate": 8.946904682370917e-09, |
|
"loss": 0.3675, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 0.9894349691703025, |
|
"grad_norm": 0.4103449101623024, |
|
"learning_rate": 6.943719707158681e-09, |
|
"loss": 0.3496, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.9908772941982475, |
|
"grad_norm": 0.40469613082222705, |
|
"learning_rate": 5.193992974935613e-09, |
|
"loss": 0.369, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 0.9923196192261926, |
|
"grad_norm": 0.46076258755412675, |
|
"learning_rate": 3.697768864782125e-09, |
|
"loss": 0.3588, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.9937619442541377, |
|
"grad_norm": 0.4334341619233562, |
|
"learning_rate": 2.4550853260851826e-09, |
|
"loss": 0.3345, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 0.9952042692820827, |
|
"grad_norm": 0.44568439209243566, |
|
"learning_rate": 1.4659738775679721e-09, |
|
"loss": 0.3459, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.9966465943100278, |
|
"grad_norm": 0.45951543969711284, |
|
"learning_rate": 7.30459606494982e-10, |
|
"loss": 0.3791, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 0.9980889193379728, |
|
"grad_norm": 0.4459520568434071, |
|
"learning_rate": 2.4856116803695375e-10, |
|
"loss": 0.3525, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.9995312443659179, |
|
"grad_norm": 0.4581327568157757, |
|
"learning_rate": 2.0290784791265893e-11, |
|
"loss": 0.3492, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 0.9998197093715069, |
|
"step": 3466, |
|
"total_flos": 4977616761913344.0, |
|
"train_loss": 0.6325558101381102, |
|
"train_runtime": 63848.9812, |
|
"train_samples_per_second": 3.475, |
|
"train_steps_per_second": 0.054 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 3466, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4977616761913344.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|