{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 200, "global_step": 3252, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009225092250922509, "grad_norm": 9.216251979924113, "learning_rate": 0.0, "loss": 1.1386, "step": 1 }, { "epoch": 0.004612546125461255, "grad_norm": 9.265663963684933, "learning_rate": 2.45398773006135e-07, "loss": 1.1356, "step": 5 }, { "epoch": 0.00922509225092251, "grad_norm": 5.379773922138807, "learning_rate": 5.521472392638038e-07, "loss": 1.1078, "step": 10 }, { "epoch": 0.013837638376383764, "grad_norm": 3.1833820798612313, "learning_rate": 8.588957055214725e-07, "loss": 1.0446, "step": 15 }, { "epoch": 0.01845018450184502, "grad_norm": 3.7414115093127966, "learning_rate": 1.165644171779141e-06, "loss": 1.027, "step": 20 }, { "epoch": 0.023062730627306273, "grad_norm": 2.077966946821524, "learning_rate": 1.47239263803681e-06, "loss": 1.0126, "step": 25 }, { "epoch": 0.027675276752767528, "grad_norm": 1.976442561168195, "learning_rate": 1.7791411042944787e-06, "loss": 0.9766, "step": 30 }, { "epoch": 0.03228782287822878, "grad_norm": 2.0558473198791685, "learning_rate": 2.085889570552147e-06, "loss": 0.9988, "step": 35 }, { "epoch": 0.03690036900369004, "grad_norm": 2.0823124494571843, "learning_rate": 2.392638036809816e-06, "loss": 1.0026, "step": 40 }, { "epoch": 0.04151291512915129, "grad_norm": 1.8983417897768358, "learning_rate": 2.699386503067485e-06, "loss": 0.9786, "step": 45 }, { "epoch": 0.046125461254612546, "grad_norm": 2.157480675022155, "learning_rate": 3.0061349693251535e-06, "loss": 0.9712, "step": 50 }, { "epoch": 0.0507380073800738, "grad_norm": 1.770274327543251, "learning_rate": 3.312883435582822e-06, "loss": 0.9703, "step": 55 }, { "epoch": 0.055350553505535055, "grad_norm": 1.7607024339948523, "learning_rate": 3.6196319018404913e-06, "loss": 0.9819, "step": 60 }, { "epoch": 0.05996309963099631, "grad_norm": 1.7629489637063536, "learning_rate": 3.92638036809816e-06, "loss": 0.9814, "step": 65 }, { "epoch": 0.06457564575645756, "grad_norm": 1.9708586699192496, "learning_rate": 4.233128834355829e-06, "loss": 0.9581, "step": 70 }, { "epoch": 0.06918819188191883, "grad_norm": 1.8360638750389535, "learning_rate": 4.539877300613497e-06, "loss": 0.9631, "step": 75 }, { "epoch": 0.07380073800738007, "grad_norm": 2.267503391793938, "learning_rate": 4.846625766871166e-06, "loss": 0.9544, "step": 80 }, { "epoch": 0.07841328413284133, "grad_norm": 1.884337413377388, "learning_rate": 5.153374233128835e-06, "loss": 0.972, "step": 85 }, { "epoch": 0.08302583025830258, "grad_norm": 1.8743154625885863, "learning_rate": 5.460122699386503e-06, "loss": 0.9697, "step": 90 }, { "epoch": 0.08763837638376384, "grad_norm": 2.1189085616099, "learning_rate": 5.766871165644172e-06, "loss": 0.9741, "step": 95 }, { "epoch": 0.09225092250922509, "grad_norm": 2.212060434062884, "learning_rate": 6.073619631901841e-06, "loss": 0.9798, "step": 100 }, { "epoch": 0.09686346863468635, "grad_norm": 1.929193262402203, "learning_rate": 6.38036809815951e-06, "loss": 1.0036, "step": 105 }, { "epoch": 0.1014760147601476, "grad_norm": 2.003660126750222, "learning_rate": 6.687116564417178e-06, "loss": 0.9857, "step": 110 }, { "epoch": 0.10608856088560886, "grad_norm": 1.8476735746280122, "learning_rate": 6.993865030674847e-06, "loss": 0.991, "step": 115 }, { "epoch": 0.11070110701107011, "grad_norm": 1.8581099924431381, "learning_rate": 7.300613496932516e-06, "loss": 0.971, "step": 120 }, { "epoch": 0.11531365313653137, "grad_norm": 1.9178224374591495, "learning_rate": 7.6073619631901856e-06, "loss": 0.9944, "step": 125 }, { "epoch": 0.11992619926199262, "grad_norm": 1.942003358498506, "learning_rate": 7.914110429447854e-06, "loss": 0.9674, "step": 130 }, { "epoch": 0.12453874538745388, "grad_norm": 2.1197747747438154, "learning_rate": 8.220858895705522e-06, "loss": 0.9963, "step": 135 }, { "epoch": 0.12915129151291513, "grad_norm": 1.8664824635600694, "learning_rate": 8.527607361963191e-06, "loss": 0.9926, "step": 140 }, { "epoch": 0.13376383763837638, "grad_norm": 2.0058004450691347, "learning_rate": 8.83435582822086e-06, "loss": 0.9712, "step": 145 }, { "epoch": 0.13837638376383765, "grad_norm": 1.7687908496159355, "learning_rate": 9.14110429447853e-06, "loss": 0.9593, "step": 150 }, { "epoch": 0.1429889298892989, "grad_norm": 2.245349299711709, "learning_rate": 9.447852760736197e-06, "loss": 1.0105, "step": 155 }, { "epoch": 0.14760147601476015, "grad_norm": 2.3027490792265186, "learning_rate": 9.754601226993867e-06, "loss": 0.9674, "step": 160 }, { "epoch": 0.1522140221402214, "grad_norm": 1.7846972073040872, "learning_rate": 1.0061349693251534e-05, "loss": 0.9759, "step": 165 }, { "epoch": 0.15682656826568267, "grad_norm": 2.3489880483587093, "learning_rate": 1.0368098159509204e-05, "loss": 0.9795, "step": 170 }, { "epoch": 0.16143911439114392, "grad_norm": 1.8342564944945305, "learning_rate": 1.0674846625766873e-05, "loss": 0.9652, "step": 175 }, { "epoch": 0.16605166051660517, "grad_norm": 2.0880839713049935, "learning_rate": 1.0981595092024542e-05, "loss": 0.9802, "step": 180 }, { "epoch": 0.1706642066420664, "grad_norm": 2.0564885394915584, "learning_rate": 1.1288343558282208e-05, "loss": 0.9879, "step": 185 }, { "epoch": 0.1752767527675277, "grad_norm": 1.8924944592502544, "learning_rate": 1.1595092024539878e-05, "loss": 0.9721, "step": 190 }, { "epoch": 0.17988929889298894, "grad_norm": 1.826625526770449, "learning_rate": 1.1901840490797547e-05, "loss": 0.9796, "step": 195 }, { "epoch": 0.18450184501845018, "grad_norm": 2.204915779056281, "learning_rate": 1.2208588957055216e-05, "loss": 0.9838, "step": 200 }, { "epoch": 0.18450184501845018, "eval_loss": 0.99369215965271, "eval_runtime": 539.2617, "eval_samples_per_second": 28.465, "eval_steps_per_second": 0.111, "step": 200 }, { "epoch": 0.18911439114391143, "grad_norm": 1.9931165010626688, "learning_rate": 1.2515337423312886e-05, "loss": 0.9757, "step": 205 }, { "epoch": 0.1937269372693727, "grad_norm": 1.9621352935552971, "learning_rate": 1.2822085889570552e-05, "loss": 0.9591, "step": 210 }, { "epoch": 0.19833948339483395, "grad_norm": 2.2039217203491632, "learning_rate": 1.3128834355828221e-05, "loss": 0.9738, "step": 215 }, { "epoch": 0.2029520295202952, "grad_norm": 1.8509141089097243, "learning_rate": 1.343558282208589e-05, "loss": 0.9966, "step": 220 }, { "epoch": 0.20756457564575645, "grad_norm": 1.8414622043228284, "learning_rate": 1.374233128834356e-05, "loss": 1.0021, "step": 225 }, { "epoch": 0.21217712177121772, "grad_norm": 1.80163089612214, "learning_rate": 1.4049079754601229e-05, "loss": 0.9831, "step": 230 }, { "epoch": 0.21678966789667897, "grad_norm": 2.0579916481946983, "learning_rate": 1.4355828220858897e-05, "loss": 1.003, "step": 235 }, { "epoch": 0.22140221402214022, "grad_norm": 2.17279455840179, "learning_rate": 1.4662576687116566e-05, "loss": 0.9951, "step": 240 }, { "epoch": 0.22601476014760147, "grad_norm": 1.9598270944099394, "learning_rate": 1.4969325153374235e-05, "loss": 0.9963, "step": 245 }, { "epoch": 0.23062730627306274, "grad_norm": 1.7398200214510018, "learning_rate": 1.5276073619631903e-05, "loss": 0.995, "step": 250 }, { "epoch": 0.235239852398524, "grad_norm": 1.8787859113099807, "learning_rate": 1.5582822085889574e-05, "loss": 1.0017, "step": 255 }, { "epoch": 0.23985239852398524, "grad_norm": 1.68690066175367, "learning_rate": 1.5889570552147238e-05, "loss": 1.0063, "step": 260 }, { "epoch": 0.2444649446494465, "grad_norm": 1.8827477836410789, "learning_rate": 1.619631901840491e-05, "loss": 1.0143, "step": 265 }, { "epoch": 0.24907749077490776, "grad_norm": 1.7349433277324904, "learning_rate": 1.6503067484662577e-05, "loss": 1.0152, "step": 270 }, { "epoch": 0.253690036900369, "grad_norm": 1.8180747657522185, "learning_rate": 1.6809815950920248e-05, "loss": 1.0022, "step": 275 }, { "epoch": 0.25830258302583026, "grad_norm": 2.2464010609362735, "learning_rate": 1.7116564417177916e-05, "loss": 1.0131, "step": 280 }, { "epoch": 0.2629151291512915, "grad_norm": 1.659157693268852, "learning_rate": 1.7423312883435583e-05, "loss": 1.0029, "step": 285 }, { "epoch": 0.26752767527675275, "grad_norm": 1.8259012389516431, "learning_rate": 1.7730061349693254e-05, "loss": 1.0149, "step": 290 }, { "epoch": 0.272140221402214, "grad_norm": 1.7620006504062913, "learning_rate": 1.8036809815950922e-05, "loss": 1.0058, "step": 295 }, { "epoch": 0.2767527675276753, "grad_norm": 1.8231421170705873, "learning_rate": 1.834355828220859e-05, "loss": 1.0157, "step": 300 }, { "epoch": 0.28136531365313655, "grad_norm": 1.7640621041852131, "learning_rate": 1.8650306748466257e-05, "loss": 0.9917, "step": 305 }, { "epoch": 0.2859778597785978, "grad_norm": 1.6216889282252938, "learning_rate": 1.8957055214723928e-05, "loss": 1.0176, "step": 310 }, { "epoch": 0.29059040590405905, "grad_norm": 1.8784106605665698, "learning_rate": 1.9263803680981596e-05, "loss": 1.0092, "step": 315 }, { "epoch": 0.2952029520295203, "grad_norm": 1.7812979855381206, "learning_rate": 1.9570552147239267e-05, "loss": 1.0122, "step": 320 }, { "epoch": 0.29981549815498154, "grad_norm": 2.7924898972312375, "learning_rate": 1.9877300613496935e-05, "loss": 1.0214, "step": 325 }, { "epoch": 0.3044280442804428, "grad_norm": 1.9196461092563228, "learning_rate": 1.999994812438719e-05, "loss": 1.037, "step": 330 }, { "epoch": 0.30904059040590404, "grad_norm": 1.9675689878341092, "learning_rate": 1.9999631108702447e-05, "loss": 1.0322, "step": 335 }, { "epoch": 0.31365313653136534, "grad_norm": 1.965168599936586, "learning_rate": 1.999902590624309e-05, "loss": 1.0217, "step": 340 }, { "epoch": 0.3182656826568266, "grad_norm": 2.0547134862700225, "learning_rate": 1.9998132534450893e-05, "loss": 1.0312, "step": 345 }, { "epoch": 0.32287822878228783, "grad_norm": 1.8052849563747033, "learning_rate": 1.9996951019072605e-05, "loss": 1.0062, "step": 350 }, { "epoch": 0.3274907749077491, "grad_norm": 1.7264405471248967, "learning_rate": 1.999548139415919e-05, "loss": 1.0176, "step": 355 }, { "epoch": 0.33210332103321033, "grad_norm": 3.9478371721075995, "learning_rate": 1.9993723702064852e-05, "loss": 1.0241, "step": 360 }, { "epoch": 0.3367158671586716, "grad_norm": 1.8206235491040932, "learning_rate": 1.9991677993445832e-05, "loss": 1.0172, "step": 365 }, { "epoch": 0.3413284132841328, "grad_norm": 1.936656750339234, "learning_rate": 1.998934432725891e-05, "loss": 1.0395, "step": 370 }, { "epoch": 0.3459409594095941, "grad_norm": 1.719646103111977, "learning_rate": 1.998672277075975e-05, "loss": 1.0242, "step": 375 }, { "epoch": 0.3505535055350554, "grad_norm": 1.64985869025976, "learning_rate": 1.998381339950093e-05, "loss": 1.0168, "step": 380 }, { "epoch": 0.3551660516605166, "grad_norm": 1.5355798843409723, "learning_rate": 1.9980616297329764e-05, "loss": 1.0, "step": 385 }, { "epoch": 0.35977859778597787, "grad_norm": 1.530082699368783, "learning_rate": 1.997713155638592e-05, "loss": 1.0086, "step": 390 }, { "epoch": 0.3643911439114391, "grad_norm": 1.803426492693465, "learning_rate": 1.997335927709872e-05, "loss": 1.0318, "step": 395 }, { "epoch": 0.36900369003690037, "grad_norm": 1.7841711948275436, "learning_rate": 1.9969299568184276e-05, "loss": 1.0162, "step": 400 }, { "epoch": 0.36900369003690037, "eval_loss": 1.032899022102356, "eval_runtime": 476.7218, "eval_samples_per_second": 32.199, "eval_steps_per_second": 0.126, "step": 400 }, { "epoch": 0.3736162361623616, "grad_norm": 1.750763339163792, "learning_rate": 1.996495254664235e-05, "loss": 1.0238, "step": 405 }, { "epoch": 0.37822878228782286, "grad_norm": 2.084259407603671, "learning_rate": 1.996031833775297e-05, "loss": 1.0144, "step": 410 }, { "epoch": 0.3828413284132841, "grad_norm": 1.581188361061769, "learning_rate": 1.995539707507284e-05, "loss": 1.0034, "step": 415 }, { "epoch": 0.3874538745387454, "grad_norm": 1.6642375432029033, "learning_rate": 1.9950188900431464e-05, "loss": 1.0452, "step": 420 }, { "epoch": 0.39206642066420666, "grad_norm": 1.90454885480689, "learning_rate": 1.9944693963927092e-05, "loss": 1.0156, "step": 425 }, { "epoch": 0.3966789667896679, "grad_norm": 1.9076933400032088, "learning_rate": 1.9938912423922368e-05, "loss": 1.0243, "step": 430 }, { "epoch": 0.40129151291512916, "grad_norm": 1.5419487099448481, "learning_rate": 1.9932844447039775e-05, "loss": 1.0036, "step": 435 }, { "epoch": 0.4059040590405904, "grad_norm": 1.5469597080761535, "learning_rate": 1.992649020815683e-05, "loss": 1.0216, "step": 440 }, { "epoch": 0.41051660516605165, "grad_norm": 1.727534201068121, "learning_rate": 1.991984989040105e-05, "loss": 1.023, "step": 445 }, { "epoch": 0.4151291512915129, "grad_norm": 1.5601876251457003, "learning_rate": 1.9912923685144673e-05, "loss": 1.0309, "step": 450 }, { "epoch": 0.41974169741697415, "grad_norm": 1.5739530073618044, "learning_rate": 1.9905711791999135e-05, "loss": 1.009, "step": 455 }, { "epoch": 0.42435424354243545, "grad_norm": 1.676935183830041, "learning_rate": 1.989821441880933e-05, "loss": 1.01, "step": 460 }, { "epoch": 0.4289667896678967, "grad_norm": 1.6489937405447432, "learning_rate": 1.98904317816476e-05, "loss": 1.023, "step": 465 }, { "epoch": 0.43357933579335795, "grad_norm": 1.5735733105955712, "learning_rate": 1.9882364104807536e-05, "loss": 1.0348, "step": 470 }, { "epoch": 0.4381918819188192, "grad_norm": 1.549624253561804, "learning_rate": 1.9874011620797494e-05, "loss": 1.0302, "step": 475 }, { "epoch": 0.44280442804428044, "grad_norm": 1.5401331906305855, "learning_rate": 1.9865374570333887e-05, "loss": 1.0217, "step": 480 }, { "epoch": 0.4474169741697417, "grad_norm": 1.5202857812071402, "learning_rate": 1.9856453202334277e-05, "loss": 1.0388, "step": 485 }, { "epoch": 0.45202952029520294, "grad_norm": 1.7005961483689318, "learning_rate": 1.9847247773910176e-05, "loss": 1.0167, "step": 490 }, { "epoch": 0.4566420664206642, "grad_norm": 1.7121845874238086, "learning_rate": 1.9837758550359637e-05, "loss": 1.0041, "step": 495 }, { "epoch": 0.4612546125461255, "grad_norm": 1.6923734480662047, "learning_rate": 1.9827985805159626e-05, "loss": 1.0378, "step": 500 }, { "epoch": 0.46586715867158673, "grad_norm": 1.756497601027887, "learning_rate": 1.981792981995812e-05, "loss": 1.0148, "step": 505 }, { "epoch": 0.470479704797048, "grad_norm": 1.6043365874775524, "learning_rate": 1.980759088456601e-05, "loss": 1.0306, "step": 510 }, { "epoch": 0.47509225092250923, "grad_norm": 1.5593042352189914, "learning_rate": 1.9796969296948723e-05, "loss": 1.0384, "step": 515 }, { "epoch": 0.4797047970479705, "grad_norm": 1.8356964322880667, "learning_rate": 1.978606536321767e-05, "loss": 1.0277, "step": 520 }, { "epoch": 0.4843173431734317, "grad_norm": 1.5388627493896347, "learning_rate": 1.9774879397621387e-05, "loss": 1.0089, "step": 525 }, { "epoch": 0.488929889298893, "grad_norm": 1.5717047981647194, "learning_rate": 1.9763411722536503e-05, "loss": 1.0206, "step": 530 }, { "epoch": 0.4935424354243542, "grad_norm": 1.673580231538279, "learning_rate": 1.9751662668458434e-05, "loss": 1.0071, "step": 535 }, { "epoch": 0.4981549815498155, "grad_norm": 1.977373456991288, "learning_rate": 1.9739632573991877e-05, "loss": 1.0223, "step": 540 }, { "epoch": 0.5027675276752768, "grad_norm": 1.6746529870758962, "learning_rate": 1.9727321785841028e-05, "loss": 1.0105, "step": 545 }, { "epoch": 0.507380073800738, "grad_norm": 1.515127525498714, "learning_rate": 1.9714730658799616e-05, "loss": 1.0159, "step": 550 }, { "epoch": 0.5119926199261993, "grad_norm": 1.563050922669026, "learning_rate": 1.9701859555740647e-05, "loss": 1.026, "step": 555 }, { "epoch": 0.5166051660516605, "grad_norm": 1.635916190051428, "learning_rate": 1.9688708847605977e-05, "loss": 1.0148, "step": 560 }, { "epoch": 0.5212177121771218, "grad_norm": 1.6077492732765468, "learning_rate": 1.9675278913395605e-05, "loss": 1.0126, "step": 565 }, { "epoch": 0.525830258302583, "grad_norm": 1.6108748420566017, "learning_rate": 1.9661570140156746e-05, "loss": 1.0116, "step": 570 }, { "epoch": 0.5304428044280443, "grad_norm": 1.433325441089538, "learning_rate": 1.9647582922972696e-05, "loss": 1.012, "step": 575 }, { "epoch": 0.5350553505535055, "grad_norm": 1.5420337235660757, "learning_rate": 1.9633317664951418e-05, "loss": 1.0122, "step": 580 }, { "epoch": 0.5396678966789668, "grad_norm": 1.5924684100513768, "learning_rate": 1.9618774777213954e-05, "loss": 1.0109, "step": 585 }, { "epoch": 0.544280442804428, "grad_norm": 1.5313594203853436, "learning_rate": 1.960395467888255e-05, "loss": 1.0031, "step": 590 }, { "epoch": 0.5488929889298892, "grad_norm": 1.4655266545088188, "learning_rate": 1.9588857797068602e-05, "loss": 1.0315, "step": 595 }, { "epoch": 0.5535055350553506, "grad_norm": 1.5909950744220547, "learning_rate": 1.957348456686032e-05, "loss": 1.0095, "step": 600 }, { "epoch": 0.5535055350553506, "eval_loss": 1.0302140712738037, "eval_runtime": 620.3005, "eval_samples_per_second": 24.746, "eval_steps_per_second": 0.097, "step": 600 }, { "epoch": 0.5581180811808119, "grad_norm": 1.7854466441111572, "learning_rate": 1.955783543131022e-05, "loss": 1.0181, "step": 605 }, { "epoch": 0.5627306273062731, "grad_norm": 1.5564929489350854, "learning_rate": 1.9541910841422324e-05, "loss": 1.0259, "step": 610 }, { "epoch": 0.5673431734317343, "grad_norm": 1.4897983626795097, "learning_rate": 1.952571125613918e-05, "loss": 1.0108, "step": 615 }, { "epoch": 0.5719557195571956, "grad_norm": 1.6487054053969497, "learning_rate": 1.9509237142328638e-05, "loss": 1.0217, "step": 620 }, { "epoch": 0.5765682656826568, "grad_norm": 1.55798491461706, "learning_rate": 1.949248897477038e-05, "loss": 1.0095, "step": 625 }, { "epoch": 0.5811808118081181, "grad_norm": 1.5666282016035418, "learning_rate": 1.9475467236142252e-05, "loss": 1.0197, "step": 630 }, { "epoch": 0.5857933579335793, "grad_norm": 1.5475535188970362, "learning_rate": 1.9458172417006347e-05, "loss": 1.029, "step": 635 }, { "epoch": 0.5904059040590406, "grad_norm": 1.6315436927424156, "learning_rate": 1.944060501579487e-05, "loss": 1.0298, "step": 640 }, { "epoch": 0.5950184501845018, "grad_norm": 1.468711297047145, "learning_rate": 1.9422765538795758e-05, "loss": 1.0018, "step": 645 }, { "epoch": 0.5996309963099631, "grad_norm": 1.518389031402169, "learning_rate": 1.9404654500138117e-05, "loss": 1.0226, "step": 650 }, { "epoch": 0.6042435424354243, "grad_norm": 1.4592808647120747, "learning_rate": 1.938627242177738e-05, "loss": 1.0174, "step": 655 }, { "epoch": 0.6088560885608856, "grad_norm": 1.480051570474581, "learning_rate": 1.936761983348028e-05, "loss": 1.0063, "step": 660 }, { "epoch": 0.6134686346863468, "grad_norm": 1.5455098945055998, "learning_rate": 1.9348697272809568e-05, "loss": 1.0186, "step": 665 }, { "epoch": 0.6180811808118081, "grad_norm": 1.467795849616486, "learning_rate": 1.9329505285108544e-05, "loss": 1.0223, "step": 670 }, { "epoch": 0.6226937269372693, "grad_norm": 1.4583083150731897, "learning_rate": 1.9310044423485303e-05, "loss": 1.0188, "step": 675 }, { "epoch": 0.6273062730627307, "grad_norm": 1.4520145261143442, "learning_rate": 1.9290315248796834e-05, "loss": 1.0148, "step": 680 }, { "epoch": 0.6319188191881919, "grad_norm": 1.545625151246913, "learning_rate": 1.9270318329632833e-05, "loss": 1.0124, "step": 685 }, { "epoch": 0.6365313653136532, "grad_norm": 1.5081286119716983, "learning_rate": 1.925005424229933e-05, "loss": 1.0122, "step": 690 }, { "epoch": 0.6411439114391144, "grad_norm": 1.5771614507281495, "learning_rate": 1.922952357080205e-05, "loss": 1.0304, "step": 695 }, { "epoch": 0.6457564575645757, "grad_norm": 1.3712078687992697, "learning_rate": 1.9208726906829637e-05, "loss": 0.9935, "step": 700 }, { "epoch": 0.6503690036900369, "grad_norm": 1.514260367509165, "learning_rate": 1.9187664849736542e-05, "loss": 0.9928, "step": 705 }, { "epoch": 0.6549815498154982, "grad_norm": 1.4337714275045377, "learning_rate": 1.9166338006525786e-05, "loss": 0.9999, "step": 710 }, { "epoch": 0.6595940959409594, "grad_norm": 1.5474123891067624, "learning_rate": 1.9144746991831463e-05, "loss": 1.0136, "step": 715 }, { "epoch": 0.6642066420664207, "grad_norm": 1.6417540187004078, "learning_rate": 1.9122892427901015e-05, "loss": 1.0148, "step": 720 }, { "epoch": 0.6688191881918819, "grad_norm": 1.4429113958532114, "learning_rate": 1.9100774944577303e-05, "loss": 1.0054, "step": 725 }, { "epoch": 0.6734317343173432, "grad_norm": 1.5435502423447707, "learning_rate": 1.907839517928046e-05, "loss": 1.0042, "step": 730 }, { "epoch": 0.6780442804428044, "grad_norm": 1.6291068946061023, "learning_rate": 1.9055753776989516e-05, "loss": 1.0095, "step": 735 }, { "epoch": 0.6826568265682657, "grad_norm": 1.5269546110275527, "learning_rate": 1.903285139022381e-05, "loss": 1.0091, "step": 740 }, { "epoch": 0.6872693726937269, "grad_norm": 1.6910985232351008, "learning_rate": 1.900968867902419e-05, "loss": 1.0105, "step": 745 }, { "epoch": 0.6918819188191881, "grad_norm": 1.473141009294655, "learning_rate": 1.898626631093399e-05, "loss": 1.0016, "step": 750 }, { "epoch": 0.6964944649446494, "grad_norm": 1.6512202388953925, "learning_rate": 1.896258496097977e-05, "loss": 1.0119, "step": 755 }, { "epoch": 0.7011070110701108, "grad_norm": 1.4887026369918788, "learning_rate": 1.8938645311651904e-05, "loss": 1.0087, "step": 760 }, { "epoch": 0.705719557195572, "grad_norm": 1.3843944368722685, "learning_rate": 1.891444805288487e-05, "loss": 1.0091, "step": 765 }, { "epoch": 0.7103321033210332, "grad_norm": 1.428126710831411, "learning_rate": 1.888999388203739e-05, "loss": 1.0059, "step": 770 }, { "epoch": 0.7149446494464945, "grad_norm": 1.4390802486166878, "learning_rate": 1.8865283503872325e-05, "loss": 0.9994, "step": 775 }, { "epoch": 0.7195571955719557, "grad_norm": 1.5690316004271283, "learning_rate": 1.884031763053636e-05, "loss": 0.9996, "step": 780 }, { "epoch": 0.724169741697417, "grad_norm": 1.5021545547633912, "learning_rate": 1.8815096981539494e-05, "loss": 0.9991, "step": 785 }, { "epoch": 0.7287822878228782, "grad_norm": 1.4900671252584468, "learning_rate": 1.8789622283734283e-05, "loss": 1.0101, "step": 790 }, { "epoch": 0.7333948339483395, "grad_norm": 1.630926420050374, "learning_rate": 1.8763894271294914e-05, "loss": 0.9929, "step": 795 }, { "epoch": 0.7380073800738007, "grad_norm": 1.3834521402990088, "learning_rate": 1.873791368569603e-05, "loss": 0.9857, "step": 800 }, { "epoch": 0.7380073800738007, "eval_loss": 1.0203672647476196, "eval_runtime": 417.6477, "eval_samples_per_second": 36.753, "eval_steps_per_second": 0.144, "step": 800 }, { "epoch": 0.742619926199262, "grad_norm": 1.4145701317152546, "learning_rate": 1.8711681275691366e-05, "loss": 1.0197, "step": 805 }, { "epoch": 0.7472324723247232, "grad_norm": 1.6689498503071274, "learning_rate": 1.868519779729218e-05, "loss": 1.0399, "step": 810 }, { "epoch": 0.7518450184501845, "grad_norm": 1.5473797400416536, "learning_rate": 1.8658464013745443e-05, "loss": 1.0189, "step": 815 }, { "epoch": 0.7564575645756457, "grad_norm": 1.5062087120542231, "learning_rate": 1.8631480695511866e-05, "loss": 1.0154, "step": 820 }, { "epoch": 0.761070110701107, "grad_norm": 1.4715819927221123, "learning_rate": 1.8604248620243682e-05, "loss": 0.9923, "step": 825 }, { "epoch": 0.7656826568265682, "grad_norm": 1.5582751445765881, "learning_rate": 1.8576768572762233e-05, "loss": 1.0035, "step": 830 }, { "epoch": 0.7702952029520295, "grad_norm": 1.5849009070973952, "learning_rate": 1.8549041345035354e-05, "loss": 1.013, "step": 835 }, { "epoch": 0.7749077490774908, "grad_norm": 1.53624381630094, "learning_rate": 1.8521067736154567e-05, "loss": 1.0212, "step": 840 }, { "epoch": 0.7795202952029521, "grad_norm": 1.5482989884986487, "learning_rate": 1.8492848552312016e-05, "loss": 0.9879, "step": 845 }, { "epoch": 0.7841328413284133, "grad_norm": 1.4110912591448512, "learning_rate": 1.8464384606777258e-05, "loss": 0.9973, "step": 850 }, { "epoch": 0.7887453874538746, "grad_norm": 1.4605506515813482, "learning_rate": 1.8435676719873828e-05, "loss": 1.0007, "step": 855 }, { "epoch": 0.7933579335793358, "grad_norm": 1.4855314358619898, "learning_rate": 1.8406725718955575e-05, "loss": 0.9921, "step": 860 }, { "epoch": 0.7979704797047971, "grad_norm": 1.4175285986471877, "learning_rate": 1.837753243838283e-05, "loss": 0.9947, "step": 865 }, { "epoch": 0.8025830258302583, "grad_norm": 1.5124089244421277, "learning_rate": 1.834809771949837e-05, "loss": 1.0007, "step": 870 }, { "epoch": 0.8071955719557196, "grad_norm": 1.6185245778463926, "learning_rate": 1.8318422410603162e-05, "loss": 1.0005, "step": 875 }, { "epoch": 0.8118081180811808, "grad_norm": 1.74728705302942, "learning_rate": 1.8288507366931907e-05, "loss": 0.9977, "step": 880 }, { "epoch": 0.816420664206642, "grad_norm": 1.3700788841960891, "learning_rate": 1.8258353450628402e-05, "loss": 0.9953, "step": 885 }, { "epoch": 0.8210332103321033, "grad_norm": 1.448030688939384, "learning_rate": 1.8227961530720696e-05, "loss": 0.9927, "step": 890 }, { "epoch": 0.8256457564575646, "grad_norm": 1.3860802683248008, "learning_rate": 1.819733248309604e-05, "loss": 1.0137, "step": 895 }, { "epoch": 0.8302583025830258, "grad_norm": 1.3807867910302518, "learning_rate": 1.816646719047563e-05, "loss": 0.9985, "step": 900 }, { "epoch": 0.834870848708487, "grad_norm": 1.5246405968433066, "learning_rate": 1.8135366542389202e-05, "loss": 0.9965, "step": 905 }, { "epoch": 0.8394833948339483, "grad_norm": 1.5298770681063796, "learning_rate": 1.8104031435149366e-05, "loss": 0.9895, "step": 910 }, { "epoch": 0.8440959409594095, "grad_norm": 1.5737628456798298, "learning_rate": 1.807246277182578e-05, "loss": 1.0016, "step": 915 }, { "epoch": 0.8487084870848709, "grad_norm": 1.3867737980631065, "learning_rate": 1.8040661462219135e-05, "loss": 0.9905, "step": 920 }, { "epoch": 0.8533210332103321, "grad_norm": 1.4453648478128216, "learning_rate": 1.8008628422834923e-05, "loss": 1.0005, "step": 925 }, { "epoch": 0.8579335793357934, "grad_norm": 1.612991010163291, "learning_rate": 1.797636457685703e-05, "loss": 0.9915, "step": 930 }, { "epoch": 0.8625461254612546, "grad_norm": 1.4214188542336526, "learning_rate": 1.7943870854121126e-05, "loss": 0.9822, "step": 935 }, { "epoch": 0.8671586715867159, "grad_norm": 1.5619301079989365, "learning_rate": 1.791114819108788e-05, "loss": 0.9781, "step": 940 }, { "epoch": 0.8717712177121771, "grad_norm": 1.575000171763216, "learning_rate": 1.787819753081594e-05, "loss": 1.0021, "step": 945 }, { "epoch": 0.8763837638376384, "grad_norm": 1.5295894907745344, "learning_rate": 1.784501982293479e-05, "loss": 1.0077, "step": 950 }, { "epoch": 0.8809963099630996, "grad_norm": 1.455437508846778, "learning_rate": 1.781161602361737e-05, "loss": 0.9757, "step": 955 }, { "epoch": 0.8856088560885609, "grad_norm": 1.4641943856542534, "learning_rate": 1.7777987095552512e-05, "loss": 0.9918, "step": 960 }, { "epoch": 0.8902214022140221, "grad_norm": 1.56760438004586, "learning_rate": 1.7744134007917195e-05, "loss": 0.9952, "step": 965 }, { "epoch": 0.8948339483394834, "grad_norm": 1.4881030883207977, "learning_rate": 1.7710057736348622e-05, "loss": 0.9995, "step": 970 }, { "epoch": 0.8994464944649446, "grad_norm": 1.4138210092484749, "learning_rate": 1.7675759262916105e-05, "loss": 0.9814, "step": 975 }, { "epoch": 0.9040590405904059, "grad_norm": 1.5269872502666184, "learning_rate": 1.764123957609275e-05, "loss": 0.9969, "step": 980 }, { "epoch": 0.9086715867158671, "grad_norm": 1.4777357727162057, "learning_rate": 1.7606499670726972e-05, "loss": 0.9922, "step": 985 }, { "epoch": 0.9132841328413284, "grad_norm": 1.5422388516772692, "learning_rate": 1.7571540548013836e-05, "loss": 0.9946, "step": 990 }, { "epoch": 0.9178966789667896, "grad_norm": 1.519307824816888, "learning_rate": 1.753636321546619e-05, "loss": 0.9966, "step": 995 }, { "epoch": 0.922509225092251, "grad_norm": 1.4264538885727793, "learning_rate": 1.7500968686885634e-05, "loss": 0.9803, "step": 1000 }, { "epoch": 0.922509225092251, "eval_loss": 1.0050979852676392, "eval_runtime": 475.0904, "eval_samples_per_second": 32.31, "eval_steps_per_second": 0.126, "step": 1000 }, { "epoch": 0.9271217712177122, "grad_norm": 1.5178129589503198, "learning_rate": 1.7465357982333294e-05, "loss": 0.9965, "step": 1005 }, { "epoch": 0.9317343173431735, "grad_norm": 1.4749449313002256, "learning_rate": 1.742953212810045e-05, "loss": 0.998, "step": 1010 }, { "epoch": 0.9363468634686347, "grad_norm": 1.4826220358510274, "learning_rate": 1.739349215667891e-05, "loss": 0.9829, "step": 1015 }, { "epoch": 0.940959409594096, "grad_norm": 1.4366615526126456, "learning_rate": 1.735723910673132e-05, "loss": 0.9847, "step": 1020 }, { "epoch": 0.9455719557195572, "grad_norm": 1.4260729595159904, "learning_rate": 1.732077402306116e-05, "loss": 0.986, "step": 1025 }, { "epoch": 0.9501845018450185, "grad_norm": 1.5039785990003167, "learning_rate": 1.7284097956582694e-05, "loss": 0.9745, "step": 1030 }, { "epoch": 0.9547970479704797, "grad_norm": 1.5082306367621992, "learning_rate": 1.7247211964290635e-05, "loss": 0.9966, "step": 1035 }, { "epoch": 0.959409594095941, "grad_norm": 1.4449103838652617, "learning_rate": 1.721011710922972e-05, "loss": 0.969, "step": 1040 }, { "epoch": 0.9640221402214022, "grad_norm": 1.493303736853594, "learning_rate": 1.717281446046404e-05, "loss": 0.9861, "step": 1045 }, { "epoch": 0.9686346863468634, "grad_norm": 1.47859930222641, "learning_rate": 1.713530509304627e-05, "loss": 0.9962, "step": 1050 }, { "epoch": 0.9732472324723247, "grad_norm": 1.4442728791265258, "learning_rate": 1.709759008798663e-05, "loss": 0.9902, "step": 1055 }, { "epoch": 0.977859778597786, "grad_norm": 1.4376492964532295, "learning_rate": 1.7059670532221802e-05, "loss": 0.9831, "step": 1060 }, { "epoch": 0.9824723247232472, "grad_norm": 2.484316021155763, "learning_rate": 1.7021547518583536e-05, "loss": 0.9813, "step": 1065 }, { "epoch": 0.9870848708487084, "grad_norm": 1.4464291734707186, "learning_rate": 1.6983222145767198e-05, "loss": 0.9902, "step": 1070 }, { "epoch": 0.9916974169741697, "grad_norm": 1.5281863553882298, "learning_rate": 1.6944695518300087e-05, "loss": 0.9807, "step": 1075 }, { "epoch": 0.996309963099631, "grad_norm": 1.4201117731708275, "learning_rate": 1.6905968746509618e-05, "loss": 0.9746, "step": 1080 }, { "epoch": 1.0009225092250922, "grad_norm": 2.838028209786232, "learning_rate": 1.6867042946491306e-05, "loss": 0.9546, "step": 1085 }, { "epoch": 1.0055350553505535, "grad_norm": 2.0637438515793582, "learning_rate": 1.6827919240076612e-05, "loss": 0.7562, "step": 1090 }, { "epoch": 1.0101476014760147, "grad_norm": 2.03888870440938, "learning_rate": 1.6788598754800602e-05, "loss": 0.7325, "step": 1095 }, { "epoch": 1.014760147601476, "grad_norm": 1.5943747459674837, "learning_rate": 1.6749082623869465e-05, "loss": 0.7403, "step": 1100 }, { "epoch": 1.0193726937269372, "grad_norm": 1.705271980777924, "learning_rate": 1.6709371986127846e-05, "loss": 0.749, "step": 1105 }, { "epoch": 1.0239852398523985, "grad_norm": 1.7629595253417687, "learning_rate": 1.6669467986026012e-05, "loss": 0.7087, "step": 1110 }, { "epoch": 1.0285977859778597, "grad_norm": 1.6633137977760193, "learning_rate": 1.662937177358691e-05, "loss": 0.7394, "step": 1115 }, { "epoch": 1.033210332103321, "grad_norm": 1.5074327799371463, "learning_rate": 1.6589084504372975e-05, "loss": 0.7164, "step": 1120 }, { "epoch": 1.0378228782287824, "grad_norm": 1.6191564800390028, "learning_rate": 1.6548607339452853e-05, "loss": 0.7251, "step": 1125 }, { "epoch": 1.0424354243542435, "grad_norm": 1.6118486061526904, "learning_rate": 1.6507941445367935e-05, "loss": 0.7317, "step": 1130 }, { "epoch": 1.0470479704797049, "grad_norm": 1.6655447135885555, "learning_rate": 1.6467087994098753e-05, "loss": 0.7439, "step": 1135 }, { "epoch": 1.051660516605166, "grad_norm": 1.8659877314188116, "learning_rate": 1.6426048163031155e-05, "loss": 0.7311, "step": 1140 }, { "epoch": 1.0562730627306274, "grad_norm": 1.6691346554473123, "learning_rate": 1.6384823134922444e-05, "loss": 0.7304, "step": 1145 }, { "epoch": 1.0608856088560885, "grad_norm": 1.6067875332998411, "learning_rate": 1.634341409786723e-05, "loss": 0.7239, "step": 1150 }, { "epoch": 1.0654981549815499, "grad_norm": 1.7410755263145463, "learning_rate": 1.6301822245263212e-05, "loss": 0.7339, "step": 1155 }, { "epoch": 1.070110701107011, "grad_norm": 1.5078513231888042, "learning_rate": 1.6260048775776804e-05, "loss": 0.7344, "step": 1160 }, { "epoch": 1.0747232472324724, "grad_norm": 1.6250331925877979, "learning_rate": 1.6218094893308553e-05, "loss": 0.7418, "step": 1165 }, { "epoch": 1.0793357933579335, "grad_norm": 1.6654181563693542, "learning_rate": 1.6175961806958476e-05, "loss": 0.7265, "step": 1170 }, { "epoch": 1.0839483394833949, "grad_norm": 1.7420998771046359, "learning_rate": 1.6133650730991183e-05, "loss": 0.723, "step": 1175 }, { "epoch": 1.088560885608856, "grad_norm": 1.6483689819198597, "learning_rate": 1.609116288480092e-05, "loss": 0.7316, "step": 1180 }, { "epoch": 1.0931734317343174, "grad_norm": 1.593740794425406, "learning_rate": 1.6048499492876378e-05, "loss": 0.7374, "step": 1185 }, { "epoch": 1.0977859778597785, "grad_norm": 1.5370001403119866, "learning_rate": 1.6005661784765453e-05, "loss": 0.7457, "step": 1190 }, { "epoch": 1.1023985239852399, "grad_norm": 1.4718068363327683, "learning_rate": 1.5962650995039783e-05, "loss": 0.7328, "step": 1195 }, { "epoch": 1.1070110701107012, "grad_norm": 1.5301401714407428, "learning_rate": 1.5919468363259164e-05, "loss": 0.736, "step": 1200 }, { "epoch": 1.1070110701107012, "eval_loss": 1.0061343908309937, "eval_runtime": 439.8508, "eval_samples_per_second": 34.898, "eval_steps_per_second": 0.136, "step": 1200 }, { "epoch": 1.1116236162361623, "grad_norm": 1.7064428647610237, "learning_rate": 1.587611513393585e-05, "loss": 0.7297, "step": 1205 }, { "epoch": 1.1162361623616237, "grad_norm": 1.6208328161309395, "learning_rate": 1.5832592556498657e-05, "loss": 0.7346, "step": 1210 }, { "epoch": 1.1208487084870848, "grad_norm": 1.802694495701501, "learning_rate": 1.5788901885256983e-05, "loss": 0.7365, "step": 1215 }, { "epoch": 1.1254612546125462, "grad_norm": 1.5370188196224415, "learning_rate": 1.5745044379364637e-05, "loss": 0.7305, "step": 1220 }, { "epoch": 1.1300738007380073, "grad_norm": 1.4889078253244556, "learning_rate": 1.5701021302783557e-05, "loss": 0.732, "step": 1225 }, { "epoch": 1.1346863468634687, "grad_norm": 1.6294276521184954, "learning_rate": 1.56568339242474e-05, "loss": 0.7276, "step": 1230 }, { "epoch": 1.1392988929889298, "grad_norm": 1.5397859025285652, "learning_rate": 1.5612483517224942e-05, "loss": 0.7354, "step": 1235 }, { "epoch": 1.1439114391143912, "grad_norm": 1.4953485038562, "learning_rate": 1.556797135988342e-05, "loss": 0.7173, "step": 1240 }, { "epoch": 1.1485239852398523, "grad_norm": 1.853185392904802, "learning_rate": 1.5523298735051657e-05, "loss": 0.7489, "step": 1245 }, { "epoch": 1.1531365313653137, "grad_norm": 1.5704541475489389, "learning_rate": 1.5478466930183107e-05, "loss": 0.7191, "step": 1250 }, { "epoch": 1.1577490774907748, "grad_norm": 1.5415559777438193, "learning_rate": 1.5433477237318765e-05, "loss": 0.7327, "step": 1255 }, { "epoch": 1.1623616236162362, "grad_norm": 1.6666092802375732, "learning_rate": 1.5388330953049907e-05, "loss": 0.7473, "step": 1260 }, { "epoch": 1.1669741697416973, "grad_norm": 1.8791358127374613, "learning_rate": 1.5343029378480733e-05, "loss": 0.7312, "step": 1265 }, { "epoch": 1.1715867158671587, "grad_norm": 1.5841817509277247, "learning_rate": 1.5297573819190873e-05, "loss": 0.7416, "step": 1270 }, { "epoch": 1.17619926199262, "grad_norm": 1.5715763468516226, "learning_rate": 1.5251965585197748e-05, "loss": 0.7307, "step": 1275 }, { "epoch": 1.1808118081180812, "grad_norm": 1.5391235781858166, "learning_rate": 1.5206205990918836e-05, "loss": 0.7212, "step": 1280 }, { "epoch": 1.1854243542435423, "grad_norm": 1.5629140884896369, "learning_rate": 1.5160296355133773e-05, "loss": 0.7312, "step": 1285 }, { "epoch": 1.1900369003690037, "grad_norm": 5.396412006514146, "learning_rate": 1.5114238000946353e-05, "loss": 0.7141, "step": 1290 }, { "epoch": 1.194649446494465, "grad_norm": 1.5426421143956044, "learning_rate": 1.50680322557464e-05, "loss": 0.7308, "step": 1295 }, { "epoch": 1.1992619926199262, "grad_norm": 1.6701861441853627, "learning_rate": 1.5021680451171499e-05, "loss": 0.7415, "step": 1300 }, { "epoch": 1.2038745387453875, "grad_norm": 1.5779190034035813, "learning_rate": 1.4975183923068637e-05, "loss": 0.7302, "step": 1305 }, { "epoch": 1.2084870848708487, "grad_norm": 1.619587352430737, "learning_rate": 1.492854401145569e-05, "loss": 0.7318, "step": 1310 }, { "epoch": 1.21309963099631, "grad_norm": 1.619976635054261, "learning_rate": 1.4881762060482814e-05, "loss": 0.7254, "step": 1315 }, { "epoch": 1.2177121771217712, "grad_norm": 1.5945364409345257, "learning_rate": 1.48348394183937e-05, "loss": 0.7402, "step": 1320 }, { "epoch": 1.2223247232472325, "grad_norm": 1.5229514773361725, "learning_rate": 1.4787777437486723e-05, "loss": 0.7367, "step": 1325 }, { "epoch": 1.2269372693726937, "grad_norm": 2.0971835619796932, "learning_rate": 1.4740577474075963e-05, "loss": 0.7416, "step": 1330 }, { "epoch": 1.231549815498155, "grad_norm": 1.5602315222575482, "learning_rate": 1.4693240888452121e-05, "loss": 0.7375, "step": 1335 }, { "epoch": 1.2361623616236161, "grad_norm": 2.090838147725305, "learning_rate": 1.4645769044843318e-05, "loss": 0.7375, "step": 1340 }, { "epoch": 1.2407749077490775, "grad_norm": 1.631256144537058, "learning_rate": 1.459816331137577e-05, "loss": 0.7463, "step": 1345 }, { "epoch": 1.2453874538745389, "grad_norm": 1.6259793781417131, "learning_rate": 1.4550425060034367e-05, "loss": 0.7237, "step": 1350 }, { "epoch": 1.25, "grad_norm": 1.5839497005284708, "learning_rate": 1.450255566662313e-05, "loss": 0.7267, "step": 1355 }, { "epoch": 1.2546125461254611, "grad_norm": 1.4817515646858677, "learning_rate": 1.4454556510725556e-05, "loss": 0.7384, "step": 1360 }, { "epoch": 1.2592250922509225, "grad_norm": 1.619889890472081, "learning_rate": 1.4406428975664875e-05, "loss": 0.7445, "step": 1365 }, { "epoch": 1.2638376383763839, "grad_norm": 1.5742411445585174, "learning_rate": 1.4358174448464155e-05, "loss": 0.731, "step": 1370 }, { "epoch": 1.268450184501845, "grad_norm": 1.5691396578757213, "learning_rate": 1.4309794319806356e-05, "loss": 0.7445, "step": 1375 }, { "epoch": 1.2730627306273063, "grad_norm": 1.5149875801425627, "learning_rate": 1.4261289983994236e-05, "loss": 0.7265, "step": 1380 }, { "epoch": 1.2776752767527675, "grad_norm": 1.5087047437199383, "learning_rate": 1.421266283891017e-05, "loss": 0.7456, "step": 1385 }, { "epoch": 1.2822878228782288, "grad_norm": 1.5718051035987606, "learning_rate": 1.4163914285975863e-05, "loss": 0.7212, "step": 1390 }, { "epoch": 1.28690036900369, "grad_norm": 1.5655345894552062, "learning_rate": 1.411504573011197e-05, "loss": 0.7112, "step": 1395 }, { "epoch": 1.2915129151291513, "grad_norm": 1.6127771597600427, "learning_rate": 1.4066058579697593e-05, "loss": 0.7249, "step": 1400 }, { "epoch": 1.2915129151291513, "eval_loss": 1.000433087348938, "eval_runtime": 377.6786, "eval_samples_per_second": 40.643, "eval_steps_per_second": 0.159, "step": 1400 }, { "epoch": 1.2961254612546125, "grad_norm": 1.5789241025710268, "learning_rate": 1.4016954246529697e-05, "loss": 0.7284, "step": 1405 }, { "epoch": 1.3007380073800738, "grad_norm": 1.6370566010616505, "learning_rate": 1.3967734145782425e-05, "loss": 0.7233, "step": 1410 }, { "epoch": 1.305350553505535, "grad_norm": 1.6019988849536153, "learning_rate": 1.391839969596632e-05, "loss": 0.7305, "step": 1415 }, { "epoch": 1.3099630996309963, "grad_norm": 1.5321404428187482, "learning_rate": 1.3868952318887421e-05, "loss": 0.7161, "step": 1420 }, { "epoch": 1.3145756457564577, "grad_norm": 1.617468685697329, "learning_rate": 1.3819393439606313e-05, "loss": 0.7383, "step": 1425 }, { "epoch": 1.3191881918819188, "grad_norm": 1.4942275323967702, "learning_rate": 1.3769724486397035e-05, "loss": 0.7309, "step": 1430 }, { "epoch": 1.32380073800738, "grad_norm": 1.526992745554204, "learning_rate": 1.371994689070594e-05, "loss": 0.7241, "step": 1435 }, { "epoch": 1.3284132841328413, "grad_norm": 1.5223415766171715, "learning_rate": 1.3670062087110423e-05, "loss": 0.7369, "step": 1440 }, { "epoch": 1.3330258302583027, "grad_norm": 1.5349271200758632, "learning_rate": 1.362007151327758e-05, "loss": 0.7408, "step": 1445 }, { "epoch": 1.3376383763837638, "grad_norm": 1.5487172960940725, "learning_rate": 1.3569976609922785e-05, "loss": 0.7366, "step": 1450 }, { "epoch": 1.3422509225092252, "grad_norm": 1.4925683710432864, "learning_rate": 1.3519778820768157e-05, "loss": 0.7316, "step": 1455 }, { "epoch": 1.3468634686346863, "grad_norm": 1.508872342522196, "learning_rate": 1.3469479592500954e-05, "loss": 0.7282, "step": 1460 }, { "epoch": 1.3514760147601477, "grad_norm": 1.5205565120558908, "learning_rate": 1.3419080374731889e-05, "loss": 0.7361, "step": 1465 }, { "epoch": 1.3560885608856088, "grad_norm": 1.5925418525444446, "learning_rate": 1.3368582619953348e-05, "loss": 0.7314, "step": 1470 }, { "epoch": 1.3607011070110702, "grad_norm": 1.536022556446867, "learning_rate": 1.331798778349752e-05, "loss": 0.7297, "step": 1475 }, { "epoch": 1.3653136531365313, "grad_norm": 1.6415269938571113, "learning_rate": 1.326729732349447e-05, "loss": 0.7236, "step": 1480 }, { "epoch": 1.3699261992619927, "grad_norm": 1.5571914542665708, "learning_rate": 1.3216512700830104e-05, "loss": 0.7456, "step": 1485 }, { "epoch": 1.3745387453874538, "grad_norm": 1.537220055899943, "learning_rate": 1.3165635379104079e-05, "loss": 0.7283, "step": 1490 }, { "epoch": 1.3791512915129152, "grad_norm": 1.586303040039408, "learning_rate": 1.31146668245876e-05, "loss": 0.74, "step": 1495 }, { "epoch": 1.3837638376383765, "grad_norm": 1.599460902002627, "learning_rate": 1.3063608506181189e-05, "loss": 0.7269, "step": 1500 }, { "epoch": 1.3883763837638377, "grad_norm": 1.5845763646878845, "learning_rate": 1.3012461895372343e-05, "loss": 0.7207, "step": 1505 }, { "epoch": 1.3929889298892988, "grad_norm": 1.6101936272120416, "learning_rate": 1.2961228466193116e-05, "loss": 0.7491, "step": 1510 }, { "epoch": 1.3976014760147601, "grad_norm": 1.5328730912860027, "learning_rate": 1.2909909695177647e-05, "loss": 0.7428, "step": 1515 }, { "epoch": 1.4022140221402215, "grad_norm": 1.4700749106652151, "learning_rate": 1.28585070613196e-05, "loss": 0.7337, "step": 1520 }, { "epoch": 1.4068265682656826, "grad_norm": 1.595108354129352, "learning_rate": 1.2807022046029556e-05, "loss": 0.7476, "step": 1525 }, { "epoch": 1.4114391143911438, "grad_norm": 1.4846465873330463, "learning_rate": 1.2755456133092295e-05, "loss": 0.7471, "step": 1530 }, { "epoch": 1.4160516605166051, "grad_norm": 1.6404654691954998, "learning_rate": 1.2703810808624051e-05, "loss": 0.7338, "step": 1535 }, { "epoch": 1.4206642066420665, "grad_norm": 1.5924740161540634, "learning_rate": 1.2652087561029682e-05, "loss": 0.7349, "step": 1540 }, { "epoch": 1.4252767527675276, "grad_norm": 1.4942826903716253, "learning_rate": 1.2600287880959762e-05, "loss": 0.725, "step": 1545 }, { "epoch": 1.429889298892989, "grad_norm": 1.5398233388725502, "learning_rate": 1.254841326126764e-05, "loss": 0.7376, "step": 1550 }, { "epoch": 1.4345018450184501, "grad_norm": 1.5654070863650702, "learning_rate": 1.2496465196966393e-05, "loss": 0.7318, "step": 1555 }, { "epoch": 1.4391143911439115, "grad_norm": 1.5898873014454018, "learning_rate": 1.2444445185185763e-05, "loss": 0.7306, "step": 1560 }, { "epoch": 1.4437269372693726, "grad_norm": 1.5102337179405925, "learning_rate": 1.239235472512899e-05, "loss": 0.7057, "step": 1565 }, { "epoch": 1.448339483394834, "grad_norm": 1.5134306661913561, "learning_rate": 1.2340195318029623e-05, "loss": 0.7216, "step": 1570 }, { "epoch": 1.4529520295202953, "grad_norm": 1.4927818706889626, "learning_rate": 1.228796846710825e-05, "loss": 0.7402, "step": 1575 }, { "epoch": 1.4575645756457565, "grad_norm": 1.455448272385386, "learning_rate": 1.2235675677529158e-05, "loss": 0.7172, "step": 1580 }, { "epoch": 1.4621771217712176, "grad_norm": 1.5634881056548133, "learning_rate": 1.2183318456356984e-05, "loss": 0.7389, "step": 1585 }, { "epoch": 1.466789667896679, "grad_norm": 1.4898756090326564, "learning_rate": 1.2130898312513255e-05, "loss": 0.7378, "step": 1590 }, { "epoch": 1.4714022140221403, "grad_norm": 1.5009726152389429, "learning_rate": 1.2078416756732925e-05, "loss": 0.7235, "step": 1595 }, { "epoch": 1.4760147601476015, "grad_norm": 1.5101924554242023, "learning_rate": 1.2025875301520811e-05, "loss": 0.7355, "step": 1600 }, { "epoch": 1.4760147601476015, "eval_loss": 0.9855098724365234, "eval_runtime": 380.6287, "eval_samples_per_second": 40.328, "eval_steps_per_second": 0.158, "step": 1600 }, { "epoch": 1.4806273062730626, "grad_norm": 1.4634212318812496, "learning_rate": 1.1973275461108027e-05, "loss": 0.7252, "step": 1605 }, { "epoch": 1.485239852398524, "grad_norm": 1.6489180454948977, "learning_rate": 1.1920618751408328e-05, "loss": 0.7196, "step": 1610 }, { "epoch": 1.4898523985239853, "grad_norm": 1.5637692459131638, "learning_rate": 1.186790668997443e-05, "loss": 0.7292, "step": 1615 }, { "epoch": 1.4944649446494465, "grad_norm": 1.5196144325218592, "learning_rate": 1.1815140795954268e-05, "loss": 0.7317, "step": 1620 }, { "epoch": 1.4990774907749078, "grad_norm": 1.5589118343089332, "learning_rate": 1.176232259004722e-05, "loss": 0.7282, "step": 1625 }, { "epoch": 1.503690036900369, "grad_norm": 1.4776701136160202, "learning_rate": 1.1709453594460279e-05, "loss": 0.7142, "step": 1630 }, { "epoch": 1.5083025830258303, "grad_norm": 1.5113121821467062, "learning_rate": 1.165653533286418e-05, "loss": 0.7267, "step": 1635 }, { "epoch": 1.5129151291512914, "grad_norm": 1.5621038457306815, "learning_rate": 1.1603569330349502e-05, "loss": 0.7194, "step": 1640 }, { "epoch": 1.5175276752767528, "grad_norm": 1.4342503333735157, "learning_rate": 1.1550557113382697e-05, "loss": 0.732, "step": 1645 }, { "epoch": 1.5221402214022142, "grad_norm": 1.4925030679944211, "learning_rate": 1.1497500209762102e-05, "loss": 0.7311, "step": 1650 }, { "epoch": 1.5267527675276753, "grad_norm": 1.5884492115586761, "learning_rate": 1.1444400148573918e-05, "loss": 0.7306, "step": 1655 }, { "epoch": 1.5313653136531364, "grad_norm": 1.4907133011402547, "learning_rate": 1.1391258460148135e-05, "loss": 0.7291, "step": 1660 }, { "epoch": 1.5359778597785978, "grad_norm": 1.5177536049645275, "learning_rate": 1.1338076676014427e-05, "loss": 0.7243, "step": 1665 }, { "epoch": 1.5405904059040592, "grad_norm": 1.5089626714379156, "learning_rate": 1.1284856328858017e-05, "loss": 0.7174, "step": 1670 }, { "epoch": 1.5452029520295203, "grad_norm": 1.518407729563154, "learning_rate": 1.1231598952475504e-05, "loss": 0.7188, "step": 1675 }, { "epoch": 1.5498154981549814, "grad_norm": 2.2443040427872973, "learning_rate": 1.1178306081730666e-05, "loss": 0.7274, "step": 1680 }, { "epoch": 1.5544280442804428, "grad_norm": 1.557724202719455, "learning_rate": 1.1124979252510209e-05, "loss": 0.7306, "step": 1685 }, { "epoch": 1.5590405904059041, "grad_norm": 1.5461146750078991, "learning_rate": 1.1071620001679514e-05, "loss": 0.7265, "step": 1690 }, { "epoch": 1.5636531365313653, "grad_norm": 1.677798283188097, "learning_rate": 1.1018229867038358e-05, "loss": 0.7296, "step": 1695 }, { "epoch": 1.5682656826568264, "grad_norm": 1.5842640863093371, "learning_rate": 1.0964810387276561e-05, "loss": 0.7136, "step": 1700 }, { "epoch": 1.5728782287822878, "grad_norm": 1.563395389439852, "learning_rate": 1.0911363101929677e-05, "loss": 0.7244, "step": 1705 }, { "epoch": 1.5774907749077491, "grad_norm": 1.5223804974728257, "learning_rate": 1.085788955133461e-05, "loss": 0.7263, "step": 1710 }, { "epoch": 1.5821033210332103, "grad_norm": 1.4519511890413386, "learning_rate": 1.080439127658521e-05, "loss": 0.7125, "step": 1715 }, { "epoch": 1.5867158671586716, "grad_norm": 1.522533945377353, "learning_rate": 1.0750869819487884e-05, "loss": 0.7273, "step": 1720 }, { "epoch": 1.591328413284133, "grad_norm": 1.5254481413189622, "learning_rate": 1.0697326722517137e-05, "loss": 0.7278, "step": 1725 }, { "epoch": 1.5959409594095941, "grad_norm": 1.5231671586261868, "learning_rate": 1.0643763528771136e-05, "loss": 0.7395, "step": 1730 }, { "epoch": 1.6005535055350553, "grad_norm": 1.4650142883805686, "learning_rate": 1.0590181781927229e-05, "loss": 0.7349, "step": 1735 }, { "epoch": 1.6051660516605166, "grad_norm": 1.4963676540506488, "learning_rate": 1.0536583026197462e-05, "loss": 0.7227, "step": 1740 }, { "epoch": 1.609778597785978, "grad_norm": 1.568382581129352, "learning_rate": 1.0482968806284073e-05, "loss": 0.7104, "step": 1745 }, { "epoch": 1.6143911439114391, "grad_norm": 1.4263377420776584, "learning_rate": 1.042934066733497e-05, "loss": 0.7295, "step": 1750 }, { "epoch": 1.6190036900369003, "grad_norm": 1.557321427633267, "learning_rate": 1.0375700154899208e-05, "loss": 0.7221, "step": 1755 }, { "epoch": 1.6236162361623616, "grad_norm": 1.4697183106878222, "learning_rate": 1.0322048814882438e-05, "loss": 0.7137, "step": 1760 }, { "epoch": 1.628228782287823, "grad_norm": 1.5186029224528301, "learning_rate": 1.0268388193502365e-05, "loss": 0.7064, "step": 1765 }, { "epoch": 1.632841328413284, "grad_norm": 1.4447029146830694, "learning_rate": 1.0214719837244176e-05, "loss": 0.7288, "step": 1770 }, { "epoch": 1.6374538745387452, "grad_norm": 1.6070031997265373, "learning_rate": 1.0161045292815974e-05, "loss": 0.707, "step": 1775 }, { "epoch": 1.6420664206642066, "grad_norm": 1.426331730931973, "learning_rate": 1.010736610710421e-05, "loss": 0.709, "step": 1780 }, { "epoch": 1.646678966789668, "grad_norm": 1.452095892694617, "learning_rate": 1.0053683827129091e-05, "loss": 0.7121, "step": 1785 }, { "epoch": 1.651291512915129, "grad_norm": 1.57091551946505, "learning_rate": 1e-05, "loss": 0.7134, "step": 1790 }, { "epoch": 1.6559040590405905, "grad_norm": 1.5750336048966571, "learning_rate": 9.946316172870909e-06, "loss": 0.7136, "step": 1795 }, { "epoch": 1.6605166051660518, "grad_norm": 1.4906719471502183, "learning_rate": 9.892633892895795e-06, "loss": 0.7151, "step": 1800 }, { "epoch": 1.6605166051660518, "eval_loss": 0.97125643491745, "eval_runtime": 375.0586, "eval_samples_per_second": 40.927, "eval_steps_per_second": 0.16, "step": 1800 }, { "epoch": 1.665129151291513, "grad_norm": 1.5072106225983386, "learning_rate": 9.83895470718403e-06, "loss": 0.7227, "step": 1805 }, { "epoch": 1.669741697416974, "grad_norm": 1.4508206944932882, "learning_rate": 9.785280162755825e-06, "loss": 0.724, "step": 1810 }, { "epoch": 1.6743542435424354, "grad_norm": 1.5498476457151749, "learning_rate": 9.731611806497637e-06, "loss": 0.7026, "step": 1815 }, { "epoch": 1.6789667896678968, "grad_norm": 1.5177452024002571, "learning_rate": 9.677951185117565e-06, "loss": 0.7129, "step": 1820 }, { "epoch": 1.683579335793358, "grad_norm": 1.7536225872991078, "learning_rate": 9.624299845100795e-06, "loss": 0.7157, "step": 1825 }, { "epoch": 1.688191881918819, "grad_norm": 1.5448154946820392, "learning_rate": 9.570659332665032e-06, "loss": 0.7029, "step": 1830 }, { "epoch": 1.6928044280442804, "grad_norm": 1.5229582975841924, "learning_rate": 9.51703119371593e-06, "loss": 0.7231, "step": 1835 }, { "epoch": 1.6974169741697418, "grad_norm": 1.401053250210059, "learning_rate": 9.463416973802541e-06, "loss": 0.6987, "step": 1840 }, { "epoch": 1.702029520295203, "grad_norm": 1.486158083271756, "learning_rate": 9.409818218072774e-06, "loss": 0.7187, "step": 1845 }, { "epoch": 1.706642066420664, "grad_norm": 1.4844923332820112, "learning_rate": 9.35623647122887e-06, "loss": 0.7038, "step": 1850 }, { "epoch": 1.7112546125461254, "grad_norm": 1.4768083176878029, "learning_rate": 9.302673277482867e-06, "loss": 0.7156, "step": 1855 }, { "epoch": 1.7158671586715868, "grad_norm": 1.4690802202313877, "learning_rate": 9.249130180512118e-06, "loss": 0.7007, "step": 1860 }, { "epoch": 1.720479704797048, "grad_norm": 1.521642869722996, "learning_rate": 9.19560872341479e-06, "loss": 0.7124, "step": 1865 }, { "epoch": 1.725092250922509, "grad_norm": 1.4844935676770985, "learning_rate": 9.142110448665394e-06, "loss": 0.7137, "step": 1870 }, { "epoch": 1.7297047970479706, "grad_norm": 1.4802119797626268, "learning_rate": 9.088636898070326e-06, "loss": 0.7142, "step": 1875 }, { "epoch": 1.7343173431734318, "grad_norm": 1.4925162048619054, "learning_rate": 9.035189612723444e-06, "loss": 0.7128, "step": 1880 }, { "epoch": 1.738929889298893, "grad_norm": 1.5072960944547247, "learning_rate": 8.981770132961649e-06, "loss": 0.7, "step": 1885 }, { "epoch": 1.7435424354243543, "grad_norm": 1.463889514395925, "learning_rate": 8.928379998320489e-06, "loss": 0.7057, "step": 1890 }, { "epoch": 1.7481549815498156, "grad_norm": 1.6039296690992704, "learning_rate": 8.875020747489795e-06, "loss": 0.7233, "step": 1895 }, { "epoch": 1.7527675276752768, "grad_norm": 1.5035319064665877, "learning_rate": 8.821693918269334e-06, "loss": 0.7049, "step": 1900 }, { "epoch": 1.757380073800738, "grad_norm": 1.4420897442537834, "learning_rate": 8.768401047524498e-06, "loss": 0.7097, "step": 1905 }, { "epoch": 1.7619926199261993, "grad_norm": 1.4863140306951776, "learning_rate": 8.715143671141985e-06, "loss": 0.7131, "step": 1910 }, { "epoch": 1.7666051660516606, "grad_norm": 1.4372944065533075, "learning_rate": 8.661923323985576e-06, "loss": 0.7066, "step": 1915 }, { "epoch": 1.7712177121771218, "grad_norm": 1.5118603952311795, "learning_rate": 8.60874153985187e-06, "loss": 0.711, "step": 1920 }, { "epoch": 1.775830258302583, "grad_norm": 1.4309158451109616, "learning_rate": 8.555599851426086e-06, "loss": 0.7017, "step": 1925 }, { "epoch": 1.7804428044280443, "grad_norm": 1.4482092213054845, "learning_rate": 8.5024997902379e-06, "loss": 0.7043, "step": 1930 }, { "epoch": 1.7850553505535056, "grad_norm": 1.4820015255772456, "learning_rate": 8.449442886617308e-06, "loss": 0.7134, "step": 1935 }, { "epoch": 1.7896678966789668, "grad_norm": 1.5178706892202136, "learning_rate": 8.396430669650501e-06, "loss": 0.6986, "step": 1940 }, { "epoch": 1.7942804428044279, "grad_norm": 1.5192714047507399, "learning_rate": 8.343464667135821e-06, "loss": 0.7098, "step": 1945 }, { "epoch": 1.7988929889298892, "grad_norm": 1.5584757005755172, "learning_rate": 8.290546405539726e-06, "loss": 0.7007, "step": 1950 }, { "epoch": 1.8035055350553506, "grad_norm": 1.4555072659251027, "learning_rate": 8.237677409952784e-06, "loss": 0.7069, "step": 1955 }, { "epoch": 1.8081180811808117, "grad_norm": 1.435117018138001, "learning_rate": 8.184859204045736e-06, "loss": 0.7126, "step": 1960 }, { "epoch": 1.812730627306273, "grad_norm": 1.497994099179169, "learning_rate": 8.132093310025572e-06, "loss": 0.6918, "step": 1965 }, { "epoch": 1.8173431734317345, "grad_norm": 1.5231919974046568, "learning_rate": 8.079381248591675e-06, "loss": 0.6999, "step": 1970 }, { "epoch": 1.8219557195571956, "grad_norm": 1.416052373581155, "learning_rate": 8.026724538891976e-06, "loss": 0.7007, "step": 1975 }, { "epoch": 1.8265682656826567, "grad_norm": 1.4579047643243503, "learning_rate": 7.974124698479192e-06, "loss": 0.6987, "step": 1980 }, { "epoch": 1.831180811808118, "grad_norm": 1.4768646334937987, "learning_rate": 7.921583243267079e-06, "loss": 0.721, "step": 1985 }, { "epoch": 1.8357933579335795, "grad_norm": 1.4193207071096974, "learning_rate": 7.869101687486748e-06, "loss": 0.6998, "step": 1990 }, { "epoch": 1.8404059040590406, "grad_norm": 1.5195143082878666, "learning_rate": 7.816681543643019e-06, "loss": 0.7035, "step": 1995 }, { "epoch": 1.8450184501845017, "grad_norm": 1.492818689804546, "learning_rate": 7.764324322470842e-06, "loss": 0.7023, "step": 2000 }, { "epoch": 1.8450184501845017, "eval_loss": 0.9556826949119568, "eval_runtime": 438.5581, "eval_samples_per_second": 35.001, "eval_steps_per_second": 0.137, "step": 2000 }, { "epoch": 1.849630996309963, "grad_norm": 1.4550020776516512, "learning_rate": 7.712031532891754e-06, "loss": 0.6959, "step": 2005 }, { "epoch": 1.8542435424354244, "grad_norm": 1.532357943989181, "learning_rate": 7.659804681970378e-06, "loss": 0.716, "step": 2010 }, { "epoch": 1.8588560885608856, "grad_norm": 1.4932493146047923, "learning_rate": 7.607645274871013e-06, "loss": 0.7103, "step": 2015 }, { "epoch": 1.8634686346863467, "grad_norm": 1.3849647891235997, "learning_rate": 7.555554814814243e-06, "loss": 0.7091, "step": 2020 }, { "epoch": 1.868081180811808, "grad_norm": 1.4686606572709258, "learning_rate": 7.50353480303361e-06, "loss": 0.7065, "step": 2025 }, { "epoch": 1.8726937269372694, "grad_norm": 1.4890927562018215, "learning_rate": 7.451586738732362e-06, "loss": 0.7045, "step": 2030 }, { "epoch": 1.8773062730627306, "grad_norm": 1.4388080292308094, "learning_rate": 7.3997121190402375e-06, "loss": 0.7062, "step": 2035 }, { "epoch": 1.881918819188192, "grad_norm": 1.595891571353621, "learning_rate": 7.347912438970324e-06, "loss": 0.693, "step": 2040 }, { "epoch": 1.8865313653136533, "grad_norm": 1.4244467986676035, "learning_rate": 7.296189191375953e-06, "loss": 0.6941, "step": 2045 }, { "epoch": 1.8911439114391144, "grad_norm": 1.4475135191897706, "learning_rate": 7.24454386690771e-06, "loss": 0.7073, "step": 2050 }, { "epoch": 1.8957564575645756, "grad_norm": 1.4673852319078244, "learning_rate": 7.192977953970448e-06, "loss": 0.7078, "step": 2055 }, { "epoch": 1.900369003690037, "grad_norm": 1.4719457932619668, "learning_rate": 7.141492938680401e-06, "loss": 0.691, "step": 2060 }, { "epoch": 1.9049815498154983, "grad_norm": 1.466731728037535, "learning_rate": 7.090090304822356e-06, "loss": 0.7062, "step": 2065 }, { "epoch": 1.9095940959409594, "grad_norm": 1.4553947793369755, "learning_rate": 7.038771533806884e-06, "loss": 0.7106, "step": 2070 }, { "epoch": 1.9142066420664205, "grad_norm": 1.478652231013823, "learning_rate": 6.9875381046276605e-06, "loss": 0.6931, "step": 2075 }, { "epoch": 1.918819188191882, "grad_norm": 1.4356929483984957, "learning_rate": 6.936391493818814e-06, "loss": 0.6898, "step": 2080 }, { "epoch": 1.9234317343173433, "grad_norm": 1.5536671032832632, "learning_rate": 6.885333175412406e-06, "loss": 0.6928, "step": 2085 }, { "epoch": 1.9280442804428044, "grad_norm": 1.4991276022393414, "learning_rate": 6.834364620895928e-06, "loss": 0.6935, "step": 2090 }, { "epoch": 1.9326568265682655, "grad_norm": 1.5046326188308679, "learning_rate": 6.783487299169897e-06, "loss": 0.6983, "step": 2095 }, { "epoch": 1.937269372693727, "grad_norm": 1.4351756608326394, "learning_rate": 6.732702676505531e-06, "loss": 0.7065, "step": 2100 }, { "epoch": 1.9418819188191883, "grad_norm": 1.5547998479904102, "learning_rate": 6.6820122165024845e-06, "loss": 0.6879, "step": 2105 }, { "epoch": 1.9464944649446494, "grad_norm": 1.49827559538925, "learning_rate": 6.631417380046656e-06, "loss": 0.7025, "step": 2110 }, { "epoch": 1.9511070110701108, "grad_norm": 1.531002649653087, "learning_rate": 6.580919625268114e-06, "loss": 0.6909, "step": 2115 }, { "epoch": 1.9557195571955721, "grad_norm": 1.509365230765324, "learning_rate": 6.530520407499049e-06, "loss": 0.686, "step": 2120 }, { "epoch": 1.9603321033210332, "grad_norm": 1.5743592553630588, "learning_rate": 6.480221179231849e-06, "loss": 0.7051, "step": 2125 }, { "epoch": 1.9649446494464944, "grad_norm": 1.6561005765337469, "learning_rate": 6.430023390077218e-06, "loss": 0.6975, "step": 2130 }, { "epoch": 1.9695571955719557, "grad_norm": 1.4695572898069678, "learning_rate": 6.379928486722421e-06, "loss": 0.703, "step": 2135 }, { "epoch": 1.974169741697417, "grad_norm": 1.436121247379392, "learning_rate": 6.329937912889582e-06, "loss": 0.7037, "step": 2140 }, { "epoch": 1.9787822878228782, "grad_norm": 1.4604394210363645, "learning_rate": 6.280053109294064e-06, "loss": 0.6861, "step": 2145 }, { "epoch": 1.9833948339483394, "grad_norm": 1.49703841483432, "learning_rate": 6.230275513602968e-06, "loss": 0.6848, "step": 2150 }, { "epoch": 1.9880073800738007, "grad_norm": 1.4776846632157035, "learning_rate": 6.180606560393694e-06, "loss": 0.6854, "step": 2155 }, { "epoch": 1.992619926199262, "grad_norm": 1.469102349555009, "learning_rate": 6.131047681112583e-06, "loss": 0.6901, "step": 2160 }, { "epoch": 1.9972324723247232, "grad_norm": 1.4916818504881257, "learning_rate": 6.081600304033682e-06, "loss": 0.6986, "step": 2165 }, { "epoch": 2.0018450184501844, "grad_norm": 3.4623791161329507, "learning_rate": 6.032265854217574e-06, "loss": 0.5805, "step": 2170 }, { "epoch": 2.006457564575646, "grad_norm": 2.5409394096245324, "learning_rate": 5.983045753470308e-06, "loss": 0.4067, "step": 2175 }, { "epoch": 2.011070110701107, "grad_norm": 1.963539215801178, "learning_rate": 5.933941420302412e-06, "loss": 0.41, "step": 2180 }, { "epoch": 2.015682656826568, "grad_norm": 1.9578493570683806, "learning_rate": 5.884954269888032e-06, "loss": 0.4078, "step": 2185 }, { "epoch": 2.0202952029520294, "grad_norm": 1.6857102876256314, "learning_rate": 5.83608571402414e-06, "loss": 0.4126, "step": 2190 }, { "epoch": 2.024907749077491, "grad_norm": 1.7969907397732796, "learning_rate": 5.787337161089836e-06, "loss": 0.4086, "step": 2195 }, { "epoch": 2.029520295202952, "grad_norm": 1.6058011608079648, "learning_rate": 5.738710016005766e-06, "loss": 0.3925, "step": 2200 }, { "epoch": 2.029520295202952, "eval_loss": 1.0149868726730347, "eval_runtime": 417.6368, "eval_samples_per_second": 36.754, "eval_steps_per_second": 0.144, "step": 2200 }, { "epoch": 2.034132841328413, "grad_norm": 1.6505939948003603, "learning_rate": 5.690205680193647e-06, "loss": 0.3948, "step": 2205 }, { "epoch": 2.0387453874538743, "grad_norm": 1.6068608464647989, "learning_rate": 5.641825551535849e-06, "loss": 0.3878, "step": 2210 }, { "epoch": 2.043357933579336, "grad_norm": 1.6707505723255622, "learning_rate": 5.593571024335126e-06, "loss": 0.3977, "step": 2215 }, { "epoch": 2.047970479704797, "grad_norm": 1.6484182975706831, "learning_rate": 5.545443489274444e-06, "loss": 0.4009, "step": 2220 }, { "epoch": 2.052583025830258, "grad_norm": 1.6146171821462916, "learning_rate": 5.497444333376874e-06, "loss": 0.3991, "step": 2225 }, { "epoch": 2.0571955719557193, "grad_norm": 1.6445160771788836, "learning_rate": 5.449574939965637e-06, "loss": 0.4019, "step": 2230 }, { "epoch": 2.061808118081181, "grad_norm": 1.6446158891460347, "learning_rate": 5.401836688624231e-06, "loss": 0.3885, "step": 2235 }, { "epoch": 2.066420664206642, "grad_norm": 1.5856595607293187, "learning_rate": 5.354230955156684e-06, "loss": 0.4052, "step": 2240 }, { "epoch": 2.071033210332103, "grad_norm": 1.5870199236748768, "learning_rate": 5.306759111547881e-06, "loss": 0.4029, "step": 2245 }, { "epoch": 2.0756457564575648, "grad_norm": 1.6193242932623593, "learning_rate": 5.259422525924037e-06, "loss": 0.3907, "step": 2250 }, { "epoch": 2.080258302583026, "grad_norm": 1.6256137352538118, "learning_rate": 5.212222562513278e-06, "loss": 0.3989, "step": 2255 }, { "epoch": 2.084870848708487, "grad_norm": 1.6417107171770284, "learning_rate": 5.165160581606301e-06, "loss": 0.3982, "step": 2260 }, { "epoch": 2.089483394833948, "grad_norm": 1.583054814160985, "learning_rate": 5.11823793951719e-06, "loss": 0.3857, "step": 2265 }, { "epoch": 2.0940959409594098, "grad_norm": 1.5988302758967783, "learning_rate": 5.0714559885443115e-06, "loss": 0.3912, "step": 2270 }, { "epoch": 2.098708487084871, "grad_norm": 1.5549504658907112, "learning_rate": 5.024816076931366e-06, "loss": 0.3964, "step": 2275 }, { "epoch": 2.103321033210332, "grad_norm": 1.6288272703564912, "learning_rate": 4.978319548828504e-06, "loss": 0.3979, "step": 2280 }, { "epoch": 2.107933579335793, "grad_norm": 1.6075015238877752, "learning_rate": 4.931967744253601e-06, "loss": 0.3859, "step": 2285 }, { "epoch": 2.1125461254612548, "grad_norm": 1.6429738157314036, "learning_rate": 4.885761999053647e-06, "loss": 0.3962, "step": 2290 }, { "epoch": 2.117158671586716, "grad_norm": 1.6796427577430482, "learning_rate": 4.839703644866228e-06, "loss": 0.4075, "step": 2295 }, { "epoch": 2.121771217712177, "grad_norm": 1.619217591723515, "learning_rate": 4.793794009081167e-06, "loss": 0.4085, "step": 2300 }, { "epoch": 2.126383763837638, "grad_norm": 1.6669420506553294, "learning_rate": 4.7480344148022535e-06, "loss": 0.4009, "step": 2305 }, { "epoch": 2.1309963099630997, "grad_norm": 1.601567836454024, "learning_rate": 4.702426180809132e-06, "loss": 0.3893, "step": 2310 }, { "epoch": 2.135608856088561, "grad_norm": 1.6330436004091688, "learning_rate": 4.65697062151927e-06, "loss": 0.3935, "step": 2315 }, { "epoch": 2.140221402214022, "grad_norm": 1.686740725167288, "learning_rate": 4.611669046950093e-06, "loss": 0.4062, "step": 2320 }, { "epoch": 2.1448339483394836, "grad_norm": 1.5889691549399958, "learning_rate": 4.566522762681239e-06, "loss": 0.3979, "step": 2325 }, { "epoch": 2.1494464944649447, "grad_norm": 1.6850634151797181, "learning_rate": 4.521533069816895e-06, "loss": 0.3999, "step": 2330 }, { "epoch": 2.154059040590406, "grad_norm": 1.5458146169307518, "learning_rate": 4.4767012649483484e-06, "loss": 0.3903, "step": 2335 }, { "epoch": 2.158671586715867, "grad_norm": 1.6016021907561413, "learning_rate": 4.432028640116581e-06, "loss": 0.3885, "step": 2340 }, { "epoch": 2.1632841328413286, "grad_norm": 1.6830749800846674, "learning_rate": 4.387516482775058e-06, "loss": 0.3897, "step": 2345 }, { "epoch": 2.1678966789667897, "grad_norm": 1.657726450794809, "learning_rate": 4.343166075752605e-06, "loss": 0.3995, "step": 2350 }, { "epoch": 2.172509225092251, "grad_norm": 1.6089499042377242, "learning_rate": 4.298978697216442e-06, "loss": 0.3906, "step": 2355 }, { "epoch": 2.177121771217712, "grad_norm": 1.6433325368187606, "learning_rate": 4.254955620635371e-06, "loss": 0.3836, "step": 2360 }, { "epoch": 2.1817343173431736, "grad_norm": 1.6228967594394044, "learning_rate": 4.21109811474302e-06, "loss": 0.3953, "step": 2365 }, { "epoch": 2.1863468634686347, "grad_norm": 1.6564397078095119, "learning_rate": 4.1674074435013445e-06, "loss": 0.3975, "step": 2370 }, { "epoch": 2.190959409594096, "grad_norm": 1.6855648962846128, "learning_rate": 4.1238848660641504e-06, "loss": 0.389, "step": 2375 }, { "epoch": 2.195571955719557, "grad_norm": 1.5746673175696069, "learning_rate": 4.080531636740836e-06, "loss": 0.3844, "step": 2380 }, { "epoch": 2.2001845018450186, "grad_norm": 1.6344618154669375, "learning_rate": 4.03734900496022e-06, "loss": 0.3988, "step": 2385 }, { "epoch": 2.2047970479704797, "grad_norm": 1.6281928689117737, "learning_rate": 3.994338215234547e-06, "loss": 0.3896, "step": 2390 }, { "epoch": 2.209409594095941, "grad_norm": 1.6257611833188321, "learning_rate": 3.9515005071236274e-06, "loss": 0.3961, "step": 2395 }, { "epoch": 2.2140221402214024, "grad_norm": 1.5957292745421947, "learning_rate": 3.908837115199086e-06, "loss": 0.3871, "step": 2400 }, { "epoch": 2.2140221402214024, "eval_loss": 1.0319310426712036, "eval_runtime": 393.3456, "eval_samples_per_second": 39.024, "eval_steps_per_second": 0.153, "step": 2400 }, { "epoch": 2.2186346863468636, "grad_norm": 1.5729201496024248, "learning_rate": 3.866349269008819e-06, "loss": 0.385, "step": 2405 }, { "epoch": 2.2232472324723247, "grad_norm": 1.6183544120950042, "learning_rate": 3.824038193041529e-06, "loss": 0.3968, "step": 2410 }, { "epoch": 2.227859778597786, "grad_norm": 1.6955717019033336, "learning_rate": 3.781905106691447e-06, "loss": 0.4004, "step": 2415 }, { "epoch": 2.2324723247232474, "grad_norm": 1.6580260222032042, "learning_rate": 3.7399512242231994e-06, "loss": 0.3842, "step": 2420 }, { "epoch": 2.2370848708487086, "grad_norm": 1.6490315910819098, "learning_rate": 3.698177754736787e-06, "loss": 0.3862, "step": 2425 }, { "epoch": 2.2416974169741697, "grad_norm": 1.6351326865393605, "learning_rate": 3.6565859021327777e-06, "loss": 0.3952, "step": 2430 }, { "epoch": 2.246309963099631, "grad_norm": 1.7016714416453813, "learning_rate": 3.6151768650775577e-06, "loss": 0.3906, "step": 2435 }, { "epoch": 2.2509225092250924, "grad_norm": 1.5855218888376186, "learning_rate": 3.5739518369688454e-06, "loss": 0.391, "step": 2440 }, { "epoch": 2.2555350553505535, "grad_norm": 1.5979300773582483, "learning_rate": 3.5329120059012536e-06, "loss": 0.3884, "step": 2445 }, { "epoch": 2.2601476014760147, "grad_norm": 1.631933769302546, "learning_rate": 3.492058554632063e-06, "loss": 0.4012, "step": 2450 }, { "epoch": 2.264760147601476, "grad_norm": 1.6784627234471698, "learning_rate": 3.4513926605471504e-06, "loss": 0.3956, "step": 2455 }, { "epoch": 2.2693726937269374, "grad_norm": 1.5994089046113484, "learning_rate": 3.4109154956270253e-06, "loss": 0.3919, "step": 2460 }, { "epoch": 2.2739852398523985, "grad_norm": 1.5962979043384486, "learning_rate": 3.370628226413093e-06, "loss": 0.3975, "step": 2465 }, { "epoch": 2.2785977859778597, "grad_norm": 1.6734564837873838, "learning_rate": 3.330532013973987e-06, "loss": 0.3887, "step": 2470 }, { "epoch": 2.2832103321033212, "grad_norm": 1.5662431409042064, "learning_rate": 3.290628013872159e-06, "loss": 0.3841, "step": 2475 }, { "epoch": 2.2878228782287824, "grad_norm": 1.5635156068197413, "learning_rate": 3.250917376130538e-06, "loss": 0.3951, "step": 2480 }, { "epoch": 2.2924354243542435, "grad_norm": 1.6010796160602963, "learning_rate": 3.211401245199398e-06, "loss": 0.3942, "step": 2485 }, { "epoch": 2.2970479704797047, "grad_norm": 1.6199565737985742, "learning_rate": 3.1720807599233903e-06, "loss": 0.3927, "step": 2490 }, { "epoch": 2.3016605166051662, "grad_norm": 1.6515237978384454, "learning_rate": 3.132957053508696e-06, "loss": 0.3978, "step": 2495 }, { "epoch": 2.3062730627306274, "grad_norm": 1.6541451651760768, "learning_rate": 3.0940312534903848e-06, "loss": 0.397, "step": 2500 }, { "epoch": 2.3108856088560885, "grad_norm": 1.6636499883415654, "learning_rate": 3.0553044816999133e-06, "loss": 0.3771, "step": 2505 }, { "epoch": 2.3154981549815496, "grad_norm": 1.5866618676441615, "learning_rate": 3.0167778542328053e-06, "loss": 0.3967, "step": 2510 }, { "epoch": 2.3201107011070112, "grad_norm": 1.6302246602193289, "learning_rate": 2.9784524814164673e-06, "loss": 0.4006, "step": 2515 }, { "epoch": 2.3247232472324724, "grad_norm": 1.6231412036835564, "learning_rate": 2.940329467778198e-06, "loss": 0.3959, "step": 2520 }, { "epoch": 2.3293357933579335, "grad_norm": 1.6840760426840755, "learning_rate": 2.9024099120133674e-06, "loss": 0.3908, "step": 2525 }, { "epoch": 2.3339483394833946, "grad_norm": 1.7187956725298614, "learning_rate": 2.8646949069537343e-06, "loss": 0.3908, "step": 2530 }, { "epoch": 2.338560885608856, "grad_norm": 1.56700956891004, "learning_rate": 2.8271855395359613e-06, "loss": 0.3961, "step": 2535 }, { "epoch": 2.3431734317343174, "grad_norm": 1.6097463271059957, "learning_rate": 2.7898828907702826e-06, "loss": 0.3894, "step": 2540 }, { "epoch": 2.3477859778597785, "grad_norm": 1.5810624302563459, "learning_rate": 2.7527880357093673e-06, "loss": 0.3853, "step": 2545 }, { "epoch": 2.35239852398524, "grad_norm": 1.6230445922415717, "learning_rate": 2.71590204341731e-06, "loss": 0.3904, "step": 2550 }, { "epoch": 2.357011070110701, "grad_norm": 1.631243837661556, "learning_rate": 2.6792259769388394e-06, "loss": 0.3854, "step": 2555 }, { "epoch": 2.3616236162361623, "grad_norm": 1.6015681704315312, "learning_rate": 2.642760893268684e-06, "loss": 0.3897, "step": 2560 }, { "epoch": 2.3662361623616235, "grad_norm": 1.629094206408994, "learning_rate": 2.6065078433210913e-06, "loss": 0.3956, "step": 2565 }, { "epoch": 2.3708487084870846, "grad_norm": 1.6768173921403078, "learning_rate": 2.570467871899557e-06, "loss": 0.3882, "step": 2570 }, { "epoch": 2.375461254612546, "grad_norm": 1.6096361607682703, "learning_rate": 2.5346420176667052e-06, "loss": 0.3841, "step": 2575 }, { "epoch": 2.3800738007380073, "grad_norm": 1.5785068874659574, "learning_rate": 2.4990313131143716e-06, "loss": 0.407, "step": 2580 }, { "epoch": 2.3846863468634685, "grad_norm": 1.639780492512362, "learning_rate": 2.463636784533813e-06, "loss": 0.3872, "step": 2585 }, { "epoch": 2.38929889298893, "grad_norm": 1.5634276209766216, "learning_rate": 2.4284594519861637e-06, "loss": 0.3844, "step": 2590 }, { "epoch": 2.393911439114391, "grad_norm": 1.6051539919308102, "learning_rate": 2.3935003292730295e-06, "loss": 0.3845, "step": 2595 }, { "epoch": 2.3985239852398523, "grad_norm": 1.5819118683660134, "learning_rate": 2.3587604239072535e-06, "loss": 0.3927, "step": 2600 }, { "epoch": 2.3985239852398523, "eval_loss": 1.0269191265106201, "eval_runtime": 441.0938, "eval_samples_per_second": 34.8, "eval_steps_per_second": 0.136, "step": 2600 }, { "epoch": 2.4031365313653135, "grad_norm": 1.625902982522089, "learning_rate": 2.324240737083897e-06, "loss": 0.3967, "step": 2605 }, { "epoch": 2.407749077490775, "grad_norm": 1.6115877735569477, "learning_rate": 2.2899422636513768e-06, "loss": 0.3888, "step": 2610 }, { "epoch": 2.412361623616236, "grad_norm": 1.6393950015907957, "learning_rate": 2.2558659920828095e-06, "loss": 0.3866, "step": 2615 }, { "epoch": 2.4169741697416973, "grad_norm": 1.6289337323279018, "learning_rate": 2.2220129044474903e-06, "loss": 0.3822, "step": 2620 }, { "epoch": 2.421586715867159, "grad_norm": 1.626399821921426, "learning_rate": 2.1883839763826285e-06, "loss": 0.3917, "step": 2625 }, { "epoch": 2.42619926199262, "grad_norm": 1.5750110015076921, "learning_rate": 2.15498017706521e-06, "loss": 0.3818, "step": 2630 }, { "epoch": 2.430811808118081, "grad_norm": 1.5656770730106075, "learning_rate": 2.1218024691840646e-06, "loss": 0.3949, "step": 2635 }, { "epoch": 2.4354243542435423, "grad_norm": 1.5878185538716267, "learning_rate": 2.088851808912126e-06, "loss": 0.39, "step": 2640 }, { "epoch": 2.4400369003690034, "grad_norm": 1.6108620558982116, "learning_rate": 2.0561291458788736e-06, "loss": 0.3968, "step": 2645 }, { "epoch": 2.444649446494465, "grad_norm": 1.5917905556601293, "learning_rate": 2.0236354231429743e-06, "loss": 0.3835, "step": 2650 }, { "epoch": 2.449261992619926, "grad_norm": 1.5749010329322541, "learning_rate": 1.9913715771650798e-06, "loss": 0.3878, "step": 2655 }, { "epoch": 2.4538745387453873, "grad_norm": 1.5853197835560284, "learning_rate": 1.959338537780868e-06, "loss": 0.3793, "step": 2660 }, { "epoch": 2.458487084870849, "grad_norm": 1.675999301164994, "learning_rate": 1.9275372281742242e-06, "loss": 0.3888, "step": 2665 }, { "epoch": 2.46309963099631, "grad_norm": 1.5909593196232035, "learning_rate": 1.8959685648506365e-06, "loss": 0.379, "step": 2670 }, { "epoch": 2.467712177121771, "grad_norm": 1.6580184737262997, "learning_rate": 1.8646334576107993e-06, "loss": 0.385, "step": 2675 }, { "epoch": 2.4723247232472323, "grad_norm": 1.694950561030231, "learning_rate": 1.83353280952437e-06, "loss": 0.4061, "step": 2680 }, { "epoch": 2.476937269372694, "grad_norm": 1.5612968845817867, "learning_rate": 1.8026675169039654e-06, "loss": 0.3717, "step": 2685 }, { "epoch": 2.481549815498155, "grad_norm": 1.6589816057109397, "learning_rate": 1.7720384692793036e-06, "loss": 0.3907, "step": 2690 }, { "epoch": 2.486162361623616, "grad_norm": 1.6036469226772128, "learning_rate": 1.7416465493715984e-06, "loss": 0.3777, "step": 2695 }, { "epoch": 2.4907749077490777, "grad_norm": 1.6327667406661128, "learning_rate": 1.7114926330680958e-06, "loss": 0.3875, "step": 2700 }, { "epoch": 2.495387453874539, "grad_norm": 1.5827244143165553, "learning_rate": 1.681577589396839e-06, "loss": 0.3859, "step": 2705 }, { "epoch": 2.5, "grad_norm": 1.6514702752041677, "learning_rate": 1.6519022805016305e-06, "loss": 0.3843, "step": 2710 }, { "epoch": 2.504612546125461, "grad_norm": 1.5570332325787979, "learning_rate": 1.6224675616171737e-06, "loss": 0.3715, "step": 2715 }, { "epoch": 2.5092250922509223, "grad_norm": 1.630413477205012, "learning_rate": 1.5932742810444314e-06, "loss": 0.3836, "step": 2720 }, { "epoch": 2.513837638376384, "grad_norm": 1.6521730580375191, "learning_rate": 1.5643232801261731e-06, "loss": 0.3948, "step": 2725 }, { "epoch": 2.518450184501845, "grad_norm": 1.6155666031902267, "learning_rate": 1.5356153932227423e-06, "loss": 0.3898, "step": 2730 }, { "epoch": 2.523062730627306, "grad_norm": 1.6555206625970396, "learning_rate": 1.5071514476879878e-06, "loss": 0.384, "step": 2735 }, { "epoch": 2.5276752767527677, "grad_norm": 1.6677020481929692, "learning_rate": 1.478932263845435e-06, "loss": 0.3952, "step": 2740 }, { "epoch": 2.532287822878229, "grad_norm": 1.66071104050102, "learning_rate": 1.450958654964647e-06, "loss": 0.3883, "step": 2745 }, { "epoch": 2.53690036900369, "grad_norm": 1.5550008356755514, "learning_rate": 1.4232314272377723e-06, "loss": 0.3867, "step": 2750 }, { "epoch": 2.541512915129151, "grad_norm": 1.6135337105923544, "learning_rate": 1.3957513797563227e-06, "loss": 0.3895, "step": 2755 }, { "epoch": 2.5461254612546127, "grad_norm": 1.6353151843919402, "learning_rate": 1.368519304488134e-06, "loss": 0.3868, "step": 2760 }, { "epoch": 2.550738007380074, "grad_norm": 1.5705890317457465, "learning_rate": 1.3415359862545574e-06, "loss": 0.3834, "step": 2765 }, { "epoch": 2.555350553505535, "grad_norm": 1.654601851442529, "learning_rate": 1.3148022027078223e-06, "loss": 0.3832, "step": 2770 }, { "epoch": 2.5599630996309966, "grad_norm": 1.5695058446628602, "learning_rate": 1.2883187243086338e-06, "loss": 0.3893, "step": 2775 }, { "epoch": 2.5645756457564577, "grad_norm": 1.6350847841681155, "learning_rate": 1.262086314303973e-06, "loss": 0.3898, "step": 2780 }, { "epoch": 2.569188191881919, "grad_norm": 1.608716488298126, "learning_rate": 1.2361057287050892e-06, "loss": 0.3834, "step": 2785 }, { "epoch": 2.57380073800738, "grad_norm": 1.6204640984588243, "learning_rate": 1.2103777162657205e-06, "loss": 0.3972, "step": 2790 }, { "epoch": 2.578413284132841, "grad_norm": 1.6231664675158106, "learning_rate": 1.1849030184605092e-06, "loss": 0.3831, "step": 2795 }, { "epoch": 2.5830258302583027, "grad_norm": 1.6455527588787915, "learning_rate": 1.1596823694636427e-06, "loss": 0.3872, "step": 2800 }, { "epoch": 2.5830258302583027, "eval_loss": 1.0266528129577637, "eval_runtime": 403.4694, "eval_samples_per_second": 38.045, "eval_steps_per_second": 0.149, "step": 2800 }, { "epoch": 2.587638376383764, "grad_norm": 1.6501434655468525, "learning_rate": 1.134716496127679e-06, "loss": 0.3866, "step": 2805 }, { "epoch": 2.592250922509225, "grad_norm": 1.5937430607804544, "learning_rate": 1.110006117962612e-06, "loss": 0.3746, "step": 2810 }, { "epoch": 2.5968634686346865, "grad_norm": 1.6042558353979863, "learning_rate": 1.085551947115131e-06, "loss": 0.3813, "step": 2815 }, { "epoch": 2.6014760147601477, "grad_norm": 1.5880288341663498, "learning_rate": 1.0613546883480974e-06, "loss": 0.3879, "step": 2820 }, { "epoch": 2.606088560885609, "grad_norm": 1.6687358545150779, "learning_rate": 1.0374150390202308e-06, "loss": 0.3764, "step": 2825 }, { "epoch": 2.61070110701107, "grad_norm": 1.6305478991844253, "learning_rate": 1.013733689066012e-06, "loss": 0.3936, "step": 2830 }, { "epoch": 2.6153136531365315, "grad_norm": 1.5959262607963571, "learning_rate": 9.903113209758098e-07, "loss": 0.3809, "step": 2835 }, { "epoch": 2.6199261992619927, "grad_norm": 1.6326891680063345, "learning_rate": 9.671486097761918e-07, "loss": 0.3851, "step": 2840 }, { "epoch": 2.624538745387454, "grad_norm": 1.5671079064226814, "learning_rate": 9.442462230104876e-07, "loss": 0.3813, "step": 2845 }, { "epoch": 2.6291512915129154, "grad_norm": 1.6290077759272117, "learning_rate": 9.216048207195438e-07, "loss": 0.3815, "step": 2850 }, { "epoch": 2.6337638376383765, "grad_norm": 1.6273985534855235, "learning_rate": 8.992250554227011e-07, "loss": 0.4061, "step": 2855 }, { "epoch": 2.6383763837638377, "grad_norm": 1.6210509601819985, "learning_rate": 8.771075720989886e-07, "loss": 0.3752, "step": 2860 }, { "epoch": 2.642988929889299, "grad_norm": 1.555467095004222, "learning_rate": 8.552530081685384e-07, "loss": 0.3875, "step": 2865 }, { "epoch": 2.64760147601476, "grad_norm": 1.5802373017514193, "learning_rate": 8.336619934742151e-07, "loss": 0.3819, "step": 2870 }, { "epoch": 2.6522140221402215, "grad_norm": 1.6154443890108339, "learning_rate": 8.123351502634625e-07, "loss": 0.3888, "step": 2875 }, { "epoch": 2.6568265682656826, "grad_norm": 1.6154395851668601, "learning_rate": 7.91273093170365e-07, "loss": 0.3808, "step": 2880 }, { "epoch": 2.661439114391144, "grad_norm": 1.587280501111623, "learning_rate": 7.704764291979516e-07, "loss": 0.3774, "step": 2885 }, { "epoch": 2.6660516605166054, "grad_norm": 1.6071595650095456, "learning_rate": 7.499457577006753e-07, "loss": 0.3819, "step": 2890 }, { "epoch": 2.6706642066420665, "grad_norm": 1.648848686369746, "learning_rate": 7.296816703671683e-07, "loss": 0.3855, "step": 2895 }, { "epoch": 2.6752767527675276, "grad_norm": 1.5617730378995032, "learning_rate": 7.09684751203168e-07, "loss": 0.3909, "step": 2900 }, { "epoch": 2.6798892988929888, "grad_norm": 1.6183910290801358, "learning_rate": 6.899555765147004e-07, "loss": 0.3826, "step": 2905 }, { "epoch": 2.6845018450184504, "grad_norm": 1.674038262833668, "learning_rate": 6.704947148914608e-07, "loss": 0.382, "step": 2910 }, { "epoch": 2.6891143911439115, "grad_norm": 1.6448057348626846, "learning_rate": 6.513027271904315e-07, "loss": 0.3854, "step": 2915 }, { "epoch": 2.6937269372693726, "grad_norm": 1.6374354465922731, "learning_rate": 6.323801665197238e-07, "loss": 0.3851, "step": 2920 }, { "epoch": 2.698339483394834, "grad_norm": 1.5834812445833784, "learning_rate": 6.137275782226216e-07, "loss": 0.3819, "step": 2925 }, { "epoch": 2.7029520295202953, "grad_norm": 1.623319595480127, "learning_rate": 5.953454998618857e-07, "loss": 0.3856, "step": 2930 }, { "epoch": 2.7075645756457565, "grad_norm": 1.5967961053246091, "learning_rate": 5.772344612042435e-07, "loss": 0.3862, "step": 2935 }, { "epoch": 2.7121771217712176, "grad_norm": 1.5710116912601946, "learning_rate": 5.593949842051338e-07, "loss": 0.3842, "step": 2940 }, { "epoch": 2.7167896678966788, "grad_norm": 1.57362891686515, "learning_rate": 5.418275829936537e-07, "loss": 0.3711, "step": 2945 }, { "epoch": 2.7214022140221403, "grad_norm": 1.580310931596939, "learning_rate": 5.24532763857749e-07, "loss": 0.3835, "step": 2950 }, { "epoch": 2.7260147601476015, "grad_norm": 1.6145634716470691, "learning_rate": 5.075110252296245e-07, "loss": 0.3882, "step": 2955 }, { "epoch": 2.7306273062730626, "grad_norm": 1.6743712939756843, "learning_rate": 4.907628576713663e-07, "loss": 0.3838, "step": 2960 }, { "epoch": 2.735239852398524, "grad_norm": 1.5631707817004297, "learning_rate": 4.742887438608235e-07, "loss": 0.387, "step": 2965 }, { "epoch": 2.7398523985239853, "grad_norm": 1.6160350852160132, "learning_rate": 4.5808915857768035e-07, "loss": 0.3733, "step": 2970 }, { "epoch": 2.7444649446494465, "grad_norm": 1.6391404550180673, "learning_rate": 4.4216456868978243e-07, "loss": 0.3863, "step": 2975 }, { "epoch": 2.7490774907749076, "grad_norm": 1.5869883487994245, "learning_rate": 4.265154331396815e-07, "loss": 0.3803, "step": 2980 }, { "epoch": 2.7536900369003687, "grad_norm": 1.6338303616426142, "learning_rate": 4.111422029314016e-07, "loss": 0.367, "step": 2985 }, { "epoch": 2.7583025830258303, "grad_norm": 1.6222090885217113, "learning_rate": 3.960453211174531e-07, "loss": 0.3913, "step": 2990 }, { "epoch": 2.7629151291512914, "grad_norm": 1.5453198744376195, "learning_rate": 3.8122522278605024e-07, "loss": 0.3884, "step": 2995 }, { "epoch": 2.767527675276753, "grad_norm": 1.6194002178218827, "learning_rate": 3.6668233504858486e-07, "loss": 0.3918, "step": 3000 }, { "epoch": 2.767527675276753, "eval_loss": 1.0241528749465942, "eval_runtime": 583.1671, "eval_samples_per_second": 26.322, "eval_steps_per_second": 0.103, "step": 3000 }, { "epoch": 2.772140221402214, "grad_norm": 1.5792021657673334, "learning_rate": 3.524170770273072e-07, "loss": 0.3836, "step": 3005 }, { "epoch": 2.7767527675276753, "grad_norm": 1.5855513913689898, "learning_rate": 3.384298598432545e-07, "loss": 0.3836, "step": 3010 }, { "epoch": 2.7813653136531364, "grad_norm": 1.5563076811229497, "learning_rate": 3.2472108660439706e-07, "loss": 0.3802, "step": 3015 }, { "epoch": 2.7859778597785976, "grad_norm": 1.5823633538445738, "learning_rate": 3.112911523940232e-07, "loss": 0.383, "step": 3020 }, { "epoch": 2.790590405904059, "grad_norm": 1.629806513708018, "learning_rate": 2.9814044425935605e-07, "loss": 0.3821, "step": 3025 }, { "epoch": 2.7952029520295203, "grad_norm": 1.6493594318817755, "learning_rate": 2.852693412003882e-07, "loss": 0.3832, "step": 3030 }, { "epoch": 2.7998154981549814, "grad_norm": 1.5938360437660848, "learning_rate": 2.7267821415897343e-07, "loss": 0.3739, "step": 3035 }, { "epoch": 2.804428044280443, "grad_norm": 1.6409833373595797, "learning_rate": 2.6036742600812683e-07, "loss": 0.3824, "step": 3040 }, { "epoch": 2.809040590405904, "grad_norm": 1.6120109521550734, "learning_rate": 2.4833733154156716e-07, "loss": 0.3791, "step": 3045 }, { "epoch": 2.8136531365313653, "grad_norm": 1.6148694677775197, "learning_rate": 2.3658827746349976e-07, "loss": 0.3716, "step": 3050 }, { "epoch": 2.8182656826568264, "grad_norm": 1.6335439307552395, "learning_rate": 2.2512060237861455e-07, "loss": 0.377, "step": 3055 }, { "epoch": 2.8228782287822876, "grad_norm": 1.6687024271570985, "learning_rate": 2.139346367823314e-07, "loss": 0.3824, "step": 3060 }, { "epoch": 2.827490774907749, "grad_norm": 1.5961399559857916, "learning_rate": 2.030307030512768e-07, "loss": 0.38, "step": 3065 }, { "epoch": 2.8321033210332103, "grad_norm": 1.628301167538455, "learning_rate": 1.9240911543399465e-07, "loss": 0.3861, "step": 3070 }, { "epoch": 2.836715867158672, "grad_norm": 1.5876724378264213, "learning_rate": 1.8207018004188338e-07, "loss": 0.375, "step": 3075 }, { "epoch": 2.841328413284133, "grad_norm": 1.6533761523786383, "learning_rate": 1.7201419484037861e-07, "loss": 0.3847, "step": 3080 }, { "epoch": 2.845940959409594, "grad_norm": 1.65860087801836, "learning_rate": 1.622414496403668e-07, "loss": 0.4014, "step": 3085 }, { "epoch": 2.8505535055350553, "grad_norm": 1.5158450003834456, "learning_rate": 1.527522260898273e-07, "loss": 0.3743, "step": 3090 }, { "epoch": 2.8551660516605164, "grad_norm": 1.543441072095729, "learning_rate": 1.4354679766572344e-07, "loss": 0.3867, "step": 3095 }, { "epoch": 2.859778597785978, "grad_norm": 1.6190829129675481, "learning_rate": 1.3462542966611314e-07, "loss": 0.3697, "step": 3100 }, { "epoch": 2.864391143911439, "grad_norm": 1.6865745421103189, "learning_rate": 1.259883792025085e-07, "loss": 0.3744, "step": 3105 }, { "epoch": 2.8690036900369003, "grad_norm": 1.6329028917573583, "learning_rate": 1.1763589519246388e-07, "loss": 0.3722, "step": 3110 }, { "epoch": 2.873616236162362, "grad_norm": 1.5873886153372632, "learning_rate": 1.095682183524005e-07, "loss": 0.3797, "step": 3115 }, { "epoch": 2.878228782287823, "grad_norm": 1.580205694330548, "learning_rate": 1.0178558119067316e-07, "loss": 0.3705, "step": 3120 }, { "epoch": 2.882841328413284, "grad_norm": 1.5748354606004111, "learning_rate": 9.428820800086558e-08, "loss": 0.3832, "step": 3125 }, { "epoch": 2.8874538745387452, "grad_norm": 1.6471024830756282, "learning_rate": 8.707631485532775e-08, "loss": 0.3886, "step": 3130 }, { "epoch": 2.8920664206642064, "grad_norm": 1.6632577517398965, "learning_rate": 8.015010959894986e-08, "loss": 0.384, "step": 3135 }, { "epoch": 2.896678966789668, "grad_norm": 1.6208970211899278, "learning_rate": 7.350979184317153e-08, "loss": 0.3861, "step": 3140 }, { "epoch": 2.901291512915129, "grad_norm": 1.64161859212595, "learning_rate": 6.715555296022746e-08, "loss": 0.3767, "step": 3145 }, { "epoch": 2.9059040590405907, "grad_norm": 1.6172525804643438, "learning_rate": 6.108757607763305e-08, "loss": 0.3857, "step": 3150 }, { "epoch": 2.910516605166052, "grad_norm": 1.580351811354319, "learning_rate": 5.530603607290852e-08, "loss": 0.3771, "step": 3155 }, { "epoch": 2.915129151291513, "grad_norm": 1.572483075790589, "learning_rate": 4.981109956853747e-08, "loss": 0.3749, "step": 3160 }, { "epoch": 2.919741697416974, "grad_norm": 1.64962807846865, "learning_rate": 4.460292492716512e-08, "loss": 0.3795, "step": 3165 }, { "epoch": 2.9243542435424352, "grad_norm": 1.605735844681554, "learning_rate": 3.968166224703085e-08, "loss": 0.3795, "step": 3170 }, { "epoch": 2.928966789667897, "grad_norm": 1.5747261446081762, "learning_rate": 3.504745335765169e-08, "loss": 0.3793, "step": 3175 }, { "epoch": 2.933579335793358, "grad_norm": 1.653000934575167, "learning_rate": 3.0700431815724464e-08, "loss": 0.3903, "step": 3180 }, { "epoch": 2.938191881918819, "grad_norm": 1.6378961953845004, "learning_rate": 2.664072290128217e-08, "loss": 0.3889, "step": 3185 }, { "epoch": 2.9428044280442807, "grad_norm": 1.584042922862379, "learning_rate": 2.2868443614082468e-08, "loss": 0.3878, "step": 3190 }, { "epoch": 2.947416974169742, "grad_norm": 1.5647903615149348, "learning_rate": 1.9383702670235927e-08, "loss": 0.382, "step": 3195 }, { "epoch": 2.952029520295203, "grad_norm": 1.580064728520442, "learning_rate": 1.6186600499074055e-08, "loss": 0.3764, "step": 3200 }, { "epoch": 2.952029520295203, "eval_loss": 1.024267315864563, "eval_runtime": 436.1706, "eval_samples_per_second": 35.193, "eval_steps_per_second": 0.138, "step": 3200 }, { "epoch": 2.956642066420664, "grad_norm": 1.6540116549017054, "learning_rate": 1.3277229240249435e-08, "loss": 0.3945, "step": 3205 }, { "epoch": 2.961254612546125, "grad_norm": 1.6470444524138421, "learning_rate": 1.0655672741090028e-08, "loss": 0.3806, "step": 3210 }, { "epoch": 2.965867158671587, "grad_norm": 1.5413563389524112, "learning_rate": 8.322006554171147e-09, "loss": 0.3818, "step": 3215 }, { "epoch": 2.970479704797048, "grad_norm": 1.6164344748774018, "learning_rate": 6.276297935149389e-09, "loss": 0.3847, "step": 3220 }, { "epoch": 2.975092250922509, "grad_norm": 1.6066275827671024, "learning_rate": 4.5186058408153156e-09, "loss": 0.3823, "step": 3225 }, { "epoch": 2.9797047970479706, "grad_norm": 1.6333269508170274, "learning_rate": 3.0489809273981375e-09, "loss": 0.3801, "step": 3230 }, { "epoch": 2.984317343173432, "grad_norm": 1.6103531418344368, "learning_rate": 1.8674655491091043e-09, "loss": 0.3932, "step": 3235 }, { "epoch": 2.988929889298893, "grad_norm": 1.6234448383351934, "learning_rate": 9.740937569135967e-10, "loss": 0.3832, "step": 3240 }, { "epoch": 2.993542435424354, "grad_norm": 1.6318211007601922, "learning_rate": 3.6889129755413033e-10, "loss": 0.3871, "step": 3245 }, { "epoch": 2.9981549815498156, "grad_norm": 1.609675292027893, "learning_rate": 5.187561280983744e-11, "loss": 0.3788, "step": 3250 }, { "epoch": 3.0, "step": 3252, "total_flos": 1361805280542720.0, "train_loss": 0.7043063408920832, "train_runtime": 81819.3933, "train_samples_per_second": 5.085, "train_steps_per_second": 0.04 } ], "logging_steps": 5, "max_steps": 3252, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1361805280542720.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }