{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 200,
"global_step": 3252,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0009225092250922509,
"grad_norm": 9.216251979924113,
"learning_rate": 0.0,
"loss": 1.1386,
"step": 1
},
{
"epoch": 0.004612546125461255,
"grad_norm": 9.265663963684933,
"learning_rate": 2.45398773006135e-07,
"loss": 1.1356,
"step": 5
},
{
"epoch": 0.00922509225092251,
"grad_norm": 5.379773922138807,
"learning_rate": 5.521472392638038e-07,
"loss": 1.1078,
"step": 10
},
{
"epoch": 0.013837638376383764,
"grad_norm": 3.1833820798612313,
"learning_rate": 8.588957055214725e-07,
"loss": 1.0446,
"step": 15
},
{
"epoch": 0.01845018450184502,
"grad_norm": 3.7414115093127966,
"learning_rate": 1.165644171779141e-06,
"loss": 1.027,
"step": 20
},
{
"epoch": 0.023062730627306273,
"grad_norm": 2.077966946821524,
"learning_rate": 1.47239263803681e-06,
"loss": 1.0126,
"step": 25
},
{
"epoch": 0.027675276752767528,
"grad_norm": 1.976442561168195,
"learning_rate": 1.7791411042944787e-06,
"loss": 0.9766,
"step": 30
},
{
"epoch": 0.03228782287822878,
"grad_norm": 2.0558473198791685,
"learning_rate": 2.085889570552147e-06,
"loss": 0.9988,
"step": 35
},
{
"epoch": 0.03690036900369004,
"grad_norm": 2.0823124494571843,
"learning_rate": 2.392638036809816e-06,
"loss": 1.0026,
"step": 40
},
{
"epoch": 0.04151291512915129,
"grad_norm": 1.8983417897768358,
"learning_rate": 2.699386503067485e-06,
"loss": 0.9786,
"step": 45
},
{
"epoch": 0.046125461254612546,
"grad_norm": 2.157480675022155,
"learning_rate": 3.0061349693251535e-06,
"loss": 0.9712,
"step": 50
},
{
"epoch": 0.0507380073800738,
"grad_norm": 1.770274327543251,
"learning_rate": 3.312883435582822e-06,
"loss": 0.9703,
"step": 55
},
{
"epoch": 0.055350553505535055,
"grad_norm": 1.7607024339948523,
"learning_rate": 3.6196319018404913e-06,
"loss": 0.9819,
"step": 60
},
{
"epoch": 0.05996309963099631,
"grad_norm": 1.7629489637063536,
"learning_rate": 3.92638036809816e-06,
"loss": 0.9814,
"step": 65
},
{
"epoch": 0.06457564575645756,
"grad_norm": 1.9708586699192496,
"learning_rate": 4.233128834355829e-06,
"loss": 0.9581,
"step": 70
},
{
"epoch": 0.06918819188191883,
"grad_norm": 1.8360638750389535,
"learning_rate": 4.539877300613497e-06,
"loss": 0.9631,
"step": 75
},
{
"epoch": 0.07380073800738007,
"grad_norm": 2.267503391793938,
"learning_rate": 4.846625766871166e-06,
"loss": 0.9544,
"step": 80
},
{
"epoch": 0.07841328413284133,
"grad_norm": 1.884337413377388,
"learning_rate": 5.153374233128835e-06,
"loss": 0.972,
"step": 85
},
{
"epoch": 0.08302583025830258,
"grad_norm": 1.8743154625885863,
"learning_rate": 5.460122699386503e-06,
"loss": 0.9697,
"step": 90
},
{
"epoch": 0.08763837638376384,
"grad_norm": 2.1189085616099,
"learning_rate": 5.766871165644172e-06,
"loss": 0.9741,
"step": 95
},
{
"epoch": 0.09225092250922509,
"grad_norm": 2.212060434062884,
"learning_rate": 6.073619631901841e-06,
"loss": 0.9798,
"step": 100
},
{
"epoch": 0.09686346863468635,
"grad_norm": 1.929193262402203,
"learning_rate": 6.38036809815951e-06,
"loss": 1.0036,
"step": 105
},
{
"epoch": 0.1014760147601476,
"grad_norm": 2.003660126750222,
"learning_rate": 6.687116564417178e-06,
"loss": 0.9857,
"step": 110
},
{
"epoch": 0.10608856088560886,
"grad_norm": 1.8476735746280122,
"learning_rate": 6.993865030674847e-06,
"loss": 0.991,
"step": 115
},
{
"epoch": 0.11070110701107011,
"grad_norm": 1.8581099924431381,
"learning_rate": 7.300613496932516e-06,
"loss": 0.971,
"step": 120
},
{
"epoch": 0.11531365313653137,
"grad_norm": 1.9178224374591495,
"learning_rate": 7.6073619631901856e-06,
"loss": 0.9944,
"step": 125
},
{
"epoch": 0.11992619926199262,
"grad_norm": 1.942003358498506,
"learning_rate": 7.914110429447854e-06,
"loss": 0.9674,
"step": 130
},
{
"epoch": 0.12453874538745388,
"grad_norm": 2.1197747747438154,
"learning_rate": 8.220858895705522e-06,
"loss": 0.9963,
"step": 135
},
{
"epoch": 0.12915129151291513,
"grad_norm": 1.8664824635600694,
"learning_rate": 8.527607361963191e-06,
"loss": 0.9926,
"step": 140
},
{
"epoch": 0.13376383763837638,
"grad_norm": 2.0058004450691347,
"learning_rate": 8.83435582822086e-06,
"loss": 0.9712,
"step": 145
},
{
"epoch": 0.13837638376383765,
"grad_norm": 1.7687908496159355,
"learning_rate": 9.14110429447853e-06,
"loss": 0.9593,
"step": 150
},
{
"epoch": 0.1429889298892989,
"grad_norm": 2.245349299711709,
"learning_rate": 9.447852760736197e-06,
"loss": 1.0105,
"step": 155
},
{
"epoch": 0.14760147601476015,
"grad_norm": 2.3027490792265186,
"learning_rate": 9.754601226993867e-06,
"loss": 0.9674,
"step": 160
},
{
"epoch": 0.1522140221402214,
"grad_norm": 1.7846972073040872,
"learning_rate": 1.0061349693251534e-05,
"loss": 0.9759,
"step": 165
},
{
"epoch": 0.15682656826568267,
"grad_norm": 2.3489880483587093,
"learning_rate": 1.0368098159509204e-05,
"loss": 0.9795,
"step": 170
},
{
"epoch": 0.16143911439114392,
"grad_norm": 1.8342564944945305,
"learning_rate": 1.0674846625766873e-05,
"loss": 0.9652,
"step": 175
},
{
"epoch": 0.16605166051660517,
"grad_norm": 2.0880839713049935,
"learning_rate": 1.0981595092024542e-05,
"loss": 0.9802,
"step": 180
},
{
"epoch": 0.1706642066420664,
"grad_norm": 2.0564885394915584,
"learning_rate": 1.1288343558282208e-05,
"loss": 0.9879,
"step": 185
},
{
"epoch": 0.1752767527675277,
"grad_norm": 1.8924944592502544,
"learning_rate": 1.1595092024539878e-05,
"loss": 0.9721,
"step": 190
},
{
"epoch": 0.17988929889298894,
"grad_norm": 1.826625526770449,
"learning_rate": 1.1901840490797547e-05,
"loss": 0.9796,
"step": 195
},
{
"epoch": 0.18450184501845018,
"grad_norm": 2.204915779056281,
"learning_rate": 1.2208588957055216e-05,
"loss": 0.9838,
"step": 200
},
{
"epoch": 0.18450184501845018,
"eval_loss": 0.99369215965271,
"eval_runtime": 539.2617,
"eval_samples_per_second": 28.465,
"eval_steps_per_second": 0.111,
"step": 200
},
{
"epoch": 0.18911439114391143,
"grad_norm": 1.9931165010626688,
"learning_rate": 1.2515337423312886e-05,
"loss": 0.9757,
"step": 205
},
{
"epoch": 0.1937269372693727,
"grad_norm": 1.9621352935552971,
"learning_rate": 1.2822085889570552e-05,
"loss": 0.9591,
"step": 210
},
{
"epoch": 0.19833948339483395,
"grad_norm": 2.2039217203491632,
"learning_rate": 1.3128834355828221e-05,
"loss": 0.9738,
"step": 215
},
{
"epoch": 0.2029520295202952,
"grad_norm": 1.8509141089097243,
"learning_rate": 1.343558282208589e-05,
"loss": 0.9966,
"step": 220
},
{
"epoch": 0.20756457564575645,
"grad_norm": 1.8414622043228284,
"learning_rate": 1.374233128834356e-05,
"loss": 1.0021,
"step": 225
},
{
"epoch": 0.21217712177121772,
"grad_norm": 1.80163089612214,
"learning_rate": 1.4049079754601229e-05,
"loss": 0.9831,
"step": 230
},
{
"epoch": 0.21678966789667897,
"grad_norm": 2.0579916481946983,
"learning_rate": 1.4355828220858897e-05,
"loss": 1.003,
"step": 235
},
{
"epoch": 0.22140221402214022,
"grad_norm": 2.17279455840179,
"learning_rate": 1.4662576687116566e-05,
"loss": 0.9951,
"step": 240
},
{
"epoch": 0.22601476014760147,
"grad_norm": 1.9598270944099394,
"learning_rate": 1.4969325153374235e-05,
"loss": 0.9963,
"step": 245
},
{
"epoch": 0.23062730627306274,
"grad_norm": 1.7398200214510018,
"learning_rate": 1.5276073619631903e-05,
"loss": 0.995,
"step": 250
},
{
"epoch": 0.235239852398524,
"grad_norm": 1.8787859113099807,
"learning_rate": 1.5582822085889574e-05,
"loss": 1.0017,
"step": 255
},
{
"epoch": 0.23985239852398524,
"grad_norm": 1.68690066175367,
"learning_rate": 1.5889570552147238e-05,
"loss": 1.0063,
"step": 260
},
{
"epoch": 0.2444649446494465,
"grad_norm": 1.8827477836410789,
"learning_rate": 1.619631901840491e-05,
"loss": 1.0143,
"step": 265
},
{
"epoch": 0.24907749077490776,
"grad_norm": 1.7349433277324904,
"learning_rate": 1.6503067484662577e-05,
"loss": 1.0152,
"step": 270
},
{
"epoch": 0.253690036900369,
"grad_norm": 1.8180747657522185,
"learning_rate": 1.6809815950920248e-05,
"loss": 1.0022,
"step": 275
},
{
"epoch": 0.25830258302583026,
"grad_norm": 2.2464010609362735,
"learning_rate": 1.7116564417177916e-05,
"loss": 1.0131,
"step": 280
},
{
"epoch": 0.2629151291512915,
"grad_norm": 1.659157693268852,
"learning_rate": 1.7423312883435583e-05,
"loss": 1.0029,
"step": 285
},
{
"epoch": 0.26752767527675275,
"grad_norm": 1.8259012389516431,
"learning_rate": 1.7730061349693254e-05,
"loss": 1.0149,
"step": 290
},
{
"epoch": 0.272140221402214,
"grad_norm": 1.7620006504062913,
"learning_rate": 1.8036809815950922e-05,
"loss": 1.0058,
"step": 295
},
{
"epoch": 0.2767527675276753,
"grad_norm": 1.8231421170705873,
"learning_rate": 1.834355828220859e-05,
"loss": 1.0157,
"step": 300
},
{
"epoch": 0.28136531365313655,
"grad_norm": 1.7640621041852131,
"learning_rate": 1.8650306748466257e-05,
"loss": 0.9917,
"step": 305
},
{
"epoch": 0.2859778597785978,
"grad_norm": 1.6216889282252938,
"learning_rate": 1.8957055214723928e-05,
"loss": 1.0176,
"step": 310
},
{
"epoch": 0.29059040590405905,
"grad_norm": 1.8784106605665698,
"learning_rate": 1.9263803680981596e-05,
"loss": 1.0092,
"step": 315
},
{
"epoch": 0.2952029520295203,
"grad_norm": 1.7812979855381206,
"learning_rate": 1.9570552147239267e-05,
"loss": 1.0122,
"step": 320
},
{
"epoch": 0.29981549815498154,
"grad_norm": 2.7924898972312375,
"learning_rate": 1.9877300613496935e-05,
"loss": 1.0214,
"step": 325
},
{
"epoch": 0.3044280442804428,
"grad_norm": 1.9196461092563228,
"learning_rate": 1.999994812438719e-05,
"loss": 1.037,
"step": 330
},
{
"epoch": 0.30904059040590404,
"grad_norm": 1.9675689878341092,
"learning_rate": 1.9999631108702447e-05,
"loss": 1.0322,
"step": 335
},
{
"epoch": 0.31365313653136534,
"grad_norm": 1.965168599936586,
"learning_rate": 1.999902590624309e-05,
"loss": 1.0217,
"step": 340
},
{
"epoch": 0.3182656826568266,
"grad_norm": 2.0547134862700225,
"learning_rate": 1.9998132534450893e-05,
"loss": 1.0312,
"step": 345
},
{
"epoch": 0.32287822878228783,
"grad_norm": 1.8052849563747033,
"learning_rate": 1.9996951019072605e-05,
"loss": 1.0062,
"step": 350
},
{
"epoch": 0.3274907749077491,
"grad_norm": 1.7264405471248967,
"learning_rate": 1.999548139415919e-05,
"loss": 1.0176,
"step": 355
},
{
"epoch": 0.33210332103321033,
"grad_norm": 3.9478371721075995,
"learning_rate": 1.9993723702064852e-05,
"loss": 1.0241,
"step": 360
},
{
"epoch": 0.3367158671586716,
"grad_norm": 1.8206235491040932,
"learning_rate": 1.9991677993445832e-05,
"loss": 1.0172,
"step": 365
},
{
"epoch": 0.3413284132841328,
"grad_norm": 1.936656750339234,
"learning_rate": 1.998934432725891e-05,
"loss": 1.0395,
"step": 370
},
{
"epoch": 0.3459409594095941,
"grad_norm": 1.719646103111977,
"learning_rate": 1.998672277075975e-05,
"loss": 1.0242,
"step": 375
},
{
"epoch": 0.3505535055350554,
"grad_norm": 1.64985869025976,
"learning_rate": 1.998381339950093e-05,
"loss": 1.0168,
"step": 380
},
{
"epoch": 0.3551660516605166,
"grad_norm": 1.5355798843409723,
"learning_rate": 1.9980616297329764e-05,
"loss": 1.0,
"step": 385
},
{
"epoch": 0.35977859778597787,
"grad_norm": 1.530082699368783,
"learning_rate": 1.997713155638592e-05,
"loss": 1.0086,
"step": 390
},
{
"epoch": 0.3643911439114391,
"grad_norm": 1.803426492693465,
"learning_rate": 1.997335927709872e-05,
"loss": 1.0318,
"step": 395
},
{
"epoch": 0.36900369003690037,
"grad_norm": 1.7841711948275436,
"learning_rate": 1.9969299568184276e-05,
"loss": 1.0162,
"step": 400
},
{
"epoch": 0.36900369003690037,
"eval_loss": 1.032899022102356,
"eval_runtime": 476.7218,
"eval_samples_per_second": 32.199,
"eval_steps_per_second": 0.126,
"step": 400
},
{
"epoch": 0.3736162361623616,
"grad_norm": 1.750763339163792,
"learning_rate": 1.996495254664235e-05,
"loss": 1.0238,
"step": 405
},
{
"epoch": 0.37822878228782286,
"grad_norm": 2.084259407603671,
"learning_rate": 1.996031833775297e-05,
"loss": 1.0144,
"step": 410
},
{
"epoch": 0.3828413284132841,
"grad_norm": 1.581188361061769,
"learning_rate": 1.995539707507284e-05,
"loss": 1.0034,
"step": 415
},
{
"epoch": 0.3874538745387454,
"grad_norm": 1.6642375432029033,
"learning_rate": 1.9950188900431464e-05,
"loss": 1.0452,
"step": 420
},
{
"epoch": 0.39206642066420666,
"grad_norm": 1.90454885480689,
"learning_rate": 1.9944693963927092e-05,
"loss": 1.0156,
"step": 425
},
{
"epoch": 0.3966789667896679,
"grad_norm": 1.9076933400032088,
"learning_rate": 1.9938912423922368e-05,
"loss": 1.0243,
"step": 430
},
{
"epoch": 0.40129151291512916,
"grad_norm": 1.5419487099448481,
"learning_rate": 1.9932844447039775e-05,
"loss": 1.0036,
"step": 435
},
{
"epoch": 0.4059040590405904,
"grad_norm": 1.5469597080761535,
"learning_rate": 1.992649020815683e-05,
"loss": 1.0216,
"step": 440
},
{
"epoch": 0.41051660516605165,
"grad_norm": 1.727534201068121,
"learning_rate": 1.991984989040105e-05,
"loss": 1.023,
"step": 445
},
{
"epoch": 0.4151291512915129,
"grad_norm": 1.5601876251457003,
"learning_rate": 1.9912923685144673e-05,
"loss": 1.0309,
"step": 450
},
{
"epoch": 0.41974169741697415,
"grad_norm": 1.5739530073618044,
"learning_rate": 1.9905711791999135e-05,
"loss": 1.009,
"step": 455
},
{
"epoch": 0.42435424354243545,
"grad_norm": 1.676935183830041,
"learning_rate": 1.989821441880933e-05,
"loss": 1.01,
"step": 460
},
{
"epoch": 0.4289667896678967,
"grad_norm": 1.6489937405447432,
"learning_rate": 1.98904317816476e-05,
"loss": 1.023,
"step": 465
},
{
"epoch": 0.43357933579335795,
"grad_norm": 1.5735733105955712,
"learning_rate": 1.9882364104807536e-05,
"loss": 1.0348,
"step": 470
},
{
"epoch": 0.4381918819188192,
"grad_norm": 1.549624253561804,
"learning_rate": 1.9874011620797494e-05,
"loss": 1.0302,
"step": 475
},
{
"epoch": 0.44280442804428044,
"grad_norm": 1.5401331906305855,
"learning_rate": 1.9865374570333887e-05,
"loss": 1.0217,
"step": 480
},
{
"epoch": 0.4474169741697417,
"grad_norm": 1.5202857812071402,
"learning_rate": 1.9856453202334277e-05,
"loss": 1.0388,
"step": 485
},
{
"epoch": 0.45202952029520294,
"grad_norm": 1.7005961483689318,
"learning_rate": 1.9847247773910176e-05,
"loss": 1.0167,
"step": 490
},
{
"epoch": 0.4566420664206642,
"grad_norm": 1.7121845874238086,
"learning_rate": 1.9837758550359637e-05,
"loss": 1.0041,
"step": 495
},
{
"epoch": 0.4612546125461255,
"grad_norm": 1.6923734480662047,
"learning_rate": 1.9827985805159626e-05,
"loss": 1.0378,
"step": 500
},
{
"epoch": 0.46586715867158673,
"grad_norm": 1.756497601027887,
"learning_rate": 1.981792981995812e-05,
"loss": 1.0148,
"step": 505
},
{
"epoch": 0.470479704797048,
"grad_norm": 1.6043365874775524,
"learning_rate": 1.980759088456601e-05,
"loss": 1.0306,
"step": 510
},
{
"epoch": 0.47509225092250923,
"grad_norm": 1.5593042352189914,
"learning_rate": 1.9796969296948723e-05,
"loss": 1.0384,
"step": 515
},
{
"epoch": 0.4797047970479705,
"grad_norm": 1.8356964322880667,
"learning_rate": 1.978606536321767e-05,
"loss": 1.0277,
"step": 520
},
{
"epoch": 0.4843173431734317,
"grad_norm": 1.5388627493896347,
"learning_rate": 1.9774879397621387e-05,
"loss": 1.0089,
"step": 525
},
{
"epoch": 0.488929889298893,
"grad_norm": 1.5717047981647194,
"learning_rate": 1.9763411722536503e-05,
"loss": 1.0206,
"step": 530
},
{
"epoch": 0.4935424354243542,
"grad_norm": 1.673580231538279,
"learning_rate": 1.9751662668458434e-05,
"loss": 1.0071,
"step": 535
},
{
"epoch": 0.4981549815498155,
"grad_norm": 1.977373456991288,
"learning_rate": 1.9739632573991877e-05,
"loss": 1.0223,
"step": 540
},
{
"epoch": 0.5027675276752768,
"grad_norm": 1.6746529870758962,
"learning_rate": 1.9727321785841028e-05,
"loss": 1.0105,
"step": 545
},
{
"epoch": 0.507380073800738,
"grad_norm": 1.515127525498714,
"learning_rate": 1.9714730658799616e-05,
"loss": 1.0159,
"step": 550
},
{
"epoch": 0.5119926199261993,
"grad_norm": 1.563050922669026,
"learning_rate": 1.9701859555740647e-05,
"loss": 1.026,
"step": 555
},
{
"epoch": 0.5166051660516605,
"grad_norm": 1.635916190051428,
"learning_rate": 1.9688708847605977e-05,
"loss": 1.0148,
"step": 560
},
{
"epoch": 0.5212177121771218,
"grad_norm": 1.6077492732765468,
"learning_rate": 1.9675278913395605e-05,
"loss": 1.0126,
"step": 565
},
{
"epoch": 0.525830258302583,
"grad_norm": 1.6108748420566017,
"learning_rate": 1.9661570140156746e-05,
"loss": 1.0116,
"step": 570
},
{
"epoch": 0.5304428044280443,
"grad_norm": 1.433325441089538,
"learning_rate": 1.9647582922972696e-05,
"loss": 1.012,
"step": 575
},
{
"epoch": 0.5350553505535055,
"grad_norm": 1.5420337235660757,
"learning_rate": 1.9633317664951418e-05,
"loss": 1.0122,
"step": 580
},
{
"epoch": 0.5396678966789668,
"grad_norm": 1.5924684100513768,
"learning_rate": 1.9618774777213954e-05,
"loss": 1.0109,
"step": 585
},
{
"epoch": 0.544280442804428,
"grad_norm": 1.5313594203853436,
"learning_rate": 1.960395467888255e-05,
"loss": 1.0031,
"step": 590
},
{
"epoch": 0.5488929889298892,
"grad_norm": 1.4655266545088188,
"learning_rate": 1.9588857797068602e-05,
"loss": 1.0315,
"step": 595
},
{
"epoch": 0.5535055350553506,
"grad_norm": 1.5909950744220547,
"learning_rate": 1.957348456686032e-05,
"loss": 1.0095,
"step": 600
},
{
"epoch": 0.5535055350553506,
"eval_loss": 1.0302140712738037,
"eval_runtime": 620.3005,
"eval_samples_per_second": 24.746,
"eval_steps_per_second": 0.097,
"step": 600
},
{
"epoch": 0.5581180811808119,
"grad_norm": 1.7854466441111572,
"learning_rate": 1.955783543131022e-05,
"loss": 1.0181,
"step": 605
},
{
"epoch": 0.5627306273062731,
"grad_norm": 1.5564929489350854,
"learning_rate": 1.9541910841422324e-05,
"loss": 1.0259,
"step": 610
},
{
"epoch": 0.5673431734317343,
"grad_norm": 1.4897983626795097,
"learning_rate": 1.952571125613918e-05,
"loss": 1.0108,
"step": 615
},
{
"epoch": 0.5719557195571956,
"grad_norm": 1.6487054053969497,
"learning_rate": 1.9509237142328638e-05,
"loss": 1.0217,
"step": 620
},
{
"epoch": 0.5765682656826568,
"grad_norm": 1.55798491461706,
"learning_rate": 1.949248897477038e-05,
"loss": 1.0095,
"step": 625
},
{
"epoch": 0.5811808118081181,
"grad_norm": 1.5666282016035418,
"learning_rate": 1.9475467236142252e-05,
"loss": 1.0197,
"step": 630
},
{
"epoch": 0.5857933579335793,
"grad_norm": 1.5475535188970362,
"learning_rate": 1.9458172417006347e-05,
"loss": 1.029,
"step": 635
},
{
"epoch": 0.5904059040590406,
"grad_norm": 1.6315436927424156,
"learning_rate": 1.944060501579487e-05,
"loss": 1.0298,
"step": 640
},
{
"epoch": 0.5950184501845018,
"grad_norm": 1.468711297047145,
"learning_rate": 1.9422765538795758e-05,
"loss": 1.0018,
"step": 645
},
{
"epoch": 0.5996309963099631,
"grad_norm": 1.518389031402169,
"learning_rate": 1.9404654500138117e-05,
"loss": 1.0226,
"step": 650
},
{
"epoch": 0.6042435424354243,
"grad_norm": 1.4592808647120747,
"learning_rate": 1.938627242177738e-05,
"loss": 1.0174,
"step": 655
},
{
"epoch": 0.6088560885608856,
"grad_norm": 1.480051570474581,
"learning_rate": 1.936761983348028e-05,
"loss": 1.0063,
"step": 660
},
{
"epoch": 0.6134686346863468,
"grad_norm": 1.5455098945055998,
"learning_rate": 1.9348697272809568e-05,
"loss": 1.0186,
"step": 665
},
{
"epoch": 0.6180811808118081,
"grad_norm": 1.467795849616486,
"learning_rate": 1.9329505285108544e-05,
"loss": 1.0223,
"step": 670
},
{
"epoch": 0.6226937269372693,
"grad_norm": 1.4583083150731897,
"learning_rate": 1.9310044423485303e-05,
"loss": 1.0188,
"step": 675
},
{
"epoch": 0.6273062730627307,
"grad_norm": 1.4520145261143442,
"learning_rate": 1.9290315248796834e-05,
"loss": 1.0148,
"step": 680
},
{
"epoch": 0.6319188191881919,
"grad_norm": 1.545625151246913,
"learning_rate": 1.9270318329632833e-05,
"loss": 1.0124,
"step": 685
},
{
"epoch": 0.6365313653136532,
"grad_norm": 1.5081286119716983,
"learning_rate": 1.925005424229933e-05,
"loss": 1.0122,
"step": 690
},
{
"epoch": 0.6411439114391144,
"grad_norm": 1.5771614507281495,
"learning_rate": 1.922952357080205e-05,
"loss": 1.0304,
"step": 695
},
{
"epoch": 0.6457564575645757,
"grad_norm": 1.3712078687992697,
"learning_rate": 1.9208726906829637e-05,
"loss": 0.9935,
"step": 700
},
{
"epoch": 0.6503690036900369,
"grad_norm": 1.514260367509165,
"learning_rate": 1.9187664849736542e-05,
"loss": 0.9928,
"step": 705
},
{
"epoch": 0.6549815498154982,
"grad_norm": 1.4337714275045377,
"learning_rate": 1.9166338006525786e-05,
"loss": 0.9999,
"step": 710
},
{
"epoch": 0.6595940959409594,
"grad_norm": 1.5474123891067624,
"learning_rate": 1.9144746991831463e-05,
"loss": 1.0136,
"step": 715
},
{
"epoch": 0.6642066420664207,
"grad_norm": 1.6417540187004078,
"learning_rate": 1.9122892427901015e-05,
"loss": 1.0148,
"step": 720
},
{
"epoch": 0.6688191881918819,
"grad_norm": 1.4429113958532114,
"learning_rate": 1.9100774944577303e-05,
"loss": 1.0054,
"step": 725
},
{
"epoch": 0.6734317343173432,
"grad_norm": 1.5435502423447707,
"learning_rate": 1.907839517928046e-05,
"loss": 1.0042,
"step": 730
},
{
"epoch": 0.6780442804428044,
"grad_norm": 1.6291068946061023,
"learning_rate": 1.9055753776989516e-05,
"loss": 1.0095,
"step": 735
},
{
"epoch": 0.6826568265682657,
"grad_norm": 1.5269546110275527,
"learning_rate": 1.903285139022381e-05,
"loss": 1.0091,
"step": 740
},
{
"epoch": 0.6872693726937269,
"grad_norm": 1.6910985232351008,
"learning_rate": 1.900968867902419e-05,
"loss": 1.0105,
"step": 745
},
{
"epoch": 0.6918819188191881,
"grad_norm": 1.473141009294655,
"learning_rate": 1.898626631093399e-05,
"loss": 1.0016,
"step": 750
},
{
"epoch": 0.6964944649446494,
"grad_norm": 1.6512202388953925,
"learning_rate": 1.896258496097977e-05,
"loss": 1.0119,
"step": 755
},
{
"epoch": 0.7011070110701108,
"grad_norm": 1.4887026369918788,
"learning_rate": 1.8938645311651904e-05,
"loss": 1.0087,
"step": 760
},
{
"epoch": 0.705719557195572,
"grad_norm": 1.3843944368722685,
"learning_rate": 1.891444805288487e-05,
"loss": 1.0091,
"step": 765
},
{
"epoch": 0.7103321033210332,
"grad_norm": 1.428126710831411,
"learning_rate": 1.888999388203739e-05,
"loss": 1.0059,
"step": 770
},
{
"epoch": 0.7149446494464945,
"grad_norm": 1.4390802486166878,
"learning_rate": 1.8865283503872325e-05,
"loss": 0.9994,
"step": 775
},
{
"epoch": 0.7195571955719557,
"grad_norm": 1.5690316004271283,
"learning_rate": 1.884031763053636e-05,
"loss": 0.9996,
"step": 780
},
{
"epoch": 0.724169741697417,
"grad_norm": 1.5021545547633912,
"learning_rate": 1.8815096981539494e-05,
"loss": 0.9991,
"step": 785
},
{
"epoch": 0.7287822878228782,
"grad_norm": 1.4900671252584468,
"learning_rate": 1.8789622283734283e-05,
"loss": 1.0101,
"step": 790
},
{
"epoch": 0.7333948339483395,
"grad_norm": 1.630926420050374,
"learning_rate": 1.8763894271294914e-05,
"loss": 0.9929,
"step": 795
},
{
"epoch": 0.7380073800738007,
"grad_norm": 1.3834521402990088,
"learning_rate": 1.873791368569603e-05,
"loss": 0.9857,
"step": 800
},
{
"epoch": 0.7380073800738007,
"eval_loss": 1.0203672647476196,
"eval_runtime": 417.6477,
"eval_samples_per_second": 36.753,
"eval_steps_per_second": 0.144,
"step": 800
},
{
"epoch": 0.742619926199262,
"grad_norm": 1.4145701317152546,
"learning_rate": 1.8711681275691366e-05,
"loss": 1.0197,
"step": 805
},
{
"epoch": 0.7472324723247232,
"grad_norm": 1.6689498503071274,
"learning_rate": 1.868519779729218e-05,
"loss": 1.0399,
"step": 810
},
{
"epoch": 0.7518450184501845,
"grad_norm": 1.5473797400416536,
"learning_rate": 1.8658464013745443e-05,
"loss": 1.0189,
"step": 815
},
{
"epoch": 0.7564575645756457,
"grad_norm": 1.5062087120542231,
"learning_rate": 1.8631480695511866e-05,
"loss": 1.0154,
"step": 820
},
{
"epoch": 0.761070110701107,
"grad_norm": 1.4715819927221123,
"learning_rate": 1.8604248620243682e-05,
"loss": 0.9923,
"step": 825
},
{
"epoch": 0.7656826568265682,
"grad_norm": 1.5582751445765881,
"learning_rate": 1.8576768572762233e-05,
"loss": 1.0035,
"step": 830
},
{
"epoch": 0.7702952029520295,
"grad_norm": 1.5849009070973952,
"learning_rate": 1.8549041345035354e-05,
"loss": 1.013,
"step": 835
},
{
"epoch": 0.7749077490774908,
"grad_norm": 1.53624381630094,
"learning_rate": 1.8521067736154567e-05,
"loss": 1.0212,
"step": 840
},
{
"epoch": 0.7795202952029521,
"grad_norm": 1.5482989884986487,
"learning_rate": 1.8492848552312016e-05,
"loss": 0.9879,
"step": 845
},
{
"epoch": 0.7841328413284133,
"grad_norm": 1.4110912591448512,
"learning_rate": 1.8464384606777258e-05,
"loss": 0.9973,
"step": 850
},
{
"epoch": 0.7887453874538746,
"grad_norm": 1.4605506515813482,
"learning_rate": 1.8435676719873828e-05,
"loss": 1.0007,
"step": 855
},
{
"epoch": 0.7933579335793358,
"grad_norm": 1.4855314358619898,
"learning_rate": 1.8406725718955575e-05,
"loss": 0.9921,
"step": 860
},
{
"epoch": 0.7979704797047971,
"grad_norm": 1.4175285986471877,
"learning_rate": 1.837753243838283e-05,
"loss": 0.9947,
"step": 865
},
{
"epoch": 0.8025830258302583,
"grad_norm": 1.5124089244421277,
"learning_rate": 1.834809771949837e-05,
"loss": 1.0007,
"step": 870
},
{
"epoch": 0.8071955719557196,
"grad_norm": 1.6185245778463926,
"learning_rate": 1.8318422410603162e-05,
"loss": 1.0005,
"step": 875
},
{
"epoch": 0.8118081180811808,
"grad_norm": 1.74728705302942,
"learning_rate": 1.8288507366931907e-05,
"loss": 0.9977,
"step": 880
},
{
"epoch": 0.816420664206642,
"grad_norm": 1.3700788841960891,
"learning_rate": 1.8258353450628402e-05,
"loss": 0.9953,
"step": 885
},
{
"epoch": 0.8210332103321033,
"grad_norm": 1.448030688939384,
"learning_rate": 1.8227961530720696e-05,
"loss": 0.9927,
"step": 890
},
{
"epoch": 0.8256457564575646,
"grad_norm": 1.3860802683248008,
"learning_rate": 1.819733248309604e-05,
"loss": 1.0137,
"step": 895
},
{
"epoch": 0.8302583025830258,
"grad_norm": 1.3807867910302518,
"learning_rate": 1.816646719047563e-05,
"loss": 0.9985,
"step": 900
},
{
"epoch": 0.834870848708487,
"grad_norm": 1.5246405968433066,
"learning_rate": 1.8135366542389202e-05,
"loss": 0.9965,
"step": 905
},
{
"epoch": 0.8394833948339483,
"grad_norm": 1.5298770681063796,
"learning_rate": 1.8104031435149366e-05,
"loss": 0.9895,
"step": 910
},
{
"epoch": 0.8440959409594095,
"grad_norm": 1.5737628456798298,
"learning_rate": 1.807246277182578e-05,
"loss": 1.0016,
"step": 915
},
{
"epoch": 0.8487084870848709,
"grad_norm": 1.3867737980631065,
"learning_rate": 1.8040661462219135e-05,
"loss": 0.9905,
"step": 920
},
{
"epoch": 0.8533210332103321,
"grad_norm": 1.4453648478128216,
"learning_rate": 1.8008628422834923e-05,
"loss": 1.0005,
"step": 925
},
{
"epoch": 0.8579335793357934,
"grad_norm": 1.612991010163291,
"learning_rate": 1.797636457685703e-05,
"loss": 0.9915,
"step": 930
},
{
"epoch": 0.8625461254612546,
"grad_norm": 1.4214188542336526,
"learning_rate": 1.7943870854121126e-05,
"loss": 0.9822,
"step": 935
},
{
"epoch": 0.8671586715867159,
"grad_norm": 1.5619301079989365,
"learning_rate": 1.791114819108788e-05,
"loss": 0.9781,
"step": 940
},
{
"epoch": 0.8717712177121771,
"grad_norm": 1.575000171763216,
"learning_rate": 1.787819753081594e-05,
"loss": 1.0021,
"step": 945
},
{
"epoch": 0.8763837638376384,
"grad_norm": 1.5295894907745344,
"learning_rate": 1.784501982293479e-05,
"loss": 1.0077,
"step": 950
},
{
"epoch": 0.8809963099630996,
"grad_norm": 1.455437508846778,
"learning_rate": 1.781161602361737e-05,
"loss": 0.9757,
"step": 955
},
{
"epoch": 0.8856088560885609,
"grad_norm": 1.4641943856542534,
"learning_rate": 1.7777987095552512e-05,
"loss": 0.9918,
"step": 960
},
{
"epoch": 0.8902214022140221,
"grad_norm": 1.56760438004586,
"learning_rate": 1.7744134007917195e-05,
"loss": 0.9952,
"step": 965
},
{
"epoch": 0.8948339483394834,
"grad_norm": 1.4881030883207977,
"learning_rate": 1.7710057736348622e-05,
"loss": 0.9995,
"step": 970
},
{
"epoch": 0.8994464944649446,
"grad_norm": 1.4138210092484749,
"learning_rate": 1.7675759262916105e-05,
"loss": 0.9814,
"step": 975
},
{
"epoch": 0.9040590405904059,
"grad_norm": 1.5269872502666184,
"learning_rate": 1.764123957609275e-05,
"loss": 0.9969,
"step": 980
},
{
"epoch": 0.9086715867158671,
"grad_norm": 1.4777357727162057,
"learning_rate": 1.7606499670726972e-05,
"loss": 0.9922,
"step": 985
},
{
"epoch": 0.9132841328413284,
"grad_norm": 1.5422388516772692,
"learning_rate": 1.7571540548013836e-05,
"loss": 0.9946,
"step": 990
},
{
"epoch": 0.9178966789667896,
"grad_norm": 1.519307824816888,
"learning_rate": 1.753636321546619e-05,
"loss": 0.9966,
"step": 995
},
{
"epoch": 0.922509225092251,
"grad_norm": 1.4264538885727793,
"learning_rate": 1.7500968686885634e-05,
"loss": 0.9803,
"step": 1000
},
{
"epoch": 0.922509225092251,
"eval_loss": 1.0050979852676392,
"eval_runtime": 475.0904,
"eval_samples_per_second": 32.31,
"eval_steps_per_second": 0.126,
"step": 1000
},
{
"epoch": 0.9271217712177122,
"grad_norm": 1.5178129589503198,
"learning_rate": 1.7465357982333294e-05,
"loss": 0.9965,
"step": 1005
},
{
"epoch": 0.9317343173431735,
"grad_norm": 1.4749449313002256,
"learning_rate": 1.742953212810045e-05,
"loss": 0.998,
"step": 1010
},
{
"epoch": 0.9363468634686347,
"grad_norm": 1.4826220358510274,
"learning_rate": 1.739349215667891e-05,
"loss": 0.9829,
"step": 1015
},
{
"epoch": 0.940959409594096,
"grad_norm": 1.4366615526126456,
"learning_rate": 1.735723910673132e-05,
"loss": 0.9847,
"step": 1020
},
{
"epoch": 0.9455719557195572,
"grad_norm": 1.4260729595159904,
"learning_rate": 1.732077402306116e-05,
"loss": 0.986,
"step": 1025
},
{
"epoch": 0.9501845018450185,
"grad_norm": 1.5039785990003167,
"learning_rate": 1.7284097956582694e-05,
"loss": 0.9745,
"step": 1030
},
{
"epoch": 0.9547970479704797,
"grad_norm": 1.5082306367621992,
"learning_rate": 1.7247211964290635e-05,
"loss": 0.9966,
"step": 1035
},
{
"epoch": 0.959409594095941,
"grad_norm": 1.4449103838652617,
"learning_rate": 1.721011710922972e-05,
"loss": 0.969,
"step": 1040
},
{
"epoch": 0.9640221402214022,
"grad_norm": 1.493303736853594,
"learning_rate": 1.717281446046404e-05,
"loss": 0.9861,
"step": 1045
},
{
"epoch": 0.9686346863468634,
"grad_norm": 1.47859930222641,
"learning_rate": 1.713530509304627e-05,
"loss": 0.9962,
"step": 1050
},
{
"epoch": 0.9732472324723247,
"grad_norm": 1.4442728791265258,
"learning_rate": 1.709759008798663e-05,
"loss": 0.9902,
"step": 1055
},
{
"epoch": 0.977859778597786,
"grad_norm": 1.4376492964532295,
"learning_rate": 1.7059670532221802e-05,
"loss": 0.9831,
"step": 1060
},
{
"epoch": 0.9824723247232472,
"grad_norm": 2.484316021155763,
"learning_rate": 1.7021547518583536e-05,
"loss": 0.9813,
"step": 1065
},
{
"epoch": 0.9870848708487084,
"grad_norm": 1.4464291734707186,
"learning_rate": 1.6983222145767198e-05,
"loss": 0.9902,
"step": 1070
},
{
"epoch": 0.9916974169741697,
"grad_norm": 1.5281863553882298,
"learning_rate": 1.6944695518300087e-05,
"loss": 0.9807,
"step": 1075
},
{
"epoch": 0.996309963099631,
"grad_norm": 1.4201117731708275,
"learning_rate": 1.6905968746509618e-05,
"loss": 0.9746,
"step": 1080
},
{
"epoch": 1.0009225092250922,
"grad_norm": 2.838028209786232,
"learning_rate": 1.6867042946491306e-05,
"loss": 0.9546,
"step": 1085
},
{
"epoch": 1.0055350553505535,
"grad_norm": 2.0637438515793582,
"learning_rate": 1.6827919240076612e-05,
"loss": 0.7562,
"step": 1090
},
{
"epoch": 1.0101476014760147,
"grad_norm": 2.03888870440938,
"learning_rate": 1.6788598754800602e-05,
"loss": 0.7325,
"step": 1095
},
{
"epoch": 1.014760147601476,
"grad_norm": 1.5943747459674837,
"learning_rate": 1.6749082623869465e-05,
"loss": 0.7403,
"step": 1100
},
{
"epoch": 1.0193726937269372,
"grad_norm": 1.705271980777924,
"learning_rate": 1.6709371986127846e-05,
"loss": 0.749,
"step": 1105
},
{
"epoch": 1.0239852398523985,
"grad_norm": 1.7629595253417687,
"learning_rate": 1.6669467986026012e-05,
"loss": 0.7087,
"step": 1110
},
{
"epoch": 1.0285977859778597,
"grad_norm": 1.6633137977760193,
"learning_rate": 1.662937177358691e-05,
"loss": 0.7394,
"step": 1115
},
{
"epoch": 1.033210332103321,
"grad_norm": 1.5074327799371463,
"learning_rate": 1.6589084504372975e-05,
"loss": 0.7164,
"step": 1120
},
{
"epoch": 1.0378228782287824,
"grad_norm": 1.6191564800390028,
"learning_rate": 1.6548607339452853e-05,
"loss": 0.7251,
"step": 1125
},
{
"epoch": 1.0424354243542435,
"grad_norm": 1.6118486061526904,
"learning_rate": 1.6507941445367935e-05,
"loss": 0.7317,
"step": 1130
},
{
"epoch": 1.0470479704797049,
"grad_norm": 1.6655447135885555,
"learning_rate": 1.6467087994098753e-05,
"loss": 0.7439,
"step": 1135
},
{
"epoch": 1.051660516605166,
"grad_norm": 1.8659877314188116,
"learning_rate": 1.6426048163031155e-05,
"loss": 0.7311,
"step": 1140
},
{
"epoch": 1.0562730627306274,
"grad_norm": 1.6691346554473123,
"learning_rate": 1.6384823134922444e-05,
"loss": 0.7304,
"step": 1145
},
{
"epoch": 1.0608856088560885,
"grad_norm": 1.6067875332998411,
"learning_rate": 1.634341409786723e-05,
"loss": 0.7239,
"step": 1150
},
{
"epoch": 1.0654981549815499,
"grad_norm": 1.7410755263145463,
"learning_rate": 1.6301822245263212e-05,
"loss": 0.7339,
"step": 1155
},
{
"epoch": 1.070110701107011,
"grad_norm": 1.5078513231888042,
"learning_rate": 1.6260048775776804e-05,
"loss": 0.7344,
"step": 1160
},
{
"epoch": 1.0747232472324724,
"grad_norm": 1.6250331925877979,
"learning_rate": 1.6218094893308553e-05,
"loss": 0.7418,
"step": 1165
},
{
"epoch": 1.0793357933579335,
"grad_norm": 1.6654181563693542,
"learning_rate": 1.6175961806958476e-05,
"loss": 0.7265,
"step": 1170
},
{
"epoch": 1.0839483394833949,
"grad_norm": 1.7420998771046359,
"learning_rate": 1.6133650730991183e-05,
"loss": 0.723,
"step": 1175
},
{
"epoch": 1.088560885608856,
"grad_norm": 1.6483689819198597,
"learning_rate": 1.609116288480092e-05,
"loss": 0.7316,
"step": 1180
},
{
"epoch": 1.0931734317343174,
"grad_norm": 1.593740794425406,
"learning_rate": 1.6048499492876378e-05,
"loss": 0.7374,
"step": 1185
},
{
"epoch": 1.0977859778597785,
"grad_norm": 1.5370001403119866,
"learning_rate": 1.6005661784765453e-05,
"loss": 0.7457,
"step": 1190
},
{
"epoch": 1.1023985239852399,
"grad_norm": 1.4718068363327683,
"learning_rate": 1.5962650995039783e-05,
"loss": 0.7328,
"step": 1195
},
{
"epoch": 1.1070110701107012,
"grad_norm": 1.5301401714407428,
"learning_rate": 1.5919468363259164e-05,
"loss": 0.736,
"step": 1200
},
{
"epoch": 1.1070110701107012,
"eval_loss": 1.0061343908309937,
"eval_runtime": 439.8508,
"eval_samples_per_second": 34.898,
"eval_steps_per_second": 0.136,
"step": 1200
},
{
"epoch": 1.1116236162361623,
"grad_norm": 1.7064428647610237,
"learning_rate": 1.587611513393585e-05,
"loss": 0.7297,
"step": 1205
},
{
"epoch": 1.1162361623616237,
"grad_norm": 1.6208328161309395,
"learning_rate": 1.5832592556498657e-05,
"loss": 0.7346,
"step": 1210
},
{
"epoch": 1.1208487084870848,
"grad_norm": 1.802694495701501,
"learning_rate": 1.5788901885256983e-05,
"loss": 0.7365,
"step": 1215
},
{
"epoch": 1.1254612546125462,
"grad_norm": 1.5370188196224415,
"learning_rate": 1.5745044379364637e-05,
"loss": 0.7305,
"step": 1220
},
{
"epoch": 1.1300738007380073,
"grad_norm": 1.4889078253244556,
"learning_rate": 1.5701021302783557e-05,
"loss": 0.732,
"step": 1225
},
{
"epoch": 1.1346863468634687,
"grad_norm": 1.6294276521184954,
"learning_rate": 1.56568339242474e-05,
"loss": 0.7276,
"step": 1230
},
{
"epoch": 1.1392988929889298,
"grad_norm": 1.5397859025285652,
"learning_rate": 1.5612483517224942e-05,
"loss": 0.7354,
"step": 1235
},
{
"epoch": 1.1439114391143912,
"grad_norm": 1.4953485038562,
"learning_rate": 1.556797135988342e-05,
"loss": 0.7173,
"step": 1240
},
{
"epoch": 1.1485239852398523,
"grad_norm": 1.853185392904802,
"learning_rate": 1.5523298735051657e-05,
"loss": 0.7489,
"step": 1245
},
{
"epoch": 1.1531365313653137,
"grad_norm": 1.5704541475489389,
"learning_rate": 1.5478466930183107e-05,
"loss": 0.7191,
"step": 1250
},
{
"epoch": 1.1577490774907748,
"grad_norm": 1.5415559777438193,
"learning_rate": 1.5433477237318765e-05,
"loss": 0.7327,
"step": 1255
},
{
"epoch": 1.1623616236162362,
"grad_norm": 1.6666092802375732,
"learning_rate": 1.5388330953049907e-05,
"loss": 0.7473,
"step": 1260
},
{
"epoch": 1.1669741697416973,
"grad_norm": 1.8791358127374613,
"learning_rate": 1.5343029378480733e-05,
"loss": 0.7312,
"step": 1265
},
{
"epoch": 1.1715867158671587,
"grad_norm": 1.5841817509277247,
"learning_rate": 1.5297573819190873e-05,
"loss": 0.7416,
"step": 1270
},
{
"epoch": 1.17619926199262,
"grad_norm": 1.5715763468516226,
"learning_rate": 1.5251965585197748e-05,
"loss": 0.7307,
"step": 1275
},
{
"epoch": 1.1808118081180812,
"grad_norm": 1.5391235781858166,
"learning_rate": 1.5206205990918836e-05,
"loss": 0.7212,
"step": 1280
},
{
"epoch": 1.1854243542435423,
"grad_norm": 1.5629140884896369,
"learning_rate": 1.5160296355133773e-05,
"loss": 0.7312,
"step": 1285
},
{
"epoch": 1.1900369003690037,
"grad_norm": 5.396412006514146,
"learning_rate": 1.5114238000946353e-05,
"loss": 0.7141,
"step": 1290
},
{
"epoch": 1.194649446494465,
"grad_norm": 1.5426421143956044,
"learning_rate": 1.50680322557464e-05,
"loss": 0.7308,
"step": 1295
},
{
"epoch": 1.1992619926199262,
"grad_norm": 1.6701861441853627,
"learning_rate": 1.5021680451171499e-05,
"loss": 0.7415,
"step": 1300
},
{
"epoch": 1.2038745387453875,
"grad_norm": 1.5779190034035813,
"learning_rate": 1.4975183923068637e-05,
"loss": 0.7302,
"step": 1305
},
{
"epoch": 1.2084870848708487,
"grad_norm": 1.619587352430737,
"learning_rate": 1.492854401145569e-05,
"loss": 0.7318,
"step": 1310
},
{
"epoch": 1.21309963099631,
"grad_norm": 1.619976635054261,
"learning_rate": 1.4881762060482814e-05,
"loss": 0.7254,
"step": 1315
},
{
"epoch": 1.2177121771217712,
"grad_norm": 1.5945364409345257,
"learning_rate": 1.48348394183937e-05,
"loss": 0.7402,
"step": 1320
},
{
"epoch": 1.2223247232472325,
"grad_norm": 1.5229514773361725,
"learning_rate": 1.4787777437486723e-05,
"loss": 0.7367,
"step": 1325
},
{
"epoch": 1.2269372693726937,
"grad_norm": 2.0971835619796932,
"learning_rate": 1.4740577474075963e-05,
"loss": 0.7416,
"step": 1330
},
{
"epoch": 1.231549815498155,
"grad_norm": 1.5602315222575482,
"learning_rate": 1.4693240888452121e-05,
"loss": 0.7375,
"step": 1335
},
{
"epoch": 1.2361623616236161,
"grad_norm": 2.090838147725305,
"learning_rate": 1.4645769044843318e-05,
"loss": 0.7375,
"step": 1340
},
{
"epoch": 1.2407749077490775,
"grad_norm": 1.631256144537058,
"learning_rate": 1.459816331137577e-05,
"loss": 0.7463,
"step": 1345
},
{
"epoch": 1.2453874538745389,
"grad_norm": 1.6259793781417131,
"learning_rate": 1.4550425060034367e-05,
"loss": 0.7237,
"step": 1350
},
{
"epoch": 1.25,
"grad_norm": 1.5839497005284708,
"learning_rate": 1.450255566662313e-05,
"loss": 0.7267,
"step": 1355
},
{
"epoch": 1.2546125461254611,
"grad_norm": 1.4817515646858677,
"learning_rate": 1.4454556510725556e-05,
"loss": 0.7384,
"step": 1360
},
{
"epoch": 1.2592250922509225,
"grad_norm": 1.619889890472081,
"learning_rate": 1.4406428975664875e-05,
"loss": 0.7445,
"step": 1365
},
{
"epoch": 1.2638376383763839,
"grad_norm": 1.5742411445585174,
"learning_rate": 1.4358174448464155e-05,
"loss": 0.731,
"step": 1370
},
{
"epoch": 1.268450184501845,
"grad_norm": 1.5691396578757213,
"learning_rate": 1.4309794319806356e-05,
"loss": 0.7445,
"step": 1375
},
{
"epoch": 1.2730627306273063,
"grad_norm": 1.5149875801425627,
"learning_rate": 1.4261289983994236e-05,
"loss": 0.7265,
"step": 1380
},
{
"epoch": 1.2776752767527675,
"grad_norm": 1.5087047437199383,
"learning_rate": 1.421266283891017e-05,
"loss": 0.7456,
"step": 1385
},
{
"epoch": 1.2822878228782288,
"grad_norm": 1.5718051035987606,
"learning_rate": 1.4163914285975863e-05,
"loss": 0.7212,
"step": 1390
},
{
"epoch": 1.28690036900369,
"grad_norm": 1.5655345894552062,
"learning_rate": 1.411504573011197e-05,
"loss": 0.7112,
"step": 1395
},
{
"epoch": 1.2915129151291513,
"grad_norm": 1.6127771597600427,
"learning_rate": 1.4066058579697593e-05,
"loss": 0.7249,
"step": 1400
},
{
"epoch": 1.2915129151291513,
"eval_loss": 1.000433087348938,
"eval_runtime": 377.6786,
"eval_samples_per_second": 40.643,
"eval_steps_per_second": 0.159,
"step": 1400
},
{
"epoch": 1.2961254612546125,
"grad_norm": 1.5789241025710268,
"learning_rate": 1.4016954246529697e-05,
"loss": 0.7284,
"step": 1405
},
{
"epoch": 1.3007380073800738,
"grad_norm": 1.6370566010616505,
"learning_rate": 1.3967734145782425e-05,
"loss": 0.7233,
"step": 1410
},
{
"epoch": 1.305350553505535,
"grad_norm": 1.6019988849536153,
"learning_rate": 1.391839969596632e-05,
"loss": 0.7305,
"step": 1415
},
{
"epoch": 1.3099630996309963,
"grad_norm": 1.5321404428187482,
"learning_rate": 1.3868952318887421e-05,
"loss": 0.7161,
"step": 1420
},
{
"epoch": 1.3145756457564577,
"grad_norm": 1.617468685697329,
"learning_rate": 1.3819393439606313e-05,
"loss": 0.7383,
"step": 1425
},
{
"epoch": 1.3191881918819188,
"grad_norm": 1.4942275323967702,
"learning_rate": 1.3769724486397035e-05,
"loss": 0.7309,
"step": 1430
},
{
"epoch": 1.32380073800738,
"grad_norm": 1.526992745554204,
"learning_rate": 1.371994689070594e-05,
"loss": 0.7241,
"step": 1435
},
{
"epoch": 1.3284132841328413,
"grad_norm": 1.5223415766171715,
"learning_rate": 1.3670062087110423e-05,
"loss": 0.7369,
"step": 1440
},
{
"epoch": 1.3330258302583027,
"grad_norm": 1.5349271200758632,
"learning_rate": 1.362007151327758e-05,
"loss": 0.7408,
"step": 1445
},
{
"epoch": 1.3376383763837638,
"grad_norm": 1.5487172960940725,
"learning_rate": 1.3569976609922785e-05,
"loss": 0.7366,
"step": 1450
},
{
"epoch": 1.3422509225092252,
"grad_norm": 1.4925683710432864,
"learning_rate": 1.3519778820768157e-05,
"loss": 0.7316,
"step": 1455
},
{
"epoch": 1.3468634686346863,
"grad_norm": 1.508872342522196,
"learning_rate": 1.3469479592500954e-05,
"loss": 0.7282,
"step": 1460
},
{
"epoch": 1.3514760147601477,
"grad_norm": 1.5205565120558908,
"learning_rate": 1.3419080374731889e-05,
"loss": 0.7361,
"step": 1465
},
{
"epoch": 1.3560885608856088,
"grad_norm": 1.5925418525444446,
"learning_rate": 1.3368582619953348e-05,
"loss": 0.7314,
"step": 1470
},
{
"epoch": 1.3607011070110702,
"grad_norm": 1.536022556446867,
"learning_rate": 1.331798778349752e-05,
"loss": 0.7297,
"step": 1475
},
{
"epoch": 1.3653136531365313,
"grad_norm": 1.6415269938571113,
"learning_rate": 1.326729732349447e-05,
"loss": 0.7236,
"step": 1480
},
{
"epoch": 1.3699261992619927,
"grad_norm": 1.5571914542665708,
"learning_rate": 1.3216512700830104e-05,
"loss": 0.7456,
"step": 1485
},
{
"epoch": 1.3745387453874538,
"grad_norm": 1.537220055899943,
"learning_rate": 1.3165635379104079e-05,
"loss": 0.7283,
"step": 1490
},
{
"epoch": 1.3791512915129152,
"grad_norm": 1.586303040039408,
"learning_rate": 1.31146668245876e-05,
"loss": 0.74,
"step": 1495
},
{
"epoch": 1.3837638376383765,
"grad_norm": 1.599460902002627,
"learning_rate": 1.3063608506181189e-05,
"loss": 0.7269,
"step": 1500
},
{
"epoch": 1.3883763837638377,
"grad_norm": 1.5845763646878845,
"learning_rate": 1.3012461895372343e-05,
"loss": 0.7207,
"step": 1505
},
{
"epoch": 1.3929889298892988,
"grad_norm": 1.6101936272120416,
"learning_rate": 1.2961228466193116e-05,
"loss": 0.7491,
"step": 1510
},
{
"epoch": 1.3976014760147601,
"grad_norm": 1.5328730912860027,
"learning_rate": 1.2909909695177647e-05,
"loss": 0.7428,
"step": 1515
},
{
"epoch": 1.4022140221402215,
"grad_norm": 1.4700749106652151,
"learning_rate": 1.28585070613196e-05,
"loss": 0.7337,
"step": 1520
},
{
"epoch": 1.4068265682656826,
"grad_norm": 1.595108354129352,
"learning_rate": 1.2807022046029556e-05,
"loss": 0.7476,
"step": 1525
},
{
"epoch": 1.4114391143911438,
"grad_norm": 1.4846465873330463,
"learning_rate": 1.2755456133092295e-05,
"loss": 0.7471,
"step": 1530
},
{
"epoch": 1.4160516605166051,
"grad_norm": 1.6404654691954998,
"learning_rate": 1.2703810808624051e-05,
"loss": 0.7338,
"step": 1535
},
{
"epoch": 1.4206642066420665,
"grad_norm": 1.5924740161540634,
"learning_rate": 1.2652087561029682e-05,
"loss": 0.7349,
"step": 1540
},
{
"epoch": 1.4252767527675276,
"grad_norm": 1.4942826903716253,
"learning_rate": 1.2600287880959762e-05,
"loss": 0.725,
"step": 1545
},
{
"epoch": 1.429889298892989,
"grad_norm": 1.5398233388725502,
"learning_rate": 1.254841326126764e-05,
"loss": 0.7376,
"step": 1550
},
{
"epoch": 1.4345018450184501,
"grad_norm": 1.5654070863650702,
"learning_rate": 1.2496465196966393e-05,
"loss": 0.7318,
"step": 1555
},
{
"epoch": 1.4391143911439115,
"grad_norm": 1.5898873014454018,
"learning_rate": 1.2444445185185763e-05,
"loss": 0.7306,
"step": 1560
},
{
"epoch": 1.4437269372693726,
"grad_norm": 1.5102337179405925,
"learning_rate": 1.239235472512899e-05,
"loss": 0.7057,
"step": 1565
},
{
"epoch": 1.448339483394834,
"grad_norm": 1.5134306661913561,
"learning_rate": 1.2340195318029623e-05,
"loss": 0.7216,
"step": 1570
},
{
"epoch": 1.4529520295202953,
"grad_norm": 1.4927818706889626,
"learning_rate": 1.228796846710825e-05,
"loss": 0.7402,
"step": 1575
},
{
"epoch": 1.4575645756457565,
"grad_norm": 1.455448272385386,
"learning_rate": 1.2235675677529158e-05,
"loss": 0.7172,
"step": 1580
},
{
"epoch": 1.4621771217712176,
"grad_norm": 1.5634881056548133,
"learning_rate": 1.2183318456356984e-05,
"loss": 0.7389,
"step": 1585
},
{
"epoch": 1.466789667896679,
"grad_norm": 1.4898756090326564,
"learning_rate": 1.2130898312513255e-05,
"loss": 0.7378,
"step": 1590
},
{
"epoch": 1.4714022140221403,
"grad_norm": 1.5009726152389429,
"learning_rate": 1.2078416756732925e-05,
"loss": 0.7235,
"step": 1595
},
{
"epoch": 1.4760147601476015,
"grad_norm": 1.5101924554242023,
"learning_rate": 1.2025875301520811e-05,
"loss": 0.7355,
"step": 1600
},
{
"epoch": 1.4760147601476015,
"eval_loss": 0.9855098724365234,
"eval_runtime": 380.6287,
"eval_samples_per_second": 40.328,
"eval_steps_per_second": 0.158,
"step": 1600
},
{
"epoch": 1.4806273062730626,
"grad_norm": 1.4634212318812496,
"learning_rate": 1.1973275461108027e-05,
"loss": 0.7252,
"step": 1605
},
{
"epoch": 1.485239852398524,
"grad_norm": 1.6489180454948977,
"learning_rate": 1.1920618751408328e-05,
"loss": 0.7196,
"step": 1610
},
{
"epoch": 1.4898523985239853,
"grad_norm": 1.5637692459131638,
"learning_rate": 1.186790668997443e-05,
"loss": 0.7292,
"step": 1615
},
{
"epoch": 1.4944649446494465,
"grad_norm": 1.5196144325218592,
"learning_rate": 1.1815140795954268e-05,
"loss": 0.7317,
"step": 1620
},
{
"epoch": 1.4990774907749078,
"grad_norm": 1.5589118343089332,
"learning_rate": 1.176232259004722e-05,
"loss": 0.7282,
"step": 1625
},
{
"epoch": 1.503690036900369,
"grad_norm": 1.4776701136160202,
"learning_rate": 1.1709453594460279e-05,
"loss": 0.7142,
"step": 1630
},
{
"epoch": 1.5083025830258303,
"grad_norm": 1.5113121821467062,
"learning_rate": 1.165653533286418e-05,
"loss": 0.7267,
"step": 1635
},
{
"epoch": 1.5129151291512914,
"grad_norm": 1.5621038457306815,
"learning_rate": 1.1603569330349502e-05,
"loss": 0.7194,
"step": 1640
},
{
"epoch": 1.5175276752767528,
"grad_norm": 1.4342503333735157,
"learning_rate": 1.1550557113382697e-05,
"loss": 0.732,
"step": 1645
},
{
"epoch": 1.5221402214022142,
"grad_norm": 1.4925030679944211,
"learning_rate": 1.1497500209762102e-05,
"loss": 0.7311,
"step": 1650
},
{
"epoch": 1.5267527675276753,
"grad_norm": 1.5884492115586761,
"learning_rate": 1.1444400148573918e-05,
"loss": 0.7306,
"step": 1655
},
{
"epoch": 1.5313653136531364,
"grad_norm": 1.4907133011402547,
"learning_rate": 1.1391258460148135e-05,
"loss": 0.7291,
"step": 1660
},
{
"epoch": 1.5359778597785978,
"grad_norm": 1.5177536049645275,
"learning_rate": 1.1338076676014427e-05,
"loss": 0.7243,
"step": 1665
},
{
"epoch": 1.5405904059040592,
"grad_norm": 1.5089626714379156,
"learning_rate": 1.1284856328858017e-05,
"loss": 0.7174,
"step": 1670
},
{
"epoch": 1.5452029520295203,
"grad_norm": 1.518407729563154,
"learning_rate": 1.1231598952475504e-05,
"loss": 0.7188,
"step": 1675
},
{
"epoch": 1.5498154981549814,
"grad_norm": 2.2443040427872973,
"learning_rate": 1.1178306081730666e-05,
"loss": 0.7274,
"step": 1680
},
{
"epoch": 1.5544280442804428,
"grad_norm": 1.557724202719455,
"learning_rate": 1.1124979252510209e-05,
"loss": 0.7306,
"step": 1685
},
{
"epoch": 1.5590405904059041,
"grad_norm": 1.5461146750078991,
"learning_rate": 1.1071620001679514e-05,
"loss": 0.7265,
"step": 1690
},
{
"epoch": 1.5636531365313653,
"grad_norm": 1.677798283188097,
"learning_rate": 1.1018229867038358e-05,
"loss": 0.7296,
"step": 1695
},
{
"epoch": 1.5682656826568264,
"grad_norm": 1.5842640863093371,
"learning_rate": 1.0964810387276561e-05,
"loss": 0.7136,
"step": 1700
},
{
"epoch": 1.5728782287822878,
"grad_norm": 1.563395389439852,
"learning_rate": 1.0911363101929677e-05,
"loss": 0.7244,
"step": 1705
},
{
"epoch": 1.5774907749077491,
"grad_norm": 1.5223804974728257,
"learning_rate": 1.085788955133461e-05,
"loss": 0.7263,
"step": 1710
},
{
"epoch": 1.5821033210332103,
"grad_norm": 1.4519511890413386,
"learning_rate": 1.080439127658521e-05,
"loss": 0.7125,
"step": 1715
},
{
"epoch": 1.5867158671586716,
"grad_norm": 1.522533945377353,
"learning_rate": 1.0750869819487884e-05,
"loss": 0.7273,
"step": 1720
},
{
"epoch": 1.591328413284133,
"grad_norm": 1.5254481413189622,
"learning_rate": 1.0697326722517137e-05,
"loss": 0.7278,
"step": 1725
},
{
"epoch": 1.5959409594095941,
"grad_norm": 1.5231671586261868,
"learning_rate": 1.0643763528771136e-05,
"loss": 0.7395,
"step": 1730
},
{
"epoch": 1.6005535055350553,
"grad_norm": 1.4650142883805686,
"learning_rate": 1.0590181781927229e-05,
"loss": 0.7349,
"step": 1735
},
{
"epoch": 1.6051660516605166,
"grad_norm": 1.4963676540506488,
"learning_rate": 1.0536583026197462e-05,
"loss": 0.7227,
"step": 1740
},
{
"epoch": 1.609778597785978,
"grad_norm": 1.568382581129352,
"learning_rate": 1.0482968806284073e-05,
"loss": 0.7104,
"step": 1745
},
{
"epoch": 1.6143911439114391,
"grad_norm": 1.4263377420776584,
"learning_rate": 1.042934066733497e-05,
"loss": 0.7295,
"step": 1750
},
{
"epoch": 1.6190036900369003,
"grad_norm": 1.557321427633267,
"learning_rate": 1.0375700154899208e-05,
"loss": 0.7221,
"step": 1755
},
{
"epoch": 1.6236162361623616,
"grad_norm": 1.4697183106878222,
"learning_rate": 1.0322048814882438e-05,
"loss": 0.7137,
"step": 1760
},
{
"epoch": 1.628228782287823,
"grad_norm": 1.5186029224528301,
"learning_rate": 1.0268388193502365e-05,
"loss": 0.7064,
"step": 1765
},
{
"epoch": 1.632841328413284,
"grad_norm": 1.4447029146830694,
"learning_rate": 1.0214719837244176e-05,
"loss": 0.7288,
"step": 1770
},
{
"epoch": 1.6374538745387452,
"grad_norm": 1.6070031997265373,
"learning_rate": 1.0161045292815974e-05,
"loss": 0.707,
"step": 1775
},
{
"epoch": 1.6420664206642066,
"grad_norm": 1.426331730931973,
"learning_rate": 1.010736610710421e-05,
"loss": 0.709,
"step": 1780
},
{
"epoch": 1.646678966789668,
"grad_norm": 1.452095892694617,
"learning_rate": 1.0053683827129091e-05,
"loss": 0.7121,
"step": 1785
},
{
"epoch": 1.651291512915129,
"grad_norm": 1.57091551946505,
"learning_rate": 1e-05,
"loss": 0.7134,
"step": 1790
},
{
"epoch": 1.6559040590405905,
"grad_norm": 1.5750336048966571,
"learning_rate": 9.946316172870909e-06,
"loss": 0.7136,
"step": 1795
},
{
"epoch": 1.6605166051660518,
"grad_norm": 1.4906719471502183,
"learning_rate": 9.892633892895795e-06,
"loss": 0.7151,
"step": 1800
},
{
"epoch": 1.6605166051660518,
"eval_loss": 0.97125643491745,
"eval_runtime": 375.0586,
"eval_samples_per_second": 40.927,
"eval_steps_per_second": 0.16,
"step": 1800
},
{
"epoch": 1.665129151291513,
"grad_norm": 1.5072106225983386,
"learning_rate": 9.83895470718403e-06,
"loss": 0.7227,
"step": 1805
},
{
"epoch": 1.669741697416974,
"grad_norm": 1.4508206944932882,
"learning_rate": 9.785280162755825e-06,
"loss": 0.724,
"step": 1810
},
{
"epoch": 1.6743542435424354,
"grad_norm": 1.5498476457151749,
"learning_rate": 9.731611806497637e-06,
"loss": 0.7026,
"step": 1815
},
{
"epoch": 1.6789667896678968,
"grad_norm": 1.5177452024002571,
"learning_rate": 9.677951185117565e-06,
"loss": 0.7129,
"step": 1820
},
{
"epoch": 1.683579335793358,
"grad_norm": 1.7536225872991078,
"learning_rate": 9.624299845100795e-06,
"loss": 0.7157,
"step": 1825
},
{
"epoch": 1.688191881918819,
"grad_norm": 1.5448154946820392,
"learning_rate": 9.570659332665032e-06,
"loss": 0.7029,
"step": 1830
},
{
"epoch": 1.6928044280442804,
"grad_norm": 1.5229582975841924,
"learning_rate": 9.51703119371593e-06,
"loss": 0.7231,
"step": 1835
},
{
"epoch": 1.6974169741697418,
"grad_norm": 1.401053250210059,
"learning_rate": 9.463416973802541e-06,
"loss": 0.6987,
"step": 1840
},
{
"epoch": 1.702029520295203,
"grad_norm": 1.486158083271756,
"learning_rate": 9.409818218072774e-06,
"loss": 0.7187,
"step": 1845
},
{
"epoch": 1.706642066420664,
"grad_norm": 1.4844923332820112,
"learning_rate": 9.35623647122887e-06,
"loss": 0.7038,
"step": 1850
},
{
"epoch": 1.7112546125461254,
"grad_norm": 1.4768083176878029,
"learning_rate": 9.302673277482867e-06,
"loss": 0.7156,
"step": 1855
},
{
"epoch": 1.7158671586715868,
"grad_norm": 1.4690802202313877,
"learning_rate": 9.249130180512118e-06,
"loss": 0.7007,
"step": 1860
},
{
"epoch": 1.720479704797048,
"grad_norm": 1.521642869722996,
"learning_rate": 9.19560872341479e-06,
"loss": 0.7124,
"step": 1865
},
{
"epoch": 1.725092250922509,
"grad_norm": 1.4844935676770985,
"learning_rate": 9.142110448665394e-06,
"loss": 0.7137,
"step": 1870
},
{
"epoch": 1.7297047970479706,
"grad_norm": 1.4802119797626268,
"learning_rate": 9.088636898070326e-06,
"loss": 0.7142,
"step": 1875
},
{
"epoch": 1.7343173431734318,
"grad_norm": 1.4925162048619054,
"learning_rate": 9.035189612723444e-06,
"loss": 0.7128,
"step": 1880
},
{
"epoch": 1.738929889298893,
"grad_norm": 1.5072960944547247,
"learning_rate": 8.981770132961649e-06,
"loss": 0.7,
"step": 1885
},
{
"epoch": 1.7435424354243543,
"grad_norm": 1.463889514395925,
"learning_rate": 8.928379998320489e-06,
"loss": 0.7057,
"step": 1890
},
{
"epoch": 1.7481549815498156,
"grad_norm": 1.6039296690992704,
"learning_rate": 8.875020747489795e-06,
"loss": 0.7233,
"step": 1895
},
{
"epoch": 1.7527675276752768,
"grad_norm": 1.5035319064665877,
"learning_rate": 8.821693918269334e-06,
"loss": 0.7049,
"step": 1900
},
{
"epoch": 1.757380073800738,
"grad_norm": 1.4420897442537834,
"learning_rate": 8.768401047524498e-06,
"loss": 0.7097,
"step": 1905
},
{
"epoch": 1.7619926199261993,
"grad_norm": 1.4863140306951776,
"learning_rate": 8.715143671141985e-06,
"loss": 0.7131,
"step": 1910
},
{
"epoch": 1.7666051660516606,
"grad_norm": 1.4372944065533075,
"learning_rate": 8.661923323985576e-06,
"loss": 0.7066,
"step": 1915
},
{
"epoch": 1.7712177121771218,
"grad_norm": 1.5118603952311795,
"learning_rate": 8.60874153985187e-06,
"loss": 0.711,
"step": 1920
},
{
"epoch": 1.775830258302583,
"grad_norm": 1.4309158451109616,
"learning_rate": 8.555599851426086e-06,
"loss": 0.7017,
"step": 1925
},
{
"epoch": 1.7804428044280443,
"grad_norm": 1.4482092213054845,
"learning_rate": 8.5024997902379e-06,
"loss": 0.7043,
"step": 1930
},
{
"epoch": 1.7850553505535056,
"grad_norm": 1.4820015255772456,
"learning_rate": 8.449442886617308e-06,
"loss": 0.7134,
"step": 1935
},
{
"epoch": 1.7896678966789668,
"grad_norm": 1.5178706892202136,
"learning_rate": 8.396430669650501e-06,
"loss": 0.6986,
"step": 1940
},
{
"epoch": 1.7942804428044279,
"grad_norm": 1.5192714047507399,
"learning_rate": 8.343464667135821e-06,
"loss": 0.7098,
"step": 1945
},
{
"epoch": 1.7988929889298892,
"grad_norm": 1.5584757005755172,
"learning_rate": 8.290546405539726e-06,
"loss": 0.7007,
"step": 1950
},
{
"epoch": 1.8035055350553506,
"grad_norm": 1.4555072659251027,
"learning_rate": 8.237677409952784e-06,
"loss": 0.7069,
"step": 1955
},
{
"epoch": 1.8081180811808117,
"grad_norm": 1.435117018138001,
"learning_rate": 8.184859204045736e-06,
"loss": 0.7126,
"step": 1960
},
{
"epoch": 1.812730627306273,
"grad_norm": 1.497994099179169,
"learning_rate": 8.132093310025572e-06,
"loss": 0.6918,
"step": 1965
},
{
"epoch": 1.8173431734317345,
"grad_norm": 1.5231919974046568,
"learning_rate": 8.079381248591675e-06,
"loss": 0.6999,
"step": 1970
},
{
"epoch": 1.8219557195571956,
"grad_norm": 1.416052373581155,
"learning_rate": 8.026724538891976e-06,
"loss": 0.7007,
"step": 1975
},
{
"epoch": 1.8265682656826567,
"grad_norm": 1.4579047643243503,
"learning_rate": 7.974124698479192e-06,
"loss": 0.6987,
"step": 1980
},
{
"epoch": 1.831180811808118,
"grad_norm": 1.4768646334937987,
"learning_rate": 7.921583243267079e-06,
"loss": 0.721,
"step": 1985
},
{
"epoch": 1.8357933579335795,
"grad_norm": 1.4193207071096974,
"learning_rate": 7.869101687486748e-06,
"loss": 0.6998,
"step": 1990
},
{
"epoch": 1.8404059040590406,
"grad_norm": 1.5195143082878666,
"learning_rate": 7.816681543643019e-06,
"loss": 0.7035,
"step": 1995
},
{
"epoch": 1.8450184501845017,
"grad_norm": 1.492818689804546,
"learning_rate": 7.764324322470842e-06,
"loss": 0.7023,
"step": 2000
},
{
"epoch": 1.8450184501845017,
"eval_loss": 0.9556826949119568,
"eval_runtime": 438.5581,
"eval_samples_per_second": 35.001,
"eval_steps_per_second": 0.137,
"step": 2000
},
{
"epoch": 1.849630996309963,
"grad_norm": 1.4550020776516512,
"learning_rate": 7.712031532891754e-06,
"loss": 0.6959,
"step": 2005
},
{
"epoch": 1.8542435424354244,
"grad_norm": 1.532357943989181,
"learning_rate": 7.659804681970378e-06,
"loss": 0.716,
"step": 2010
},
{
"epoch": 1.8588560885608856,
"grad_norm": 1.4932493146047923,
"learning_rate": 7.607645274871013e-06,
"loss": 0.7103,
"step": 2015
},
{
"epoch": 1.8634686346863467,
"grad_norm": 1.3849647891235997,
"learning_rate": 7.555554814814243e-06,
"loss": 0.7091,
"step": 2020
},
{
"epoch": 1.868081180811808,
"grad_norm": 1.4686606572709258,
"learning_rate": 7.50353480303361e-06,
"loss": 0.7065,
"step": 2025
},
{
"epoch": 1.8726937269372694,
"grad_norm": 1.4890927562018215,
"learning_rate": 7.451586738732362e-06,
"loss": 0.7045,
"step": 2030
},
{
"epoch": 1.8773062730627306,
"grad_norm": 1.4388080292308094,
"learning_rate": 7.3997121190402375e-06,
"loss": 0.7062,
"step": 2035
},
{
"epoch": 1.881918819188192,
"grad_norm": 1.595891571353621,
"learning_rate": 7.347912438970324e-06,
"loss": 0.693,
"step": 2040
},
{
"epoch": 1.8865313653136533,
"grad_norm": 1.4244467986676035,
"learning_rate": 7.296189191375953e-06,
"loss": 0.6941,
"step": 2045
},
{
"epoch": 1.8911439114391144,
"grad_norm": 1.4475135191897706,
"learning_rate": 7.24454386690771e-06,
"loss": 0.7073,
"step": 2050
},
{
"epoch": 1.8957564575645756,
"grad_norm": 1.4673852319078244,
"learning_rate": 7.192977953970448e-06,
"loss": 0.7078,
"step": 2055
},
{
"epoch": 1.900369003690037,
"grad_norm": 1.4719457932619668,
"learning_rate": 7.141492938680401e-06,
"loss": 0.691,
"step": 2060
},
{
"epoch": 1.9049815498154983,
"grad_norm": 1.466731728037535,
"learning_rate": 7.090090304822356e-06,
"loss": 0.7062,
"step": 2065
},
{
"epoch": 1.9095940959409594,
"grad_norm": 1.4553947793369755,
"learning_rate": 7.038771533806884e-06,
"loss": 0.7106,
"step": 2070
},
{
"epoch": 1.9142066420664205,
"grad_norm": 1.478652231013823,
"learning_rate": 6.9875381046276605e-06,
"loss": 0.6931,
"step": 2075
},
{
"epoch": 1.918819188191882,
"grad_norm": 1.4356929483984957,
"learning_rate": 6.936391493818814e-06,
"loss": 0.6898,
"step": 2080
},
{
"epoch": 1.9234317343173433,
"grad_norm": 1.5536671032832632,
"learning_rate": 6.885333175412406e-06,
"loss": 0.6928,
"step": 2085
},
{
"epoch": 1.9280442804428044,
"grad_norm": 1.4991276022393414,
"learning_rate": 6.834364620895928e-06,
"loss": 0.6935,
"step": 2090
},
{
"epoch": 1.9326568265682655,
"grad_norm": 1.5046326188308679,
"learning_rate": 6.783487299169897e-06,
"loss": 0.6983,
"step": 2095
},
{
"epoch": 1.937269372693727,
"grad_norm": 1.4351756608326394,
"learning_rate": 6.732702676505531e-06,
"loss": 0.7065,
"step": 2100
},
{
"epoch": 1.9418819188191883,
"grad_norm": 1.5547998479904102,
"learning_rate": 6.6820122165024845e-06,
"loss": 0.6879,
"step": 2105
},
{
"epoch": 1.9464944649446494,
"grad_norm": 1.49827559538925,
"learning_rate": 6.631417380046656e-06,
"loss": 0.7025,
"step": 2110
},
{
"epoch": 1.9511070110701108,
"grad_norm": 1.531002649653087,
"learning_rate": 6.580919625268114e-06,
"loss": 0.6909,
"step": 2115
},
{
"epoch": 1.9557195571955721,
"grad_norm": 1.509365230765324,
"learning_rate": 6.530520407499049e-06,
"loss": 0.686,
"step": 2120
},
{
"epoch": 1.9603321033210332,
"grad_norm": 1.5743592553630588,
"learning_rate": 6.480221179231849e-06,
"loss": 0.7051,
"step": 2125
},
{
"epoch": 1.9649446494464944,
"grad_norm": 1.6561005765337469,
"learning_rate": 6.430023390077218e-06,
"loss": 0.6975,
"step": 2130
},
{
"epoch": 1.9695571955719557,
"grad_norm": 1.4695572898069678,
"learning_rate": 6.379928486722421e-06,
"loss": 0.703,
"step": 2135
},
{
"epoch": 1.974169741697417,
"grad_norm": 1.436121247379392,
"learning_rate": 6.329937912889582e-06,
"loss": 0.7037,
"step": 2140
},
{
"epoch": 1.9787822878228782,
"grad_norm": 1.4604394210363645,
"learning_rate": 6.280053109294064e-06,
"loss": 0.6861,
"step": 2145
},
{
"epoch": 1.9833948339483394,
"grad_norm": 1.49703841483432,
"learning_rate": 6.230275513602968e-06,
"loss": 0.6848,
"step": 2150
},
{
"epoch": 1.9880073800738007,
"grad_norm": 1.4776846632157035,
"learning_rate": 6.180606560393694e-06,
"loss": 0.6854,
"step": 2155
},
{
"epoch": 1.992619926199262,
"grad_norm": 1.469102349555009,
"learning_rate": 6.131047681112583e-06,
"loss": 0.6901,
"step": 2160
},
{
"epoch": 1.9972324723247232,
"grad_norm": 1.4916818504881257,
"learning_rate": 6.081600304033682e-06,
"loss": 0.6986,
"step": 2165
},
{
"epoch": 2.0018450184501844,
"grad_norm": 3.4623791161329507,
"learning_rate": 6.032265854217574e-06,
"loss": 0.5805,
"step": 2170
},
{
"epoch": 2.006457564575646,
"grad_norm": 2.5409394096245324,
"learning_rate": 5.983045753470308e-06,
"loss": 0.4067,
"step": 2175
},
{
"epoch": 2.011070110701107,
"grad_norm": 1.963539215801178,
"learning_rate": 5.933941420302412e-06,
"loss": 0.41,
"step": 2180
},
{
"epoch": 2.015682656826568,
"grad_norm": 1.9578493570683806,
"learning_rate": 5.884954269888032e-06,
"loss": 0.4078,
"step": 2185
},
{
"epoch": 2.0202952029520294,
"grad_norm": 1.6857102876256314,
"learning_rate": 5.83608571402414e-06,
"loss": 0.4126,
"step": 2190
},
{
"epoch": 2.024907749077491,
"grad_norm": 1.7969907397732796,
"learning_rate": 5.787337161089836e-06,
"loss": 0.4086,
"step": 2195
},
{
"epoch": 2.029520295202952,
"grad_norm": 1.6058011608079648,
"learning_rate": 5.738710016005766e-06,
"loss": 0.3925,
"step": 2200
},
{
"epoch": 2.029520295202952,
"eval_loss": 1.0149868726730347,
"eval_runtime": 417.6368,
"eval_samples_per_second": 36.754,
"eval_steps_per_second": 0.144,
"step": 2200
},
{
"epoch": 2.034132841328413,
"grad_norm": 1.6505939948003603,
"learning_rate": 5.690205680193647e-06,
"loss": 0.3948,
"step": 2205
},
{
"epoch": 2.0387453874538743,
"grad_norm": 1.6068608464647989,
"learning_rate": 5.641825551535849e-06,
"loss": 0.3878,
"step": 2210
},
{
"epoch": 2.043357933579336,
"grad_norm": 1.6707505723255622,
"learning_rate": 5.593571024335126e-06,
"loss": 0.3977,
"step": 2215
},
{
"epoch": 2.047970479704797,
"grad_norm": 1.6484182975706831,
"learning_rate": 5.545443489274444e-06,
"loss": 0.4009,
"step": 2220
},
{
"epoch": 2.052583025830258,
"grad_norm": 1.6146171821462916,
"learning_rate": 5.497444333376874e-06,
"loss": 0.3991,
"step": 2225
},
{
"epoch": 2.0571955719557193,
"grad_norm": 1.6445160771788836,
"learning_rate": 5.449574939965637e-06,
"loss": 0.4019,
"step": 2230
},
{
"epoch": 2.061808118081181,
"grad_norm": 1.6446158891460347,
"learning_rate": 5.401836688624231e-06,
"loss": 0.3885,
"step": 2235
},
{
"epoch": 2.066420664206642,
"grad_norm": 1.5856595607293187,
"learning_rate": 5.354230955156684e-06,
"loss": 0.4052,
"step": 2240
},
{
"epoch": 2.071033210332103,
"grad_norm": 1.5870199236748768,
"learning_rate": 5.306759111547881e-06,
"loss": 0.4029,
"step": 2245
},
{
"epoch": 2.0756457564575648,
"grad_norm": 1.6193242932623593,
"learning_rate": 5.259422525924037e-06,
"loss": 0.3907,
"step": 2250
},
{
"epoch": 2.080258302583026,
"grad_norm": 1.6256137352538118,
"learning_rate": 5.212222562513278e-06,
"loss": 0.3989,
"step": 2255
},
{
"epoch": 2.084870848708487,
"grad_norm": 1.6417107171770284,
"learning_rate": 5.165160581606301e-06,
"loss": 0.3982,
"step": 2260
},
{
"epoch": 2.089483394833948,
"grad_norm": 1.583054814160985,
"learning_rate": 5.11823793951719e-06,
"loss": 0.3857,
"step": 2265
},
{
"epoch": 2.0940959409594098,
"grad_norm": 1.5988302758967783,
"learning_rate": 5.0714559885443115e-06,
"loss": 0.3912,
"step": 2270
},
{
"epoch": 2.098708487084871,
"grad_norm": 1.5549504658907112,
"learning_rate": 5.024816076931366e-06,
"loss": 0.3964,
"step": 2275
},
{
"epoch": 2.103321033210332,
"grad_norm": 1.6288272703564912,
"learning_rate": 4.978319548828504e-06,
"loss": 0.3979,
"step": 2280
},
{
"epoch": 2.107933579335793,
"grad_norm": 1.6075015238877752,
"learning_rate": 4.931967744253601e-06,
"loss": 0.3859,
"step": 2285
},
{
"epoch": 2.1125461254612548,
"grad_norm": 1.6429738157314036,
"learning_rate": 4.885761999053647e-06,
"loss": 0.3962,
"step": 2290
},
{
"epoch": 2.117158671586716,
"grad_norm": 1.6796427577430482,
"learning_rate": 4.839703644866228e-06,
"loss": 0.4075,
"step": 2295
},
{
"epoch": 2.121771217712177,
"grad_norm": 1.619217591723515,
"learning_rate": 4.793794009081167e-06,
"loss": 0.4085,
"step": 2300
},
{
"epoch": 2.126383763837638,
"grad_norm": 1.6669420506553294,
"learning_rate": 4.7480344148022535e-06,
"loss": 0.4009,
"step": 2305
},
{
"epoch": 2.1309963099630997,
"grad_norm": 1.601567836454024,
"learning_rate": 4.702426180809132e-06,
"loss": 0.3893,
"step": 2310
},
{
"epoch": 2.135608856088561,
"grad_norm": 1.6330436004091688,
"learning_rate": 4.65697062151927e-06,
"loss": 0.3935,
"step": 2315
},
{
"epoch": 2.140221402214022,
"grad_norm": 1.686740725167288,
"learning_rate": 4.611669046950093e-06,
"loss": 0.4062,
"step": 2320
},
{
"epoch": 2.1448339483394836,
"grad_norm": 1.5889691549399958,
"learning_rate": 4.566522762681239e-06,
"loss": 0.3979,
"step": 2325
},
{
"epoch": 2.1494464944649447,
"grad_norm": 1.6850634151797181,
"learning_rate": 4.521533069816895e-06,
"loss": 0.3999,
"step": 2330
},
{
"epoch": 2.154059040590406,
"grad_norm": 1.5458146169307518,
"learning_rate": 4.4767012649483484e-06,
"loss": 0.3903,
"step": 2335
},
{
"epoch": 2.158671586715867,
"grad_norm": 1.6016021907561413,
"learning_rate": 4.432028640116581e-06,
"loss": 0.3885,
"step": 2340
},
{
"epoch": 2.1632841328413286,
"grad_norm": 1.6830749800846674,
"learning_rate": 4.387516482775058e-06,
"loss": 0.3897,
"step": 2345
},
{
"epoch": 2.1678966789667897,
"grad_norm": 1.657726450794809,
"learning_rate": 4.343166075752605e-06,
"loss": 0.3995,
"step": 2350
},
{
"epoch": 2.172509225092251,
"grad_norm": 1.6089499042377242,
"learning_rate": 4.298978697216442e-06,
"loss": 0.3906,
"step": 2355
},
{
"epoch": 2.177121771217712,
"grad_norm": 1.6433325368187606,
"learning_rate": 4.254955620635371e-06,
"loss": 0.3836,
"step": 2360
},
{
"epoch": 2.1817343173431736,
"grad_norm": 1.6228967594394044,
"learning_rate": 4.21109811474302e-06,
"loss": 0.3953,
"step": 2365
},
{
"epoch": 2.1863468634686347,
"grad_norm": 1.6564397078095119,
"learning_rate": 4.1674074435013445e-06,
"loss": 0.3975,
"step": 2370
},
{
"epoch": 2.190959409594096,
"grad_norm": 1.6855648962846128,
"learning_rate": 4.1238848660641504e-06,
"loss": 0.389,
"step": 2375
},
{
"epoch": 2.195571955719557,
"grad_norm": 1.5746673175696069,
"learning_rate": 4.080531636740836e-06,
"loss": 0.3844,
"step": 2380
},
{
"epoch": 2.2001845018450186,
"grad_norm": 1.6344618154669375,
"learning_rate": 4.03734900496022e-06,
"loss": 0.3988,
"step": 2385
},
{
"epoch": 2.2047970479704797,
"grad_norm": 1.6281928689117737,
"learning_rate": 3.994338215234547e-06,
"loss": 0.3896,
"step": 2390
},
{
"epoch": 2.209409594095941,
"grad_norm": 1.6257611833188321,
"learning_rate": 3.9515005071236274e-06,
"loss": 0.3961,
"step": 2395
},
{
"epoch": 2.2140221402214024,
"grad_norm": 1.5957292745421947,
"learning_rate": 3.908837115199086e-06,
"loss": 0.3871,
"step": 2400
},
{
"epoch": 2.2140221402214024,
"eval_loss": 1.0319310426712036,
"eval_runtime": 393.3456,
"eval_samples_per_second": 39.024,
"eval_steps_per_second": 0.153,
"step": 2400
},
{
"epoch": 2.2186346863468636,
"grad_norm": 1.5729201496024248,
"learning_rate": 3.866349269008819e-06,
"loss": 0.385,
"step": 2405
},
{
"epoch": 2.2232472324723247,
"grad_norm": 1.6183544120950042,
"learning_rate": 3.824038193041529e-06,
"loss": 0.3968,
"step": 2410
},
{
"epoch": 2.227859778597786,
"grad_norm": 1.6955717019033336,
"learning_rate": 3.781905106691447e-06,
"loss": 0.4004,
"step": 2415
},
{
"epoch": 2.2324723247232474,
"grad_norm": 1.6580260222032042,
"learning_rate": 3.7399512242231994e-06,
"loss": 0.3842,
"step": 2420
},
{
"epoch": 2.2370848708487086,
"grad_norm": 1.6490315910819098,
"learning_rate": 3.698177754736787e-06,
"loss": 0.3862,
"step": 2425
},
{
"epoch": 2.2416974169741697,
"grad_norm": 1.6351326865393605,
"learning_rate": 3.6565859021327777e-06,
"loss": 0.3952,
"step": 2430
},
{
"epoch": 2.246309963099631,
"grad_norm": 1.7016714416453813,
"learning_rate": 3.6151768650775577e-06,
"loss": 0.3906,
"step": 2435
},
{
"epoch": 2.2509225092250924,
"grad_norm": 1.5855218888376186,
"learning_rate": 3.5739518369688454e-06,
"loss": 0.391,
"step": 2440
},
{
"epoch": 2.2555350553505535,
"grad_norm": 1.5979300773582483,
"learning_rate": 3.5329120059012536e-06,
"loss": 0.3884,
"step": 2445
},
{
"epoch": 2.2601476014760147,
"grad_norm": 1.631933769302546,
"learning_rate": 3.492058554632063e-06,
"loss": 0.4012,
"step": 2450
},
{
"epoch": 2.264760147601476,
"grad_norm": 1.6784627234471698,
"learning_rate": 3.4513926605471504e-06,
"loss": 0.3956,
"step": 2455
},
{
"epoch": 2.2693726937269374,
"grad_norm": 1.5994089046113484,
"learning_rate": 3.4109154956270253e-06,
"loss": 0.3919,
"step": 2460
},
{
"epoch": 2.2739852398523985,
"grad_norm": 1.5962979043384486,
"learning_rate": 3.370628226413093e-06,
"loss": 0.3975,
"step": 2465
},
{
"epoch": 2.2785977859778597,
"grad_norm": 1.6734564837873838,
"learning_rate": 3.330532013973987e-06,
"loss": 0.3887,
"step": 2470
},
{
"epoch": 2.2832103321033212,
"grad_norm": 1.5662431409042064,
"learning_rate": 3.290628013872159e-06,
"loss": 0.3841,
"step": 2475
},
{
"epoch": 2.2878228782287824,
"grad_norm": 1.5635156068197413,
"learning_rate": 3.250917376130538e-06,
"loss": 0.3951,
"step": 2480
},
{
"epoch": 2.2924354243542435,
"grad_norm": 1.6010796160602963,
"learning_rate": 3.211401245199398e-06,
"loss": 0.3942,
"step": 2485
},
{
"epoch": 2.2970479704797047,
"grad_norm": 1.6199565737985742,
"learning_rate": 3.1720807599233903e-06,
"loss": 0.3927,
"step": 2490
},
{
"epoch": 2.3016605166051662,
"grad_norm": 1.6515237978384454,
"learning_rate": 3.132957053508696e-06,
"loss": 0.3978,
"step": 2495
},
{
"epoch": 2.3062730627306274,
"grad_norm": 1.6541451651760768,
"learning_rate": 3.0940312534903848e-06,
"loss": 0.397,
"step": 2500
},
{
"epoch": 2.3108856088560885,
"grad_norm": 1.6636499883415654,
"learning_rate": 3.0553044816999133e-06,
"loss": 0.3771,
"step": 2505
},
{
"epoch": 2.3154981549815496,
"grad_norm": 1.5866618676441615,
"learning_rate": 3.0167778542328053e-06,
"loss": 0.3967,
"step": 2510
},
{
"epoch": 2.3201107011070112,
"grad_norm": 1.6302246602193289,
"learning_rate": 2.9784524814164673e-06,
"loss": 0.4006,
"step": 2515
},
{
"epoch": 2.3247232472324724,
"grad_norm": 1.6231412036835564,
"learning_rate": 2.940329467778198e-06,
"loss": 0.3959,
"step": 2520
},
{
"epoch": 2.3293357933579335,
"grad_norm": 1.6840760426840755,
"learning_rate": 2.9024099120133674e-06,
"loss": 0.3908,
"step": 2525
},
{
"epoch": 2.3339483394833946,
"grad_norm": 1.7187956725298614,
"learning_rate": 2.8646949069537343e-06,
"loss": 0.3908,
"step": 2530
},
{
"epoch": 2.338560885608856,
"grad_norm": 1.56700956891004,
"learning_rate": 2.8271855395359613e-06,
"loss": 0.3961,
"step": 2535
},
{
"epoch": 2.3431734317343174,
"grad_norm": 1.6097463271059957,
"learning_rate": 2.7898828907702826e-06,
"loss": 0.3894,
"step": 2540
},
{
"epoch": 2.3477859778597785,
"grad_norm": 1.5810624302563459,
"learning_rate": 2.7527880357093673e-06,
"loss": 0.3853,
"step": 2545
},
{
"epoch": 2.35239852398524,
"grad_norm": 1.6230445922415717,
"learning_rate": 2.71590204341731e-06,
"loss": 0.3904,
"step": 2550
},
{
"epoch": 2.357011070110701,
"grad_norm": 1.631243837661556,
"learning_rate": 2.6792259769388394e-06,
"loss": 0.3854,
"step": 2555
},
{
"epoch": 2.3616236162361623,
"grad_norm": 1.6015681704315312,
"learning_rate": 2.642760893268684e-06,
"loss": 0.3897,
"step": 2560
},
{
"epoch": 2.3662361623616235,
"grad_norm": 1.629094206408994,
"learning_rate": 2.6065078433210913e-06,
"loss": 0.3956,
"step": 2565
},
{
"epoch": 2.3708487084870846,
"grad_norm": 1.6768173921403078,
"learning_rate": 2.570467871899557e-06,
"loss": 0.3882,
"step": 2570
},
{
"epoch": 2.375461254612546,
"grad_norm": 1.6096361607682703,
"learning_rate": 2.5346420176667052e-06,
"loss": 0.3841,
"step": 2575
},
{
"epoch": 2.3800738007380073,
"grad_norm": 1.5785068874659574,
"learning_rate": 2.4990313131143716e-06,
"loss": 0.407,
"step": 2580
},
{
"epoch": 2.3846863468634685,
"grad_norm": 1.639780492512362,
"learning_rate": 2.463636784533813e-06,
"loss": 0.3872,
"step": 2585
},
{
"epoch": 2.38929889298893,
"grad_norm": 1.5634276209766216,
"learning_rate": 2.4284594519861637e-06,
"loss": 0.3844,
"step": 2590
},
{
"epoch": 2.393911439114391,
"grad_norm": 1.6051539919308102,
"learning_rate": 2.3935003292730295e-06,
"loss": 0.3845,
"step": 2595
},
{
"epoch": 2.3985239852398523,
"grad_norm": 1.5819118683660134,
"learning_rate": 2.3587604239072535e-06,
"loss": 0.3927,
"step": 2600
},
{
"epoch": 2.3985239852398523,
"eval_loss": 1.0269191265106201,
"eval_runtime": 441.0938,
"eval_samples_per_second": 34.8,
"eval_steps_per_second": 0.136,
"step": 2600
},
{
"epoch": 2.4031365313653135,
"grad_norm": 1.625902982522089,
"learning_rate": 2.324240737083897e-06,
"loss": 0.3967,
"step": 2605
},
{
"epoch": 2.407749077490775,
"grad_norm": 1.6115877735569477,
"learning_rate": 2.2899422636513768e-06,
"loss": 0.3888,
"step": 2610
},
{
"epoch": 2.412361623616236,
"grad_norm": 1.6393950015907957,
"learning_rate": 2.2558659920828095e-06,
"loss": 0.3866,
"step": 2615
},
{
"epoch": 2.4169741697416973,
"grad_norm": 1.6289337323279018,
"learning_rate": 2.2220129044474903e-06,
"loss": 0.3822,
"step": 2620
},
{
"epoch": 2.421586715867159,
"grad_norm": 1.626399821921426,
"learning_rate": 2.1883839763826285e-06,
"loss": 0.3917,
"step": 2625
},
{
"epoch": 2.42619926199262,
"grad_norm": 1.5750110015076921,
"learning_rate": 2.15498017706521e-06,
"loss": 0.3818,
"step": 2630
},
{
"epoch": 2.430811808118081,
"grad_norm": 1.5656770730106075,
"learning_rate": 2.1218024691840646e-06,
"loss": 0.3949,
"step": 2635
},
{
"epoch": 2.4354243542435423,
"grad_norm": 1.5878185538716267,
"learning_rate": 2.088851808912126e-06,
"loss": 0.39,
"step": 2640
},
{
"epoch": 2.4400369003690034,
"grad_norm": 1.6108620558982116,
"learning_rate": 2.0561291458788736e-06,
"loss": 0.3968,
"step": 2645
},
{
"epoch": 2.444649446494465,
"grad_norm": 1.5917905556601293,
"learning_rate": 2.0236354231429743e-06,
"loss": 0.3835,
"step": 2650
},
{
"epoch": 2.449261992619926,
"grad_norm": 1.5749010329322541,
"learning_rate": 1.9913715771650798e-06,
"loss": 0.3878,
"step": 2655
},
{
"epoch": 2.4538745387453873,
"grad_norm": 1.5853197835560284,
"learning_rate": 1.959338537780868e-06,
"loss": 0.3793,
"step": 2660
},
{
"epoch": 2.458487084870849,
"grad_norm": 1.675999301164994,
"learning_rate": 1.9275372281742242e-06,
"loss": 0.3888,
"step": 2665
},
{
"epoch": 2.46309963099631,
"grad_norm": 1.5909593196232035,
"learning_rate": 1.8959685648506365e-06,
"loss": 0.379,
"step": 2670
},
{
"epoch": 2.467712177121771,
"grad_norm": 1.6580184737262997,
"learning_rate": 1.8646334576107993e-06,
"loss": 0.385,
"step": 2675
},
{
"epoch": 2.4723247232472323,
"grad_norm": 1.694950561030231,
"learning_rate": 1.83353280952437e-06,
"loss": 0.4061,
"step": 2680
},
{
"epoch": 2.476937269372694,
"grad_norm": 1.5612968845817867,
"learning_rate": 1.8026675169039654e-06,
"loss": 0.3717,
"step": 2685
},
{
"epoch": 2.481549815498155,
"grad_norm": 1.6589816057109397,
"learning_rate": 1.7720384692793036e-06,
"loss": 0.3907,
"step": 2690
},
{
"epoch": 2.486162361623616,
"grad_norm": 1.6036469226772128,
"learning_rate": 1.7416465493715984e-06,
"loss": 0.3777,
"step": 2695
},
{
"epoch": 2.4907749077490777,
"grad_norm": 1.6327667406661128,
"learning_rate": 1.7114926330680958e-06,
"loss": 0.3875,
"step": 2700
},
{
"epoch": 2.495387453874539,
"grad_norm": 1.5827244143165553,
"learning_rate": 1.681577589396839e-06,
"loss": 0.3859,
"step": 2705
},
{
"epoch": 2.5,
"grad_norm": 1.6514702752041677,
"learning_rate": 1.6519022805016305e-06,
"loss": 0.3843,
"step": 2710
},
{
"epoch": 2.504612546125461,
"grad_norm": 1.5570332325787979,
"learning_rate": 1.6224675616171737e-06,
"loss": 0.3715,
"step": 2715
},
{
"epoch": 2.5092250922509223,
"grad_norm": 1.630413477205012,
"learning_rate": 1.5932742810444314e-06,
"loss": 0.3836,
"step": 2720
},
{
"epoch": 2.513837638376384,
"grad_norm": 1.6521730580375191,
"learning_rate": 1.5643232801261731e-06,
"loss": 0.3948,
"step": 2725
},
{
"epoch": 2.518450184501845,
"grad_norm": 1.6155666031902267,
"learning_rate": 1.5356153932227423e-06,
"loss": 0.3898,
"step": 2730
},
{
"epoch": 2.523062730627306,
"grad_norm": 1.6555206625970396,
"learning_rate": 1.5071514476879878e-06,
"loss": 0.384,
"step": 2735
},
{
"epoch": 2.5276752767527677,
"grad_norm": 1.6677020481929692,
"learning_rate": 1.478932263845435e-06,
"loss": 0.3952,
"step": 2740
},
{
"epoch": 2.532287822878229,
"grad_norm": 1.66071104050102,
"learning_rate": 1.450958654964647e-06,
"loss": 0.3883,
"step": 2745
},
{
"epoch": 2.53690036900369,
"grad_norm": 1.5550008356755514,
"learning_rate": 1.4232314272377723e-06,
"loss": 0.3867,
"step": 2750
},
{
"epoch": 2.541512915129151,
"grad_norm": 1.6135337105923544,
"learning_rate": 1.3957513797563227e-06,
"loss": 0.3895,
"step": 2755
},
{
"epoch": 2.5461254612546127,
"grad_norm": 1.6353151843919402,
"learning_rate": 1.368519304488134e-06,
"loss": 0.3868,
"step": 2760
},
{
"epoch": 2.550738007380074,
"grad_norm": 1.5705890317457465,
"learning_rate": 1.3415359862545574e-06,
"loss": 0.3834,
"step": 2765
},
{
"epoch": 2.555350553505535,
"grad_norm": 1.654601851442529,
"learning_rate": 1.3148022027078223e-06,
"loss": 0.3832,
"step": 2770
},
{
"epoch": 2.5599630996309966,
"grad_norm": 1.5695058446628602,
"learning_rate": 1.2883187243086338e-06,
"loss": 0.3893,
"step": 2775
},
{
"epoch": 2.5645756457564577,
"grad_norm": 1.6350847841681155,
"learning_rate": 1.262086314303973e-06,
"loss": 0.3898,
"step": 2780
},
{
"epoch": 2.569188191881919,
"grad_norm": 1.608716488298126,
"learning_rate": 1.2361057287050892e-06,
"loss": 0.3834,
"step": 2785
},
{
"epoch": 2.57380073800738,
"grad_norm": 1.6204640984588243,
"learning_rate": 1.2103777162657205e-06,
"loss": 0.3972,
"step": 2790
},
{
"epoch": 2.578413284132841,
"grad_norm": 1.6231664675158106,
"learning_rate": 1.1849030184605092e-06,
"loss": 0.3831,
"step": 2795
},
{
"epoch": 2.5830258302583027,
"grad_norm": 1.6455527588787915,
"learning_rate": 1.1596823694636427e-06,
"loss": 0.3872,
"step": 2800
},
{
"epoch": 2.5830258302583027,
"eval_loss": 1.0266528129577637,
"eval_runtime": 403.4694,
"eval_samples_per_second": 38.045,
"eval_steps_per_second": 0.149,
"step": 2800
},
{
"epoch": 2.587638376383764,
"grad_norm": 1.6501434655468525,
"learning_rate": 1.134716496127679e-06,
"loss": 0.3866,
"step": 2805
},
{
"epoch": 2.592250922509225,
"grad_norm": 1.5937430607804544,
"learning_rate": 1.110006117962612e-06,
"loss": 0.3746,
"step": 2810
},
{
"epoch": 2.5968634686346865,
"grad_norm": 1.6042558353979863,
"learning_rate": 1.085551947115131e-06,
"loss": 0.3813,
"step": 2815
},
{
"epoch": 2.6014760147601477,
"grad_norm": 1.5880288341663498,
"learning_rate": 1.0613546883480974e-06,
"loss": 0.3879,
"step": 2820
},
{
"epoch": 2.606088560885609,
"grad_norm": 1.6687358545150779,
"learning_rate": 1.0374150390202308e-06,
"loss": 0.3764,
"step": 2825
},
{
"epoch": 2.61070110701107,
"grad_norm": 1.6305478991844253,
"learning_rate": 1.013733689066012e-06,
"loss": 0.3936,
"step": 2830
},
{
"epoch": 2.6153136531365315,
"grad_norm": 1.5959262607963571,
"learning_rate": 9.903113209758098e-07,
"loss": 0.3809,
"step": 2835
},
{
"epoch": 2.6199261992619927,
"grad_norm": 1.6326891680063345,
"learning_rate": 9.671486097761918e-07,
"loss": 0.3851,
"step": 2840
},
{
"epoch": 2.624538745387454,
"grad_norm": 1.5671079064226814,
"learning_rate": 9.442462230104876e-07,
"loss": 0.3813,
"step": 2845
},
{
"epoch": 2.6291512915129154,
"grad_norm": 1.6290077759272117,
"learning_rate": 9.216048207195438e-07,
"loss": 0.3815,
"step": 2850
},
{
"epoch": 2.6337638376383765,
"grad_norm": 1.6273985534855235,
"learning_rate": 8.992250554227011e-07,
"loss": 0.4061,
"step": 2855
},
{
"epoch": 2.6383763837638377,
"grad_norm": 1.6210509601819985,
"learning_rate": 8.771075720989886e-07,
"loss": 0.3752,
"step": 2860
},
{
"epoch": 2.642988929889299,
"grad_norm": 1.555467095004222,
"learning_rate": 8.552530081685384e-07,
"loss": 0.3875,
"step": 2865
},
{
"epoch": 2.64760147601476,
"grad_norm": 1.5802373017514193,
"learning_rate": 8.336619934742151e-07,
"loss": 0.3819,
"step": 2870
},
{
"epoch": 2.6522140221402215,
"grad_norm": 1.6154443890108339,
"learning_rate": 8.123351502634625e-07,
"loss": 0.3888,
"step": 2875
},
{
"epoch": 2.6568265682656826,
"grad_norm": 1.6154395851668601,
"learning_rate": 7.91273093170365e-07,
"loss": 0.3808,
"step": 2880
},
{
"epoch": 2.661439114391144,
"grad_norm": 1.587280501111623,
"learning_rate": 7.704764291979516e-07,
"loss": 0.3774,
"step": 2885
},
{
"epoch": 2.6660516605166054,
"grad_norm": 1.6071595650095456,
"learning_rate": 7.499457577006753e-07,
"loss": 0.3819,
"step": 2890
},
{
"epoch": 2.6706642066420665,
"grad_norm": 1.648848686369746,
"learning_rate": 7.296816703671683e-07,
"loss": 0.3855,
"step": 2895
},
{
"epoch": 2.6752767527675276,
"grad_norm": 1.5617730378995032,
"learning_rate": 7.09684751203168e-07,
"loss": 0.3909,
"step": 2900
},
{
"epoch": 2.6798892988929888,
"grad_norm": 1.6183910290801358,
"learning_rate": 6.899555765147004e-07,
"loss": 0.3826,
"step": 2905
},
{
"epoch": 2.6845018450184504,
"grad_norm": 1.674038262833668,
"learning_rate": 6.704947148914608e-07,
"loss": 0.382,
"step": 2910
},
{
"epoch": 2.6891143911439115,
"grad_norm": 1.6448057348626846,
"learning_rate": 6.513027271904315e-07,
"loss": 0.3854,
"step": 2915
},
{
"epoch": 2.6937269372693726,
"grad_norm": 1.6374354465922731,
"learning_rate": 6.323801665197238e-07,
"loss": 0.3851,
"step": 2920
},
{
"epoch": 2.698339483394834,
"grad_norm": 1.5834812445833784,
"learning_rate": 6.137275782226216e-07,
"loss": 0.3819,
"step": 2925
},
{
"epoch": 2.7029520295202953,
"grad_norm": 1.623319595480127,
"learning_rate": 5.953454998618857e-07,
"loss": 0.3856,
"step": 2930
},
{
"epoch": 2.7075645756457565,
"grad_norm": 1.5967961053246091,
"learning_rate": 5.772344612042435e-07,
"loss": 0.3862,
"step": 2935
},
{
"epoch": 2.7121771217712176,
"grad_norm": 1.5710116912601946,
"learning_rate": 5.593949842051338e-07,
"loss": 0.3842,
"step": 2940
},
{
"epoch": 2.7167896678966788,
"grad_norm": 1.57362891686515,
"learning_rate": 5.418275829936537e-07,
"loss": 0.3711,
"step": 2945
},
{
"epoch": 2.7214022140221403,
"grad_norm": 1.580310931596939,
"learning_rate": 5.24532763857749e-07,
"loss": 0.3835,
"step": 2950
},
{
"epoch": 2.7260147601476015,
"grad_norm": 1.6145634716470691,
"learning_rate": 5.075110252296245e-07,
"loss": 0.3882,
"step": 2955
},
{
"epoch": 2.7306273062730626,
"grad_norm": 1.6743712939756843,
"learning_rate": 4.907628576713663e-07,
"loss": 0.3838,
"step": 2960
},
{
"epoch": 2.735239852398524,
"grad_norm": 1.5631707817004297,
"learning_rate": 4.742887438608235e-07,
"loss": 0.387,
"step": 2965
},
{
"epoch": 2.7398523985239853,
"grad_norm": 1.6160350852160132,
"learning_rate": 4.5808915857768035e-07,
"loss": 0.3733,
"step": 2970
},
{
"epoch": 2.7444649446494465,
"grad_norm": 1.6391404550180673,
"learning_rate": 4.4216456868978243e-07,
"loss": 0.3863,
"step": 2975
},
{
"epoch": 2.7490774907749076,
"grad_norm": 1.5869883487994245,
"learning_rate": 4.265154331396815e-07,
"loss": 0.3803,
"step": 2980
},
{
"epoch": 2.7536900369003687,
"grad_norm": 1.6338303616426142,
"learning_rate": 4.111422029314016e-07,
"loss": 0.367,
"step": 2985
},
{
"epoch": 2.7583025830258303,
"grad_norm": 1.6222090885217113,
"learning_rate": 3.960453211174531e-07,
"loss": 0.3913,
"step": 2990
},
{
"epoch": 2.7629151291512914,
"grad_norm": 1.5453198744376195,
"learning_rate": 3.8122522278605024e-07,
"loss": 0.3884,
"step": 2995
},
{
"epoch": 2.767527675276753,
"grad_norm": 1.6194002178218827,
"learning_rate": 3.6668233504858486e-07,
"loss": 0.3918,
"step": 3000
},
{
"epoch": 2.767527675276753,
"eval_loss": 1.0241528749465942,
"eval_runtime": 583.1671,
"eval_samples_per_second": 26.322,
"eval_steps_per_second": 0.103,
"step": 3000
},
{
"epoch": 2.772140221402214,
"grad_norm": 1.5792021657673334,
"learning_rate": 3.524170770273072e-07,
"loss": 0.3836,
"step": 3005
},
{
"epoch": 2.7767527675276753,
"grad_norm": 1.5855513913689898,
"learning_rate": 3.384298598432545e-07,
"loss": 0.3836,
"step": 3010
},
{
"epoch": 2.7813653136531364,
"grad_norm": 1.5563076811229497,
"learning_rate": 3.2472108660439706e-07,
"loss": 0.3802,
"step": 3015
},
{
"epoch": 2.7859778597785976,
"grad_norm": 1.5823633538445738,
"learning_rate": 3.112911523940232e-07,
"loss": 0.383,
"step": 3020
},
{
"epoch": 2.790590405904059,
"grad_norm": 1.629806513708018,
"learning_rate": 2.9814044425935605e-07,
"loss": 0.3821,
"step": 3025
},
{
"epoch": 2.7952029520295203,
"grad_norm": 1.6493594318817755,
"learning_rate": 2.852693412003882e-07,
"loss": 0.3832,
"step": 3030
},
{
"epoch": 2.7998154981549814,
"grad_norm": 1.5938360437660848,
"learning_rate": 2.7267821415897343e-07,
"loss": 0.3739,
"step": 3035
},
{
"epoch": 2.804428044280443,
"grad_norm": 1.6409833373595797,
"learning_rate": 2.6036742600812683e-07,
"loss": 0.3824,
"step": 3040
},
{
"epoch": 2.809040590405904,
"grad_norm": 1.6120109521550734,
"learning_rate": 2.4833733154156716e-07,
"loss": 0.3791,
"step": 3045
},
{
"epoch": 2.8136531365313653,
"grad_norm": 1.6148694677775197,
"learning_rate": 2.3658827746349976e-07,
"loss": 0.3716,
"step": 3050
},
{
"epoch": 2.8182656826568264,
"grad_norm": 1.6335439307552395,
"learning_rate": 2.2512060237861455e-07,
"loss": 0.377,
"step": 3055
},
{
"epoch": 2.8228782287822876,
"grad_norm": 1.6687024271570985,
"learning_rate": 2.139346367823314e-07,
"loss": 0.3824,
"step": 3060
},
{
"epoch": 2.827490774907749,
"grad_norm": 1.5961399559857916,
"learning_rate": 2.030307030512768e-07,
"loss": 0.38,
"step": 3065
},
{
"epoch": 2.8321033210332103,
"grad_norm": 1.628301167538455,
"learning_rate": 1.9240911543399465e-07,
"loss": 0.3861,
"step": 3070
},
{
"epoch": 2.836715867158672,
"grad_norm": 1.5876724378264213,
"learning_rate": 1.8207018004188338e-07,
"loss": 0.375,
"step": 3075
},
{
"epoch": 2.841328413284133,
"grad_norm": 1.6533761523786383,
"learning_rate": 1.7201419484037861e-07,
"loss": 0.3847,
"step": 3080
},
{
"epoch": 2.845940959409594,
"grad_norm": 1.65860087801836,
"learning_rate": 1.622414496403668e-07,
"loss": 0.4014,
"step": 3085
},
{
"epoch": 2.8505535055350553,
"grad_norm": 1.5158450003834456,
"learning_rate": 1.527522260898273e-07,
"loss": 0.3743,
"step": 3090
},
{
"epoch": 2.8551660516605164,
"grad_norm": 1.543441072095729,
"learning_rate": 1.4354679766572344e-07,
"loss": 0.3867,
"step": 3095
},
{
"epoch": 2.859778597785978,
"grad_norm": 1.6190829129675481,
"learning_rate": 1.3462542966611314e-07,
"loss": 0.3697,
"step": 3100
},
{
"epoch": 2.864391143911439,
"grad_norm": 1.6865745421103189,
"learning_rate": 1.259883792025085e-07,
"loss": 0.3744,
"step": 3105
},
{
"epoch": 2.8690036900369003,
"grad_norm": 1.6329028917573583,
"learning_rate": 1.1763589519246388e-07,
"loss": 0.3722,
"step": 3110
},
{
"epoch": 2.873616236162362,
"grad_norm": 1.5873886153372632,
"learning_rate": 1.095682183524005e-07,
"loss": 0.3797,
"step": 3115
},
{
"epoch": 2.878228782287823,
"grad_norm": 1.580205694330548,
"learning_rate": 1.0178558119067316e-07,
"loss": 0.3705,
"step": 3120
},
{
"epoch": 2.882841328413284,
"grad_norm": 1.5748354606004111,
"learning_rate": 9.428820800086558e-08,
"loss": 0.3832,
"step": 3125
},
{
"epoch": 2.8874538745387452,
"grad_norm": 1.6471024830756282,
"learning_rate": 8.707631485532775e-08,
"loss": 0.3886,
"step": 3130
},
{
"epoch": 2.8920664206642064,
"grad_norm": 1.6632577517398965,
"learning_rate": 8.015010959894986e-08,
"loss": 0.384,
"step": 3135
},
{
"epoch": 2.896678966789668,
"grad_norm": 1.6208970211899278,
"learning_rate": 7.350979184317153e-08,
"loss": 0.3861,
"step": 3140
},
{
"epoch": 2.901291512915129,
"grad_norm": 1.64161859212595,
"learning_rate": 6.715555296022746e-08,
"loss": 0.3767,
"step": 3145
},
{
"epoch": 2.9059040590405907,
"grad_norm": 1.6172525804643438,
"learning_rate": 6.108757607763305e-08,
"loss": 0.3857,
"step": 3150
},
{
"epoch": 2.910516605166052,
"grad_norm": 1.580351811354319,
"learning_rate": 5.530603607290852e-08,
"loss": 0.3771,
"step": 3155
},
{
"epoch": 2.915129151291513,
"grad_norm": 1.572483075790589,
"learning_rate": 4.981109956853747e-08,
"loss": 0.3749,
"step": 3160
},
{
"epoch": 2.919741697416974,
"grad_norm": 1.64962807846865,
"learning_rate": 4.460292492716512e-08,
"loss": 0.3795,
"step": 3165
},
{
"epoch": 2.9243542435424352,
"grad_norm": 1.605735844681554,
"learning_rate": 3.968166224703085e-08,
"loss": 0.3795,
"step": 3170
},
{
"epoch": 2.928966789667897,
"grad_norm": 1.5747261446081762,
"learning_rate": 3.504745335765169e-08,
"loss": 0.3793,
"step": 3175
},
{
"epoch": 2.933579335793358,
"grad_norm": 1.653000934575167,
"learning_rate": 3.0700431815724464e-08,
"loss": 0.3903,
"step": 3180
},
{
"epoch": 2.938191881918819,
"grad_norm": 1.6378961953845004,
"learning_rate": 2.664072290128217e-08,
"loss": 0.3889,
"step": 3185
},
{
"epoch": 2.9428044280442807,
"grad_norm": 1.584042922862379,
"learning_rate": 2.2868443614082468e-08,
"loss": 0.3878,
"step": 3190
},
{
"epoch": 2.947416974169742,
"grad_norm": 1.5647903615149348,
"learning_rate": 1.9383702670235927e-08,
"loss": 0.382,
"step": 3195
},
{
"epoch": 2.952029520295203,
"grad_norm": 1.580064728520442,
"learning_rate": 1.6186600499074055e-08,
"loss": 0.3764,
"step": 3200
},
{
"epoch": 2.952029520295203,
"eval_loss": 1.024267315864563,
"eval_runtime": 436.1706,
"eval_samples_per_second": 35.193,
"eval_steps_per_second": 0.138,
"step": 3200
},
{
"epoch": 2.956642066420664,
"grad_norm": 1.6540116549017054,
"learning_rate": 1.3277229240249435e-08,
"loss": 0.3945,
"step": 3205
},
{
"epoch": 2.961254612546125,
"grad_norm": 1.6470444524138421,
"learning_rate": 1.0655672741090028e-08,
"loss": 0.3806,
"step": 3210
},
{
"epoch": 2.965867158671587,
"grad_norm": 1.5413563389524112,
"learning_rate": 8.322006554171147e-09,
"loss": 0.3818,
"step": 3215
},
{
"epoch": 2.970479704797048,
"grad_norm": 1.6164344748774018,
"learning_rate": 6.276297935149389e-09,
"loss": 0.3847,
"step": 3220
},
{
"epoch": 2.975092250922509,
"grad_norm": 1.6066275827671024,
"learning_rate": 4.5186058408153156e-09,
"loss": 0.3823,
"step": 3225
},
{
"epoch": 2.9797047970479706,
"grad_norm": 1.6333269508170274,
"learning_rate": 3.0489809273981375e-09,
"loss": 0.3801,
"step": 3230
},
{
"epoch": 2.984317343173432,
"grad_norm": 1.6103531418344368,
"learning_rate": 1.8674655491091043e-09,
"loss": 0.3932,
"step": 3235
},
{
"epoch": 2.988929889298893,
"grad_norm": 1.6234448383351934,
"learning_rate": 9.740937569135967e-10,
"loss": 0.3832,
"step": 3240
},
{
"epoch": 2.993542435424354,
"grad_norm": 1.6318211007601922,
"learning_rate": 3.6889129755413033e-10,
"loss": 0.3871,
"step": 3245
},
{
"epoch": 2.9981549815498156,
"grad_norm": 1.609675292027893,
"learning_rate": 5.187561280983744e-11,
"loss": 0.3788,
"step": 3250
},
{
"epoch": 3.0,
"step": 3252,
"total_flos": 1361805280542720.0,
"train_loss": 0.7043063408920832,
"train_runtime": 81819.3933,
"train_samples_per_second": 5.085,
"train_steps_per_second": 0.04
}
],
"logging_steps": 5,
"max_steps": 3252,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1361805280542720.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}