Qwen2.5-VL-3B-Instruct-SFT / trainer_state.json
Diankun's picture
Model save
03358bc verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 2361,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0021177467174925877,
"grad_norm": 83.68173840087012,
"learning_rate": 4.219409282700422e-07,
"loss": 2.1661,
"mean_token_accuracy": 0.5976044356822967,
"step": 5
},
{
"epoch": 0.004235493434985175,
"grad_norm": 71.11062161153062,
"learning_rate": 8.438818565400844e-07,
"loss": 2.1622,
"mean_token_accuracy": 0.5995522707700729,
"step": 10
},
{
"epoch": 0.0063532401524777635,
"grad_norm": 34.572128830715705,
"learning_rate": 1.2658227848101267e-06,
"loss": 2.0632,
"mean_token_accuracy": 0.5944636166095734,
"step": 15
},
{
"epoch": 0.00847098686997035,
"grad_norm": 17.011229671312382,
"learning_rate": 1.6877637130801689e-06,
"loss": 1.7723,
"mean_token_accuracy": 0.6232941240072251,
"step": 20
},
{
"epoch": 0.010588733587462939,
"grad_norm": 9.74778063232665,
"learning_rate": 2.1097046413502114e-06,
"loss": 1.5731,
"mean_token_accuracy": 0.63005710542202,
"step": 25
},
{
"epoch": 0.012706480304955527,
"grad_norm": 7.914154804733739,
"learning_rate": 2.5316455696202535e-06,
"loss": 1.3488,
"mean_token_accuracy": 0.6629315137863159,
"step": 30
},
{
"epoch": 0.014824227022448115,
"grad_norm": 7.497303646153751,
"learning_rate": 2.9535864978902956e-06,
"loss": 1.1306,
"mean_token_accuracy": 0.6912301659584046,
"step": 35
},
{
"epoch": 0.0169419737399407,
"grad_norm": 3.0571735289067807,
"learning_rate": 3.3755274261603377e-06,
"loss": 0.9338,
"mean_token_accuracy": 0.7339568644762039,
"step": 40
},
{
"epoch": 0.01905972045743329,
"grad_norm": 3.7299236330227674,
"learning_rate": 3.7974683544303802e-06,
"loss": 0.8366,
"mean_token_accuracy": 0.7472610950469971,
"step": 45
},
{
"epoch": 0.021177467174925878,
"grad_norm": 1.94550213470537,
"learning_rate": 4.219409282700423e-06,
"loss": 0.813,
"mean_token_accuracy": 0.7466759532690048,
"step": 50
},
{
"epoch": 0.023295213892418468,
"grad_norm": 1.9631003842444021,
"learning_rate": 4.641350210970465e-06,
"loss": 0.7409,
"mean_token_accuracy": 0.7591830432415009,
"step": 55
},
{
"epoch": 0.025412960609911054,
"grad_norm": 2.126411421817186,
"learning_rate": 5.063291139240507e-06,
"loss": 0.6999,
"mean_token_accuracy": 0.7653463244438171,
"step": 60
},
{
"epoch": 0.027530707327403644,
"grad_norm": 2.431633067673993,
"learning_rate": 5.485232067510548e-06,
"loss": 0.6803,
"mean_token_accuracy": 0.7705583959817887,
"step": 65
},
{
"epoch": 0.02964845404489623,
"grad_norm": 2.1470463103072706,
"learning_rate": 5.907172995780591e-06,
"loss": 0.6716,
"mean_token_accuracy": 0.7747641324996948,
"step": 70
},
{
"epoch": 0.03176620076238882,
"grad_norm": 1.8134772555606546,
"learning_rate": 6.329113924050634e-06,
"loss": 0.6442,
"mean_token_accuracy": 0.7782594561576843,
"step": 75
},
{
"epoch": 0.0338839474798814,
"grad_norm": 1.828059395701887,
"learning_rate": 6.751054852320675e-06,
"loss": 0.654,
"mean_token_accuracy": 0.7730626821517944,
"step": 80
},
{
"epoch": 0.03600169419737399,
"grad_norm": 2.144653892020441,
"learning_rate": 7.172995780590718e-06,
"loss": 0.5866,
"mean_token_accuracy": 0.7909977465867997,
"step": 85
},
{
"epoch": 0.03811944091486658,
"grad_norm": 1.8471427217270429,
"learning_rate": 7.5949367088607605e-06,
"loss": 0.6343,
"mean_token_accuracy": 0.7756380766630173,
"step": 90
},
{
"epoch": 0.04023718763235917,
"grad_norm": 2.3290876362673916,
"learning_rate": 8.016877637130802e-06,
"loss": 0.6006,
"mean_token_accuracy": 0.7818532437086105,
"step": 95
},
{
"epoch": 0.042354934349851756,
"grad_norm": 2.038115409634727,
"learning_rate": 8.438818565400846e-06,
"loss": 0.6123,
"mean_token_accuracy": 0.7774519264698029,
"step": 100
},
{
"epoch": 0.044472681067344345,
"grad_norm": 1.8512017533544443,
"learning_rate": 8.860759493670886e-06,
"loss": 0.5782,
"mean_token_accuracy": 0.787897452712059,
"step": 105
},
{
"epoch": 0.046590427784836935,
"grad_norm": 1.7617335354592987,
"learning_rate": 9.28270042194093e-06,
"loss": 0.5646,
"mean_token_accuracy": 0.7916492879390716,
"step": 110
},
{
"epoch": 0.04870817450232952,
"grad_norm": 1.7594677346726018,
"learning_rate": 9.704641350210972e-06,
"loss": 0.5741,
"mean_token_accuracy": 0.7923983573913574,
"step": 115
},
{
"epoch": 0.05082592121982211,
"grad_norm": 2.074930275455498,
"learning_rate": 1.0126582278481014e-05,
"loss": 0.5872,
"mean_token_accuracy": 0.7833609640598297,
"step": 120
},
{
"epoch": 0.0529436679373147,
"grad_norm": 1.9672826514728756,
"learning_rate": 1.0548523206751056e-05,
"loss": 0.5525,
"mean_token_accuracy": 0.7987953841686248,
"step": 125
},
{
"epoch": 0.05506141465480729,
"grad_norm": 1.8715882057544604,
"learning_rate": 1.0970464135021096e-05,
"loss": 0.5825,
"mean_token_accuracy": 0.788796243071556,
"step": 130
},
{
"epoch": 0.05717916137229987,
"grad_norm": 2.055120934997949,
"learning_rate": 1.139240506329114e-05,
"loss": 0.5668,
"mean_token_accuracy": 0.794330358505249,
"step": 135
},
{
"epoch": 0.05929690808979246,
"grad_norm": 2.1751457050987213,
"learning_rate": 1.1814345991561182e-05,
"loss": 0.5504,
"mean_token_accuracy": 0.7986414194107055,
"step": 140
},
{
"epoch": 0.06141465480728505,
"grad_norm": 2.6530737679742735,
"learning_rate": 1.2236286919831224e-05,
"loss": 0.5825,
"mean_token_accuracy": 0.787852120399475,
"step": 145
},
{
"epoch": 0.06353240152477764,
"grad_norm": 1.7772714746257652,
"learning_rate": 1.2658227848101268e-05,
"loss": 0.5321,
"mean_token_accuracy": 0.7960227221250534,
"step": 150
},
{
"epoch": 0.06565014824227022,
"grad_norm": 2.007446829510951,
"learning_rate": 1.3080168776371309e-05,
"loss": 0.5652,
"mean_token_accuracy": 0.7959463059902191,
"step": 155
},
{
"epoch": 0.0677678949597628,
"grad_norm": 1.5240825493431345,
"learning_rate": 1.350210970464135e-05,
"loss": 0.5471,
"mean_token_accuracy": 0.7939142495393753,
"step": 160
},
{
"epoch": 0.0698856416772554,
"grad_norm": 2.1050082197685573,
"learning_rate": 1.3924050632911395e-05,
"loss": 0.5283,
"mean_token_accuracy": 0.8012055605649948,
"step": 165
},
{
"epoch": 0.07200338839474799,
"grad_norm": 1.8465069166680506,
"learning_rate": 1.4345991561181437e-05,
"loss": 0.5525,
"mean_token_accuracy": 0.8044249504804611,
"step": 170
},
{
"epoch": 0.07412113511224058,
"grad_norm": 1.6175686507972196,
"learning_rate": 1.4767932489451477e-05,
"loss": 0.5561,
"mean_token_accuracy": 0.8003985285758972,
"step": 175
},
{
"epoch": 0.07623888182973317,
"grad_norm": 1.987593615824268,
"learning_rate": 1.5189873417721521e-05,
"loss": 0.5637,
"mean_token_accuracy": 0.7922172099351883,
"step": 180
},
{
"epoch": 0.07835662854722575,
"grad_norm": 1.6436094243541917,
"learning_rate": 1.5611814345991563e-05,
"loss": 0.5662,
"mean_token_accuracy": 0.7916066467761993,
"step": 185
},
{
"epoch": 0.08047437526471835,
"grad_norm": 1.6236359809975813,
"learning_rate": 1.6033755274261603e-05,
"loss": 0.5432,
"mean_token_accuracy": 0.8009026974439621,
"step": 190
},
{
"epoch": 0.08259212198221093,
"grad_norm": 1.5274031981451834,
"learning_rate": 1.6455696202531647e-05,
"loss": 0.5484,
"mean_token_accuracy": 0.7982567459344864,
"step": 195
},
{
"epoch": 0.08470986869970351,
"grad_norm": 2.03018219248001,
"learning_rate": 1.687763713080169e-05,
"loss": 0.5407,
"mean_token_accuracy": 0.8011936664581298,
"step": 200
},
{
"epoch": 0.08682761541719611,
"grad_norm": 1.8014465384883518,
"learning_rate": 1.729957805907173e-05,
"loss": 0.5247,
"mean_token_accuracy": 0.8084624141454697,
"step": 205
},
{
"epoch": 0.08894536213468869,
"grad_norm": 1.7271306151793162,
"learning_rate": 1.7721518987341772e-05,
"loss": 0.5708,
"mean_token_accuracy": 0.7947988003492356,
"step": 210
},
{
"epoch": 0.09106310885218127,
"grad_norm": 1.9191596287336676,
"learning_rate": 1.8143459915611816e-05,
"loss": 0.5363,
"mean_token_accuracy": 0.8047021597623825,
"step": 215
},
{
"epoch": 0.09318085556967387,
"grad_norm": 1.7320574921982606,
"learning_rate": 1.856540084388186e-05,
"loss": 0.5658,
"mean_token_accuracy": 0.7928981482982635,
"step": 220
},
{
"epoch": 0.09529860228716645,
"grad_norm": 1.8013279459762328,
"learning_rate": 1.89873417721519e-05,
"loss": 0.5283,
"mean_token_accuracy": 0.8064373075962067,
"step": 225
},
{
"epoch": 0.09741634900465904,
"grad_norm": 1.6963994963949376,
"learning_rate": 1.9409282700421944e-05,
"loss": 0.541,
"mean_token_accuracy": 0.798399469256401,
"step": 230
},
{
"epoch": 0.09953409572215163,
"grad_norm": 1.8211709634602606,
"learning_rate": 1.9831223628691984e-05,
"loss": 0.5529,
"mean_token_accuracy": 0.7973109990358352,
"step": 235
},
{
"epoch": 0.10165184243964422,
"grad_norm": 1.800221940229098,
"learning_rate": 1.9999901552991966e-05,
"loss": 0.5297,
"mean_token_accuracy": 0.8061769932508469,
"step": 240
},
{
"epoch": 0.10376958915713681,
"grad_norm": 3.118430462846644,
"learning_rate": 1.9999299939406875e-05,
"loss": 0.567,
"mean_token_accuracy": 0.7869770050048828,
"step": 245
},
{
"epoch": 0.1058873358746294,
"grad_norm": 6.373265842936888,
"learning_rate": 1.9998151437882874e-05,
"loss": 0.5194,
"mean_token_accuracy": 0.8079254150390625,
"step": 250
},
{
"epoch": 0.10800508259212198,
"grad_norm": 1.7643542086224073,
"learning_rate": 1.999645611123453e-05,
"loss": 0.5476,
"mean_token_accuracy": 0.8036489874124527,
"step": 255
},
{
"epoch": 0.11012282930961458,
"grad_norm": 1.7570808876197173,
"learning_rate": 1.999421405218369e-05,
"loss": 0.5183,
"mean_token_accuracy": 0.8039919883012772,
"step": 260
},
{
"epoch": 0.11224057602710716,
"grad_norm": 1.4650600928842654,
"learning_rate": 1.9991425383354462e-05,
"loss": 0.5575,
"mean_token_accuracy": 0.7989150047302246,
"step": 265
},
{
"epoch": 0.11435832274459974,
"grad_norm": 1.5715508311626518,
"learning_rate": 1.9988090257266442e-05,
"loss": 0.5276,
"mean_token_accuracy": 0.8024184852838516,
"step": 270
},
{
"epoch": 0.11647606946209234,
"grad_norm": 1.5012575844730074,
"learning_rate": 1.9984208856326433e-05,
"loss": 0.511,
"mean_token_accuracy": 0.810269170999527,
"step": 275
},
{
"epoch": 0.11859381617958492,
"grad_norm": 2.1674114266205553,
"learning_rate": 1.9979781392818424e-05,
"loss": 0.5069,
"mean_token_accuracy": 0.8049084335565567,
"step": 280
},
{
"epoch": 0.1207115628970775,
"grad_norm": 1.597566985653751,
"learning_rate": 1.9974808108892017e-05,
"loss": 0.5097,
"mean_token_accuracy": 0.810433080792427,
"step": 285
},
{
"epoch": 0.1228293096145701,
"grad_norm": 2.721798223377223,
"learning_rate": 1.9969289276549144e-05,
"loss": 0.526,
"mean_token_accuracy": 0.8058519691228867,
"step": 290
},
{
"epoch": 0.12494705633206268,
"grad_norm": 1.526771766492988,
"learning_rate": 1.9963225197629223e-05,
"loss": 0.5172,
"mean_token_accuracy": 0.8079220175743103,
"step": 295
},
{
"epoch": 0.12706480304955528,
"grad_norm": 1.3424112355487237,
"learning_rate": 1.9956616203792636e-05,
"loss": 0.5135,
"mean_token_accuracy": 0.806724363565445,
"step": 300
},
{
"epoch": 0.12918254976704785,
"grad_norm": 1.5824773036593809,
"learning_rate": 1.9949462656502588e-05,
"loss": 0.5383,
"mean_token_accuracy": 0.8001780599355698,
"step": 305
},
{
"epoch": 0.13130029648454045,
"grad_norm": 1.5157834737082827,
"learning_rate": 1.994176494700534e-05,
"loss": 0.5466,
"mean_token_accuracy": 0.7970251202583313,
"step": 310
},
{
"epoch": 0.13341804320203304,
"grad_norm": 1.8369627378901519,
"learning_rate": 1.993352349630882e-05,
"loss": 0.5218,
"mean_token_accuracy": 0.8072717070579529,
"step": 315
},
{
"epoch": 0.1355357899195256,
"grad_norm": 1.5676620169867563,
"learning_rate": 1.9924738755159573e-05,
"loss": 0.5116,
"mean_token_accuracy": 0.8025958120822907,
"step": 320
},
{
"epoch": 0.1376535366370182,
"grad_norm": 1.5442271717658778,
"learning_rate": 1.9915411204018137e-05,
"loss": 0.495,
"mean_token_accuracy": 0.8155842959880829,
"step": 325
},
{
"epoch": 0.1397712833545108,
"grad_norm": 1.9104862823035134,
"learning_rate": 1.9905541353032744e-05,
"loss": 0.4707,
"mean_token_accuracy": 0.8196403324604035,
"step": 330
},
{
"epoch": 0.14188903007200337,
"grad_norm": 1.8843041038781683,
"learning_rate": 1.9895129742011434e-05,
"loss": 0.5359,
"mean_token_accuracy": 0.8036209732294083,
"step": 335
},
{
"epoch": 0.14400677678949597,
"grad_norm": 1.2996290243783448,
"learning_rate": 1.9884176940392522e-05,
"loss": 0.5355,
"mean_token_accuracy": 0.7970023989677429,
"step": 340
},
{
"epoch": 0.14612452350698857,
"grad_norm": 1.7409691547169837,
"learning_rate": 1.9872683547213446e-05,
"loss": 0.5222,
"mean_token_accuracy": 0.8015773713588714,
"step": 345
},
{
"epoch": 0.14824227022448117,
"grad_norm": 1.3236145792783143,
"learning_rate": 1.9860650191078033e-05,
"loss": 0.5165,
"mean_token_accuracy": 0.8045854181051254,
"step": 350
},
{
"epoch": 0.15036001694197373,
"grad_norm": 1.5674402609006048,
"learning_rate": 1.9848077530122083e-05,
"loss": 0.5141,
"mean_token_accuracy": 0.8047444432973861,
"step": 355
},
{
"epoch": 0.15247776365946633,
"grad_norm": 1.4948547674340282,
"learning_rate": 1.98349662519774e-05,
"loss": 0.493,
"mean_token_accuracy": 0.8128765910863877,
"step": 360
},
{
"epoch": 0.15459551037695893,
"grad_norm": 1.57285942684427,
"learning_rate": 1.9821317073734173e-05,
"loss": 0.5114,
"mean_token_accuracy": 0.8024025142192841,
"step": 365
},
{
"epoch": 0.1567132570944515,
"grad_norm": 1.3725667498479879,
"learning_rate": 1.9807130741901756e-05,
"loss": 0.5552,
"mean_token_accuracy": 0.7975639194250107,
"step": 370
},
{
"epoch": 0.1588310038119441,
"grad_norm": 1.6323326415858614,
"learning_rate": 1.979240803236785e-05,
"loss": 0.5101,
"mean_token_accuracy": 0.8058428287506103,
"step": 375
},
{
"epoch": 0.1609487505294367,
"grad_norm": 1.293657741608038,
"learning_rate": 1.9777149750356044e-05,
"loss": 0.4931,
"mean_token_accuracy": 0.8156037211418152,
"step": 380
},
{
"epoch": 0.16306649724692926,
"grad_norm": 1.584456213127757,
"learning_rate": 1.9761356730381806e-05,
"loss": 0.5066,
"mean_token_accuracy": 0.8106210082769394,
"step": 385
},
{
"epoch": 0.16518424396442186,
"grad_norm": 1.3531024564685128,
"learning_rate": 1.9745029836206813e-05,
"loss": 0.4862,
"mean_token_accuracy": 0.8180296182632446,
"step": 390
},
{
"epoch": 0.16730199068191445,
"grad_norm": 1.5992771952291873,
"learning_rate": 1.9728169960791736e-05,
"loss": 0.5158,
"mean_token_accuracy": 0.8020082831382751,
"step": 395
},
{
"epoch": 0.16941973739940702,
"grad_norm": 1.3875752393035827,
"learning_rate": 1.9710778026247367e-05,
"loss": 0.5268,
"mean_token_accuracy": 0.8021057844161987,
"step": 400
},
{
"epoch": 0.17153748411689962,
"grad_norm": 1.4892475787998831,
"learning_rate": 1.9692854983784235e-05,
"loss": 0.5031,
"mean_token_accuracy": 0.8153967589139939,
"step": 405
},
{
"epoch": 0.17365523083439222,
"grad_norm": 1.3435721015179996,
"learning_rate": 1.9674401813660532e-05,
"loss": 0.5151,
"mean_token_accuracy": 0.8066144526004791,
"step": 410
},
{
"epoch": 0.17577297755188478,
"grad_norm": 1.4757784795296558,
"learning_rate": 1.9655419525128528e-05,
"loss": 0.5197,
"mean_token_accuracy": 0.8056324630975723,
"step": 415
},
{
"epoch": 0.17789072426937738,
"grad_norm": 1.8586890907074842,
"learning_rate": 1.9635909156379373e-05,
"loss": 0.4817,
"mean_token_accuracy": 0.8227346748113632,
"step": 420
},
{
"epoch": 0.18000847098686998,
"grad_norm": 1.3338010634125226,
"learning_rate": 1.9615871774486293e-05,
"loss": 0.476,
"mean_token_accuracy": 0.8171389639377594,
"step": 425
},
{
"epoch": 0.18212621770436255,
"grad_norm": 1.467996639944381,
"learning_rate": 1.959530847534627e-05,
"loss": 0.4857,
"mean_token_accuracy": 0.8151497721672059,
"step": 430
},
{
"epoch": 0.18424396442185514,
"grad_norm": 1.482953746737999,
"learning_rate": 1.9574220383620054e-05,
"loss": 0.4922,
"mean_token_accuracy": 0.8100210309028626,
"step": 435
},
{
"epoch": 0.18636171113934774,
"grad_norm": 5.208401516653082,
"learning_rate": 1.95526086526707e-05,
"loss": 0.5263,
"mean_token_accuracy": 0.8080328673124313,
"step": 440
},
{
"epoch": 0.1884794578568403,
"grad_norm": 1.5834873689672437,
"learning_rate": 1.9530474464500445e-05,
"loss": 0.514,
"mean_token_accuracy": 0.8094299465417862,
"step": 445
},
{
"epoch": 0.1905972045743329,
"grad_norm": 1.3405671636751928,
"learning_rate": 1.9507819029686094e-05,
"loss": 0.5119,
"mean_token_accuracy": 0.8087350040674209,
"step": 450
},
{
"epoch": 0.1927149512918255,
"grad_norm": 1.3993020572279387,
"learning_rate": 1.94846435873128e-05,
"loss": 0.5153,
"mean_token_accuracy": 0.8082747459411621,
"step": 455
},
{
"epoch": 0.19483269800931807,
"grad_norm": 1.3011551512989479,
"learning_rate": 1.9460949404906285e-05,
"loss": 0.5028,
"mean_token_accuracy": 0.8120961904525756,
"step": 460
},
{
"epoch": 0.19695044472681067,
"grad_norm": 1.6479875272294309,
"learning_rate": 1.9436737778363526e-05,
"loss": 0.4787,
"mean_token_accuracy": 0.8184203952550888,
"step": 465
},
{
"epoch": 0.19906819144430327,
"grad_norm": 1.2952323822215526,
"learning_rate": 1.9412010031881884e-05,
"loss": 0.4811,
"mean_token_accuracy": 0.8196297824382782,
"step": 470
},
{
"epoch": 0.20118593816179586,
"grad_norm": 1.2434980503550659,
"learning_rate": 1.9386767517886666e-05,
"loss": 0.4992,
"mean_token_accuracy": 0.8126247316598892,
"step": 475
},
{
"epoch": 0.20330368487928843,
"grad_norm": 1.2749730489780189,
"learning_rate": 1.9361011616957165e-05,
"loss": 0.5013,
"mean_token_accuracy": 0.8094296991825104,
"step": 480
},
{
"epoch": 0.20542143159678103,
"grad_norm": 1.2801081991950354,
"learning_rate": 1.933474373775115e-05,
"loss": 0.4914,
"mean_token_accuracy": 0.8103417336940766,
"step": 485
},
{
"epoch": 0.20753917831427363,
"grad_norm": 1.3841139586738282,
"learning_rate": 1.930796531692783e-05,
"loss": 0.503,
"mean_token_accuracy": 0.8150111019611359,
"step": 490
},
{
"epoch": 0.2096569250317662,
"grad_norm": 1.2895819374549709,
"learning_rate": 1.9280677819069273e-05,
"loss": 0.4938,
"mean_token_accuracy": 0.8058139503002166,
"step": 495
},
{
"epoch": 0.2117746717492588,
"grad_norm": 1.2705506609214867,
"learning_rate": 1.9252882736600302e-05,
"loss": 0.5041,
"mean_token_accuracy": 0.8078715801239014,
"step": 500
},
{
"epoch": 0.2138924184667514,
"grad_norm": 1.3700128773821674,
"learning_rate": 1.922458158970688e-05,
"loss": 0.5122,
"mean_token_accuracy": 0.805089196562767,
"step": 505
},
{
"epoch": 0.21601016518424396,
"grad_norm": 1.4292612681859336,
"learning_rate": 1.9195775926252952e-05,
"loss": 0.4799,
"mean_token_accuracy": 0.8134547978639602,
"step": 510
},
{
"epoch": 0.21812791190173655,
"grad_norm": 2.589810653355124,
"learning_rate": 1.91664673216958e-05,
"loss": 0.4686,
"mean_token_accuracy": 0.8232874065637589,
"step": 515
},
{
"epoch": 0.22024565861922915,
"grad_norm": 1.4425686621750156,
"learning_rate": 1.913665737899988e-05,
"loss": 0.4885,
"mean_token_accuracy": 0.815599313378334,
"step": 520
},
{
"epoch": 0.22236340533672172,
"grad_norm": 1.4823410740282665,
"learning_rate": 1.9106347728549134e-05,
"loss": 0.4832,
"mean_token_accuracy": 0.8109551817178726,
"step": 525
},
{
"epoch": 0.22448115205421432,
"grad_norm": 1.1459009249468546,
"learning_rate": 1.9075540028057844e-05,
"loss": 0.5156,
"mean_token_accuracy": 0.8015700995922088,
"step": 530
},
{
"epoch": 0.2265988987717069,
"grad_norm": 1.273350806844229,
"learning_rate": 1.9044235962479945e-05,
"loss": 0.4901,
"mean_token_accuracy": 0.8163118690252305,
"step": 535
},
{
"epoch": 0.22871664548919948,
"grad_norm": 1.2736969034780394,
"learning_rate": 1.9012437243916895e-05,
"loss": 0.475,
"mean_token_accuracy": 0.8155727684497833,
"step": 540
},
{
"epoch": 0.23083439220669208,
"grad_norm": 1.1644155017049156,
"learning_rate": 1.8980145611523996e-05,
"loss": 0.5041,
"mean_token_accuracy": 0.8130400031805038,
"step": 545
},
{
"epoch": 0.23295213892418468,
"grad_norm": 1.3543018612133357,
"learning_rate": 1.8947362831415327e-05,
"loss": 0.4668,
"mean_token_accuracy": 0.8260669410228729,
"step": 550
},
{
"epoch": 0.23506988564167725,
"grad_norm": 1.2391111005758269,
"learning_rate": 1.8914090696567104e-05,
"loss": 0.4809,
"mean_token_accuracy": 0.8127309769392014,
"step": 555
},
{
"epoch": 0.23718763235916984,
"grad_norm": 2.2015980143710583,
"learning_rate": 1.888033102671965e-05,
"loss": 0.4922,
"mean_token_accuracy": 0.8155588954687119,
"step": 560
},
{
"epoch": 0.23930537907666244,
"grad_norm": 1.2198454979455773,
"learning_rate": 1.884608566827785e-05,
"loss": 0.5168,
"mean_token_accuracy": 0.8062847316265106,
"step": 565
},
{
"epoch": 0.241423125794155,
"grad_norm": 1.184969374617232,
"learning_rate": 1.8811356494210166e-05,
"loss": 0.4805,
"mean_token_accuracy": 0.8132707148790359,
"step": 570
},
{
"epoch": 0.2435408725116476,
"grad_norm": 1.187126766493632,
"learning_rate": 1.8776145403946226e-05,
"loss": 0.4955,
"mean_token_accuracy": 0.8102918237447738,
"step": 575
},
{
"epoch": 0.2456586192291402,
"grad_norm": 1.3821096957818944,
"learning_rate": 1.874045432327289e-05,
"loss": 0.4985,
"mean_token_accuracy": 0.8098550081253052,
"step": 580
},
{
"epoch": 0.24777636594663277,
"grad_norm": 1.214604218577671,
"learning_rate": 1.8704285204228973e-05,
"loss": 0.4627,
"mean_token_accuracy": 0.8165160745382309,
"step": 585
},
{
"epoch": 0.24989411266412537,
"grad_norm": 1.4526314855211653,
"learning_rate": 1.866764002499846e-05,
"loss": 0.4909,
"mean_token_accuracy": 0.8122711658477784,
"step": 590
},
{
"epoch": 0.25201185938161796,
"grad_norm": 1.1543877428891598,
"learning_rate": 1.8630520789802308e-05,
"loss": 0.4782,
"mean_token_accuracy": 0.8182896554470063,
"step": 595
},
{
"epoch": 0.25412960609911056,
"grad_norm": 1.3086338857944744,
"learning_rate": 1.8592929528788844e-05,
"loss": 0.4753,
"mean_token_accuracy": 0.8180733859539032,
"step": 600
},
{
"epoch": 0.25624735281660316,
"grad_norm": 1.3557276365311686,
"learning_rate": 1.8554868297922728e-05,
"loss": 0.4708,
"mean_token_accuracy": 0.8193376958370209,
"step": 605
},
{
"epoch": 0.2583650995340957,
"grad_norm": 1.2996719117152657,
"learning_rate": 1.8516339178872492e-05,
"loss": 0.4518,
"mean_token_accuracy": 0.8204487860202789,
"step": 610
},
{
"epoch": 0.2604828462515883,
"grad_norm": 1.3696724777806233,
"learning_rate": 1.8477344278896708e-05,
"loss": 0.5072,
"mean_token_accuracy": 0.8076569020748139,
"step": 615
},
{
"epoch": 0.2626005929690809,
"grad_norm": 1.2308629288247015,
"learning_rate": 1.8437885730728738e-05,
"loss": 0.5113,
"mean_token_accuracy": 0.8088377475738525,
"step": 620
},
{
"epoch": 0.2647183396865735,
"grad_norm": 1.2397238918015017,
"learning_rate": 1.839796569246006e-05,
"loss": 0.494,
"mean_token_accuracy": 0.8118572622537613,
"step": 625
},
{
"epoch": 0.2668360864040661,
"grad_norm": 1.3479748389387212,
"learning_rate": 1.8357586347422266e-05,
"loss": 0.5081,
"mean_token_accuracy": 0.8135558038949966,
"step": 630
},
{
"epoch": 0.2689538331215587,
"grad_norm": 1.1063564395200467,
"learning_rate": 1.8316749904067637e-05,
"loss": 0.4653,
"mean_token_accuracy": 0.8218313783407212,
"step": 635
},
{
"epoch": 0.2710715798390512,
"grad_norm": 1.1492824512346658,
"learning_rate": 1.8275458595848376e-05,
"loss": 0.4817,
"mean_token_accuracy": 0.8135390222072602,
"step": 640
},
{
"epoch": 0.2731893265565438,
"grad_norm": 1.4159749106872088,
"learning_rate": 1.8233714681094405e-05,
"loss": 0.4616,
"mean_token_accuracy": 0.8250806093215942,
"step": 645
},
{
"epoch": 0.2753070732740364,
"grad_norm": 1.1611107224498594,
"learning_rate": 1.819152044288992e-05,
"loss": 0.488,
"mean_token_accuracy": 0.8166846603155136,
"step": 650
},
{
"epoch": 0.277424819991529,
"grad_norm": 1.3205339840836507,
"learning_rate": 1.814887818894846e-05,
"loss": 0.5036,
"mean_token_accuracy": 0.810426139831543,
"step": 655
},
{
"epoch": 0.2795425667090216,
"grad_norm": 1.2642547117014469,
"learning_rate": 1.810579025148674e-05,
"loss": 0.5063,
"mean_token_accuracy": 0.8112012058496475,
"step": 660
},
{
"epoch": 0.2816603134265142,
"grad_norm": 5.33401159048522,
"learning_rate": 1.8062258987097062e-05,
"loss": 0.4478,
"mean_token_accuracy": 0.8289118260145187,
"step": 665
},
{
"epoch": 0.28377806014400675,
"grad_norm": 1.3752087188227111,
"learning_rate": 1.8018286776618446e-05,
"loss": 0.4963,
"mean_token_accuracy": 0.8137694984674454,
"step": 670
},
{
"epoch": 0.28589580686149935,
"grad_norm": 1.176266427707403,
"learning_rate": 1.7973876025006407e-05,
"loss": 0.4976,
"mean_token_accuracy": 0.8188654541969299,
"step": 675
},
{
"epoch": 0.28801355357899194,
"grad_norm": 1.331341038204072,
"learning_rate": 1.792902916120143e-05,
"loss": 0.4939,
"mean_token_accuracy": 0.8163222283124923,
"step": 680
},
{
"epoch": 0.29013130029648454,
"grad_norm": 1.1914829607255677,
"learning_rate": 1.7883748637996113e-05,
"loss": 0.4881,
"mean_token_accuracy": 0.8130565702915191,
"step": 685
},
{
"epoch": 0.29224904701397714,
"grad_norm": 1.2277506964948814,
"learning_rate": 1.7838036931901033e-05,
"loss": 0.4559,
"mean_token_accuracy": 0.824514701962471,
"step": 690
},
{
"epoch": 0.29436679373146973,
"grad_norm": 1.0800320597549389,
"learning_rate": 1.7791896543009282e-05,
"loss": 0.4891,
"mean_token_accuracy": 0.8174144089221954,
"step": 695
},
{
"epoch": 0.29648454044896233,
"grad_norm": 1.5694294317697621,
"learning_rate": 1.7745329994859746e-05,
"loss": 0.4914,
"mean_token_accuracy": 0.8185641199350357,
"step": 700
},
{
"epoch": 0.29860228716645487,
"grad_norm": 1.1923041867729132,
"learning_rate": 1.7698339834299064e-05,
"loss": 0.5008,
"mean_token_accuracy": 0.8142161637544632,
"step": 705
},
{
"epoch": 0.30072003388394747,
"grad_norm": 1.3729946102267174,
"learning_rate": 1.7650928631342364e-05,
"loss": 0.4845,
"mean_token_accuracy": 0.8133604645729064,
"step": 710
},
{
"epoch": 0.30283778060144007,
"grad_norm": 1.174456646604131,
"learning_rate": 1.7603098979032683e-05,
"loss": 0.4777,
"mean_token_accuracy": 0.813685166835785,
"step": 715
},
{
"epoch": 0.30495552731893266,
"grad_norm": 1.158532302748484,
"learning_rate": 1.7554853493299142e-05,
"loss": 0.504,
"mean_token_accuracy": 0.8088937163352966,
"step": 720
},
{
"epoch": 0.30707327403642526,
"grad_norm": 1.2620596837516858,
"learning_rate": 1.7506194812813896e-05,
"loss": 0.4817,
"mean_token_accuracy": 0.8206409096717835,
"step": 725
},
{
"epoch": 0.30919102075391786,
"grad_norm": 1.148012521360775,
"learning_rate": 1.74571255988478e-05,
"loss": 0.4819,
"mean_token_accuracy": 0.812398812174797,
"step": 730
},
{
"epoch": 0.3113087674714104,
"grad_norm": 1.2373133691587057,
"learning_rate": 1.740764853512485e-05,
"loss": 0.49,
"mean_token_accuracy": 0.8143349289894104,
"step": 735
},
{
"epoch": 0.313426514188903,
"grad_norm": 2.100740115519466,
"learning_rate": 1.7357766327675433e-05,
"loss": 0.4651,
"mean_token_accuracy": 0.8216336488723754,
"step": 740
},
{
"epoch": 0.3155442609063956,
"grad_norm": 1.4189894877798284,
"learning_rate": 1.73074817046883e-05,
"loss": 0.4801,
"mean_token_accuracy": 0.8188165038824081,
"step": 745
},
{
"epoch": 0.3176620076238882,
"grad_norm": 1.2994480429040771,
"learning_rate": 1.725679741636136e-05,
"loss": 0.4614,
"mean_token_accuracy": 0.8237657248973846,
"step": 750
},
{
"epoch": 0.3197797543413808,
"grad_norm": 1.2308603791930401,
"learning_rate": 1.720571623475128e-05,
"loss": 0.492,
"mean_token_accuracy": 0.8165101200342179,
"step": 755
},
{
"epoch": 0.3218975010588734,
"grad_norm": 1.3843077010151197,
"learning_rate": 1.7154240953621844e-05,
"loss": 0.4564,
"mean_token_accuracy": 0.825025874376297,
"step": 760
},
{
"epoch": 0.3240152477763659,
"grad_norm": 1.1848129565884666,
"learning_rate": 1.7102374388291182e-05,
"loss": 0.4575,
"mean_token_accuracy": 0.8252220988273621,
"step": 765
},
{
"epoch": 0.3261329944938585,
"grad_norm": 1.3217187216198285,
"learning_rate": 1.705011937547779e-05,
"loss": 0.4629,
"mean_token_accuracy": 0.8198304086923599,
"step": 770
},
{
"epoch": 0.3282507412113511,
"grad_norm": 1.3851637896221318,
"learning_rate": 1.6997478773145363e-05,
"loss": 0.4337,
"mean_token_accuracy": 0.8338131695985794,
"step": 775
},
{
"epoch": 0.3303684879288437,
"grad_norm": 1.423775789920787,
"learning_rate": 1.6944455460346503e-05,
"loss": 0.4807,
"mean_token_accuracy": 0.8188902169466019,
"step": 780
},
{
"epoch": 0.3324862346463363,
"grad_norm": 1.3680154210297841,
"learning_rate": 1.6891052337065256e-05,
"loss": 0.4841,
"mean_token_accuracy": 0.8188378721475601,
"step": 785
},
{
"epoch": 0.3346039813638289,
"grad_norm": 1.1670007538420892,
"learning_rate": 1.6837272324058487e-05,
"loss": 0.4209,
"mean_token_accuracy": 0.8359328061342239,
"step": 790
},
{
"epoch": 0.33672172808132145,
"grad_norm": 1.2238185684348435,
"learning_rate": 1.6783118362696162e-05,
"loss": 0.4687,
"mean_token_accuracy": 0.8194981902837754,
"step": 795
},
{
"epoch": 0.33883947479881404,
"grad_norm": 1.3104844364549155,
"learning_rate": 1.672859341480046e-05,
"loss": 0.4605,
"mean_token_accuracy": 0.8169092148542404,
"step": 800
},
{
"epoch": 0.34095722151630664,
"grad_norm": 1.1074420443801423,
"learning_rate": 1.6673700462483776e-05,
"loss": 0.4424,
"mean_token_accuracy": 0.8315922617912292,
"step": 805
},
{
"epoch": 0.34307496823379924,
"grad_norm": 1.2002465546594834,
"learning_rate": 1.661844250798565e-05,
"loss": 0.4773,
"mean_token_accuracy": 0.8234172344207764,
"step": 810
},
{
"epoch": 0.34519271495129183,
"grad_norm": 1.3643314568341807,
"learning_rate": 1.6562822573508533e-05,
"loss": 0.4803,
"mean_token_accuracy": 0.8155502796173095,
"step": 815
},
{
"epoch": 0.34731046166878443,
"grad_norm": 1.1653511889703811,
"learning_rate": 1.650684370105252e-05,
"loss": 0.4907,
"mean_token_accuracy": 0.8095988690853119,
"step": 820
},
{
"epoch": 0.34942820838627703,
"grad_norm": 1.2052540958169133,
"learning_rate": 1.6450508952248957e-05,
"loss": 0.4664,
"mean_token_accuracy": 0.8265933513641357,
"step": 825
},
{
"epoch": 0.35154595510376957,
"grad_norm": 1.5477552328113091,
"learning_rate": 1.6393821408193007e-05,
"loss": 0.4783,
"mean_token_accuracy": 0.8169477820396424,
"step": 830
},
{
"epoch": 0.35366370182126217,
"grad_norm": 1.8070494772139423,
"learning_rate": 1.6336784169275132e-05,
"loss": 0.454,
"mean_token_accuracy": 0.8248355984687805,
"step": 835
},
{
"epoch": 0.35578144853875476,
"grad_norm": 1.2257376390653825,
"learning_rate": 1.627940035501152e-05,
"loss": 0.4506,
"mean_token_accuracy": 0.8257219165563583,
"step": 840
},
{
"epoch": 0.35789919525624736,
"grad_norm": 1.3198794046839721,
"learning_rate": 1.6221673103873474e-05,
"loss": 0.4427,
"mean_token_accuracy": 0.8296634495258332,
"step": 845
},
{
"epoch": 0.36001694197373996,
"grad_norm": 2.109231295857473,
"learning_rate": 1.616360557311575e-05,
"loss": 0.489,
"mean_token_accuracy": 0.8102859228849411,
"step": 850
},
{
"epoch": 0.36213468869123255,
"grad_norm": 1.1872292152679083,
"learning_rate": 1.6105200938603917e-05,
"loss": 0.4681,
"mean_token_accuracy": 0.8261395335197449,
"step": 855
},
{
"epoch": 0.3642524354087251,
"grad_norm": 1.214005452933459,
"learning_rate": 1.60464623946406e-05,
"loss": 0.4852,
"mean_token_accuracy": 0.8179385870695114,
"step": 860
},
{
"epoch": 0.3663701821262177,
"grad_norm": 1.0907256335398452,
"learning_rate": 1.5987393153790832e-05,
"loss": 0.4623,
"mean_token_accuracy": 0.8248693764209747,
"step": 865
},
{
"epoch": 0.3684879288437103,
"grad_norm": 1.061691508146564,
"learning_rate": 1.5927996446706308e-05,
"loss": 0.4803,
"mean_token_accuracy": 0.8169174045324326,
"step": 870
},
{
"epoch": 0.3706056755612029,
"grad_norm": 1.1759352091149649,
"learning_rate": 1.5868275521948726e-05,
"loss": 0.4563,
"mean_token_accuracy": 0.8279780805110931,
"step": 875
},
{
"epoch": 0.3727234222786955,
"grad_norm": 1.2135030886876705,
"learning_rate": 1.5808233645812087e-05,
"loss": 0.4418,
"mean_token_accuracy": 0.8301020473241806,
"step": 880
},
{
"epoch": 0.3748411689961881,
"grad_norm": 1.1266881444254488,
"learning_rate": 1.5747874102144073e-05,
"loss": 0.4626,
"mean_token_accuracy": 0.8214969336986542,
"step": 885
},
{
"epoch": 0.3769589157136806,
"grad_norm": 1.0911244736489776,
"learning_rate": 1.5687200192166424e-05,
"loss": 0.4635,
"mean_token_accuracy": 0.8221491903066636,
"step": 890
},
{
"epoch": 0.3790766624311732,
"grad_norm": 1.0852849507203284,
"learning_rate": 1.5626215234294416e-05,
"loss": 0.451,
"mean_token_accuracy": 0.8251518607139587,
"step": 895
},
{
"epoch": 0.3811944091486658,
"grad_norm": 1.1215853338868707,
"learning_rate": 1.5564922563955337e-05,
"loss": 0.4608,
"mean_token_accuracy": 0.8237892210483551,
"step": 900
},
{
"epoch": 0.3833121558661584,
"grad_norm": 0.9235255903522734,
"learning_rate": 1.5503325533406076e-05,
"loss": 0.4676,
"mean_token_accuracy": 0.8222286731004715,
"step": 905
},
{
"epoch": 0.385429902583651,
"grad_norm": 1.0494173037764836,
"learning_rate": 1.5441427511549795e-05,
"loss": 0.4652,
"mean_token_accuracy": 0.8235789179801941,
"step": 910
},
{
"epoch": 0.3875476493011436,
"grad_norm": 1.2934333868332708,
"learning_rate": 1.537923188375164e-05,
"loss": 0.459,
"mean_token_accuracy": 0.8253506690263748,
"step": 915
},
{
"epoch": 0.38966539601863615,
"grad_norm": 1.045643086378396,
"learning_rate": 1.5316742051653624e-05,
"loss": 0.4487,
"mean_token_accuracy": 0.8300421804189682,
"step": 920
},
{
"epoch": 0.39178314273612874,
"grad_norm": 1.0549731687620314,
"learning_rate": 1.5253961432988548e-05,
"loss": 0.4756,
"mean_token_accuracy": 0.8141780078411103,
"step": 925
},
{
"epoch": 0.39390088945362134,
"grad_norm": 1.1263426428393677,
"learning_rate": 1.5190893461393108e-05,
"loss": 0.4698,
"mean_token_accuracy": 0.8173887878656387,
"step": 930
},
{
"epoch": 0.39601863617111394,
"grad_norm": 1.1982411204873675,
"learning_rate": 1.5127541586220077e-05,
"loss": 0.4595,
"mean_token_accuracy": 0.8246693462133408,
"step": 935
},
{
"epoch": 0.39813638288860653,
"grad_norm": 1.331125977750805,
"learning_rate": 1.5063909272349664e-05,
"loss": 0.466,
"mean_token_accuracy": 0.8266402333974838,
"step": 940
},
{
"epoch": 0.40025412960609913,
"grad_norm": 1.165754254305497,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.435,
"mean_token_accuracy": 0.8271202713251113,
"step": 945
},
{
"epoch": 0.4023718763235917,
"grad_norm": 1.1585938088360928,
"learning_rate": 1.4935817264536809e-05,
"loss": 0.4386,
"mean_token_accuracy": 0.8255492657423019,
"step": 950
},
{
"epoch": 0.40448962304108427,
"grad_norm": 1.1542702135186313,
"learning_rate": 1.4871364576282223e-05,
"loss": 0.4769,
"mean_token_accuracy": 0.8163278847932816,
"step": 955
},
{
"epoch": 0.40660736975857686,
"grad_norm": 1.1855267108232739,
"learning_rate": 1.4806645460322804e-05,
"loss": 0.4938,
"mean_token_accuracy": 0.8140994518995285,
"step": 960
},
{
"epoch": 0.40872511647606946,
"grad_norm": 1.0583179034757253,
"learning_rate": 1.4741663456316742e-05,
"loss": 0.4694,
"mean_token_accuracy": 0.8194496780633926,
"step": 965
},
{
"epoch": 0.41084286319356206,
"grad_norm": 1.2166297794886325,
"learning_rate": 1.4676422118300266e-05,
"loss": 0.4583,
"mean_token_accuracy": 0.8240072697401046,
"step": 970
},
{
"epoch": 0.41296060991105465,
"grad_norm": 1.2077033819076497,
"learning_rate": 1.461092501449326e-05,
"loss": 0.4683,
"mean_token_accuracy": 0.8127462983131408,
"step": 975
},
{
"epoch": 0.41507835662854725,
"grad_norm": 1.2024839451726628,
"learning_rate": 1.4545175727104113e-05,
"loss": 0.4746,
"mean_token_accuracy": 0.817327806353569,
"step": 980
},
{
"epoch": 0.4171961033460398,
"grad_norm": 43.895890122529586,
"learning_rate": 1.4479177852133787e-05,
"loss": 0.4339,
"mean_token_accuracy": 0.83043053150177,
"step": 985
},
{
"epoch": 0.4193138500635324,
"grad_norm": 1.3761452239892333,
"learning_rate": 1.4412934999179169e-05,
"loss": 0.4682,
"mean_token_accuracy": 0.82216075360775,
"step": 990
},
{
"epoch": 0.421431596781025,
"grad_norm": 9.553572882081992,
"learning_rate": 1.4346450791235611e-05,
"loss": 0.425,
"mean_token_accuracy": 0.8346862554550171,
"step": 995
},
{
"epoch": 0.4235493434985176,
"grad_norm": 1.19535922636142,
"learning_rate": 1.427972886449882e-05,
"loss": 0.4916,
"mean_token_accuracy": 0.8201052099466324,
"step": 1000
},
{
"epoch": 0.4256670902160102,
"grad_norm": 1.487407000401354,
"learning_rate": 1.4212772868165957e-05,
"loss": 0.4759,
"mean_token_accuracy": 0.8201690822839737,
"step": 1005
},
{
"epoch": 0.4277848369335028,
"grad_norm": 1.2209557581398112,
"learning_rate": 1.4145586464236074e-05,
"loss": 0.4776,
"mean_token_accuracy": 0.8144995361566544,
"step": 1010
},
{
"epoch": 0.4299025836509953,
"grad_norm": 1.4175123984588354,
"learning_rate": 1.4078173327309807e-05,
"loss": 0.4697,
"mean_token_accuracy": 0.820775744318962,
"step": 1015
},
{
"epoch": 0.4320203303684879,
"grad_norm": 1.2129818965934513,
"learning_rate": 1.4010537144388416e-05,
"loss": 0.463,
"mean_token_accuracy": 0.8259893089532853,
"step": 1020
},
{
"epoch": 0.4341380770859805,
"grad_norm": 1.12010970838833,
"learning_rate": 1.3942681614672144e-05,
"loss": 0.4629,
"mean_token_accuracy": 0.8218669801950454,
"step": 1025
},
{
"epoch": 0.4362558238034731,
"grad_norm": 1.1464961804103622,
"learning_rate": 1.3874610449357873e-05,
"loss": 0.4238,
"mean_token_accuracy": 0.8335713416337966,
"step": 1030
},
{
"epoch": 0.4383735705209657,
"grad_norm": 1.1351310993680606,
"learning_rate": 1.3806327371436159e-05,
"loss": 0.4394,
"mean_token_accuracy": 0.8307629436254501,
"step": 1035
},
{
"epoch": 0.4404913172384583,
"grad_norm": 1.1188266853744508,
"learning_rate": 1.3737836115487624e-05,
"loss": 0.4663,
"mean_token_accuracy": 0.8193978488445282,
"step": 1040
},
{
"epoch": 0.44260906395595084,
"grad_norm": 1.1620199858915772,
"learning_rate": 1.3669140427478693e-05,
"loss": 0.4705,
"mean_token_accuracy": 0.8229668527841568,
"step": 1045
},
{
"epoch": 0.44472681067344344,
"grad_norm": 1.110101616240863,
"learning_rate": 1.3600244064556702e-05,
"loss": 0.4747,
"mean_token_accuracy": 0.8179006308317185,
"step": 1050
},
{
"epoch": 0.44684455739093604,
"grad_norm": 1.2783615446297392,
"learning_rate": 1.353115079484444e-05,
"loss": 0.4458,
"mean_token_accuracy": 0.8308207571506501,
"step": 1055
},
{
"epoch": 0.44896230410842863,
"grad_norm": 1.1007302332610067,
"learning_rate": 1.3461864397234041e-05,
"loss": 0.4598,
"mean_token_accuracy": 0.8242943733930588,
"step": 1060
},
{
"epoch": 0.45108005082592123,
"grad_norm": 1.2199483732995027,
"learning_rate": 1.3392388661180303e-05,
"loss": 0.445,
"mean_token_accuracy": 0.824502220749855,
"step": 1065
},
{
"epoch": 0.4531977975434138,
"grad_norm": 1.0010509955815885,
"learning_rate": 1.332272738649345e-05,
"loss": 0.4583,
"mean_token_accuracy": 0.8303744524717331,
"step": 1070
},
{
"epoch": 0.45531554426090637,
"grad_norm": 1.918284418636839,
"learning_rate": 1.325288438313129e-05,
"loss": 0.4269,
"mean_token_accuracy": 0.8296439230442048,
"step": 1075
},
{
"epoch": 0.45743329097839897,
"grad_norm": 1.1887902164021535,
"learning_rate": 1.318286347099086e-05,
"loss": 0.4625,
"mean_token_accuracy": 0.8217881232500076,
"step": 1080
},
{
"epoch": 0.45955103769589156,
"grad_norm": 1.1360766453253965,
"learning_rate": 1.3112668479699486e-05,
"loss": 0.4589,
"mean_token_accuracy": 0.8269425123929978,
"step": 1085
},
{
"epoch": 0.46166878441338416,
"grad_norm": 1.2399254503178083,
"learning_rate": 1.3042303248405346e-05,
"loss": 0.4555,
"mean_token_accuracy": 0.8309968203306198,
"step": 1090
},
{
"epoch": 0.46378653113087676,
"grad_norm": 1.0508779611719044,
"learning_rate": 1.297177162556748e-05,
"loss": 0.4545,
"mean_token_accuracy": 0.824161484837532,
"step": 1095
},
{
"epoch": 0.46590427784836935,
"grad_norm": 1.0822262810815348,
"learning_rate": 1.2901077468745329e-05,
"loss": 0.4571,
"mean_token_accuracy": 0.8281063556671142,
"step": 1100
},
{
"epoch": 0.46802202456586195,
"grad_norm": 1.0744745429140576,
"learning_rate": 1.2830224644387742e-05,
"loss": 0.471,
"mean_token_accuracy": 0.8183866649866104,
"step": 1105
},
{
"epoch": 0.4701397712833545,
"grad_norm": 1.2108459211634035,
"learning_rate": 1.2759217027621507e-05,
"loss": 0.4445,
"mean_token_accuracy": 0.8313823521137238,
"step": 1110
},
{
"epoch": 0.4722575180008471,
"grad_norm": 1.1385271166035913,
"learning_rate": 1.2688058502039416e-05,
"loss": 0.4724,
"mean_token_accuracy": 0.8208224922418594,
"step": 1115
},
{
"epoch": 0.4743752647183397,
"grad_norm": 1.1608922255857643,
"learning_rate": 1.261675295948786e-05,
"loss": 0.4402,
"mean_token_accuracy": 0.8260656505823135,
"step": 1120
},
{
"epoch": 0.4764930114358323,
"grad_norm": 1.2001870807148136,
"learning_rate": 1.2545304299853977e-05,
"loss": 0.4676,
"mean_token_accuracy": 0.8217555999755859,
"step": 1125
},
{
"epoch": 0.4786107581533249,
"grad_norm": 1.099496727008847,
"learning_rate": 1.2473716430852353e-05,
"loss": 0.436,
"mean_token_accuracy": 0.8312188684940338,
"step": 1130
},
{
"epoch": 0.4807285048708175,
"grad_norm": 2.032998570634967,
"learning_rate": 1.2401993267811293e-05,
"loss": 0.4317,
"mean_token_accuracy": 0.8295620054006576,
"step": 1135
},
{
"epoch": 0.48284625158831,
"grad_norm": 1.1812725212971202,
"learning_rate": 1.2330138733458693e-05,
"loss": 0.4156,
"mean_token_accuracy": 0.8353513538837433,
"step": 1140
},
{
"epoch": 0.4849639983058026,
"grad_norm": 1.138821301405385,
"learning_rate": 1.2258156757707496e-05,
"loss": 0.4506,
"mean_token_accuracy": 0.8284595161676407,
"step": 1145
},
{
"epoch": 0.4870817450232952,
"grad_norm": 1.039456646381961,
"learning_rate": 1.2186051277440739e-05,
"loss": 0.4281,
"mean_token_accuracy": 0.8340547412633896,
"step": 1150
},
{
"epoch": 0.4891994917407878,
"grad_norm": 1.0935441587184827,
"learning_rate": 1.2113826236296245e-05,
"loss": 0.4368,
"mean_token_accuracy": 0.8294982463121414,
"step": 1155
},
{
"epoch": 0.4913172384582804,
"grad_norm": 1.0601849025025707,
"learning_rate": 1.2041485584450945e-05,
"loss": 0.4496,
"mean_token_accuracy": 0.8288684636354446,
"step": 1160
},
{
"epoch": 0.493434985175773,
"grad_norm": 1.1432826242197904,
"learning_rate": 1.1969033278404816e-05,
"loss": 0.472,
"mean_token_accuracy": 0.8184500396251678,
"step": 1165
},
{
"epoch": 0.49555273189326554,
"grad_norm": 1.178255399480397,
"learning_rate": 1.1896473280764498e-05,
"loss": 0.453,
"mean_token_accuracy": 0.82464899122715,
"step": 1170
},
{
"epoch": 0.49767047861075814,
"grad_norm": 1.2123556499205794,
"learning_rate": 1.1823809560026558e-05,
"loss": 0.442,
"mean_token_accuracy": 0.8262520909309388,
"step": 1175
},
{
"epoch": 0.49978822532825073,
"grad_norm": 1.490671459887953,
"learning_rate": 1.175104609036047e-05,
"loss": 0.4493,
"mean_token_accuracy": 0.8295370072126389,
"step": 1180
},
{
"epoch": 0.5019059720457433,
"grad_norm": 3.5058816478434993,
"learning_rate": 1.1678186851391218e-05,
"loss": 0.4593,
"mean_token_accuracy": 0.8269213020801545,
"step": 1185
},
{
"epoch": 0.5040237187632359,
"grad_norm": 1.1384716073513477,
"learning_rate": 1.1605235827981673e-05,
"loss": 0.4463,
"mean_token_accuracy": 0.8314786165952682,
"step": 1190
},
{
"epoch": 0.5061414654807285,
"grad_norm": 1.1752572701433124,
"learning_rate": 1.1532197010014636e-05,
"loss": 0.4453,
"mean_token_accuracy": 0.8288865953683853,
"step": 1195
},
{
"epoch": 0.5082592121982211,
"grad_norm": 1.0006379736398943,
"learning_rate": 1.1459074392174619e-05,
"loss": 0.4293,
"mean_token_accuracy": 0.8350226402282714,
"step": 1200
},
{
"epoch": 0.5103769589157137,
"grad_norm": 1.1784455736187447,
"learning_rate": 1.138587197372937e-05,
"loss": 0.4612,
"mean_token_accuracy": 0.8215854614973068,
"step": 1205
},
{
"epoch": 0.5124947056332063,
"grad_norm": 1.1048766566547503,
"learning_rate": 1.1312593758311143e-05,
"loss": 0.4279,
"mean_token_accuracy": 0.8407860666513443,
"step": 1210
},
{
"epoch": 0.5146124523506989,
"grad_norm": 1.0718700385713946,
"learning_rate": 1.1239243753697728e-05,
"loss": 0.4288,
"mean_token_accuracy": 0.8378984898328781,
"step": 1215
},
{
"epoch": 0.5167301990681914,
"grad_norm": 1.558568433227081,
"learning_rate": 1.1165825971593251e-05,
"loss": 0.4678,
"mean_token_accuracy": 0.825000548362732,
"step": 1220
},
{
"epoch": 0.518847945785684,
"grad_norm": 1.082392246698731,
"learning_rate": 1.1092344427408767e-05,
"loss": 0.4276,
"mean_token_accuracy": 0.8359992414712906,
"step": 1225
},
{
"epoch": 0.5209656925031766,
"grad_norm": 1.256334909576375,
"learning_rate": 1.1018803140042651e-05,
"loss": 0.4633,
"mean_token_accuracy": 0.8229638338088989,
"step": 1230
},
{
"epoch": 0.5230834392206692,
"grad_norm": 1.303814596864245,
"learning_rate": 1.0945206131660787e-05,
"loss": 0.469,
"mean_token_accuracy": 0.8193328499794006,
"step": 1235
},
{
"epoch": 0.5252011859381618,
"grad_norm": 1.0507039996160834,
"learning_rate": 1.0871557427476585e-05,
"loss": 0.4414,
"mean_token_accuracy": 0.8317544460296631,
"step": 1240
},
{
"epoch": 0.5273189326556544,
"grad_norm": 1.015866344156703,
"learning_rate": 1.0797861055530832e-05,
"loss": 0.428,
"mean_token_accuracy": 0.8305379122495651,
"step": 1245
},
{
"epoch": 0.529436679373147,
"grad_norm": 1.1624992956977676,
"learning_rate": 1.07241210464714e-05,
"loss": 0.467,
"mean_token_accuracy": 0.820591053366661,
"step": 1250
},
{
"epoch": 0.5315544260906395,
"grad_norm": 1.2782647412686758,
"learning_rate": 1.0650341433332778e-05,
"loss": 0.4689,
"mean_token_accuracy": 0.8219984292984008,
"step": 1255
},
{
"epoch": 0.5336721728081322,
"grad_norm": 1.1784870838731618,
"learning_rate": 1.0576526251315515e-05,
"loss": 0.4596,
"mean_token_accuracy": 0.8260756641626358,
"step": 1260
},
{
"epoch": 0.5357899195256247,
"grad_norm": 1.1204805080469906,
"learning_rate": 1.0502679537565507e-05,
"loss": 0.442,
"mean_token_accuracy": 0.8296466141939163,
"step": 1265
},
{
"epoch": 0.5379076662431174,
"grad_norm": 1.0718296420595828,
"learning_rate": 1.0428805330953209e-05,
"loss": 0.4215,
"mean_token_accuracy": 0.8308669030666351,
"step": 1270
},
{
"epoch": 0.5400254129606099,
"grad_norm": 1.1125024136410944,
"learning_rate": 1.0354907671852733e-05,
"loss": 0.4363,
"mean_token_accuracy": 0.8332655102014541,
"step": 1275
},
{
"epoch": 0.5421431596781024,
"grad_norm": 1.090167844275342,
"learning_rate": 1.0280990601920863e-05,
"loss": 0.4435,
"mean_token_accuracy": 0.8282716870307922,
"step": 1280
},
{
"epoch": 0.5442609063955951,
"grad_norm": 1.0290238619990948,
"learning_rate": 1.0207058163876021e-05,
"loss": 0.4413,
"mean_token_accuracy": 0.8311887979507446,
"step": 1285
},
{
"epoch": 0.5463786531130876,
"grad_norm": 1.0778232888370207,
"learning_rate": 1.013311440127714e-05,
"loss": 0.4386,
"mean_token_accuracy": 0.8266764581203461,
"step": 1290
},
{
"epoch": 0.5484963998305803,
"grad_norm": 1.1219731141973122,
"learning_rate": 1.0059163358302537e-05,
"loss": 0.4103,
"mean_token_accuracy": 0.8391000181436539,
"step": 1295
},
{
"epoch": 0.5506141465480728,
"grad_norm": 1.1468466517999107,
"learning_rate": 9.9852090795287e-06,
"loss": 0.4391,
"mean_token_accuracy": 0.8361193478107453,
"step": 1300
},
{
"epoch": 0.5527318932655655,
"grad_norm": 1.0284132663014267,
"learning_rate": 9.911255609709089e-06,
"loss": 0.4409,
"mean_token_accuracy": 0.8269284754991532,
"step": 1305
},
{
"epoch": 0.554849639983058,
"grad_norm": 1.0310999165822667,
"learning_rate": 9.83730699355294e-06,
"loss": 0.4071,
"mean_token_accuracy": 0.835135304927826,
"step": 1310
},
{
"epoch": 0.5569673867005506,
"grad_norm": 1.2728900066425748,
"learning_rate": 9.76336727550401e-06,
"loss": 0.4601,
"mean_token_accuracy": 0.8267913639545441,
"step": 1315
},
{
"epoch": 0.5590851334180432,
"grad_norm": 1.2269899407592741,
"learning_rate": 9.689440499519395e-06,
"loss": 0.4322,
"mean_token_accuracy": 0.8314703017473221,
"step": 1320
},
{
"epoch": 0.5612028801355358,
"grad_norm": 1.1418757049837882,
"learning_rate": 9.615530708848373e-06,
"loss": 0.4231,
"mean_token_accuracy": 0.8340400338172913,
"step": 1325
},
{
"epoch": 0.5633206268530284,
"grad_norm": 1.1108149486798655,
"learning_rate": 9.541641945811233e-06,
"loss": 0.4492,
"mean_token_accuracy": 0.8232677519321442,
"step": 1330
},
{
"epoch": 0.565438373570521,
"grad_norm": 1.1088127297572268,
"learning_rate": 9.467778251578217e-06,
"loss": 0.4549,
"mean_token_accuracy": 0.8236530691385269,
"step": 1335
},
{
"epoch": 0.5675561202880135,
"grad_norm": 0.9179664771961787,
"learning_rate": 9.393943665948478e-06,
"loss": 0.4763,
"mean_token_accuracy": 0.8244054973125458,
"step": 1340
},
{
"epoch": 0.5696738670055062,
"grad_norm": 1.1777867866273308,
"learning_rate": 9.320142227129158e-06,
"loss": 0.4348,
"mean_token_accuracy": 0.8331925332546234,
"step": 1345
},
{
"epoch": 0.5717916137229987,
"grad_norm": 1.0020743360016087,
"learning_rate": 9.246377971514504e-06,
"loss": 0.4161,
"mean_token_accuracy": 0.8360674440860748,
"step": 1350
},
{
"epoch": 0.5739093604404913,
"grad_norm": 1.346066080223308,
"learning_rate": 9.172654933465114e-06,
"loss": 0.448,
"mean_token_accuracy": 0.8250635206699372,
"step": 1355
},
{
"epoch": 0.5760271071579839,
"grad_norm": 1.3221207747875352,
"learning_rate": 9.0989771450873e-06,
"loss": 0.4228,
"mean_token_accuracy": 0.8357968628406525,
"step": 1360
},
{
"epoch": 0.5781448538754765,
"grad_norm": 1.1501989319658534,
"learning_rate": 9.025348636012537e-06,
"loss": 0.4411,
"mean_token_accuracy": 0.8290417343378067,
"step": 1365
},
{
"epoch": 0.5802626005929691,
"grad_norm": 1.1694331116554113,
"learning_rate": 8.951773433177095e-06,
"loss": 0.4343,
"mean_token_accuracy": 0.8303040146827698,
"step": 1370
},
{
"epoch": 0.5823803473104616,
"grad_norm": 1.2089472872967426,
"learning_rate": 8.878255560601781e-06,
"loss": 0.4285,
"mean_token_accuracy": 0.8339911371469497,
"step": 1375
},
{
"epoch": 0.5844980940279543,
"grad_norm": 1.1555334960481487,
"learning_rate": 8.804799039171863e-06,
"loss": 0.4225,
"mean_token_accuracy": 0.8346673488616944,
"step": 1380
},
{
"epoch": 0.5866158407454468,
"grad_norm": 0.9976941601020334,
"learning_rate": 8.731407886417155e-06,
"loss": 0.4538,
"mean_token_accuracy": 0.8272438108921051,
"step": 1385
},
{
"epoch": 0.5887335874629395,
"grad_norm": 1.0977726966561636,
"learning_rate": 8.658086116292283e-06,
"loss": 0.4297,
"mean_token_accuracy": 0.8334219962358475,
"step": 1390
},
{
"epoch": 0.590851334180432,
"grad_norm": 2.0194878160007987,
"learning_rate": 8.584837738957155e-06,
"loss": 0.4413,
"mean_token_accuracy": 0.8283408343791961,
"step": 1395
},
{
"epoch": 0.5929690808979247,
"grad_norm": 1.2186719145281468,
"learning_rate": 8.511666760557638e-06,
"loss": 0.4693,
"mean_token_accuracy": 0.8232256740331649,
"step": 1400
},
{
"epoch": 0.5950868276154172,
"grad_norm": 1.1198588684752515,
"learning_rate": 8.438577183006448e-06,
"loss": 0.4221,
"mean_token_accuracy": 0.8324928849935531,
"step": 1405
},
{
"epoch": 0.5972045743329097,
"grad_norm": 1.1215071963961742,
"learning_rate": 8.36557300376427e-06,
"loss": 0.4392,
"mean_token_accuracy": 0.8286356210708619,
"step": 1410
},
{
"epoch": 0.5993223210504024,
"grad_norm": 1.107475266800191,
"learning_rate": 8.292658215621139e-06,
"loss": 0.4344,
"mean_token_accuracy": 0.8313880443572998,
"step": 1415
},
{
"epoch": 0.6014400677678949,
"grad_norm": 1.1686631557802003,
"learning_rate": 8.219836806478049e-06,
"loss": 0.4336,
"mean_token_accuracy": 0.8312123149633408,
"step": 1420
},
{
"epoch": 0.6035578144853876,
"grad_norm": 1.230978585871069,
"learning_rate": 8.147112759128859e-06,
"loss": 0.4647,
"mean_token_accuracy": 0.8231993585824966,
"step": 1425
},
{
"epoch": 0.6056755612028801,
"grad_norm": 1.0717890273842352,
"learning_rate": 8.074490051042447e-06,
"loss": 0.4353,
"mean_token_accuracy": 0.8321529895067215,
"step": 1430
},
{
"epoch": 0.6077933079203727,
"grad_norm": 1.085108371368418,
"learning_rate": 8.001972654145194e-06,
"loss": 0.4415,
"mean_token_accuracy": 0.8277548223733902,
"step": 1435
},
{
"epoch": 0.6099110546378653,
"grad_norm": 1.2119593900205077,
"learning_rate": 7.929564534603722e-06,
"loss": 0.4571,
"mean_token_accuracy": 0.8255878984928131,
"step": 1440
},
{
"epoch": 0.6120288013553579,
"grad_norm": 1.1055437345283827,
"learning_rate": 7.857269652607995e-06,
"loss": 0.4406,
"mean_token_accuracy": 0.8275179982185363,
"step": 1445
},
{
"epoch": 0.6141465480728505,
"grad_norm": 1.1275451956189597,
"learning_rate": 7.78509196215472e-06,
"loss": 0.4308,
"mean_token_accuracy": 0.8301453530788422,
"step": 1450
},
{
"epoch": 0.6162642947903431,
"grad_norm": 1.2886494426253579,
"learning_rate": 7.713035410831086e-06,
"loss": 0.4573,
"mean_token_accuracy": 0.8251194447278977,
"step": 1455
},
{
"epoch": 0.6183820415078357,
"grad_norm": 1.1109768793864798,
"learning_rate": 7.64110393959887e-06,
"loss": 0.4279,
"mean_token_accuracy": 0.8380070447921752,
"step": 1460
},
{
"epoch": 0.6204997882253283,
"grad_norm": 1.0182035864318235,
"learning_rate": 7.569301482578885e-06,
"loss": 0.4281,
"mean_token_accuracy": 0.8316156834363937,
"step": 1465
},
{
"epoch": 0.6226175349428208,
"grad_norm": 1.2074345207100396,
"learning_rate": 7.497631966835828e-06,
"loss": 0.4527,
"mean_token_accuracy": 0.8231601238250732,
"step": 1470
},
{
"epoch": 0.6247352816603134,
"grad_norm": 0.991329003303421,
"learning_rate": 7.42609931216348e-06,
"loss": 0.442,
"mean_token_accuracy": 0.8327670186758042,
"step": 1475
},
{
"epoch": 0.626853028377806,
"grad_norm": 1.38024365126256,
"learning_rate": 7.354707430870332e-06,
"loss": 0.4335,
"mean_token_accuracy": 0.8324557185173035,
"step": 1480
},
{
"epoch": 0.6289707750952986,
"grad_norm": 1.2263457500699402,
"learning_rate": 7.283460227565614e-06,
"loss": 0.4289,
"mean_token_accuracy": 0.8289420217275619,
"step": 1485
},
{
"epoch": 0.6310885218127912,
"grad_norm": 1.1601375730316865,
"learning_rate": 7.2123615989457364e-06,
"loss": 0.4465,
"mean_token_accuracy": 0.832300814986229,
"step": 1490
},
{
"epoch": 0.6332062685302838,
"grad_norm": 1.3029839142463893,
"learning_rate": 7.141415433581169e-06,
"loss": 0.4167,
"mean_token_accuracy": 0.8393772184848786,
"step": 1495
},
{
"epoch": 0.6353240152477764,
"grad_norm": 1.0421344337402514,
"learning_rate": 7.070625611703762e-06,
"loss": 0.4537,
"mean_token_accuracy": 0.8257811456918717,
"step": 1500
},
{
"epoch": 0.6374417619652689,
"grad_norm": 1.1352186472493642,
"learning_rate": 6.9999960049945406e-06,
"loss": 0.4227,
"mean_token_accuracy": 0.8368300348520279,
"step": 1505
},
{
"epoch": 0.6395595086827616,
"grad_norm": 0.9884985072070904,
"learning_rate": 6.929530476371935e-06,
"loss": 0.4189,
"mean_token_accuracy": 0.8349219173192978,
"step": 1510
},
{
"epoch": 0.6416772554002541,
"grad_norm": 1.7766008455284357,
"learning_rate": 6.859232879780515e-06,
"loss": 0.4288,
"mean_token_accuracy": 0.8374936401844024,
"step": 1515
},
{
"epoch": 0.6437950021177468,
"grad_norm": 1.012934970024209,
"learning_rate": 6.7891070599802045e-06,
"loss": 0.4549,
"mean_token_accuracy": 0.8239244252443314,
"step": 1520
},
{
"epoch": 0.6459127488352393,
"grad_norm": 0.9859441855867837,
"learning_rate": 6.719156852336015e-06,
"loss": 0.4293,
"mean_token_accuracy": 0.8353272944688797,
"step": 1525
},
{
"epoch": 0.6480304955527318,
"grad_norm": 1.261329902420831,
"learning_rate": 6.649386082608256e-06,
"loss": 0.428,
"mean_token_accuracy": 0.8329044044017792,
"step": 1530
},
{
"epoch": 0.6501482422702245,
"grad_norm": 1.2457535519058567,
"learning_rate": 6.579798566743314e-06,
"loss": 0.4324,
"mean_token_accuracy": 0.8307075470685958,
"step": 1535
},
{
"epoch": 0.652265988987717,
"grad_norm": 1.213114456712863,
"learning_rate": 6.510398110664939e-06,
"loss": 0.4223,
"mean_token_accuracy": 0.8351607590913772,
"step": 1540
},
{
"epoch": 0.6543837357052097,
"grad_norm": 1.155264435257233,
"learning_rate": 6.441188510066092e-06,
"loss": 0.4207,
"mean_token_accuracy": 0.8374445289373398,
"step": 1545
},
{
"epoch": 0.6565014824227022,
"grad_norm": 1.1756119576548756,
"learning_rate": 6.372173550201346e-06,
"loss": 0.4119,
"mean_token_accuracy": 0.8390755444765091,
"step": 1550
},
{
"epoch": 0.6586192291401949,
"grad_norm": 1.0243897900651528,
"learning_rate": 6.303357005679858e-06,
"loss": 0.4478,
"mean_token_accuracy": 0.8277173846960068,
"step": 1555
},
{
"epoch": 0.6607369758576874,
"grad_norm": 1.0868676429874986,
"learning_rate": 6.234742640258938e-06,
"loss": 0.4552,
"mean_token_accuracy": 0.827509269118309,
"step": 1560
},
{
"epoch": 0.66285472257518,
"grad_norm": 1.1792649536698685,
"learning_rate": 6.166334206638186e-06,
"loss": 0.4396,
"mean_token_accuracy": 0.8288001954555512,
"step": 1565
},
{
"epoch": 0.6649724692926726,
"grad_norm": 1.171894663481444,
"learning_rate": 6.0981354462542456e-06,
"loss": 0.4365,
"mean_token_accuracy": 0.8315492898225785,
"step": 1570
},
{
"epoch": 0.6670902160101652,
"grad_norm": 1.1333037764256397,
"learning_rate": 6.030150089076199e-06,
"loss": 0.4319,
"mean_token_accuracy": 0.8316318243741989,
"step": 1575
},
{
"epoch": 0.6692079627276578,
"grad_norm": 1.1892286300854609,
"learning_rate": 5.9623818534015275e-06,
"loss": 0.4275,
"mean_token_accuracy": 0.8352140128612519,
"step": 1580
},
{
"epoch": 0.6713257094451504,
"grad_norm": 4.250523219515856,
"learning_rate": 5.894834445652777e-06,
"loss": 0.411,
"mean_token_accuracy": 0.8329778879880905,
"step": 1585
},
{
"epoch": 0.6734434561626429,
"grad_norm": 1.157008090047474,
"learning_rate": 5.827511560174835e-06,
"loss": 0.4242,
"mean_token_accuracy": 0.832972839474678,
"step": 1590
},
{
"epoch": 0.6755612028801355,
"grad_norm": 1.1834078816860993,
"learning_rate": 5.7604168790328774e-06,
"loss": 0.3931,
"mean_token_accuracy": 0.8443128287792205,
"step": 1595
},
{
"epoch": 0.6776789495976281,
"grad_norm": 1.0766345733639675,
"learning_rate": 5.693554071810987e-06,
"loss": 0.4478,
"mean_token_accuracy": 0.8282081812620163,
"step": 1600
},
{
"epoch": 0.6797966963151207,
"grad_norm": 1.0314594529031804,
"learning_rate": 5.626926795411447e-06,
"loss": 0.4246,
"mean_token_accuracy": 0.8321157455444336,
"step": 1605
},
{
"epoch": 0.6819144430326133,
"grad_norm": 1.055274137880832,
"learning_rate": 5.560538693854751e-06,
"loss": 0.4193,
"mean_token_accuracy": 0.8316533505916596,
"step": 1610
},
{
"epoch": 0.6840321897501059,
"grad_norm": 1.1972782090907812,
"learning_rate": 5.494393398080292e-06,
"loss": 0.4313,
"mean_token_accuracy": 0.834712353348732,
"step": 1615
},
{
"epoch": 0.6861499364675985,
"grad_norm": 1.0962501568970522,
"learning_rate": 5.428494525747769e-06,
"loss": 0.4597,
"mean_token_accuracy": 0.8248083680868149,
"step": 1620
},
{
"epoch": 0.688267683185091,
"grad_norm": 1.0751444988160856,
"learning_rate": 5.362845681039348e-06,
"loss": 0.4321,
"mean_token_accuracy": 0.8374727904796601,
"step": 1625
},
{
"epoch": 0.6903854299025837,
"grad_norm": 1.1471090324016462,
"learning_rate": 5.297450454462526e-06,
"loss": 0.4328,
"mean_token_accuracy": 0.8296476870775222,
"step": 1630
},
{
"epoch": 0.6925031766200762,
"grad_norm": 0.962534660265453,
"learning_rate": 5.23231242265375e-06,
"loss": 0.4181,
"mean_token_accuracy": 0.83418510556221,
"step": 1635
},
{
"epoch": 0.6946209233375689,
"grad_norm": 1.1168651450432128,
"learning_rate": 5.167435148182824e-06,
"loss": 0.4176,
"mean_token_accuracy": 0.8372534781694412,
"step": 1640
},
{
"epoch": 0.6967386700550614,
"grad_norm": 1.2186341287706137,
"learning_rate": 5.102822179358037e-06,
"loss": 0.4075,
"mean_token_accuracy": 0.8409687280654907,
"step": 1645
},
{
"epoch": 0.6988564167725541,
"grad_norm": 0.9820636174800459,
"learning_rate": 5.0384770500321175e-06,
"loss": 0.4128,
"mean_token_accuracy": 0.8384972155094147,
"step": 1650
},
{
"epoch": 0.7009741634900466,
"grad_norm": 0.943830506781205,
"learning_rate": 4.97440327940895e-06,
"loss": 0.4027,
"mean_token_accuracy": 0.8365049093961716,
"step": 1655
},
{
"epoch": 0.7030919102075391,
"grad_norm": 1.0574783345670844,
"learning_rate": 4.910604371851091e-06,
"loss": 0.4308,
"mean_token_accuracy": 0.8333552926778793,
"step": 1660
},
{
"epoch": 0.7052096569250318,
"grad_norm": 1.103380699456734,
"learning_rate": 4.847083816688123e-06,
"loss": 0.412,
"mean_token_accuracy": 0.8425119102001191,
"step": 1665
},
{
"epoch": 0.7073274036425243,
"grad_norm": 1.117253769501395,
"learning_rate": 4.783845088025807e-06,
"loss": 0.4346,
"mean_token_accuracy": 0.8330845534801483,
"step": 1670
},
{
"epoch": 0.709445150360017,
"grad_norm": 1.4108563780024128,
"learning_rate": 4.7208916445560625e-06,
"loss": 0.414,
"mean_token_accuracy": 0.8379091322422028,
"step": 1675
},
{
"epoch": 0.7115628970775095,
"grad_norm": 1.031565575748758,
"learning_rate": 4.658226929367826e-06,
"loss": 0.4598,
"mean_token_accuracy": 0.8240681082010269,
"step": 1680
},
{
"epoch": 0.7136806437950021,
"grad_norm": 1.2248996065912452,
"learning_rate": 4.595854369758727e-06,
"loss": 0.4299,
"mean_token_accuracy": 0.8363937050104141,
"step": 1685
},
{
"epoch": 0.7157983905124947,
"grad_norm": 1.1049025661918381,
"learning_rate": 4.5337773770476245e-06,
"loss": 0.4273,
"mean_token_accuracy": 0.8340339243412018,
"step": 1690
},
{
"epoch": 0.7179161372299873,
"grad_norm": 1.1244170950870136,
"learning_rate": 4.4719993463880695e-06,
"loss": 0.4571,
"mean_token_accuracy": 0.8225684702396393,
"step": 1695
},
{
"epoch": 0.7200338839474799,
"grad_norm": 1.1969285633316296,
"learning_rate": 4.410523656582576e-06,
"loss": 0.4025,
"mean_token_accuracy": 0.8440569192171097,
"step": 1700
},
{
"epoch": 0.7221516306649725,
"grad_norm": 1.122866308313561,
"learning_rate": 4.349353669897856e-06,
"loss": 0.4208,
"mean_token_accuracy": 0.837623131275177,
"step": 1705
},
{
"epoch": 0.7242693773824651,
"grad_norm": 1.0173115464088704,
"learning_rate": 4.288492731880917e-06,
"loss": 0.4148,
"mean_token_accuracy": 0.8388867497444152,
"step": 1710
},
{
"epoch": 0.7263871240999576,
"grad_norm": 1.1018457774189827,
"learning_rate": 4.227944171176072e-06,
"loss": 0.4003,
"mean_token_accuracy": 0.8392677456140518,
"step": 1715
},
{
"epoch": 0.7285048708174502,
"grad_norm": 1.2471156860459571,
"learning_rate": 4.167711299342909e-06,
"loss": 0.4459,
"mean_token_accuracy": 0.8256678134202957,
"step": 1720
},
{
"epoch": 0.7306226175349428,
"grad_norm": 1.1273568017592417,
"learning_rate": 4.107797410675166e-06,
"loss": 0.4068,
"mean_token_accuracy": 0.8386416286230087,
"step": 1725
},
{
"epoch": 0.7327403642524354,
"grad_norm": 1.20918067568615,
"learning_rate": 4.048205782020544e-06,
"loss": 0.4539,
"mean_token_accuracy": 0.8220532357692718,
"step": 1730
},
{
"epoch": 0.734858110969928,
"grad_norm": 1.1573583276355073,
"learning_rate": 3.988939672601509e-06,
"loss": 0.395,
"mean_token_accuracy": 0.844212406873703,
"step": 1735
},
{
"epoch": 0.7369758576874206,
"grad_norm": 1.1516374922245958,
"learning_rate": 3.930002323837026e-06,
"loss": 0.4251,
"mean_token_accuracy": 0.8371291518211365,
"step": 1740
},
{
"epoch": 0.7390936044049131,
"grad_norm": 1.274643963255776,
"learning_rate": 3.871396959165267e-06,
"loss": 0.429,
"mean_token_accuracy": 0.8348165363073349,
"step": 1745
},
{
"epoch": 0.7412113511224058,
"grad_norm": 1.025583507042276,
"learning_rate": 3.8131267838673336e-06,
"loss": 0.4262,
"mean_token_accuracy": 0.8343986541032791,
"step": 1750
},
{
"epoch": 0.7433290978398983,
"grad_norm": 1.1299748085754966,
"learning_rate": 3.755194984891943e-06,
"loss": 0.4081,
"mean_token_accuracy": 0.8430469453334808,
"step": 1755
},
{
"epoch": 0.745446844557391,
"grad_norm": 1.0603027089656643,
"learning_rate": 3.6976047306811115e-06,
"loss": 0.4256,
"mean_token_accuracy": 0.8382641762495041,
"step": 1760
},
{
"epoch": 0.7475645912748835,
"grad_norm": 1.1281590494510496,
"learning_rate": 3.6403591709968924e-06,
"loss": 0.4357,
"mean_token_accuracy": 0.8320927768945694,
"step": 1765
},
{
"epoch": 0.7496823379923762,
"grad_norm": 1.0367839611389602,
"learning_rate": 3.5834614367490706e-06,
"loss": 0.4221,
"mean_token_accuracy": 0.835366889834404,
"step": 1770
},
{
"epoch": 0.7518000847098687,
"grad_norm": 1.0958827736818129,
"learning_rate": 3.526914639823973e-06,
"loss": 0.4381,
"mean_token_accuracy": 0.8301591634750366,
"step": 1775
},
{
"epoch": 0.7539178314273612,
"grad_norm": 1.0559223618431266,
"learning_rate": 3.4707218729142224e-06,
"loss": 0.4291,
"mean_token_accuracy": 0.8316712707281113,
"step": 1780
},
{
"epoch": 0.7560355781448539,
"grad_norm": 1.0792688197107765,
"learning_rate": 3.414886209349615e-06,
"loss": 0.4269,
"mean_token_accuracy": 0.835688841342926,
"step": 1785
},
{
"epoch": 0.7581533248623464,
"grad_norm": 1.1979681287726258,
"learning_rate": 3.3594107029290347e-06,
"loss": 0.4269,
"mean_token_accuracy": 0.8371979027986527,
"step": 1790
},
{
"epoch": 0.7602710715798391,
"grad_norm": 1.1468783022113433,
"learning_rate": 3.304298387753426e-06,
"loss": 0.4311,
"mean_token_accuracy": 0.8341523915529251,
"step": 1795
},
{
"epoch": 0.7623888182973316,
"grad_norm": 1.142335742385377,
"learning_rate": 3.2495522780598442e-06,
"loss": 0.4174,
"mean_token_accuracy": 0.8298469454050064,
"step": 1800
},
{
"epoch": 0.7645065650148243,
"grad_norm": 1.1968773332651736,
"learning_rate": 3.1951753680566143e-06,
"loss": 0.4383,
"mean_token_accuracy": 0.8313175171613694,
"step": 1805
},
{
"epoch": 0.7666243117323168,
"grad_norm": 1.0804618708583653,
"learning_rate": 3.141170631759558e-06,
"loss": 0.4086,
"mean_token_accuracy": 0.8373444229364395,
"step": 1810
},
{
"epoch": 0.7687420584498094,
"grad_norm": 1.0872538790077677,
"learning_rate": 3.087541022829347e-06,
"loss": 0.4221,
"mean_token_accuracy": 0.8371105402708053,
"step": 1815
},
{
"epoch": 0.770859805167302,
"grad_norm": 0.9905135006363225,
"learning_rate": 3.034289474409943e-06,
"loss": 0.4133,
"mean_token_accuracy": 0.8365035742521286,
"step": 1820
},
{
"epoch": 0.7729775518847946,
"grad_norm": 1.0890914888672922,
"learning_rate": 2.981418898968186e-06,
"loss": 0.4189,
"mean_token_accuracy": 0.838862606883049,
"step": 1825
},
{
"epoch": 0.7750952986022872,
"grad_norm": 1.1417209565486737,
"learning_rate": 2.9289321881345257e-06,
"loss": 0.4169,
"mean_token_accuracy": 0.833648070693016,
"step": 1830
},
{
"epoch": 0.7772130453197797,
"grad_norm": 1.1684616910176908,
"learning_rate": 2.8768322125448265e-06,
"loss": 0.4469,
"mean_token_accuracy": 0.83056038916111,
"step": 1835
},
{
"epoch": 0.7793307920372723,
"grad_norm": 1.1845681597767028,
"learning_rate": 2.825121821683391e-06,
"loss": 0.4223,
"mean_token_accuracy": 0.8353413581848145,
"step": 1840
},
{
"epoch": 0.7814485387547649,
"grad_norm": 1.1732126933903428,
"learning_rate": 2.7738038437271288e-06,
"loss": 0.4121,
"mean_token_accuracy": 0.842677703499794,
"step": 1845
},
{
"epoch": 0.7835662854722575,
"grad_norm": 1.0292583187860371,
"learning_rate": 2.7228810853908406e-06,
"loss": 0.3921,
"mean_token_accuracy": 0.8447476714849472,
"step": 1850
},
{
"epoch": 0.7856840321897501,
"grad_norm": 0.9892030702997285,
"learning_rate": 2.67235633177373e-06,
"loss": 0.4387,
"mean_token_accuracy": 0.8288900941610337,
"step": 1855
},
{
"epoch": 0.7878017789072427,
"grad_norm": 1.0050687986582967,
"learning_rate": 2.6222323462070897e-06,
"loss": 0.4356,
"mean_token_accuracy": 0.828187745809555,
"step": 1860
},
{
"epoch": 0.7899195256247353,
"grad_norm": 1.1304197153376732,
"learning_rate": 2.572511870103149e-06,
"loss": 0.4125,
"mean_token_accuracy": 0.8425087302923202,
"step": 1865
},
{
"epoch": 0.7920372723422279,
"grad_norm": 1.0444576639344187,
"learning_rate": 2.5231976228051526e-06,
"loss": 0.4318,
"mean_token_accuracy": 0.8337043792009353,
"step": 1870
},
{
"epoch": 0.7941550190597204,
"grad_norm": 1.0875080317220023,
"learning_rate": 2.4742923014386154e-06,
"loss": 0.4287,
"mean_token_accuracy": 0.8368022799491882,
"step": 1875
},
{
"epoch": 0.7962727657772131,
"grad_norm": 1.1517129093084153,
"learning_rate": 2.4257985807638294e-06,
"loss": 0.4284,
"mean_token_accuracy": 0.8356128752231597,
"step": 1880
},
{
"epoch": 0.7983905124947056,
"grad_norm": 1.2213468844119533,
"learning_rate": 2.3777191130295673e-06,
"loss": 0.411,
"mean_token_accuracy": 0.8373890697956086,
"step": 1885
},
{
"epoch": 0.8005082592121983,
"grad_norm": 1.1105462272187794,
"learning_rate": 2.330056527828013e-06,
"loss": 0.4549,
"mean_token_accuracy": 0.8282926619052887,
"step": 1890
},
{
"epoch": 0.8026260059296908,
"grad_norm": 1.1626653178571262,
"learning_rate": 2.282813431950952e-06,
"loss": 0.4295,
"mean_token_accuracy": 0.8333282887935638,
"step": 1895
},
{
"epoch": 0.8047437526471835,
"grad_norm": 1.1195581942328177,
"learning_rate": 2.235992409247214e-06,
"loss": 0.4338,
"mean_token_accuracy": 0.8319763302803039,
"step": 1900
},
{
"epoch": 0.806861499364676,
"grad_norm": 1.026868168904022,
"learning_rate": 2.1895960204813194e-06,
"loss": 0.4118,
"mean_token_accuracy": 0.8370046824216842,
"step": 1905
},
{
"epoch": 0.8089792460821685,
"grad_norm": 1.0639569641896143,
"learning_rate": 2.1436268031934602e-06,
"loss": 0.4411,
"mean_token_accuracy": 0.8297486454248428,
"step": 1910
},
{
"epoch": 0.8110969927996612,
"grad_norm": 1.0385740186847223,
"learning_rate": 2.098087271560687e-06,
"loss": 0.4152,
"mean_token_accuracy": 0.8370089381933212,
"step": 1915
},
{
"epoch": 0.8132147395171537,
"grad_norm": 1.1169845772777505,
"learning_rate": 2.0529799162594242e-06,
"loss": 0.4094,
"mean_token_accuracy": 0.839673039317131,
"step": 1920
},
{
"epoch": 0.8153324862346464,
"grad_norm": 1.0745546751170598,
"learning_rate": 2.0083072043292406e-06,
"loss": 0.417,
"mean_token_accuracy": 0.8379459470510483,
"step": 1925
},
{
"epoch": 0.8174502329521389,
"grad_norm": 1.206429363415916,
"learning_rate": 1.9640715790379084e-06,
"loss": 0.4133,
"mean_token_accuracy": 0.8345289677381516,
"step": 1930
},
{
"epoch": 0.8195679796696315,
"grad_norm": 1.0452292636568519,
"learning_rate": 1.920275459747796e-06,
"loss": 0.4123,
"mean_token_accuracy": 0.8368586808443069,
"step": 1935
},
{
"epoch": 0.8216857263871241,
"grad_norm": 1.0706189800647066,
"learning_rate": 1.8769212417835314e-06,
"loss": 0.3773,
"mean_token_accuracy": 0.8513321369886399,
"step": 1940
},
{
"epoch": 0.8238034731046167,
"grad_norm": 1.0974535637452612,
"learning_rate": 1.8340112963009993e-06,
"loss": 0.4353,
"mean_token_accuracy": 0.8337271898984909,
"step": 1945
},
{
"epoch": 0.8259212198221093,
"grad_norm": 1.0867209847341632,
"learning_rate": 1.7915479701576577e-06,
"loss": 0.4489,
"mean_token_accuracy": 0.8291646331548691,
"step": 1950
},
{
"epoch": 0.8280389665396019,
"grad_norm": 1.1993569416921062,
"learning_rate": 1.7495335857841855e-06,
"loss": 0.4138,
"mean_token_accuracy": 0.8385995358228684,
"step": 1955
},
{
"epoch": 0.8301567132570945,
"grad_norm": 1.1414883228473476,
"learning_rate": 1.7079704410574505e-06,
"loss": 0.3859,
"mean_token_accuracy": 0.8459228605031968,
"step": 1960
},
{
"epoch": 0.832274459974587,
"grad_norm": 1.048311577922366,
"learning_rate": 1.6668608091748495e-06,
"loss": 0.426,
"mean_token_accuracy": 0.8357879251241684,
"step": 1965
},
{
"epoch": 0.8343922066920796,
"grad_norm": 1.0617438922962255,
"learning_rate": 1.6262069385299694e-06,
"loss": 0.4334,
"mean_token_accuracy": 0.8343731433153152,
"step": 1970
},
{
"epoch": 0.8365099534095722,
"grad_norm": 1.1279209328755353,
"learning_rate": 1.5860110525896143e-06,
"loss": 0.4197,
"mean_token_accuracy": 0.835442116856575,
"step": 1975
},
{
"epoch": 0.8386277001270648,
"grad_norm": 0.9640338154076892,
"learning_rate": 1.5462753497722139e-06,
"loss": 0.4228,
"mean_token_accuracy": 0.8363285154104233,
"step": 1980
},
{
"epoch": 0.8407454468445574,
"grad_norm": 1.065476222817932,
"learning_rate": 1.5070020033275655e-06,
"loss": 0.3954,
"mean_token_accuracy": 0.8427035689353943,
"step": 1985
},
{
"epoch": 0.84286319356205,
"grad_norm": 1.055480105683973,
"learning_rate": 1.4681931612179901e-06,
"loss": 0.4289,
"mean_token_accuracy": 0.8340502351522445,
"step": 1990
},
{
"epoch": 0.8449809402795425,
"grad_norm": 1.0690985831761302,
"learning_rate": 1.4298509460008491e-06,
"loss": 0.4072,
"mean_token_accuracy": 0.8402904689311981,
"step": 1995
},
{
"epoch": 0.8470986869970352,
"grad_norm": 1.0063164183000968,
"learning_rate": 1.39197745471245e-06,
"loss": 0.4231,
"mean_token_accuracy": 0.8361636906862259,
"step": 2000
},
{
"epoch": 0.8492164337145277,
"grad_norm": 1.0247616494577987,
"learning_rate": 1.354574758753363e-06,
"loss": 0.4189,
"mean_token_accuracy": 0.8310322672128677,
"step": 2005
},
{
"epoch": 0.8513341804320204,
"grad_norm": 1.044860588344852,
"learning_rate": 1.3176449037751294e-06,
"loss": 0.4404,
"mean_token_accuracy": 0.8303707420825959,
"step": 2010
},
{
"epoch": 0.8534519271495129,
"grad_norm": 2.4537559889629694,
"learning_rate": 1.28118990956837e-06,
"loss": 0.4104,
"mean_token_accuracy": 0.835821408033371,
"step": 2015
},
{
"epoch": 0.8555696738670056,
"grad_norm": 1.0972489520800874,
"learning_rate": 1.2452117699523303e-06,
"loss": 0.4027,
"mean_token_accuracy": 0.8460766285657882,
"step": 2020
},
{
"epoch": 0.8576874205844981,
"grad_norm": 1.2309045137234433,
"learning_rate": 1.2097124526658277e-06,
"loss": 0.419,
"mean_token_accuracy": 0.8366678208112717,
"step": 2025
},
{
"epoch": 0.8598051673019906,
"grad_norm": 1.0849048269411365,
"learning_rate": 1.1746938992596257e-06,
"loss": 0.4174,
"mean_token_accuracy": 0.8296289384365082,
"step": 2030
},
{
"epoch": 0.8619229140194833,
"grad_norm": 0.989974221522167,
"learning_rate": 1.1401580249902566e-06,
"loss": 0.4153,
"mean_token_accuracy": 0.8379861056804657,
"step": 2035
},
{
"epoch": 0.8640406607369758,
"grad_norm": 1.0066596115748891,
"learning_rate": 1.1061067187152584e-06,
"loss": 0.4041,
"mean_token_accuracy": 0.8417060792446136,
"step": 2040
},
{
"epoch": 0.8661584074544685,
"grad_norm": 1.0526259070425423,
"learning_rate": 1.0725418427898792e-06,
"loss": 0.4099,
"mean_token_accuracy": 0.8398545056581497,
"step": 2045
},
{
"epoch": 0.868276154171961,
"grad_norm": 1.134470581551777,
"learning_rate": 1.0394652329652165e-06,
"loss": 0.4146,
"mean_token_accuracy": 0.8354752600193024,
"step": 2050
},
{
"epoch": 0.8703939008894537,
"grad_norm": 1.130864865166622,
"learning_rate": 1.0068786982878087e-06,
"loss": 0.418,
"mean_token_accuracy": 0.8398678600788116,
"step": 2055
},
{
"epoch": 0.8725116476069462,
"grad_norm": 1.1500087879964977,
"learning_rate": 9.747840210007021e-07,
"loss": 0.4157,
"mean_token_accuracy": 0.8322781622409821,
"step": 2060
},
{
"epoch": 0.8746293943244388,
"grad_norm": 0.9770307768209092,
"learning_rate": 9.43182956445976e-07,
"loss": 0.3977,
"mean_token_accuracy": 0.8416966944932938,
"step": 2065
},
{
"epoch": 0.8767471410419314,
"grad_norm": 1.2583818143393242,
"learning_rate": 9.120772329687278e-07,
"loss": 0.4251,
"mean_token_accuracy": 0.8354076951742172,
"step": 2070
},
{
"epoch": 0.878864887759424,
"grad_norm": 1.0618576291439479,
"learning_rate": 8.814685518225552e-07,
"loss": 0.4291,
"mean_token_accuracy": 0.8308704495429993,
"step": 2075
},
{
"epoch": 0.8809826344769166,
"grad_norm": 1.1180367611245425,
"learning_rate": 8.513585870765118e-07,
"loss": 0.3907,
"mean_token_accuracy": 0.8452890306711197,
"step": 2080
},
{
"epoch": 0.8831003811944091,
"grad_norm": 1.230123212504081,
"learning_rate": 8.217489855235338e-07,
"loss": 0.4144,
"mean_token_accuracy": 0.8392110764980316,
"step": 2085
},
{
"epoch": 0.8852181279119017,
"grad_norm": 1.1108948475484288,
"learning_rate": 7.926413665903931e-07,
"loss": 0.4151,
"mean_token_accuracy": 0.8380868971347809,
"step": 2090
},
{
"epoch": 0.8873358746293943,
"grad_norm": 1.098761542271965,
"learning_rate": 7.640373222491038e-07,
"loss": 0.4196,
"mean_token_accuracy": 0.8407029449939728,
"step": 2095
},
{
"epoch": 0.8894536213468869,
"grad_norm": 1.0940803341605705,
"learning_rate": 7.359384169298744e-07,
"loss": 0.4097,
"mean_token_accuracy": 0.8401619613170623,
"step": 2100
},
{
"epoch": 0.8915713680643795,
"grad_norm": 0.9066347453646844,
"learning_rate": 7.083461874355335e-07,
"loss": 0.4257,
"mean_token_accuracy": 0.8362819194793701,
"step": 2105
},
{
"epoch": 0.8936891147818721,
"grad_norm": 1.0448023766882066,
"learning_rate": 6.81262142857475e-07,
"loss": 0.3898,
"mean_token_accuracy": 0.8459620922803879,
"step": 2110
},
{
"epoch": 0.8958068614993647,
"grad_norm": 1.0611643496346475,
"learning_rate": 6.546877644931315e-07,
"loss": 0.4208,
"mean_token_accuracy": 0.8312031596899032,
"step": 2115
},
{
"epoch": 0.8979246082168573,
"grad_norm": 1.1224663985096108,
"learning_rate": 6.286245057649542e-07,
"loss": 0.3994,
"mean_token_accuracy": 0.8465497404336929,
"step": 2120
},
{
"epoch": 0.9000423549343498,
"grad_norm": 1.0832056476567533,
"learning_rate": 6.030737921409169e-07,
"loss": 0.3867,
"mean_token_accuracy": 0.8440623044967651,
"step": 2125
},
{
"epoch": 0.9021601016518425,
"grad_norm": 1.0523110523954844,
"learning_rate": 5.7803702105656e-07,
"loss": 0.4127,
"mean_token_accuracy": 0.8366563141345977,
"step": 2130
},
{
"epoch": 0.904277848369335,
"grad_norm": 1.0105232913792406,
"learning_rate": 5.535155618385612e-07,
"loss": 0.4195,
"mean_token_accuracy": 0.8335390537977219,
"step": 2135
},
{
"epoch": 0.9063955950868277,
"grad_norm": 1.1129917485868344,
"learning_rate": 5.295107556298329e-07,
"loss": 0.3928,
"mean_token_accuracy": 0.8431670844554902,
"step": 2140
},
{
"epoch": 0.9085133418043202,
"grad_norm": 1.145719574659648,
"learning_rate": 5.060239153161872e-07,
"loss": 0.4019,
"mean_token_accuracy": 0.8419764310121536,
"step": 2145
},
{
"epoch": 0.9106310885218127,
"grad_norm": 1.443628282269306,
"learning_rate": 4.830563254545207e-07,
"loss": 0.4233,
"mean_token_accuracy": 0.8361739784479141,
"step": 2150
},
{
"epoch": 0.9127488352393054,
"grad_norm": 1.1691030241329559,
"learning_rate": 4.6060924220255654e-07,
"loss": 0.4257,
"mean_token_accuracy": 0.8305665761232376,
"step": 2155
},
{
"epoch": 0.9148665819567979,
"grad_norm": 1.2424085660240223,
"learning_rate": 4.386838932501547e-07,
"loss": 0.4303,
"mean_token_accuracy": 0.8358988225460052,
"step": 2160
},
{
"epoch": 0.9169843286742906,
"grad_norm": 1.0258262640063769,
"learning_rate": 4.172814777521483e-07,
"loss": 0.4298,
"mean_token_accuracy": 0.8366893321275711,
"step": 2165
},
{
"epoch": 0.9191020753917831,
"grad_norm": 1.0932401792673323,
"learning_rate": 3.9640316626277654e-07,
"loss": 0.4172,
"mean_token_accuracy": 0.836585283279419,
"step": 2170
},
{
"epoch": 0.9212198221092758,
"grad_norm": 1.0881178279493329,
"learning_rate": 3.7605010067165216e-07,
"loss": 0.42,
"mean_token_accuracy": 0.8352493315935134,
"step": 2175
},
{
"epoch": 0.9233375688267683,
"grad_norm": 1.057750886441079,
"learning_rate": 3.562233941413096e-07,
"loss": 0.3975,
"mean_token_accuracy": 0.8412194460630417,
"step": 2180
},
{
"epoch": 0.9254553155442609,
"grad_norm": 1.1056774030421723,
"learning_rate": 3.3692413104633226e-07,
"loss": 0.3976,
"mean_token_accuracy": 0.840697067975998,
"step": 2185
},
{
"epoch": 0.9275730622617535,
"grad_norm": 1.163101779598673,
"learning_rate": 3.1815336691403464e-07,
"loss": 0.3751,
"mean_token_accuracy": 0.8496327966451644,
"step": 2190
},
{
"epoch": 0.929690808979246,
"grad_norm": 0.9755793569303719,
"learning_rate": 2.999121283667339e-07,
"loss": 0.4079,
"mean_token_accuracy": 0.8418219208717346,
"step": 2195
},
{
"epoch": 0.9318085556967387,
"grad_norm": 1.021358583461123,
"learning_rate": 2.8220141306561034e-07,
"loss": 0.4186,
"mean_token_accuracy": 0.8352805793285369,
"step": 2200
},
{
"epoch": 0.9339263024142312,
"grad_norm": 1.0396837778560488,
"learning_rate": 2.6502218965613335e-07,
"loss": 0.4225,
"mean_token_accuracy": 0.8338442891836166,
"step": 2205
},
{
"epoch": 0.9360440491317239,
"grad_norm": 1.1742052357618658,
"learning_rate": 2.483753977150882e-07,
"loss": 0.4067,
"mean_token_accuracy": 0.8387827515602112,
"step": 2210
},
{
"epoch": 0.9381617958492164,
"grad_norm": 1.0739901995137444,
"learning_rate": 2.3226194769918497e-07,
"loss": 0.4041,
"mean_token_accuracy": 0.837730023264885,
"step": 2215
},
{
"epoch": 0.940279542566709,
"grad_norm": 1.0246012489566791,
"learning_rate": 2.1668272089526377e-07,
"loss": 0.4161,
"mean_token_accuracy": 0.8399739652872086,
"step": 2220
},
{
"epoch": 0.9423972892842016,
"grad_norm": 1.0463273467785923,
"learning_rate": 2.0163856937210236e-07,
"loss": 0.4245,
"mean_token_accuracy": 0.8379955619573594,
"step": 2225
},
{
"epoch": 0.9445150360016942,
"grad_norm": 1.13493837929642,
"learning_rate": 1.8713031593380116e-07,
"loss": 0.405,
"mean_token_accuracy": 0.8368137925863266,
"step": 2230
},
{
"epoch": 0.9466327827191868,
"grad_norm": 1.1326422720007092,
"learning_rate": 1.731587540747903e-07,
"loss": 0.4164,
"mean_token_accuracy": 0.839913833141327,
"step": 2235
},
{
"epoch": 0.9487505294366794,
"grad_norm": 1.1288581153860058,
"learning_rate": 1.597246479364345e-07,
"loss": 0.4345,
"mean_token_accuracy": 0.8263521671295166,
"step": 2240
},
{
"epoch": 0.9508682761541719,
"grad_norm": 1.0618048867408285,
"learning_rate": 1.4682873226523064e-07,
"loss": 0.4116,
"mean_token_accuracy": 0.8380947977304458,
"step": 2245
},
{
"epoch": 0.9529860228716646,
"grad_norm": 1.0089748524009554,
"learning_rate": 1.3447171237262912e-07,
"loss": 0.4281,
"mean_token_accuracy": 0.8311914891004563,
"step": 2250
},
{
"epoch": 0.9551037695891571,
"grad_norm": 1.1718653607363838,
"learning_rate": 1.2265426409645676e-07,
"loss": 0.4205,
"mean_token_accuracy": 0.8367854833602906,
"step": 2255
},
{
"epoch": 0.9572215163066498,
"grad_norm": 1.009709808393184,
"learning_rate": 1.1137703376395304e-07,
"loss": 0.4307,
"mean_token_accuracy": 0.8332184463739395,
"step": 2260
},
{
"epoch": 0.9593392630241423,
"grad_norm": 1.0456672123180084,
"learning_rate": 1.0064063815642178e-07,
"loss": 0.4143,
"mean_token_accuracy": 0.8407183200120926,
"step": 2265
},
{
"epoch": 0.961457009741635,
"grad_norm": 1.4564232381895734,
"learning_rate": 9.044566447549697e-08,
"loss": 0.3935,
"mean_token_accuracy": 0.843877837061882,
"step": 2270
},
{
"epoch": 0.9635747564591275,
"grad_norm": 1.006395133879737,
"learning_rate": 8.079267031102844e-08,
"loss": 0.4379,
"mean_token_accuracy": 0.8322035163640976,
"step": 2275
},
{
"epoch": 0.96569250317662,
"grad_norm": 1.0451381392295622,
"learning_rate": 7.16821836105841e-08,
"loss": 0.3998,
"mean_token_accuracy": 0.8473025262355804,
"step": 2280
},
{
"epoch": 0.9678102498941127,
"grad_norm": 1.0472971428422386,
"learning_rate": 6.311470265057518e-08,
"loss": 0.423,
"mean_token_accuracy": 0.8354467749595642,
"step": 2285
},
{
"epoch": 0.9699279966116052,
"grad_norm": 1.1605199240395647,
"learning_rate": 5.5090696009004744e-08,
"loss": 0.4257,
"mean_token_accuracy": 0.8360013753175736,
"step": 2290
},
{
"epoch": 0.9720457433290979,
"grad_norm": 0.9898182837486158,
"learning_rate": 4.761060253984151e-08,
"loss": 0.4204,
"mean_token_accuracy": 0.8367842882871628,
"step": 2295
},
{
"epoch": 0.9741634900465904,
"grad_norm": 1.088987157568079,
"learning_rate": 4.067483134901573e-08,
"loss": 0.4134,
"mean_token_accuracy": 0.83856400847435,
"step": 2300
},
{
"epoch": 0.976281236764083,
"grad_norm": 1.0295706774122013,
"learning_rate": 3.4283761772042623e-08,
"loss": 0.4224,
"mean_token_accuracy": 0.8354990780353546,
"step": 2305
},
{
"epoch": 0.9783989834815756,
"grad_norm": 1.0694735478921555,
"learning_rate": 2.84377433532812e-08,
"loss": 0.4305,
"mean_token_accuracy": 0.8316824287176132,
"step": 2310
},
{
"epoch": 0.9805167301990682,
"grad_norm": 1.0827744380795652,
"learning_rate": 2.3137095826809564e-08,
"loss": 0.402,
"mean_token_accuracy": 0.8404913783073426,
"step": 2315
},
{
"epoch": 0.9826344769165608,
"grad_norm": 1.0960538876783272,
"learning_rate": 1.8382109098944444e-08,
"loss": 0.4352,
"mean_token_accuracy": 0.8338410943746567,
"step": 2320
},
{
"epoch": 0.9847522236340533,
"grad_norm": 1.1206569874331853,
"learning_rate": 1.4173043232380557e-08,
"loss": 0.4076,
"mean_token_accuracy": 0.8435803085565567,
"step": 2325
},
{
"epoch": 0.986869970351546,
"grad_norm": 1.0708342349134583,
"learning_rate": 1.0510128431968635e-08,
"loss": 0.4041,
"mean_token_accuracy": 0.8435177773237228,
"step": 2330
},
{
"epoch": 0.9889877170690385,
"grad_norm": 1.0195754299190762,
"learning_rate": 7.3935650321255156e-09,
"loss": 0.4017,
"mean_token_accuracy": 0.8434190511703491,
"step": 2335
},
{
"epoch": 0.9911054637865311,
"grad_norm": 1.043562362927536,
"learning_rate": 4.823523485879556e-09,
"loss": 0.4441,
"mean_token_accuracy": 0.8331767469644547,
"step": 2340
},
{
"epoch": 0.9932232105040237,
"grad_norm": 0.9692528305370003,
"learning_rate": 2.800144355540324e-09,
"loss": 0.4112,
"mean_token_accuracy": 0.836205193400383,
"step": 2345
},
{
"epoch": 0.9953409572215163,
"grad_norm": 0.9767634882260735,
"learning_rate": 1.32353830502141e-09,
"loss": 0.4233,
"mean_token_accuracy": 0.8327444672584534,
"step": 2350
},
{
"epoch": 0.9974587039390089,
"grad_norm": 1.1074445733229596,
"learning_rate": 3.9378609377971335e-10,
"loss": 0.3959,
"mean_token_accuracy": 0.8446923106908798,
"step": 2355
},
{
"epoch": 0.9995764506565015,
"grad_norm": 1.0245959330198788,
"learning_rate": 1.0938572402308111e-11,
"loss": 0.4106,
"mean_token_accuracy": 0.8339618355035782,
"step": 2360
},
{
"epoch": 1.0,
"mean_token_accuracy": 0.890313521027565,
"step": 2361,
"total_flos": 451385831948288.0,
"train_loss": 0.48239256052181206,
"train_runtime": 37146.7848,
"train_samples_per_second": 1.017,
"train_steps_per_second": 0.064
}
],
"logging_steps": 5,
"max_steps": 2361,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 451385831948288.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}