{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9964020148716717,
"eval_steps": 1400,
"global_step": 2082,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007195970256656272,
"grad_norm": 3.382328658618869,
"learning_rate": 1.9138755980861244e-07,
"loss": 1.1034,
"step": 5
},
{
"epoch": 0.014391940513312544,
"grad_norm": 2.9332510464158155,
"learning_rate": 3.827751196172249e-07,
"loss": 1.1248,
"step": 10
},
{
"epoch": 0.021587910769968816,
"grad_norm": 2.8021125575132175,
"learning_rate": 5.741626794258373e-07,
"loss": 1.0979,
"step": 15
},
{
"epoch": 0.02878388102662509,
"grad_norm": 2.5443691578126004,
"learning_rate": 7.655502392344498e-07,
"loss": 1.1128,
"step": 20
},
{
"epoch": 0.035979851283281364,
"grad_norm": 2.823918046213155,
"learning_rate": 9.569377990430622e-07,
"loss": 1.0726,
"step": 25
},
{
"epoch": 0.04317582153993763,
"grad_norm": 2.0326907547127124,
"learning_rate": 1.1483253588516746e-06,
"loss": 1.0043,
"step": 30
},
{
"epoch": 0.05037179179659391,
"grad_norm": 2.168998278242428,
"learning_rate": 1.339712918660287e-06,
"loss": 1.0079,
"step": 35
},
{
"epoch": 0.05756776205325018,
"grad_norm": 2.152879633190319,
"learning_rate": 1.5311004784688995e-06,
"loss": 1.0086,
"step": 40
},
{
"epoch": 0.06476373230990645,
"grad_norm": 1.7771809680542394,
"learning_rate": 1.722488038277512e-06,
"loss": 0.9862,
"step": 45
},
{
"epoch": 0.07195970256656273,
"grad_norm": 1.6796335234145752,
"learning_rate": 1.9138755980861244e-06,
"loss": 0.9757,
"step": 50
},
{
"epoch": 0.079155672823219,
"grad_norm": 1.7111004529632545,
"learning_rate": 2.1052631578947366e-06,
"loss": 0.9067,
"step": 55
},
{
"epoch": 0.08635164307987526,
"grad_norm": 2.1161929062800184,
"learning_rate": 2.2966507177033493e-06,
"loss": 0.9534,
"step": 60
},
{
"epoch": 0.09354761333653154,
"grad_norm": 1.739361359363285,
"learning_rate": 2.4880382775119615e-06,
"loss": 0.9368,
"step": 65
},
{
"epoch": 0.10074358359318782,
"grad_norm": 1.9645645051440501,
"learning_rate": 2.679425837320574e-06,
"loss": 0.8762,
"step": 70
},
{
"epoch": 0.10793955384984409,
"grad_norm": 2.291876148211727,
"learning_rate": 2.8708133971291864e-06,
"loss": 0.9079,
"step": 75
},
{
"epoch": 0.11513552410650035,
"grad_norm": 1.9350312982517304,
"learning_rate": 3.062200956937799e-06,
"loss": 0.9063,
"step": 80
},
{
"epoch": 0.12233149436315663,
"grad_norm": 1.723686001407234,
"learning_rate": 3.2535885167464113e-06,
"loss": 0.9141,
"step": 85
},
{
"epoch": 0.1295274646198129,
"grad_norm": 1.7291796951049525,
"learning_rate": 3.444976076555024e-06,
"loss": 0.8852,
"step": 90
},
{
"epoch": 0.13672343487646918,
"grad_norm": 1.5591618462436518,
"learning_rate": 3.636363636363636e-06,
"loss": 0.8812,
"step": 95
},
{
"epoch": 0.14391940513312546,
"grad_norm": 2.5857950863717596,
"learning_rate": 3.827751196172249e-06,
"loss": 0.8719,
"step": 100
},
{
"epoch": 0.15111537538978173,
"grad_norm": 1.882950492021297,
"learning_rate": 4.019138755980861e-06,
"loss": 0.923,
"step": 105
},
{
"epoch": 0.158311345646438,
"grad_norm": 1.9095611162862844,
"learning_rate": 4.210526315789473e-06,
"loss": 0.8466,
"step": 110
},
{
"epoch": 0.16550731590309425,
"grad_norm": 1.7349719997638062,
"learning_rate": 4.4019138755980855e-06,
"loss": 0.8433,
"step": 115
},
{
"epoch": 0.17270328615975053,
"grad_norm": 1.6057643525733838,
"learning_rate": 4.5933014354066986e-06,
"loss": 0.8901,
"step": 120
},
{
"epoch": 0.1798992564164068,
"grad_norm": 1.636440080236875,
"learning_rate": 4.784688995215311e-06,
"loss": 0.9158,
"step": 125
},
{
"epoch": 0.18709522667306308,
"grad_norm": 1.7680919340453238,
"learning_rate": 4.976076555023923e-06,
"loss": 0.9012,
"step": 130
},
{
"epoch": 0.19429119692971936,
"grad_norm": 1.73915276490131,
"learning_rate": 5.167464114832536e-06,
"loss": 0.8372,
"step": 135
},
{
"epoch": 0.20148716718637563,
"grad_norm": 1.4761634523053184,
"learning_rate": 5.358851674641148e-06,
"loss": 0.8631,
"step": 140
},
{
"epoch": 0.2086831374430319,
"grad_norm": 1.7615083334015453,
"learning_rate": 5.5502392344497606e-06,
"loss": 0.8275,
"step": 145
},
{
"epoch": 0.21587910769968818,
"grad_norm": 1.5283874292360122,
"learning_rate": 5.741626794258373e-06,
"loss": 0.8625,
"step": 150
},
{
"epoch": 0.22307507795634446,
"grad_norm": 1.6114784875621726,
"learning_rate": 5.933014354066985e-06,
"loss": 0.891,
"step": 155
},
{
"epoch": 0.2302710482130007,
"grad_norm": 1.6981117694270553,
"learning_rate": 6.124401913875598e-06,
"loss": 0.8548,
"step": 160
},
{
"epoch": 0.23746701846965698,
"grad_norm": 1.7093051041039082,
"learning_rate": 6.31578947368421e-06,
"loss": 0.8953,
"step": 165
},
{
"epoch": 0.24466298872631326,
"grad_norm": 1.6829774560988056,
"learning_rate": 6.5071770334928226e-06,
"loss": 0.7894,
"step": 170
},
{
"epoch": 0.25185895898296956,
"grad_norm": 1.7224253982363609,
"learning_rate": 6.698564593301436e-06,
"loss": 0.9342,
"step": 175
},
{
"epoch": 0.2590549292396258,
"grad_norm": 1.718825453842612,
"learning_rate": 6.889952153110048e-06,
"loss": 0.8765,
"step": 180
},
{
"epoch": 0.26625089949628206,
"grad_norm": 1.8299643097592468,
"learning_rate": 7.081339712918659e-06,
"loss": 0.8562,
"step": 185
},
{
"epoch": 0.27344686975293836,
"grad_norm": 1.9650794343564884,
"learning_rate": 7.272727272727272e-06,
"loss": 0.8468,
"step": 190
},
{
"epoch": 0.2806428400095946,
"grad_norm": 1.6665409567649216,
"learning_rate": 7.4641148325358846e-06,
"loss": 0.8394,
"step": 195
},
{
"epoch": 0.2878388102662509,
"grad_norm": 1.6178516494615538,
"learning_rate": 7.655502392344498e-06,
"loss": 0.9063,
"step": 200
},
{
"epoch": 0.29503478052290716,
"grad_norm": 1.737750416660729,
"learning_rate": 7.84688995215311e-06,
"loss": 0.8376,
"step": 205
},
{
"epoch": 0.30223075077956346,
"grad_norm": 1.6775958248025469,
"learning_rate": 7.9999943732958e-06,
"loss": 0.8839,
"step": 210
},
{
"epoch": 0.3094267210362197,
"grad_norm": 1.6351855660641077,
"learning_rate": 7.999797440310976e-06,
"loss": 0.8138,
"step": 215
},
{
"epoch": 0.316622691292876,
"grad_norm": 1.5160913628770296,
"learning_rate": 7.999319187945908e-06,
"loss": 0.8634,
"step": 220
},
{
"epoch": 0.32381866154953226,
"grad_norm": 1.5346049341737504,
"learning_rate": 7.998559649837715e-06,
"loss": 0.8777,
"step": 225
},
{
"epoch": 0.3310146318061885,
"grad_norm": 1.7202200592837338,
"learning_rate": 7.997518879407302e-06,
"loss": 0.9041,
"step": 230
},
{
"epoch": 0.3382106020628448,
"grad_norm": 1.6194787415884186,
"learning_rate": 7.996196949855597e-06,
"loss": 0.8567,
"step": 235
},
{
"epoch": 0.34540657231950106,
"grad_norm": 1.5761200757620502,
"learning_rate": 7.994593954158409e-06,
"loss": 0.8683,
"step": 240
},
{
"epoch": 0.35260254257615736,
"grad_norm": 1.6216799441610352,
"learning_rate": 7.992710005059886e-06,
"loss": 0.8718,
"step": 245
},
{
"epoch": 0.3597985128328136,
"grad_norm": 1.4982517397299326,
"learning_rate": 7.990545235064588e-06,
"loss": 0.8491,
"step": 250
},
{
"epoch": 0.3669944830894699,
"grad_norm": 1.7915522189289803,
"learning_rate": 7.988099796428161e-06,
"loss": 0.8546,
"step": 255
},
{
"epoch": 0.37419045334612616,
"grad_norm": 1.6209807612052567,
"learning_rate": 7.985373861146636e-06,
"loss": 0.8112,
"step": 260
},
{
"epoch": 0.38138642360278247,
"grad_norm": 1.5354364067126935,
"learning_rate": 7.98236762094433e-06,
"loss": 0.8484,
"step": 265
},
{
"epoch": 0.3885823938594387,
"grad_norm": 1.7666681382505078,
"learning_rate": 7.979081287260356e-06,
"loss": 0.8752,
"step": 270
},
{
"epoch": 0.39577836411609496,
"grad_norm": 1.6458650430939799,
"learning_rate": 7.975515091233757e-06,
"loss": 0.8294,
"step": 275
},
{
"epoch": 0.40297433437275126,
"grad_norm": 1.7589737639189482,
"learning_rate": 7.971669283687252e-06,
"loss": 0.8269,
"step": 280
},
{
"epoch": 0.4101703046294075,
"grad_norm": 1.597742594232911,
"learning_rate": 7.967544135109583e-06,
"loss": 0.8873,
"step": 285
},
{
"epoch": 0.4173662748860638,
"grad_norm": 1.516759424294559,
"learning_rate": 7.963139935636505e-06,
"loss": 0.8162,
"step": 290
},
{
"epoch": 0.42456224514272006,
"grad_norm": 1.6997534481298684,
"learning_rate": 7.958456995030372e-06,
"loss": 0.8202,
"step": 295
},
{
"epoch": 0.43175821539937637,
"grad_norm": 1.726622337993047,
"learning_rate": 7.95349564265835e-06,
"loss": 0.8456,
"step": 300
},
{
"epoch": 0.4389541856560326,
"grad_norm": 1.647760739349251,
"learning_rate": 7.94825622746925e-06,
"loss": 0.8648,
"step": 305
},
{
"epoch": 0.4461501559126889,
"grad_norm": 1.7211951131144776,
"learning_rate": 7.942739117968995e-06,
"loss": 0.8272,
"step": 310
},
{
"epoch": 0.45334612616934516,
"grad_norm": 1.5335036085040443,
"learning_rate": 7.936944702194691e-06,
"loss": 0.878,
"step": 315
},
{
"epoch": 0.4605420964260014,
"grad_norm": 1.4848768316508538,
"learning_rate": 7.93087338768734e-06,
"loss": 0.8456,
"step": 320
},
{
"epoch": 0.4677380666826577,
"grad_norm": 1.5394168018437981,
"learning_rate": 7.924525601463173e-06,
"loss": 0.8427,
"step": 325
},
{
"epoch": 0.47493403693931396,
"grad_norm": 1.559312757501083,
"learning_rate": 7.91790178998362e-06,
"loss": 0.8202,
"step": 330
},
{
"epoch": 0.48213000719597027,
"grad_norm": 1.5792217882137543,
"learning_rate": 7.91100241912391e-06,
"loss": 0.8419,
"step": 335
},
{
"epoch": 0.4893259774526265,
"grad_norm": 1.738156211889958,
"learning_rate": 7.9038279741403e-06,
"loss": 0.8659,
"step": 340
},
{
"epoch": 0.4965219477092828,
"grad_norm": 1.7060742398951603,
"learning_rate": 7.896378959635946e-06,
"loss": 0.8564,
"step": 345
},
{
"epoch": 0.5037179179659391,
"grad_norm": 1.483857852582522,
"learning_rate": 7.888655899525413e-06,
"loss": 0.8122,
"step": 350
},
{
"epoch": 0.5109138882225953,
"grad_norm": 1.468389601692516,
"learning_rate": 7.880659336997833e-06,
"loss": 0.887,
"step": 355
},
{
"epoch": 0.5181098584792516,
"grad_norm": 1.5870295716868312,
"learning_rate": 7.872389834478688e-06,
"loss": 0.8813,
"step": 360
},
{
"epoch": 0.5253058287359079,
"grad_norm": 1.8895020621596357,
"learning_rate": 7.863847973590265e-06,
"loss": 0.8626,
"step": 365
},
{
"epoch": 0.5325017989925641,
"grad_norm": 1.550855518803781,
"learning_rate": 7.855034355110736e-06,
"loss": 0.8546,
"step": 370
},
{
"epoch": 0.5396977692492204,
"grad_norm": 1.5611172491745864,
"learning_rate": 7.845949598931918e-06,
"loss": 0.848,
"step": 375
},
{
"epoch": 0.5468937395058767,
"grad_norm": 1.5611908671622932,
"learning_rate": 7.836594344015661e-06,
"loss": 0.8738,
"step": 380
},
{
"epoch": 0.554089709762533,
"grad_norm": 1.5092812979555394,
"learning_rate": 7.826969248348915e-06,
"loss": 0.8693,
"step": 385
},
{
"epoch": 0.5612856800191892,
"grad_norm": 2.804211699082011,
"learning_rate": 7.817074988897446e-06,
"loss": 0.8373,
"step": 390
},
{
"epoch": 0.5684816502758455,
"grad_norm": 1.6322846593476272,
"learning_rate": 7.806912261558232e-06,
"loss": 0.8179,
"step": 395
},
{
"epoch": 0.5756776205325018,
"grad_norm": 1.537211750195004,
"learning_rate": 7.796481781110504e-06,
"loss": 0.8881,
"step": 400
},
{
"epoch": 0.5828735907891581,
"grad_norm": 1.6424496003416038,
"learning_rate": 7.785784281165491e-06,
"loss": 0.8285,
"step": 405
},
{
"epoch": 0.5900695610458143,
"grad_norm": 1.712983639226015,
"learning_rate": 7.774820514114804e-06,
"loss": 0.8471,
"step": 410
},
{
"epoch": 0.5972655313024706,
"grad_norm": 1.6543355090148368,
"learning_rate": 7.763591251077532e-06,
"loss": 0.8181,
"step": 415
},
{
"epoch": 0.6044615015591269,
"grad_norm": 1.7695569793499315,
"learning_rate": 7.752097281845998e-06,
"loss": 0.8317,
"step": 420
},
{
"epoch": 0.6116574718157831,
"grad_norm": 1.6408594582199851,
"learning_rate": 7.740339414830216e-06,
"loss": 0.8822,
"step": 425
},
{
"epoch": 0.6188534420724394,
"grad_norm": 1.532763883790403,
"learning_rate": 7.72831847700103e-06,
"loss": 0.8858,
"step": 430
},
{
"epoch": 0.6260494123290957,
"grad_norm": 1.5699531832045912,
"learning_rate": 7.71603531383195e-06,
"loss": 0.803,
"step": 435
},
{
"epoch": 0.633245382585752,
"grad_norm": 1.4023688398705096,
"learning_rate": 7.703490789239685e-06,
"loss": 0.8015,
"step": 440
},
{
"epoch": 0.6404413528424082,
"grad_norm": 1.743812894669404,
"learning_rate": 7.690685785523388e-06,
"loss": 0.8398,
"step": 445
},
{
"epoch": 0.6476373230990645,
"grad_norm": 1.5621879493455977,
"learning_rate": 7.677621203302591e-06,
"loss": 0.7979,
"step": 450
},
{
"epoch": 0.6548332933557208,
"grad_norm": 1.5193447362876678,
"learning_rate": 7.66429796145387e-06,
"loss": 0.8125,
"step": 455
},
{
"epoch": 0.662029263612377,
"grad_norm": 1.6795960658239877,
"learning_rate": 7.650716997046216e-06,
"loss": 0.8477,
"step": 460
},
{
"epoch": 0.6692252338690333,
"grad_norm": 1.585934440158675,
"learning_rate": 7.636879265275119e-06,
"loss": 0.845,
"step": 465
},
{
"epoch": 0.6764212041256896,
"grad_norm": 1.550239333699456,
"learning_rate": 7.622785739395397e-06,
"loss": 0.8723,
"step": 470
},
{
"epoch": 0.6836171743823459,
"grad_norm": 1.6342679772831266,
"learning_rate": 7.608437410652739e-06,
"loss": 0.8237,
"step": 475
},
{
"epoch": 0.6908131446390021,
"grad_norm": 1.9009580720490618,
"learning_rate": 7.593835288213984e-06,
"loss": 0.8525,
"step": 480
},
{
"epoch": 0.6980091148956584,
"grad_norm": 1.4797149380660173,
"learning_rate": 7.578980399096153e-06,
"loss": 0.8343,
"step": 485
},
{
"epoch": 0.7052050851523147,
"grad_norm": 1.6927806221547512,
"learning_rate": 7.5638737880942e-06,
"loss": 0.819,
"step": 490
},
{
"epoch": 0.712401055408971,
"grad_norm": 1.6132959543020267,
"learning_rate": 7.548516517707544e-06,
"loss": 0.8177,
"step": 495
},
{
"epoch": 0.7195970256656272,
"grad_norm": 1.5783090956116979,
"learning_rate": 7.532909668065329e-06,
"loss": 0.8217,
"step": 500
},
{
"epoch": 0.7267929959222835,
"grad_norm": 1.439090732947757,
"learning_rate": 7.517054336850457e-06,
"loss": 0.8617,
"step": 505
},
{
"epoch": 0.7339889661789398,
"grad_norm": 1.906650403594057,
"learning_rate": 7.500951639222389e-06,
"loss": 0.8427,
"step": 510
},
{
"epoch": 0.741184936435596,
"grad_norm": 1.5718009905007941,
"learning_rate": 7.484602707738707e-06,
"loss": 0.9079,
"step": 515
},
{
"epoch": 0.7483809066922523,
"grad_norm": 1.716197310859666,
"learning_rate": 7.468008692275457e-06,
"loss": 0.8278,
"step": 520
},
{
"epoch": 0.7555768769489086,
"grad_norm": 1.5485542395076963,
"learning_rate": 7.45117075994628e-06,
"loss": 0.8326,
"step": 525
},
{
"epoch": 0.7627728472055649,
"grad_norm": 1.4519889463898326,
"learning_rate": 7.434090095020318e-06,
"loss": 0.7923,
"step": 530
},
{
"epoch": 0.7699688174622211,
"grad_norm": 1.4499255632268135,
"learning_rate": 7.416767898838926e-06,
"loss": 0.8449,
"step": 535
},
{
"epoch": 0.7771647877188774,
"grad_norm": 1.498217506498266,
"learning_rate": 7.399205389731172e-06,
"loss": 0.8462,
"step": 540
},
{
"epoch": 0.7843607579755337,
"grad_norm": 1.441416826072204,
"learning_rate": 7.381403802928153e-06,
"loss": 0.7864,
"step": 545
},
{
"epoch": 0.7915567282321899,
"grad_norm": 1.5728216952970062,
"learning_rate": 7.363364390476114e-06,
"loss": 0.779,
"step": 550
},
{
"epoch": 0.7987526984888462,
"grad_norm": 1.579757123001059,
"learning_rate": 7.34508842114839e-06,
"loss": 0.8342,
"step": 555
},
{
"epoch": 0.8059486687455025,
"grad_norm": 1.5204038228338386,
"learning_rate": 7.326577180356162e-06,
"loss": 0.8202,
"step": 560
},
{
"epoch": 0.8131446390021588,
"grad_norm": 1.6070382704422732,
"learning_rate": 7.30783197005806e-06,
"loss": 0.7948,
"step": 565
},
{
"epoch": 0.820340609258815,
"grad_norm": 1.4952361279508235,
"learning_rate": 7.288854108668586e-06,
"loss": 0.8451,
"step": 570
},
{
"epoch": 0.8275365795154713,
"grad_norm": 1.4373323975649135,
"learning_rate": 7.2696449309653795e-06,
"loss": 0.8381,
"step": 575
},
{
"epoch": 0.8347325497721276,
"grad_norm": 1.292833703910724,
"learning_rate": 7.250205787995353e-06,
"loss": 0.8286,
"step": 580
},
{
"epoch": 0.8419285200287839,
"grad_norm": 1.2885838799591653,
"learning_rate": 7.230538046979654e-06,
"loss": 0.8506,
"step": 585
},
{
"epoch": 0.8491244902854401,
"grad_norm": 1.4134558176619048,
"learning_rate": 7.210643091217513e-06,
"loss": 0.8411,
"step": 590
},
{
"epoch": 0.8563204605420964,
"grad_norm": 1.6657530487665673,
"learning_rate": 7.1905223199889425e-06,
"loss": 0.834,
"step": 595
},
{
"epoch": 0.8635164307987527,
"grad_norm": 1.7801521918282026,
"learning_rate": 7.170177148456331e-06,
"loss": 0.8461,
"step": 600
},
{
"epoch": 0.8707124010554089,
"grad_norm": 1.5712149389587902,
"learning_rate": 7.149609007564903e-06,
"loss": 0.8683,
"step": 605
},
{
"epoch": 0.8779083713120652,
"grad_norm": 1.5017107860836532,
"learning_rate": 7.128819343942077e-06,
"loss": 0.8442,
"step": 610
},
{
"epoch": 0.8851043415687215,
"grad_norm": 1.4727249376876361,
"learning_rate": 7.107809619795722e-06,
"loss": 0.8668,
"step": 615
},
{
"epoch": 0.8923003118253778,
"grad_norm": 1.428036074094576,
"learning_rate": 7.086581312811309e-06,
"loss": 0.773,
"step": 620
},
{
"epoch": 0.899496282082034,
"grad_norm": 1.459125555270072,
"learning_rate": 7.065135916047992e-06,
"loss": 0.8551,
"step": 625
},
{
"epoch": 0.9066922523386903,
"grad_norm": 1.4437358399730527,
"learning_rate": 7.043474937833581e-06,
"loss": 0.8055,
"step": 630
},
{
"epoch": 0.9138882225953466,
"grad_norm": 1.6487138533805716,
"learning_rate": 7.021599901658467e-06,
"loss": 0.8162,
"step": 635
},
{
"epoch": 0.9210841928520028,
"grad_norm": 1.5886402538030315,
"learning_rate": 6.999512346068467e-06,
"loss": 0.8472,
"step": 640
},
{
"epoch": 0.9282801631086591,
"grad_norm": 1.580701659838036,
"learning_rate": 6.977213824556613e-06,
"loss": 0.8185,
"step": 645
},
{
"epoch": 0.9354761333653154,
"grad_norm": 1.3791827207697653,
"learning_rate": 6.95470590545389e-06,
"loss": 0.8424,
"step": 650
},
{
"epoch": 0.9426721036219717,
"grad_norm": 1.5248977943916064,
"learning_rate": 6.931990171818923e-06,
"loss": 0.8829,
"step": 655
},
{
"epoch": 0.9498680738786279,
"grad_norm": 1.4962085290443874,
"learning_rate": 6.909068221326647e-06,
"loss": 0.8236,
"step": 660
},
{
"epoch": 0.9570640441352842,
"grad_norm": 1.6074178928012908,
"learning_rate": 6.88594166615593e-06,
"loss": 0.8165,
"step": 665
},
{
"epoch": 0.9642600143919405,
"grad_norm": 1.538150422370725,
"learning_rate": 6.8626121328761824e-06,
"loss": 0.8155,
"step": 670
},
{
"epoch": 0.9714559846485968,
"grad_norm": 1.4261928133080215,
"learning_rate": 6.839081262332957e-06,
"loss": 0.8271,
"step": 675
},
{
"epoch": 0.978651954905253,
"grad_norm": 1.4811633224353407,
"learning_rate": 6.815350709532544e-06,
"loss": 0.8417,
"step": 680
},
{
"epoch": 0.9858479251619093,
"grad_norm": 1.5008044573312285,
"learning_rate": 6.791422143525564e-06,
"loss": 0.859,
"step": 685
},
{
"epoch": 0.9930438954185656,
"grad_norm": 1.5977699325206687,
"learning_rate": 6.767297247289585e-06,
"loss": 0.8663,
"step": 690
},
{
"epoch": 1.0002398656752218,
"grad_norm": 2.198059715592546,
"learning_rate": 6.742977717610744e-06,
"loss": 0.8427,
"step": 695
},
{
"epoch": 1.0074358359318782,
"grad_norm": 1.447486858474852,
"learning_rate": 6.718465264964414e-06,
"loss": 0.5445,
"step": 700
},
{
"epoch": 1.0146318061885344,
"grad_norm": 1.5387360085110118,
"learning_rate": 6.693761613394899e-06,
"loss": 0.5585,
"step": 705
},
{
"epoch": 1.0218277764451906,
"grad_norm": 1.3617154485025116,
"learning_rate": 6.668868500394172e-06,
"loss": 0.4605,
"step": 710
},
{
"epoch": 1.029023746701847,
"grad_norm": 1.433780770938119,
"learning_rate": 6.643787676779671e-06,
"loss": 0.5254,
"step": 715
},
{
"epoch": 1.0362197169585032,
"grad_norm": 1.4766138665890287,
"learning_rate": 6.618520906571171e-06,
"loss": 0.476,
"step": 720
},
{
"epoch": 1.0434156872151594,
"grad_norm": 1.4473624764279305,
"learning_rate": 6.593069966866694e-06,
"loss": 0.5404,
"step": 725
},
{
"epoch": 1.0506116574718158,
"grad_norm": 1.4713255574009756,
"learning_rate": 6.567436647717535e-06,
"loss": 0.5293,
"step": 730
},
{
"epoch": 1.057807627728472,
"grad_norm": 1.3099551452546936,
"learning_rate": 6.541622752002355e-06,
"loss": 0.5168,
"step": 735
},
{
"epoch": 1.0650035979851282,
"grad_norm": 1.4756303619125208,
"learning_rate": 6.515630095300383e-06,
"loss": 0.5253,
"step": 740
},
{
"epoch": 1.0721995682417846,
"grad_norm": 1.2607903619444512,
"learning_rate": 6.489460505763713e-06,
"loss": 0.5203,
"step": 745
},
{
"epoch": 1.0793955384984408,
"grad_norm": 1.2668860008648557,
"learning_rate": 6.463115823988732e-06,
"loss": 0.5133,
"step": 750
},
{
"epoch": 1.0865915087550972,
"grad_norm": 1.5445317289615172,
"learning_rate": 6.436597902886655e-06,
"loss": 0.5399,
"step": 755
},
{
"epoch": 1.0937874790117534,
"grad_norm": 1.373346882134398,
"learning_rate": 6.409908607553217e-06,
"loss": 0.4742,
"step": 760
},
{
"epoch": 1.1009834492684096,
"grad_norm": 1.2423912396823111,
"learning_rate": 6.38304981513748e-06,
"loss": 0.4928,
"step": 765
},
{
"epoch": 1.108179419525066,
"grad_norm": 1.564288911463839,
"learning_rate": 6.3560234147098155e-06,
"loss": 0.509,
"step": 770
},
{
"epoch": 1.1153753897817222,
"grad_norm": 1.592002787628597,
"learning_rate": 6.328831307129039e-06,
"loss": 0.5373,
"step": 775
},
{
"epoch": 1.1225713600383784,
"grad_norm": 1.2953958374191832,
"learning_rate": 6.30147540490871e-06,
"loss": 0.5053,
"step": 780
},
{
"epoch": 1.1297673302950348,
"grad_norm": 1.3851707132475586,
"learning_rate": 6.27395763208263e-06,
"loss": 0.5138,
"step": 785
},
{
"epoch": 1.136963300551691,
"grad_norm": 1.5083938363704992,
"learning_rate": 6.246279924069504e-06,
"loss": 0.4639,
"step": 790
},
{
"epoch": 1.1441592708083472,
"grad_norm": 1.4583984595795128,
"learning_rate": 6.218444227536832e-06,
"loss": 0.509,
"step": 795
},
{
"epoch": 1.1513552410650036,
"grad_norm": 1.1915713777559744,
"learning_rate": 6.190452500263975e-06,
"loss": 0.4771,
"step": 800
},
{
"epoch": 1.1585512113216598,
"grad_norm": 1.2538864728177044,
"learning_rate": 6.162306711004474e-06,
"loss": 0.4927,
"step": 805
},
{
"epoch": 1.165747181578316,
"grad_norm": 1.3142042016311857,
"learning_rate": 6.134008839347575e-06,
"loss": 0.4884,
"step": 810
},
{
"epoch": 1.1729431518349724,
"grad_norm": 1.232777503769632,
"learning_rate": 6.105560875578994e-06,
"loss": 0.5273,
"step": 815
},
{
"epoch": 1.1801391220916286,
"grad_norm": 1.502479848796588,
"learning_rate": 6.076964820540937e-06,
"loss": 0.5086,
"step": 820
},
{
"epoch": 1.187335092348285,
"grad_norm": 1.5121283948236117,
"learning_rate": 6.048222685491374e-06,
"loss": 0.5374,
"step": 825
},
{
"epoch": 1.1945310626049412,
"grad_norm": 1.8544252686881426,
"learning_rate": 6.019336491962581e-06,
"loss": 0.5381,
"step": 830
},
{
"epoch": 1.2017270328615974,
"grad_norm": 1.3559009500960952,
"learning_rate": 5.990308271618956e-06,
"loss": 0.4939,
"step": 835
},
{
"epoch": 1.2089230031182538,
"grad_norm": 1.6991974082062777,
"learning_rate": 5.961140066114128e-06,
"loss": 0.5429,
"step": 840
},
{
"epoch": 1.21611897337491,
"grad_norm": 1.40613574888153,
"learning_rate": 5.931833926947358e-06,
"loss": 0.4778,
"step": 845
},
{
"epoch": 1.2233149436315662,
"grad_norm": 1.291827429944612,
"learning_rate": 5.902391915319252e-06,
"loss": 0.4604,
"step": 850
},
{
"epoch": 1.2305109138882226,
"grad_norm": 1.5403421830718962,
"learning_rate": 5.872816101986789e-06,
"loss": 0.4993,
"step": 855
},
{
"epoch": 1.2377068841448788,
"grad_norm": 1.295984623620993,
"learning_rate": 5.843108567117678e-06,
"loss": 0.4972,
"step": 860
},
{
"epoch": 1.244902854401535,
"grad_norm": 1.372473134492806,
"learning_rate": 5.813271400144051e-06,
"loss": 0.5199,
"step": 865
},
{
"epoch": 1.2520988246581914,
"grad_norm": 1.3747684408273264,
"learning_rate": 5.783306699615512e-06,
"loss": 0.5136,
"step": 870
},
{
"epoch": 1.2592947949148476,
"grad_norm": 1.3792887268238399,
"learning_rate": 5.753216573051526e-06,
"loss": 0.5045,
"step": 875
},
{
"epoch": 1.266490765171504,
"grad_norm": 1.6127625261536036,
"learning_rate": 5.723003136793208e-06,
"loss": 0.5003,
"step": 880
},
{
"epoch": 1.2736867354281602,
"grad_norm": 1.4440630275399904,
"learning_rate": 5.692668515854457e-06,
"loss": 0.4521,
"step": 885
},
{
"epoch": 1.2808827056848164,
"grad_norm": 1.5683931030948375,
"learning_rate": 5.662214843772506e-06,
"loss": 0.5435,
"step": 890
},
{
"epoch": 1.2880786759414729,
"grad_norm": 1.4069551760135997,
"learning_rate": 5.631644262457861e-06,
"loss": 0.5326,
"step": 895
},
{
"epoch": 1.295274646198129,
"grad_norm": 1.3933443999205188,
"learning_rate": 5.600958922043651e-06,
"loss": 0.4905,
"step": 900
},
{
"epoch": 1.3024706164547855,
"grad_norm": 2.0908428594407344,
"learning_rate": 5.570160980734405e-06,
"loss": 0.4444,
"step": 905
},
{
"epoch": 1.3096665867114416,
"grad_norm": 1.6732069684734259,
"learning_rate": 5.539252604654256e-06,
"loss": 0.5535,
"step": 910
},
{
"epoch": 1.3168625569680978,
"grad_norm": 1.4666496678005971,
"learning_rate": 5.50823596769459e-06,
"loss": 0.4977,
"step": 915
},
{
"epoch": 1.324058527224754,
"grad_norm": 1.3633392168469545,
"learning_rate": 5.477113251361149e-06,
"loss": 0.5118,
"step": 920
},
{
"epoch": 1.3312544974814104,
"grad_norm": 1.2762316788464472,
"learning_rate": 5.445886644620601e-06,
"loss": 0.5136,
"step": 925
},
{
"epoch": 1.3384504677380666,
"grad_norm": 1.5424740001396617,
"learning_rate": 5.414558343746579e-06,
"loss": 0.4926,
"step": 930
},
{
"epoch": 1.345646437994723,
"grad_norm": 1.3847453947317292,
"learning_rate": 5.38313055216521e-06,
"loss": 0.5458,
"step": 935
},
{
"epoch": 1.3528424082513792,
"grad_norm": 1.3877715039925074,
"learning_rate": 5.351605480300143e-06,
"loss": 0.4637,
"step": 940
},
{
"epoch": 1.3600383785080354,
"grad_norm": 1.3969957173831602,
"learning_rate": 5.319985345417079e-06,
"loss": 0.4787,
"step": 945
},
{
"epoch": 1.3672343487646919,
"grad_norm": 1.669129051283974,
"learning_rate": 5.288272371467827e-06,
"loss": 0.484,
"step": 950
},
{
"epoch": 1.374430319021348,
"grad_norm": 1.3646255552767974,
"learning_rate": 5.256468788933881e-06,
"loss": 0.4782,
"step": 955
},
{
"epoch": 1.3816262892780042,
"grad_norm": 1.384029909257828,
"learning_rate": 5.2245768346695494e-06,
"loss": 0.5021,
"step": 960
},
{
"epoch": 1.3888222595346607,
"grad_norm": 1.3642725031029292,
"learning_rate": 5.192598751744621e-06,
"loss": 0.476,
"step": 965
},
{
"epoch": 1.3960182297913168,
"grad_norm": 1.4324224845783868,
"learning_rate": 5.160536789286612e-06,
"loss": 0.4966,
"step": 970
},
{
"epoch": 1.403214200047973,
"grad_norm": 1.3426650482840705,
"learning_rate": 5.128393202322565e-06,
"loss": 0.5116,
"step": 975
},
{
"epoch": 1.4104101703046295,
"grad_norm": 1.4860297463165402,
"learning_rate": 5.096170251620458e-06,
"loss": 0.512,
"step": 980
},
{
"epoch": 1.4176061405612856,
"grad_norm": 1.3809734192523142,
"learning_rate": 5.063870203530188e-06,
"loss": 0.5128,
"step": 985
},
{
"epoch": 1.424802110817942,
"grad_norm": 1.61363323216821,
"learning_rate": 5.031495329824175e-06,
"loss": 0.5342,
"step": 990
},
{
"epoch": 1.4319980810745983,
"grad_norm": 2.088959286090713,
"learning_rate": 4.999047907537582e-06,
"loss": 0.489,
"step": 995
},
{
"epoch": 1.4391940513312544,
"grad_norm": 1.5116312432174273,
"learning_rate": 4.966530218808157e-06,
"loss": 0.4968,
"step": 1000
},
{
"epoch": 1.4463900215879106,
"grad_norm": 1.4453482163933629,
"learning_rate": 4.933944550715725e-06,
"loss": 0.5297,
"step": 1005
},
{
"epoch": 1.453585991844567,
"grad_norm": 1.5874368851505785,
"learning_rate": 4.901293195121338e-06,
"loss": 0.5005,
"step": 1010
},
{
"epoch": 1.4607819621012232,
"grad_norm": 1.5140612281307557,
"learning_rate": 4.868578448506067e-06,
"loss": 0.5425,
"step": 1015
},
{
"epoch": 1.4679779323578797,
"grad_norm": 1.5247613079347937,
"learning_rate": 4.835802611809492e-06,
"loss": 0.5246,
"step": 1020
},
{
"epoch": 1.4751739026145358,
"grad_norm": 1.4922253999621244,
"learning_rate": 4.802967990267867e-06,
"loss": 0.5129,
"step": 1025
},
{
"epoch": 1.482369872871192,
"grad_norm": 1.5098918292072203,
"learning_rate": 4.770076893251986e-06,
"loss": 0.5239,
"step": 1030
},
{
"epoch": 1.4895658431278485,
"grad_norm": 1.6715329520651785,
"learning_rate": 4.7371316341047484e-06,
"loss": 0.5659,
"step": 1035
},
{
"epoch": 1.4967618133845046,
"grad_norm": 1.4935236860112138,
"learning_rate": 4.704134529978471e-06,
"loss": 0.4914,
"step": 1040
},
{
"epoch": 1.503957783641161,
"grad_norm": 1.4122572036404084,
"learning_rate": 4.671087901671899e-06,
"loss": 0.4798,
"step": 1045
},
{
"epoch": 1.5111537538978173,
"grad_norm": 1.4681688735493286,
"learning_rate": 4.637994073466981e-06,
"loss": 0.5051,
"step": 1050
},
{
"epoch": 1.5183497241544734,
"grad_norm": 1.44942902813713,
"learning_rate": 4.604855372965394e-06,
"loss": 0.539,
"step": 1055
},
{
"epoch": 1.5255456944111296,
"grad_norm": 1.236269924531647,
"learning_rate": 4.5716741309248445e-06,
"loss": 0.5305,
"step": 1060
},
{
"epoch": 1.532741664667786,
"grad_norm": 1.6240033228942266,
"learning_rate": 4.538452681095123e-06,
"loss": 0.5531,
"step": 1065
},
{
"epoch": 1.5399376349244425,
"grad_norm": 1.5622316143038788,
"learning_rate": 4.5051933600539705e-06,
"loss": 0.494,
"step": 1070
},
{
"epoch": 1.5471336051810987,
"grad_norm": 1.5638109251408436,
"learning_rate": 4.471898507042745e-06,
"loss": 0.533,
"step": 1075
},
{
"epoch": 1.5543295754377549,
"grad_norm": 1.5029077042303538,
"learning_rate": 4.438570463801884e-06,
"loss": 0.513,
"step": 1080
},
{
"epoch": 1.561525545694411,
"grad_norm": 1.372369046315647,
"learning_rate": 4.405211574406209e-06,
"loss": 0.4698,
"step": 1085
},
{
"epoch": 1.5687215159510672,
"grad_norm": 1.6322850083649267,
"learning_rate": 4.371824185100054e-06,
"loss": 0.4607,
"step": 1090
},
{
"epoch": 1.5759174862077237,
"grad_norm": 1.6038884982185644,
"learning_rate": 4.338410644132256e-06,
"loss": 0.4918,
"step": 1095
},
{
"epoch": 1.58311345646438,
"grad_norm": 1.510163759577701,
"learning_rate": 4.304973301590977e-06,
"loss": 0.5141,
"step": 1100
},
{
"epoch": 1.5903094267210363,
"grad_norm": 1.3447962919620353,
"learning_rate": 4.271514509238434e-06,
"loss": 0.5719,
"step": 1105
},
{
"epoch": 1.5975053969776924,
"grad_norm": 1.4095809553505452,
"learning_rate": 4.238036620345477e-06,
"loss": 0.5378,
"step": 1110
},
{
"epoch": 1.6047013672343486,
"grad_norm": 1.5471600021180223,
"learning_rate": 4.204541989526083e-06,
"loss": 0.5159,
"step": 1115
},
{
"epoch": 1.611897337491005,
"grad_norm": 1.3407123732434036,
"learning_rate": 4.171032972571744e-06,
"loss": 0.514,
"step": 1120
},
{
"epoch": 1.6190933077476615,
"grad_norm": 1.5587421912806174,
"learning_rate": 4.137511926285779e-06,
"loss": 0.4943,
"step": 1125
},
{
"epoch": 1.6262892780043177,
"grad_norm": 1.5228893274275985,
"learning_rate": 4.103981208317571e-06,
"loss": 0.5161,
"step": 1130
},
{
"epoch": 1.6334852482609739,
"grad_norm": 1.3816307683209126,
"learning_rate": 4.070443176996745e-06,
"loss": 0.5036,
"step": 1135
},
{
"epoch": 1.64068121851763,
"grad_norm": 1.551360790563111,
"learning_rate": 4.036900191167301e-06,
"loss": 0.4973,
"step": 1140
},
{
"epoch": 1.6478771887742862,
"grad_norm": 1.3412086458854404,
"learning_rate": 4.003354610021701e-06,
"loss": 0.5029,
"step": 1145
},
{
"epoch": 1.6550731590309427,
"grad_norm": 1.4793955535305545,
"learning_rate": 3.96980879293495e-06,
"loss": 0.4925,
"step": 1150
},
{
"epoch": 1.662269129287599,
"grad_norm": 1.2902630164808577,
"learning_rate": 3.9362650992986465e-06,
"loss": 0.4906,
"step": 1155
},
{
"epoch": 1.6694650995442553,
"grad_norm": 1.5152262911206174,
"learning_rate": 3.902725888355037e-06,
"loss": 0.5019,
"step": 1160
},
{
"epoch": 1.6766610698009115,
"grad_norm": 1.577445286461562,
"learning_rate": 3.869193519031086e-06,
"loss": 0.49,
"step": 1165
},
{
"epoch": 1.6838570400575676,
"grad_norm": 1.4532186104713023,
"learning_rate": 3.835670349772566e-06,
"loss": 0.47,
"step": 1170
},
{
"epoch": 1.691053010314224,
"grad_norm": 1.2527595036522912,
"learning_rate": 3.802158738378176e-06,
"loss": 0.4508,
"step": 1175
},
{
"epoch": 1.6982489805708805,
"grad_norm": 1.4942224309571124,
"learning_rate": 3.7686610418337083e-06,
"loss": 0.5039,
"step": 1180
},
{
"epoch": 1.7054449508275367,
"grad_norm": 1.72911326258422,
"learning_rate": 3.7351796161462796e-06,
"loss": 0.4808,
"step": 1185
},
{
"epoch": 1.7126409210841929,
"grad_norm": 1.3747290591542072,
"learning_rate": 3.7017168161786215e-06,
"loss": 0.4993,
"step": 1190
},
{
"epoch": 1.719836891340849,
"grad_norm": 1.4764634140281585,
"learning_rate": 3.6682749954834548e-06,
"loss": 0.5115,
"step": 1195
},
{
"epoch": 1.7270328615975052,
"grad_norm": 1.6701936544900278,
"learning_rate": 3.634856506137956e-06,
"loss": 0.5653,
"step": 1200
},
{
"epoch": 1.7342288318541617,
"grad_norm": 1.5206960936360632,
"learning_rate": 3.6014636985783287e-06,
"loss": 0.521,
"step": 1205
},
{
"epoch": 1.741424802110818,
"grad_norm": 1.5138010545314067,
"learning_rate": 3.568098921434488e-06,
"loss": 0.4856,
"step": 1210
},
{
"epoch": 1.7486207723674743,
"grad_norm": 1.5508467662920749,
"learning_rate": 3.534764521364879e-06,
"loss": 0.4846,
"step": 1215
},
{
"epoch": 1.7558167426241305,
"grad_norm": 1.294214634658577,
"learning_rate": 3.501462842891418e-06,
"loss": 0.4876,
"step": 1220
},
{
"epoch": 1.7630127128807866,
"grad_norm": 1.4066532330498682,
"learning_rate": 3.4681962282346023e-06,
"loss": 0.4644,
"step": 1225
},
{
"epoch": 1.770208683137443,
"grad_norm": 1.4435875920097863,
"learning_rate": 3.4349670171487714e-06,
"loss": 0.5199,
"step": 1230
},
{
"epoch": 1.7774046533940993,
"grad_norm": 1.5007291535231901,
"learning_rate": 3.4017775467575446e-06,
"loss": 0.5224,
"step": 1235
},
{
"epoch": 1.7846006236507557,
"grad_norm": 1.2699334841273793,
"learning_rate": 3.3686301513894416e-06,
"loss": 0.4914,
"step": 1240
},
{
"epoch": 1.7917965939074119,
"grad_norm": 1.2195698381350315,
"learning_rate": 3.3355271624137037e-06,
"loss": 0.4719,
"step": 1245
},
{
"epoch": 1.798992564164068,
"grad_norm": 1.3895500926839512,
"learning_rate": 3.3024709080763186e-06,
"loss": 0.5144,
"step": 1250
},
{
"epoch": 1.8061885344207242,
"grad_norm": 1.48913348325029,
"learning_rate": 3.269463713336268e-06,
"loss": 0.5103,
"step": 1255
},
{
"epoch": 1.8133845046773807,
"grad_norm": 1.3658603273721261,
"learning_rate": 3.236507899702005e-06,
"loss": 0.473,
"step": 1260
},
{
"epoch": 1.820580474934037,
"grad_norm": 1.3775281668274946,
"learning_rate": 3.2036057850681745e-06,
"loss": 0.514,
"step": 1265
},
{
"epoch": 1.8277764451906933,
"grad_norm": 1.631774330401289,
"learning_rate": 3.170759683552586e-06,
"loss": 0.5163,
"step": 1270
},
{
"epoch": 1.8349724154473495,
"grad_norm": 1.384315211463836,
"learning_rate": 3.137971905333458e-06,
"loss": 0.4752,
"step": 1275
},
{
"epoch": 1.8421683857040057,
"grad_norm": 1.4207130788508293,
"learning_rate": 3.1052447564869343e-06,
"loss": 0.5018,
"step": 1280
},
{
"epoch": 1.849364355960662,
"grad_norm": 1.5148685580490273,
"learning_rate": 3.0725805388248834e-06,
"loss": 0.5127,
"step": 1285
},
{
"epoch": 1.8565603262173183,
"grad_norm": 1.4919953654248346,
"learning_rate": 3.039981549733014e-06,
"loss": 0.4971,
"step": 1290
},
{
"epoch": 1.8637562964739747,
"grad_norm": 2.065773895317616,
"learning_rate": 3.007450082009283e-06,
"loss": 0.4843,
"step": 1295
},
{
"epoch": 1.8709522667306309,
"grad_norm": 1.4849907079900204,
"learning_rate": 2.9749884237026426e-06,
"loss": 0.5102,
"step": 1300
},
{
"epoch": 1.878148236987287,
"grad_norm": 1.7567340972637704,
"learning_rate": 2.9425988579521103e-06,
"loss": 0.4901,
"step": 1305
},
{
"epoch": 1.8853442072439432,
"grad_norm": 1.4271802301503538,
"learning_rate": 2.910283662826188e-06,
"loss": 0.4805,
"step": 1310
},
{
"epoch": 1.8925401775005997,
"grad_norm": 1.479518679920681,
"learning_rate": 2.8780451111626384e-06,
"loss": 0.4908,
"step": 1315
},
{
"epoch": 1.899736147757256,
"grad_norm": 1.5062976034854971,
"learning_rate": 2.8458854704086275e-06,
"loss": 0.491,
"step": 1320
},
{
"epoch": 1.9069321180139123,
"grad_norm": 1.552858614788501,
"learning_rate": 2.8138070024612504e-06,
"loss": 0.4787,
"step": 1325
},
{
"epoch": 1.9141280882705685,
"grad_norm": 1.668691870366938,
"learning_rate": 2.7818119635084392e-06,
"loss": 0.536,
"step": 1330
},
{
"epoch": 1.9213240585272247,
"grad_norm": 1.4857517196291667,
"learning_rate": 2.749902603870283e-06,
"loss": 0.5047,
"step": 1335
},
{
"epoch": 1.928520028783881,
"grad_norm": 2.9875786042021932,
"learning_rate": 2.7180811678407525e-06,
"loss": 0.504,
"step": 1340
},
{
"epoch": 1.9357159990405373,
"grad_norm": 1.3595766267302944,
"learning_rate": 2.686349893529849e-06,
"loss": 0.4863,
"step": 1345
},
{
"epoch": 1.9429119692971937,
"grad_norm": 1.4889142920291538,
"learning_rate": 2.6547110127061975e-06,
"loss": 0.4926,
"step": 1350
},
{
"epoch": 1.9501079395538499,
"grad_norm": 1.451668900558599,
"learning_rate": 2.6231667506400706e-06,
"loss": 0.4984,
"step": 1355
},
{
"epoch": 1.957303909810506,
"grad_norm": 1.47137694607456,
"learning_rate": 2.591719325946883e-06,
"loss": 0.5209,
"step": 1360
},
{
"epoch": 1.9644998800671623,
"grad_norm": 1.5284770765800149,
"learning_rate": 2.560370950431146e-06,
"loss": 0.4603,
"step": 1365
},
{
"epoch": 1.9716958503238187,
"grad_norm": 1.402896466872614,
"learning_rate": 2.5291238289309054e-06,
"loss": 0.5077,
"step": 1370
},
{
"epoch": 1.978891820580475,
"grad_norm": 1.4817926638302614,
"learning_rate": 2.497980159162667e-06,
"loss": 0.4839,
"step": 1375
},
{
"epoch": 1.9860877908371313,
"grad_norm": 1.5453436757112435,
"learning_rate": 2.466942131566824e-06,
"loss": 0.4888,
"step": 1380
},
{
"epoch": 1.9932837610937875,
"grad_norm": 1.4335485637084342,
"learning_rate": 2.4360119291535955e-06,
"loss": 0.4917,
"step": 1385
},
{
"epoch": 2.0004797313504437,
"grad_norm": 1.6143369616539034,
"learning_rate": 2.405191727349489e-06,
"loss": 0.4993,
"step": 1390
},
{
"epoch": 2.0076757016071,
"grad_norm": 1.2224443867251211,
"learning_rate": 2.3744836938442936e-06,
"loss": 0.2088,
"step": 1395
},
{
"epoch": 2.0148716718637565,
"grad_norm": 1.2602299678447657,
"learning_rate": 2.3438899884386185e-06,
"loss": 0.1941,
"step": 1400
},
{
"epoch": 2.0148716718637565,
"eval_loss": 0.9261869192123413,
"eval_runtime": 740.3886,
"eval_samples_per_second": 10.008,
"eval_steps_per_second": 0.627,
"step": 1400
},
{
"epoch": 2.0220676421204127,
"grad_norm": 1.323505095452205,
"learning_rate": 2.3134127628919927e-06,
"loss": 0.1915,
"step": 1405
},
{
"epoch": 2.029263612377069,
"grad_norm": 1.2958926807268987,
"learning_rate": 2.2830541607715136e-06,
"loss": 0.1736,
"step": 1410
},
{
"epoch": 2.036459582633725,
"grad_norm": 1.271545256175082,
"learning_rate": 2.2528163173010927e-06,
"loss": 0.1845,
"step": 1415
},
{
"epoch": 2.0436555528903813,
"grad_norm": 1.5029993532268295,
"learning_rate": 2.2227013592112757e-06,
"loss": 0.1893,
"step": 1420
},
{
"epoch": 2.0508515231470374,
"grad_norm": 1.2921857666544403,
"learning_rate": 2.192711404589658e-06,
"loss": 0.1958,
"step": 1425
},
{
"epoch": 2.058047493403694,
"grad_norm": 1.2460289218576504,
"learning_rate": 2.162848562731916e-06,
"loss": 0.1994,
"step": 1430
},
{
"epoch": 2.0652434636603503,
"grad_norm": 1.2881623243419067,
"learning_rate": 2.133114933993452e-06,
"loss": 0.1935,
"step": 1435
},
{
"epoch": 2.0724394339170065,
"grad_norm": 1.1792218418956621,
"learning_rate": 2.1035126096416704e-06,
"loss": 0.1951,
"step": 1440
},
{
"epoch": 2.0796354041736627,
"grad_norm": 1.284870948942911,
"learning_rate": 2.07404367170889e-06,
"loss": 0.1948,
"step": 1445
},
{
"epoch": 2.086831374430319,
"grad_norm": 1.2222749381574636,
"learning_rate": 2.0447101928459083e-06,
"loss": 0.1927,
"step": 1450
},
{
"epoch": 2.0940273446869755,
"grad_norm": 1.4089391140981338,
"learning_rate": 2.0155142361762256e-06,
"loss": 0.1553,
"step": 1455
},
{
"epoch": 2.1012233149436317,
"grad_norm": 1.1670069976157664,
"learning_rate": 1.986457855150937e-06,
"loss": 0.1882,
"step": 1460
},
{
"epoch": 2.108419285200288,
"grad_norm": 1.20035808468667,
"learning_rate": 1.957543093404309e-06,
"loss": 0.1723,
"step": 1465
},
{
"epoch": 2.115615255456944,
"grad_norm": 1.2485627377889825,
"learning_rate": 1.9287719846100366e-06,
"loss": 0.1841,
"step": 1470
},
{
"epoch": 2.1228112257136003,
"grad_norm": 1.4758861123400375,
"learning_rate": 1.900146552338222e-06,
"loss": 0.1989,
"step": 1475
},
{
"epoch": 2.1300071959702565,
"grad_norm": 1.415660377393989,
"learning_rate": 1.8716688099130336e-06,
"loss": 0.1792,
"step": 1480
},
{
"epoch": 2.137203166226913,
"grad_norm": 1.1398390415745234,
"learning_rate": 1.8433407602711122e-06,
"loss": 0.1828,
"step": 1485
},
{
"epoch": 2.1443991364835693,
"grad_norm": 1.436825768706905,
"learning_rate": 1.8151643958206963e-06,
"loss": 0.1873,
"step": 1490
},
{
"epoch": 2.1515951067402255,
"grad_norm": 1.2111903866598819,
"learning_rate": 1.7871416983014864e-06,
"loss": 0.1747,
"step": 1495
},
{
"epoch": 2.1587910769968817,
"grad_norm": 1.592322648486121,
"learning_rate": 1.7592746386452641e-06,
"loss": 0.1981,
"step": 1500
},
{
"epoch": 2.165987047253538,
"grad_norm": 1.3381476033081696,
"learning_rate": 1.7315651768372734e-06,
"loss": 0.1752,
"step": 1505
},
{
"epoch": 2.1731830175101945,
"grad_norm": 1.5243125399513529,
"learning_rate": 1.7040152617783607e-06,
"loss": 0.1797,
"step": 1510
},
{
"epoch": 2.1803789877668507,
"grad_norm": 1.5148343638714192,
"learning_rate": 1.6766268311479078e-06,
"loss": 0.193,
"step": 1515
},
{
"epoch": 2.187574958023507,
"grad_norm": 1.315102374687142,
"learning_rate": 1.649401811267546e-06,
"loss": 0.1889,
"step": 1520
},
{
"epoch": 2.194770928280163,
"grad_norm": 1.5038597370043303,
"learning_rate": 1.622342116965672e-06,
"loss": 0.2193,
"step": 1525
},
{
"epoch": 2.2019668985368193,
"grad_norm": 1.3456620640508148,
"learning_rate": 1.595449651442771e-06,
"loss": 0.1842,
"step": 1530
},
{
"epoch": 2.2091628687934755,
"grad_norm": 1.3647300470767014,
"learning_rate": 1.5687263061375595e-06,
"loss": 0.1752,
"step": 1535
},
{
"epoch": 2.216358839050132,
"grad_norm": 1.417987485184227,
"learning_rate": 1.5421739605939518e-06,
"loss": 0.1728,
"step": 1540
},
{
"epoch": 2.2235548093067883,
"grad_norm": 1.5887020304276804,
"learning_rate": 1.5157944823288672e-06,
"loss": 0.1637,
"step": 1545
},
{
"epoch": 2.2307507795634445,
"grad_norm": 1.3375708672110973,
"learning_rate": 1.4895897267008782e-06,
"loss": 0.1792,
"step": 1550
},
{
"epoch": 2.2379467498201007,
"grad_norm": 1.3565700525423485,
"learning_rate": 1.463561536779724e-06,
"loss": 0.1921,
"step": 1555
},
{
"epoch": 2.245142720076757,
"grad_norm": 1.5551856772129453,
"learning_rate": 1.4377117432166718e-06,
"loss": 0.1618,
"step": 1560
},
{
"epoch": 2.2523386903334135,
"grad_norm": 1.2100448164204372,
"learning_rate": 1.4120421641157662e-06,
"loss": 0.1928,
"step": 1565
},
{
"epoch": 2.2595346605900697,
"grad_norm": 1.438877153368831,
"learning_rate": 1.386554604905955e-06,
"loss": 0.1774,
"step": 1570
},
{
"epoch": 2.266730630846726,
"grad_norm": 1.2780217507242704,
"learning_rate": 1.3612508582141065e-06,
"loss": 0.1871,
"step": 1575
},
{
"epoch": 2.273926601103382,
"grad_norm": 1.3558845492725387,
"learning_rate": 1.3361327037389295e-06,
"loss": 0.2018,
"step": 1580
},
{
"epoch": 2.2811225713600383,
"grad_norm": 1.3490250928179355,
"learning_rate": 1.3112019081257986e-06,
"loss": 0.1731,
"step": 1585
},
{
"epoch": 2.2883185416166945,
"grad_norm": 1.2405141654870557,
"learning_rate": 1.2864602248425018e-06,
"loss": 0.1886,
"step": 1590
},
{
"epoch": 2.295514511873351,
"grad_norm": 1.2873724354006912,
"learning_rate": 1.2619093940559138e-06,
"loss": 0.1868,
"step": 1595
},
{
"epoch": 2.3027104821300073,
"grad_norm": 1.3107124153475105,
"learning_rate": 1.2375511425096013e-06,
"loss": 0.187,
"step": 1600
},
{
"epoch": 2.3099064523866635,
"grad_norm": 1.3468010137925535,
"learning_rate": 1.213387183402378e-06,
"loss": 0.1771,
"step": 1605
},
{
"epoch": 2.3171024226433197,
"grad_norm": 1.4179240822671797,
"learning_rate": 1.1894192162678086e-06,
"loss": 0.1654,
"step": 1610
},
{
"epoch": 2.324298392899976,
"grad_norm": 1.3848546480668056,
"learning_rate": 1.165648926854672e-06,
"loss": 0.1838,
"step": 1615
},
{
"epoch": 2.331494363156632,
"grad_norm": 1.5195023852589002,
"learning_rate": 1.1420779870084052e-06,
"loss": 0.1955,
"step": 1620
},
{
"epoch": 2.3386903334132887,
"grad_norm": 1.2410727385995408,
"learning_rate": 1.1187080545535064e-06,
"loss": 0.1685,
"step": 1625
},
{
"epoch": 2.345886303669945,
"grad_norm": 1.1237477417805415,
"learning_rate": 1.09554077317694e-06,
"loss": 0.1824,
"step": 1630
},
{
"epoch": 2.353082273926601,
"grad_norm": 1.2937342096954545,
"learning_rate": 1.0725777723125301e-06,
"loss": 0.1943,
"step": 1635
},
{
"epoch": 2.3602782441832573,
"grad_norm": 1.2828779926698606,
"learning_rate": 1.0498206670263567e-06,
"loss": 0.1832,
"step": 1640
},
{
"epoch": 2.3674742144399135,
"grad_norm": 1.253425033010922,
"learning_rate": 1.0272710579031616e-06,
"loss": 0.2044,
"step": 1645
},
{
"epoch": 2.37467018469657,
"grad_norm": 1.3678742472737333,
"learning_rate": 1.0049305309337758e-06,
"loss": 0.1672,
"step": 1650
},
{
"epoch": 2.3818661549532263,
"grad_norm": 1.5542727998398753,
"learning_rate": 9.82800657403569e-07,
"loss": 0.1955,
"step": 1655
},
{
"epoch": 2.3890621252098825,
"grad_norm": 1.4017624513152087,
"learning_rate": 9.60882993781937e-07,
"loss": 0.1733,
"step": 1660
},
{
"epoch": 2.3962580954665387,
"grad_norm": 1.199342554533447,
"learning_rate": 9.391790816128304e-07,
"loss": 0.1649,
"step": 1665
},
{
"epoch": 2.403454065723195,
"grad_norm": 1.2335679341459465,
"learning_rate": 9.176904474063319e-07,
"loss": 0.198,
"step": 1670
},
{
"epoch": 2.4106500359798515,
"grad_norm": 1.4585828923578052,
"learning_rate": 8.964186025312908e-07,
"loss": 0.1988,
"step": 1675
},
{
"epoch": 2.4178460062365077,
"grad_norm": 1.4518660782918198,
"learning_rate": 8.753650431090252e-07,
"loss": 0.1701,
"step": 1680
},
{
"epoch": 2.425041976493164,
"grad_norm": 1.3322728405275928,
"learning_rate": 8.545312499080922e-07,
"loss": 0.1729,
"step": 1685
},
{
"epoch": 2.43223794674982,
"grad_norm": 1.3067316057050342,
"learning_rate": 8.339186882401445e-07,
"loss": 0.1874,
"step": 1690
},
{
"epoch": 2.4394339170064763,
"grad_norm": 1.4177678336114292,
"learning_rate": 8.135288078568656e-07,
"loss": 0.2021,
"step": 1695
},
{
"epoch": 2.4466298872631325,
"grad_norm": 1.3121080863750958,
"learning_rate": 7.933630428480049e-07,
"loss": 0.1699,
"step": 1700
},
{
"epoch": 2.453825857519789,
"grad_norm": 1.3185780885959946,
"learning_rate": 7.734228115405161e-07,
"loss": 0.1624,
"step": 1705
},
{
"epoch": 2.4610218277764453,
"grad_norm": 1.33019533604804,
"learning_rate": 7.537095163987972e-07,
"loss": 0.1784,
"step": 1710
},
{
"epoch": 2.4682177980331015,
"grad_norm": 1.3853774517952444,
"learning_rate": 7.342245439260537e-07,
"loss": 0.1824,
"step": 1715
},
{
"epoch": 2.4754137682897577,
"grad_norm": 1.1804687459435843,
"learning_rate": 7.149692645667804e-07,
"loss": 0.1693,
"step": 1720
},
{
"epoch": 2.482609738546414,
"grad_norm": 1.250231314429457,
"learning_rate": 6.959450326103722e-07,
"loss": 0.2067,
"step": 1725
},
{
"epoch": 2.48980570880307,
"grad_norm": 1.3184620916504868,
"learning_rate": 6.771531860958726e-07,
"loss": 0.1557,
"step": 1730
},
{
"epoch": 2.4970016790597267,
"grad_norm": 1.3996911523285738,
"learning_rate": 6.585950467178656e-07,
"loss": 0.1984,
"step": 1735
},
{
"epoch": 2.504197649316383,
"grad_norm": 1.330732277956789,
"learning_rate": 6.402719197335181e-07,
"loss": 0.1656,
"step": 1740
},
{
"epoch": 2.511393619573039,
"grad_norm": 1.3782406997114114,
"learning_rate": 6.22185093870772e-07,
"loss": 0.1669,
"step": 1745
},
{
"epoch": 2.5185895898296953,
"grad_norm": 1.4431968846802443,
"learning_rate": 6.043358412377069e-07,
"loss": 0.1799,
"step": 1750
},
{
"epoch": 2.5257855600863515,
"grad_norm": 1.1865288276002492,
"learning_rate": 5.867254172330689e-07,
"loss": 0.1614,
"step": 1755
},
{
"epoch": 2.532981530343008,
"grad_norm": 1.3447844251083265,
"learning_rate": 5.693550604579722e-07,
"loss": 0.1761,
"step": 1760
},
{
"epoch": 2.5401775005996643,
"grad_norm": 1.312290863998097,
"learning_rate": 5.52225992628784e-07,
"loss": 0.175,
"step": 1765
},
{
"epoch": 2.5473734708563205,
"grad_norm": 1.325480546799902,
"learning_rate": 5.353394184912012e-07,
"loss": 0.1893,
"step": 1770
},
{
"epoch": 2.5545694411129767,
"grad_norm": 1.211006197074522,
"learning_rate": 5.186965257355092e-07,
"loss": 0.1738,
"step": 1775
},
{
"epoch": 2.561765411369633,
"grad_norm": 1.2613128106853304,
"learning_rate": 5.022984849130542e-07,
"loss": 0.1735,
"step": 1780
},
{
"epoch": 2.5689613816262895,
"grad_norm": 1.4240080375407917,
"learning_rate": 4.861464493539116e-07,
"loss": 0.209,
"step": 1785
},
{
"epoch": 2.5761573518829457,
"grad_norm": 1.212642870699417,
"learning_rate": 4.702415550857668e-07,
"loss": 0.1661,
"step": 1790
},
{
"epoch": 2.583353322139602,
"grad_norm": 1.19899124906289,
"learning_rate": 4.5458492075401845e-07,
"loss": 0.1871,
"step": 1795
},
{
"epoch": 2.590549292396258,
"grad_norm": 1.2451776201467897,
"learning_rate": 4.391776475430964e-07,
"loss": 0.1736,
"step": 1800
},
{
"epoch": 2.5977452626529143,
"grad_norm": 1.4217111682942414,
"learning_rate": 4.240208190990149e-07,
"loss": 0.1656,
"step": 1805
},
{
"epoch": 2.604941232909571,
"grad_norm": 1.154023125578338,
"learning_rate": 4.0911550145315356e-07,
"loss": 0.176,
"step": 1810
},
{
"epoch": 2.6121372031662267,
"grad_norm": 1.2517982852871838,
"learning_rate": 3.944627429472809e-07,
"loss": 0.168,
"step": 1815
},
{
"epoch": 2.6193331734228833,
"grad_norm": 1.3001175217867729,
"learning_rate": 3.8006357415981947e-07,
"loss": 0.1582,
"step": 1820
},
{
"epoch": 2.6265291436795395,
"grad_norm": 1.4179539106113206,
"learning_rate": 3.659190078333667e-07,
"loss": 0.1901,
"step": 1825
},
{
"epoch": 2.6337251139361957,
"grad_norm": 1.2865481274768071,
"learning_rate": 3.5203003880345786e-07,
"loss": 0.1825,
"step": 1830
},
{
"epoch": 2.640921084192852,
"grad_norm": 1.2107327771575902,
"learning_rate": 3.383976439286007e-07,
"loss": 0.178,
"step": 1835
},
{
"epoch": 2.648117054449508,
"grad_norm": 1.4930579520298934,
"learning_rate": 3.250227820215694e-07,
"loss": 0.1795,
"step": 1840
},
{
"epoch": 2.6553130247061647,
"grad_norm": 1.7580144453795274,
"learning_rate": 3.119063937819666e-07,
"loss": 0.1988,
"step": 1845
},
{
"epoch": 2.662508994962821,
"grad_norm": 1.389677232858989,
"learning_rate": 2.990494017300604e-07,
"loss": 0.189,
"step": 1850
},
{
"epoch": 2.669704965219477,
"grad_norm": 1.4778063736068945,
"learning_rate": 2.864527101419032e-07,
"loss": 0.2053,
"step": 1855
},
{
"epoch": 2.6769009354761333,
"grad_norm": 1.2577420076989798,
"learning_rate": 2.7411720498572744e-07,
"loss": 0.1917,
"step": 1860
},
{
"epoch": 2.6840969057327895,
"grad_norm": 1.5130645195940433,
"learning_rate": 2.6204375385963494e-07,
"loss": 0.161,
"step": 1865
},
{
"epoch": 2.691292875989446,
"grad_norm": 1.080302530956707,
"learning_rate": 2.502332059305745e-07,
"loss": 0.1752,
"step": 1870
},
{
"epoch": 2.6984888462461023,
"grad_norm": 1.306131392662643,
"learning_rate": 2.386863918746167e-07,
"loss": 0.1968,
"step": 1875
},
{
"epoch": 2.7056848165027585,
"grad_norm": 1.3461515984684975,
"learning_rate": 2.2740412381853223e-07,
"loss": 0.183,
"step": 1880
},
{
"epoch": 2.7128807867594147,
"grad_norm": 1.4486810753503954,
"learning_rate": 2.1638719528266835e-07,
"loss": 0.1938,
"step": 1885
},
{
"epoch": 2.720076757016071,
"grad_norm": 1.035281562927121,
"learning_rate": 2.0563638112514047e-07,
"loss": 0.1823,
"step": 1890
},
{
"epoch": 2.7272727272727275,
"grad_norm": 1.4407581503306328,
"learning_rate": 1.9515243748733455e-07,
"loss": 0.1648,
"step": 1895
},
{
"epoch": 2.7344686975293837,
"grad_norm": 1.2289916037617492,
"learning_rate": 1.8493610174072248e-07,
"loss": 0.1716,
"step": 1900
},
{
"epoch": 2.74166466778604,
"grad_norm": 1.1641895371111006,
"learning_rate": 1.7498809243500133e-07,
"loss": 0.1659,
"step": 1905
},
{
"epoch": 2.748860638042696,
"grad_norm": 1.2177853046541605,
"learning_rate": 1.6530910924755603e-07,
"loss": 0.1905,
"step": 1910
},
{
"epoch": 2.7560566082993523,
"grad_norm": 1.373432351937655,
"learning_rate": 1.5589983293424802e-07,
"loss": 0.1948,
"step": 1915
},
{
"epoch": 2.7632525785560085,
"grad_norm": 1.2223263222194338,
"learning_rate": 1.4676092528153495e-07,
"loss": 0.1635,
"step": 1920
},
{
"epoch": 2.7704485488126647,
"grad_norm": 1.323423227538223,
"learning_rate": 1.378930290599265e-07,
"loss": 0.1941,
"step": 1925
},
{
"epoch": 2.7776445190693213,
"grad_norm": 1.1747342118230812,
"learning_rate": 1.29296767978774e-07,
"loss": 0.1556,
"step": 1930
},
{
"epoch": 2.7848404893259775,
"grad_norm": 1.4551159106122102,
"learning_rate": 1.2097274664240486e-07,
"loss": 0.1778,
"step": 1935
},
{
"epoch": 2.7920364595826337,
"grad_norm": 1.2601624341009796,
"learning_rate": 1.1292155050759689e-07,
"loss": 0.183,
"step": 1940
},
{
"epoch": 2.79923242983929,
"grad_norm": 1.2755552581393579,
"learning_rate": 1.0514374584240338e-07,
"loss": 0.1623,
"step": 1945
},
{
"epoch": 2.806428400095946,
"grad_norm": 1.2967219351574655,
"learning_rate": 9.763987968632293e-08,
"loss": 0.1895,
"step": 1950
},
{
"epoch": 2.8136243703526027,
"grad_norm": 1.5216756650995487,
"learning_rate": 9.04104798118257e-08,
"loss": 0.18,
"step": 1955
},
{
"epoch": 2.820820340609259,
"grad_norm": 1.2463724400999108,
"learning_rate": 8.345605468723427e-08,
"loss": 0.1855,
"step": 1960
},
{
"epoch": 2.828016310865915,
"grad_norm": 1.3612982361619894,
"learning_rate": 7.677709344095883e-08,
"loss": 0.1971,
"step": 1965
},
{
"epoch": 2.8352122811225713,
"grad_norm": 1.1485240309542117,
"learning_rate": 7.037406582709815e-08,
"loss": 0.1673,
"step": 1970
},
{
"epoch": 2.8424082513792275,
"grad_norm": 1.129849173373603,
"learning_rate": 6.424742219239698e-08,
"loss": 0.1688,
"step": 1975
},
{
"epoch": 2.849604221635884,
"grad_norm": 1.271858319859489,
"learning_rate": 5.839759344457462e-08,
"loss": 0.1864,
"step": 1980
},
{
"epoch": 2.8568001918925403,
"grad_norm": 1.3695271997638596,
"learning_rate": 5.282499102201532e-08,
"loss": 0.182,
"step": 1985
},
{
"epoch": 2.8639961621491965,
"grad_norm": 1.2817216551660306,
"learning_rate": 4.753000686483189e-08,
"loss": 0.191,
"step": 1990
},
{
"epoch": 2.8711921324058527,
"grad_norm": 1.2963119208915668,
"learning_rate": 4.2513013387298846e-08,
"loss": 0.1877,
"step": 1995
},
{
"epoch": 2.878388102662509,
"grad_norm": 1.290101189771667,
"learning_rate": 3.7774363451658744e-08,
"loss": 0.1796,
"step": 2000
},
{
"epoch": 2.8855840729191655,
"grad_norm": 1.4819813983719878,
"learning_rate": 3.331439034330552e-08,
"loss": 0.1763,
"step": 2005
},
{
"epoch": 2.8927800431758213,
"grad_norm": 1.4019557468116053,
"learning_rate": 2.913340774734152e-08,
"loss": 0.1708,
"step": 2010
},
{
"epoch": 2.899976013432478,
"grad_norm": 1.2339439577357936,
"learning_rate": 2.5231709726516005e-08,
"loss": 0.1789,
"step": 2015
},
{
"epoch": 2.907171983689134,
"grad_norm": 1.2294457543053794,
"learning_rate": 2.1609570700543478e-08,
"loss": 0.1575,
"step": 2020
},
{
"epoch": 2.9143679539457903,
"grad_norm": 1.6910934029285385,
"learning_rate": 1.826724542680047e-08,
"loss": 0.1853,
"step": 2025
},
{
"epoch": 2.9215639242024465,
"grad_norm": 1.5951062337979125,
"learning_rate": 1.5204968982410527e-08,
"loss": 0.1994,
"step": 2030
},
{
"epoch": 2.9287598944591027,
"grad_norm": 1.244634963059678,
"learning_rate": 1.2422956747708546e-08,
"loss": 0.1792,
"step": 2035
},
{
"epoch": 2.9359558647157593,
"grad_norm": 1.3731755256138354,
"learning_rate": 9.92140439109157e-09,
"loss": 0.1855,
"step": 2040
},
{
"epoch": 2.9431518349724155,
"grad_norm": 1.487158498377204,
"learning_rate": 7.700487855260007e-09,
"loss": 0.1713,
"step": 2045
},
{
"epoch": 2.9503478052290717,
"grad_norm": 1.195702473189198,
"learning_rate": 5.760363344839536e-09,
"loss": 0.1756,
"step": 2050
},
{
"epoch": 2.957543775485728,
"grad_norm": 1.2219249635030713,
"learning_rate": 4.101167315396559e-09,
"loss": 0.1705,
"step": 2055
},
{
"epoch": 2.964739745742384,
"grad_norm": 1.1937579061988086,
"learning_rate": 2.7230164638401e-09,
"loss": 0.1669,
"step": 2060
},
{
"epoch": 2.9719357159990407,
"grad_norm": 1.0911166566357549,
"learning_rate": 1.626007720214595e-09,
"loss": 0.1556,
"step": 2065
},
{
"epoch": 2.979131686255697,
"grad_norm": 1.155309788494306,
"learning_rate": 8.102182408822322e-10,
"loss": 0.1572,
"step": 2070
},
{
"epoch": 2.986327656512353,
"grad_norm": 1.1809384871247852,
"learning_rate": 2.7570540309618253e-10,
"loss": 0.1651,
"step": 2075
},
{
"epoch": 2.9935236267690093,
"grad_norm": 2.41813981269854,
"learning_rate": 2.2506800965604867e-11,
"loss": 0.1452,
"step": 2080
},
{
"epoch": 2.9964020148716717,
"step": 2082,
"total_flos": 5.073775214995702e+17,
"train_loss": 0.5172660127031643,
"train_runtime": 62603.1223,
"train_samples_per_second": 3.196,
"train_steps_per_second": 0.033
}
],
"logging_steps": 5,
"max_steps": 2082,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 700,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.073775214995702e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}