ErrorAI's picture
Training in progress, step 1325, checkpoint
6892ef7 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.000566358316028,
"eval_steps": 500,
"global_step": 1325,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007551444213705871,
"grad_norm": 0.059659671038389206,
"learning_rate": 2e-05,
"loss": 0.4492,
"step": 1
},
{
"epoch": 0.0015102888427411742,
"grad_norm": 0.0642104372382164,
"learning_rate": 4e-05,
"loss": 0.4602,
"step": 2
},
{
"epoch": 0.0022654332641117614,
"grad_norm": 0.06154001131653786,
"learning_rate": 6e-05,
"loss": 0.488,
"step": 3
},
{
"epoch": 0.0030205776854823484,
"grad_norm": 0.07486463338136673,
"learning_rate": 8e-05,
"loss": 0.5672,
"step": 4
},
{
"epoch": 0.003775722106852936,
"grad_norm": 0.08084491640329361,
"learning_rate": 0.0001,
"loss": 0.525,
"step": 5
},
{
"epoch": 0.004530866528223523,
"grad_norm": 0.09173104912042618,
"learning_rate": 9.999985839072915e-05,
"loss": 0.523,
"step": 6
},
{
"epoch": 0.00528601094959411,
"grad_norm": 0.10932840406894684,
"learning_rate": 9.999943356371866e-05,
"loss": 0.4872,
"step": 7
},
{
"epoch": 0.006041155370964697,
"grad_norm": 0.14193305373191833,
"learning_rate": 9.999872552137497e-05,
"loss": 0.5677,
"step": 8
},
{
"epoch": 0.006796299792335284,
"grad_norm": 0.15595699846744537,
"learning_rate": 9.999773426770865e-05,
"loss": 0.6329,
"step": 9
},
{
"epoch": 0.007551444213705872,
"grad_norm": 0.157088503241539,
"learning_rate": 9.999645980833454e-05,
"loss": 0.5819,
"step": 10
},
{
"epoch": 0.008306588635076459,
"grad_norm": 0.12618108093738556,
"learning_rate": 9.999490215047167e-05,
"loss": 0.5207,
"step": 11
},
{
"epoch": 0.009061733056447046,
"grad_norm": 0.15058279037475586,
"learning_rate": 9.999306130294317e-05,
"loss": 0.5821,
"step": 12
},
{
"epoch": 0.009816877477817633,
"grad_norm": 0.12929701805114746,
"learning_rate": 9.99909372761763e-05,
"loss": 0.5065,
"step": 13
},
{
"epoch": 0.01057202189918822,
"grad_norm": 0.12497358024120331,
"learning_rate": 9.99885300822023e-05,
"loss": 0.524,
"step": 14
},
{
"epoch": 0.011327166320558807,
"grad_norm": 0.12322711199522018,
"learning_rate": 9.998583973465646e-05,
"loss": 0.605,
"step": 15
},
{
"epoch": 0.012082310741929394,
"grad_norm": 0.11271341890096664,
"learning_rate": 9.998286624877786e-05,
"loss": 0.5465,
"step": 16
},
{
"epoch": 0.01283745516329998,
"grad_norm": 0.11549581587314606,
"learning_rate": 9.997960964140947e-05,
"loss": 0.5595,
"step": 17
},
{
"epoch": 0.013592599584670568,
"grad_norm": 0.12339697778224945,
"learning_rate": 9.997606993099789e-05,
"loss": 0.4647,
"step": 18
},
{
"epoch": 0.014347744006041155,
"grad_norm": 0.13940560817718506,
"learning_rate": 9.997224713759335e-05,
"loss": 0.5441,
"step": 19
},
{
"epoch": 0.015102888427411743,
"grad_norm": 0.12651537358760834,
"learning_rate": 9.99681412828496e-05,
"loss": 0.5482,
"step": 20
},
{
"epoch": 0.01585803284878233,
"grad_norm": 0.13802285492420197,
"learning_rate": 9.996375239002369e-05,
"loss": 0.4715,
"step": 21
},
{
"epoch": 0.016613177270152917,
"grad_norm": 0.13440218567848206,
"learning_rate": 9.995908048397595e-05,
"loss": 0.5641,
"step": 22
},
{
"epoch": 0.017368321691523504,
"grad_norm": 0.139227494597435,
"learning_rate": 9.995412559116979e-05,
"loss": 0.4398,
"step": 23
},
{
"epoch": 0.01812346611289409,
"grad_norm": 0.1373261958360672,
"learning_rate": 9.994888773967157e-05,
"loss": 0.5384,
"step": 24
},
{
"epoch": 0.018878610534264678,
"grad_norm": 0.14462506771087646,
"learning_rate": 9.99433669591504e-05,
"loss": 0.5708,
"step": 25
},
{
"epoch": 0.019633754955635265,
"grad_norm": 0.14911501109600067,
"learning_rate": 9.993756328087805e-05,
"loss": 0.5851,
"step": 26
},
{
"epoch": 0.020388899377005852,
"grad_norm": 0.13722750544548035,
"learning_rate": 9.99314767377287e-05,
"loss": 0.488,
"step": 27
},
{
"epoch": 0.02114404379837644,
"grad_norm": 0.1530655175447464,
"learning_rate": 9.992510736417878e-05,
"loss": 0.5364,
"step": 28
},
{
"epoch": 0.021899188219747026,
"grad_norm": 0.14158278703689575,
"learning_rate": 9.991845519630678e-05,
"loss": 0.4066,
"step": 29
},
{
"epoch": 0.022654332641117613,
"grad_norm": 0.14002585411071777,
"learning_rate": 9.991152027179307e-05,
"loss": 0.4307,
"step": 30
},
{
"epoch": 0.0234094770624882,
"grad_norm": 0.15578630566596985,
"learning_rate": 9.990430262991962e-05,
"loss": 0.4661,
"step": 31
},
{
"epoch": 0.024164621483858787,
"grad_norm": 0.16023118793964386,
"learning_rate": 9.989680231156981e-05,
"loss": 0.4454,
"step": 32
},
{
"epoch": 0.024919765905229374,
"grad_norm": 0.17581893503665924,
"learning_rate": 9.988901935922826e-05,
"loss": 0.5209,
"step": 33
},
{
"epoch": 0.02567491032659996,
"grad_norm": 0.1649864912033081,
"learning_rate": 9.988095381698048e-05,
"loss": 0.5037,
"step": 34
},
{
"epoch": 0.026430054747970548,
"grad_norm": 0.18106521666049957,
"learning_rate": 9.987260573051269e-05,
"loss": 0.4649,
"step": 35
},
{
"epoch": 0.027185199169341135,
"grad_norm": 0.1807573437690735,
"learning_rate": 9.986397514711154e-05,
"loss": 0.4987,
"step": 36
},
{
"epoch": 0.027940343590711722,
"grad_norm": 0.392092764377594,
"learning_rate": 9.985506211566388e-05,
"loss": 0.526,
"step": 37
},
{
"epoch": 0.02869548801208231,
"grad_norm": 0.17764483392238617,
"learning_rate": 9.98458666866564e-05,
"loss": 0.4262,
"step": 38
},
{
"epoch": 0.0294506324334529,
"grad_norm": 0.20041850209236145,
"learning_rate": 9.983638891217544e-05,
"loss": 0.5239,
"step": 39
},
{
"epoch": 0.030205776854823486,
"grad_norm": 0.19753199815750122,
"learning_rate": 9.982662884590662e-05,
"loss": 0.4918,
"step": 40
},
{
"epoch": 0.030960921276194073,
"grad_norm": 0.19215907156467438,
"learning_rate": 9.981658654313457e-05,
"loss": 0.4474,
"step": 41
},
{
"epoch": 0.03171606569756466,
"grad_norm": 0.19931669533252716,
"learning_rate": 9.980626206074263e-05,
"loss": 0.4531,
"step": 42
},
{
"epoch": 0.032471210118935244,
"grad_norm": 0.20595461130142212,
"learning_rate": 9.979565545721248e-05,
"loss": 0.4495,
"step": 43
},
{
"epoch": 0.033226354540305834,
"grad_norm": 0.20857509970664978,
"learning_rate": 9.978476679262387e-05,
"loss": 0.4331,
"step": 44
},
{
"epoch": 0.03398149896167642,
"grad_norm": 0.21194373071193695,
"learning_rate": 9.977359612865423e-05,
"loss": 0.3721,
"step": 45
},
{
"epoch": 0.03473664338304701,
"grad_norm": 0.21357247233390808,
"learning_rate": 9.976214352857834e-05,
"loss": 0.4586,
"step": 46
},
{
"epoch": 0.03549178780441759,
"grad_norm": 0.23830629885196686,
"learning_rate": 9.975040905726798e-05,
"loss": 0.4416,
"step": 47
},
{
"epoch": 0.03624693222578818,
"grad_norm": 0.2576565444469452,
"learning_rate": 9.973839278119155e-05,
"loss": 0.4685,
"step": 48
},
{
"epoch": 0.037002076647158766,
"grad_norm": 0.5738092660903931,
"learning_rate": 9.972609476841367e-05,
"loss": 0.3931,
"step": 49
},
{
"epoch": 0.037757221068529356,
"grad_norm": 0.31976327300071716,
"learning_rate": 9.971351508859488e-05,
"loss": 0.419,
"step": 50
},
{
"epoch": 0.03851236548989995,
"grad_norm": 0.11089778691530228,
"learning_rate": 9.970065381299112e-05,
"loss": 0.3417,
"step": 51
},
{
"epoch": 0.03926750991127053,
"grad_norm": 0.13030797243118286,
"learning_rate": 9.968751101445343e-05,
"loss": 0.3921,
"step": 52
},
{
"epoch": 0.04002265433264112,
"grad_norm": 0.12426062673330307,
"learning_rate": 9.967408676742751e-05,
"loss": 0.3519,
"step": 53
},
{
"epoch": 0.040777798754011704,
"grad_norm": 0.13413658738136292,
"learning_rate": 9.966038114795328e-05,
"loss": 0.4093,
"step": 54
},
{
"epoch": 0.041532943175382295,
"grad_norm": 0.1284988820552826,
"learning_rate": 9.964639423366442e-05,
"loss": 0.4002,
"step": 55
},
{
"epoch": 0.04228808759675288,
"grad_norm": 0.11679685115814209,
"learning_rate": 9.963212610378803e-05,
"loss": 0.4191,
"step": 56
},
{
"epoch": 0.04304323201812347,
"grad_norm": 0.12084402143955231,
"learning_rate": 9.961757683914406e-05,
"loss": 0.3768,
"step": 57
},
{
"epoch": 0.04379837643949405,
"grad_norm": 0.11981435120105743,
"learning_rate": 9.960274652214496e-05,
"loss": 0.4396,
"step": 58
},
{
"epoch": 0.04455352086086464,
"grad_norm": 0.1252336949110031,
"learning_rate": 9.958763523679514e-05,
"loss": 0.4526,
"step": 59
},
{
"epoch": 0.045308665282235226,
"grad_norm": 0.12791119515895844,
"learning_rate": 9.957224306869053e-05,
"loss": 0.4877,
"step": 60
},
{
"epoch": 0.04606380970360582,
"grad_norm": 0.12734884023666382,
"learning_rate": 9.955657010501806e-05,
"loss": 0.4608,
"step": 61
},
{
"epoch": 0.0468189541249764,
"grad_norm": 0.12033625692129135,
"learning_rate": 9.954061643455523e-05,
"loss": 0.4842,
"step": 62
},
{
"epoch": 0.04757409854634699,
"grad_norm": 0.11826111376285553,
"learning_rate": 9.952438214766955e-05,
"loss": 0.4132,
"step": 63
},
{
"epoch": 0.048329242967717574,
"grad_norm": 0.12830643355846405,
"learning_rate": 9.950786733631801e-05,
"loss": 0.4733,
"step": 64
},
{
"epoch": 0.049084387389088165,
"grad_norm": 0.13267682492733002,
"learning_rate": 9.949107209404665e-05,
"loss": 0.4303,
"step": 65
},
{
"epoch": 0.04983953181045875,
"grad_norm": 0.12670353055000305,
"learning_rate": 9.947399651598993e-05,
"loss": 0.5202,
"step": 66
},
{
"epoch": 0.05059467623182934,
"grad_norm": 0.15055082738399506,
"learning_rate": 9.945664069887028e-05,
"loss": 0.4475,
"step": 67
},
{
"epoch": 0.05134982065319992,
"grad_norm": 0.13549265265464783,
"learning_rate": 9.943900474099748e-05,
"loss": 0.5083,
"step": 68
},
{
"epoch": 0.05210496507457051,
"grad_norm": 0.12346430122852325,
"learning_rate": 9.942108874226811e-05,
"loss": 0.3601,
"step": 69
},
{
"epoch": 0.052860109495941096,
"grad_norm": 0.14251157641410828,
"learning_rate": 9.940289280416508e-05,
"loss": 0.4827,
"step": 70
},
{
"epoch": 0.05361525391731169,
"grad_norm": 0.13678328692913055,
"learning_rate": 9.938441702975689e-05,
"loss": 0.4081,
"step": 71
},
{
"epoch": 0.05437039833868227,
"grad_norm": 0.13870520889759064,
"learning_rate": 9.93656615236972e-05,
"loss": 0.442,
"step": 72
},
{
"epoch": 0.05512554276005286,
"grad_norm": 0.13190938532352448,
"learning_rate": 9.934662639222412e-05,
"loss": 0.409,
"step": 73
},
{
"epoch": 0.055880687181423444,
"grad_norm": 0.1491832584142685,
"learning_rate": 9.932731174315972e-05,
"loss": 0.5432,
"step": 74
},
{
"epoch": 0.056635831602794035,
"grad_norm": 0.14117342233657837,
"learning_rate": 9.930771768590933e-05,
"loss": 0.436,
"step": 75
},
{
"epoch": 0.05739097602416462,
"grad_norm": 0.15490145981311798,
"learning_rate": 9.928784433146096e-05,
"loss": 0.4399,
"step": 76
},
{
"epoch": 0.05814612044553521,
"grad_norm": 0.15420539677143097,
"learning_rate": 9.926769179238466e-05,
"loss": 0.4223,
"step": 77
},
{
"epoch": 0.0589012648669058,
"grad_norm": 0.15358476340770721,
"learning_rate": 9.924726018283187e-05,
"loss": 0.4702,
"step": 78
},
{
"epoch": 0.05965640928827638,
"grad_norm": 0.15698279440402985,
"learning_rate": 9.922654961853481e-05,
"loss": 0.4728,
"step": 79
},
{
"epoch": 0.06041155370964697,
"grad_norm": 0.16668903827667236,
"learning_rate": 9.92055602168058e-05,
"loss": 0.4654,
"step": 80
},
{
"epoch": 0.061166698131017556,
"grad_norm": 0.16828201711177826,
"learning_rate": 9.918429209653662e-05,
"loss": 0.4588,
"step": 81
},
{
"epoch": 0.06192184255238815,
"grad_norm": 0.1675289273262024,
"learning_rate": 9.916274537819775e-05,
"loss": 0.489,
"step": 82
},
{
"epoch": 0.06267698697375873,
"grad_norm": 0.19768594205379486,
"learning_rate": 9.914092018383778e-05,
"loss": 0.4707,
"step": 83
},
{
"epoch": 0.06343213139512932,
"grad_norm": 0.17943724989891052,
"learning_rate": 9.911881663708275e-05,
"loss": 0.458,
"step": 84
},
{
"epoch": 0.06418727581649991,
"grad_norm": 0.18578174710273743,
"learning_rate": 9.909643486313533e-05,
"loss": 0.403,
"step": 85
},
{
"epoch": 0.06494242023787049,
"grad_norm": 0.1998097449541092,
"learning_rate": 9.90737749887742e-05,
"loss": 0.5154,
"step": 86
},
{
"epoch": 0.06569756465924108,
"grad_norm": 0.18927428126335144,
"learning_rate": 9.905083714235326e-05,
"loss": 0.4682,
"step": 87
},
{
"epoch": 0.06645270908061167,
"grad_norm": 0.18993408977985382,
"learning_rate": 9.9027621453801e-05,
"loss": 0.4373,
"step": 88
},
{
"epoch": 0.06720785350198226,
"grad_norm": 0.18295501172542572,
"learning_rate": 9.900412805461967e-05,
"loss": 0.4163,
"step": 89
},
{
"epoch": 0.06796299792335284,
"grad_norm": 0.20295414328575134,
"learning_rate": 9.898035707788463e-05,
"loss": 0.4816,
"step": 90
},
{
"epoch": 0.06871814234472343,
"grad_norm": 0.21273529529571533,
"learning_rate": 9.895630865824347e-05,
"loss": 0.4363,
"step": 91
},
{
"epoch": 0.06947328676609402,
"grad_norm": 0.21219299733638763,
"learning_rate": 9.893198293191538e-05,
"loss": 0.5055,
"step": 92
},
{
"epoch": 0.07022843118746461,
"grad_norm": 0.2346925586462021,
"learning_rate": 9.890738003669029e-05,
"loss": 0.5144,
"step": 93
},
{
"epoch": 0.07098357560883518,
"grad_norm": 0.2355310320854187,
"learning_rate": 9.888250011192811e-05,
"loss": 0.4826,
"step": 94
},
{
"epoch": 0.07173872003020577,
"grad_norm": 0.23005840182304382,
"learning_rate": 9.885734329855798e-05,
"loss": 0.4764,
"step": 95
},
{
"epoch": 0.07249386445157636,
"grad_norm": 0.2246306836605072,
"learning_rate": 9.883190973907741e-05,
"loss": 0.3891,
"step": 96
},
{
"epoch": 0.07324900887294696,
"grad_norm": 0.28144529461860657,
"learning_rate": 9.880619957755151e-05,
"loss": 0.5174,
"step": 97
},
{
"epoch": 0.07400415329431753,
"grad_norm": 0.27972379326820374,
"learning_rate": 9.878021295961217e-05,
"loss": 0.4433,
"step": 98
},
{
"epoch": 0.07475929771568812,
"grad_norm": 0.3294559717178345,
"learning_rate": 9.875395003245724e-05,
"loss": 0.442,
"step": 99
},
{
"epoch": 0.07551444213705871,
"grad_norm": 0.3774302303791046,
"learning_rate": 9.872741094484965e-05,
"loss": 0.5358,
"step": 100
},
{
"epoch": 0.0762695865584293,
"grad_norm": 0.12596380710601807,
"learning_rate": 9.870059584711668e-05,
"loss": 0.3584,
"step": 101
},
{
"epoch": 0.0770247309797999,
"grad_norm": 0.1355014592409134,
"learning_rate": 9.867350489114894e-05,
"loss": 0.4008,
"step": 102
},
{
"epoch": 0.07777987540117047,
"grad_norm": 0.14024491608142853,
"learning_rate": 9.864613823039969e-05,
"loss": 0.3752,
"step": 103
},
{
"epoch": 0.07853501982254106,
"grad_norm": 0.13403770327568054,
"learning_rate": 9.861849601988383e-05,
"loss": 0.38,
"step": 104
},
{
"epoch": 0.07929016424391165,
"grad_norm": 0.1362624615430832,
"learning_rate": 9.859057841617709e-05,
"loss": 0.3889,
"step": 105
},
{
"epoch": 0.08004530866528224,
"grad_norm": 0.137498140335083,
"learning_rate": 9.856238557741513e-05,
"loss": 0.4215,
"step": 106
},
{
"epoch": 0.08080045308665282,
"grad_norm": 0.1259029656648636,
"learning_rate": 9.853391766329263e-05,
"loss": 0.3831,
"step": 107
},
{
"epoch": 0.08155559750802341,
"grad_norm": 0.12306073307991028,
"learning_rate": 9.850517483506244e-05,
"loss": 0.3843,
"step": 108
},
{
"epoch": 0.082310741929394,
"grad_norm": 0.12481655925512314,
"learning_rate": 9.847615725553456e-05,
"loss": 0.4206,
"step": 109
},
{
"epoch": 0.08306588635076459,
"grad_norm": 0.11769527196884155,
"learning_rate": 9.844686508907537e-05,
"loss": 0.3633,
"step": 110
},
{
"epoch": 0.08382103077213517,
"grad_norm": 0.12572641670703888,
"learning_rate": 9.841729850160652e-05,
"loss": 0.4113,
"step": 111
},
{
"epoch": 0.08457617519350576,
"grad_norm": 0.12385623157024384,
"learning_rate": 9.838745766060416e-05,
"loss": 0.3894,
"step": 112
},
{
"epoch": 0.08533131961487635,
"grad_norm": 0.13819383084774017,
"learning_rate": 9.835734273509786e-05,
"loss": 0.4989,
"step": 113
},
{
"epoch": 0.08608646403624694,
"grad_norm": 0.13039085268974304,
"learning_rate": 9.832695389566972e-05,
"loss": 0.4502,
"step": 114
},
{
"epoch": 0.08684160845761751,
"grad_norm": 0.14915941655635834,
"learning_rate": 9.829629131445342e-05,
"loss": 0.4674,
"step": 115
},
{
"epoch": 0.0875967528789881,
"grad_norm": 0.13414114713668823,
"learning_rate": 9.826535516513317e-05,
"loss": 0.4914,
"step": 116
},
{
"epoch": 0.0883518973003587,
"grad_norm": 0.13531801104545593,
"learning_rate": 9.82341456229428e-05,
"loss": 0.4764,
"step": 117
},
{
"epoch": 0.08910704172172929,
"grad_norm": 0.1374198943376541,
"learning_rate": 9.820266286466471e-05,
"loss": 0.4299,
"step": 118
},
{
"epoch": 0.08986218614309986,
"grad_norm": 0.14264227449893951,
"learning_rate": 9.817090706862895e-05,
"loss": 0.4164,
"step": 119
},
{
"epoch": 0.09061733056447045,
"grad_norm": 0.14601197838783264,
"learning_rate": 9.81388784147121e-05,
"loss": 0.4273,
"step": 120
},
{
"epoch": 0.09137247498584104,
"grad_norm": 0.1535925269126892,
"learning_rate": 9.810657708433637e-05,
"loss": 0.4947,
"step": 121
},
{
"epoch": 0.09212761940721163,
"grad_norm": 0.1509617269039154,
"learning_rate": 9.807400326046843e-05,
"loss": 0.3626,
"step": 122
},
{
"epoch": 0.09288276382858221,
"grad_norm": 0.15619726479053497,
"learning_rate": 9.804115712761851e-05,
"loss": 0.4597,
"step": 123
},
{
"epoch": 0.0936379082499528,
"grad_norm": 0.1454990804195404,
"learning_rate": 9.80080388718393e-05,
"loss": 0.3858,
"step": 124
},
{
"epoch": 0.09439305267132339,
"grad_norm": 0.1631968766450882,
"learning_rate": 9.797464868072488e-05,
"loss": 0.435,
"step": 125
},
{
"epoch": 0.09514819709269398,
"grad_norm": 0.15620705485343933,
"learning_rate": 9.794098674340965e-05,
"loss": 0.4259,
"step": 126
},
{
"epoch": 0.09590334151406457,
"grad_norm": 0.16626444458961487,
"learning_rate": 9.790705325056735e-05,
"loss": 0.4253,
"step": 127
},
{
"epoch": 0.09665848593543515,
"grad_norm": 0.16531290113925934,
"learning_rate": 9.787284839440982e-05,
"loss": 0.4284,
"step": 128
},
{
"epoch": 0.09741363035680574,
"grad_norm": 0.15550102293491364,
"learning_rate": 9.783837236868609e-05,
"loss": 0.3745,
"step": 129
},
{
"epoch": 0.09816877477817633,
"grad_norm": 0.17284280061721802,
"learning_rate": 9.780362536868113e-05,
"loss": 0.4458,
"step": 130
},
{
"epoch": 0.09892391919954692,
"grad_norm": 0.16307754814624786,
"learning_rate": 9.776860759121484e-05,
"loss": 0.4001,
"step": 131
},
{
"epoch": 0.0996790636209175,
"grad_norm": 0.1765722781419754,
"learning_rate": 9.77333192346409e-05,
"loss": 0.4215,
"step": 132
},
{
"epoch": 0.10043420804228809,
"grad_norm": 0.1748800426721573,
"learning_rate": 9.769776049884563e-05,
"loss": 0.4215,
"step": 133
},
{
"epoch": 0.10118935246365868,
"grad_norm": 0.18451336026191711,
"learning_rate": 9.766193158524692e-05,
"loss": 0.4214,
"step": 134
},
{
"epoch": 0.10194449688502927,
"grad_norm": 0.19954369962215424,
"learning_rate": 9.762583269679303e-05,
"loss": 0.4085,
"step": 135
},
{
"epoch": 0.10269964130639984,
"grad_norm": 0.2019212394952774,
"learning_rate": 9.758946403796143e-05,
"loss": 0.3836,
"step": 136
},
{
"epoch": 0.10345478572777043,
"grad_norm": 0.20586973428726196,
"learning_rate": 9.755282581475769e-05,
"loss": 0.4231,
"step": 137
},
{
"epoch": 0.10420993014914103,
"grad_norm": 0.2090785652399063,
"learning_rate": 9.751591823471429e-05,
"loss": 0.4385,
"step": 138
},
{
"epoch": 0.10496507457051162,
"grad_norm": 0.21643619239330292,
"learning_rate": 9.747874150688948e-05,
"loss": 0.4758,
"step": 139
},
{
"epoch": 0.10572021899188219,
"grad_norm": 0.19914501905441284,
"learning_rate": 9.744129584186598e-05,
"loss": 0.3888,
"step": 140
},
{
"epoch": 0.10647536341325278,
"grad_norm": 0.23445309698581696,
"learning_rate": 9.740358145174998e-05,
"loss": 0.4988,
"step": 141
},
{
"epoch": 0.10723050783462337,
"grad_norm": 0.21564048528671265,
"learning_rate": 9.736559855016973e-05,
"loss": 0.4387,
"step": 142
},
{
"epoch": 0.10798565225599396,
"grad_norm": 0.21970051527023315,
"learning_rate": 9.73273473522745e-05,
"loss": 0.3984,
"step": 143
},
{
"epoch": 0.10874079667736454,
"grad_norm": 0.24472594261169434,
"learning_rate": 9.728882807473324e-05,
"loss": 0.378,
"step": 144
},
{
"epoch": 0.10949594109873513,
"grad_norm": 0.24230483174324036,
"learning_rate": 9.725004093573342e-05,
"loss": 0.4024,
"step": 145
},
{
"epoch": 0.11025108552010572,
"grad_norm": 0.24002547562122345,
"learning_rate": 9.72109861549798e-05,
"loss": 0.3738,
"step": 146
},
{
"epoch": 0.11100622994147631,
"grad_norm": 0.2504161298274994,
"learning_rate": 9.717166395369313e-05,
"loss": 0.3868,
"step": 147
},
{
"epoch": 0.11176137436284689,
"grad_norm": 0.2596012055873871,
"learning_rate": 9.713207455460894e-05,
"loss": 0.4068,
"step": 148
},
{
"epoch": 0.11251651878421748,
"grad_norm": 0.3224775493144989,
"learning_rate": 9.709221818197624e-05,
"loss": 0.4875,
"step": 149
},
{
"epoch": 0.11327166320558807,
"grad_norm": 0.40476924180984497,
"learning_rate": 9.705209506155634e-05,
"loss": 0.4832,
"step": 150
},
{
"epoch": 0.11402680762695866,
"grad_norm": 0.11512809246778488,
"learning_rate": 9.701170542062148e-05,
"loss": 0.3255,
"step": 151
},
{
"epoch": 0.11478195204832924,
"grad_norm": 0.13325871527194977,
"learning_rate": 9.697104948795352e-05,
"loss": 0.3707,
"step": 152
},
{
"epoch": 0.11553709646969983,
"grad_norm": 0.1390884816646576,
"learning_rate": 9.693012749384279e-05,
"loss": 0.3659,
"step": 153
},
{
"epoch": 0.11629224089107042,
"grad_norm": 0.13481466472148895,
"learning_rate": 9.688893967008661e-05,
"loss": 0.3847,
"step": 154
},
{
"epoch": 0.11704738531244101,
"grad_norm": 0.14832735061645508,
"learning_rate": 9.68474862499881e-05,
"loss": 0.457,
"step": 155
},
{
"epoch": 0.1178025297338116,
"grad_norm": 0.13634005188941956,
"learning_rate": 9.68057674683548e-05,
"loss": 0.4172,
"step": 156
},
{
"epoch": 0.11855767415518217,
"grad_norm": 0.1387951821088791,
"learning_rate": 9.676378356149734e-05,
"loss": 0.4252,
"step": 157
},
{
"epoch": 0.11931281857655276,
"grad_norm": 0.1387217789888382,
"learning_rate": 9.672153476722816e-05,
"loss": 0.3928,
"step": 158
},
{
"epoch": 0.12006796299792336,
"grad_norm": 0.14391182363033295,
"learning_rate": 9.667902132486009e-05,
"loss": 0.4545,
"step": 159
},
{
"epoch": 0.12082310741929395,
"grad_norm": 0.1416245996952057,
"learning_rate": 9.663624347520505e-05,
"loss": 0.4543,
"step": 160
},
{
"epoch": 0.12157825184066452,
"grad_norm": 0.13763579726219177,
"learning_rate": 9.659320146057262e-05,
"loss": 0.3957,
"step": 161
},
{
"epoch": 0.12233339626203511,
"grad_norm": 0.14178583025932312,
"learning_rate": 9.654989552476875e-05,
"loss": 0.448,
"step": 162
},
{
"epoch": 0.1230885406834057,
"grad_norm": 0.13738113641738892,
"learning_rate": 9.650632591309431e-05,
"loss": 0.3954,
"step": 163
},
{
"epoch": 0.1238436851047763,
"grad_norm": 0.14869531989097595,
"learning_rate": 9.646249287234374e-05,
"loss": 0.4405,
"step": 164
},
{
"epoch": 0.12459882952614687,
"grad_norm": 0.14332245290279388,
"learning_rate": 9.641839665080363e-05,
"loss": 0.4543,
"step": 165
},
{
"epoch": 0.12535397394751746,
"grad_norm": 0.14617004990577698,
"learning_rate": 9.637403749825135e-05,
"loss": 0.397,
"step": 166
},
{
"epoch": 0.12610911836888805,
"grad_norm": 0.16142374277114868,
"learning_rate": 9.632941566595357e-05,
"loss": 0.5283,
"step": 167
},
{
"epoch": 0.12686426279025864,
"grad_norm": 0.15721464157104492,
"learning_rate": 9.628453140666492e-05,
"loss": 0.433,
"step": 168
},
{
"epoch": 0.12761940721162923,
"grad_norm": 0.1646830439567566,
"learning_rate": 9.623938497462646e-05,
"loss": 0.5024,
"step": 169
},
{
"epoch": 0.12837455163299982,
"grad_norm": 0.14574110507965088,
"learning_rate": 9.619397662556435e-05,
"loss": 0.4194,
"step": 170
},
{
"epoch": 0.12912969605437039,
"grad_norm": 0.15734295547008514,
"learning_rate": 9.614830661668829e-05,
"loss": 0.4757,
"step": 171
},
{
"epoch": 0.12988484047574098,
"grad_norm": 0.15386731922626495,
"learning_rate": 9.610237520669016e-05,
"loss": 0.3874,
"step": 172
},
{
"epoch": 0.13063998489711157,
"grad_norm": 0.16264206171035767,
"learning_rate": 9.60561826557425e-05,
"loss": 0.409,
"step": 173
},
{
"epoch": 0.13139512931848216,
"grad_norm": 0.1713552325963974,
"learning_rate": 9.600972922549707e-05,
"loss": 0.4799,
"step": 174
},
{
"epoch": 0.13215027373985275,
"grad_norm": 0.17139887809753418,
"learning_rate": 9.596301517908328e-05,
"loss": 0.3992,
"step": 175
},
{
"epoch": 0.13290541816122334,
"grad_norm": 0.16348589956760406,
"learning_rate": 9.591604078110685e-05,
"loss": 0.3942,
"step": 176
},
{
"epoch": 0.13366056258259393,
"grad_norm": 0.1675892323255539,
"learning_rate": 9.586880629764817e-05,
"loss": 0.4305,
"step": 177
},
{
"epoch": 0.13441570700396452,
"grad_norm": 0.1761837750673294,
"learning_rate": 9.582131199626087e-05,
"loss": 0.4386,
"step": 178
},
{
"epoch": 0.1351708514253351,
"grad_norm": 0.1876501590013504,
"learning_rate": 9.577355814597031e-05,
"loss": 0.4765,
"step": 179
},
{
"epoch": 0.13592599584670567,
"grad_norm": 0.18781447410583496,
"learning_rate": 9.572554501727198e-05,
"loss": 0.4502,
"step": 180
},
{
"epoch": 0.13668114026807626,
"grad_norm": 0.20662984251976013,
"learning_rate": 9.567727288213005e-05,
"loss": 0.4612,
"step": 181
},
{
"epoch": 0.13743628468944685,
"grad_norm": 0.19330036640167236,
"learning_rate": 9.56287420139758e-05,
"loss": 0.4344,
"step": 182
},
{
"epoch": 0.13819142911081744,
"grad_norm": 0.19399768114089966,
"learning_rate": 9.557995268770608e-05,
"loss": 0.4314,
"step": 183
},
{
"epoch": 0.13894657353218803,
"grad_norm": 0.200825035572052,
"learning_rate": 9.553090517968169e-05,
"loss": 0.4095,
"step": 184
},
{
"epoch": 0.13970171795355862,
"grad_norm": 0.21249161660671234,
"learning_rate": 9.548159976772592e-05,
"loss": 0.5122,
"step": 185
},
{
"epoch": 0.14045686237492921,
"grad_norm": 0.2057344764471054,
"learning_rate": 9.543203673112293e-05,
"loss": 0.4131,
"step": 186
},
{
"epoch": 0.1412120067962998,
"grad_norm": 0.2109992653131485,
"learning_rate": 9.538221635061611e-05,
"loss": 0.4596,
"step": 187
},
{
"epoch": 0.14196715121767037,
"grad_norm": 0.2060767263174057,
"learning_rate": 9.533213890840657e-05,
"loss": 0.4008,
"step": 188
},
{
"epoch": 0.14272229563904096,
"grad_norm": 0.22520488500595093,
"learning_rate": 9.528180468815155e-05,
"loss": 0.451,
"step": 189
},
{
"epoch": 0.14347744006041155,
"grad_norm": 0.23366759717464447,
"learning_rate": 9.523121397496269e-05,
"loss": 0.435,
"step": 190
},
{
"epoch": 0.14423258448178214,
"grad_norm": 0.23018625378608704,
"learning_rate": 9.518036705540458e-05,
"loss": 0.4397,
"step": 191
},
{
"epoch": 0.14498772890315273,
"grad_norm": 0.22498714923858643,
"learning_rate": 9.512926421749304e-05,
"loss": 0.3717,
"step": 192
},
{
"epoch": 0.14574287332452332,
"grad_norm": 0.24823418259620667,
"learning_rate": 9.507790575069347e-05,
"loss": 0.3805,
"step": 193
},
{
"epoch": 0.1464980177458939,
"grad_norm": 0.27361419796943665,
"learning_rate": 9.502629194591926e-05,
"loss": 0.5063,
"step": 194
},
{
"epoch": 0.1472531621672645,
"grad_norm": 0.27400317788124084,
"learning_rate": 9.497442309553016e-05,
"loss": 0.455,
"step": 195
},
{
"epoch": 0.14800830658863506,
"grad_norm": 0.30166488885879517,
"learning_rate": 9.492229949333058e-05,
"loss": 0.5024,
"step": 196
},
{
"epoch": 0.14876345101000565,
"grad_norm": 0.29768070578575134,
"learning_rate": 9.486992143456792e-05,
"loss": 0.382,
"step": 197
},
{
"epoch": 0.14951859543137624,
"grad_norm": 0.3229611814022064,
"learning_rate": 9.481728921593093e-05,
"loss": 0.4178,
"step": 198
},
{
"epoch": 0.15027373985274683,
"grad_norm": 0.31606340408325195,
"learning_rate": 9.476440313554803e-05,
"loss": 0.403,
"step": 199
},
{
"epoch": 0.15102888427411743,
"grad_norm": 0.3485797047615051,
"learning_rate": 9.471126349298556e-05,
"loss": 0.3898,
"step": 200
},
{
"epoch": 0.15178402869548802,
"grad_norm": 0.12325902283191681,
"learning_rate": 9.46578705892462e-05,
"loss": 0.3083,
"step": 201
},
{
"epoch": 0.1525391731168586,
"grad_norm": 0.1569177210330963,
"learning_rate": 9.460422472676712e-05,
"loss": 0.3685,
"step": 202
},
{
"epoch": 0.1532943175382292,
"grad_norm": 0.1423245519399643,
"learning_rate": 9.45503262094184e-05,
"loss": 0.341,
"step": 203
},
{
"epoch": 0.1540494619595998,
"grad_norm": 0.13771581649780273,
"learning_rate": 9.449617534250122e-05,
"loss": 0.3271,
"step": 204
},
{
"epoch": 0.15480460638097035,
"grad_norm": 0.15911588072776794,
"learning_rate": 9.444177243274618e-05,
"loss": 0.4001,
"step": 205
},
{
"epoch": 0.15555975080234094,
"grad_norm": 0.14060606062412262,
"learning_rate": 9.438711778831152e-05,
"loss": 0.3537,
"step": 206
},
{
"epoch": 0.15631489522371153,
"grad_norm": 0.1690395623445511,
"learning_rate": 9.433221171878144e-05,
"loss": 0.5235,
"step": 207
},
{
"epoch": 0.15707003964508212,
"grad_norm": 0.133980393409729,
"learning_rate": 9.427705453516427e-05,
"loss": 0.351,
"step": 208
},
{
"epoch": 0.1578251840664527,
"grad_norm": 0.1446908861398697,
"learning_rate": 9.422164654989072e-05,
"loss": 0.3714,
"step": 209
},
{
"epoch": 0.1585803284878233,
"grad_norm": 0.15370596945285797,
"learning_rate": 9.41659880768122e-05,
"loss": 0.4452,
"step": 210
},
{
"epoch": 0.1593354729091939,
"grad_norm": 0.15254907310009003,
"learning_rate": 9.411007943119894e-05,
"loss": 0.4388,
"step": 211
},
{
"epoch": 0.16009061733056448,
"grad_norm": 0.14596116542816162,
"learning_rate": 9.405392092973823e-05,
"loss": 0.4037,
"step": 212
},
{
"epoch": 0.16084576175193505,
"grad_norm": 0.14959284663200378,
"learning_rate": 9.399751289053267e-05,
"loss": 0.3917,
"step": 213
},
{
"epoch": 0.16160090617330564,
"grad_norm": 0.16624942421913147,
"learning_rate": 9.394085563309827e-05,
"loss": 0.4609,
"step": 214
},
{
"epoch": 0.16235605059467623,
"grad_norm": 0.16217663884162903,
"learning_rate": 9.388394947836279e-05,
"loss": 0.4446,
"step": 215
},
{
"epoch": 0.16311119501604682,
"grad_norm": 0.1554042100906372,
"learning_rate": 9.382679474866376e-05,
"loss": 0.4281,
"step": 216
},
{
"epoch": 0.1638663394374174,
"grad_norm": 0.16471102833747864,
"learning_rate": 9.376939176774679e-05,
"loss": 0.4674,
"step": 217
},
{
"epoch": 0.164621483858788,
"grad_norm": 0.15578734874725342,
"learning_rate": 9.371174086076363e-05,
"loss": 0.3938,
"step": 218
},
{
"epoch": 0.1653766282801586,
"grad_norm": 0.16161073744297028,
"learning_rate": 9.365384235427042e-05,
"loss": 0.4238,
"step": 219
},
{
"epoch": 0.16613177270152918,
"grad_norm": 0.16967645287513733,
"learning_rate": 9.359569657622574e-05,
"loss": 0.4663,
"step": 220
},
{
"epoch": 0.16688691712289974,
"grad_norm": 0.1597963124513626,
"learning_rate": 9.353730385598887e-05,
"loss": 0.4046,
"step": 221
},
{
"epoch": 0.16764206154427033,
"grad_norm": 0.17794618010520935,
"learning_rate": 9.34786645243178e-05,
"loss": 0.4111,
"step": 222
},
{
"epoch": 0.16839720596564092,
"grad_norm": 0.16721504926681519,
"learning_rate": 9.341977891336749e-05,
"loss": 0.4147,
"step": 223
},
{
"epoch": 0.1691523503870115,
"grad_norm": 0.1782609224319458,
"learning_rate": 9.336064735668784e-05,
"loss": 0.4159,
"step": 224
},
{
"epoch": 0.1699074948083821,
"grad_norm": 0.17892874777317047,
"learning_rate": 9.330127018922194e-05,
"loss": 0.4498,
"step": 225
},
{
"epoch": 0.1706626392297527,
"grad_norm": 0.18540053069591522,
"learning_rate": 9.324164774730406e-05,
"loss": 0.4727,
"step": 226
},
{
"epoch": 0.17141778365112328,
"grad_norm": 0.19462263584136963,
"learning_rate": 9.318178036865785e-05,
"loss": 0.4427,
"step": 227
},
{
"epoch": 0.17217292807249387,
"grad_norm": 0.19947673380374908,
"learning_rate": 9.312166839239433e-05,
"loss": 0.4262,
"step": 228
},
{
"epoch": 0.17292807249386447,
"grad_norm": 0.19704130291938782,
"learning_rate": 9.306131215901003e-05,
"loss": 0.4226,
"step": 229
},
{
"epoch": 0.17368321691523503,
"grad_norm": 0.18819350004196167,
"learning_rate": 9.300071201038503e-05,
"loss": 0.4082,
"step": 230
},
{
"epoch": 0.17443836133660562,
"grad_norm": 0.20187118649482727,
"learning_rate": 9.293986828978106e-05,
"loss": 0.449,
"step": 231
},
{
"epoch": 0.1751935057579762,
"grad_norm": 0.2177709937095642,
"learning_rate": 9.287878134183948e-05,
"loss": 0.4607,
"step": 232
},
{
"epoch": 0.1759486501793468,
"grad_norm": 0.2070026397705078,
"learning_rate": 9.281745151257946e-05,
"loss": 0.3923,
"step": 233
},
{
"epoch": 0.1767037946007174,
"grad_norm": 0.2048753798007965,
"learning_rate": 9.275587914939586e-05,
"loss": 0.4518,
"step": 234
},
{
"epoch": 0.17745893902208798,
"grad_norm": 0.20374053716659546,
"learning_rate": 9.26940646010574e-05,
"loss": 0.4258,
"step": 235
},
{
"epoch": 0.17821408344345857,
"grad_norm": 0.22188866138458252,
"learning_rate": 9.263200821770461e-05,
"loss": 0.3896,
"step": 236
},
{
"epoch": 0.17896922786482916,
"grad_norm": 0.22264693677425385,
"learning_rate": 9.256971035084785e-05,
"loss": 0.4189,
"step": 237
},
{
"epoch": 0.17972437228619972,
"grad_norm": 0.2051049768924713,
"learning_rate": 9.250717135336534e-05,
"loss": 0.3751,
"step": 238
},
{
"epoch": 0.18047951670757031,
"grad_norm": 0.24615737795829773,
"learning_rate": 9.244439157950114e-05,
"loss": 0.4728,
"step": 239
},
{
"epoch": 0.1812346611289409,
"grad_norm": 0.2331840842962265,
"learning_rate": 9.238137138486318e-05,
"loss": 0.4516,
"step": 240
},
{
"epoch": 0.1819898055503115,
"grad_norm": 0.23615127801895142,
"learning_rate": 9.231811112642121e-05,
"loss": 0.3788,
"step": 241
},
{
"epoch": 0.18274494997168209,
"grad_norm": 0.2417721003293991,
"learning_rate": 9.225461116250483e-05,
"loss": 0.4161,
"step": 242
},
{
"epoch": 0.18350009439305268,
"grad_norm": 0.26503533124923706,
"learning_rate": 9.219087185280132e-05,
"loss": 0.4356,
"step": 243
},
{
"epoch": 0.18425523881442327,
"grad_norm": 0.24484668672084808,
"learning_rate": 9.212689355835379e-05,
"loss": 0.3629,
"step": 244
},
{
"epoch": 0.18501038323579386,
"grad_norm": 0.2426530420780182,
"learning_rate": 9.206267664155907e-05,
"loss": 0.3427,
"step": 245
},
{
"epoch": 0.18576552765716442,
"grad_norm": 0.26813021302223206,
"learning_rate": 9.199822146616552e-05,
"loss": 0.4148,
"step": 246
},
{
"epoch": 0.186520672078535,
"grad_norm": 0.3065304458141327,
"learning_rate": 9.193352839727121e-05,
"loss": 0.4015,
"step": 247
},
{
"epoch": 0.1872758164999056,
"grad_norm": 0.32816389203071594,
"learning_rate": 9.186859780132164e-05,
"loss": 0.4233,
"step": 248
},
{
"epoch": 0.1880309609212762,
"grad_norm": 0.3237447738647461,
"learning_rate": 9.18034300461078e-05,
"loss": 0.351,
"step": 249
},
{
"epoch": 0.18878610534264678,
"grad_norm": 0.3921673595905304,
"learning_rate": 9.173802550076401e-05,
"loss": 0.4465,
"step": 250
},
{
"epoch": 0.18954124976401737,
"grad_norm": 0.12883761525154114,
"learning_rate": 9.167238453576589e-05,
"loss": 0.3514,
"step": 251
},
{
"epoch": 0.19029639418538796,
"grad_norm": 0.14347058534622192,
"learning_rate": 9.160650752292819e-05,
"loss": 0.3831,
"step": 252
},
{
"epoch": 0.19105153860675855,
"grad_norm": 0.1370943933725357,
"learning_rate": 9.154039483540273e-05,
"loss": 0.3797,
"step": 253
},
{
"epoch": 0.19180668302812914,
"grad_norm": 0.1585707813501358,
"learning_rate": 9.147404684767632e-05,
"loss": 0.4072,
"step": 254
},
{
"epoch": 0.1925618274494997,
"grad_norm": 0.147821843624115,
"learning_rate": 9.140746393556854e-05,
"loss": 0.3441,
"step": 255
},
{
"epoch": 0.1933169718708703,
"grad_norm": 0.1582058221101761,
"learning_rate": 9.134064647622972e-05,
"loss": 0.4164,
"step": 256
},
{
"epoch": 0.1940721162922409,
"grad_norm": 0.1520204097032547,
"learning_rate": 9.12735948481387e-05,
"loss": 0.3704,
"step": 257
},
{
"epoch": 0.19482726071361148,
"grad_norm": 0.14902523159980774,
"learning_rate": 9.120630943110077e-05,
"loss": 0.4204,
"step": 258
},
{
"epoch": 0.19558240513498207,
"grad_norm": 0.14658547937870026,
"learning_rate": 9.113879060624553e-05,
"loss": 0.3435,
"step": 259
},
{
"epoch": 0.19633754955635266,
"grad_norm": 0.14122474193572998,
"learning_rate": 9.107103875602459e-05,
"loss": 0.3681,
"step": 260
},
{
"epoch": 0.19709269397772325,
"grad_norm": 0.14616741240024567,
"learning_rate": 9.100305426420956e-05,
"loss": 0.3931,
"step": 261
},
{
"epoch": 0.19784783839909384,
"grad_norm": 0.1556493043899536,
"learning_rate": 9.093483751588983e-05,
"loss": 0.3966,
"step": 262
},
{
"epoch": 0.1986029828204644,
"grad_norm": 0.1637788712978363,
"learning_rate": 9.086638889747035e-05,
"loss": 0.5167,
"step": 263
},
{
"epoch": 0.199358127241835,
"grad_norm": 0.1598958671092987,
"learning_rate": 9.079770879666949e-05,
"loss": 0.4257,
"step": 264
},
{
"epoch": 0.20011327166320558,
"grad_norm": 0.15921838581562042,
"learning_rate": 9.072879760251679e-05,
"loss": 0.4247,
"step": 265
},
{
"epoch": 0.20086841608457617,
"grad_norm": 0.16750189661979675,
"learning_rate": 9.065965570535082e-05,
"loss": 0.4249,
"step": 266
},
{
"epoch": 0.20162356050594676,
"grad_norm": 0.18144546449184418,
"learning_rate": 9.059028349681694e-05,
"loss": 0.5394,
"step": 267
},
{
"epoch": 0.20237870492731735,
"grad_norm": 0.16736812889575958,
"learning_rate": 9.052068136986502e-05,
"loss": 0.5119,
"step": 268
},
{
"epoch": 0.20313384934868794,
"grad_norm": 0.1700868159532547,
"learning_rate": 9.045084971874738e-05,
"loss": 0.4552,
"step": 269
},
{
"epoch": 0.20388899377005854,
"grad_norm": 0.17564022541046143,
"learning_rate": 9.038078893901634e-05,
"loss": 0.4344,
"step": 270
},
{
"epoch": 0.2046441381914291,
"grad_norm": 0.18228355050086975,
"learning_rate": 9.031049942752215e-05,
"loss": 0.4206,
"step": 271
},
{
"epoch": 0.2053992826127997,
"grad_norm": 0.17782603204250336,
"learning_rate": 9.023998158241068e-05,
"loss": 0.4786,
"step": 272
},
{
"epoch": 0.20615442703417028,
"grad_norm": 0.18378295004367828,
"learning_rate": 9.016923580312113e-05,
"loss": 0.443,
"step": 273
},
{
"epoch": 0.20690957145554087,
"grad_norm": 0.17299628257751465,
"learning_rate": 9.009826249038387e-05,
"loss": 0.3804,
"step": 274
},
{
"epoch": 0.20766471587691146,
"grad_norm": 0.19087707996368408,
"learning_rate": 9.002706204621803e-05,
"loss": 0.4698,
"step": 275
},
{
"epoch": 0.20841986029828205,
"grad_norm": 0.17627054452896118,
"learning_rate": 8.995563487392932e-05,
"loss": 0.3927,
"step": 276
},
{
"epoch": 0.20917500471965264,
"grad_norm": 0.19746743142604828,
"learning_rate": 8.988398137810777e-05,
"loss": 0.4765,
"step": 277
},
{
"epoch": 0.20993014914102323,
"grad_norm": 0.19492515921592712,
"learning_rate": 8.981210196462533e-05,
"loss": 0.4254,
"step": 278
},
{
"epoch": 0.21068529356239382,
"grad_norm": 0.19054411351680756,
"learning_rate": 8.973999704063365e-05,
"loss": 0.398,
"step": 279
},
{
"epoch": 0.21144043798376438,
"grad_norm": 0.202229306101799,
"learning_rate": 8.966766701456177e-05,
"loss": 0.5233,
"step": 280
},
{
"epoch": 0.21219558240513497,
"grad_norm": 0.20624679327011108,
"learning_rate": 8.959511229611376e-05,
"loss": 0.4411,
"step": 281
},
{
"epoch": 0.21295072682650557,
"grad_norm": 0.2044438272714615,
"learning_rate": 8.952233329626647e-05,
"loss": 0.4102,
"step": 282
},
{
"epoch": 0.21370587124787616,
"grad_norm": 0.20926257967948914,
"learning_rate": 8.944933042726714e-05,
"loss": 0.3872,
"step": 283
},
{
"epoch": 0.21446101566924675,
"grad_norm": 0.2206258326768875,
"learning_rate": 8.937610410263109e-05,
"loss": 0.4126,
"step": 284
},
{
"epoch": 0.21521616009061734,
"grad_norm": 0.23429769277572632,
"learning_rate": 8.930265473713938e-05,
"loss": 0.4453,
"step": 285
},
{
"epoch": 0.21597130451198793,
"grad_norm": 0.2304621934890747,
"learning_rate": 8.922898274683644e-05,
"loss": 0.4429,
"step": 286
},
{
"epoch": 0.21672644893335852,
"grad_norm": 0.23192839324474335,
"learning_rate": 8.915508854902778e-05,
"loss": 0.4737,
"step": 287
},
{
"epoch": 0.21748159335472908,
"grad_norm": 0.23144613206386566,
"learning_rate": 8.908097256227749e-05,
"loss": 0.382,
"step": 288
},
{
"epoch": 0.21823673777609967,
"grad_norm": 0.24735011160373688,
"learning_rate": 8.900663520640604e-05,
"loss": 0.3929,
"step": 289
},
{
"epoch": 0.21899188219747026,
"grad_norm": 0.267395555973053,
"learning_rate": 8.893207690248776e-05,
"loss": 0.4586,
"step": 290
},
{
"epoch": 0.21974702661884085,
"grad_norm": 0.2836948335170746,
"learning_rate": 8.885729807284856e-05,
"loss": 0.5218,
"step": 291
},
{
"epoch": 0.22050217104021144,
"grad_norm": 0.2509090304374695,
"learning_rate": 8.878229914106342e-05,
"loss": 0.4114,
"step": 292
},
{
"epoch": 0.22125731546158203,
"grad_norm": 0.27369990944862366,
"learning_rate": 8.870708053195413e-05,
"loss": 0.3736,
"step": 293
},
{
"epoch": 0.22201245988295262,
"grad_norm": 0.279341459274292,
"learning_rate": 8.863164267158678e-05,
"loss": 0.3845,
"step": 294
},
{
"epoch": 0.2227676043043232,
"grad_norm": 0.27738478779792786,
"learning_rate": 8.855598598726939e-05,
"loss": 0.3331,
"step": 295
},
{
"epoch": 0.22352274872569378,
"grad_norm": 0.32042670249938965,
"learning_rate": 8.848011090754947e-05,
"loss": 0.3943,
"step": 296
},
{
"epoch": 0.22427789314706437,
"grad_norm": 0.2977651059627533,
"learning_rate": 8.840401786221159e-05,
"loss": 0.3701,
"step": 297
},
{
"epoch": 0.22503303756843496,
"grad_norm": 0.35676780343055725,
"learning_rate": 8.832770728227502e-05,
"loss": 0.4728,
"step": 298
},
{
"epoch": 0.22578818198980555,
"grad_norm": 0.3656991720199585,
"learning_rate": 8.825117959999116e-05,
"loss": 0.3662,
"step": 299
},
{
"epoch": 0.22654332641117614,
"grad_norm": 0.36816850304603577,
"learning_rate": 8.817443524884119e-05,
"loss": 0.3644,
"step": 300
},
{
"epoch": 0.22729847083254673,
"grad_norm": 0.12952867150306702,
"learning_rate": 8.809747466353356e-05,
"loss": 0.2991,
"step": 301
},
{
"epoch": 0.22805361525391732,
"grad_norm": 0.1346244215965271,
"learning_rate": 8.802029828000156e-05,
"loss": 0.3154,
"step": 302
},
{
"epoch": 0.2288087596752879,
"grad_norm": 0.15575996041297913,
"learning_rate": 8.794290653540084e-05,
"loss": 0.4171,
"step": 303
},
{
"epoch": 0.22956390409665847,
"grad_norm": 0.1542641818523407,
"learning_rate": 8.7865299868107e-05,
"loss": 0.3851,
"step": 304
},
{
"epoch": 0.23031904851802906,
"grad_norm": 0.16775798797607422,
"learning_rate": 8.778747871771292e-05,
"loss": 0.3872,
"step": 305
},
{
"epoch": 0.23107419293939965,
"grad_norm": 0.15936799347400665,
"learning_rate": 8.770944352502648e-05,
"loss": 0.3998,
"step": 306
},
{
"epoch": 0.23182933736077024,
"grad_norm": 0.16513392329216003,
"learning_rate": 8.763119473206794e-05,
"loss": 0.4005,
"step": 307
},
{
"epoch": 0.23258448178214083,
"grad_norm": 0.14965298771858215,
"learning_rate": 8.755273278206749e-05,
"loss": 0.3792,
"step": 308
},
{
"epoch": 0.23333962620351142,
"grad_norm": 0.1585433930158615,
"learning_rate": 8.74740581194627e-05,
"loss": 0.4291,
"step": 309
},
{
"epoch": 0.23409477062488201,
"grad_norm": 0.16825465857982635,
"learning_rate": 8.739517118989605e-05,
"loss": 0.4434,
"step": 310
},
{
"epoch": 0.2348499150462526,
"grad_norm": 0.16293483972549438,
"learning_rate": 8.731607244021236e-05,
"loss": 0.4481,
"step": 311
},
{
"epoch": 0.2356050594676232,
"grad_norm": 0.15974049270153046,
"learning_rate": 8.723676231845626e-05,
"loss": 0.4557,
"step": 312
},
{
"epoch": 0.23636020388899376,
"grad_norm": 0.1649303436279297,
"learning_rate": 8.715724127386972e-05,
"loss": 0.4451,
"step": 313
},
{
"epoch": 0.23711534831036435,
"grad_norm": 0.16639627516269684,
"learning_rate": 8.70775097568894e-05,
"loss": 0.4518,
"step": 314
},
{
"epoch": 0.23787049273173494,
"grad_norm": 0.17083673179149628,
"learning_rate": 8.69975682191442e-05,
"loss": 0.3745,
"step": 315
},
{
"epoch": 0.23862563715310553,
"grad_norm": 0.16381201148033142,
"learning_rate": 8.691741711345263e-05,
"loss": 0.3922,
"step": 316
},
{
"epoch": 0.23938078157447612,
"grad_norm": 0.17072418332099915,
"learning_rate": 8.683705689382024e-05,
"loss": 0.3563,
"step": 317
},
{
"epoch": 0.2401359259958467,
"grad_norm": 0.1824284940958023,
"learning_rate": 8.675648801543718e-05,
"loss": 0.432,
"step": 318
},
{
"epoch": 0.2408910704172173,
"grad_norm": 0.17872655391693115,
"learning_rate": 8.667571093467541e-05,
"loss": 0.4294,
"step": 319
},
{
"epoch": 0.2416462148385879,
"grad_norm": 0.17728659510612488,
"learning_rate": 8.659472610908627e-05,
"loss": 0.3963,
"step": 320
},
{
"epoch": 0.24240135925995845,
"grad_norm": 0.18014661967754364,
"learning_rate": 8.651353399739787e-05,
"loss": 0.4614,
"step": 321
},
{
"epoch": 0.24315650368132904,
"grad_norm": 0.18057285249233246,
"learning_rate": 8.643213505951242e-05,
"loss": 0.4134,
"step": 322
},
{
"epoch": 0.24391164810269964,
"grad_norm": 0.19068454205989838,
"learning_rate": 8.635052975650369e-05,
"loss": 0.539,
"step": 323
},
{
"epoch": 0.24466679252407023,
"grad_norm": 0.18311984837055206,
"learning_rate": 8.626871855061438e-05,
"loss": 0.3847,
"step": 324
},
{
"epoch": 0.24542193694544082,
"grad_norm": 0.172931507229805,
"learning_rate": 8.618670190525352e-05,
"loss": 0.3835,
"step": 325
},
{
"epoch": 0.2461770813668114,
"grad_norm": 0.18892034888267517,
"learning_rate": 8.610448028499376e-05,
"loss": 0.4231,
"step": 326
},
{
"epoch": 0.246932225788182,
"grad_norm": 0.19887331128120422,
"learning_rate": 8.602205415556889e-05,
"loss": 0.4835,
"step": 327
},
{
"epoch": 0.2476873702095526,
"grad_norm": 0.18918727338314056,
"learning_rate": 8.593942398387105e-05,
"loss": 0.4285,
"step": 328
},
{
"epoch": 0.24844251463092315,
"grad_norm": 0.19010977447032928,
"learning_rate": 8.585659023794818e-05,
"loss": 0.4059,
"step": 329
},
{
"epoch": 0.24919765905229374,
"grad_norm": 0.1906062811613083,
"learning_rate": 8.577355338700132e-05,
"loss": 0.423,
"step": 330
},
{
"epoch": 0.24995280347366433,
"grad_norm": 0.1999729573726654,
"learning_rate": 8.569031390138202e-05,
"loss": 0.4482,
"step": 331
},
{
"epoch": 0.2507079478950349,
"grad_norm": 0.20304431021213531,
"learning_rate": 8.560687225258958e-05,
"loss": 0.3917,
"step": 332
},
{
"epoch": 0.2514630923164055,
"grad_norm": 0.21377113461494446,
"learning_rate": 8.552322891326846e-05,
"loss": 0.4558,
"step": 333
},
{
"epoch": 0.2522182367377761,
"grad_norm": 0.21330960094928741,
"learning_rate": 8.543938435720549e-05,
"loss": 0.4073,
"step": 334
},
{
"epoch": 0.2529733811591467,
"grad_norm": 0.2062496691942215,
"learning_rate": 8.535533905932738e-05,
"loss": 0.3382,
"step": 335
},
{
"epoch": 0.2537285255805173,
"grad_norm": 0.24538478255271912,
"learning_rate": 8.527109349569787e-05,
"loss": 0.4455,
"step": 336
},
{
"epoch": 0.2544836700018879,
"grad_norm": 0.2312334179878235,
"learning_rate": 8.518664814351502e-05,
"loss": 0.4248,
"step": 337
},
{
"epoch": 0.25523881442325846,
"grad_norm": 0.2378857582807541,
"learning_rate": 8.510200348110868e-05,
"loss": 0.4568,
"step": 338
},
{
"epoch": 0.25599395884462905,
"grad_norm": 0.2495100200176239,
"learning_rate": 8.501715998793757e-05,
"loss": 0.4504,
"step": 339
},
{
"epoch": 0.25674910326599965,
"grad_norm": 0.2401399314403534,
"learning_rate": 8.493211814458673e-05,
"loss": 0.3736,
"step": 340
},
{
"epoch": 0.25750424768737024,
"grad_norm": 0.2643365263938904,
"learning_rate": 8.484687843276469e-05,
"loss": 0.4577,
"step": 341
},
{
"epoch": 0.25825939210874077,
"grad_norm": 0.26086193323135376,
"learning_rate": 8.476144133530075e-05,
"loss": 0.4151,
"step": 342
},
{
"epoch": 0.25901453653011136,
"grad_norm": 0.25575414299964905,
"learning_rate": 8.467580733614233e-05,
"loss": 0.4097,
"step": 343
},
{
"epoch": 0.25976968095148195,
"grad_norm": 0.24969545006752014,
"learning_rate": 8.45899769203522e-05,
"loss": 0.3696,
"step": 344
},
{
"epoch": 0.26052482537285254,
"grad_norm": 0.2540886402130127,
"learning_rate": 8.450395057410561e-05,
"loss": 0.3615,
"step": 345
},
{
"epoch": 0.26127996979422313,
"grad_norm": 0.30520960688591003,
"learning_rate": 8.44177287846877e-05,
"loss": 0.393,
"step": 346
},
{
"epoch": 0.2620351142155937,
"grad_norm": 0.300483763217926,
"learning_rate": 8.433131204049067e-05,
"loss": 0.3889,
"step": 347
},
{
"epoch": 0.2627902586369643,
"grad_norm": 0.326023131608963,
"learning_rate": 8.424470083101101e-05,
"loss": 0.4587,
"step": 348
},
{
"epoch": 0.2635454030583349,
"grad_norm": 0.3497852683067322,
"learning_rate": 8.415789564684673e-05,
"loss": 0.4071,
"step": 349
},
{
"epoch": 0.2643005474797055,
"grad_norm": 0.39250481128692627,
"learning_rate": 8.407089697969457e-05,
"loss": 0.3864,
"step": 350
},
{
"epoch": 0.2650556919010761,
"grad_norm": 0.13501006364822388,
"learning_rate": 8.398370532234722e-05,
"loss": 0.2857,
"step": 351
},
{
"epoch": 0.2658108363224467,
"grad_norm": 0.13370132446289062,
"learning_rate": 8.389632116869061e-05,
"loss": 0.3307,
"step": 352
},
{
"epoch": 0.26656598074381727,
"grad_norm": 0.15557359158992767,
"learning_rate": 8.380874501370097e-05,
"loss": 0.3663,
"step": 353
},
{
"epoch": 0.26732112516518786,
"grad_norm": 0.15083420276641846,
"learning_rate": 8.372097735344212e-05,
"loss": 0.3517,
"step": 354
},
{
"epoch": 0.26807626958655845,
"grad_norm": 0.1640172004699707,
"learning_rate": 8.363301868506264e-05,
"loss": 0.3621,
"step": 355
},
{
"epoch": 0.26883141400792904,
"grad_norm": 0.17162185907363892,
"learning_rate": 8.354486950679301e-05,
"loss": 0.3933,
"step": 356
},
{
"epoch": 0.2695865584292996,
"grad_norm": 0.154635950922966,
"learning_rate": 8.345653031794292e-05,
"loss": 0.3834,
"step": 357
},
{
"epoch": 0.2703417028506702,
"grad_norm": 0.15567855536937714,
"learning_rate": 8.336800161889826e-05,
"loss": 0.4,
"step": 358
},
{
"epoch": 0.27109684727204075,
"grad_norm": 0.1589149832725525,
"learning_rate": 8.327928391111841e-05,
"loss": 0.3923,
"step": 359
},
{
"epoch": 0.27185199169341134,
"grad_norm": 0.1654612272977829,
"learning_rate": 8.319037769713338e-05,
"loss": 0.3808,
"step": 360
},
{
"epoch": 0.27260713611478193,
"grad_norm": 0.1704426407814026,
"learning_rate": 8.310128348054094e-05,
"loss": 0.4662,
"step": 361
},
{
"epoch": 0.2733622805361525,
"grad_norm": 0.16645170748233795,
"learning_rate": 8.301200176600375e-05,
"loss": 0.4369,
"step": 362
},
{
"epoch": 0.2741174249575231,
"grad_norm": 0.16606634855270386,
"learning_rate": 8.292253305924655e-05,
"loss": 0.4147,
"step": 363
},
{
"epoch": 0.2748725693788937,
"grad_norm": 0.17440947890281677,
"learning_rate": 8.283287786705331e-05,
"loss": 0.4787,
"step": 364
},
{
"epoch": 0.2756277138002643,
"grad_norm": 0.1618100106716156,
"learning_rate": 8.274303669726426e-05,
"loss": 0.3666,
"step": 365
},
{
"epoch": 0.2763828582216349,
"grad_norm": 0.17216123640537262,
"learning_rate": 8.265301005877309e-05,
"loss": 0.4511,
"step": 366
},
{
"epoch": 0.2771380026430055,
"grad_norm": 0.17408417165279388,
"learning_rate": 8.25627984615241e-05,
"loss": 0.4472,
"step": 367
},
{
"epoch": 0.27789314706437607,
"grad_norm": 0.17124348878860474,
"learning_rate": 8.247240241650918e-05,
"loss": 0.4406,
"step": 368
},
{
"epoch": 0.27864829148574666,
"grad_norm": 0.1730695217847824,
"learning_rate": 8.238182243576512e-05,
"loss": 0.3658,
"step": 369
},
{
"epoch": 0.27940343590711725,
"grad_norm": 0.1981906294822693,
"learning_rate": 8.229105903237044e-05,
"loss": 0.4417,
"step": 370
},
{
"epoch": 0.28015858032848784,
"grad_norm": 0.18475179374217987,
"learning_rate": 8.220011272044277e-05,
"loss": 0.4125,
"step": 371
},
{
"epoch": 0.28091372474985843,
"grad_norm": 0.1894509643316269,
"learning_rate": 8.210898401513574e-05,
"loss": 0.463,
"step": 372
},
{
"epoch": 0.281668869171229,
"grad_norm": 0.18303871154785156,
"learning_rate": 8.201767343263612e-05,
"loss": 0.4299,
"step": 373
},
{
"epoch": 0.2824240135925996,
"grad_norm": 0.1773396134376526,
"learning_rate": 8.192618149016091e-05,
"loss": 0.384,
"step": 374
},
{
"epoch": 0.28317915801397014,
"grad_norm": 0.1883654147386551,
"learning_rate": 8.183450870595441e-05,
"loss": 0.3913,
"step": 375
},
{
"epoch": 0.28393430243534074,
"grad_norm": 0.18863160908222198,
"learning_rate": 8.174265559928527e-05,
"loss": 0.4005,
"step": 376
},
{
"epoch": 0.2846894468567113,
"grad_norm": 0.18807992339134216,
"learning_rate": 8.165062269044353e-05,
"loss": 0.3547,
"step": 377
},
{
"epoch": 0.2854445912780819,
"grad_norm": 0.1992112398147583,
"learning_rate": 8.155841050073771e-05,
"loss": 0.3788,
"step": 378
},
{
"epoch": 0.2861997356994525,
"grad_norm": 0.2015550285577774,
"learning_rate": 8.146601955249188e-05,
"loss": 0.457,
"step": 379
},
{
"epoch": 0.2869548801208231,
"grad_norm": 0.2002776712179184,
"learning_rate": 8.13734503690426e-05,
"loss": 0.3584,
"step": 380
},
{
"epoch": 0.2877100245421937,
"grad_norm": 0.21863123774528503,
"learning_rate": 8.128070347473609e-05,
"loss": 0.4101,
"step": 381
},
{
"epoch": 0.2884651689635643,
"grad_norm": 0.21147026121616364,
"learning_rate": 8.11877793949251e-05,
"loss": 0.4124,
"step": 382
},
{
"epoch": 0.28922031338493487,
"grad_norm": 0.21045903861522675,
"learning_rate": 8.109467865596612e-05,
"loss": 0.3634,
"step": 383
},
{
"epoch": 0.28997545780630546,
"grad_norm": 0.23374846577644348,
"learning_rate": 8.100140178521624e-05,
"loss": 0.4587,
"step": 384
},
{
"epoch": 0.29073060222767605,
"grad_norm": 0.21156945824623108,
"learning_rate": 8.090794931103026e-05,
"loss": 0.3417,
"step": 385
},
{
"epoch": 0.29148574664904664,
"grad_norm": 0.23073367774486542,
"learning_rate": 8.081432176275765e-05,
"loss": 0.4538,
"step": 386
},
{
"epoch": 0.29224089107041723,
"grad_norm": 0.25584685802459717,
"learning_rate": 8.072051967073955e-05,
"loss": 0.4006,
"step": 387
},
{
"epoch": 0.2929960354917878,
"grad_norm": 0.25985392928123474,
"learning_rate": 8.06265435663058e-05,
"loss": 0.4146,
"step": 388
},
{
"epoch": 0.2937511799131584,
"grad_norm": 0.23220856487751007,
"learning_rate": 8.053239398177191e-05,
"loss": 0.3574,
"step": 389
},
{
"epoch": 0.294506324334529,
"grad_norm": 0.26954004168510437,
"learning_rate": 8.043807145043604e-05,
"loss": 0.4694,
"step": 390
},
{
"epoch": 0.2952614687558996,
"grad_norm": 0.25626078248023987,
"learning_rate": 8.034357650657598e-05,
"loss": 0.3952,
"step": 391
},
{
"epoch": 0.2960166131772701,
"grad_norm": 0.24921610951423645,
"learning_rate": 8.024890968544613e-05,
"loss": 0.3574,
"step": 392
},
{
"epoch": 0.2967717575986407,
"grad_norm": 0.25851958990097046,
"learning_rate": 8.015407152327448e-05,
"loss": 0.3562,
"step": 393
},
{
"epoch": 0.2975269020200113,
"grad_norm": 0.29698118567466736,
"learning_rate": 8.005906255725956e-05,
"loss": 0.4558,
"step": 394
},
{
"epoch": 0.2982820464413819,
"grad_norm": 0.2812412977218628,
"learning_rate": 7.996388332556735e-05,
"loss": 0.3705,
"step": 395
},
{
"epoch": 0.2990371908627525,
"grad_norm": 0.27774620056152344,
"learning_rate": 7.986853436732836e-05,
"loss": 0.3893,
"step": 396
},
{
"epoch": 0.2997923352841231,
"grad_norm": 0.3051348626613617,
"learning_rate": 7.97730162226344e-05,
"loss": 0.3988,
"step": 397
},
{
"epoch": 0.30054747970549367,
"grad_norm": 0.34216588735580444,
"learning_rate": 7.967732943253571e-05,
"loss": 0.3755,
"step": 398
},
{
"epoch": 0.30130262412686426,
"grad_norm": 0.3706994652748108,
"learning_rate": 7.958147453903773e-05,
"loss": 0.4116,
"step": 399
},
{
"epoch": 0.30205776854823485,
"grad_norm": 0.44550538063049316,
"learning_rate": 7.94854520850981e-05,
"loss": 0.3799,
"step": 400
},
{
"epoch": 0.30281291296960544,
"grad_norm": 0.14217671751976013,
"learning_rate": 7.938926261462366e-05,
"loss": 0.3466,
"step": 401
},
{
"epoch": 0.30356805739097603,
"grad_norm": 0.15967446565628052,
"learning_rate": 7.92929066724672e-05,
"loss": 0.3642,
"step": 402
},
{
"epoch": 0.3043232018123466,
"grad_norm": 0.15074422955513,
"learning_rate": 7.919638480442452e-05,
"loss": 0.3297,
"step": 403
},
{
"epoch": 0.3050783462337172,
"grad_norm": 0.15751783549785614,
"learning_rate": 7.90996975572313e-05,
"loss": 0.3825,
"step": 404
},
{
"epoch": 0.3058334906550878,
"grad_norm": 0.15959064662456512,
"learning_rate": 7.900284547855991e-05,
"loss": 0.4007,
"step": 405
},
{
"epoch": 0.3065886350764584,
"grad_norm": 0.16654494404792786,
"learning_rate": 7.890582911701649e-05,
"loss": 0.3951,
"step": 406
},
{
"epoch": 0.307343779497829,
"grad_norm": 0.16326242685317993,
"learning_rate": 7.880864902213765e-05,
"loss": 0.3828,
"step": 407
},
{
"epoch": 0.3080989239191996,
"grad_norm": 0.1605200171470642,
"learning_rate": 7.871130574438752e-05,
"loss": 0.4009,
"step": 408
},
{
"epoch": 0.3088540683405701,
"grad_norm": 0.1753162443637848,
"learning_rate": 7.861379983515449e-05,
"loss": 0.4088,
"step": 409
},
{
"epoch": 0.3096092127619407,
"grad_norm": 0.1612052321434021,
"learning_rate": 7.85161318467482e-05,
"loss": 0.3821,
"step": 410
},
{
"epoch": 0.3103643571833113,
"grad_norm": 0.17267635464668274,
"learning_rate": 7.841830233239638e-05,
"loss": 0.4376,
"step": 411
},
{
"epoch": 0.3111195016046819,
"grad_norm": 0.17416229844093323,
"learning_rate": 7.832031184624164e-05,
"loss": 0.4073,
"step": 412
},
{
"epoch": 0.31187464602605247,
"grad_norm": 0.17623671889305115,
"learning_rate": 7.822216094333847e-05,
"loss": 0.4482,
"step": 413
},
{
"epoch": 0.31262979044742306,
"grad_norm": 0.1862659901380539,
"learning_rate": 7.812385017964994e-05,
"loss": 0.4902,
"step": 414
},
{
"epoch": 0.31338493486879365,
"grad_norm": 0.19047270715236664,
"learning_rate": 7.80253801120447e-05,
"loss": 0.4848,
"step": 415
},
{
"epoch": 0.31414007929016424,
"grad_norm": 0.18323764204978943,
"learning_rate": 7.792675129829373e-05,
"loss": 0.3942,
"step": 416
},
{
"epoch": 0.31489522371153483,
"grad_norm": 0.17627227306365967,
"learning_rate": 7.78279642970672e-05,
"loss": 0.3859,
"step": 417
},
{
"epoch": 0.3156503681329054,
"grad_norm": 0.18996286392211914,
"learning_rate": 7.772901966793132e-05,
"loss": 0.3722,
"step": 418
},
{
"epoch": 0.316405512554276,
"grad_norm": 0.17553554475307465,
"learning_rate": 7.762991797134514e-05,
"loss": 0.4016,
"step": 419
},
{
"epoch": 0.3171606569756466,
"grad_norm": 0.18145516514778137,
"learning_rate": 7.753065976865744e-05,
"loss": 0.3781,
"step": 420
},
{
"epoch": 0.3179158013970172,
"grad_norm": 0.18947438895702362,
"learning_rate": 7.74312456221035e-05,
"loss": 0.4143,
"step": 421
},
{
"epoch": 0.3186709458183878,
"grad_norm": 0.18466342985630035,
"learning_rate": 7.73316760948019e-05,
"loss": 0.446,
"step": 422
},
{
"epoch": 0.3194260902397584,
"grad_norm": 0.19619819521903992,
"learning_rate": 7.723195175075136e-05,
"loss": 0.4955,
"step": 423
},
{
"epoch": 0.32018123466112897,
"grad_norm": 0.1804795265197754,
"learning_rate": 7.713207315482755e-05,
"loss": 0.3693,
"step": 424
},
{
"epoch": 0.3209363790824995,
"grad_norm": 0.1992131769657135,
"learning_rate": 7.703204087277988e-05,
"loss": 0.4447,
"step": 425
},
{
"epoch": 0.3216915235038701,
"grad_norm": 0.19404684007167816,
"learning_rate": 7.693185547122829e-05,
"loss": 0.3876,
"step": 426
},
{
"epoch": 0.3224466679252407,
"grad_norm": 0.19772908091545105,
"learning_rate": 7.683151751766004e-05,
"loss": 0.4085,
"step": 427
},
{
"epoch": 0.3232018123466113,
"grad_norm": 0.21671298146247864,
"learning_rate": 7.673102758042653e-05,
"loss": 0.4911,
"step": 428
},
{
"epoch": 0.32395695676798186,
"grad_norm": 0.20475253462791443,
"learning_rate": 7.663038622873999e-05,
"loss": 0.395,
"step": 429
},
{
"epoch": 0.32471210118935245,
"grad_norm": 0.2204219251871109,
"learning_rate": 7.652959403267041e-05,
"loss": 0.4481,
"step": 430
},
{
"epoch": 0.32546724561072304,
"grad_norm": 0.20216889679431915,
"learning_rate": 7.64286515631421e-05,
"loss": 0.3628,
"step": 431
},
{
"epoch": 0.32622239003209363,
"grad_norm": 0.22937530279159546,
"learning_rate": 7.63275593919307e-05,
"loss": 0.5035,
"step": 432
},
{
"epoch": 0.3269775344534642,
"grad_norm": 0.2090550810098648,
"learning_rate": 7.622631809165973e-05,
"loss": 0.3664,
"step": 433
},
{
"epoch": 0.3277326788748348,
"grad_norm": 0.22946757078170776,
"learning_rate": 7.612492823579745e-05,
"loss": 0.4146,
"step": 434
},
{
"epoch": 0.3284878232962054,
"grad_norm": 0.24275080859661102,
"learning_rate": 7.602339039865362e-05,
"loss": 0.3957,
"step": 435
},
{
"epoch": 0.329242967717576,
"grad_norm": 0.24225813150405884,
"learning_rate": 7.59217051553762e-05,
"loss": 0.4345,
"step": 436
},
{
"epoch": 0.3299981121389466,
"grad_norm": 0.24835467338562012,
"learning_rate": 7.58198730819481e-05,
"loss": 0.4618,
"step": 437
},
{
"epoch": 0.3307532565603172,
"grad_norm": 0.22757121920585632,
"learning_rate": 7.571789475518399e-05,
"loss": 0.3643,
"step": 438
},
{
"epoch": 0.33150840098168777,
"grad_norm": 0.2474457025527954,
"learning_rate": 7.561577075272686e-05,
"loss": 0.3929,
"step": 439
},
{
"epoch": 0.33226354540305836,
"grad_norm": 0.2527807056903839,
"learning_rate": 7.5513501653045e-05,
"loss": 0.3763,
"step": 440
},
{
"epoch": 0.33301868982442895,
"grad_norm": 0.25088363885879517,
"learning_rate": 7.541108803542846e-05,
"loss": 0.4079,
"step": 441
},
{
"epoch": 0.3337738342457995,
"grad_norm": 0.26713383197784424,
"learning_rate": 7.530853047998599e-05,
"loss": 0.4074,
"step": 442
},
{
"epoch": 0.3345289786671701,
"grad_norm": 0.2865240275859833,
"learning_rate": 7.52058295676416e-05,
"loss": 0.4333,
"step": 443
},
{
"epoch": 0.33528412308854066,
"grad_norm": 0.27827370166778564,
"learning_rate": 7.510298588013134e-05,
"loss": 0.4248,
"step": 444
},
{
"epoch": 0.33603926750991125,
"grad_norm": 0.27344340085983276,
"learning_rate": 7.500000000000001e-05,
"loss": 0.3485,
"step": 445
},
{
"epoch": 0.33679441193128185,
"grad_norm": 0.2948186993598938,
"learning_rate": 7.48968725105978e-05,
"loss": 0.3763,
"step": 446
},
{
"epoch": 0.33754955635265244,
"grad_norm": 0.3137521743774414,
"learning_rate": 7.479360399607707e-05,
"loss": 0.4188,
"step": 447
},
{
"epoch": 0.338304700774023,
"grad_norm": 0.30302637815475464,
"learning_rate": 7.469019504138898e-05,
"loss": 0.3524,
"step": 448
},
{
"epoch": 0.3390598451953936,
"grad_norm": 0.4019578695297241,
"learning_rate": 7.45866462322802e-05,
"loss": 0.4313,
"step": 449
},
{
"epoch": 0.3398149896167642,
"grad_norm": 0.40166327357292175,
"learning_rate": 7.448295815528956e-05,
"loss": 0.4065,
"step": 450
},
{
"epoch": 0.3405701340381348,
"grad_norm": 0.15198914706707,
"learning_rate": 7.437913139774482e-05,
"loss": 0.3844,
"step": 451
},
{
"epoch": 0.3413252784595054,
"grad_norm": 0.1483955681324005,
"learning_rate": 7.427516654775922e-05,
"loss": 0.3974,
"step": 452
},
{
"epoch": 0.342080422880876,
"grad_norm": 0.15665127336978912,
"learning_rate": 7.417106419422819e-05,
"loss": 0.3879,
"step": 453
},
{
"epoch": 0.34283556730224657,
"grad_norm": 0.15843887627124786,
"learning_rate": 7.406682492682611e-05,
"loss": 0.4013,
"step": 454
},
{
"epoch": 0.34359071172361716,
"grad_norm": 0.1631614714860916,
"learning_rate": 7.396244933600285e-05,
"loss": 0.4135,
"step": 455
},
{
"epoch": 0.34434585614498775,
"grad_norm": 0.1582673341035843,
"learning_rate": 7.385793801298042e-05,
"loss": 0.3674,
"step": 456
},
{
"epoch": 0.34510100056635834,
"grad_norm": 0.15745416283607483,
"learning_rate": 7.375329154974975e-05,
"loss": 0.3907,
"step": 457
},
{
"epoch": 0.34585614498772893,
"grad_norm": 0.15578390657901764,
"learning_rate": 7.364851053906718e-05,
"loss": 0.3829,
"step": 458
},
{
"epoch": 0.34661128940909947,
"grad_norm": 0.16378001868724823,
"learning_rate": 7.354359557445126e-05,
"loss": 0.4303,
"step": 459
},
{
"epoch": 0.34736643383047006,
"grad_norm": 0.16894683241844177,
"learning_rate": 7.343854725017918e-05,
"loss": 0.4237,
"step": 460
},
{
"epoch": 0.34812157825184065,
"grad_norm": 0.16270391643047333,
"learning_rate": 7.333336616128369e-05,
"loss": 0.4058,
"step": 461
},
{
"epoch": 0.34887672267321124,
"grad_norm": 0.1628381311893463,
"learning_rate": 7.322805290354943e-05,
"loss": 0.3908,
"step": 462
},
{
"epoch": 0.3496318670945818,
"grad_norm": 0.17563199996948242,
"learning_rate": 7.312260807350975e-05,
"loss": 0.4293,
"step": 463
},
{
"epoch": 0.3503870115159524,
"grad_norm": 0.16583438217639923,
"learning_rate": 7.301703226844327e-05,
"loss": 0.3951,
"step": 464
},
{
"epoch": 0.351142155937323,
"grad_norm": 0.18068847060203552,
"learning_rate": 7.291132608637052e-05,
"loss": 0.4112,
"step": 465
},
{
"epoch": 0.3518973003586936,
"grad_norm": 0.18490007519721985,
"learning_rate": 7.28054901260505e-05,
"loss": 0.4502,
"step": 466
},
{
"epoch": 0.3526524447800642,
"grad_norm": 0.1794682741165161,
"learning_rate": 7.269952498697734e-05,
"loss": 0.414,
"step": 467
},
{
"epoch": 0.3534075892014348,
"grad_norm": 0.18929120898246765,
"learning_rate": 7.259343126937689e-05,
"loss": 0.4282,
"step": 468
},
{
"epoch": 0.35416273362280537,
"grad_norm": 0.18550460040569305,
"learning_rate": 7.24872095742033e-05,
"loss": 0.4052,
"step": 469
},
{
"epoch": 0.35491787804417596,
"grad_norm": 0.20153513550758362,
"learning_rate": 7.238086050313563e-05,
"loss": 0.5138,
"step": 470
},
{
"epoch": 0.35567302246554655,
"grad_norm": 0.1876605898141861,
"learning_rate": 7.227438465857448e-05,
"loss": 0.3959,
"step": 471
},
{
"epoch": 0.35642816688691714,
"grad_norm": 0.19823205471038818,
"learning_rate": 7.216778264363853e-05,
"loss": 0.4628,
"step": 472
},
{
"epoch": 0.35718331130828773,
"grad_norm": 0.1963682770729065,
"learning_rate": 7.206105506216106e-05,
"loss": 0.3864,
"step": 473
},
{
"epoch": 0.3579384557296583,
"grad_norm": 0.197306826710701,
"learning_rate": 7.195420251868675e-05,
"loss": 0.3942,
"step": 474
},
{
"epoch": 0.35869360015102886,
"grad_norm": 0.20867206156253815,
"learning_rate": 7.184722561846798e-05,
"loss": 0.4348,
"step": 475
},
{
"epoch": 0.35944874457239945,
"grad_norm": 0.20809237658977509,
"learning_rate": 7.17401249674616e-05,
"loss": 0.4088,
"step": 476
},
{
"epoch": 0.36020388899377004,
"grad_norm": 0.2266395539045334,
"learning_rate": 7.163290117232542e-05,
"loss": 0.4356,
"step": 477
},
{
"epoch": 0.36095903341514063,
"grad_norm": 0.21087811887264252,
"learning_rate": 7.152555484041476e-05,
"loss": 0.3613,
"step": 478
},
{
"epoch": 0.3617141778365112,
"grad_norm": 0.21059347689151764,
"learning_rate": 7.141808657977907e-05,
"loss": 0.3699,
"step": 479
},
{
"epoch": 0.3624693222578818,
"grad_norm": 0.2281947284936905,
"learning_rate": 7.131049699915841e-05,
"loss": 0.4497,
"step": 480
},
{
"epoch": 0.3632244666792524,
"grad_norm": 0.21307678520679474,
"learning_rate": 7.120278670798009e-05,
"loss": 0.3323,
"step": 481
},
{
"epoch": 0.363979611100623,
"grad_norm": 0.22060398757457733,
"learning_rate": 7.109495631635512e-05,
"loss": 0.3923,
"step": 482
},
{
"epoch": 0.3647347555219936,
"grad_norm": 0.2321634739637375,
"learning_rate": 7.098700643507485e-05,
"loss": 0.3901,
"step": 483
},
{
"epoch": 0.36548989994336417,
"grad_norm": 0.25326424837112427,
"learning_rate": 7.08789376756074e-05,
"loss": 0.3722,
"step": 484
},
{
"epoch": 0.36624504436473476,
"grad_norm": 0.23451083898544312,
"learning_rate": 7.077075065009433e-05,
"loss": 0.3816,
"step": 485
},
{
"epoch": 0.36700018878610535,
"grad_norm": 0.2323211133480072,
"learning_rate": 7.066244597134706e-05,
"loss": 0.3628,
"step": 486
},
{
"epoch": 0.36775533320747594,
"grad_norm": 0.22949382662773132,
"learning_rate": 7.055402425284346e-05,
"loss": 0.382,
"step": 487
},
{
"epoch": 0.36851047762884653,
"grad_norm": 0.2613668739795685,
"learning_rate": 7.044548610872434e-05,
"loss": 0.4107,
"step": 488
},
{
"epoch": 0.3692656220502171,
"grad_norm": 0.264017254114151,
"learning_rate": 7.033683215379002e-05,
"loss": 0.4147,
"step": 489
},
{
"epoch": 0.3700207664715877,
"grad_norm": 0.2688436806201935,
"learning_rate": 7.022806300349675e-05,
"loss": 0.4326,
"step": 490
},
{
"epoch": 0.3707759108929583,
"grad_norm": 0.2659049332141876,
"learning_rate": 7.01191792739534e-05,
"loss": 0.3967,
"step": 491
},
{
"epoch": 0.37153105531432884,
"grad_norm": 0.25551602244377136,
"learning_rate": 7.001018158191772e-05,
"loss": 0.3369,
"step": 492
},
{
"epoch": 0.37228619973569943,
"grad_norm": 0.31377312541007996,
"learning_rate": 6.990107054479312e-05,
"loss": 0.4221,
"step": 493
},
{
"epoch": 0.37304134415707,
"grad_norm": 0.2841019332408905,
"learning_rate": 6.979184678062493e-05,
"loss": 0.439,
"step": 494
},
{
"epoch": 0.3737964885784406,
"grad_norm": 0.3099525570869446,
"learning_rate": 6.968251090809708e-05,
"loss": 0.398,
"step": 495
},
{
"epoch": 0.3745516329998112,
"grad_norm": 0.3198589086532593,
"learning_rate": 6.957306354652848e-05,
"loss": 0.4434,
"step": 496
},
{
"epoch": 0.3753067774211818,
"grad_norm": 0.308400422334671,
"learning_rate": 6.946350531586959e-05,
"loss": 0.3945,
"step": 497
},
{
"epoch": 0.3760619218425524,
"grad_norm": 0.3412231206893921,
"learning_rate": 6.935383683669884e-05,
"loss": 0.4598,
"step": 498
},
{
"epoch": 0.376817066263923,
"grad_norm": 0.3658400774002075,
"learning_rate": 6.924405873021918e-05,
"loss": 0.3627,
"step": 499
},
{
"epoch": 0.37757221068529356,
"grad_norm": 0.44540345668792725,
"learning_rate": 6.91341716182545e-05,
"loss": 0.4255,
"step": 500
},
{
"epoch": 0.37832735510666415,
"grad_norm": 0.14448663592338562,
"learning_rate": 6.902417612324615e-05,
"loss": 0.346,
"step": 501
},
{
"epoch": 0.37908249952803474,
"grad_norm": 0.15504960715770721,
"learning_rate": 6.891407286824944e-05,
"loss": 0.346,
"step": 502
},
{
"epoch": 0.37983764394940533,
"grad_norm": 0.1535167396068573,
"learning_rate": 6.880386247692999e-05,
"loss": 0.3239,
"step": 503
},
{
"epoch": 0.3805927883707759,
"grad_norm": 0.1622663289308548,
"learning_rate": 6.869354557356036e-05,
"loss": 0.3378,
"step": 504
},
{
"epoch": 0.3813479327921465,
"grad_norm": 0.17973972856998444,
"learning_rate": 6.858312278301637e-05,
"loss": 0.4137,
"step": 505
},
{
"epoch": 0.3821030772135171,
"grad_norm": 0.17408986389636993,
"learning_rate": 6.84725947307737e-05,
"loss": 0.4164,
"step": 506
},
{
"epoch": 0.3828582216348877,
"grad_norm": 0.1629510372877121,
"learning_rate": 6.836196204290417e-05,
"loss": 0.3753,
"step": 507
},
{
"epoch": 0.3836133660562583,
"grad_norm": 0.17549729347229004,
"learning_rate": 6.825122534607239e-05,
"loss": 0.3796,
"step": 508
},
{
"epoch": 0.3843685104776288,
"grad_norm": 0.1774250566959381,
"learning_rate": 6.814038526753205e-05,
"loss": 0.4563,
"step": 509
},
{
"epoch": 0.3851236548989994,
"grad_norm": 0.17569324374198914,
"learning_rate": 6.80294424351225e-05,
"loss": 0.3899,
"step": 510
},
{
"epoch": 0.38587879932037,
"grad_norm": 0.18050192296504974,
"learning_rate": 6.7918397477265e-05,
"loss": 0.4665,
"step": 511
},
{
"epoch": 0.3866339437417406,
"grad_norm": 0.18089371919631958,
"learning_rate": 6.780725102295948e-05,
"loss": 0.4632,
"step": 512
},
{
"epoch": 0.3873890881631112,
"grad_norm": 0.16716820001602173,
"learning_rate": 6.769600370178059e-05,
"loss": 0.3797,
"step": 513
},
{
"epoch": 0.3881442325844818,
"grad_norm": 0.18242698907852173,
"learning_rate": 6.758465614387446e-05,
"loss": 0.4545,
"step": 514
},
{
"epoch": 0.38889937700585236,
"grad_norm": 0.179984450340271,
"learning_rate": 6.747320897995493e-05,
"loss": 0.42,
"step": 515
},
{
"epoch": 0.38965452142722296,
"grad_norm": 0.17841747403144836,
"learning_rate": 6.736166284130006e-05,
"loss": 0.4256,
"step": 516
},
{
"epoch": 0.39040966584859355,
"grad_norm": 0.1961648315191269,
"learning_rate": 6.725001835974853e-05,
"loss": 0.424,
"step": 517
},
{
"epoch": 0.39116481026996414,
"grad_norm": 0.18314050137996674,
"learning_rate": 6.713827616769614e-05,
"loss": 0.4107,
"step": 518
},
{
"epoch": 0.3919199546913347,
"grad_norm": 0.18332213163375854,
"learning_rate": 6.702643689809205e-05,
"loss": 0.4012,
"step": 519
},
{
"epoch": 0.3926750991127053,
"grad_norm": 0.18527810275554657,
"learning_rate": 6.691450118443537e-05,
"loss": 0.4209,
"step": 520
},
{
"epoch": 0.3934302435340759,
"grad_norm": 0.18408484756946564,
"learning_rate": 6.680246966077151e-05,
"loss": 0.3916,
"step": 521
},
{
"epoch": 0.3941853879554465,
"grad_norm": 0.1861732453107834,
"learning_rate": 6.669034296168855e-05,
"loss": 0.4096,
"step": 522
},
{
"epoch": 0.3949405323768171,
"grad_norm": 0.20170702040195465,
"learning_rate": 6.65781217223137e-05,
"loss": 0.4398,
"step": 523
},
{
"epoch": 0.3956956767981877,
"grad_norm": 0.20506803691387177,
"learning_rate": 6.646580657830966e-05,
"loss": 0.4482,
"step": 524
},
{
"epoch": 0.3964508212195582,
"grad_norm": 0.199477881193161,
"learning_rate": 6.635339816587109e-05,
"loss": 0.4267,
"step": 525
},
{
"epoch": 0.3972059656409288,
"grad_norm": 0.19555439054965973,
"learning_rate": 6.624089712172088e-05,
"loss": 0.4385,
"step": 526
},
{
"epoch": 0.3979611100622994,
"grad_norm": 0.20855289697647095,
"learning_rate": 6.61283040831067e-05,
"loss": 0.4423,
"step": 527
},
{
"epoch": 0.39871625448367,
"grad_norm": 0.20509350299835205,
"learning_rate": 6.601561968779725e-05,
"loss": 0.3744,
"step": 528
},
{
"epoch": 0.3994713989050406,
"grad_norm": 0.19435042142868042,
"learning_rate": 6.590284457407876e-05,
"loss": 0.3924,
"step": 529
},
{
"epoch": 0.40022654332641117,
"grad_norm": 0.22069820761680603,
"learning_rate": 6.578997938075125e-05,
"loss": 0.388,
"step": 530
},
{
"epoch": 0.40098168774778176,
"grad_norm": 0.21975107491016388,
"learning_rate": 6.567702474712507e-05,
"loss": 0.4088,
"step": 531
},
{
"epoch": 0.40173683216915235,
"grad_norm": 0.21666480600833893,
"learning_rate": 6.556398131301713e-05,
"loss": 0.397,
"step": 532
},
{
"epoch": 0.40249197659052294,
"grad_norm": 0.22098886966705322,
"learning_rate": 6.545084971874738e-05,
"loss": 0.4027,
"step": 533
},
{
"epoch": 0.40324712101189353,
"grad_norm": 0.23352603614330292,
"learning_rate": 6.53376306051351e-05,
"loss": 0.4029,
"step": 534
},
{
"epoch": 0.4040022654332641,
"grad_norm": 0.23887783288955688,
"learning_rate": 6.522432461349536e-05,
"loss": 0.3958,
"step": 535
},
{
"epoch": 0.4047574098546347,
"grad_norm": 0.2337152659893036,
"learning_rate": 6.51109323856353e-05,
"loss": 0.3805,
"step": 536
},
{
"epoch": 0.4055125542760053,
"grad_norm": 0.24378469586372375,
"learning_rate": 6.499745456385054e-05,
"loss": 0.4048,
"step": 537
},
{
"epoch": 0.4062676986973759,
"grad_norm": 0.2503972351551056,
"learning_rate": 6.488389179092155e-05,
"loss": 0.3469,
"step": 538
},
{
"epoch": 0.4070228431187465,
"grad_norm": 0.26343488693237305,
"learning_rate": 6.477024471011001e-05,
"loss": 0.3914,
"step": 539
},
{
"epoch": 0.40777798754011707,
"grad_norm": 0.2782142758369446,
"learning_rate": 6.46565139651551e-05,
"loss": 0.4735,
"step": 540
},
{
"epoch": 0.40853313196148766,
"grad_norm": 0.27638283371925354,
"learning_rate": 6.454270020026995e-05,
"loss": 0.3961,
"step": 541
},
{
"epoch": 0.4092882763828582,
"grad_norm": 0.28320929408073425,
"learning_rate": 6.442880406013794e-05,
"loss": 0.3899,
"step": 542
},
{
"epoch": 0.4100434208042288,
"grad_norm": 0.26396995782852173,
"learning_rate": 6.431482618990902e-05,
"loss": 0.3126,
"step": 543
},
{
"epoch": 0.4107985652255994,
"grad_norm": 0.29057344794273376,
"learning_rate": 6.420076723519614e-05,
"loss": 0.3829,
"step": 544
},
{
"epoch": 0.41155370964696997,
"grad_norm": 0.30193406343460083,
"learning_rate": 6.408662784207149e-05,
"loss": 0.3592,
"step": 545
},
{
"epoch": 0.41230885406834056,
"grad_norm": 0.3181254267692566,
"learning_rate": 6.397240865706295e-05,
"loss": 0.382,
"step": 546
},
{
"epoch": 0.41306399848971115,
"grad_norm": 0.32498013973236084,
"learning_rate": 6.38581103271503e-05,
"loss": 0.3579,
"step": 547
},
{
"epoch": 0.41381914291108174,
"grad_norm": 0.36153993010520935,
"learning_rate": 6.374373349976169e-05,
"loss": 0.3281,
"step": 548
},
{
"epoch": 0.41457428733245233,
"grad_norm": 0.3981688618659973,
"learning_rate": 6.36292788227699e-05,
"loss": 0.3811,
"step": 549
},
{
"epoch": 0.4153294317538229,
"grad_norm": 0.4573695957660675,
"learning_rate": 6.351474694448864e-05,
"loss": 0.4341,
"step": 550
},
{
"epoch": 0.4160845761751935,
"grad_norm": 0.1360294222831726,
"learning_rate": 6.340013851366896e-05,
"loss": 0.3219,
"step": 551
},
{
"epoch": 0.4168397205965641,
"grad_norm": 0.1482553780078888,
"learning_rate": 6.328545417949549e-05,
"loss": 0.3273,
"step": 552
},
{
"epoch": 0.4175948650179347,
"grad_norm": 0.16173024475574493,
"learning_rate": 6.317069459158284e-05,
"loss": 0.387,
"step": 553
},
{
"epoch": 0.4183500094393053,
"grad_norm": 0.15186697244644165,
"learning_rate": 6.305586039997188e-05,
"loss": 0.3458,
"step": 554
},
{
"epoch": 0.41910515386067587,
"grad_norm": 0.16657663881778717,
"learning_rate": 6.294095225512603e-05,
"loss": 0.3808,
"step": 555
},
{
"epoch": 0.41986029828204646,
"grad_norm": 0.1734580099582672,
"learning_rate": 6.282597080792768e-05,
"loss": 0.3749,
"step": 556
},
{
"epoch": 0.42061544270341705,
"grad_norm": 0.16928981244564056,
"learning_rate": 6.271091670967436e-05,
"loss": 0.4225,
"step": 557
},
{
"epoch": 0.42137058712478764,
"grad_norm": 0.1675954908132553,
"learning_rate": 6.259579061207512e-05,
"loss": 0.3699,
"step": 558
},
{
"epoch": 0.4221257315461582,
"grad_norm": 0.16955533623695374,
"learning_rate": 6.248059316724693e-05,
"loss": 0.3821,
"step": 559
},
{
"epoch": 0.42288087596752877,
"grad_norm": 0.17539368569850922,
"learning_rate": 6.236532502771078e-05,
"loss": 0.4061,
"step": 560
},
{
"epoch": 0.42363602038889936,
"grad_norm": 0.16773320734500885,
"learning_rate": 6.22499868463882e-05,
"loss": 0.3684,
"step": 561
},
{
"epoch": 0.42439116481026995,
"grad_norm": 0.17698176205158234,
"learning_rate": 6.213457927659736e-05,
"loss": 0.446,
"step": 562
},
{
"epoch": 0.42514630923164054,
"grad_norm": 0.175985187292099,
"learning_rate": 6.201910297204962e-05,
"loss": 0.4014,
"step": 563
},
{
"epoch": 0.42590145365301113,
"grad_norm": 0.18848945200443268,
"learning_rate": 6.190355858684554e-05,
"loss": 0.3939,
"step": 564
},
{
"epoch": 0.4266565980743817,
"grad_norm": 0.20211070775985718,
"learning_rate": 6.178794677547137e-05,
"loss": 0.5346,
"step": 565
},
{
"epoch": 0.4274117424957523,
"grad_norm": 0.20022115111351013,
"learning_rate": 6.167226819279528e-05,
"loss": 0.485,
"step": 566
},
{
"epoch": 0.4281668869171229,
"grad_norm": 0.1909688413143158,
"learning_rate": 6.155652349406365e-05,
"loss": 0.3948,
"step": 567
},
{
"epoch": 0.4289220313384935,
"grad_norm": 0.19687382876873016,
"learning_rate": 6.144071333489741e-05,
"loss": 0.5016,
"step": 568
},
{
"epoch": 0.4296771757598641,
"grad_norm": 0.19113020598888397,
"learning_rate": 6.132483837128823e-05,
"loss": 0.409,
"step": 569
},
{
"epoch": 0.4304323201812347,
"grad_norm": 0.19415104389190674,
"learning_rate": 6.120889925959485e-05,
"loss": 0.4469,
"step": 570
},
{
"epoch": 0.43118746460260526,
"grad_norm": 0.20082654058933258,
"learning_rate": 6.109289665653944e-05,
"loss": 0.3998,
"step": 571
},
{
"epoch": 0.43194260902397585,
"grad_norm": 0.2006131410598755,
"learning_rate": 6.0976831219203724e-05,
"loss": 0.3904,
"step": 572
},
{
"epoch": 0.43269775344534644,
"grad_norm": 0.21381130814552307,
"learning_rate": 6.0860703605025395e-05,
"loss": 0.4501,
"step": 573
},
{
"epoch": 0.43345289786671704,
"grad_norm": 0.2047959566116333,
"learning_rate": 6.074451447179432e-05,
"loss": 0.4456,
"step": 574
},
{
"epoch": 0.43420804228808757,
"grad_norm": 0.19398140907287598,
"learning_rate": 6.062826447764883e-05,
"loss": 0.3951,
"step": 575
},
{
"epoch": 0.43496318670945816,
"grad_norm": 0.20938366651535034,
"learning_rate": 6.0511954281072034e-05,
"loss": 0.4182,
"step": 576
},
{
"epoch": 0.43571833113082875,
"grad_norm": 0.22829927504062653,
"learning_rate": 6.0395584540887963e-05,
"loss": 0.4267,
"step": 577
},
{
"epoch": 0.43647347555219934,
"grad_norm": 0.20386339724063873,
"learning_rate": 6.027915591625804e-05,
"loss": 0.3305,
"step": 578
},
{
"epoch": 0.43722861997356993,
"grad_norm": 0.2285105437040329,
"learning_rate": 6.016266906667711e-05,
"loss": 0.4457,
"step": 579
},
{
"epoch": 0.4379837643949405,
"grad_norm": 0.21181875467300415,
"learning_rate": 6.004612465196994e-05,
"loss": 0.3773,
"step": 580
},
{
"epoch": 0.4387389088163111,
"grad_norm": 0.21869616210460663,
"learning_rate": 5.992952333228728e-05,
"loss": 0.4118,
"step": 581
},
{
"epoch": 0.4394940532376817,
"grad_norm": 0.22501884400844574,
"learning_rate": 5.981286576810225e-05,
"loss": 0.3624,
"step": 582
},
{
"epoch": 0.4402491976590523,
"grad_norm": 0.22811758518218994,
"learning_rate": 5.969615262020657e-05,
"loss": 0.3633,
"step": 583
},
{
"epoch": 0.4410043420804229,
"grad_norm": 0.23688824474811554,
"learning_rate": 5.9579384549706775e-05,
"loss": 0.4004,
"step": 584
},
{
"epoch": 0.4417594865017935,
"grad_norm": 0.2543923258781433,
"learning_rate": 5.946256221802051e-05,
"loss": 0.4547,
"step": 585
},
{
"epoch": 0.44251463092316407,
"grad_norm": 0.2558960020542145,
"learning_rate": 5.9345686286872826e-05,
"loss": 0.3585,
"step": 586
},
{
"epoch": 0.44326977534453466,
"grad_norm": 0.25656744837760925,
"learning_rate": 5.9228757418292266e-05,
"loss": 0.3907,
"step": 587
},
{
"epoch": 0.44402491976590525,
"grad_norm": 0.2832098603248596,
"learning_rate": 5.911177627460739e-05,
"loss": 0.4802,
"step": 588
},
{
"epoch": 0.44478006418727584,
"grad_norm": 0.2527141571044922,
"learning_rate": 5.8994743518442694e-05,
"loss": 0.3935,
"step": 589
},
{
"epoch": 0.4455352086086464,
"grad_norm": 0.27374371886253357,
"learning_rate": 5.887765981271518e-05,
"loss": 0.3938,
"step": 590
},
{
"epoch": 0.446290353030017,
"grad_norm": 0.26626354455947876,
"learning_rate": 5.876052582063031e-05,
"loss": 0.3361,
"step": 591
},
{
"epoch": 0.44704549745138755,
"grad_norm": 0.2699456512928009,
"learning_rate": 5.864334220567851e-05,
"loss": 0.3243,
"step": 592
},
{
"epoch": 0.44780064187275814,
"grad_norm": 0.3059113323688507,
"learning_rate": 5.85261096316312e-05,
"loss": 0.4765,
"step": 593
},
{
"epoch": 0.44855578629412873,
"grad_norm": 0.26939067244529724,
"learning_rate": 5.840882876253715e-05,
"loss": 0.355,
"step": 594
},
{
"epoch": 0.4493109307154993,
"grad_norm": 0.32320332527160645,
"learning_rate": 5.829150026271871e-05,
"loss": 0.4086,
"step": 595
},
{
"epoch": 0.4500660751368699,
"grad_norm": 0.33342641592025757,
"learning_rate": 5.8174124796768e-05,
"loss": 0.4379,
"step": 596
},
{
"epoch": 0.4508212195582405,
"grad_norm": 0.33563926815986633,
"learning_rate": 5.805670302954321e-05,
"loss": 0.3539,
"step": 597
},
{
"epoch": 0.4515763639796111,
"grad_norm": 0.35168662667274475,
"learning_rate": 5.793923562616475e-05,
"loss": 0.4063,
"step": 598
},
{
"epoch": 0.4523315084009817,
"grad_norm": 0.3756003677845001,
"learning_rate": 5.782172325201155e-05,
"loss": 0.3802,
"step": 599
},
{
"epoch": 0.4530866528223523,
"grad_norm": 0.43410277366638184,
"learning_rate": 5.770416657271729e-05,
"loss": 0.4295,
"step": 600
},
{
"epoch": 0.45384179724372287,
"grad_norm": 0.16801206767559052,
"learning_rate": 5.7586566254166583e-05,
"loss": 0.3764,
"step": 601
},
{
"epoch": 0.45459694166509346,
"grad_norm": 0.13662360608577728,
"learning_rate": 5.746892296249126e-05,
"loss": 0.3318,
"step": 602
},
{
"epoch": 0.45535208608646405,
"grad_norm": 0.157858207821846,
"learning_rate": 5.7351237364066547e-05,
"loss": 0.3435,
"step": 603
},
{
"epoch": 0.45610723050783464,
"grad_norm": 0.16429783403873444,
"learning_rate": 5.723351012550729e-05,
"loss": 0.3754,
"step": 604
},
{
"epoch": 0.45686237492920523,
"grad_norm": 0.1615344136953354,
"learning_rate": 5.7115741913664264e-05,
"loss": 0.3407,
"step": 605
},
{
"epoch": 0.4576175193505758,
"grad_norm": 0.17267441749572754,
"learning_rate": 5.699793339562026e-05,
"loss": 0.3956,
"step": 606
},
{
"epoch": 0.4583726637719464,
"grad_norm": 0.17077352106571198,
"learning_rate": 5.6880085238686454e-05,
"loss": 0.3547,
"step": 607
},
{
"epoch": 0.45912780819331694,
"grad_norm": 0.1691327840089798,
"learning_rate": 5.6762198110398444e-05,
"loss": 0.3248,
"step": 608
},
{
"epoch": 0.45988295261468753,
"grad_norm": 0.17237545549869537,
"learning_rate": 5.664427267851271e-05,
"loss": 0.3824,
"step": 609
},
{
"epoch": 0.4606380970360581,
"grad_norm": 0.1798117756843567,
"learning_rate": 5.6526309611002594e-05,
"loss": 0.3806,
"step": 610
},
{
"epoch": 0.4613932414574287,
"grad_norm": 0.1780758649110794,
"learning_rate": 5.640830957605465e-05,
"loss": 0.4111,
"step": 611
},
{
"epoch": 0.4621483858787993,
"grad_norm": 0.17500564455986023,
"learning_rate": 5.629027324206484e-05,
"loss": 0.4043,
"step": 612
},
{
"epoch": 0.4629035303001699,
"grad_norm": 0.1749061942100525,
"learning_rate": 5.617220127763474e-05,
"loss": 0.3671,
"step": 613
},
{
"epoch": 0.4636586747215405,
"grad_norm": 0.1828213781118393,
"learning_rate": 5.6054094351567746e-05,
"loss": 0.3784,
"step": 614
},
{
"epoch": 0.4644138191429111,
"grad_norm": 0.18562084436416626,
"learning_rate": 5.593595313286526e-05,
"loss": 0.4139,
"step": 615
},
{
"epoch": 0.46516896356428167,
"grad_norm": 0.18980775773525238,
"learning_rate": 5.581777829072299e-05,
"loss": 0.4277,
"step": 616
},
{
"epoch": 0.46592410798565226,
"grad_norm": 0.19855181872844696,
"learning_rate": 5.569957049452703e-05,
"loss": 0.4124,
"step": 617
},
{
"epoch": 0.46667925240702285,
"grad_norm": 0.18563315272331238,
"learning_rate": 5.5581330413850206e-05,
"loss": 0.4239,
"step": 618
},
{
"epoch": 0.46743439682839344,
"grad_norm": 0.20975831151008606,
"learning_rate": 5.5463058718448155e-05,
"loss": 0.4546,
"step": 619
},
{
"epoch": 0.46818954124976403,
"grad_norm": 0.21700595319271088,
"learning_rate": 5.534475607825566e-05,
"loss": 0.4703,
"step": 620
},
{
"epoch": 0.4689446856711346,
"grad_norm": 0.19500944018363953,
"learning_rate": 5.522642316338268e-05,
"loss": 0.4008,
"step": 621
},
{
"epoch": 0.4696998300925052,
"grad_norm": 0.18913483619689941,
"learning_rate": 5.510806064411078e-05,
"loss": 0.3416,
"step": 622
},
{
"epoch": 0.4704549745138758,
"grad_norm": 0.22911082208156586,
"learning_rate": 5.4989669190889136e-05,
"loss": 0.4004,
"step": 623
},
{
"epoch": 0.4712101189352464,
"grad_norm": 0.21053546667099,
"learning_rate": 5.4871249474330866e-05,
"loss": 0.4137,
"step": 624
},
{
"epoch": 0.4719652633566169,
"grad_norm": 0.20017597079277039,
"learning_rate": 5.475280216520913e-05,
"loss": 0.3467,
"step": 625
},
{
"epoch": 0.4727204077779875,
"grad_norm": 0.23001310229301453,
"learning_rate": 5.463432793445344e-05,
"loss": 0.4107,
"step": 626
},
{
"epoch": 0.4734755521993581,
"grad_norm": 0.22943522036075592,
"learning_rate": 5.4515827453145765e-05,
"loss": 0.4129,
"step": 627
},
{
"epoch": 0.4742306966207287,
"grad_norm": 0.22657130658626556,
"learning_rate": 5.439730139251675e-05,
"loss": 0.3364,
"step": 628
},
{
"epoch": 0.4749858410420993,
"grad_norm": 0.22038240730762482,
"learning_rate": 5.427875042394199e-05,
"loss": 0.3809,
"step": 629
},
{
"epoch": 0.4757409854634699,
"grad_norm": 0.24683795869350433,
"learning_rate": 5.4160175218938124e-05,
"loss": 0.4047,
"step": 630
},
{
"epoch": 0.47649612988484047,
"grad_norm": 0.22947093844413757,
"learning_rate": 5.404157644915907e-05,
"loss": 0.3557,
"step": 631
},
{
"epoch": 0.47725127430621106,
"grad_norm": 0.23205658793449402,
"learning_rate": 5.392295478639225e-05,
"loss": 0.3911,
"step": 632
},
{
"epoch": 0.47800641872758165,
"grad_norm": 0.2397453486919403,
"learning_rate": 5.3804310902554754e-05,
"loss": 0.341,
"step": 633
},
{
"epoch": 0.47876156314895224,
"grad_norm": 0.25788554549217224,
"learning_rate": 5.368564546968954e-05,
"loss": 0.4089,
"step": 634
},
{
"epoch": 0.47951670757032283,
"grad_norm": 0.2771666347980499,
"learning_rate": 5.3566959159961615e-05,
"loss": 0.4121,
"step": 635
},
{
"epoch": 0.4802718519916934,
"grad_norm": 0.3412117063999176,
"learning_rate": 5.344825264565426e-05,
"loss": 0.4031,
"step": 636
},
{
"epoch": 0.481026996413064,
"grad_norm": 0.2685895562171936,
"learning_rate": 5.3329526599165204e-05,
"loss": 0.4285,
"step": 637
},
{
"epoch": 0.4817821408344346,
"grad_norm": 0.2930139899253845,
"learning_rate": 5.3210781693002754e-05,
"loss": 0.4991,
"step": 638
},
{
"epoch": 0.4825372852558052,
"grad_norm": 0.3006288707256317,
"learning_rate": 5.3092018599782155e-05,
"loss": 0.454,
"step": 639
},
{
"epoch": 0.4832924296771758,
"grad_norm": 0.2801501452922821,
"learning_rate": 5.297323799222156e-05,
"loss": 0.3826,
"step": 640
},
{
"epoch": 0.4840475740985464,
"grad_norm": 0.26145270466804504,
"learning_rate": 5.2854440543138406e-05,
"loss": 0.3434,
"step": 641
},
{
"epoch": 0.4848027185199169,
"grad_norm": 0.29839491844177246,
"learning_rate": 5.273562692544548e-05,
"loss": 0.3767,
"step": 642
},
{
"epoch": 0.4855578629412875,
"grad_norm": 0.3101263642311096,
"learning_rate": 5.26167978121472e-05,
"loss": 0.383,
"step": 643
},
{
"epoch": 0.4863130073626581,
"grad_norm": 0.35069891810417175,
"learning_rate": 5.24979538763357e-05,
"loss": 0.4163,
"step": 644
},
{
"epoch": 0.4870681517840287,
"grad_norm": 0.32553043961524963,
"learning_rate": 5.2379095791187124e-05,
"loss": 0.3942,
"step": 645
},
{
"epoch": 0.48782329620539927,
"grad_norm": 0.31216907501220703,
"learning_rate": 5.226022422995773e-05,
"loss": 0.3572,
"step": 646
},
{
"epoch": 0.48857844062676986,
"grad_norm": 0.3375815749168396,
"learning_rate": 5.2141339865980134e-05,
"loss": 0.3524,
"step": 647
},
{
"epoch": 0.48933358504814045,
"grad_norm": 0.33117055892944336,
"learning_rate": 5.2022443372659446e-05,
"loss": 0.3501,
"step": 648
},
{
"epoch": 0.49008872946951104,
"grad_norm": 0.41448086500167847,
"learning_rate": 5.1903535423469505e-05,
"loss": 0.3976,
"step": 649
},
{
"epoch": 0.49084387389088163,
"grad_norm": 0.5221715569496155,
"learning_rate": 5.178461669194903e-05,
"loss": 0.4395,
"step": 650
},
{
"epoch": 0.4915990183122522,
"grad_norm": 0.15094821155071259,
"learning_rate": 5.166568785169781e-05,
"loss": 0.3399,
"step": 651
},
{
"epoch": 0.4923541627336228,
"grad_norm": 0.14581607282161713,
"learning_rate": 5.154674957637291e-05,
"loss": 0.3412,
"step": 652
},
{
"epoch": 0.4931093071549934,
"grad_norm": 0.16245442628860474,
"learning_rate": 5.142780253968481e-05,
"loss": 0.3448,
"step": 653
},
{
"epoch": 0.493864451576364,
"grad_norm": 0.15717031061649323,
"learning_rate": 5.1308847415393666e-05,
"loss": 0.332,
"step": 654
},
{
"epoch": 0.4946195959977346,
"grad_norm": 0.17713572084903717,
"learning_rate": 5.1189884877305375e-05,
"loss": 0.3773,
"step": 655
},
{
"epoch": 0.4953747404191052,
"grad_norm": 0.18064700067043304,
"learning_rate": 5.107091559926791e-05,
"loss": 0.3872,
"step": 656
},
{
"epoch": 0.49612988484047577,
"grad_norm": 0.18684056401252747,
"learning_rate": 5.095194025516733e-05,
"loss": 0.4255,
"step": 657
},
{
"epoch": 0.4968850292618463,
"grad_norm": 0.18245136737823486,
"learning_rate": 5.0832959518924165e-05,
"loss": 0.3741,
"step": 658
},
{
"epoch": 0.4976401736832169,
"grad_norm": 0.1796809732913971,
"learning_rate": 5.0713974064489367e-05,
"loss": 0.3846,
"step": 659
},
{
"epoch": 0.4983953181045875,
"grad_norm": 0.193410262465477,
"learning_rate": 5.059498456584072e-05,
"loss": 0.3921,
"step": 660
},
{
"epoch": 0.49915046252595807,
"grad_norm": 0.17409726977348328,
"learning_rate": 5.047599169697884e-05,
"loss": 0.3479,
"step": 661
},
{
"epoch": 0.49990560694732866,
"grad_norm": 0.19628435373306274,
"learning_rate": 5.035699613192347e-05,
"loss": 0.4364,
"step": 662
},
{
"epoch": 0.5006607513686993,
"grad_norm": 0.1964567005634308,
"learning_rate": 5.023799854470963e-05,
"loss": 0.4891,
"step": 663
},
{
"epoch": 0.5014158957900698,
"grad_norm": 0.1932559609413147,
"learning_rate": 5.0118999609383776e-05,
"loss": 0.4156,
"step": 664
},
{
"epoch": 0.5021710402114404,
"grad_norm": 0.18889658153057098,
"learning_rate": 5e-05,
"loss": 0.393,
"step": 665
},
{
"epoch": 0.502926184632811,
"grad_norm": 0.20598454773426056,
"learning_rate": 4.9881000390616236e-05,
"loss": 0.3809,
"step": 666
},
{
"epoch": 0.5036813290541816,
"grad_norm": 0.18910270929336548,
"learning_rate": 4.9762001455290385e-05,
"loss": 0.3594,
"step": 667
},
{
"epoch": 0.5044364734755522,
"grad_norm": 0.19010648131370544,
"learning_rate": 4.964300386807653e-05,
"loss": 0.4025,
"step": 668
},
{
"epoch": 0.5051916178969228,
"grad_norm": 0.19111211597919464,
"learning_rate": 4.952400830302117e-05,
"loss": 0.4046,
"step": 669
},
{
"epoch": 0.5059467623182934,
"grad_norm": 0.20030421018600464,
"learning_rate": 4.940501543415929e-05,
"loss": 0.3772,
"step": 670
},
{
"epoch": 0.506701906739664,
"grad_norm": 0.20106443762779236,
"learning_rate": 4.928602593551065e-05,
"loss": 0.3878,
"step": 671
},
{
"epoch": 0.5074570511610346,
"grad_norm": 0.21929006278514862,
"learning_rate": 4.916704048107586e-05,
"loss": 0.374,
"step": 672
},
{
"epoch": 0.5082121955824052,
"grad_norm": 0.2119339108467102,
"learning_rate": 4.9048059744832666e-05,
"loss": 0.3846,
"step": 673
},
{
"epoch": 0.5089673400037757,
"grad_norm": 0.2138437032699585,
"learning_rate": 4.89290844007321e-05,
"loss": 0.4283,
"step": 674
},
{
"epoch": 0.5097224844251463,
"grad_norm": 0.21618351340293884,
"learning_rate": 4.881011512269463e-05,
"loss": 0.3673,
"step": 675
},
{
"epoch": 0.5104776288465169,
"grad_norm": 0.22225847840309143,
"learning_rate": 4.869115258460635e-05,
"loss": 0.4018,
"step": 676
},
{
"epoch": 0.5112327732678875,
"grad_norm": 0.20154333114624023,
"learning_rate": 4.85721974603152e-05,
"loss": 0.3655,
"step": 677
},
{
"epoch": 0.5119879176892581,
"grad_norm": 0.2222219705581665,
"learning_rate": 4.845325042362709e-05,
"loss": 0.3887,
"step": 678
},
{
"epoch": 0.5127430621106287,
"grad_norm": 0.23664206266403198,
"learning_rate": 4.83343121483022e-05,
"loss": 0.4378,
"step": 679
},
{
"epoch": 0.5134982065319993,
"grad_norm": 0.22598662972450256,
"learning_rate": 4.821538330805098e-05,
"loss": 0.361,
"step": 680
},
{
"epoch": 0.5142533509533699,
"grad_norm": 0.23625950515270233,
"learning_rate": 4.8096464576530507e-05,
"loss": 0.3911,
"step": 681
},
{
"epoch": 0.5150084953747405,
"grad_norm": 0.24311421811580658,
"learning_rate": 4.797755662734056e-05,
"loss": 0.383,
"step": 682
},
{
"epoch": 0.515763639796111,
"grad_norm": 0.2555188834667206,
"learning_rate": 4.7858660134019884e-05,
"loss": 0.4043,
"step": 683
},
{
"epoch": 0.5165187842174815,
"grad_norm": 0.26212912797927856,
"learning_rate": 4.7739775770042285e-05,
"loss": 0.3899,
"step": 684
},
{
"epoch": 0.5172739286388521,
"grad_norm": 0.25993528962135315,
"learning_rate": 4.762090420881289e-05,
"loss": 0.4474,
"step": 685
},
{
"epoch": 0.5180290730602227,
"grad_norm": 0.2636634409427643,
"learning_rate": 4.7502046123664316e-05,
"loss": 0.4205,
"step": 686
},
{
"epoch": 0.5187842174815933,
"grad_norm": 0.27789896726608276,
"learning_rate": 4.738320218785281e-05,
"loss": 0.4362,
"step": 687
},
{
"epoch": 0.5195393619029639,
"grad_norm": 0.2860543727874756,
"learning_rate": 4.726437307455452e-05,
"loss": 0.3886,
"step": 688
},
{
"epoch": 0.5202945063243345,
"grad_norm": 0.26604828238487244,
"learning_rate": 4.71455594568616e-05,
"loss": 0.3778,
"step": 689
},
{
"epoch": 0.5210496507457051,
"grad_norm": 0.29641634225845337,
"learning_rate": 4.702676200777846e-05,
"loss": 0.4318,
"step": 690
},
{
"epoch": 0.5218047951670757,
"grad_norm": 0.30234697461128235,
"learning_rate": 4.6907981400217864e-05,
"loss": 0.3916,
"step": 691
},
{
"epoch": 0.5225599395884463,
"grad_norm": 0.2846793830394745,
"learning_rate": 4.678921830699724e-05,
"loss": 0.3517,
"step": 692
},
{
"epoch": 0.5233150840098169,
"grad_norm": 0.305910587310791,
"learning_rate": 4.667047340083481e-05,
"loss": 0.3995,
"step": 693
},
{
"epoch": 0.5240702284311874,
"grad_norm": 0.2852495014667511,
"learning_rate": 4.655174735434575e-05,
"loss": 0.3481,
"step": 694
},
{
"epoch": 0.524825372852558,
"grad_norm": 0.32877644896507263,
"learning_rate": 4.643304084003839e-05,
"loss": 0.3678,
"step": 695
},
{
"epoch": 0.5255805172739286,
"grad_norm": 0.3292486071586609,
"learning_rate": 4.631435453031047e-05,
"loss": 0.3611,
"step": 696
},
{
"epoch": 0.5263356616952992,
"grad_norm": 0.33952105045318604,
"learning_rate": 4.619568909744524e-05,
"loss": 0.3875,
"step": 697
},
{
"epoch": 0.5270908061166698,
"grad_norm": 0.41025200486183167,
"learning_rate": 4.607704521360776e-05,
"loss": 0.3853,
"step": 698
},
{
"epoch": 0.5278459505380404,
"grad_norm": 0.3911071717739105,
"learning_rate": 4.595842355084094e-05,
"loss": 0.3416,
"step": 699
},
{
"epoch": 0.528601094959411,
"grad_norm": 0.46904653310775757,
"learning_rate": 4.583982478106189e-05,
"loss": 0.3506,
"step": 700
},
{
"epoch": 0.5293562393807816,
"grad_norm": 0.14935268461704254,
"learning_rate": 4.5721249576058027e-05,
"loss": 0.3182,
"step": 701
},
{
"epoch": 0.5301113838021522,
"grad_norm": 0.15800884366035461,
"learning_rate": 4.560269860748325e-05,
"loss": 0.3109,
"step": 702
},
{
"epoch": 0.5308665282235228,
"grad_norm": 0.16600751876831055,
"learning_rate": 4.5484172546854246e-05,
"loss": 0.3653,
"step": 703
},
{
"epoch": 0.5316216726448934,
"grad_norm": 0.17389142513275146,
"learning_rate": 4.536567206554656e-05,
"loss": 0.3637,
"step": 704
},
{
"epoch": 0.5323768170662639,
"grad_norm": 0.16549134254455566,
"learning_rate": 4.5247197834790876e-05,
"loss": 0.3916,
"step": 705
},
{
"epoch": 0.5331319614876345,
"grad_norm": 0.16650977730751038,
"learning_rate": 4.512875052566915e-05,
"loss": 0.3398,
"step": 706
},
{
"epoch": 0.5338871059090051,
"grad_norm": 0.17638632655143738,
"learning_rate": 4.501033080911086e-05,
"loss": 0.3994,
"step": 707
},
{
"epoch": 0.5346422503303757,
"grad_norm": 0.1747698038816452,
"learning_rate": 4.489193935588923e-05,
"loss": 0.399,
"step": 708
},
{
"epoch": 0.5353973947517463,
"grad_norm": 0.1881653517484665,
"learning_rate": 4.477357683661734e-05,
"loss": 0.4072,
"step": 709
},
{
"epoch": 0.5361525391731169,
"grad_norm": 0.1721341758966446,
"learning_rate": 4.4655243921744374e-05,
"loss": 0.3317,
"step": 710
},
{
"epoch": 0.5369076835944875,
"grad_norm": 0.18106478452682495,
"learning_rate": 4.4536941281551864e-05,
"loss": 0.4081,
"step": 711
},
{
"epoch": 0.5376628280158581,
"grad_norm": 0.19442430138587952,
"learning_rate": 4.44186695861498e-05,
"loss": 0.4044,
"step": 712
},
{
"epoch": 0.5384179724372287,
"grad_norm": 0.19563159346580505,
"learning_rate": 4.4300429505472976e-05,
"loss": 0.3849,
"step": 713
},
{
"epoch": 0.5391731168585993,
"grad_norm": 0.1997566670179367,
"learning_rate": 4.418222170927702e-05,
"loss": 0.432,
"step": 714
},
{
"epoch": 0.5399282612799698,
"grad_norm": 0.1938876062631607,
"learning_rate": 4.4064046867134756e-05,
"loss": 0.3994,
"step": 715
},
{
"epoch": 0.5406834057013404,
"grad_norm": 0.2057284563779831,
"learning_rate": 4.394590564843226e-05,
"loss": 0.4499,
"step": 716
},
{
"epoch": 0.5414385501227109,
"grad_norm": 0.1877799779176712,
"learning_rate": 4.3827798722365264e-05,
"loss": 0.3824,
"step": 717
},
{
"epoch": 0.5421936945440815,
"grad_norm": 0.2022031992673874,
"learning_rate": 4.370972675793517e-05,
"loss": 0.4552,
"step": 718
},
{
"epoch": 0.5429488389654521,
"grad_norm": 0.22796858847141266,
"learning_rate": 4.359169042394536e-05,
"loss": 0.4589,
"step": 719
},
{
"epoch": 0.5437039833868227,
"grad_norm": 0.19858501851558685,
"learning_rate": 4.347369038899744e-05,
"loss": 0.4075,
"step": 720
},
{
"epoch": 0.5444591278081933,
"grad_norm": 0.20992298424243927,
"learning_rate": 4.33557273214873e-05,
"loss": 0.382,
"step": 721
},
{
"epoch": 0.5452142722295639,
"grad_norm": 0.2098989337682724,
"learning_rate": 4.3237801889601554e-05,
"loss": 0.4268,
"step": 722
},
{
"epoch": 0.5459694166509345,
"grad_norm": 0.2050914168357849,
"learning_rate": 4.3119914761313564e-05,
"loss": 0.3695,
"step": 723
},
{
"epoch": 0.546724561072305,
"grad_norm": 0.21308262646198273,
"learning_rate": 4.3002066604379746e-05,
"loss": 0.4058,
"step": 724
},
{
"epoch": 0.5474797054936756,
"grad_norm": 0.22114039957523346,
"learning_rate": 4.288425808633575e-05,
"loss": 0.4412,
"step": 725
},
{
"epoch": 0.5482348499150462,
"grad_norm": 0.2253432720899582,
"learning_rate": 4.276648987449271e-05,
"loss": 0.4339,
"step": 726
},
{
"epoch": 0.5489899943364168,
"grad_norm": 0.22624394297599792,
"learning_rate": 4.2648762635933465e-05,
"loss": 0.4147,
"step": 727
},
{
"epoch": 0.5497451387577874,
"grad_norm": 0.23290926218032837,
"learning_rate": 4.253107703750875e-05,
"loss": 0.4315,
"step": 728
},
{
"epoch": 0.550500283179158,
"grad_norm": 0.23128274083137512,
"learning_rate": 4.241343374583343e-05,
"loss": 0.4271,
"step": 729
},
{
"epoch": 0.5512554276005286,
"grad_norm": 0.243194580078125,
"learning_rate": 4.2295833427282734e-05,
"loss": 0.4344,
"step": 730
},
{
"epoch": 0.5520105720218992,
"grad_norm": 0.2592814564704895,
"learning_rate": 4.2178276747988446e-05,
"loss": 0.4633,
"step": 731
},
{
"epoch": 0.5527657164432698,
"grad_norm": 0.2409772276878357,
"learning_rate": 4.2060764373835264e-05,
"loss": 0.3903,
"step": 732
},
{
"epoch": 0.5535208608646404,
"grad_norm": 0.23426584899425507,
"learning_rate": 4.19432969704568e-05,
"loss": 0.3476,
"step": 733
},
{
"epoch": 0.554276005286011,
"grad_norm": 0.2637597322463989,
"learning_rate": 4.182587520323201e-05,
"loss": 0.4371,
"step": 734
},
{
"epoch": 0.5550311497073815,
"grad_norm": 0.2537882328033447,
"learning_rate": 4.17084997372813e-05,
"loss": 0.4176,
"step": 735
},
{
"epoch": 0.5557862941287521,
"grad_norm": 0.25388282537460327,
"learning_rate": 4.159117123746286e-05,
"loss": 0.3644,
"step": 736
},
{
"epoch": 0.5565414385501227,
"grad_norm": 0.259795218706131,
"learning_rate": 4.147389036836881e-05,
"loss": 0.3493,
"step": 737
},
{
"epoch": 0.5572965829714933,
"grad_norm": 0.2668931782245636,
"learning_rate": 4.1356657794321496e-05,
"loss": 0.3802,
"step": 738
},
{
"epoch": 0.5580517273928639,
"grad_norm": 0.28695622086524963,
"learning_rate": 4.12394741793697e-05,
"loss": 0.3657,
"step": 739
},
{
"epoch": 0.5588068718142345,
"grad_norm": 0.2887243628501892,
"learning_rate": 4.1122340187284846e-05,
"loss": 0.4289,
"step": 740
},
{
"epoch": 0.5595620162356051,
"grad_norm": 0.2813442349433899,
"learning_rate": 4.100525648155731e-05,
"loss": 0.3503,
"step": 741
},
{
"epoch": 0.5603171606569757,
"grad_norm": 0.2727963924407959,
"learning_rate": 4.088822372539263e-05,
"loss": 0.3067,
"step": 742
},
{
"epoch": 0.5610723050783463,
"grad_norm": 0.2962748408317566,
"learning_rate": 4.077124258170774e-05,
"loss": 0.3492,
"step": 743
},
{
"epoch": 0.5618274494997169,
"grad_norm": 0.293473482131958,
"learning_rate": 4.06543137131272e-05,
"loss": 0.3261,
"step": 744
},
{
"epoch": 0.5625825939210874,
"grad_norm": 0.3267402946949005,
"learning_rate": 4.0537437781979506e-05,
"loss": 0.3525,
"step": 745
},
{
"epoch": 0.563337738342458,
"grad_norm": 0.33864033222198486,
"learning_rate": 4.042061545029323e-05,
"loss": 0.3754,
"step": 746
},
{
"epoch": 0.5640928827638286,
"grad_norm": 0.38044723868370056,
"learning_rate": 4.0303847379793447e-05,
"loss": 0.4512,
"step": 747
},
{
"epoch": 0.5648480271851992,
"grad_norm": 0.37810018658638,
"learning_rate": 4.018713423189775e-05,
"loss": 0.359,
"step": 748
},
{
"epoch": 0.5656031716065698,
"grad_norm": 0.42582571506500244,
"learning_rate": 4.007047666771274e-05,
"loss": 0.4383,
"step": 749
},
{
"epoch": 0.5663583160279403,
"grad_norm": 0.4589140713214874,
"learning_rate": 3.995387534803006e-05,
"loss": 0.4365,
"step": 750
},
{
"epoch": 0.5671134604493109,
"grad_norm": 0.155814066529274,
"learning_rate": 3.983733093332289e-05,
"loss": 0.3505,
"step": 751
},
{
"epoch": 0.5678686048706815,
"grad_norm": 0.16563649475574493,
"learning_rate": 3.9720844083741975e-05,
"loss": 0.3567,
"step": 752
},
{
"epoch": 0.5686237492920521,
"grad_norm": 0.16976523399353027,
"learning_rate": 3.960441545911204e-05,
"loss": 0.3639,
"step": 753
},
{
"epoch": 0.5693788937134227,
"grad_norm": 0.18034231662750244,
"learning_rate": 3.948804571892799e-05,
"loss": 0.3462,
"step": 754
},
{
"epoch": 0.5701340381347932,
"grad_norm": 0.1798115074634552,
"learning_rate": 3.937173552235117e-05,
"loss": 0.374,
"step": 755
},
{
"epoch": 0.5708891825561638,
"grad_norm": 0.18299606442451477,
"learning_rate": 3.925548552820568e-05,
"loss": 0.3865,
"step": 756
},
{
"epoch": 0.5716443269775344,
"grad_norm": 0.20100784301757812,
"learning_rate": 3.913929639497462e-05,
"loss": 0.4081,
"step": 757
},
{
"epoch": 0.572399471398905,
"grad_norm": 0.18966151773929596,
"learning_rate": 3.9023168780796294e-05,
"loss": 0.3809,
"step": 758
},
{
"epoch": 0.5731546158202756,
"grad_norm": 0.2044794112443924,
"learning_rate": 3.890710334346058e-05,
"loss": 0.4423,
"step": 759
},
{
"epoch": 0.5739097602416462,
"grad_norm": 0.19047518074512482,
"learning_rate": 3.879110074040514e-05,
"loss": 0.3845,
"step": 760
},
{
"epoch": 0.5746649046630168,
"grad_norm": 0.1907937079668045,
"learning_rate": 3.8675161628711776e-05,
"loss": 0.394,
"step": 761
},
{
"epoch": 0.5754200490843874,
"grad_norm": 0.19930242002010345,
"learning_rate": 3.85592866651026e-05,
"loss": 0.434,
"step": 762
},
{
"epoch": 0.576175193505758,
"grad_norm": 0.1895764023065567,
"learning_rate": 3.844347650593635e-05,
"loss": 0.3668,
"step": 763
},
{
"epoch": 0.5769303379271286,
"grad_norm": 0.20043134689331055,
"learning_rate": 3.832773180720475e-05,
"loss": 0.3956,
"step": 764
},
{
"epoch": 0.5776854823484991,
"grad_norm": 0.2107314020395279,
"learning_rate": 3.821205322452863e-05,
"loss": 0.4535,
"step": 765
},
{
"epoch": 0.5784406267698697,
"grad_norm": 0.22263993322849274,
"learning_rate": 3.8096441413154464e-05,
"loss": 0.4392,
"step": 766
},
{
"epoch": 0.5791957711912403,
"grad_norm": 0.2031395435333252,
"learning_rate": 3.798089702795038e-05,
"loss": 0.3902,
"step": 767
},
{
"epoch": 0.5799509156126109,
"grad_norm": 0.20621052384376526,
"learning_rate": 3.7865420723402634e-05,
"loss": 0.4127,
"step": 768
},
{
"epoch": 0.5807060600339815,
"grad_norm": 0.20813485980033875,
"learning_rate": 3.775001315361183e-05,
"loss": 0.3858,
"step": 769
},
{
"epoch": 0.5814612044553521,
"grad_norm": 0.21165066957473755,
"learning_rate": 3.763467497228922e-05,
"loss": 0.4723,
"step": 770
},
{
"epoch": 0.5822163488767227,
"grad_norm": 0.2068208009004593,
"learning_rate": 3.7519406832753085e-05,
"loss": 0.3991,
"step": 771
},
{
"epoch": 0.5829714932980933,
"grad_norm": 0.22032210230827332,
"learning_rate": 3.740420938792489e-05,
"loss": 0.4052,
"step": 772
},
{
"epoch": 0.5837266377194639,
"grad_norm": 0.21245840191841125,
"learning_rate": 3.728908329032567e-05,
"loss": 0.3571,
"step": 773
},
{
"epoch": 0.5844817821408345,
"grad_norm": 0.2291347086429596,
"learning_rate": 3.717402919207234e-05,
"loss": 0.4335,
"step": 774
},
{
"epoch": 0.585236926562205,
"grad_norm": 0.23557013273239136,
"learning_rate": 3.705904774487396e-05,
"loss": 0.4521,
"step": 775
},
{
"epoch": 0.5859920709835756,
"grad_norm": 0.23025040328502655,
"learning_rate": 3.6944139600028136e-05,
"loss": 0.3855,
"step": 776
},
{
"epoch": 0.5867472154049462,
"grad_norm": 0.23952262103557587,
"learning_rate": 3.6829305408417166e-05,
"loss": 0.3627,
"step": 777
},
{
"epoch": 0.5875023598263168,
"grad_norm": 0.21594206988811493,
"learning_rate": 3.6714545820504525e-05,
"loss": 0.3442,
"step": 778
},
{
"epoch": 0.5882575042476874,
"grad_norm": 0.22128231823444366,
"learning_rate": 3.659986148633107e-05,
"loss": 0.355,
"step": 779
},
{
"epoch": 0.589012648669058,
"grad_norm": 0.25088027119636536,
"learning_rate": 3.648525305551136e-05,
"loss": 0.4293,
"step": 780
},
{
"epoch": 0.5897677930904286,
"grad_norm": 0.2476910948753357,
"learning_rate": 3.6370721177230116e-05,
"loss": 0.3875,
"step": 781
},
{
"epoch": 0.5905229375117992,
"grad_norm": 0.24124464392662048,
"learning_rate": 3.625626650023831e-05,
"loss": 0.393,
"step": 782
},
{
"epoch": 0.5912780819331697,
"grad_norm": 0.2532835900783539,
"learning_rate": 3.6141889672849726e-05,
"loss": 0.418,
"step": 783
},
{
"epoch": 0.5920332263545403,
"grad_norm": 0.24893806874752045,
"learning_rate": 3.602759134293706e-05,
"loss": 0.3763,
"step": 784
},
{
"epoch": 0.5927883707759108,
"grad_norm": 0.2560838758945465,
"learning_rate": 3.591337215792852e-05,
"loss": 0.3267,
"step": 785
},
{
"epoch": 0.5935435151972814,
"grad_norm": 0.26388421654701233,
"learning_rate": 3.579923276480387e-05,
"loss": 0.4026,
"step": 786
},
{
"epoch": 0.594298659618652,
"grad_norm": 0.27024954557418823,
"learning_rate": 3.568517381009099e-05,
"loss": 0.3871,
"step": 787
},
{
"epoch": 0.5950538040400226,
"grad_norm": 0.26593852043151855,
"learning_rate": 3.557119593986208e-05,
"loss": 0.3685,
"step": 788
},
{
"epoch": 0.5958089484613932,
"grad_norm": 0.2777867317199707,
"learning_rate": 3.545729979973005e-05,
"loss": 0.3664,
"step": 789
},
{
"epoch": 0.5965640928827638,
"grad_norm": 0.28436291217803955,
"learning_rate": 3.5343486034844895e-05,
"loss": 0.3673,
"step": 790
},
{
"epoch": 0.5973192373041344,
"grad_norm": 0.3022597134113312,
"learning_rate": 3.522975528989e-05,
"loss": 0.4292,
"step": 791
},
{
"epoch": 0.598074381725505,
"grad_norm": 0.3157149851322174,
"learning_rate": 3.511610820907846e-05,
"loss": 0.4385,
"step": 792
},
{
"epoch": 0.5988295261468756,
"grad_norm": 0.2940748333930969,
"learning_rate": 3.5002545436149474e-05,
"loss": 0.389,
"step": 793
},
{
"epoch": 0.5995846705682462,
"grad_norm": 0.3128991425037384,
"learning_rate": 3.4889067614364714e-05,
"loss": 0.3999,
"step": 794
},
{
"epoch": 0.6003398149896167,
"grad_norm": 0.29500824213027954,
"learning_rate": 3.4775675386504656e-05,
"loss": 0.3033,
"step": 795
},
{
"epoch": 0.6010949594109873,
"grad_norm": 0.36105644702911377,
"learning_rate": 3.466236939486491e-05,
"loss": 0.4132,
"step": 796
},
{
"epoch": 0.6018501038323579,
"grad_norm": 0.3596034646034241,
"learning_rate": 3.4549150281252636e-05,
"loss": 0.335,
"step": 797
},
{
"epoch": 0.6026052482537285,
"grad_norm": 0.3680574893951416,
"learning_rate": 3.443601868698288e-05,
"loss": 0.3667,
"step": 798
},
{
"epoch": 0.6033603926750991,
"grad_norm": 0.38893964886665344,
"learning_rate": 3.4322975252874946e-05,
"loss": 0.3922,
"step": 799
},
{
"epoch": 0.6041155370964697,
"grad_norm": 0.4646053910255432,
"learning_rate": 3.421002061924876e-05,
"loss": 0.3689,
"step": 800
},
{
"epoch": 0.6048706815178403,
"grad_norm": 0.16041673719882965,
"learning_rate": 3.4097155425921254e-05,
"loss": 0.2799,
"step": 801
},
{
"epoch": 0.6056258259392109,
"grad_norm": 0.14617888629436493,
"learning_rate": 3.398438031220276e-05,
"loss": 0.3073,
"step": 802
},
{
"epoch": 0.6063809703605815,
"grad_norm": 0.1890193223953247,
"learning_rate": 3.3871695916893314e-05,
"loss": 0.3961,
"step": 803
},
{
"epoch": 0.6071361147819521,
"grad_norm": 0.17151913046836853,
"learning_rate": 3.375910287827912e-05,
"loss": 0.3757,
"step": 804
},
{
"epoch": 0.6078912592033227,
"grad_norm": 0.17613713443279266,
"learning_rate": 3.364660183412892e-05,
"loss": 0.3356,
"step": 805
},
{
"epoch": 0.6086464036246932,
"grad_norm": 0.17522138357162476,
"learning_rate": 3.353419342169035e-05,
"loss": 0.3864,
"step": 806
},
{
"epoch": 0.6094015480460638,
"grad_norm": 0.17417658865451813,
"learning_rate": 3.3421878277686314e-05,
"loss": 0.3412,
"step": 807
},
{
"epoch": 0.6101566924674344,
"grad_norm": 0.1665177345275879,
"learning_rate": 3.330965703831146e-05,
"loss": 0.3375,
"step": 808
},
{
"epoch": 0.610911836888805,
"grad_norm": 0.2024083286523819,
"learning_rate": 3.3197530339228487e-05,
"loss": 0.3916,
"step": 809
},
{
"epoch": 0.6116669813101756,
"grad_norm": 0.19576655328273773,
"learning_rate": 3.3085498815564645e-05,
"loss": 0.4199,
"step": 810
},
{
"epoch": 0.6124221257315462,
"grad_norm": 0.19635462760925293,
"learning_rate": 3.297356310190797e-05,
"loss": 0.448,
"step": 811
},
{
"epoch": 0.6131772701529168,
"grad_norm": 0.1944982409477234,
"learning_rate": 3.286172383230388e-05,
"loss": 0.3877,
"step": 812
},
{
"epoch": 0.6139324145742874,
"grad_norm": 0.1916523575782776,
"learning_rate": 3.274998164025148e-05,
"loss": 0.3653,
"step": 813
},
{
"epoch": 0.614687558995658,
"grad_norm": 0.20687338709831238,
"learning_rate": 3.263833715869996e-05,
"loss": 0.4505,
"step": 814
},
{
"epoch": 0.6154427034170286,
"grad_norm": 0.18990960717201233,
"learning_rate": 3.2526791020045086e-05,
"loss": 0.3874,
"step": 815
},
{
"epoch": 0.6161978478383991,
"grad_norm": 0.18829166889190674,
"learning_rate": 3.2415343856125547e-05,
"loss": 0.3377,
"step": 816
},
{
"epoch": 0.6169529922597696,
"grad_norm": 0.21549442410469055,
"learning_rate": 3.230399629821942e-05,
"loss": 0.4082,
"step": 817
},
{
"epoch": 0.6177081366811402,
"grad_norm": 0.21937482059001923,
"learning_rate": 3.219274897704053e-05,
"loss": 0.3876,
"step": 818
},
{
"epoch": 0.6184632811025108,
"grad_norm": 0.21827095746994019,
"learning_rate": 3.2081602522734986e-05,
"loss": 0.4106,
"step": 819
},
{
"epoch": 0.6192184255238814,
"grad_norm": 0.2126101851463318,
"learning_rate": 3.197055756487752e-05,
"loss": 0.3958,
"step": 820
},
{
"epoch": 0.619973569945252,
"grad_norm": 0.22217732667922974,
"learning_rate": 3.1859614732467954e-05,
"loss": 0.414,
"step": 821
},
{
"epoch": 0.6207287143666226,
"grad_norm": 0.19480924308300018,
"learning_rate": 3.174877465392763e-05,
"loss": 0.3671,
"step": 822
},
{
"epoch": 0.6214838587879932,
"grad_norm": 0.19732743501663208,
"learning_rate": 3.163803795709583e-05,
"loss": 0.3422,
"step": 823
},
{
"epoch": 0.6222390032093638,
"grad_norm": 0.2333568036556244,
"learning_rate": 3.1527405269226305e-05,
"loss": 0.3685,
"step": 824
},
{
"epoch": 0.6229941476307344,
"grad_norm": 0.21316733956336975,
"learning_rate": 3.141687721698363e-05,
"loss": 0.3704,
"step": 825
},
{
"epoch": 0.6237492920521049,
"grad_norm": 0.20892879366874695,
"learning_rate": 3.130645442643965e-05,
"loss": 0.3474,
"step": 826
},
{
"epoch": 0.6245044364734755,
"grad_norm": 0.23324836790561676,
"learning_rate": 3.119613752307002e-05,
"loss": 0.3941,
"step": 827
},
{
"epoch": 0.6252595808948461,
"grad_norm": 0.2080870121717453,
"learning_rate": 3.108592713175056e-05,
"loss": 0.358,
"step": 828
},
{
"epoch": 0.6260147253162167,
"grad_norm": 0.24301281571388245,
"learning_rate": 3.097582387675385e-05,
"loss": 0.4344,
"step": 829
},
{
"epoch": 0.6267698697375873,
"grad_norm": 0.2200392484664917,
"learning_rate": 3.086582838174551e-05,
"loss": 0.3661,
"step": 830
},
{
"epoch": 0.6275250141589579,
"grad_norm": 0.24137113988399506,
"learning_rate": 3.075594126978084e-05,
"loss": 0.4266,
"step": 831
},
{
"epoch": 0.6282801585803285,
"grad_norm": 0.24250073730945587,
"learning_rate": 3.0646163163301186e-05,
"loss": 0.41,
"step": 832
},
{
"epoch": 0.6290353030016991,
"grad_norm": 0.26428642868995667,
"learning_rate": 3.053649468413043e-05,
"loss": 0.4392,
"step": 833
},
{
"epoch": 0.6297904474230697,
"grad_norm": 0.2617412805557251,
"learning_rate": 3.0426936453471533e-05,
"loss": 0.4245,
"step": 834
},
{
"epoch": 0.6305455918444403,
"grad_norm": 0.2729879319667816,
"learning_rate": 3.0317489091902935e-05,
"loss": 0.3944,
"step": 835
},
{
"epoch": 0.6313007362658108,
"grad_norm": 0.261433482170105,
"learning_rate": 3.020815321937509e-05,
"loss": 0.3796,
"step": 836
},
{
"epoch": 0.6320558806871814,
"grad_norm": 0.2721140682697296,
"learning_rate": 3.0098929455206904e-05,
"loss": 0.395,
"step": 837
},
{
"epoch": 0.632811025108552,
"grad_norm": 0.26962924003601074,
"learning_rate": 2.998981841808227e-05,
"loss": 0.3807,
"step": 838
},
{
"epoch": 0.6335661695299226,
"grad_norm": 0.2710483968257904,
"learning_rate": 2.988082072604661e-05,
"loss": 0.3361,
"step": 839
},
{
"epoch": 0.6343213139512932,
"grad_norm": 0.2910868227481842,
"learning_rate": 2.9771936996503248e-05,
"loss": 0.4196,
"step": 840
},
{
"epoch": 0.6350764583726638,
"grad_norm": 0.2808794379234314,
"learning_rate": 2.9663167846209998e-05,
"loss": 0.3742,
"step": 841
},
{
"epoch": 0.6358316027940344,
"grad_norm": 0.3063143491744995,
"learning_rate": 2.955451389127567e-05,
"loss": 0.4192,
"step": 842
},
{
"epoch": 0.636586747215405,
"grad_norm": 0.2972528338432312,
"learning_rate": 2.9445975747156545e-05,
"loss": 0.3618,
"step": 843
},
{
"epoch": 0.6373418916367756,
"grad_norm": 0.2982980012893677,
"learning_rate": 2.9337554028652952e-05,
"loss": 0.3622,
"step": 844
},
{
"epoch": 0.6380970360581462,
"grad_norm": 0.29826125502586365,
"learning_rate": 2.9229249349905684e-05,
"loss": 0.3206,
"step": 845
},
{
"epoch": 0.6388521804795168,
"grad_norm": 0.3471531569957733,
"learning_rate": 2.9121062324392623e-05,
"loss": 0.383,
"step": 846
},
{
"epoch": 0.6396073249008873,
"grad_norm": 0.37260305881500244,
"learning_rate": 2.901299356492516e-05,
"loss": 0.403,
"step": 847
},
{
"epoch": 0.6403624693222579,
"grad_norm": 0.3672800064086914,
"learning_rate": 2.8905043683644872e-05,
"loss": 0.3851,
"step": 848
},
{
"epoch": 0.6411176137436285,
"grad_norm": 0.40943288803100586,
"learning_rate": 2.8797213292019926e-05,
"loss": 0.3267,
"step": 849
},
{
"epoch": 0.641872758164999,
"grad_norm": 0.5209497213363647,
"learning_rate": 2.86895030008416e-05,
"loss": 0.4702,
"step": 850
},
{
"epoch": 0.6426279025863696,
"grad_norm": 0.15170446038246155,
"learning_rate": 2.858191342022095e-05,
"loss": 0.2699,
"step": 851
},
{
"epoch": 0.6433830470077402,
"grad_norm": 0.16055040061473846,
"learning_rate": 2.8474445159585235e-05,
"loss": 0.3269,
"step": 852
},
{
"epoch": 0.6441381914291108,
"grad_norm": 0.16934970021247864,
"learning_rate": 2.8367098827674578e-05,
"loss": 0.3698,
"step": 853
},
{
"epoch": 0.6448933358504814,
"grad_norm": 0.16251638531684875,
"learning_rate": 2.8259875032538407e-05,
"loss": 0.3339,
"step": 854
},
{
"epoch": 0.645648480271852,
"grad_norm": 0.1724422574043274,
"learning_rate": 2.8152774381532033e-05,
"loss": 0.4104,
"step": 855
},
{
"epoch": 0.6464036246932225,
"grad_norm": 0.17181172966957092,
"learning_rate": 2.8045797481313262e-05,
"loss": 0.3565,
"step": 856
},
{
"epoch": 0.6471587691145931,
"grad_norm": 0.17750045657157898,
"learning_rate": 2.7938944937838923e-05,
"loss": 0.3573,
"step": 857
},
{
"epoch": 0.6479139135359637,
"grad_norm": 0.18744954466819763,
"learning_rate": 2.78322173563615e-05,
"loss": 0.3277,
"step": 858
},
{
"epoch": 0.6486690579573343,
"grad_norm": 0.18920008838176727,
"learning_rate": 2.7725615341425525e-05,
"loss": 0.407,
"step": 859
},
{
"epoch": 0.6494242023787049,
"grad_norm": 0.19674719870090485,
"learning_rate": 2.7619139496864378e-05,
"loss": 0.4228,
"step": 860
},
{
"epoch": 0.6501793468000755,
"grad_norm": 0.20689047873020172,
"learning_rate": 2.7512790425796718e-05,
"loss": 0.4252,
"step": 861
},
{
"epoch": 0.6509344912214461,
"grad_norm": 0.18963493406772614,
"learning_rate": 2.740656873062312e-05,
"loss": 0.3796,
"step": 862
},
{
"epoch": 0.6516896356428167,
"grad_norm": 0.21359539031982422,
"learning_rate": 2.7300475013022663e-05,
"loss": 0.4206,
"step": 863
},
{
"epoch": 0.6524447800641873,
"grad_norm": 0.2088915854692459,
"learning_rate": 2.7194509873949503e-05,
"loss": 0.4344,
"step": 864
},
{
"epoch": 0.6531999244855579,
"grad_norm": 0.2131178230047226,
"learning_rate": 2.708867391362948e-05,
"loss": 0.4203,
"step": 865
},
{
"epoch": 0.6539550689069284,
"grad_norm": 0.20361606776714325,
"learning_rate": 2.698296773155673e-05,
"loss": 0.3994,
"step": 866
},
{
"epoch": 0.654710213328299,
"grad_norm": 0.2058803290128708,
"learning_rate": 2.687739192649026e-05,
"loss": 0.3746,
"step": 867
},
{
"epoch": 0.6554653577496696,
"grad_norm": 0.22526845335960388,
"learning_rate": 2.6771947096450577e-05,
"loss": 0.4883,
"step": 868
},
{
"epoch": 0.6562205021710402,
"grad_norm": 0.1975262463092804,
"learning_rate": 2.6666633838716314e-05,
"loss": 0.3258,
"step": 869
},
{
"epoch": 0.6569756465924108,
"grad_norm": 0.2137899249792099,
"learning_rate": 2.6561452749820807e-05,
"loss": 0.3739,
"step": 870
},
{
"epoch": 0.6577307910137814,
"grad_norm": 0.21968792378902435,
"learning_rate": 2.6456404425548774e-05,
"loss": 0.3703,
"step": 871
},
{
"epoch": 0.658485935435152,
"grad_norm": 0.21001875400543213,
"learning_rate": 2.6351489460932816e-05,
"loss": 0.4517,
"step": 872
},
{
"epoch": 0.6592410798565226,
"grad_norm": 0.23233747482299805,
"learning_rate": 2.6246708450250256e-05,
"loss": 0.3885,
"step": 873
},
{
"epoch": 0.6599962242778932,
"grad_norm": 0.23604817688465118,
"learning_rate": 2.6142061987019577e-05,
"loss": 0.4035,
"step": 874
},
{
"epoch": 0.6607513686992638,
"grad_norm": 0.23978619277477264,
"learning_rate": 2.603755066399718e-05,
"loss": 0.4077,
"step": 875
},
{
"epoch": 0.6615065131206344,
"grad_norm": 0.23582671582698822,
"learning_rate": 2.5933175073173898e-05,
"loss": 0.4175,
"step": 876
},
{
"epoch": 0.662261657542005,
"grad_norm": 0.24461962282657623,
"learning_rate": 2.5828935805771802e-05,
"loss": 0.4297,
"step": 877
},
{
"epoch": 0.6630168019633755,
"grad_norm": 0.24499231576919556,
"learning_rate": 2.5724833452240792e-05,
"loss": 0.3868,
"step": 878
},
{
"epoch": 0.6637719463847461,
"grad_norm": 0.2279515117406845,
"learning_rate": 2.5620868602255197e-05,
"loss": 0.4166,
"step": 879
},
{
"epoch": 0.6645270908061167,
"grad_norm": 0.24552056193351746,
"learning_rate": 2.5517041844710453e-05,
"loss": 0.4447,
"step": 880
},
{
"epoch": 0.6652822352274873,
"grad_norm": 0.25690439343452454,
"learning_rate": 2.5413353767719805e-05,
"loss": 0.4388,
"step": 881
},
{
"epoch": 0.6660373796488579,
"grad_norm": 0.27102774381637573,
"learning_rate": 2.5309804958611016e-05,
"loss": 0.4063,
"step": 882
},
{
"epoch": 0.6667925240702284,
"grad_norm": 0.2607583701610565,
"learning_rate": 2.520639600392295e-05,
"loss": 0.4336,
"step": 883
},
{
"epoch": 0.667547668491599,
"grad_norm": 0.28491443395614624,
"learning_rate": 2.5103127489402217e-05,
"loss": 0.493,
"step": 884
},
{
"epoch": 0.6683028129129696,
"grad_norm": 0.27002084255218506,
"learning_rate": 2.500000000000001e-05,
"loss": 0.4069,
"step": 885
},
{
"epoch": 0.6690579573343401,
"grad_norm": 0.2716214060783386,
"learning_rate": 2.489701411986865e-05,
"loss": 0.4394,
"step": 886
},
{
"epoch": 0.6698131017557107,
"grad_norm": 0.26700031757354736,
"learning_rate": 2.4794170432358415e-05,
"loss": 0.3797,
"step": 887
},
{
"epoch": 0.6705682461770813,
"grad_norm": 0.2846347987651825,
"learning_rate": 2.4691469520014025e-05,
"loss": 0.3867,
"step": 888
},
{
"epoch": 0.6713233905984519,
"grad_norm": 0.32261618971824646,
"learning_rate": 2.4588911964571553e-05,
"loss": 0.4208,
"step": 889
},
{
"epoch": 0.6720785350198225,
"grad_norm": 0.2805870771408081,
"learning_rate": 2.4486498346955027e-05,
"loss": 0.3418,
"step": 890
},
{
"epoch": 0.6728336794411931,
"grad_norm": 0.2825523912906647,
"learning_rate": 2.4384229247273155e-05,
"loss": 0.399,
"step": 891
},
{
"epoch": 0.6735888238625637,
"grad_norm": 0.34398892521858215,
"learning_rate": 2.4282105244816045e-05,
"loss": 0.4278,
"step": 892
},
{
"epoch": 0.6743439682839343,
"grad_norm": 0.31729134917259216,
"learning_rate": 2.418012691805191e-05,
"loss": 0.3586,
"step": 893
},
{
"epoch": 0.6750991127053049,
"grad_norm": 0.30837181210517883,
"learning_rate": 2.4078294844623816e-05,
"loss": 0.3639,
"step": 894
},
{
"epoch": 0.6758542571266755,
"grad_norm": 0.33780837059020996,
"learning_rate": 2.3976609601346394e-05,
"loss": 0.3787,
"step": 895
},
{
"epoch": 0.676609401548046,
"grad_norm": 0.3720472455024719,
"learning_rate": 2.3875071764202563e-05,
"loss": 0.3951,
"step": 896
},
{
"epoch": 0.6773645459694166,
"grad_norm": 0.3754374384880066,
"learning_rate": 2.3773681908340284e-05,
"loss": 0.3508,
"step": 897
},
{
"epoch": 0.6781196903907872,
"grad_norm": 0.4134625196456909,
"learning_rate": 2.3672440608069313e-05,
"loss": 0.3392,
"step": 898
},
{
"epoch": 0.6788748348121578,
"grad_norm": 0.43518126010894775,
"learning_rate": 2.3571348436857904e-05,
"loss": 0.3959,
"step": 899
},
{
"epoch": 0.6796299792335284,
"grad_norm": 0.46200618147850037,
"learning_rate": 2.3470405967329605e-05,
"loss": 0.3458,
"step": 900
},
{
"epoch": 0.680385123654899,
"grad_norm": 0.1568119376897812,
"learning_rate": 2.336961377126001e-05,
"loss": 0.2835,
"step": 901
},
{
"epoch": 0.6811402680762696,
"grad_norm": 0.1660660058259964,
"learning_rate": 2.326897241957348e-05,
"loss": 0.3211,
"step": 902
},
{
"epoch": 0.6818954124976402,
"grad_norm": 0.1690598577260971,
"learning_rate": 2.3168482482339955e-05,
"loss": 0.3941,
"step": 903
},
{
"epoch": 0.6826505569190108,
"grad_norm": 0.17117637395858765,
"learning_rate": 2.3068144528771712e-05,
"loss": 0.3738,
"step": 904
},
{
"epoch": 0.6834057013403814,
"grad_norm": 0.16965438425540924,
"learning_rate": 2.296795912722014e-05,
"loss": 0.3393,
"step": 905
},
{
"epoch": 0.684160845761752,
"grad_norm": 0.18404419720172882,
"learning_rate": 2.286792684517245e-05,
"loss": 0.357,
"step": 906
},
{
"epoch": 0.6849159901831225,
"grad_norm": 0.1736985743045807,
"learning_rate": 2.2768048249248648e-05,
"loss": 0.3317,
"step": 907
},
{
"epoch": 0.6856711346044931,
"grad_norm": 0.18678586184978485,
"learning_rate": 2.2668323905198108e-05,
"loss": 0.3558,
"step": 908
},
{
"epoch": 0.6864262790258637,
"grad_norm": 0.19678306579589844,
"learning_rate": 2.2568754377896516e-05,
"loss": 0.4089,
"step": 909
},
{
"epoch": 0.6871814234472343,
"grad_norm": 0.20676788687705994,
"learning_rate": 2.246934023134257e-05,
"loss": 0.3973,
"step": 910
},
{
"epoch": 0.6879365678686049,
"grad_norm": 0.21614781022071838,
"learning_rate": 2.2370082028654866e-05,
"loss": 0.3998,
"step": 911
},
{
"epoch": 0.6886917122899755,
"grad_norm": 0.2001326084136963,
"learning_rate": 2.22709803320687e-05,
"loss": 0.3733,
"step": 912
},
{
"epoch": 0.6894468567113461,
"grad_norm": 0.20481480658054352,
"learning_rate": 2.2172035702932825e-05,
"loss": 0.4074,
"step": 913
},
{
"epoch": 0.6902020011327167,
"grad_norm": 0.21287479996681213,
"learning_rate": 2.207324870170629e-05,
"loss": 0.429,
"step": 914
},
{
"epoch": 0.6909571455540873,
"grad_norm": 0.21085505187511444,
"learning_rate": 2.1974619887955294e-05,
"loss": 0.3979,
"step": 915
},
{
"epoch": 0.6917122899754579,
"grad_norm": 0.21903711557388306,
"learning_rate": 2.1876149820350057e-05,
"loss": 0.5028,
"step": 916
},
{
"epoch": 0.6924674343968283,
"grad_norm": 0.2196279913187027,
"learning_rate": 2.1777839056661554e-05,
"loss": 0.4467,
"step": 917
},
{
"epoch": 0.6932225788181989,
"grad_norm": 0.2197204977273941,
"learning_rate": 2.167968815375837e-05,
"loss": 0.4149,
"step": 918
},
{
"epoch": 0.6939777232395695,
"grad_norm": 0.23177607357501984,
"learning_rate": 2.1581697667603633e-05,
"loss": 0.4254,
"step": 919
},
{
"epoch": 0.6947328676609401,
"grad_norm": 0.22916048765182495,
"learning_rate": 2.148386815325179e-05,
"loss": 0.4336,
"step": 920
},
{
"epoch": 0.6954880120823107,
"grad_norm": 0.2257329225540161,
"learning_rate": 2.1386200164845526e-05,
"loss": 0.3985,
"step": 921
},
{
"epoch": 0.6962431565036813,
"grad_norm": 0.2298842817544937,
"learning_rate": 2.1288694255612502e-05,
"loss": 0.4249,
"step": 922
},
{
"epoch": 0.6969983009250519,
"grad_norm": 0.22104403376579285,
"learning_rate": 2.119135097786236e-05,
"loss": 0.3709,
"step": 923
},
{
"epoch": 0.6977534453464225,
"grad_norm": 0.2183627337217331,
"learning_rate": 2.1094170882983526e-05,
"loss": 0.3912,
"step": 924
},
{
"epoch": 0.6985085897677931,
"grad_norm": 0.23446856439113617,
"learning_rate": 2.09971545214401e-05,
"loss": 0.3894,
"step": 925
},
{
"epoch": 0.6992637341891637,
"grad_norm": 0.22170968353748322,
"learning_rate": 2.0900302442768715e-05,
"loss": 0.3473,
"step": 926
},
{
"epoch": 0.7000188786105342,
"grad_norm": 0.2619408071041107,
"learning_rate": 2.0803615195575475e-05,
"loss": 0.4502,
"step": 927
},
{
"epoch": 0.7007740230319048,
"grad_norm": 0.24339045584201813,
"learning_rate": 2.0707093327532805e-05,
"loss": 0.3812,
"step": 928
},
{
"epoch": 0.7015291674532754,
"grad_norm": 0.2485007792711258,
"learning_rate": 2.061073738537635e-05,
"loss": 0.374,
"step": 929
},
{
"epoch": 0.702284311874646,
"grad_norm": 0.252399206161499,
"learning_rate": 2.05145479149019e-05,
"loss": 0.3943,
"step": 930
},
{
"epoch": 0.7030394562960166,
"grad_norm": 0.2727779448032379,
"learning_rate": 2.0418525460962285e-05,
"loss": 0.415,
"step": 931
},
{
"epoch": 0.7037946007173872,
"grad_norm": 0.24974308907985687,
"learning_rate": 2.03226705674643e-05,
"loss": 0.3446,
"step": 932
},
{
"epoch": 0.7045497451387578,
"grad_norm": 0.2764071524143219,
"learning_rate": 2.0226983777365604e-05,
"loss": 0.4001,
"step": 933
},
{
"epoch": 0.7053048895601284,
"grad_norm": 0.25649023056030273,
"learning_rate": 2.0131465632671652e-05,
"loss": 0.3642,
"step": 934
},
{
"epoch": 0.706060033981499,
"grad_norm": 0.26145219802856445,
"learning_rate": 2.0036116674432654e-05,
"loss": 0.367,
"step": 935
},
{
"epoch": 0.7068151784028696,
"grad_norm": 0.2874867022037506,
"learning_rate": 1.9940937442740454e-05,
"loss": 0.3967,
"step": 936
},
{
"epoch": 0.7075703228242401,
"grad_norm": 0.2877250015735626,
"learning_rate": 1.9845928476725524e-05,
"loss": 0.4071,
"step": 937
},
{
"epoch": 0.7083254672456107,
"grad_norm": 0.2828865647315979,
"learning_rate": 1.9751090314553878e-05,
"loss": 0.3551,
"step": 938
},
{
"epoch": 0.7090806116669813,
"grad_norm": 0.30538272857666016,
"learning_rate": 1.9656423493424048e-05,
"loss": 0.3898,
"step": 939
},
{
"epoch": 0.7098357560883519,
"grad_norm": 0.33847618103027344,
"learning_rate": 1.9561928549563968e-05,
"loss": 0.4906,
"step": 940
},
{
"epoch": 0.7105909005097225,
"grad_norm": 0.3496238887310028,
"learning_rate": 1.946760601822809e-05,
"loss": 0.4622,
"step": 941
},
{
"epoch": 0.7113460449310931,
"grad_norm": 0.3086839020252228,
"learning_rate": 1.9373456433694198e-05,
"loss": 0.3681,
"step": 942
},
{
"epoch": 0.7121011893524637,
"grad_norm": 0.33560845255851746,
"learning_rate": 1.927948032926047e-05,
"loss": 0.3716,
"step": 943
},
{
"epoch": 0.7128563337738343,
"grad_norm": 0.34799009561538696,
"learning_rate": 1.9185678237242373e-05,
"loss": 0.3493,
"step": 944
},
{
"epoch": 0.7136114781952049,
"grad_norm": 0.3494877815246582,
"learning_rate": 1.9092050688969738e-05,
"loss": 0.3688,
"step": 945
},
{
"epoch": 0.7143666226165755,
"grad_norm": 0.34960487484931946,
"learning_rate": 1.899859821478376e-05,
"loss": 0.3618,
"step": 946
},
{
"epoch": 0.715121767037946,
"grad_norm": 0.38590875267982483,
"learning_rate": 1.8905321344033898e-05,
"loss": 0.437,
"step": 947
},
{
"epoch": 0.7158769114593166,
"grad_norm": 0.4215663969516754,
"learning_rate": 1.881222060507492e-05,
"loss": 0.4118,
"step": 948
},
{
"epoch": 0.7166320558806872,
"grad_norm": 0.36362165212631226,
"learning_rate": 1.8719296525263922e-05,
"loss": 0.3083,
"step": 949
},
{
"epoch": 0.7173872003020577,
"grad_norm": 0.5022253394126892,
"learning_rate": 1.8626549630957396e-05,
"loss": 0.4431,
"step": 950
},
{
"epoch": 0.7181423447234283,
"grad_norm": 0.14882560074329376,
"learning_rate": 1.8533980447508137e-05,
"loss": 0.3217,
"step": 951
},
{
"epoch": 0.7188974891447989,
"grad_norm": 0.16775713860988617,
"learning_rate": 1.8441589499262303e-05,
"loss": 0.3601,
"step": 952
},
{
"epoch": 0.7196526335661695,
"grad_norm": 0.17136643826961517,
"learning_rate": 1.8349377309556486e-05,
"loss": 0.3492,
"step": 953
},
{
"epoch": 0.7204077779875401,
"grad_norm": 0.17329272627830505,
"learning_rate": 1.8257344400714732e-05,
"loss": 0.3725,
"step": 954
},
{
"epoch": 0.7211629224089107,
"grad_norm": 0.16661885380744934,
"learning_rate": 1.8165491294045593e-05,
"loss": 0.3236,
"step": 955
},
{
"epoch": 0.7219180668302813,
"grad_norm": 0.182418555021286,
"learning_rate": 1.8073818509839098e-05,
"loss": 0.4007,
"step": 956
},
{
"epoch": 0.7226732112516518,
"grad_norm": 0.18849271535873413,
"learning_rate": 1.7982326567363888e-05,
"loss": 0.3725,
"step": 957
},
{
"epoch": 0.7234283556730224,
"grad_norm": 0.1787819266319275,
"learning_rate": 1.789101598486427e-05,
"loss": 0.3415,
"step": 958
},
{
"epoch": 0.724183500094393,
"grad_norm": 0.20412220060825348,
"learning_rate": 1.7799887279557237e-05,
"loss": 0.392,
"step": 959
},
{
"epoch": 0.7249386445157636,
"grad_norm": 0.19325533509254456,
"learning_rate": 1.7708940967629567e-05,
"loss": 0.3702,
"step": 960
},
{
"epoch": 0.7256937889371342,
"grad_norm": 0.19539514183998108,
"learning_rate": 1.7618177564234905e-05,
"loss": 0.3573,
"step": 961
},
{
"epoch": 0.7264489333585048,
"grad_norm": 0.21055525541305542,
"learning_rate": 1.7527597583490822e-05,
"loss": 0.3993,
"step": 962
},
{
"epoch": 0.7272040777798754,
"grad_norm": 0.19510890543460846,
"learning_rate": 1.7437201538475916e-05,
"loss": 0.391,
"step": 963
},
{
"epoch": 0.727959222201246,
"grad_norm": 0.2113380879163742,
"learning_rate": 1.734698994122691e-05,
"loss": 0.4522,
"step": 964
},
{
"epoch": 0.7287143666226166,
"grad_norm": 0.23572376370429993,
"learning_rate": 1.725696330273575e-05,
"loss": 0.4537,
"step": 965
},
{
"epoch": 0.7294695110439872,
"grad_norm": 0.3286976218223572,
"learning_rate": 1.7167122132946694e-05,
"loss": 0.3092,
"step": 966
},
{
"epoch": 0.7302246554653578,
"grad_norm": 0.22728495299816132,
"learning_rate": 1.7077466940753444e-05,
"loss": 0.3905,
"step": 967
},
{
"epoch": 0.7309797998867283,
"grad_norm": 0.21779009699821472,
"learning_rate": 1.698799823399628e-05,
"loss": 0.402,
"step": 968
},
{
"epoch": 0.7317349443080989,
"grad_norm": 0.22000010311603546,
"learning_rate": 1.6898716519459074e-05,
"loss": 0.4021,
"step": 969
},
{
"epoch": 0.7324900887294695,
"grad_norm": 0.21992164850234985,
"learning_rate": 1.6809622302866625e-05,
"loss": 0.3588,
"step": 970
},
{
"epoch": 0.7332452331508401,
"grad_norm": 0.24943576753139496,
"learning_rate": 1.6720716088881594e-05,
"loss": 0.4089,
"step": 971
},
{
"epoch": 0.7340003775722107,
"grad_norm": 0.22193026542663574,
"learning_rate": 1.6631998381101767e-05,
"loss": 0.379,
"step": 972
},
{
"epoch": 0.7347555219935813,
"grad_norm": 0.21193212270736694,
"learning_rate": 1.6543469682057106e-05,
"loss": 0.296,
"step": 973
},
{
"epoch": 0.7355106664149519,
"grad_norm": 0.251336008310318,
"learning_rate": 1.6455130493206987e-05,
"loss": 0.4699,
"step": 974
},
{
"epoch": 0.7362658108363225,
"grad_norm": 0.23066718876361847,
"learning_rate": 1.6366981314937376e-05,
"loss": 0.4149,
"step": 975
},
{
"epoch": 0.7370209552576931,
"grad_norm": 0.26630717515945435,
"learning_rate": 1.627902264655788e-05,
"loss": 0.4322,
"step": 976
},
{
"epoch": 0.7377760996790637,
"grad_norm": 0.263698548078537,
"learning_rate": 1.619125498629904e-05,
"loss": 0.407,
"step": 977
},
{
"epoch": 0.7385312441004342,
"grad_norm": 0.2588689625263214,
"learning_rate": 1.61036788313094e-05,
"loss": 0.4343,
"step": 978
},
{
"epoch": 0.7392863885218048,
"grad_norm": 0.23290039598941803,
"learning_rate": 1.601629467765277e-05,
"loss": 0.3627,
"step": 979
},
{
"epoch": 0.7400415329431754,
"grad_norm": 0.24483409523963928,
"learning_rate": 1.592910302030544e-05,
"loss": 0.3279,
"step": 980
},
{
"epoch": 0.740796677364546,
"grad_norm": 0.25037166476249695,
"learning_rate": 1.5842104353153287e-05,
"loss": 0.4103,
"step": 981
},
{
"epoch": 0.7415518217859166,
"grad_norm": 0.2862619459629059,
"learning_rate": 1.5755299168988997e-05,
"loss": 0.433,
"step": 982
},
{
"epoch": 0.7423069662072871,
"grad_norm": 0.28889885544776917,
"learning_rate": 1.566868795950932e-05,
"loss": 0.426,
"step": 983
},
{
"epoch": 0.7430621106286577,
"grad_norm": 0.2708713710308075,
"learning_rate": 1.5582271215312294e-05,
"loss": 0.4114,
"step": 984
},
{
"epoch": 0.7438172550500283,
"grad_norm": 0.269611120223999,
"learning_rate": 1.549604942589441e-05,
"loss": 0.3912,
"step": 985
},
{
"epoch": 0.7445723994713989,
"grad_norm": 0.2908805310726166,
"learning_rate": 1.5410023079647822e-05,
"loss": 0.3776,
"step": 986
},
{
"epoch": 0.7453275438927695,
"grad_norm": 0.2953813076019287,
"learning_rate": 1.5324192663857674e-05,
"loss": 0.4081,
"step": 987
},
{
"epoch": 0.74608268831414,
"grad_norm": 0.28919684886932373,
"learning_rate": 1.5238558664699255e-05,
"loss": 0.4329,
"step": 988
},
{
"epoch": 0.7468378327355106,
"grad_norm": 0.31675902009010315,
"learning_rate": 1.5153121567235335e-05,
"loss": 0.4214,
"step": 989
},
{
"epoch": 0.7475929771568812,
"grad_norm": 0.3113098442554474,
"learning_rate": 1.5067881855413274e-05,
"loss": 0.4025,
"step": 990
},
{
"epoch": 0.7483481215782518,
"grad_norm": 0.2837510108947754,
"learning_rate": 1.4982840012062426e-05,
"loss": 0.347,
"step": 991
},
{
"epoch": 0.7491032659996224,
"grad_norm": 0.32457566261291504,
"learning_rate": 1.4897996518891327e-05,
"loss": 0.3995,
"step": 992
},
{
"epoch": 0.749858410420993,
"grad_norm": 0.3207525908946991,
"learning_rate": 1.481335185648498e-05,
"loss": 0.3603,
"step": 993
},
{
"epoch": 0.7506135548423636,
"grad_norm": 0.37354031205177307,
"learning_rate": 1.4728906504302153e-05,
"loss": 0.4288,
"step": 994
},
{
"epoch": 0.7513686992637342,
"grad_norm": 0.3249981105327606,
"learning_rate": 1.4644660940672627e-05,
"loss": 0.346,
"step": 995
},
{
"epoch": 0.7521238436851048,
"grad_norm": 0.3194781541824341,
"learning_rate": 1.4560615642794517e-05,
"loss": 0.3108,
"step": 996
},
{
"epoch": 0.7528789881064754,
"grad_norm": 0.39953500032424927,
"learning_rate": 1.4476771086731567e-05,
"loss": 0.4049,
"step": 997
},
{
"epoch": 0.753634132527846,
"grad_norm": 0.35916373133659363,
"learning_rate": 1.4393127747410417e-05,
"loss": 0.335,
"step": 998
},
{
"epoch": 0.7543892769492165,
"grad_norm": 0.3811121881008148,
"learning_rate": 1.4309686098617975e-05,
"loss": 0.3608,
"step": 999
},
{
"epoch": 0.7551444213705871,
"grad_norm": 0.49154841899871826,
"learning_rate": 1.4226446612998673e-05,
"loss": 0.4254,
"step": 1000
},
{
"epoch": 0.7558995657919577,
"grad_norm": 0.14996010065078735,
"learning_rate": 1.414340976205183e-05,
"loss": 0.3259,
"step": 1001
},
{
"epoch": 0.7566547102133283,
"grad_norm": 0.15046906471252441,
"learning_rate": 1.4060576016128974e-05,
"loss": 0.3184,
"step": 1002
},
{
"epoch": 0.7574098546346989,
"grad_norm": 0.1714065670967102,
"learning_rate": 1.3977945844431118e-05,
"loss": 0.3564,
"step": 1003
},
{
"epoch": 0.7581649990560695,
"grad_norm": 0.18413116037845612,
"learning_rate": 1.3895519715006238e-05,
"loss": 0.3889,
"step": 1004
},
{
"epoch": 0.7589201434774401,
"grad_norm": 0.19010891020298004,
"learning_rate": 1.3813298094746491e-05,
"loss": 0.3909,
"step": 1005
},
{
"epoch": 0.7596752878988107,
"grad_norm": 0.1898065060377121,
"learning_rate": 1.373128144938563e-05,
"loss": 0.4388,
"step": 1006
},
{
"epoch": 0.7604304323201813,
"grad_norm": 0.1871468871831894,
"learning_rate": 1.3649470243496326e-05,
"loss": 0.401,
"step": 1007
},
{
"epoch": 0.7611855767415519,
"grad_norm": 0.18679498136043549,
"learning_rate": 1.3567864940487584e-05,
"loss": 0.4038,
"step": 1008
},
{
"epoch": 0.7619407211629224,
"grad_norm": 0.19137872755527496,
"learning_rate": 1.3486466002602133e-05,
"loss": 0.3803,
"step": 1009
},
{
"epoch": 0.762695865584293,
"grad_norm": 0.1978340446949005,
"learning_rate": 1.340527389091374e-05,
"loss": 0.388,
"step": 1010
},
{
"epoch": 0.7634510100056636,
"grad_norm": 0.19116266071796417,
"learning_rate": 1.3324289065324608e-05,
"loss": 0.3728,
"step": 1011
},
{
"epoch": 0.7642061544270342,
"grad_norm": 0.21141821146011353,
"learning_rate": 1.3243511984562824e-05,
"loss": 0.4367,
"step": 1012
},
{
"epoch": 0.7649612988484048,
"grad_norm": 0.22845213115215302,
"learning_rate": 1.3162943106179749e-05,
"loss": 0.3907,
"step": 1013
},
{
"epoch": 0.7657164432697754,
"grad_norm": 0.22269576787948608,
"learning_rate": 1.3082582886547395e-05,
"loss": 0.4779,
"step": 1014
},
{
"epoch": 0.766471587691146,
"grad_norm": 0.21351350843906403,
"learning_rate": 1.3002431780855817e-05,
"loss": 0.4206,
"step": 1015
},
{
"epoch": 0.7672267321125166,
"grad_norm": 0.2109295129776001,
"learning_rate": 1.2922490243110614e-05,
"loss": 0.3882,
"step": 1016
},
{
"epoch": 0.767981876533887,
"grad_norm": 0.23167261481285095,
"learning_rate": 1.2842758726130283e-05,
"loss": 0.4386,
"step": 1017
},
{
"epoch": 0.7687370209552576,
"grad_norm": 0.23334629833698273,
"learning_rate": 1.2763237681543732e-05,
"loss": 0.4477,
"step": 1018
},
{
"epoch": 0.7694921653766282,
"grad_norm": 0.23084315657615662,
"learning_rate": 1.2683927559787655e-05,
"loss": 0.423,
"step": 1019
},
{
"epoch": 0.7702473097979988,
"grad_norm": 0.2281108796596527,
"learning_rate": 1.2604828810103957e-05,
"loss": 0.4073,
"step": 1020
},
{
"epoch": 0.7710024542193694,
"grad_norm": 0.2173498570919037,
"learning_rate": 1.2525941880537307e-05,
"loss": 0.3423,
"step": 1021
},
{
"epoch": 0.77175759864074,
"grad_norm": 0.21901051700115204,
"learning_rate": 1.2447267217932507e-05,
"loss": 0.3601,
"step": 1022
},
{
"epoch": 0.7725127430621106,
"grad_norm": 0.22421492636203766,
"learning_rate": 1.236880526793207e-05,
"loss": 0.3953,
"step": 1023
},
{
"epoch": 0.7732678874834812,
"grad_norm": 0.2377631813287735,
"learning_rate": 1.2290556474973536e-05,
"loss": 0.3744,
"step": 1024
},
{
"epoch": 0.7740230319048518,
"grad_norm": 0.23121589422225952,
"learning_rate": 1.2212521282287092e-05,
"loss": 0.3976,
"step": 1025
},
{
"epoch": 0.7747781763262224,
"grad_norm": 0.255655437707901,
"learning_rate": 1.2134700131893012e-05,
"loss": 0.4125,
"step": 1026
},
{
"epoch": 0.775533320747593,
"grad_norm": 0.2366100549697876,
"learning_rate": 1.2057093464599157e-05,
"loss": 0.3724,
"step": 1027
},
{
"epoch": 0.7762884651689635,
"grad_norm": 0.2489083856344223,
"learning_rate": 1.1979701719998453e-05,
"loss": 0.4281,
"step": 1028
},
{
"epoch": 0.7770436095903341,
"grad_norm": 0.26776382327079773,
"learning_rate": 1.1902525336466464e-05,
"loss": 0.4041,
"step": 1029
},
{
"epoch": 0.7777987540117047,
"grad_norm": 0.24452626705169678,
"learning_rate": 1.1825564751158823e-05,
"loss": 0.4135,
"step": 1030
},
{
"epoch": 0.7785538984330753,
"grad_norm": 0.2541411817073822,
"learning_rate": 1.1748820400008843e-05,
"loss": 0.4086,
"step": 1031
},
{
"epoch": 0.7793090428544459,
"grad_norm": 0.27573496103286743,
"learning_rate": 1.167229271772498e-05,
"loss": 0.3883,
"step": 1032
},
{
"epoch": 0.7800641872758165,
"grad_norm": 0.2743297219276428,
"learning_rate": 1.1595982137788403e-05,
"loss": 0.4083,
"step": 1033
},
{
"epoch": 0.7808193316971871,
"grad_norm": 0.2688688039779663,
"learning_rate": 1.1519889092450542e-05,
"loss": 0.3992,
"step": 1034
},
{
"epoch": 0.7815744761185577,
"grad_norm": 0.28381526470184326,
"learning_rate": 1.144401401273062e-05,
"loss": 0.3882,
"step": 1035
},
{
"epoch": 0.7823296205399283,
"grad_norm": 0.26710647344589233,
"learning_rate": 1.1368357328413242e-05,
"loss": 0.3639,
"step": 1036
},
{
"epoch": 0.7830847649612989,
"grad_norm": 0.3097337782382965,
"learning_rate": 1.1292919468045877e-05,
"loss": 0.4394,
"step": 1037
},
{
"epoch": 0.7838399093826695,
"grad_norm": 0.28815412521362305,
"learning_rate": 1.1217700858936587e-05,
"loss": 0.4298,
"step": 1038
},
{
"epoch": 0.78459505380404,
"grad_norm": 0.30151909589767456,
"learning_rate": 1.1142701927151456e-05,
"loss": 0.393,
"step": 1039
},
{
"epoch": 0.7853501982254106,
"grad_norm": 0.30772241950035095,
"learning_rate": 1.1067923097512256e-05,
"loss": 0.3688,
"step": 1040
},
{
"epoch": 0.7861053426467812,
"grad_norm": 0.31982895731925964,
"learning_rate": 1.099336479359398e-05,
"loss": 0.3815,
"step": 1041
},
{
"epoch": 0.7868604870681518,
"grad_norm": 0.3298172950744629,
"learning_rate": 1.0919027437722513e-05,
"loss": 0.4153,
"step": 1042
},
{
"epoch": 0.7876156314895224,
"grad_norm": 0.33412277698516846,
"learning_rate": 1.0844911450972229e-05,
"loss": 0.3972,
"step": 1043
},
{
"epoch": 0.788370775910893,
"grad_norm": 0.3366442322731018,
"learning_rate": 1.0771017253163568e-05,
"loss": 0.3627,
"step": 1044
},
{
"epoch": 0.7891259203322636,
"grad_norm": 0.3680926263332367,
"learning_rate": 1.0697345262860636e-05,
"loss": 0.4297,
"step": 1045
},
{
"epoch": 0.7898810647536342,
"grad_norm": 0.3372995853424072,
"learning_rate": 1.0623895897368913e-05,
"loss": 0.3856,
"step": 1046
},
{
"epoch": 0.7906362091750048,
"grad_norm": 0.34899917244911194,
"learning_rate": 1.0550669572732863e-05,
"loss": 0.2923,
"step": 1047
},
{
"epoch": 0.7913913535963754,
"grad_norm": 0.4165075421333313,
"learning_rate": 1.0477666703733541e-05,
"loss": 0.3788,
"step": 1048
},
{
"epoch": 0.792146498017746,
"grad_norm": 0.44895628094673157,
"learning_rate": 1.0404887703886251e-05,
"loss": 0.373,
"step": 1049
},
{
"epoch": 0.7929016424391164,
"grad_norm": 0.4823060631752014,
"learning_rate": 1.0332332985438248e-05,
"loss": 0.3716,
"step": 1050
},
{
"epoch": 0.793656786860487,
"grad_norm": 0.15826448798179626,
"learning_rate": 1.0260002959366349e-05,
"loss": 0.3269,
"step": 1051
},
{
"epoch": 0.7944119312818576,
"grad_norm": 0.1592281609773636,
"learning_rate": 1.0187898035374682e-05,
"loss": 0.3417,
"step": 1052
},
{
"epoch": 0.7951670757032282,
"grad_norm": 0.18132025003433228,
"learning_rate": 1.0116018621892237e-05,
"loss": 0.3531,
"step": 1053
},
{
"epoch": 0.7959222201245988,
"grad_norm": 0.16262286901474,
"learning_rate": 1.0044365126070682e-05,
"loss": 0.3089,
"step": 1054
},
{
"epoch": 0.7966773645459694,
"grad_norm": 0.17961286008358002,
"learning_rate": 9.972937953781986e-06,
"loss": 0.3534,
"step": 1055
},
{
"epoch": 0.79743250896734,
"grad_norm": 0.17105191946029663,
"learning_rate": 9.901737509616143e-06,
"loss": 0.3361,
"step": 1056
},
{
"epoch": 0.7981876533887106,
"grad_norm": 0.1858292818069458,
"learning_rate": 9.830764196878872e-06,
"loss": 0.354,
"step": 1057
},
{
"epoch": 0.7989427978100812,
"grad_norm": 0.19711358845233917,
"learning_rate": 9.760018417589334e-06,
"loss": 0.3887,
"step": 1058
},
{
"epoch": 0.7996979422314517,
"grad_norm": 0.20733587443828583,
"learning_rate": 9.689500572477855e-06,
"loss": 0.4724,
"step": 1059
},
{
"epoch": 0.8004530866528223,
"grad_norm": 0.2481202483177185,
"learning_rate": 9.619211060983675e-06,
"loss": 0.4828,
"step": 1060
},
{
"epoch": 0.8012082310741929,
"grad_norm": 0.191118523478508,
"learning_rate": 9.549150281252633e-06,
"loss": 0.3909,
"step": 1061
},
{
"epoch": 0.8019633754955635,
"grad_norm": 0.19519171118736267,
"learning_rate": 9.479318630134976e-06,
"loss": 0.339,
"step": 1062
},
{
"epoch": 0.8027185199169341,
"grad_norm": 0.19820590317249298,
"learning_rate": 9.409716503183074e-06,
"loss": 0.3476,
"step": 1063
},
{
"epoch": 0.8034736643383047,
"grad_norm": 0.23485320806503296,
"learning_rate": 9.340344294649184e-06,
"loss": 0.4675,
"step": 1064
},
{
"epoch": 0.8042288087596753,
"grad_norm": 0.20198017358779907,
"learning_rate": 9.271202397483215e-06,
"loss": 0.336,
"step": 1065
},
{
"epoch": 0.8049839531810459,
"grad_norm": 0.19426412880420685,
"learning_rate": 9.20229120333052e-06,
"loss": 0.3578,
"step": 1066
},
{
"epoch": 0.8057390976024165,
"grad_norm": 0.2336643636226654,
"learning_rate": 9.133611102529654e-06,
"loss": 0.4355,
"step": 1067
},
{
"epoch": 0.8064942420237871,
"grad_norm": 0.2223149985074997,
"learning_rate": 9.065162484110179e-06,
"loss": 0.4256,
"step": 1068
},
{
"epoch": 0.8072493864451576,
"grad_norm": 0.23664018511772156,
"learning_rate": 8.996945735790447e-06,
"loss": 0.4148,
"step": 1069
},
{
"epoch": 0.8080045308665282,
"grad_norm": 0.22716124355793,
"learning_rate": 8.928961243975437e-06,
"loss": 0.3981,
"step": 1070
},
{
"epoch": 0.8087596752878988,
"grad_norm": 0.230534628033638,
"learning_rate": 8.861209393754477e-06,
"loss": 0.4269,
"step": 1071
},
{
"epoch": 0.8095148197092694,
"grad_norm": 0.21818408370018005,
"learning_rate": 8.793690568899216e-06,
"loss": 0.3498,
"step": 1072
},
{
"epoch": 0.81026996413064,
"grad_norm": 0.2515822947025299,
"learning_rate": 8.7264051518613e-06,
"loss": 0.4829,
"step": 1073
},
{
"epoch": 0.8110251085520106,
"grad_norm": 0.23883438110351562,
"learning_rate": 8.659353523770297e-06,
"loss": 0.3792,
"step": 1074
},
{
"epoch": 0.8117802529733812,
"grad_norm": 0.25294432044029236,
"learning_rate": 8.592536064431467e-06,
"loss": 0.3966,
"step": 1075
},
{
"epoch": 0.8125353973947518,
"grad_norm": 0.2528051435947418,
"learning_rate": 8.525953152323684e-06,
"loss": 0.4245,
"step": 1076
},
{
"epoch": 0.8132905418161224,
"grad_norm": 0.25422972440719604,
"learning_rate": 8.459605164597267e-06,
"loss": 0.4256,
"step": 1077
},
{
"epoch": 0.814045686237493,
"grad_norm": 0.2697378098964691,
"learning_rate": 8.393492477071829e-06,
"loss": 0.4137,
"step": 1078
},
{
"epoch": 0.8148008306588636,
"grad_norm": 0.25492045283317566,
"learning_rate": 8.327615464234129e-06,
"loss": 0.4055,
"step": 1079
},
{
"epoch": 0.8155559750802341,
"grad_norm": 0.26645827293395996,
"learning_rate": 8.261974499235987e-06,
"loss": 0.4531,
"step": 1080
},
{
"epoch": 0.8163111195016047,
"grad_norm": 0.2661876082420349,
"learning_rate": 8.196569953892202e-06,
"loss": 0.3774,
"step": 1081
},
{
"epoch": 0.8170662639229753,
"grad_norm": 0.2471131682395935,
"learning_rate": 8.131402198678373e-06,
"loss": 0.3474,
"step": 1082
},
{
"epoch": 0.8178214083443458,
"grad_norm": 0.26696231961250305,
"learning_rate": 8.066471602728803e-06,
"loss": 0.3357,
"step": 1083
},
{
"epoch": 0.8185765527657164,
"grad_norm": 0.26743122935295105,
"learning_rate": 8.001778533834487e-06,
"loss": 0.3404,
"step": 1084
},
{
"epoch": 0.819331697187087,
"grad_norm": 0.28732678294181824,
"learning_rate": 7.937323358440935e-06,
"loss": 0.389,
"step": 1085
},
{
"epoch": 0.8200868416084576,
"grad_norm": 0.30629798769950867,
"learning_rate": 7.873106441646205e-06,
"loss": 0.4185,
"step": 1086
},
{
"epoch": 0.8208419860298282,
"grad_norm": 0.2828892469406128,
"learning_rate": 7.809128147198691e-06,
"loss": 0.3792,
"step": 1087
},
{
"epoch": 0.8215971304511988,
"grad_norm": 0.28884345293045044,
"learning_rate": 7.745388837495188e-06,
"loss": 0.369,
"step": 1088
},
{
"epoch": 0.8223522748725693,
"grad_norm": 0.30468007922172546,
"learning_rate": 7.681888873578786e-06,
"loss": 0.4518,
"step": 1089
},
{
"epoch": 0.8231074192939399,
"grad_norm": 0.3138682246208191,
"learning_rate": 7.618628615136825e-06,
"loss": 0.3665,
"step": 1090
},
{
"epoch": 0.8238625637153105,
"grad_norm": 0.2910728454589844,
"learning_rate": 7.555608420498872e-06,
"loss": 0.2928,
"step": 1091
},
{
"epoch": 0.8246177081366811,
"grad_norm": 0.3152346611022949,
"learning_rate": 7.4928286466346754e-06,
"loss": 0.3834,
"step": 1092
},
{
"epoch": 0.8253728525580517,
"grad_norm": 0.336488276720047,
"learning_rate": 7.430289649152156e-06,
"loss": 0.3728,
"step": 1093
},
{
"epoch": 0.8261279969794223,
"grad_norm": 0.32753413915634155,
"learning_rate": 7.367991782295391e-06,
"loss": 0.3237,
"step": 1094
},
{
"epoch": 0.8268831414007929,
"grad_norm": 0.33121833205223083,
"learning_rate": 7.305935398942598e-06,
"loss": 0.3403,
"step": 1095
},
{
"epoch": 0.8276382858221635,
"grad_norm": 0.3293071389198303,
"learning_rate": 7.244120850604141e-06,
"loss": 0.3105,
"step": 1096
},
{
"epoch": 0.8283934302435341,
"grad_norm": 0.3871884047985077,
"learning_rate": 7.182548487420554e-06,
"loss": 0.3617,
"step": 1097
},
{
"epoch": 0.8291485746649047,
"grad_norm": 0.4038209915161133,
"learning_rate": 7.121218658160527e-06,
"loss": 0.4204,
"step": 1098
},
{
"epoch": 0.8299037190862752,
"grad_norm": 0.41719168424606323,
"learning_rate": 7.060131710218959e-06,
"loss": 0.299,
"step": 1099
},
{
"epoch": 0.8306588635076458,
"grad_norm": 0.4760392904281616,
"learning_rate": 6.999287989614972e-06,
"loss": 0.3683,
"step": 1100
},
{
"epoch": 0.8314140079290164,
"grad_norm": 0.17138616740703583,
"learning_rate": 6.9386878409899715e-06,
"loss": 0.3231,
"step": 1101
},
{
"epoch": 0.832169152350387,
"grad_norm": 0.16634538769721985,
"learning_rate": 6.87833160760567e-06,
"loss": 0.3349,
"step": 1102
},
{
"epoch": 0.8329242967717576,
"grad_norm": 0.17278127372264862,
"learning_rate": 6.818219631342149e-06,
"loss": 0.3614,
"step": 1103
},
{
"epoch": 0.8336794411931282,
"grad_norm": 0.16819556057453156,
"learning_rate": 6.758352252695949e-06,
"loss": 0.3442,
"step": 1104
},
{
"epoch": 0.8344345856144988,
"grad_norm": 0.181631401181221,
"learning_rate": 6.698729810778065e-06,
"loss": 0.3663,
"step": 1105
},
{
"epoch": 0.8351897300358694,
"grad_norm": 0.18373197317123413,
"learning_rate": 6.639352643312164e-06,
"loss": 0.3636,
"step": 1106
},
{
"epoch": 0.83594487445724,
"grad_norm": 0.1827540099620819,
"learning_rate": 6.580221086632516e-06,
"loss": 0.3765,
"step": 1107
},
{
"epoch": 0.8367000188786106,
"grad_norm": 0.1983499974012375,
"learning_rate": 6.521335475682205e-06,
"loss": 0.3806,
"step": 1108
},
{
"epoch": 0.8374551632999812,
"grad_norm": 0.2072797417640686,
"learning_rate": 6.462696144011149e-06,
"loss": 0.4196,
"step": 1109
},
{
"epoch": 0.8382103077213517,
"grad_norm": 0.20540378987789154,
"learning_rate": 6.40430342377426e-06,
"loss": 0.4063,
"step": 1110
},
{
"epoch": 0.8389654521427223,
"grad_norm": 0.22013606131076813,
"learning_rate": 6.346157645729589e-06,
"loss": 0.4732,
"step": 1111
},
{
"epoch": 0.8397205965640929,
"grad_norm": 0.2054942101240158,
"learning_rate": 6.2882591392363795e-06,
"loss": 0.3476,
"step": 1112
},
{
"epoch": 0.8404757409854635,
"grad_norm": 0.22685834765434265,
"learning_rate": 6.230608232253227e-06,
"loss": 0.4091,
"step": 1113
},
{
"epoch": 0.8412308854068341,
"grad_norm": 0.22038882970809937,
"learning_rate": 6.173205251336239e-06,
"loss": 0.4229,
"step": 1114
},
{
"epoch": 0.8419860298282047,
"grad_norm": 0.20709578692913055,
"learning_rate": 6.116050521637218e-06,
"loss": 0.4012,
"step": 1115
},
{
"epoch": 0.8427411742495753,
"grad_norm": 0.2158709317445755,
"learning_rate": 6.059144366901736e-06,
"loss": 0.3793,
"step": 1116
},
{
"epoch": 0.8434963186709458,
"grad_norm": 0.21242888271808624,
"learning_rate": 6.002487109467347e-06,
"loss": 0.334,
"step": 1117
},
{
"epoch": 0.8442514630923164,
"grad_norm": 0.23054109513759613,
"learning_rate": 5.946079070261773e-06,
"loss": 0.4508,
"step": 1118
},
{
"epoch": 0.845006607513687,
"grad_norm": 0.22902311384677887,
"learning_rate": 5.889920568801055e-06,
"loss": 0.4533,
"step": 1119
},
{
"epoch": 0.8457617519350575,
"grad_norm": 0.22745831310749054,
"learning_rate": 5.834011923187805e-06,
"loss": 0.4043,
"step": 1120
},
{
"epoch": 0.8465168963564281,
"grad_norm": 0.22601962089538574,
"learning_rate": 5.778353450109286e-06,
"loss": 0.4465,
"step": 1121
},
{
"epoch": 0.8472720407777987,
"grad_norm": 0.2339319884777069,
"learning_rate": 5.722945464835749e-06,
"loss": 0.3846,
"step": 1122
},
{
"epoch": 0.8480271851991693,
"grad_norm": 0.23972941935062408,
"learning_rate": 5.667788281218567e-06,
"loss": 0.4077,
"step": 1123
},
{
"epoch": 0.8487823296205399,
"grad_norm": 0.24830228090286255,
"learning_rate": 5.61288221168848e-06,
"loss": 0.4068,
"step": 1124
},
{
"epoch": 0.8495374740419105,
"grad_norm": 0.24531161785125732,
"learning_rate": 5.558227567253832e-06,
"loss": 0.3847,
"step": 1125
},
{
"epoch": 0.8502926184632811,
"grad_norm": 0.2520170509815216,
"learning_rate": 5.503824657498785e-06,
"loss": 0.3514,
"step": 1126
},
{
"epoch": 0.8510477628846517,
"grad_norm": 0.24631567299365997,
"learning_rate": 5.449673790581611e-06,
"loss": 0.4191,
"step": 1127
},
{
"epoch": 0.8518029073060223,
"grad_norm": 0.26101672649383545,
"learning_rate": 5.39577527323289e-06,
"loss": 0.4393,
"step": 1128
},
{
"epoch": 0.8525580517273929,
"grad_norm": 0.2639968991279602,
"learning_rate": 5.34212941075381e-06,
"loss": 0.4323,
"step": 1129
},
{
"epoch": 0.8533131961487634,
"grad_norm": 0.2551827132701874,
"learning_rate": 5.288736507014435e-06,
"loss": 0.3638,
"step": 1130
},
{
"epoch": 0.854068340570134,
"grad_norm": 0.24728746712207794,
"learning_rate": 5.235596864451975e-06,
"loss": 0.3579,
"step": 1131
},
{
"epoch": 0.8548234849915046,
"grad_norm": 0.2714408040046692,
"learning_rate": 5.182710784069067e-06,
"loss": 0.4218,
"step": 1132
},
{
"epoch": 0.8555786294128752,
"grad_norm": 0.26579996943473816,
"learning_rate": 5.13007856543209e-06,
"loss": 0.3796,
"step": 1133
},
{
"epoch": 0.8563337738342458,
"grad_norm": 0.2808961868286133,
"learning_rate": 5.077700506669425e-06,
"loss": 0.35,
"step": 1134
},
{
"epoch": 0.8570889182556164,
"grad_norm": 0.30303752422332764,
"learning_rate": 5.025576904469842e-06,
"loss": 0.4059,
"step": 1135
},
{
"epoch": 0.857844062676987,
"grad_norm": 0.30269986391067505,
"learning_rate": 4.97370805408075e-06,
"loss": 0.4018,
"step": 1136
},
{
"epoch": 0.8585992070983576,
"grad_norm": 0.27650511264801025,
"learning_rate": 4.922094249306558e-06,
"loss": 0.3363,
"step": 1137
},
{
"epoch": 0.8593543515197282,
"grad_norm": 0.28319036960601807,
"learning_rate": 4.87073578250698e-06,
"loss": 0.3566,
"step": 1138
},
{
"epoch": 0.8601094959410988,
"grad_norm": 0.31123289465904236,
"learning_rate": 4.819632944595415e-06,
"loss": 0.3427,
"step": 1139
},
{
"epoch": 0.8608646403624693,
"grad_norm": 0.3235510587692261,
"learning_rate": 4.768786025037309e-06,
"loss": 0.3882,
"step": 1140
},
{
"epoch": 0.8616197847838399,
"grad_norm": 0.34232163429260254,
"learning_rate": 4.7181953118484556e-06,
"loss": 0.4307,
"step": 1141
},
{
"epoch": 0.8623749292052105,
"grad_norm": 0.3100459575653076,
"learning_rate": 4.667861091593434e-06,
"loss": 0.3515,
"step": 1142
},
{
"epoch": 0.8631300736265811,
"grad_norm": 0.33639565110206604,
"learning_rate": 4.617783649383905e-06,
"loss": 0.4251,
"step": 1143
},
{
"epoch": 0.8638852180479517,
"grad_norm": 0.30844351649284363,
"learning_rate": 4.567963268877079e-06,
"loss": 0.336,
"step": 1144
},
{
"epoch": 0.8646403624693223,
"grad_norm": 0.3265226483345032,
"learning_rate": 4.5184002322740785e-06,
"loss": 0.3545,
"step": 1145
},
{
"epoch": 0.8653955068906929,
"grad_norm": 0.3475089967250824,
"learning_rate": 4.4690948203183255e-06,
"loss": 0.3436,
"step": 1146
},
{
"epoch": 0.8661506513120635,
"grad_norm": 0.3850986957550049,
"learning_rate": 4.4200473122939456e-06,
"loss": 0.4274,
"step": 1147
},
{
"epoch": 0.8669057957334341,
"grad_norm": 0.4068509340286255,
"learning_rate": 4.371257986024202e-06,
"loss": 0.4225,
"step": 1148
},
{
"epoch": 0.8676609401548047,
"grad_norm": 0.41805300116539,
"learning_rate": 4.322727117869951e-06,
"loss": 0.4207,
"step": 1149
},
{
"epoch": 0.8684160845761751,
"grad_norm": 0.4633561670780182,
"learning_rate": 4.274454982728032e-06,
"loss": 0.3865,
"step": 1150
},
{
"epoch": 0.8691712289975457,
"grad_norm": 0.13895747065544128,
"learning_rate": 4.2264418540297e-06,
"loss": 0.2752,
"step": 1151
},
{
"epoch": 0.8699263734189163,
"grad_norm": 0.16666154563426971,
"learning_rate": 4.178688003739129e-06,
"loss": 0.3396,
"step": 1152
},
{
"epoch": 0.8706815178402869,
"grad_norm": 0.1707499921321869,
"learning_rate": 4.131193702351827e-06,
"loss": 0.3197,
"step": 1153
},
{
"epoch": 0.8714366622616575,
"grad_norm": 0.176160529255867,
"learning_rate": 4.0839592188931576e-06,
"loss": 0.3482,
"step": 1154
},
{
"epoch": 0.8721918066830281,
"grad_norm": 0.18616865575313568,
"learning_rate": 4.036984820916723e-06,
"loss": 0.3697,
"step": 1155
},
{
"epoch": 0.8729469511043987,
"grad_norm": 0.19371068477630615,
"learning_rate": 3.990270774502941e-06,
"loss": 0.413,
"step": 1156
},
{
"epoch": 0.8737020955257693,
"grad_norm": 0.19405323266983032,
"learning_rate": 3.9438173442575e-06,
"loss": 0.3545,
"step": 1157
},
{
"epoch": 0.8744572399471399,
"grad_norm": 0.19808508455753326,
"learning_rate": 3.897624793309846e-06,
"loss": 0.3991,
"step": 1158
},
{
"epoch": 0.8752123843685105,
"grad_norm": 0.19052360951900482,
"learning_rate": 3.851693383311722e-06,
"loss": 0.3765,
"step": 1159
},
{
"epoch": 0.875967528789881,
"grad_norm": 0.1995311975479126,
"learning_rate": 3.8060233744356633e-06,
"loss": 0.4073,
"step": 1160
},
{
"epoch": 0.8767226732112516,
"grad_norm": 0.19826874136924744,
"learning_rate": 3.760615025373543e-06,
"loss": 0.3841,
"step": 1161
},
{
"epoch": 0.8774778176326222,
"grad_norm": 0.20532841980457306,
"learning_rate": 3.7154685933350864e-06,
"loss": 0.3538,
"step": 1162
},
{
"epoch": 0.8782329620539928,
"grad_norm": 0.20944344997406006,
"learning_rate": 3.6705843340464286e-06,
"loss": 0.4038,
"step": 1163
},
{
"epoch": 0.8789881064753634,
"grad_norm": 0.22046121954917908,
"learning_rate": 3.625962501748653e-06,
"loss": 0.4242,
"step": 1164
},
{
"epoch": 0.879743250896734,
"grad_norm": 0.22591526806354523,
"learning_rate": 3.581603349196372e-06,
"loss": 0.4469,
"step": 1165
},
{
"epoch": 0.8804983953181046,
"grad_norm": 0.22444604337215424,
"learning_rate": 3.53750712765627e-06,
"loss": 0.3994,
"step": 1166
},
{
"epoch": 0.8812535397394752,
"grad_norm": 0.2247525304555893,
"learning_rate": 3.4936740869057073e-06,
"loss": 0.4276,
"step": 1167
},
{
"epoch": 0.8820086841608458,
"grad_norm": 0.2309032678604126,
"learning_rate": 3.4501044752312582e-06,
"loss": 0.463,
"step": 1168
},
{
"epoch": 0.8827638285822164,
"grad_norm": 0.22152245044708252,
"learning_rate": 3.406798539427386e-06,
"loss": 0.3662,
"step": 1169
},
{
"epoch": 0.883518973003587,
"grad_norm": 0.2504233419895172,
"learning_rate": 3.3637565247949588e-06,
"loss": 0.4318,
"step": 1170
},
{
"epoch": 0.8842741174249575,
"grad_norm": 0.23805570602416992,
"learning_rate": 3.3209786751399187e-06,
"loss": 0.4156,
"step": 1171
},
{
"epoch": 0.8850292618463281,
"grad_norm": 0.23133568465709686,
"learning_rate": 3.2784652327718547e-06,
"loss": 0.3695,
"step": 1172
},
{
"epoch": 0.8857844062676987,
"grad_norm": 0.2318771332502365,
"learning_rate": 3.2362164385026706e-06,
"loss": 0.3824,
"step": 1173
},
{
"epoch": 0.8865395506890693,
"grad_norm": 0.22176909446716309,
"learning_rate": 3.194232531645219e-06,
"loss": 0.3575,
"step": 1174
},
{
"epoch": 0.8872946951104399,
"grad_norm": 0.24867790937423706,
"learning_rate": 3.1525137500119207e-06,
"loss": 0.4419,
"step": 1175
},
{
"epoch": 0.8880498395318105,
"grad_norm": 0.252105176448822,
"learning_rate": 3.111060329913401e-06,
"loss": 0.3854,
"step": 1176
},
{
"epoch": 0.8888049839531811,
"grad_norm": 0.24969127774238586,
"learning_rate": 3.069872506157212e-06,
"loss": 0.3825,
"step": 1177
},
{
"epoch": 0.8895601283745517,
"grad_norm": 0.2607312500476837,
"learning_rate": 3.0289505120464743e-06,
"loss": 0.3986,
"step": 1178
},
{
"epoch": 0.8903152727959223,
"grad_norm": 0.2574225664138794,
"learning_rate": 2.9882945793785367e-06,
"loss": 0.3998,
"step": 1179
},
{
"epoch": 0.8910704172172929,
"grad_norm": 0.25807490944862366,
"learning_rate": 2.947904938443663e-06,
"loss": 0.4147,
"step": 1180
},
{
"epoch": 0.8918255616386634,
"grad_norm": 0.24891719222068787,
"learning_rate": 2.9077818180237693e-06,
"loss": 0.3648,
"step": 1181
},
{
"epoch": 0.892580706060034,
"grad_norm": 0.27970343828201294,
"learning_rate": 2.8679254453910785e-06,
"loss": 0.4347,
"step": 1182
},
{
"epoch": 0.8933358504814045,
"grad_norm": 0.26613345742225647,
"learning_rate": 2.8283360463068785e-06,
"loss": 0.3969,
"step": 1183
},
{
"epoch": 0.8940909949027751,
"grad_norm": 0.3011374771595001,
"learning_rate": 2.789013845020205e-06,
"loss": 0.389,
"step": 1184
},
{
"epoch": 0.8948461393241457,
"grad_norm": 0.26815304160118103,
"learning_rate": 2.7499590642665774e-06,
"loss": 0.3572,
"step": 1185
},
{
"epoch": 0.8956012837455163,
"grad_norm": 0.27228736877441406,
"learning_rate": 2.7111719252667647e-06,
"loss": 0.3945,
"step": 1186
},
{
"epoch": 0.8963564281668869,
"grad_norm": 0.28344476222991943,
"learning_rate": 2.6726526477254987e-06,
"loss": 0.4098,
"step": 1187
},
{
"epoch": 0.8971115725882575,
"grad_norm": 0.3316936790943146,
"learning_rate": 2.6344014498302704e-06,
"loss": 0.4422,
"step": 1188
},
{
"epoch": 0.8978667170096281,
"grad_norm": 0.3098110556602478,
"learning_rate": 2.596418548250029e-06,
"loss": 0.3844,
"step": 1189
},
{
"epoch": 0.8986218614309986,
"grad_norm": 0.27956005930900574,
"learning_rate": 2.5587041581340233e-06,
"loss": 0.3017,
"step": 1190
},
{
"epoch": 0.8993770058523692,
"grad_norm": 0.3119887709617615,
"learning_rate": 2.52125849311054e-06,
"loss": 0.4119,
"step": 1191
},
{
"epoch": 0.9001321502737398,
"grad_norm": 0.3407526910305023,
"learning_rate": 2.4840817652857172e-06,
"loss": 0.4231,
"step": 1192
},
{
"epoch": 0.9008872946951104,
"grad_norm": 0.34797540307044983,
"learning_rate": 2.4471741852423237e-06,
"loss": 0.394,
"step": 1193
},
{
"epoch": 0.901642439116481,
"grad_norm": 0.3451668322086334,
"learning_rate": 2.4105359620385847e-06,
"loss": 0.3652,
"step": 1194
},
{
"epoch": 0.9023975835378516,
"grad_norm": 0.3375682532787323,
"learning_rate": 2.3741673032069756e-06,
"loss": 0.3551,
"step": 1195
},
{
"epoch": 0.9031527279592222,
"grad_norm": 0.36490514874458313,
"learning_rate": 2.338068414753075e-06,
"loss": 0.3753,
"step": 1196
},
{
"epoch": 0.9039078723805928,
"grad_norm": 0.3733910322189331,
"learning_rate": 2.3022395011543686e-06,
"loss": 0.4036,
"step": 1197
},
{
"epoch": 0.9046630168019634,
"grad_norm": 0.38239195942878723,
"learning_rate": 2.2666807653591083e-06,
"loss": 0.4228,
"step": 1198
},
{
"epoch": 0.905418161223334,
"grad_norm": 0.46867436170578003,
"learning_rate": 2.2313924087851656e-06,
"loss": 0.4567,
"step": 1199
},
{
"epoch": 0.9061733056447046,
"grad_norm": 0.6003281474113464,
"learning_rate": 2.196374631318876e-06,
"loss": 0.4376,
"step": 1200
},
{
"epoch": 0.9069284500660751,
"grad_norm": 0.1415417641401291,
"learning_rate": 2.161627631313923e-06,
"loss": 0.2603,
"step": 1201
},
{
"epoch": 0.9076835944874457,
"grad_norm": 0.1586138904094696,
"learning_rate": 2.1271516055901777e-06,
"loss": 0.3263,
"step": 1202
},
{
"epoch": 0.9084387389088163,
"grad_norm": 0.16962364315986633,
"learning_rate": 2.0929467494326614e-06,
"loss": 0.3294,
"step": 1203
},
{
"epoch": 0.9091938833301869,
"grad_norm": 0.16859561204910278,
"learning_rate": 2.0590132565903476e-06,
"loss": 0.3619,
"step": 1204
},
{
"epoch": 0.9099490277515575,
"grad_norm": 0.1871105134487152,
"learning_rate": 2.0253513192751373e-06,
"loss": 0.3679,
"step": 1205
},
{
"epoch": 0.9107041721729281,
"grad_norm": 0.18284808099269867,
"learning_rate": 1.9919611281607077e-06,
"loss": 0.3423,
"step": 1206
},
{
"epoch": 0.9114593165942987,
"grad_norm": 0.19052286446094513,
"learning_rate": 1.9588428723814946e-06,
"loss": 0.4149,
"step": 1207
},
{
"epoch": 0.9122144610156693,
"grad_norm": 0.20341211557388306,
"learning_rate": 1.925996739531577e-06,
"loss": 0.3938,
"step": 1208
},
{
"epoch": 0.9129696054370399,
"grad_norm": 0.19892559945583344,
"learning_rate": 1.8934229156636452e-06,
"loss": 0.361,
"step": 1209
},
{
"epoch": 0.9137247498584105,
"grad_norm": 0.21719536185264587,
"learning_rate": 1.8611215852879005e-06,
"loss": 0.429,
"step": 1210
},
{
"epoch": 0.914479894279781,
"grad_norm": 0.20925089716911316,
"learning_rate": 1.8290929313710513e-06,
"loss": 0.3961,
"step": 1211
},
{
"epoch": 0.9152350387011516,
"grad_norm": 0.21349644660949707,
"learning_rate": 1.797337135335292e-06,
"loss": 0.3969,
"step": 1212
},
{
"epoch": 0.9159901831225222,
"grad_norm": 0.2184101939201355,
"learning_rate": 1.7658543770572189e-06,
"loss": 0.3583,
"step": 1213
},
{
"epoch": 0.9167453275438928,
"grad_norm": 0.21886181831359863,
"learning_rate": 1.7346448348668443e-06,
"loss": 0.42,
"step": 1214
},
{
"epoch": 0.9175004719652634,
"grad_norm": 0.20769384503364563,
"learning_rate": 1.70370868554659e-06,
"loss": 0.3761,
"step": 1215
},
{
"epoch": 0.9182556163866339,
"grad_norm": 0.2125682830810547,
"learning_rate": 1.6730461043302726e-06,
"loss": 0.3481,
"step": 1216
},
{
"epoch": 0.9190107608080045,
"grad_norm": 0.22738561034202576,
"learning_rate": 1.6426572649021476e-06,
"loss": 0.4114,
"step": 1217
},
{
"epoch": 0.9197659052293751,
"grad_norm": 0.22645215690135956,
"learning_rate": 1.612542339395845e-06,
"loss": 0.4581,
"step": 1218
},
{
"epoch": 0.9205210496507457,
"grad_norm": 0.2223154753446579,
"learning_rate": 1.582701498393474e-06,
"loss": 0.4042,
"step": 1219
},
{
"epoch": 0.9212761940721163,
"grad_norm": 0.23326793313026428,
"learning_rate": 1.5531349109246362e-06,
"loss": 0.4387,
"step": 1220
},
{
"epoch": 0.9220313384934868,
"grad_norm": 0.23844638466835022,
"learning_rate": 1.523842744465437e-06,
"loss": 0.4143,
"step": 1221
},
{
"epoch": 0.9227864829148574,
"grad_norm": 0.23629891872406006,
"learning_rate": 1.4948251649375745e-06,
"loss": 0.4301,
"step": 1222
},
{
"epoch": 0.923541627336228,
"grad_norm": 0.23292282223701477,
"learning_rate": 1.4660823367073751e-06,
"loss": 0.389,
"step": 1223
},
{
"epoch": 0.9242967717575986,
"grad_norm": 0.25617265701293945,
"learning_rate": 1.437614422584882e-06,
"loss": 0.4615,
"step": 1224
},
{
"epoch": 0.9250519161789692,
"grad_norm": 0.23757751286029816,
"learning_rate": 1.4094215838229176e-06,
"loss": 0.3766,
"step": 1225
},
{
"epoch": 0.9258070606003398,
"grad_norm": 0.2631858289241791,
"learning_rate": 1.3815039801161721e-06,
"loss": 0.4561,
"step": 1226
},
{
"epoch": 0.9265622050217104,
"grad_norm": 0.2697192132472992,
"learning_rate": 1.3538617696003064e-06,
"loss": 0.4365,
"step": 1227
},
{
"epoch": 0.927317349443081,
"grad_norm": 0.23894578218460083,
"learning_rate": 1.3264951088510502e-06,
"loss": 0.3623,
"step": 1228
},
{
"epoch": 0.9280724938644516,
"grad_norm": 0.27340683341026306,
"learning_rate": 1.2994041528833266e-06,
"loss": 0.4602,
"step": 1229
},
{
"epoch": 0.9288276382858222,
"grad_norm": 0.2491341382265091,
"learning_rate": 1.2725890551503472e-06,
"loss": 0.3397,
"step": 1230
},
{
"epoch": 0.9295827827071927,
"grad_norm": 0.26839953660964966,
"learning_rate": 1.2460499675427729e-06,
"loss": 0.4095,
"step": 1231
},
{
"epoch": 0.9303379271285633,
"grad_norm": 0.28354331851005554,
"learning_rate": 1.2197870403878375e-06,
"loss": 0.4034,
"step": 1232
},
{
"epoch": 0.9310930715499339,
"grad_norm": 0.2778373062610626,
"learning_rate": 1.1938004224484988e-06,
"loss": 0.3612,
"step": 1233
},
{
"epoch": 0.9318482159713045,
"grad_norm": 0.28623783588409424,
"learning_rate": 1.1680902609225941e-06,
"loss": 0.3763,
"step": 1234
},
{
"epoch": 0.9326033603926751,
"grad_norm": 0.2844613790512085,
"learning_rate": 1.1426567014420297e-06,
"loss": 0.4077,
"step": 1235
},
{
"epoch": 0.9333585048140457,
"grad_norm": 0.2865941524505615,
"learning_rate": 1.1174998880718935e-06,
"loss": 0.3973,
"step": 1236
},
{
"epoch": 0.9341136492354163,
"grad_norm": 0.2908569276332855,
"learning_rate": 1.0926199633097157e-06,
"loss": 0.3688,
"step": 1237
},
{
"epoch": 0.9348687936567869,
"grad_norm": 0.3032941520214081,
"learning_rate": 1.0680170680846259e-06,
"loss": 0.3971,
"step": 1238
},
{
"epoch": 0.9356239380781575,
"grad_norm": 0.29943525791168213,
"learning_rate": 1.0436913417565365e-06,
"loss": 0.4151,
"step": 1239
},
{
"epoch": 0.9363790824995281,
"grad_norm": 0.3232915997505188,
"learning_rate": 1.0196429221153824e-06,
"loss": 0.336,
"step": 1240
},
{
"epoch": 0.9371342269208986,
"grad_norm": 0.3355953097343445,
"learning_rate": 9.958719453803278e-07,
"loss": 0.4568,
"step": 1241
},
{
"epoch": 0.9378893713422692,
"grad_norm": 0.35610586404800415,
"learning_rate": 9.723785461990099e-07,
"loss": 0.4446,
"step": 1242
},
{
"epoch": 0.9386445157636398,
"grad_norm": 0.347074031829834,
"learning_rate": 9.491628576467515e-07,
"loss": 0.4065,
"step": 1243
},
{
"epoch": 0.9393996601850104,
"grad_norm": 0.37643951177597046,
"learning_rate": 9.26225011225812e-07,
"loss": 0.3945,
"step": 1244
},
{
"epoch": 0.940154804606381,
"grad_norm": 0.34759992361068726,
"learning_rate": 9.035651368646648e-07,
"loss": 0.3654,
"step": 1245
},
{
"epoch": 0.9409099490277516,
"grad_norm": 0.3707546889781952,
"learning_rate": 8.811833629172428e-07,
"loss": 0.2976,
"step": 1246
},
{
"epoch": 0.9416650934491222,
"grad_norm": 0.3839170038700104,
"learning_rate": 8.590798161622227e-07,
"loss": 0.4161,
"step": 1247
},
{
"epoch": 0.9424202378704928,
"grad_norm": 0.3534461557865143,
"learning_rate": 8.372546218022747e-07,
"loss": 0.3231,
"step": 1248
},
{
"epoch": 0.9431753822918634,
"grad_norm": 0.4794304370880127,
"learning_rate": 8.157079034633974e-07,
"loss": 0.4584,
"step": 1249
},
{
"epoch": 0.9439305267132339,
"grad_norm": 0.5317772030830383,
"learning_rate": 7.944397831941952e-07,
"loss": 0.4143,
"step": 1250
},
{
"epoch": 0.9446856711346044,
"grad_norm": 0.16856196522712708,
"learning_rate": 7.734503814651906e-07,
"loss": 0.3159,
"step": 1251
},
{
"epoch": 0.945440815555975,
"grad_norm": 0.16190548241138458,
"learning_rate": 7.527398171681354e-07,
"loss": 0.3332,
"step": 1252
},
{
"epoch": 0.9461959599773456,
"grad_norm": 0.16370098292827606,
"learning_rate": 7.323082076153509e-07,
"loss": 0.331,
"step": 1253
},
{
"epoch": 0.9469511043987162,
"grad_norm": 0.16921286284923553,
"learning_rate": 7.12155668539044e-07,
"loss": 0.2992,
"step": 1254
},
{
"epoch": 0.9477062488200868,
"grad_norm": 0.17997263371944427,
"learning_rate": 6.922823140906753e-07,
"loss": 0.3884,
"step": 1255
},
{
"epoch": 0.9484613932414574,
"grad_norm": 0.18398523330688477,
"learning_rate": 6.726882568402871e-07,
"loss": 0.3779,
"step": 1256
},
{
"epoch": 0.949216537662828,
"grad_norm": 0.19981886446475983,
"learning_rate": 6.533736077758868e-07,
"loss": 0.3687,
"step": 1257
},
{
"epoch": 0.9499716820841986,
"grad_norm": 0.19422514736652374,
"learning_rate": 6.343384763028148e-07,
"loss": 0.3624,
"step": 1258
},
{
"epoch": 0.9507268265055692,
"grad_norm": 0.19577431678771973,
"learning_rate": 6.15582970243117e-07,
"loss": 0.4185,
"step": 1259
},
{
"epoch": 0.9514819709269398,
"grad_norm": 0.2101883441209793,
"learning_rate": 5.971071958349228e-07,
"loss": 0.3948,
"step": 1260
},
{
"epoch": 0.9522371153483103,
"grad_norm": 0.19973735511302948,
"learning_rate": 5.78911257731879e-07,
"loss": 0.4095,
"step": 1261
},
{
"epoch": 0.9529922597696809,
"grad_norm": 0.20807954668998718,
"learning_rate": 5.609952590025224e-07,
"loss": 0.3892,
"step": 1262
},
{
"epoch": 0.9537474041910515,
"grad_norm": 0.2160942405462265,
"learning_rate": 5.4335930112972e-07,
"loss": 0.3782,
"step": 1263
},
{
"epoch": 0.9545025486124221,
"grad_norm": 0.1981229931116104,
"learning_rate": 5.260034840100736e-07,
"loss": 0.3768,
"step": 1264
},
{
"epoch": 0.9552576930337927,
"grad_norm": 0.19447720050811768,
"learning_rate": 5.089279059533658e-07,
"loss": 0.3527,
"step": 1265
},
{
"epoch": 0.9560128374551633,
"grad_norm": 0.21603922545909882,
"learning_rate": 4.92132663681999e-07,
"loss": 0.3792,
"step": 1266
},
{
"epoch": 0.9567679818765339,
"grad_norm": 0.22324316203594208,
"learning_rate": 4.756178523304622e-07,
"loss": 0.413,
"step": 1267
},
{
"epoch": 0.9575231262979045,
"grad_norm": 0.21948575973510742,
"learning_rate": 4.593835654447709e-07,
"loss": 0.4481,
"step": 1268
},
{
"epoch": 0.9582782707192751,
"grad_norm": 0.21984027326107025,
"learning_rate": 4.434298949819449e-07,
"loss": 0.3766,
"step": 1269
},
{
"epoch": 0.9590334151406457,
"grad_norm": 0.24279214441776276,
"learning_rate": 4.277569313094809e-07,
"loss": 0.3761,
"step": 1270
},
{
"epoch": 0.9597885595620163,
"grad_norm": 0.23110735416412354,
"learning_rate": 4.123647632048644e-07,
"loss": 0.4528,
"step": 1271
},
{
"epoch": 0.9605437039833868,
"grad_norm": 0.2304868847131729,
"learning_rate": 3.972534778550474e-07,
"loss": 0.3633,
"step": 1272
},
{
"epoch": 0.9612988484047574,
"grad_norm": 0.2233697474002838,
"learning_rate": 3.824231608559492e-07,
"loss": 0.397,
"step": 1273
},
{
"epoch": 0.962053992826128,
"grad_norm": 0.23600885272026062,
"learning_rate": 3.6787389621198987e-07,
"loss": 0.4136,
"step": 1274
},
{
"epoch": 0.9628091372474986,
"grad_norm": 0.23501838743686676,
"learning_rate": 3.536057663355852e-07,
"loss": 0.323,
"step": 1275
},
{
"epoch": 0.9635642816688692,
"grad_norm": 0.2491409033536911,
"learning_rate": 3.3961885204673026e-07,
"loss": 0.4763,
"step": 1276
},
{
"epoch": 0.9643194260902398,
"grad_norm": 0.2430865615606308,
"learning_rate": 3.2591323257248893e-07,
"loss": 0.3949,
"step": 1277
},
{
"epoch": 0.9650745705116104,
"grad_norm": 0.2593831717967987,
"learning_rate": 3.124889855465718e-07,
"loss": 0.3738,
"step": 1278
},
{
"epoch": 0.965829714932981,
"grad_norm": 0.24975533783435822,
"learning_rate": 2.993461870088921e-07,
"loss": 0.3614,
"step": 1279
},
{
"epoch": 0.9665848593543516,
"grad_norm": 0.25487789511680603,
"learning_rate": 2.8648491140513266e-07,
"loss": 0.46,
"step": 1280
},
{
"epoch": 0.9673400037757222,
"grad_norm": 0.2505668103694916,
"learning_rate": 2.7390523158633554e-07,
"loss": 0.3317,
"step": 1281
},
{
"epoch": 0.9680951481970927,
"grad_norm": 0.261342853307724,
"learning_rate": 2.616072188084628e-07,
"loss": 0.3857,
"step": 1282
},
{
"epoch": 0.9688502926184632,
"grad_norm": 0.2685639262199402,
"learning_rate": 2.4959094273201977e-07,
"loss": 0.3631,
"step": 1283
},
{
"epoch": 0.9696054370398338,
"grad_norm": 0.26906096935272217,
"learning_rate": 2.378564714216547e-07,
"loss": 0.4212,
"step": 1284
},
{
"epoch": 0.9703605814612044,
"grad_norm": 0.28708136081695557,
"learning_rate": 2.2640387134577058e-07,
"loss": 0.5326,
"step": 1285
},
{
"epoch": 0.971115725882575,
"grad_norm": 0.2781200706958771,
"learning_rate": 2.1523320737613095e-07,
"loss": 0.3952,
"step": 1286
},
{
"epoch": 0.9718708703039456,
"grad_norm": 0.2814629077911377,
"learning_rate": 2.0434454278752123e-07,
"loss": 0.3578,
"step": 1287
},
{
"epoch": 0.9726260147253162,
"grad_norm": 0.2876596450805664,
"learning_rate": 1.937379392573768e-07,
"loss": 0.3978,
"step": 1288
},
{
"epoch": 0.9733811591466868,
"grad_norm": 0.3008500635623932,
"learning_rate": 1.8341345686543332e-07,
"loss": 0.3936,
"step": 1289
},
{
"epoch": 0.9741363035680574,
"grad_norm": 0.32256075739860535,
"learning_rate": 1.7337115409338244e-07,
"loss": 0.4423,
"step": 1290
},
{
"epoch": 0.974891447989428,
"grad_norm": 0.3017309010028839,
"learning_rate": 1.6361108782456113e-07,
"loss": 0.4027,
"step": 1291
},
{
"epoch": 0.9756465924107985,
"grad_norm": 0.3096626400947571,
"learning_rate": 1.5413331334360182e-07,
"loss": 0.3962,
"step": 1292
},
{
"epoch": 0.9764017368321691,
"grad_norm": 0.3427668511867523,
"learning_rate": 1.449378843361271e-07,
"loss": 0.4388,
"step": 1293
},
{
"epoch": 0.9771568812535397,
"grad_norm": 0.3453672528266907,
"learning_rate": 1.360248528884611e-07,
"loss": 0.4638,
"step": 1294
},
{
"epoch": 0.9779120256749103,
"grad_norm": 0.33292150497436523,
"learning_rate": 1.2739426948732424e-07,
"loss": 0.3513,
"step": 1295
},
{
"epoch": 0.9786671700962809,
"grad_norm": 0.34688884019851685,
"learning_rate": 1.190461830195333e-07,
"loss": 0.3666,
"step": 1296
},
{
"epoch": 0.9794223145176515,
"grad_norm": 0.41783586144447327,
"learning_rate": 1.109806407717462e-07,
"loss": 0.4518,
"step": 1297
},
{
"epoch": 0.9801774589390221,
"grad_norm": 0.36416101455688477,
"learning_rate": 1.0319768843018996e-07,
"loss": 0.3785,
"step": 1298
},
{
"epoch": 0.9809326033603927,
"grad_norm": 0.4353952407836914,
"learning_rate": 9.56973700803887e-08,
"loss": 0.4347,
"step": 1299
},
{
"epoch": 0.9816877477817633,
"grad_norm": 0.5201441645622253,
"learning_rate": 8.847972820693051e-08,
"loss": 0.3487,
"step": 1300
},
{
"epoch": 0.9824428922031339,
"grad_norm": 0.14752991497516632,
"learning_rate": 8.15448036932176e-08,
"loss": 0.3271,
"step": 1301
},
{
"epoch": 0.9831980366245044,
"grad_norm": 0.1663774847984314,
"learning_rate": 7.489263582122763e-08,
"loss": 0.3467,
"step": 1302
},
{
"epoch": 0.983953181045875,
"grad_norm": 0.18630105257034302,
"learning_rate": 6.852326227130834e-08,
"loss": 0.4198,
"step": 1303
},
{
"epoch": 0.9847083254672456,
"grad_norm": 0.18574944138526917,
"learning_rate": 6.243671912194993e-08,
"loss": 0.393,
"step": 1304
},
{
"epoch": 0.9854634698886162,
"grad_norm": 0.20900239050388336,
"learning_rate": 5.663304084960186e-08,
"loss": 0.3765,
"step": 1305
},
{
"epoch": 0.9862186143099868,
"grad_norm": 0.20655685663223267,
"learning_rate": 5.111226032843974e-08,
"loss": 0.4079,
"step": 1306
},
{
"epoch": 0.9869737587313574,
"grad_norm": 0.22022481262683868,
"learning_rate": 4.5874408830215434e-08,
"loss": 0.4319,
"step": 1307
},
{
"epoch": 0.987728903152728,
"grad_norm": 0.22208468616008759,
"learning_rate": 4.0919516024057195e-08,
"loss": 0.4083,
"step": 1308
},
{
"epoch": 0.9884840475740986,
"grad_norm": 0.24290700256824493,
"learning_rate": 3.624760997631982e-08,
"loss": 0.4351,
"step": 1309
},
{
"epoch": 0.9892391919954692,
"grad_norm": 0.22663183510303497,
"learning_rate": 3.185871715041255e-08,
"loss": 0.3828,
"step": 1310
},
{
"epoch": 0.9899943364168398,
"grad_norm": 0.25170832872390747,
"learning_rate": 2.7752862406654757e-08,
"loss": 0.4401,
"step": 1311
},
{
"epoch": 0.9907494808382104,
"grad_norm": 0.2576093077659607,
"learning_rate": 2.393006900212047e-08,
"loss": 0.4513,
"step": 1312
},
{
"epoch": 0.9915046252595809,
"grad_norm": 0.2478175014257431,
"learning_rate": 2.0390358590538504e-08,
"loss": 0.3902,
"step": 1313
},
{
"epoch": 0.9922597696809515,
"grad_norm": 0.26220056414604187,
"learning_rate": 1.7133751222137007e-08,
"loss": 0.4562,
"step": 1314
},
{
"epoch": 0.9930149141023221,
"grad_norm": 0.272030234336853,
"learning_rate": 1.4160265343549083e-08,
"loss": 0.4128,
"step": 1315
},
{
"epoch": 0.9937700585236926,
"grad_norm": 0.282482385635376,
"learning_rate": 1.1469917797696239e-08,
"loss": 0.4678,
"step": 1316
},
{
"epoch": 0.9945252029450632,
"grad_norm": 0.29758918285369873,
"learning_rate": 9.06272382371065e-09,
"loss": 0.4549,
"step": 1317
},
{
"epoch": 0.9952803473664338,
"grad_norm": 0.2753942012786865,
"learning_rate": 6.9386970568297014e-09,
"loss": 0.3572,
"step": 1318
},
{
"epoch": 0.9960354917878044,
"grad_norm": 0.3179655075073242,
"learning_rate": 5.097849528334919e-09,
"loss": 0.4328,
"step": 1319
},
{
"epoch": 0.996790636209175,
"grad_norm": 0.30263400077819824,
"learning_rate": 3.540191665457604e-09,
"loss": 0.3691,
"step": 1320
},
{
"epoch": 0.9975457806305456,
"grad_norm": 0.3153160810470581,
"learning_rate": 2.265732291356626e-09,
"loss": 0.3449,
"step": 1321
},
{
"epoch": 0.9983009250519161,
"grad_norm": 0.33442163467407227,
"learning_rate": 1.2744786250407092e-09,
"loss": 0.3951,
"step": 1322
},
{
"epoch": 0.9990560694732867,
"grad_norm": 0.3670744299888611,
"learning_rate": 5.664362813406765e-10,
"loss": 0.3497,
"step": 1323
},
{
"epoch": 0.9998112138946573,
"grad_norm": 0.5614824891090393,
"learning_rate": 1.416092708650396e-10,
"loss": 0.4963,
"step": 1324
},
{
"epoch": 0.9998112138946573,
"eval_loss": 0.3858806788921356,
"eval_runtime": 92.8566,
"eval_samples_per_second": 12.008,
"eval_steps_per_second": 3.005,
"step": 1324
},
{
"epoch": 1.000566358316028,
"grad_norm": 3.6732022762298584,
"learning_rate": 0.0,
"loss": 1.1897,
"step": 1325
}
],
"logging_steps": 1,
"max_steps": 1325,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 332,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.7073098196516864e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}