{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 50.0,
"eval_steps": 500,
"global_step": 80650,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.30998140111593303,
"grad_norm": 0.3543250262737274,
"learning_rate": 6.195786864931847e-05,
"loss": 9.0345,
"step": 500
},
{
"epoch": 0.6199628022318661,
"grad_norm": 0.5106557607650757,
"learning_rate": 9.97582756158962e-05,
"loss": 6.2184,
"step": 1000
},
{
"epoch": 0.9299442033477991,
"grad_norm": 2.6617751121520996,
"learning_rate": 9.913204664153402e-05,
"loss": 5.4194,
"step": 1500
},
{
"epoch": 1.2399256044637321,
"grad_norm": 1.8096632957458496,
"learning_rate": 9.850581766717182e-05,
"loss": 3.915,
"step": 2000
},
{
"epoch": 1.5499070055796653,
"grad_norm": 1.2520173788070679,
"learning_rate": 9.787958869280964e-05,
"loss": 2.7963,
"step": 2500
},
{
"epoch": 1.8598884066955983,
"grad_norm": 0.8099603056907654,
"learning_rate": 9.725335971844745e-05,
"loss": 2.2568,
"step": 3000
},
{
"epoch": 2.1698698078115313,
"grad_norm": 0.7233591079711914,
"learning_rate": 9.662713074408527e-05,
"loss": 1.9847,
"step": 3500
},
{
"epoch": 2.4798512089274642,
"grad_norm": 0.6427165865898132,
"learning_rate": 9.600090176972308e-05,
"loss": 1.8216,
"step": 4000
},
{
"epoch": 2.7898326100433977,
"grad_norm": 0.6729193925857544,
"learning_rate": 9.53746727953609e-05,
"loss": 1.7067,
"step": 4500
},
{
"epoch": 3.0998140111593306,
"grad_norm": 0.6484789848327637,
"learning_rate": 9.47484438209987e-05,
"loss": 1.6187,
"step": 5000
},
{
"epoch": 3.4097954122752636,
"grad_norm": 0.5950448513031006,
"learning_rate": 9.412221484663653e-05,
"loss": 1.5479,
"step": 5500
},
{
"epoch": 3.7197768133911966,
"grad_norm": 0.6102598309516907,
"learning_rate": 9.349598587227433e-05,
"loss": 1.4879,
"step": 6000
},
{
"epoch": 4.02975821450713,
"grad_norm": 0.6204754710197449,
"learning_rate": 9.286975689791215e-05,
"loss": 1.4379,
"step": 6500
},
{
"epoch": 4.3397396156230625,
"grad_norm": 0.590217649936676,
"learning_rate": 9.224352792354997e-05,
"loss": 1.3926,
"step": 7000
},
{
"epoch": 4.6497210167389955,
"grad_norm": 0.6062743663787842,
"learning_rate": 9.161729894918779e-05,
"loss": 1.3553,
"step": 7500
},
{
"epoch": 4.9597024178549285,
"grad_norm": 0.5663708448410034,
"learning_rate": 9.09910699748256e-05,
"loss": 1.3201,
"step": 8000
},
{
"epoch": 5.2696838189708615,
"grad_norm": 0.5806947350502014,
"learning_rate": 9.036484100046342e-05,
"loss": 1.2904,
"step": 8500
},
{
"epoch": 5.579665220086794,
"grad_norm": 0.6131803393363953,
"learning_rate": 8.973861202610123e-05,
"loss": 1.2623,
"step": 9000
},
{
"epoch": 5.889646621202727,
"grad_norm": 0.5666236281394958,
"learning_rate": 8.911238305173905e-05,
"loss": 1.2368,
"step": 9500
},
{
"epoch": 6.199628022318661,
"grad_norm": 0.6078547239303589,
"learning_rate": 8.848615407737685e-05,
"loss": 1.212,
"step": 10000
},
{
"epoch": 6.509609423434594,
"grad_norm": 0.575513482093811,
"learning_rate": 8.785992510301467e-05,
"loss": 1.1914,
"step": 10500
},
{
"epoch": 6.819590824550527,
"grad_norm": 0.5826976895332336,
"learning_rate": 8.723369612865248e-05,
"loss": 1.1718,
"step": 11000
},
{
"epoch": 7.12957222566646,
"grad_norm": 0.544598400592804,
"learning_rate": 8.66074671542903e-05,
"loss": 1.1548,
"step": 11500
},
{
"epoch": 7.439553626782393,
"grad_norm": 0.5824791193008423,
"learning_rate": 8.598123817992811e-05,
"loss": 1.1363,
"step": 12000
},
{
"epoch": 7.749535027898326,
"grad_norm": 0.5747692584991455,
"learning_rate": 8.535500920556593e-05,
"loss": 1.1211,
"step": 12500
},
{
"epoch": 8.05951642901426,
"grad_norm": 0.5473280549049377,
"learning_rate": 8.472878023120375e-05,
"loss": 1.1077,
"step": 13000
},
{
"epoch": 8.369497830130193,
"grad_norm": 0.5574379563331604,
"learning_rate": 8.410255125684155e-05,
"loss": 1.0908,
"step": 13500
},
{
"epoch": 8.679479231246125,
"grad_norm": 0.5424452424049377,
"learning_rate": 8.347632228247937e-05,
"loss": 1.0785,
"step": 14000
},
{
"epoch": 8.989460632362059,
"grad_norm": 0.5508283376693726,
"learning_rate": 8.285009330811718e-05,
"loss": 1.0683,
"step": 14500
},
{
"epoch": 9.299442033477991,
"grad_norm": 0.5519115924835205,
"learning_rate": 8.2223864333755e-05,
"loss": 1.0537,
"step": 15000
},
{
"epoch": 9.609423434593925,
"grad_norm": 0.5510475039482117,
"learning_rate": 8.159763535939281e-05,
"loss": 1.0443,
"step": 15500
},
{
"epoch": 9.919404835709857,
"grad_norm": 0.5631123185157776,
"learning_rate": 8.097140638503063e-05,
"loss": 1.0339,
"step": 16000
},
{
"epoch": 10.22938623682579,
"grad_norm": 0.5705382823944092,
"learning_rate": 8.034517741066844e-05,
"loss": 1.0217,
"step": 16500
},
{
"epoch": 10.539367637941723,
"grad_norm": 0.5316577553749084,
"learning_rate": 7.971894843630626e-05,
"loss": 1.0151,
"step": 17000
},
{
"epoch": 10.849349039057657,
"grad_norm": 0.5557442307472229,
"learning_rate": 7.909271946194406e-05,
"loss": 1.0043,
"step": 17500
},
{
"epoch": 11.159330440173589,
"grad_norm": 0.5498985648155212,
"learning_rate": 7.846649048758188e-05,
"loss": 0.9951,
"step": 18000
},
{
"epoch": 11.469311841289523,
"grad_norm": 0.552780032157898,
"learning_rate": 7.784026151321969e-05,
"loss": 0.9855,
"step": 18500
},
{
"epoch": 11.779293242405455,
"grad_norm": 0.5406888127326965,
"learning_rate": 7.721403253885752e-05,
"loss": 0.9795,
"step": 19000
},
{
"epoch": 12.089274643521389,
"grad_norm": 0.537375271320343,
"learning_rate": 7.658780356449533e-05,
"loss": 0.971,
"step": 19500
},
{
"epoch": 12.399256044637323,
"grad_norm": 0.5666614174842834,
"learning_rate": 7.596157459013315e-05,
"loss": 0.9643,
"step": 20000
},
{
"epoch": 12.709237445753255,
"grad_norm": 0.5302731990814209,
"learning_rate": 7.533659807371968e-05,
"loss": 0.9582,
"step": 20500
},
{
"epoch": 13.019218846869189,
"grad_norm": 0.5608243346214294,
"learning_rate": 7.471036909935749e-05,
"loss": 0.9512,
"step": 21000
},
{
"epoch": 13.32920024798512,
"grad_norm": 0.5309119820594788,
"learning_rate": 7.408414012499531e-05,
"loss": 0.9424,
"step": 21500
},
{
"epoch": 13.639181649101054,
"grad_norm": 0.5380939245223999,
"learning_rate": 7.345791115063312e-05,
"loss": 0.9383,
"step": 22000
},
{
"epoch": 13.949163050216987,
"grad_norm": 0.5440984964370728,
"learning_rate": 7.283168217627094e-05,
"loss": 0.9298,
"step": 22500
},
{
"epoch": 14.25914445133292,
"grad_norm": 0.5377441048622131,
"learning_rate": 7.220545320190874e-05,
"loss": 0.9245,
"step": 23000
},
{
"epoch": 14.569125852448852,
"grad_norm": 0.5402495265007019,
"learning_rate": 7.157922422754656e-05,
"loss": 0.9196,
"step": 23500
},
{
"epoch": 14.879107253564786,
"grad_norm": 0.5610705018043518,
"learning_rate": 7.095299525318437e-05,
"loss": 0.9146,
"step": 24000
},
{
"epoch": 15.189088654680718,
"grad_norm": 0.5305636525154114,
"learning_rate": 7.032676627882219e-05,
"loss": 0.9071,
"step": 24500
},
{
"epoch": 15.499070055796652,
"grad_norm": 0.5398979187011719,
"learning_rate": 6.970053730446e-05,
"loss": 0.9037,
"step": 25000
},
{
"epoch": 15.809051456912584,
"grad_norm": 0.5490283370018005,
"learning_rate": 6.907556078804655e-05,
"loss": 0.8982,
"step": 25500
},
{
"epoch": 16.11903285802852,
"grad_norm": 0.5505014061927795,
"learning_rate": 6.844933181368435e-05,
"loss": 0.8933,
"step": 26000
},
{
"epoch": 16.429014259144452,
"grad_norm": 0.5260488390922546,
"learning_rate": 6.782310283932217e-05,
"loss": 0.8865,
"step": 26500
},
{
"epoch": 16.738995660260386,
"grad_norm": 0.5459970235824585,
"learning_rate": 6.719687386495999e-05,
"loss": 0.8837,
"step": 27000
},
{
"epoch": 17.048977061376316,
"grad_norm": 0.5260828733444214,
"learning_rate": 6.657189734854653e-05,
"loss": 0.8812,
"step": 27500
},
{
"epoch": 17.35895846249225,
"grad_norm": 0.531878650188446,
"learning_rate": 6.594566837418435e-05,
"loss": 0.874,
"step": 28000
},
{
"epoch": 17.668939863608184,
"grad_norm": 0.5373751521110535,
"learning_rate": 6.531943939982215e-05,
"loss": 0.8703,
"step": 28500
},
{
"epoch": 17.978921264724118,
"grad_norm": 0.5685413479804993,
"learning_rate": 6.469321042545997e-05,
"loss": 0.8674,
"step": 29000
},
{
"epoch": 18.288902665840048,
"grad_norm": 0.5405117273330688,
"learning_rate": 6.406698145109778e-05,
"loss": 0.8618,
"step": 29500
},
{
"epoch": 18.598884066955982,
"grad_norm": 0.5303318500518799,
"learning_rate": 6.344325739263305e-05,
"loss": 0.8572,
"step": 30000
},
{
"epoch": 18.908865468071916,
"grad_norm": 0.5173208117485046,
"learning_rate": 6.281702841827086e-05,
"loss": 0.8552,
"step": 30500
},
{
"epoch": 19.21884686918785,
"grad_norm": 0.5334449410438538,
"learning_rate": 6.219079944390868e-05,
"loss": 0.8494,
"step": 31000
},
{
"epoch": 19.52882827030378,
"grad_norm": 0.5522080659866333,
"learning_rate": 6.156457046954649e-05,
"loss": 0.8464,
"step": 31500
},
{
"epoch": 19.838809671419714,
"grad_norm": 0.5295758247375488,
"learning_rate": 6.09383414951843e-05,
"loss": 0.845,
"step": 32000
},
{
"epoch": 20.148791072535648,
"grad_norm": 0.5164583325386047,
"learning_rate": 6.0312112520822115e-05,
"loss": 0.8395,
"step": 32500
},
{
"epoch": 20.45877247365158,
"grad_norm": 0.5620171427726746,
"learning_rate": 5.968713600440865e-05,
"loss": 0.8354,
"step": 33000
},
{
"epoch": 20.768753874767516,
"grad_norm": 0.5254458785057068,
"learning_rate": 5.906090703004646e-05,
"loss": 0.8336,
"step": 33500
},
{
"epoch": 21.078735275883446,
"grad_norm": 0.5437597632408142,
"learning_rate": 5.8434678055684276e-05,
"loss": 0.8304,
"step": 34000
},
{
"epoch": 21.38871667699938,
"grad_norm": 0.5438856482505798,
"learning_rate": 5.78084490813221e-05,
"loss": 0.8263,
"step": 34500
},
{
"epoch": 21.698698078115314,
"grad_norm": 0.5386750102043152,
"learning_rate": 5.7182220106959916e-05,
"loss": 0.8248,
"step": 35000
},
{
"epoch": 22.008679479231247,
"grad_norm": 0.5307642817497253,
"learning_rate": 5.655724359054645e-05,
"loss": 0.8223,
"step": 35500
},
{
"epoch": 22.318660880347178,
"grad_norm": 0.5404214859008789,
"learning_rate": 5.5931014616184264e-05,
"loss": 0.8176,
"step": 36000
},
{
"epoch": 22.62864228146311,
"grad_norm": 0.555665910243988,
"learning_rate": 5.530478564182208e-05,
"loss": 0.8164,
"step": 36500
},
{
"epoch": 22.938623682579045,
"grad_norm": 0.5331476330757141,
"learning_rate": 5.467855666745989e-05,
"loss": 0.8135,
"step": 37000
},
{
"epoch": 23.24860508369498,
"grad_norm": 0.541491687297821,
"learning_rate": 5.405358015104644e-05,
"loss": 0.8097,
"step": 37500
},
{
"epoch": 23.55858648481091,
"grad_norm": 0.5554507374763489,
"learning_rate": 5.342735117668425e-05,
"loss": 0.8074,
"step": 38000
},
{
"epoch": 23.868567885926844,
"grad_norm": 0.5485785007476807,
"learning_rate": 5.2801122202322065e-05,
"loss": 0.8054,
"step": 38500
},
{
"epoch": 24.178549287042777,
"grad_norm": 0.5320767164230347,
"learning_rate": 5.217489322795988e-05,
"loss": 0.8018,
"step": 39000
},
{
"epoch": 24.48853068815871,
"grad_norm": 0.5248667001724243,
"learning_rate": 5.154866425359769e-05,
"loss": 0.8008,
"step": 39500
},
{
"epoch": 24.798512089274645,
"grad_norm": 0.5368346571922302,
"learning_rate": 5.0922435279235505e-05,
"loss": 0.7975,
"step": 40000
},
{
"epoch": 25.108493490390575,
"grad_norm": 0.53144371509552,
"learning_rate": 5.029620630487332e-05,
"loss": 0.7947,
"step": 40500
},
{
"epoch": 25.41847489150651,
"grad_norm": 0.5482547879219055,
"learning_rate": 4.966997733051113e-05,
"loss": 0.793,
"step": 41000
},
{
"epoch": 25.728456292622443,
"grad_norm": 0.5446964502334595,
"learning_rate": 4.9043748356148946e-05,
"loss": 0.7905,
"step": 41500
},
{
"epoch": 26.038437693738377,
"grad_norm": 0.5257270932197571,
"learning_rate": 4.841751938178676e-05,
"loss": 0.7892,
"step": 42000
},
{
"epoch": 26.348419094854307,
"grad_norm": 0.5478941202163696,
"learning_rate": 4.779129040742457e-05,
"loss": 0.7856,
"step": 42500
},
{
"epoch": 26.65840049597024,
"grad_norm": 0.5381990671157837,
"learning_rate": 4.7165061433062386e-05,
"loss": 0.7863,
"step": 43000
},
{
"epoch": 26.968381897086175,
"grad_norm": 0.546461820602417,
"learning_rate": 4.65388324587002e-05,
"loss": 0.7826,
"step": 43500
},
{
"epoch": 27.27836329820211,
"grad_norm": 0.543404757976532,
"learning_rate": 4.591260348433802e-05,
"loss": 0.7796,
"step": 44000
},
{
"epoch": 27.58834469931804,
"grad_norm": 0.5448907613754272,
"learning_rate": 4.528637450997583e-05,
"loss": 0.7796,
"step": 44500
},
{
"epoch": 27.898326100433973,
"grad_norm": 0.5504478216171265,
"learning_rate": 4.466014553561365e-05,
"loss": 0.7761,
"step": 45000
},
{
"epoch": 28.208307501549907,
"grad_norm": 0.544154703617096,
"learning_rate": 4.403391656125146e-05,
"loss": 0.7753,
"step": 45500
},
{
"epoch": 28.51828890266584,
"grad_norm": 0.542306125164032,
"learning_rate": 4.3407687586889274e-05,
"loss": 0.7735,
"step": 46000
},
{
"epoch": 28.828270303781775,
"grad_norm": 0.5549866557121277,
"learning_rate": 4.278145861252709e-05,
"loss": 0.7707,
"step": 46500
},
{
"epoch": 29.138251704897705,
"grad_norm": 0.538090169429779,
"learning_rate": 4.21552296381649e-05,
"loss": 0.7697,
"step": 47000
},
{
"epoch": 29.44823310601364,
"grad_norm": 0.5609955191612244,
"learning_rate": 4.1529000663802714e-05,
"loss": 0.7682,
"step": 47500
},
{
"epoch": 29.758214507129573,
"grad_norm": 0.5595529675483704,
"learning_rate": 4.090277168944053e-05,
"loss": 0.7659,
"step": 48000
},
{
"epoch": 30.068195908245507,
"grad_norm": 0.5461651086807251,
"learning_rate": 4.027654271507834e-05,
"loss": 0.7656,
"step": 48500
},
{
"epoch": 30.378177309361437,
"grad_norm": 0.5438820719718933,
"learning_rate": 3.9650313740716154e-05,
"loss": 0.7625,
"step": 49000
},
{
"epoch": 30.68815871047737,
"grad_norm": 0.5458811521530151,
"learning_rate": 3.902408476635397e-05,
"loss": 0.762,
"step": 49500
},
{
"epoch": 30.998140111593305,
"grad_norm": 0.535521388053894,
"learning_rate": 3.839785579199179e-05,
"loss": 0.7589,
"step": 50000
},
{
"epoch": 31.30812151270924,
"grad_norm": 0.5407618284225464,
"learning_rate": 3.77716268176296e-05,
"loss": 0.7576,
"step": 50500
},
{
"epoch": 31.61810291382517,
"grad_norm": 0.5259741544723511,
"learning_rate": 3.7145397843267415e-05,
"loss": 0.7571,
"step": 51000
},
{
"epoch": 31.928084314941103,
"grad_norm": 0.5338233709335327,
"learning_rate": 3.651916886890523e-05,
"loss": 0.7561,
"step": 51500
},
{
"epoch": 32.23806571605704,
"grad_norm": 0.5369750261306763,
"learning_rate": 3.589293989454304e-05,
"loss": 0.7541,
"step": 52000
},
{
"epoch": 32.54804711717297,
"grad_norm": 0.5418145656585693,
"learning_rate": 3.5266710920180856e-05,
"loss": 0.7521,
"step": 52500
},
{
"epoch": 32.858028518288904,
"grad_norm": 0.533149242401123,
"learning_rate": 3.464048194581867e-05,
"loss": 0.7519,
"step": 53000
},
{
"epoch": 33.16800991940484,
"grad_norm": 0.5384135246276855,
"learning_rate": 3.401425297145648e-05,
"loss": 0.7497,
"step": 53500
},
{
"epoch": 33.47799132052077,
"grad_norm": 0.5323925018310547,
"learning_rate": 3.3388023997094296e-05,
"loss": 0.7485,
"step": 54000
},
{
"epoch": 33.7879727216367,
"grad_norm": 0.535434901714325,
"learning_rate": 3.276179502273211e-05,
"loss": 0.7472,
"step": 54500
},
{
"epoch": 34.09795412275263,
"grad_norm": 0.5496259331703186,
"learning_rate": 3.213556604836992e-05,
"loss": 0.7454,
"step": 55000
},
{
"epoch": 34.40793552386857,
"grad_norm": 0.5429278016090393,
"learning_rate": 3.150933707400774e-05,
"loss": 0.7447,
"step": 55500
},
{
"epoch": 34.7179169249845,
"grad_norm": 0.5489596724510193,
"learning_rate": 3.088310809964556e-05,
"loss": 0.7438,
"step": 56000
},
{
"epoch": 35.027898326100434,
"grad_norm": 0.5510178208351135,
"learning_rate": 3.025687912528337e-05,
"loss": 0.7416,
"step": 56500
},
{
"epoch": 35.33787972721637,
"grad_norm": 0.5540343523025513,
"learning_rate": 2.9630650150921187e-05,
"loss": 0.7401,
"step": 57000
},
{
"epoch": 35.6478611283323,
"grad_norm": 0.551895260810852,
"learning_rate": 2.9004421176559e-05,
"loss": 0.7404,
"step": 57500
},
{
"epoch": 35.957842529448236,
"grad_norm": 0.5412101149559021,
"learning_rate": 2.8378192202196814e-05,
"loss": 0.74,
"step": 58000
},
{
"epoch": 36.26782393056417,
"grad_norm": 0.5450315475463867,
"learning_rate": 2.7751963227834627e-05,
"loss": 0.7386,
"step": 58500
},
{
"epoch": 36.577805331680096,
"grad_norm": 0.5550098419189453,
"learning_rate": 2.712573425347244e-05,
"loss": 0.7382,
"step": 59000
},
{
"epoch": 36.88778673279603,
"grad_norm": 0.5502198338508606,
"learning_rate": 2.6499505279110254e-05,
"loss": 0.7345,
"step": 59500
},
{
"epoch": 37.197768133911964,
"grad_norm": 0.5401105880737305,
"learning_rate": 2.587452876269679e-05,
"loss": 0.7355,
"step": 60000
},
{
"epoch": 37.5077495350279,
"grad_norm": 0.543369710445404,
"learning_rate": 2.5248299788334605e-05,
"loss": 0.7338,
"step": 60500
},
{
"epoch": 37.81773093614383,
"grad_norm": 0.5440373420715332,
"learning_rate": 2.4622070813972422e-05,
"loss": 0.7326,
"step": 61000
},
{
"epoch": 38.127712337259766,
"grad_norm": 0.5450806021690369,
"learning_rate": 2.3995841839610235e-05,
"loss": 0.7315,
"step": 61500
},
{
"epoch": 38.4376937383757,
"grad_norm": 0.5412734746932983,
"learning_rate": 2.336961286524805e-05,
"loss": 0.7301,
"step": 62000
},
{
"epoch": 38.74767513949163,
"grad_norm": 0.5553017854690552,
"learning_rate": 2.274463634883459e-05,
"loss": 0.732,
"step": 62500
},
{
"epoch": 39.05765654060756,
"grad_norm": 0.5467730164527893,
"learning_rate": 2.2118407374472403e-05,
"loss": 0.7289,
"step": 63000
},
{
"epoch": 39.367637941723494,
"grad_norm": 0.551267683506012,
"learning_rate": 2.1492178400110216e-05,
"loss": 0.728,
"step": 63500
},
{
"epoch": 39.67761934283943,
"grad_norm": 0.5391538739204407,
"learning_rate": 2.0865949425748033e-05,
"loss": 0.7276,
"step": 64000
},
{
"epoch": 39.98760074395536,
"grad_norm": 0.5523350238800049,
"learning_rate": 2.0239720451385847e-05,
"loss": 0.7272,
"step": 64500
},
{
"epoch": 40.297582145071296,
"grad_norm": 0.5367141366004944,
"learning_rate": 1.961349147702366e-05,
"loss": 0.726,
"step": 65000
},
{
"epoch": 40.60756354618723,
"grad_norm": 0.5538766980171204,
"learning_rate": 1.8987262502661473e-05,
"loss": 0.7238,
"step": 65500
},
{
"epoch": 40.91754494730316,
"grad_norm": 0.5274632573127747,
"learning_rate": 1.8361033528299287e-05,
"loss": 0.725,
"step": 66000
},
{
"epoch": 41.2275263484191,
"grad_norm": 0.521597146987915,
"learning_rate": 1.7736057011885827e-05,
"loss": 0.7233,
"step": 66500
},
{
"epoch": 41.53750774953503,
"grad_norm": 0.5390001535415649,
"learning_rate": 1.710982803752364e-05,
"loss": 0.7225,
"step": 67000
},
{
"epoch": 41.84748915065096,
"grad_norm": 0.5474331378936768,
"learning_rate": 1.6483599063161458e-05,
"loss": 0.7218,
"step": 67500
},
{
"epoch": 42.15747055176689,
"grad_norm": 0.5352886915206909,
"learning_rate": 1.5858622546747995e-05,
"loss": 0.7213,
"step": 68000
},
{
"epoch": 42.467451952882826,
"grad_norm": 0.540053129196167,
"learning_rate": 1.5232393572385808e-05,
"loss": 0.7204,
"step": 68500
},
{
"epoch": 42.77743335399876,
"grad_norm": 0.5470998883247375,
"learning_rate": 1.4606164598023622e-05,
"loss": 0.721,
"step": 69000
},
{
"epoch": 43.08741475511469,
"grad_norm": 0.5613588094711304,
"learning_rate": 1.3979935623661435e-05,
"loss": 0.7194,
"step": 69500
},
{
"epoch": 43.39739615623063,
"grad_norm": 0.5471562743186951,
"learning_rate": 1.3354959107247974e-05,
"loss": 0.7178,
"step": 70000
},
{
"epoch": 43.70737755734656,
"grad_norm": 0.5386627912521362,
"learning_rate": 1.2728730132885787e-05,
"loss": 0.7184,
"step": 70500
},
{
"epoch": 44.017358958462495,
"grad_norm": 0.5391978621482849,
"learning_rate": 1.2102501158523603e-05,
"loss": 0.7186,
"step": 71000
},
{
"epoch": 44.32734035957843,
"grad_norm": 0.5381629467010498,
"learning_rate": 1.1476272184161418e-05,
"loss": 0.7168,
"step": 71500
},
{
"epoch": 44.637321760694356,
"grad_norm": 0.5467249155044556,
"learning_rate": 1.0850043209799233e-05,
"loss": 0.7162,
"step": 72000
},
{
"epoch": 44.94730316181029,
"grad_norm": 0.5548228025436401,
"learning_rate": 1.0223814235437046e-05,
"loss": 0.7146,
"step": 72500
},
{
"epoch": 45.25728456292622,
"grad_norm": 0.5488151907920837,
"learning_rate": 9.59758526107486e-06,
"loss": 0.7152,
"step": 73000
},
{
"epoch": 45.56726596404216,
"grad_norm": 0.5473387241363525,
"learning_rate": 8.971356286712675e-06,
"loss": 0.7142,
"step": 73500
},
{
"epoch": 45.87724736515809,
"grad_norm": 0.5331913828849792,
"learning_rate": 8.345127312350489e-06,
"loss": 0.7155,
"step": 74000
},
{
"epoch": 46.187228766274025,
"grad_norm": 0.5443392395973206,
"learning_rate": 7.718898337988302e-06,
"loss": 0.7136,
"step": 74500
},
{
"epoch": 46.49721016738996,
"grad_norm": 0.5461409091949463,
"learning_rate": 7.092669363626117e-06,
"loss": 0.7148,
"step": 75000
},
{
"epoch": 46.80719156850589,
"grad_norm": 0.5504785180091858,
"learning_rate": 6.466440389263931e-06,
"loss": 0.7133,
"step": 75500
},
{
"epoch": 47.11717296962182,
"grad_norm": 0.5478015542030334,
"learning_rate": 5.840211414901745e-06,
"loss": 0.7125,
"step": 76000
},
{
"epoch": 47.42715437073775,
"grad_norm": 0.5464319586753845,
"learning_rate": 5.2139824405395585e-06,
"loss": 0.7125,
"step": 76500
},
{
"epoch": 47.73713577185369,
"grad_norm": 0.5370163321495056,
"learning_rate": 4.587753466177374e-06,
"loss": 0.7117,
"step": 77000
},
{
"epoch": 48.04711717296962,
"grad_norm": 0.5529221892356873,
"learning_rate": 3.961524491815188e-06,
"loss": 0.711,
"step": 77500
},
{
"epoch": 48.357098574085555,
"grad_norm": 0.549679160118103,
"learning_rate": 3.3352955174530015e-06,
"loss": 0.7112,
"step": 78000
},
{
"epoch": 48.66707997520149,
"grad_norm": 0.5416662096977234,
"learning_rate": 2.709066543090816e-06,
"loss": 0.7112,
"step": 78500
},
{
"epoch": 48.97706137631742,
"grad_norm": 0.5428098440170288,
"learning_rate": 2.08283756872863e-06,
"loss": 0.7109,
"step": 79000
},
{
"epoch": 49.287042777433356,
"grad_norm": 0.5247154235839844,
"learning_rate": 1.4566085943664442e-06,
"loss": 0.7106,
"step": 79500
},
{
"epoch": 49.59702417854929,
"grad_norm": 0.5486724376678467,
"learning_rate": 8.303796200042584e-07,
"loss": 0.7097,
"step": 80000
},
{
"epoch": 49.90700557966522,
"grad_norm": 0.5495786070823669,
"learning_rate": 2.0415064564207257e-07,
"loss": 0.7106,
"step": 80500
},
{
"epoch": 50.0,
"step": 80650,
"total_flos": 2.052104150815488e+18,
"train_loss": 0.04098836247254364,
"train_runtime": 10357.3823,
"train_samples_per_second": 11959.61,
"train_steps_per_second": 7.787
}
],
"logging_steps": 500,
"max_steps": 80650,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.052104150815488e+18,
"train_batch_size": 192,
"trial_name": null,
"trial_params": null
}