{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 50.0,
  "eval_steps": 500,
  "global_step": 80650,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.30998140111593303,
      "grad_norm": 0.3543250262737274,
      "learning_rate": 6.195786864931847e-05,
      "loss": 9.0345,
      "step": 500
    },
    {
      "epoch": 0.6199628022318661,
      "grad_norm": 0.5106557607650757,
      "learning_rate": 9.97582756158962e-05,
      "loss": 6.2184,
      "step": 1000
    },
    {
      "epoch": 0.9299442033477991,
      "grad_norm": 2.6617751121520996,
      "learning_rate": 9.913204664153402e-05,
      "loss": 5.4194,
      "step": 1500
    },
    {
      "epoch": 1.2399256044637321,
      "grad_norm": 1.8096632957458496,
      "learning_rate": 9.850581766717182e-05,
      "loss": 3.915,
      "step": 2000
    },
    {
      "epoch": 1.5499070055796653,
      "grad_norm": 1.2520173788070679,
      "learning_rate": 9.787958869280964e-05,
      "loss": 2.7963,
      "step": 2500
    },
    {
      "epoch": 1.8598884066955983,
      "grad_norm": 0.8099603056907654,
      "learning_rate": 9.725335971844745e-05,
      "loss": 2.2568,
      "step": 3000
    },
    {
      "epoch": 2.1698698078115313,
      "grad_norm": 0.7233591079711914,
      "learning_rate": 9.662713074408527e-05,
      "loss": 1.9847,
      "step": 3500
    },
    {
      "epoch": 2.4798512089274642,
      "grad_norm": 0.6427165865898132,
      "learning_rate": 9.600090176972308e-05,
      "loss": 1.8216,
      "step": 4000
    },
    {
      "epoch": 2.7898326100433977,
      "grad_norm": 0.6729193925857544,
      "learning_rate": 9.53746727953609e-05,
      "loss": 1.7067,
      "step": 4500
    },
    {
      "epoch": 3.0998140111593306,
      "grad_norm": 0.6484789848327637,
      "learning_rate": 9.47484438209987e-05,
      "loss": 1.6187,
      "step": 5000
    },
    {
      "epoch": 3.4097954122752636,
      "grad_norm": 0.5950448513031006,
      "learning_rate": 9.412221484663653e-05,
      "loss": 1.5479,
      "step": 5500
    },
    {
      "epoch": 3.7197768133911966,
      "grad_norm": 0.6102598309516907,
      "learning_rate": 9.349598587227433e-05,
      "loss": 1.4879,
      "step": 6000
    },
    {
      "epoch": 4.02975821450713,
      "grad_norm": 0.6204754710197449,
      "learning_rate": 9.286975689791215e-05,
      "loss": 1.4379,
      "step": 6500
    },
    {
      "epoch": 4.3397396156230625,
      "grad_norm": 0.590217649936676,
      "learning_rate": 9.224352792354997e-05,
      "loss": 1.3926,
      "step": 7000
    },
    {
      "epoch": 4.6497210167389955,
      "grad_norm": 0.6062743663787842,
      "learning_rate": 9.161729894918779e-05,
      "loss": 1.3553,
      "step": 7500
    },
    {
      "epoch": 4.9597024178549285,
      "grad_norm": 0.5663708448410034,
      "learning_rate": 9.09910699748256e-05,
      "loss": 1.3201,
      "step": 8000
    },
    {
      "epoch": 5.2696838189708615,
      "grad_norm": 0.5806947350502014,
      "learning_rate": 9.036484100046342e-05,
      "loss": 1.2904,
      "step": 8500
    },
    {
      "epoch": 5.579665220086794,
      "grad_norm": 0.6131803393363953,
      "learning_rate": 8.973861202610123e-05,
      "loss": 1.2623,
      "step": 9000
    },
    {
      "epoch": 5.889646621202727,
      "grad_norm": 0.5666236281394958,
      "learning_rate": 8.911238305173905e-05,
      "loss": 1.2368,
      "step": 9500
    },
    {
      "epoch": 6.199628022318661,
      "grad_norm": 0.6078547239303589,
      "learning_rate": 8.848615407737685e-05,
      "loss": 1.212,
      "step": 10000
    },
    {
      "epoch": 6.509609423434594,
      "grad_norm": 0.575513482093811,
      "learning_rate": 8.785992510301467e-05,
      "loss": 1.1914,
      "step": 10500
    },
    {
      "epoch": 6.819590824550527,
      "grad_norm": 0.5826976895332336,
      "learning_rate": 8.723369612865248e-05,
      "loss": 1.1718,
      "step": 11000
    },
    {
      "epoch": 7.12957222566646,
      "grad_norm": 0.544598400592804,
      "learning_rate": 8.66074671542903e-05,
      "loss": 1.1548,
      "step": 11500
    },
    {
      "epoch": 7.439553626782393,
      "grad_norm": 0.5824791193008423,
      "learning_rate": 8.598123817992811e-05,
      "loss": 1.1363,
      "step": 12000
    },
    {
      "epoch": 7.749535027898326,
      "grad_norm": 0.5747692584991455,
      "learning_rate": 8.535500920556593e-05,
      "loss": 1.1211,
      "step": 12500
    },
    {
      "epoch": 8.05951642901426,
      "grad_norm": 0.5473280549049377,
      "learning_rate": 8.472878023120375e-05,
      "loss": 1.1077,
      "step": 13000
    },
    {
      "epoch": 8.369497830130193,
      "grad_norm": 0.5574379563331604,
      "learning_rate": 8.410255125684155e-05,
      "loss": 1.0908,
      "step": 13500
    },
    {
      "epoch": 8.679479231246125,
      "grad_norm": 0.5424452424049377,
      "learning_rate": 8.347632228247937e-05,
      "loss": 1.0785,
      "step": 14000
    },
    {
      "epoch": 8.989460632362059,
      "grad_norm": 0.5508283376693726,
      "learning_rate": 8.285009330811718e-05,
      "loss": 1.0683,
      "step": 14500
    },
    {
      "epoch": 9.299442033477991,
      "grad_norm": 0.5519115924835205,
      "learning_rate": 8.2223864333755e-05,
      "loss": 1.0537,
      "step": 15000
    },
    {
      "epoch": 9.609423434593925,
      "grad_norm": 0.5510475039482117,
      "learning_rate": 8.159763535939281e-05,
      "loss": 1.0443,
      "step": 15500
    },
    {
      "epoch": 9.919404835709857,
      "grad_norm": 0.5631123185157776,
      "learning_rate": 8.097140638503063e-05,
      "loss": 1.0339,
      "step": 16000
    },
    {
      "epoch": 10.22938623682579,
      "grad_norm": 0.5705382823944092,
      "learning_rate": 8.034517741066844e-05,
      "loss": 1.0217,
      "step": 16500
    },
    {
      "epoch": 10.539367637941723,
      "grad_norm": 0.5316577553749084,
      "learning_rate": 7.971894843630626e-05,
      "loss": 1.0151,
      "step": 17000
    },
    {
      "epoch": 10.849349039057657,
      "grad_norm": 0.5557442307472229,
      "learning_rate": 7.909271946194406e-05,
      "loss": 1.0043,
      "step": 17500
    },
    {
      "epoch": 11.159330440173589,
      "grad_norm": 0.5498985648155212,
      "learning_rate": 7.846649048758188e-05,
      "loss": 0.9951,
      "step": 18000
    },
    {
      "epoch": 11.469311841289523,
      "grad_norm": 0.552780032157898,
      "learning_rate": 7.784026151321969e-05,
      "loss": 0.9855,
      "step": 18500
    },
    {
      "epoch": 11.779293242405455,
      "grad_norm": 0.5406888127326965,
      "learning_rate": 7.721403253885752e-05,
      "loss": 0.9795,
      "step": 19000
    },
    {
      "epoch": 12.089274643521389,
      "grad_norm": 0.537375271320343,
      "learning_rate": 7.658780356449533e-05,
      "loss": 0.971,
      "step": 19500
    },
    {
      "epoch": 12.399256044637323,
      "grad_norm": 0.5666614174842834,
      "learning_rate": 7.596157459013315e-05,
      "loss": 0.9643,
      "step": 20000
    },
    {
      "epoch": 12.709237445753255,
      "grad_norm": 0.5302731990814209,
      "learning_rate": 7.533659807371968e-05,
      "loss": 0.9582,
      "step": 20500
    },
    {
      "epoch": 13.019218846869189,
      "grad_norm": 0.5608243346214294,
      "learning_rate": 7.471036909935749e-05,
      "loss": 0.9512,
      "step": 21000
    },
    {
      "epoch": 13.32920024798512,
      "grad_norm": 0.5309119820594788,
      "learning_rate": 7.408414012499531e-05,
      "loss": 0.9424,
      "step": 21500
    },
    {
      "epoch": 13.639181649101054,
      "grad_norm": 0.5380939245223999,
      "learning_rate": 7.345791115063312e-05,
      "loss": 0.9383,
      "step": 22000
    },
    {
      "epoch": 13.949163050216987,
      "grad_norm": 0.5440984964370728,
      "learning_rate": 7.283168217627094e-05,
      "loss": 0.9298,
      "step": 22500
    },
    {
      "epoch": 14.25914445133292,
      "grad_norm": 0.5377441048622131,
      "learning_rate": 7.220545320190874e-05,
      "loss": 0.9245,
      "step": 23000
    },
    {
      "epoch": 14.569125852448852,
      "grad_norm": 0.5402495265007019,
      "learning_rate": 7.157922422754656e-05,
      "loss": 0.9196,
      "step": 23500
    },
    {
      "epoch": 14.879107253564786,
      "grad_norm": 0.5610705018043518,
      "learning_rate": 7.095299525318437e-05,
      "loss": 0.9146,
      "step": 24000
    },
    {
      "epoch": 15.189088654680718,
      "grad_norm": 0.5305636525154114,
      "learning_rate": 7.032676627882219e-05,
      "loss": 0.9071,
      "step": 24500
    },
    {
      "epoch": 15.499070055796652,
      "grad_norm": 0.5398979187011719,
      "learning_rate": 6.970053730446e-05,
      "loss": 0.9037,
      "step": 25000
    },
    {
      "epoch": 15.809051456912584,
      "grad_norm": 0.5490283370018005,
      "learning_rate": 6.907556078804655e-05,
      "loss": 0.8982,
      "step": 25500
    },
    {
      "epoch": 16.11903285802852,
      "grad_norm": 0.5505014061927795,
      "learning_rate": 6.844933181368435e-05,
      "loss": 0.8933,
      "step": 26000
    },
    {
      "epoch": 16.429014259144452,
      "grad_norm": 0.5260488390922546,
      "learning_rate": 6.782310283932217e-05,
      "loss": 0.8865,
      "step": 26500
    },
    {
      "epoch": 16.738995660260386,
      "grad_norm": 0.5459970235824585,
      "learning_rate": 6.719687386495999e-05,
      "loss": 0.8837,
      "step": 27000
    },
    {
      "epoch": 17.048977061376316,
      "grad_norm": 0.5260828733444214,
      "learning_rate": 6.657189734854653e-05,
      "loss": 0.8812,
      "step": 27500
    },
    {
      "epoch": 17.35895846249225,
      "grad_norm": 0.531878650188446,
      "learning_rate": 6.594566837418435e-05,
      "loss": 0.874,
      "step": 28000
    },
    {
      "epoch": 17.668939863608184,
      "grad_norm": 0.5373751521110535,
      "learning_rate": 6.531943939982215e-05,
      "loss": 0.8703,
      "step": 28500
    },
    {
      "epoch": 17.978921264724118,
      "grad_norm": 0.5685413479804993,
      "learning_rate": 6.469321042545997e-05,
      "loss": 0.8674,
      "step": 29000
    },
    {
      "epoch": 18.288902665840048,
      "grad_norm": 0.5405117273330688,
      "learning_rate": 6.406698145109778e-05,
      "loss": 0.8618,
      "step": 29500
    },
    {
      "epoch": 18.598884066955982,
      "grad_norm": 0.5303318500518799,
      "learning_rate": 6.344325739263305e-05,
      "loss": 0.8572,
      "step": 30000
    },
    {
      "epoch": 18.908865468071916,
      "grad_norm": 0.5173208117485046,
      "learning_rate": 6.281702841827086e-05,
      "loss": 0.8552,
      "step": 30500
    },
    {
      "epoch": 19.21884686918785,
      "grad_norm": 0.5334449410438538,
      "learning_rate": 6.219079944390868e-05,
      "loss": 0.8494,
      "step": 31000
    },
    {
      "epoch": 19.52882827030378,
      "grad_norm": 0.5522080659866333,
      "learning_rate": 6.156457046954649e-05,
      "loss": 0.8464,
      "step": 31500
    },
    {
      "epoch": 19.838809671419714,
      "grad_norm": 0.5295758247375488,
      "learning_rate": 6.09383414951843e-05,
      "loss": 0.845,
      "step": 32000
    },
    {
      "epoch": 20.148791072535648,
      "grad_norm": 0.5164583325386047,
      "learning_rate": 6.0312112520822115e-05,
      "loss": 0.8395,
      "step": 32500
    },
    {
      "epoch": 20.45877247365158,
      "grad_norm": 0.5620171427726746,
      "learning_rate": 5.968713600440865e-05,
      "loss": 0.8354,
      "step": 33000
    },
    {
      "epoch": 20.768753874767516,
      "grad_norm": 0.5254458785057068,
      "learning_rate": 5.906090703004646e-05,
      "loss": 0.8336,
      "step": 33500
    },
    {
      "epoch": 21.078735275883446,
      "grad_norm": 0.5437597632408142,
      "learning_rate": 5.8434678055684276e-05,
      "loss": 0.8304,
      "step": 34000
    },
    {
      "epoch": 21.38871667699938,
      "grad_norm": 0.5438856482505798,
      "learning_rate": 5.78084490813221e-05,
      "loss": 0.8263,
      "step": 34500
    },
    {
      "epoch": 21.698698078115314,
      "grad_norm": 0.5386750102043152,
      "learning_rate": 5.7182220106959916e-05,
      "loss": 0.8248,
      "step": 35000
    },
    {
      "epoch": 22.008679479231247,
      "grad_norm": 0.5307642817497253,
      "learning_rate": 5.655724359054645e-05,
      "loss": 0.8223,
      "step": 35500
    },
    {
      "epoch": 22.318660880347178,
      "grad_norm": 0.5404214859008789,
      "learning_rate": 5.5931014616184264e-05,
      "loss": 0.8176,
      "step": 36000
    },
    {
      "epoch": 22.62864228146311,
      "grad_norm": 0.555665910243988,
      "learning_rate": 5.530478564182208e-05,
      "loss": 0.8164,
      "step": 36500
    },
    {
      "epoch": 22.938623682579045,
      "grad_norm": 0.5331476330757141,
      "learning_rate": 5.467855666745989e-05,
      "loss": 0.8135,
      "step": 37000
    },
    {
      "epoch": 23.24860508369498,
      "grad_norm": 0.541491687297821,
      "learning_rate": 5.405358015104644e-05,
      "loss": 0.8097,
      "step": 37500
    },
    {
      "epoch": 23.55858648481091,
      "grad_norm": 0.5554507374763489,
      "learning_rate": 5.342735117668425e-05,
      "loss": 0.8074,
      "step": 38000
    },
    {
      "epoch": 23.868567885926844,
      "grad_norm": 0.5485785007476807,
      "learning_rate": 5.2801122202322065e-05,
      "loss": 0.8054,
      "step": 38500
    },
    {
      "epoch": 24.178549287042777,
      "grad_norm": 0.5320767164230347,
      "learning_rate": 5.217489322795988e-05,
      "loss": 0.8018,
      "step": 39000
    },
    {
      "epoch": 24.48853068815871,
      "grad_norm": 0.5248667001724243,
      "learning_rate": 5.154866425359769e-05,
      "loss": 0.8008,
      "step": 39500
    },
    {
      "epoch": 24.798512089274645,
      "grad_norm": 0.5368346571922302,
      "learning_rate": 5.0922435279235505e-05,
      "loss": 0.7975,
      "step": 40000
    },
    {
      "epoch": 25.108493490390575,
      "grad_norm": 0.53144371509552,
      "learning_rate": 5.029620630487332e-05,
      "loss": 0.7947,
      "step": 40500
    },
    {
      "epoch": 25.41847489150651,
      "grad_norm": 0.5482547879219055,
      "learning_rate": 4.966997733051113e-05,
      "loss": 0.793,
      "step": 41000
    },
    {
      "epoch": 25.728456292622443,
      "grad_norm": 0.5446964502334595,
      "learning_rate": 4.9043748356148946e-05,
      "loss": 0.7905,
      "step": 41500
    },
    {
      "epoch": 26.038437693738377,
      "grad_norm": 0.5257270932197571,
      "learning_rate": 4.841751938178676e-05,
      "loss": 0.7892,
      "step": 42000
    },
    {
      "epoch": 26.348419094854307,
      "grad_norm": 0.5478941202163696,
      "learning_rate": 4.779129040742457e-05,
      "loss": 0.7856,
      "step": 42500
    },
    {
      "epoch": 26.65840049597024,
      "grad_norm": 0.5381990671157837,
      "learning_rate": 4.7165061433062386e-05,
      "loss": 0.7863,
      "step": 43000
    },
    {
      "epoch": 26.968381897086175,
      "grad_norm": 0.546461820602417,
      "learning_rate": 4.65388324587002e-05,
      "loss": 0.7826,
      "step": 43500
    },
    {
      "epoch": 27.27836329820211,
      "grad_norm": 0.543404757976532,
      "learning_rate": 4.591260348433802e-05,
      "loss": 0.7796,
      "step": 44000
    },
    {
      "epoch": 27.58834469931804,
      "grad_norm": 0.5448907613754272,
      "learning_rate": 4.528637450997583e-05,
      "loss": 0.7796,
      "step": 44500
    },
    {
      "epoch": 27.898326100433973,
      "grad_norm": 0.5504478216171265,
      "learning_rate": 4.466014553561365e-05,
      "loss": 0.7761,
      "step": 45000
    },
    {
      "epoch": 28.208307501549907,
      "grad_norm": 0.544154703617096,
      "learning_rate": 4.403391656125146e-05,
      "loss": 0.7753,
      "step": 45500
    },
    {
      "epoch": 28.51828890266584,
      "grad_norm": 0.542306125164032,
      "learning_rate": 4.3407687586889274e-05,
      "loss": 0.7735,
      "step": 46000
    },
    {
      "epoch": 28.828270303781775,
      "grad_norm": 0.5549866557121277,
      "learning_rate": 4.278145861252709e-05,
      "loss": 0.7707,
      "step": 46500
    },
    {
      "epoch": 29.138251704897705,
      "grad_norm": 0.538090169429779,
      "learning_rate": 4.21552296381649e-05,
      "loss": 0.7697,
      "step": 47000
    },
    {
      "epoch": 29.44823310601364,
      "grad_norm": 0.5609955191612244,
      "learning_rate": 4.1529000663802714e-05,
      "loss": 0.7682,
      "step": 47500
    },
    {
      "epoch": 29.758214507129573,
      "grad_norm": 0.5595529675483704,
      "learning_rate": 4.090277168944053e-05,
      "loss": 0.7659,
      "step": 48000
    },
    {
      "epoch": 30.068195908245507,
      "grad_norm": 0.5461651086807251,
      "learning_rate": 4.027654271507834e-05,
      "loss": 0.7656,
      "step": 48500
    },
    {
      "epoch": 30.378177309361437,
      "grad_norm": 0.5438820719718933,
      "learning_rate": 3.9650313740716154e-05,
      "loss": 0.7625,
      "step": 49000
    },
    {
      "epoch": 30.68815871047737,
      "grad_norm": 0.5458811521530151,
      "learning_rate": 3.902408476635397e-05,
      "loss": 0.762,
      "step": 49500
    },
    {
      "epoch": 30.998140111593305,
      "grad_norm": 0.535521388053894,
      "learning_rate": 3.839785579199179e-05,
      "loss": 0.7589,
      "step": 50000
    },
    {
      "epoch": 31.30812151270924,
      "grad_norm": 0.5407618284225464,
      "learning_rate": 3.77716268176296e-05,
      "loss": 0.7576,
      "step": 50500
    },
    {
      "epoch": 31.61810291382517,
      "grad_norm": 0.5259741544723511,
      "learning_rate": 3.7145397843267415e-05,
      "loss": 0.7571,
      "step": 51000
    },
    {
      "epoch": 31.928084314941103,
      "grad_norm": 0.5338233709335327,
      "learning_rate": 3.651916886890523e-05,
      "loss": 0.7561,
      "step": 51500
    },
    {
      "epoch": 32.23806571605704,
      "grad_norm": 0.5369750261306763,
      "learning_rate": 3.589293989454304e-05,
      "loss": 0.7541,
      "step": 52000
    },
    {
      "epoch": 32.54804711717297,
      "grad_norm": 0.5418145656585693,
      "learning_rate": 3.5266710920180856e-05,
      "loss": 0.7521,
      "step": 52500
    },
    {
      "epoch": 32.858028518288904,
      "grad_norm": 0.533149242401123,
      "learning_rate": 3.464048194581867e-05,
      "loss": 0.7519,
      "step": 53000
    },
    {
      "epoch": 33.16800991940484,
      "grad_norm": 0.5384135246276855,
      "learning_rate": 3.401425297145648e-05,
      "loss": 0.7497,
      "step": 53500
    },
    {
      "epoch": 33.47799132052077,
      "grad_norm": 0.5323925018310547,
      "learning_rate": 3.3388023997094296e-05,
      "loss": 0.7485,
      "step": 54000
    },
    {
      "epoch": 33.7879727216367,
      "grad_norm": 0.535434901714325,
      "learning_rate": 3.276179502273211e-05,
      "loss": 0.7472,
      "step": 54500
    },
    {
      "epoch": 34.09795412275263,
      "grad_norm": 0.5496259331703186,
      "learning_rate": 3.213556604836992e-05,
      "loss": 0.7454,
      "step": 55000
    },
    {
      "epoch": 34.40793552386857,
      "grad_norm": 0.5429278016090393,
      "learning_rate": 3.150933707400774e-05,
      "loss": 0.7447,
      "step": 55500
    },
    {
      "epoch": 34.7179169249845,
      "grad_norm": 0.5489596724510193,
      "learning_rate": 3.088310809964556e-05,
      "loss": 0.7438,
      "step": 56000
    },
    {
      "epoch": 35.027898326100434,
      "grad_norm": 0.5510178208351135,
      "learning_rate": 3.025687912528337e-05,
      "loss": 0.7416,
      "step": 56500
    },
    {
      "epoch": 35.33787972721637,
      "grad_norm": 0.5540343523025513,
      "learning_rate": 2.9630650150921187e-05,
      "loss": 0.7401,
      "step": 57000
    },
    {
      "epoch": 35.6478611283323,
      "grad_norm": 0.551895260810852,
      "learning_rate": 2.9004421176559e-05,
      "loss": 0.7404,
      "step": 57500
    },
    {
      "epoch": 35.957842529448236,
      "grad_norm": 0.5412101149559021,
      "learning_rate": 2.8378192202196814e-05,
      "loss": 0.74,
      "step": 58000
    },
    {
      "epoch": 36.26782393056417,
      "grad_norm": 0.5450315475463867,
      "learning_rate": 2.7751963227834627e-05,
      "loss": 0.7386,
      "step": 58500
    },
    {
      "epoch": 36.577805331680096,
      "grad_norm": 0.5550098419189453,
      "learning_rate": 2.712573425347244e-05,
      "loss": 0.7382,
      "step": 59000
    },
    {
      "epoch": 36.88778673279603,
      "grad_norm": 0.5502198338508606,
      "learning_rate": 2.6499505279110254e-05,
      "loss": 0.7345,
      "step": 59500
    },
    {
      "epoch": 37.197768133911964,
      "grad_norm": 0.5401105880737305,
      "learning_rate": 2.587452876269679e-05,
      "loss": 0.7355,
      "step": 60000
    },
    {
      "epoch": 37.5077495350279,
      "grad_norm": 0.543369710445404,
      "learning_rate": 2.5248299788334605e-05,
      "loss": 0.7338,
      "step": 60500
    },
    {
      "epoch": 37.81773093614383,
      "grad_norm": 0.5440373420715332,
      "learning_rate": 2.4622070813972422e-05,
      "loss": 0.7326,
      "step": 61000
    },
    {
      "epoch": 38.127712337259766,
      "grad_norm": 0.5450806021690369,
      "learning_rate": 2.3995841839610235e-05,
      "loss": 0.7315,
      "step": 61500
    },
    {
      "epoch": 38.4376937383757,
      "grad_norm": 0.5412734746932983,
      "learning_rate": 2.336961286524805e-05,
      "loss": 0.7301,
      "step": 62000
    },
    {
      "epoch": 38.74767513949163,
      "grad_norm": 0.5553017854690552,
      "learning_rate": 2.274463634883459e-05,
      "loss": 0.732,
      "step": 62500
    },
    {
      "epoch": 39.05765654060756,
      "grad_norm": 0.5467730164527893,
      "learning_rate": 2.2118407374472403e-05,
      "loss": 0.7289,
      "step": 63000
    },
    {
      "epoch": 39.367637941723494,
      "grad_norm": 0.551267683506012,
      "learning_rate": 2.1492178400110216e-05,
      "loss": 0.728,
      "step": 63500
    },
    {
      "epoch": 39.67761934283943,
      "grad_norm": 0.5391538739204407,
      "learning_rate": 2.0865949425748033e-05,
      "loss": 0.7276,
      "step": 64000
    },
    {
      "epoch": 39.98760074395536,
      "grad_norm": 0.5523350238800049,
      "learning_rate": 2.0239720451385847e-05,
      "loss": 0.7272,
      "step": 64500
    },
    {
      "epoch": 40.297582145071296,
      "grad_norm": 0.5367141366004944,
      "learning_rate": 1.961349147702366e-05,
      "loss": 0.726,
      "step": 65000
    },
    {
      "epoch": 40.60756354618723,
      "grad_norm": 0.5538766980171204,
      "learning_rate": 1.8987262502661473e-05,
      "loss": 0.7238,
      "step": 65500
    },
    {
      "epoch": 40.91754494730316,
      "grad_norm": 0.5274632573127747,
      "learning_rate": 1.8361033528299287e-05,
      "loss": 0.725,
      "step": 66000
    },
    {
      "epoch": 41.2275263484191,
      "grad_norm": 0.521597146987915,
      "learning_rate": 1.7736057011885827e-05,
      "loss": 0.7233,
      "step": 66500
    },
    {
      "epoch": 41.53750774953503,
      "grad_norm": 0.5390001535415649,
      "learning_rate": 1.710982803752364e-05,
      "loss": 0.7225,
      "step": 67000
    },
    {
      "epoch": 41.84748915065096,
      "grad_norm": 0.5474331378936768,
      "learning_rate": 1.6483599063161458e-05,
      "loss": 0.7218,
      "step": 67500
    },
    {
      "epoch": 42.15747055176689,
      "grad_norm": 0.5352886915206909,
      "learning_rate": 1.5858622546747995e-05,
      "loss": 0.7213,
      "step": 68000
    },
    {
      "epoch": 42.467451952882826,
      "grad_norm": 0.540053129196167,
      "learning_rate": 1.5232393572385808e-05,
      "loss": 0.7204,
      "step": 68500
    },
    {
      "epoch": 42.77743335399876,
      "grad_norm": 0.5470998883247375,
      "learning_rate": 1.4606164598023622e-05,
      "loss": 0.721,
      "step": 69000
    },
    {
      "epoch": 43.08741475511469,
      "grad_norm": 0.5613588094711304,
      "learning_rate": 1.3979935623661435e-05,
      "loss": 0.7194,
      "step": 69500
    },
    {
      "epoch": 43.39739615623063,
      "grad_norm": 0.5471562743186951,
      "learning_rate": 1.3354959107247974e-05,
      "loss": 0.7178,
      "step": 70000
    },
    {
      "epoch": 43.70737755734656,
      "grad_norm": 0.5386627912521362,
      "learning_rate": 1.2728730132885787e-05,
      "loss": 0.7184,
      "step": 70500
    },
    {
      "epoch": 44.017358958462495,
      "grad_norm": 0.5391978621482849,
      "learning_rate": 1.2102501158523603e-05,
      "loss": 0.7186,
      "step": 71000
    },
    {
      "epoch": 44.32734035957843,
      "grad_norm": 0.5381629467010498,
      "learning_rate": 1.1476272184161418e-05,
      "loss": 0.7168,
      "step": 71500
    },
    {
      "epoch": 44.637321760694356,
      "grad_norm": 0.5467249155044556,
      "learning_rate": 1.0850043209799233e-05,
      "loss": 0.7162,
      "step": 72000
    },
    {
      "epoch": 44.94730316181029,
      "grad_norm": 0.5548228025436401,
      "learning_rate": 1.0223814235437046e-05,
      "loss": 0.7146,
      "step": 72500
    },
    {
      "epoch": 45.25728456292622,
      "grad_norm": 0.5488151907920837,
      "learning_rate": 9.59758526107486e-06,
      "loss": 0.7152,
      "step": 73000
    },
    {
      "epoch": 45.56726596404216,
      "grad_norm": 0.5473387241363525,
      "learning_rate": 8.971356286712675e-06,
      "loss": 0.7142,
      "step": 73500
    },
    {
      "epoch": 45.87724736515809,
      "grad_norm": 0.5331913828849792,
      "learning_rate": 8.345127312350489e-06,
      "loss": 0.7155,
      "step": 74000
    },
    {
      "epoch": 46.187228766274025,
      "grad_norm": 0.5443392395973206,
      "learning_rate": 7.718898337988302e-06,
      "loss": 0.7136,
      "step": 74500
    },
    {
      "epoch": 46.49721016738996,
      "grad_norm": 0.5461409091949463,
      "learning_rate": 7.092669363626117e-06,
      "loss": 0.7148,
      "step": 75000
    },
    {
      "epoch": 46.80719156850589,
      "grad_norm": 0.5504785180091858,
      "learning_rate": 6.466440389263931e-06,
      "loss": 0.7133,
      "step": 75500
    },
    {
      "epoch": 47.11717296962182,
      "grad_norm": 0.5478015542030334,
      "learning_rate": 5.840211414901745e-06,
      "loss": 0.7125,
      "step": 76000
    },
    {
      "epoch": 47.42715437073775,
      "grad_norm": 0.5464319586753845,
      "learning_rate": 5.2139824405395585e-06,
      "loss": 0.7125,
      "step": 76500
    },
    {
      "epoch": 47.73713577185369,
      "grad_norm": 0.5370163321495056,
      "learning_rate": 4.587753466177374e-06,
      "loss": 0.7117,
      "step": 77000
    },
    {
      "epoch": 48.04711717296962,
      "grad_norm": 0.5529221892356873,
      "learning_rate": 3.961524491815188e-06,
      "loss": 0.711,
      "step": 77500
    },
    {
      "epoch": 48.357098574085555,
      "grad_norm": 0.549679160118103,
      "learning_rate": 3.3352955174530015e-06,
      "loss": 0.7112,
      "step": 78000
    },
    {
      "epoch": 48.66707997520149,
      "grad_norm": 0.5416662096977234,
      "learning_rate": 2.709066543090816e-06,
      "loss": 0.7112,
      "step": 78500
    },
    {
      "epoch": 48.97706137631742,
      "grad_norm": 0.5428098440170288,
      "learning_rate": 2.08283756872863e-06,
      "loss": 0.7109,
      "step": 79000
    },
    {
      "epoch": 49.287042777433356,
      "grad_norm": 0.5247154235839844,
      "learning_rate": 1.4566085943664442e-06,
      "loss": 0.7106,
      "step": 79500
    },
    {
      "epoch": 49.59702417854929,
      "grad_norm": 0.5486724376678467,
      "learning_rate": 8.303796200042584e-07,
      "loss": 0.7097,
      "step": 80000
    },
    {
      "epoch": 49.90700557966522,
      "grad_norm": 0.5495786070823669,
      "learning_rate": 2.0415064564207257e-07,
      "loss": 0.7106,
      "step": 80500
    },
    {
      "epoch": 50.0,
      "step": 80650,
      "total_flos": 2.052104150815488e+18,
      "train_loss": 0.04098836247254364,
      "train_runtime": 10357.3823,
      "train_samples_per_second": 11959.61,
      "train_steps_per_second": 7.787
    }
  ],
  "logging_steps": 500,
  "max_steps": 80650,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.052104150815488e+18,
  "train_batch_size": 192,
  "trial_name": null,
  "trial_params": null
}