{ "best_metric": null, "best_model_checkpoint": null, "epoch": 50.0, "eval_steps": 500, "global_step": 80650, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.30998140111593303, "grad_norm": 0.3543250262737274, "learning_rate": 6.195786864931847e-05, "loss": 9.0345, "step": 500 }, { "epoch": 0.6199628022318661, "grad_norm": 0.5106557607650757, "learning_rate": 9.97582756158962e-05, "loss": 6.2184, "step": 1000 }, { "epoch": 0.9299442033477991, "grad_norm": 2.6617751121520996, "learning_rate": 9.913204664153402e-05, "loss": 5.4194, "step": 1500 }, { "epoch": 1.2399256044637321, "grad_norm": 1.8096632957458496, "learning_rate": 9.850581766717182e-05, "loss": 3.915, "step": 2000 }, { "epoch": 1.5499070055796653, "grad_norm": 1.2520173788070679, "learning_rate": 9.787958869280964e-05, "loss": 2.7963, "step": 2500 }, { "epoch": 1.8598884066955983, "grad_norm": 0.8099603056907654, "learning_rate": 9.725335971844745e-05, "loss": 2.2568, "step": 3000 }, { "epoch": 2.1698698078115313, "grad_norm": 0.7233591079711914, "learning_rate": 9.662713074408527e-05, "loss": 1.9847, "step": 3500 }, { "epoch": 2.4798512089274642, "grad_norm": 0.6427165865898132, "learning_rate": 9.600090176972308e-05, "loss": 1.8216, "step": 4000 }, { "epoch": 2.7898326100433977, "grad_norm": 0.6729193925857544, "learning_rate": 9.53746727953609e-05, "loss": 1.7067, "step": 4500 }, { "epoch": 3.0998140111593306, "grad_norm": 0.6484789848327637, "learning_rate": 9.47484438209987e-05, "loss": 1.6187, "step": 5000 }, { "epoch": 3.4097954122752636, "grad_norm": 0.5950448513031006, "learning_rate": 9.412221484663653e-05, "loss": 1.5479, "step": 5500 }, { "epoch": 3.7197768133911966, "grad_norm": 0.6102598309516907, "learning_rate": 9.349598587227433e-05, "loss": 1.4879, "step": 6000 }, { "epoch": 4.02975821450713, "grad_norm": 0.6204754710197449, "learning_rate": 9.286975689791215e-05, "loss": 1.4379, "step": 6500 }, { "epoch": 4.3397396156230625, "grad_norm": 0.590217649936676, "learning_rate": 9.224352792354997e-05, "loss": 1.3926, "step": 7000 }, { "epoch": 4.6497210167389955, "grad_norm": 0.6062743663787842, "learning_rate": 9.161729894918779e-05, "loss": 1.3553, "step": 7500 }, { "epoch": 4.9597024178549285, "grad_norm": 0.5663708448410034, "learning_rate": 9.09910699748256e-05, "loss": 1.3201, "step": 8000 }, { "epoch": 5.2696838189708615, "grad_norm": 0.5806947350502014, "learning_rate": 9.036484100046342e-05, "loss": 1.2904, "step": 8500 }, { "epoch": 5.579665220086794, "grad_norm": 0.6131803393363953, "learning_rate": 8.973861202610123e-05, "loss": 1.2623, "step": 9000 }, { "epoch": 5.889646621202727, "grad_norm": 0.5666236281394958, "learning_rate": 8.911238305173905e-05, "loss": 1.2368, "step": 9500 }, { "epoch": 6.199628022318661, "grad_norm": 0.6078547239303589, "learning_rate": 8.848615407737685e-05, "loss": 1.212, "step": 10000 }, { "epoch": 6.509609423434594, "grad_norm": 0.575513482093811, "learning_rate": 8.785992510301467e-05, "loss": 1.1914, "step": 10500 }, { "epoch": 6.819590824550527, "grad_norm": 0.5826976895332336, "learning_rate": 8.723369612865248e-05, "loss": 1.1718, "step": 11000 }, { "epoch": 7.12957222566646, "grad_norm": 0.544598400592804, "learning_rate": 8.66074671542903e-05, "loss": 1.1548, "step": 11500 }, { "epoch": 7.439553626782393, "grad_norm": 0.5824791193008423, "learning_rate": 8.598123817992811e-05, "loss": 1.1363, "step": 12000 }, { "epoch": 7.749535027898326, "grad_norm": 0.5747692584991455, "learning_rate": 8.535500920556593e-05, "loss": 1.1211, "step": 12500 }, { "epoch": 8.05951642901426, "grad_norm": 0.5473280549049377, "learning_rate": 8.472878023120375e-05, "loss": 1.1077, "step": 13000 }, { "epoch": 8.369497830130193, "grad_norm": 0.5574379563331604, "learning_rate": 8.410255125684155e-05, "loss": 1.0908, "step": 13500 }, { "epoch": 8.679479231246125, "grad_norm": 0.5424452424049377, "learning_rate": 8.347632228247937e-05, "loss": 1.0785, "step": 14000 }, { "epoch": 8.989460632362059, "grad_norm": 0.5508283376693726, "learning_rate": 8.285009330811718e-05, "loss": 1.0683, "step": 14500 }, { "epoch": 9.299442033477991, "grad_norm": 0.5519115924835205, "learning_rate": 8.2223864333755e-05, "loss": 1.0537, "step": 15000 }, { "epoch": 9.609423434593925, "grad_norm": 0.5510475039482117, "learning_rate": 8.159763535939281e-05, "loss": 1.0443, "step": 15500 }, { "epoch": 9.919404835709857, "grad_norm": 0.5631123185157776, "learning_rate": 8.097140638503063e-05, "loss": 1.0339, "step": 16000 }, { "epoch": 10.22938623682579, "grad_norm": 0.5705382823944092, "learning_rate": 8.034517741066844e-05, "loss": 1.0217, "step": 16500 }, { "epoch": 10.539367637941723, "grad_norm": 0.5316577553749084, "learning_rate": 7.971894843630626e-05, "loss": 1.0151, "step": 17000 }, { "epoch": 10.849349039057657, "grad_norm": 0.5557442307472229, "learning_rate": 7.909271946194406e-05, "loss": 1.0043, "step": 17500 }, { "epoch": 11.159330440173589, "grad_norm": 0.5498985648155212, "learning_rate": 7.846649048758188e-05, "loss": 0.9951, "step": 18000 }, { "epoch": 11.469311841289523, "grad_norm": 0.552780032157898, "learning_rate": 7.784026151321969e-05, "loss": 0.9855, "step": 18500 }, { "epoch": 11.779293242405455, "grad_norm": 0.5406888127326965, "learning_rate": 7.721403253885752e-05, "loss": 0.9795, "step": 19000 }, { "epoch": 12.089274643521389, "grad_norm": 0.537375271320343, "learning_rate": 7.658780356449533e-05, "loss": 0.971, "step": 19500 }, { "epoch": 12.399256044637323, "grad_norm": 0.5666614174842834, "learning_rate": 7.596157459013315e-05, "loss": 0.9643, "step": 20000 }, { "epoch": 12.709237445753255, "grad_norm": 0.5302731990814209, "learning_rate": 7.533659807371968e-05, "loss": 0.9582, "step": 20500 }, { "epoch": 13.019218846869189, "grad_norm": 0.5608243346214294, "learning_rate": 7.471036909935749e-05, "loss": 0.9512, "step": 21000 }, { "epoch": 13.32920024798512, "grad_norm": 0.5309119820594788, "learning_rate": 7.408414012499531e-05, "loss": 0.9424, "step": 21500 }, { "epoch": 13.639181649101054, "grad_norm": 0.5380939245223999, "learning_rate": 7.345791115063312e-05, "loss": 0.9383, "step": 22000 }, { "epoch": 13.949163050216987, "grad_norm": 0.5440984964370728, "learning_rate": 7.283168217627094e-05, "loss": 0.9298, "step": 22500 }, { "epoch": 14.25914445133292, "grad_norm": 0.5377441048622131, "learning_rate": 7.220545320190874e-05, "loss": 0.9245, "step": 23000 }, { "epoch": 14.569125852448852, "grad_norm": 0.5402495265007019, "learning_rate": 7.157922422754656e-05, "loss": 0.9196, "step": 23500 }, { "epoch": 14.879107253564786, "grad_norm": 0.5610705018043518, "learning_rate": 7.095299525318437e-05, "loss": 0.9146, "step": 24000 }, { "epoch": 15.189088654680718, "grad_norm": 0.5305636525154114, "learning_rate": 7.032676627882219e-05, "loss": 0.9071, "step": 24500 }, { "epoch": 15.499070055796652, "grad_norm": 0.5398979187011719, "learning_rate": 6.970053730446e-05, "loss": 0.9037, "step": 25000 }, { "epoch": 15.809051456912584, "grad_norm": 0.5490283370018005, "learning_rate": 6.907556078804655e-05, "loss": 0.8982, "step": 25500 }, { "epoch": 16.11903285802852, "grad_norm": 0.5505014061927795, "learning_rate": 6.844933181368435e-05, "loss": 0.8933, "step": 26000 }, { "epoch": 16.429014259144452, "grad_norm": 0.5260488390922546, "learning_rate": 6.782310283932217e-05, "loss": 0.8865, "step": 26500 }, { "epoch": 16.738995660260386, "grad_norm": 0.5459970235824585, "learning_rate": 6.719687386495999e-05, "loss": 0.8837, "step": 27000 }, { "epoch": 17.048977061376316, "grad_norm": 0.5260828733444214, "learning_rate": 6.657189734854653e-05, "loss": 0.8812, "step": 27500 }, { "epoch": 17.35895846249225, "grad_norm": 0.531878650188446, "learning_rate": 6.594566837418435e-05, "loss": 0.874, "step": 28000 }, { "epoch": 17.668939863608184, "grad_norm": 0.5373751521110535, "learning_rate": 6.531943939982215e-05, "loss": 0.8703, "step": 28500 }, { "epoch": 17.978921264724118, "grad_norm": 0.5685413479804993, "learning_rate": 6.469321042545997e-05, "loss": 0.8674, "step": 29000 }, { "epoch": 18.288902665840048, "grad_norm": 0.5405117273330688, "learning_rate": 6.406698145109778e-05, "loss": 0.8618, "step": 29500 }, { "epoch": 18.598884066955982, "grad_norm": 0.5303318500518799, "learning_rate": 6.344325739263305e-05, "loss": 0.8572, "step": 30000 }, { "epoch": 18.908865468071916, "grad_norm": 0.5173208117485046, "learning_rate": 6.281702841827086e-05, "loss": 0.8552, "step": 30500 }, { "epoch": 19.21884686918785, "grad_norm": 0.5334449410438538, "learning_rate": 6.219079944390868e-05, "loss": 0.8494, "step": 31000 }, { "epoch": 19.52882827030378, "grad_norm": 0.5522080659866333, "learning_rate": 6.156457046954649e-05, "loss": 0.8464, "step": 31500 }, { "epoch": 19.838809671419714, "grad_norm": 0.5295758247375488, "learning_rate": 6.09383414951843e-05, "loss": 0.845, "step": 32000 }, { "epoch": 20.148791072535648, "grad_norm": 0.5164583325386047, "learning_rate": 6.0312112520822115e-05, "loss": 0.8395, "step": 32500 }, { "epoch": 20.45877247365158, "grad_norm": 0.5620171427726746, "learning_rate": 5.968713600440865e-05, "loss": 0.8354, "step": 33000 }, { "epoch": 20.768753874767516, "grad_norm": 0.5254458785057068, "learning_rate": 5.906090703004646e-05, "loss": 0.8336, "step": 33500 }, { "epoch": 21.078735275883446, "grad_norm": 0.5437597632408142, "learning_rate": 5.8434678055684276e-05, "loss": 0.8304, "step": 34000 }, { "epoch": 21.38871667699938, "grad_norm": 0.5438856482505798, "learning_rate": 5.78084490813221e-05, "loss": 0.8263, "step": 34500 }, { "epoch": 21.698698078115314, "grad_norm": 0.5386750102043152, "learning_rate": 5.7182220106959916e-05, "loss": 0.8248, "step": 35000 }, { "epoch": 22.008679479231247, "grad_norm": 0.5307642817497253, "learning_rate": 5.655724359054645e-05, "loss": 0.8223, "step": 35500 }, { "epoch": 22.318660880347178, "grad_norm": 0.5404214859008789, "learning_rate": 5.5931014616184264e-05, "loss": 0.8176, "step": 36000 }, { "epoch": 22.62864228146311, "grad_norm": 0.555665910243988, "learning_rate": 5.530478564182208e-05, "loss": 0.8164, "step": 36500 }, { "epoch": 22.938623682579045, "grad_norm": 0.5331476330757141, "learning_rate": 5.467855666745989e-05, "loss": 0.8135, "step": 37000 }, { "epoch": 23.24860508369498, "grad_norm": 0.541491687297821, "learning_rate": 5.405358015104644e-05, "loss": 0.8097, "step": 37500 }, { "epoch": 23.55858648481091, "grad_norm": 0.5554507374763489, "learning_rate": 5.342735117668425e-05, "loss": 0.8074, "step": 38000 }, { "epoch": 23.868567885926844, "grad_norm": 0.5485785007476807, "learning_rate": 5.2801122202322065e-05, "loss": 0.8054, "step": 38500 }, { "epoch": 24.178549287042777, "grad_norm": 0.5320767164230347, "learning_rate": 5.217489322795988e-05, "loss": 0.8018, "step": 39000 }, { "epoch": 24.48853068815871, "grad_norm": 0.5248667001724243, "learning_rate": 5.154866425359769e-05, "loss": 0.8008, "step": 39500 }, { "epoch": 24.798512089274645, "grad_norm": 0.5368346571922302, "learning_rate": 5.0922435279235505e-05, "loss": 0.7975, "step": 40000 }, { "epoch": 25.108493490390575, "grad_norm": 0.53144371509552, "learning_rate": 5.029620630487332e-05, "loss": 0.7947, "step": 40500 }, { "epoch": 25.41847489150651, "grad_norm": 0.5482547879219055, "learning_rate": 4.966997733051113e-05, "loss": 0.793, "step": 41000 }, { "epoch": 25.728456292622443, "grad_norm": 0.5446964502334595, "learning_rate": 4.9043748356148946e-05, "loss": 0.7905, "step": 41500 }, { "epoch": 26.038437693738377, "grad_norm": 0.5257270932197571, "learning_rate": 4.841751938178676e-05, "loss": 0.7892, "step": 42000 }, { "epoch": 26.348419094854307, "grad_norm": 0.5478941202163696, "learning_rate": 4.779129040742457e-05, "loss": 0.7856, "step": 42500 }, { "epoch": 26.65840049597024, "grad_norm": 0.5381990671157837, "learning_rate": 4.7165061433062386e-05, "loss": 0.7863, "step": 43000 }, { "epoch": 26.968381897086175, "grad_norm": 0.546461820602417, "learning_rate": 4.65388324587002e-05, "loss": 0.7826, "step": 43500 }, { "epoch": 27.27836329820211, "grad_norm": 0.543404757976532, "learning_rate": 4.591260348433802e-05, "loss": 0.7796, "step": 44000 }, { "epoch": 27.58834469931804, "grad_norm": 0.5448907613754272, "learning_rate": 4.528637450997583e-05, "loss": 0.7796, "step": 44500 }, { "epoch": 27.898326100433973, "grad_norm": 0.5504478216171265, "learning_rate": 4.466014553561365e-05, "loss": 0.7761, "step": 45000 }, { "epoch": 28.208307501549907, "grad_norm": 0.544154703617096, "learning_rate": 4.403391656125146e-05, "loss": 0.7753, "step": 45500 }, { "epoch": 28.51828890266584, "grad_norm": 0.542306125164032, "learning_rate": 4.3407687586889274e-05, "loss": 0.7735, "step": 46000 }, { "epoch": 28.828270303781775, "grad_norm": 0.5549866557121277, "learning_rate": 4.278145861252709e-05, "loss": 0.7707, "step": 46500 }, { "epoch": 29.138251704897705, "grad_norm": 0.538090169429779, "learning_rate": 4.21552296381649e-05, "loss": 0.7697, "step": 47000 }, { "epoch": 29.44823310601364, "grad_norm": 0.5609955191612244, "learning_rate": 4.1529000663802714e-05, "loss": 0.7682, "step": 47500 }, { "epoch": 29.758214507129573, "grad_norm": 0.5595529675483704, "learning_rate": 4.090277168944053e-05, "loss": 0.7659, "step": 48000 }, { "epoch": 30.068195908245507, "grad_norm": 0.5461651086807251, "learning_rate": 4.027654271507834e-05, "loss": 0.7656, "step": 48500 }, { "epoch": 30.378177309361437, "grad_norm": 0.5438820719718933, "learning_rate": 3.9650313740716154e-05, "loss": 0.7625, "step": 49000 }, { "epoch": 30.68815871047737, "grad_norm": 0.5458811521530151, "learning_rate": 3.902408476635397e-05, "loss": 0.762, "step": 49500 }, { "epoch": 30.998140111593305, "grad_norm": 0.535521388053894, "learning_rate": 3.839785579199179e-05, "loss": 0.7589, "step": 50000 }, { "epoch": 31.30812151270924, "grad_norm": 0.5407618284225464, "learning_rate": 3.77716268176296e-05, "loss": 0.7576, "step": 50500 }, { "epoch": 31.61810291382517, "grad_norm": 0.5259741544723511, "learning_rate": 3.7145397843267415e-05, "loss": 0.7571, "step": 51000 }, { "epoch": 31.928084314941103, "grad_norm": 0.5338233709335327, "learning_rate": 3.651916886890523e-05, "loss": 0.7561, "step": 51500 }, { "epoch": 32.23806571605704, "grad_norm": 0.5369750261306763, "learning_rate": 3.589293989454304e-05, "loss": 0.7541, "step": 52000 }, { "epoch": 32.54804711717297, "grad_norm": 0.5418145656585693, "learning_rate": 3.5266710920180856e-05, "loss": 0.7521, "step": 52500 }, { "epoch": 32.858028518288904, "grad_norm": 0.533149242401123, "learning_rate": 3.464048194581867e-05, "loss": 0.7519, "step": 53000 }, { "epoch": 33.16800991940484, "grad_norm": 0.5384135246276855, "learning_rate": 3.401425297145648e-05, "loss": 0.7497, "step": 53500 }, { "epoch": 33.47799132052077, "grad_norm": 0.5323925018310547, "learning_rate": 3.3388023997094296e-05, "loss": 0.7485, "step": 54000 }, { "epoch": 33.7879727216367, "grad_norm": 0.535434901714325, "learning_rate": 3.276179502273211e-05, "loss": 0.7472, "step": 54500 }, { "epoch": 34.09795412275263, "grad_norm": 0.5496259331703186, "learning_rate": 3.213556604836992e-05, "loss": 0.7454, "step": 55000 }, { "epoch": 34.40793552386857, "grad_norm": 0.5429278016090393, "learning_rate": 3.150933707400774e-05, "loss": 0.7447, "step": 55500 }, { "epoch": 34.7179169249845, "grad_norm": 0.5489596724510193, "learning_rate": 3.088310809964556e-05, "loss": 0.7438, "step": 56000 }, { "epoch": 35.027898326100434, "grad_norm": 0.5510178208351135, "learning_rate": 3.025687912528337e-05, "loss": 0.7416, "step": 56500 }, { "epoch": 35.33787972721637, "grad_norm": 0.5540343523025513, "learning_rate": 2.9630650150921187e-05, "loss": 0.7401, "step": 57000 }, { "epoch": 35.6478611283323, "grad_norm": 0.551895260810852, "learning_rate": 2.9004421176559e-05, "loss": 0.7404, "step": 57500 }, { "epoch": 35.957842529448236, "grad_norm": 0.5412101149559021, "learning_rate": 2.8378192202196814e-05, "loss": 0.74, "step": 58000 }, { "epoch": 36.26782393056417, "grad_norm": 0.5450315475463867, "learning_rate": 2.7751963227834627e-05, "loss": 0.7386, "step": 58500 }, { "epoch": 36.577805331680096, "grad_norm": 0.5550098419189453, "learning_rate": 2.712573425347244e-05, "loss": 0.7382, "step": 59000 }, { "epoch": 36.88778673279603, "grad_norm": 0.5502198338508606, "learning_rate": 2.6499505279110254e-05, "loss": 0.7345, "step": 59500 }, { "epoch": 37.197768133911964, "grad_norm": 0.5401105880737305, "learning_rate": 2.587452876269679e-05, "loss": 0.7355, "step": 60000 }, { "epoch": 37.5077495350279, "grad_norm": 0.543369710445404, "learning_rate": 2.5248299788334605e-05, "loss": 0.7338, "step": 60500 }, { "epoch": 37.81773093614383, "grad_norm": 0.5440373420715332, "learning_rate": 2.4622070813972422e-05, "loss": 0.7326, "step": 61000 }, { "epoch": 38.127712337259766, "grad_norm": 0.5450806021690369, "learning_rate": 2.3995841839610235e-05, "loss": 0.7315, "step": 61500 }, { "epoch": 38.4376937383757, "grad_norm": 0.5412734746932983, "learning_rate": 2.336961286524805e-05, "loss": 0.7301, "step": 62000 }, { "epoch": 38.74767513949163, "grad_norm": 0.5553017854690552, "learning_rate": 2.274463634883459e-05, "loss": 0.732, "step": 62500 }, { "epoch": 39.05765654060756, "grad_norm": 0.5467730164527893, "learning_rate": 2.2118407374472403e-05, "loss": 0.7289, "step": 63000 }, { "epoch": 39.367637941723494, "grad_norm": 0.551267683506012, "learning_rate": 2.1492178400110216e-05, "loss": 0.728, "step": 63500 }, { "epoch": 39.67761934283943, "grad_norm": 0.5391538739204407, "learning_rate": 2.0865949425748033e-05, "loss": 0.7276, "step": 64000 }, { "epoch": 39.98760074395536, "grad_norm": 0.5523350238800049, "learning_rate": 2.0239720451385847e-05, "loss": 0.7272, "step": 64500 }, { "epoch": 40.297582145071296, "grad_norm": 0.5367141366004944, "learning_rate": 1.961349147702366e-05, "loss": 0.726, "step": 65000 }, { "epoch": 40.60756354618723, "grad_norm": 0.5538766980171204, "learning_rate": 1.8987262502661473e-05, "loss": 0.7238, "step": 65500 }, { "epoch": 40.91754494730316, "grad_norm": 0.5274632573127747, "learning_rate": 1.8361033528299287e-05, "loss": 0.725, "step": 66000 }, { "epoch": 41.2275263484191, "grad_norm": 0.521597146987915, "learning_rate": 1.7736057011885827e-05, "loss": 0.7233, "step": 66500 }, { "epoch": 41.53750774953503, "grad_norm": 0.5390001535415649, "learning_rate": 1.710982803752364e-05, "loss": 0.7225, "step": 67000 }, { "epoch": 41.84748915065096, "grad_norm": 0.5474331378936768, "learning_rate": 1.6483599063161458e-05, "loss": 0.7218, "step": 67500 }, { "epoch": 42.15747055176689, "grad_norm": 0.5352886915206909, "learning_rate": 1.5858622546747995e-05, "loss": 0.7213, "step": 68000 }, { "epoch": 42.467451952882826, "grad_norm": 0.540053129196167, "learning_rate": 1.5232393572385808e-05, "loss": 0.7204, "step": 68500 }, { "epoch": 42.77743335399876, "grad_norm": 0.5470998883247375, "learning_rate": 1.4606164598023622e-05, "loss": 0.721, "step": 69000 }, { "epoch": 43.08741475511469, "grad_norm": 0.5613588094711304, "learning_rate": 1.3979935623661435e-05, "loss": 0.7194, "step": 69500 }, { "epoch": 43.39739615623063, "grad_norm": 0.5471562743186951, "learning_rate": 1.3354959107247974e-05, "loss": 0.7178, "step": 70000 }, { "epoch": 43.70737755734656, "grad_norm": 0.5386627912521362, "learning_rate": 1.2728730132885787e-05, "loss": 0.7184, "step": 70500 }, { "epoch": 44.017358958462495, "grad_norm": 0.5391978621482849, "learning_rate": 1.2102501158523603e-05, "loss": 0.7186, "step": 71000 }, { "epoch": 44.32734035957843, "grad_norm": 0.5381629467010498, "learning_rate": 1.1476272184161418e-05, "loss": 0.7168, "step": 71500 }, { "epoch": 44.637321760694356, "grad_norm": 0.5467249155044556, "learning_rate": 1.0850043209799233e-05, "loss": 0.7162, "step": 72000 }, { "epoch": 44.94730316181029, "grad_norm": 0.5548228025436401, "learning_rate": 1.0223814235437046e-05, "loss": 0.7146, "step": 72500 }, { "epoch": 45.25728456292622, "grad_norm": 0.5488151907920837, "learning_rate": 9.59758526107486e-06, "loss": 0.7152, "step": 73000 }, { "epoch": 45.56726596404216, "grad_norm": 0.5473387241363525, "learning_rate": 8.971356286712675e-06, "loss": 0.7142, "step": 73500 }, { "epoch": 45.87724736515809, "grad_norm": 0.5331913828849792, "learning_rate": 8.345127312350489e-06, "loss": 0.7155, "step": 74000 }, { "epoch": 46.187228766274025, "grad_norm": 0.5443392395973206, "learning_rate": 7.718898337988302e-06, "loss": 0.7136, "step": 74500 }, { "epoch": 46.49721016738996, "grad_norm": 0.5461409091949463, "learning_rate": 7.092669363626117e-06, "loss": 0.7148, "step": 75000 }, { "epoch": 46.80719156850589, "grad_norm": 0.5504785180091858, "learning_rate": 6.466440389263931e-06, "loss": 0.7133, "step": 75500 }, { "epoch": 47.11717296962182, "grad_norm": 0.5478015542030334, "learning_rate": 5.840211414901745e-06, "loss": 0.7125, "step": 76000 }, { "epoch": 47.42715437073775, "grad_norm": 0.5464319586753845, "learning_rate": 5.2139824405395585e-06, "loss": 0.7125, "step": 76500 }, { "epoch": 47.73713577185369, "grad_norm": 0.5370163321495056, "learning_rate": 4.587753466177374e-06, "loss": 0.7117, "step": 77000 }, { "epoch": 48.04711717296962, "grad_norm": 0.5529221892356873, "learning_rate": 3.961524491815188e-06, "loss": 0.711, "step": 77500 }, { "epoch": 48.357098574085555, "grad_norm": 0.549679160118103, "learning_rate": 3.3352955174530015e-06, "loss": 0.7112, "step": 78000 }, { "epoch": 48.66707997520149, "grad_norm": 0.5416662096977234, "learning_rate": 2.709066543090816e-06, "loss": 0.7112, "step": 78500 }, { "epoch": 48.97706137631742, "grad_norm": 0.5428098440170288, "learning_rate": 2.08283756872863e-06, "loss": 0.7109, "step": 79000 }, { "epoch": 49.287042777433356, "grad_norm": 0.5247154235839844, "learning_rate": 1.4566085943664442e-06, "loss": 0.7106, "step": 79500 }, { "epoch": 49.59702417854929, "grad_norm": 0.5486724376678467, "learning_rate": 8.303796200042584e-07, "loss": 0.7097, "step": 80000 }, { "epoch": 49.90700557966522, "grad_norm": 0.5495786070823669, "learning_rate": 2.0415064564207257e-07, "loss": 0.7106, "step": 80500 }, { "epoch": 50.0, "step": 80650, "total_flos": 2.052104150815488e+18, "train_loss": 0.04098836247254364, "train_runtime": 10357.3823, "train_samples_per_second": 11959.61, "train_steps_per_second": 7.787 } ], "logging_steps": 500, "max_steps": 80650, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.052104150815488e+18, "train_batch_size": 192, "trial_name": null, "trial_params": null }