t5-big-scratch-iwslt2017 / trainer_state.json
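The log_history array below records one logging event every 500 optimizer steps, each with epoch, grad_norm, learning_rate, loss, and step. As a minimal sketch of how this data can be inspected, assuming the file is saved locally as trainer_state.json and that matplotlib is installed (neither is guaranteed by this repo), the training-loss curve could be plotted like this:

# Minimal sketch: load this trainer_state.json and plot loss vs. step.
# Assumes a local copy named "trainer_state.json" and matplotlib;
# both are assumptions for illustration, not part of this repository.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only entries that carry a training loss (eval entries, if any, lack it).
entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title("t5-big-scratch-iwslt2017 training loss")
plt.show()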
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 50.0,
"eval_steps": 500,
"global_step": 322050,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0776276975624903,
"grad_norm": 2.2886219024658203,
"learning_rate": 4.992237230243752e-05,
"loss": 6.7024,
"step": 500
},
{
"epoch": 0.1552553951249806,
"grad_norm": 2.4521589279174805,
"learning_rate": 4.9844744604875024e-05,
"loss": 5.9478,
"step": 1000
},
{
"epoch": 0.2328830926874709,
"grad_norm": 2.662937641143799,
"learning_rate": 4.976711690731253e-05,
"loss": 5.6144,
"step": 1500
},
{
"epoch": 0.3105107902499612,
"grad_norm": 2.6923885345458984,
"learning_rate": 4.968948920975004e-05,
"loss": 5.3722,
"step": 2000
},
{
"epoch": 0.38813848781245147,
"grad_norm": 3.0573277473449707,
"learning_rate": 4.9611861512187554e-05,
"loss": 5.2099,
"step": 2500
},
{
"epoch": 0.4657661853749418,
"grad_norm": 2.492630958557129,
"learning_rate": 4.953423381462506e-05,
"loss": 5.0823,
"step": 3000
},
{
"epoch": 0.5433938829374321,
"grad_norm": 2.724647045135498,
"learning_rate": 4.945660611706257e-05,
"loss": 4.9667,
"step": 3500
},
{
"epoch": 0.6210215804999224,
"grad_norm": 2.4619314670562744,
"learning_rate": 4.937897841950008e-05,
"loss": 4.8775,
"step": 4000
},
{
"epoch": 0.6986492780624126,
"grad_norm": 2.758463144302368,
"learning_rate": 4.930135072193759e-05,
"loss": 4.7939,
"step": 4500
},
{
"epoch": 0.7762769756249029,
"grad_norm": 3.0213730335235596,
"learning_rate": 4.92237230243751e-05,
"loss": 4.7288,
"step": 5000
},
{
"epoch": 0.8539046731873933,
"grad_norm": 2.7506508827209473,
"learning_rate": 4.9146095326812606e-05,
"loss": 4.6724,
"step": 5500
},
{
"epoch": 0.9315323707498836,
"grad_norm": 3.3897273540496826,
"learning_rate": 4.906846762925012e-05,
"loss": 4.6203,
"step": 6000
},
{
"epoch": 1.0091600683123738,
"grad_norm": 3.2612226009368896,
"learning_rate": 4.899083993168763e-05,
"loss": 4.564,
"step": 6500
},
{
"epoch": 1.0867877658748641,
"grad_norm": 2.9909706115722656,
"learning_rate": 4.891321223412514e-05,
"loss": 4.4841,
"step": 7000
},
{
"epoch": 1.1644154634373545,
"grad_norm": 3.00471830368042,
"learning_rate": 4.883558453656264e-05,
"loss": 4.4436,
"step": 7500
},
{
"epoch": 1.2420431609998448,
"grad_norm": 3.588019609451294,
"learning_rate": 4.875795683900016e-05,
"loss": 4.4248,
"step": 8000
},
{
"epoch": 1.319670858562335,
"grad_norm": 3.1261277198791504,
"learning_rate": 4.868032914143767e-05,
"loss": 4.3914,
"step": 8500
},
{
"epoch": 1.3972985561248255,
"grad_norm": 3.248203754425049,
"learning_rate": 4.860270144387518e-05,
"loss": 4.3539,
"step": 9000
},
{
"epoch": 1.4749262536873156,
"grad_norm": 3.6183948516845703,
"learning_rate": 4.8525073746312687e-05,
"loss": 4.3439,
"step": 9500
},
{
"epoch": 1.5525539512498059,
"grad_norm": 3.6323795318603516,
"learning_rate": 4.8447446048750194e-05,
"loss": 4.3104,
"step": 10000
},
{
"epoch": 1.6301816488122962,
"grad_norm": 3.8555796146392822,
"learning_rate": 4.836981835118771e-05,
"loss": 4.2775,
"step": 10500
},
{
"epoch": 1.7078093463747865,
"grad_norm": 3.804065465927124,
"learning_rate": 4.8292190653625216e-05,
"loss": 4.2645,
"step": 11000
},
{
"epoch": 1.7854370439372769,
"grad_norm": 3.5051915645599365,
"learning_rate": 4.8214562956062723e-05,
"loss": 4.2555,
"step": 11500
},
{
"epoch": 1.863064741499767,
"grad_norm": 3.28206205368042,
"learning_rate": 4.813693525850024e-05,
"loss": 4.2254,
"step": 12000
},
{
"epoch": 1.9406924390622575,
"grad_norm": 3.6532084941864014,
"learning_rate": 4.8059307560937745e-05,
"loss": 4.2142,
"step": 12500
},
{
"epoch": 2.0183201366247476,
"grad_norm": 3.8629403114318848,
"learning_rate": 4.798167986337525e-05,
"loss": 4.1695,
"step": 13000
},
{
"epoch": 2.095947834187238,
"grad_norm": 3.7742209434509277,
"learning_rate": 4.790405216581276e-05,
"loss": 4.1056,
"step": 13500
},
{
"epoch": 2.1735755317497283,
"grad_norm": 3.638509750366211,
"learning_rate": 4.7826424468250275e-05,
"loss": 4.0926,
"step": 14000
},
{
"epoch": 2.2512032293122184,
"grad_norm": 3.4432594776153564,
"learning_rate": 4.774879677068778e-05,
"loss": 4.0826,
"step": 14500
},
{
"epoch": 2.328830926874709,
"grad_norm": 3.252643346786499,
"learning_rate": 4.76711690731253e-05,
"loss": 4.073,
"step": 15000
},
{
"epoch": 2.406458624437199,
"grad_norm": 3.611611843109131,
"learning_rate": 4.7593541375562804e-05,
"loss": 4.0556,
"step": 15500
},
{
"epoch": 2.4840863219996896,
"grad_norm": 3.842820644378662,
"learning_rate": 4.751591367800031e-05,
"loss": 4.0538,
"step": 16000
},
{
"epoch": 2.5617140195621797,
"grad_norm": 4.127362251281738,
"learning_rate": 4.7438285980437826e-05,
"loss": 4.0186,
"step": 16500
},
{
"epoch": 2.63934171712467,
"grad_norm": 3.498431921005249,
"learning_rate": 4.7360658282875334e-05,
"loss": 3.9995,
"step": 17000
},
{
"epoch": 2.7169694146871604,
"grad_norm": 3.7191123962402344,
"learning_rate": 4.728303058531284e-05,
"loss": 4.0059,
"step": 17500
},
{
"epoch": 2.794597112249651,
"grad_norm": 3.748997688293457,
"learning_rate": 4.720540288775035e-05,
"loss": 3.9807,
"step": 18000
},
{
"epoch": 2.872224809812141,
"grad_norm": 3.91758394241333,
"learning_rate": 4.712777519018786e-05,
"loss": 3.9752,
"step": 18500
},
{
"epoch": 2.949852507374631,
"grad_norm": 4.286660671234131,
"learning_rate": 4.705014749262537e-05,
"loss": 3.9597,
"step": 19000
},
{
"epoch": 3.0274802049371217,
"grad_norm": 4.166433334350586,
"learning_rate": 4.697251979506288e-05,
"loss": 3.9264,
"step": 19500
},
{
"epoch": 3.1051079024996118,
"grad_norm": 4.093895435333252,
"learning_rate": 4.689489209750039e-05,
"loss": 3.8771,
"step": 20000
},
{
"epoch": 3.1827356000621023,
"grad_norm": 3.8036608695983887,
"learning_rate": 4.68172643999379e-05,
"loss": 3.8691,
"step": 20500
},
{
"epoch": 3.2603632976245924,
"grad_norm": 3.8469622135162354,
"learning_rate": 4.6739636702375414e-05,
"loss": 3.8456,
"step": 21000
},
{
"epoch": 3.3379909951870825,
"grad_norm": 4.524165630340576,
"learning_rate": 4.6662009004812915e-05,
"loss": 3.8516,
"step": 21500
},
{
"epoch": 3.415618692749573,
"grad_norm": 4.203705310821533,
"learning_rate": 4.658438130725043e-05,
"loss": 3.8486,
"step": 22000
},
{
"epoch": 3.493246390312063,
"grad_norm": 3.79025936126709,
"learning_rate": 4.650675360968794e-05,
"loss": 3.8466,
"step": 22500
},
{
"epoch": 3.5708740878745537,
"grad_norm": 4.120058059692383,
"learning_rate": 4.642912591212545e-05,
"loss": 3.8195,
"step": 23000
},
{
"epoch": 3.648501785437044,
"grad_norm": 4.125455379486084,
"learning_rate": 4.635149821456296e-05,
"loss": 3.7975,
"step": 23500
},
{
"epoch": 3.7261294829995344,
"grad_norm": 4.129229545593262,
"learning_rate": 4.6273870517000466e-05,
"loss": 3.8115,
"step": 24000
},
{
"epoch": 3.8037571805620245,
"grad_norm": 4.444260597229004,
"learning_rate": 4.619624281943798e-05,
"loss": 3.8045,
"step": 24500
},
{
"epoch": 3.881384878124515,
"grad_norm": 4.36641788482666,
"learning_rate": 4.611861512187549e-05,
"loss": 3.813,
"step": 25000
},
{
"epoch": 3.959012575687005,
"grad_norm": 4.3214802742004395,
"learning_rate": 4.6040987424312996e-05,
"loss": 3.7778,
"step": 25500
},
{
"epoch": 4.036640273249495,
"grad_norm": 4.101747989654541,
"learning_rate": 4.5963359726750503e-05,
"loss": 3.7416,
"step": 26000
},
{
"epoch": 4.114267970811985,
"grad_norm": 4.384554386138916,
"learning_rate": 4.588573202918802e-05,
"loss": 3.7074,
"step": 26500
},
{
"epoch": 4.191895668374476,
"grad_norm": 4.370575904846191,
"learning_rate": 4.580810433162553e-05,
"loss": 3.7012,
"step": 27000
},
{
"epoch": 4.2695233659369665,
"grad_norm": 4.443875789642334,
"learning_rate": 4.573047663406303e-05,
"loss": 3.691,
"step": 27500
},
{
"epoch": 4.347151063499457,
"grad_norm": 4.347660064697266,
"learning_rate": 4.565284893650055e-05,
"loss": 3.6706,
"step": 28000
},
{
"epoch": 4.424778761061947,
"grad_norm": 4.289429187774658,
"learning_rate": 4.5575221238938055e-05,
"loss": 3.698,
"step": 28500
},
{
"epoch": 4.502406458624437,
"grad_norm": 4.255033016204834,
"learning_rate": 4.549759354137557e-05,
"loss": 3.6576,
"step": 29000
},
{
"epoch": 4.580034156186928,
"grad_norm": 4.466300010681152,
"learning_rate": 4.541996584381307e-05,
"loss": 3.6684,
"step": 29500
},
{
"epoch": 4.657661853749418,
"grad_norm": 4.410152435302734,
"learning_rate": 4.5342338146250584e-05,
"loss": 3.6477,
"step": 30000
},
{
"epoch": 4.735289551311908,
"grad_norm": 4.257645130157471,
"learning_rate": 4.52647104486881e-05,
"loss": 3.6531,
"step": 30500
},
{
"epoch": 4.812917248874398,
"grad_norm": 4.475682258605957,
"learning_rate": 4.5187082751125606e-05,
"loss": 3.6587,
"step": 31000
},
{
"epoch": 4.890544946436888,
"grad_norm": 4.372265338897705,
"learning_rate": 4.5109455053563114e-05,
"loss": 3.632,
"step": 31500
},
{
"epoch": 4.968172643999379,
"grad_norm": 4.2151360511779785,
"learning_rate": 4.503182735600062e-05,
"loss": 3.6336,
"step": 32000
},
{
"epoch": 5.045800341561869,
"grad_norm": 4.397316932678223,
"learning_rate": 4.4954199658438135e-05,
"loss": 3.566,
"step": 32500
},
{
"epoch": 5.123428039124359,
"grad_norm": 4.471977710723877,
"learning_rate": 4.487657196087564e-05,
"loss": 3.5522,
"step": 33000
},
{
"epoch": 5.2010557366868495,
"grad_norm": 4.2865471839904785,
"learning_rate": 4.479894426331315e-05,
"loss": 3.5675,
"step": 33500
},
{
"epoch": 5.2786834342493405,
"grad_norm": 4.559909343719482,
"learning_rate": 4.472131656575066e-05,
"loss": 3.54,
"step": 34000
},
{
"epoch": 5.356311131811831,
"grad_norm": 4.453431606292725,
"learning_rate": 4.464368886818817e-05,
"loss": 3.5392,
"step": 34500
},
{
"epoch": 5.433938829374321,
"grad_norm": 4.54495906829834,
"learning_rate": 4.456606117062569e-05,
"loss": 3.5424,
"step": 35000
},
{
"epoch": 5.511566526936811,
"grad_norm": 4.494850158691406,
"learning_rate": 4.448843347306319e-05,
"loss": 3.5414,
"step": 35500
},
{
"epoch": 5.589194224499302,
"grad_norm": 4.8761162757873535,
"learning_rate": 4.44108057755007e-05,
"loss": 3.525,
"step": 36000
},
{
"epoch": 5.666821922061792,
"grad_norm": 4.575265884399414,
"learning_rate": 4.433317807793821e-05,
"loss": 3.5405,
"step": 36500
},
{
"epoch": 5.744449619624282,
"grad_norm": 4.597631454467773,
"learning_rate": 4.4255550380375724e-05,
"loss": 3.5259,
"step": 37000
},
{
"epoch": 5.822077317186772,
"grad_norm": 4.326088905334473,
"learning_rate": 4.4177922682813225e-05,
"loss": 3.4985,
"step": 37500
},
{
"epoch": 5.899705014749262,
"grad_norm": 4.202051639556885,
"learning_rate": 4.410029498525074e-05,
"loss": 3.5087,
"step": 38000
},
{
"epoch": 5.977332712311753,
"grad_norm": 4.386417388916016,
"learning_rate": 4.402266728768825e-05,
"loss": 3.4926,
"step": 38500
},
{
"epoch": 6.054960409874243,
"grad_norm": 4.612489700317383,
"learning_rate": 4.394503959012576e-05,
"loss": 3.456,
"step": 39000
},
{
"epoch": 6.132588107436733,
"grad_norm": 4.2950286865234375,
"learning_rate": 4.386741189256327e-05,
"loss": 3.4195,
"step": 39500
},
{
"epoch": 6.2102158049992235,
"grad_norm": 4.728135585784912,
"learning_rate": 4.3789784195000776e-05,
"loss": 3.422,
"step": 40000
},
{
"epoch": 6.287843502561714,
"grad_norm": 4.690753936767578,
"learning_rate": 4.371215649743829e-05,
"loss": 3.4147,
"step": 40500
},
{
"epoch": 6.365471200124205,
"grad_norm": 4.528134346008301,
"learning_rate": 4.36345287998758e-05,
"loss": 3.4115,
"step": 41000
},
{
"epoch": 6.443098897686695,
"grad_norm": 4.323470592498779,
"learning_rate": 4.3556901102313305e-05,
"loss": 3.4058,
"step": 41500
},
{
"epoch": 6.520726595249185,
"grad_norm": 4.374230861663818,
"learning_rate": 4.347927340475082e-05,
"loss": 3.4112,
"step": 42000
},
{
"epoch": 6.598354292811675,
"grad_norm": 4.312314033508301,
"learning_rate": 4.340164570718833e-05,
"loss": 3.3881,
"step": 42500
},
{
"epoch": 6.675981990374165,
"grad_norm": 4.178228378295898,
"learning_rate": 4.332401800962584e-05,
"loss": 3.4044,
"step": 43000
},
{
"epoch": 6.753609687936656,
"grad_norm": 4.638906002044678,
"learning_rate": 4.324639031206334e-05,
"loss": 3.3954,
"step": 43500
},
{
"epoch": 6.831237385499146,
"grad_norm": 4.238986492156982,
"learning_rate": 4.3168762614500857e-05,
"loss": 3.4013,
"step": 44000
},
{
"epoch": 6.908865083061636,
"grad_norm": 4.471828460693359,
"learning_rate": 4.3091134916938364e-05,
"loss": 3.3806,
"step": 44500
},
{
"epoch": 6.986492780624126,
"grad_norm": 4.4187912940979,
"learning_rate": 4.301350721937588e-05,
"loss": 3.3834,
"step": 45000
},
{
"epoch": 7.064120478186617,
"grad_norm": 5.066268444061279,
"learning_rate": 4.293587952181338e-05,
"loss": 3.3064,
"step": 45500
},
{
"epoch": 7.1417481757491075,
"grad_norm": 4.942110538482666,
"learning_rate": 4.2858251824250894e-05,
"loss": 3.2971,
"step": 46000
},
{
"epoch": 7.219375873311598,
"grad_norm": 5.294034957885742,
"learning_rate": 4.278062412668841e-05,
"loss": 3.2643,
"step": 46500
},
{
"epoch": 7.297003570874088,
"grad_norm": 4.650871753692627,
"learning_rate": 4.2702996429125915e-05,
"loss": 3.2768,
"step": 47000
},
{
"epoch": 7.374631268436578,
"grad_norm": 5.170124053955078,
"learning_rate": 4.262536873156342e-05,
"loss": 3.2832,
"step": 47500
},
{
"epoch": 7.452258965999069,
"grad_norm": 4.852886199951172,
"learning_rate": 4.254774103400093e-05,
"loss": 3.2779,
"step": 48000
},
{
"epoch": 7.529886663561559,
"grad_norm": 5.047275543212891,
"learning_rate": 4.2470113336438445e-05,
"loss": 3.273,
"step": 48500
},
{
"epoch": 7.607514361124049,
"grad_norm": 4.9860520362854,
"learning_rate": 4.239248563887595e-05,
"loss": 3.2538,
"step": 49000
},
{
"epoch": 7.685142058686539,
"grad_norm": 4.9074859619140625,
"learning_rate": 4.231485794131346e-05,
"loss": 3.248,
"step": 49500
},
{
"epoch": 7.76276975624903,
"grad_norm": 4.936252593994141,
"learning_rate": 4.2237230243750974e-05,
"loss": 3.2492,
"step": 50000
},
{
"epoch": 7.84039745381152,
"grad_norm": 4.652443885803223,
"learning_rate": 4.215960254618848e-05,
"loss": 3.2412,
"step": 50500
},
{
"epoch": 7.91802515137401,
"grad_norm": 4.407495021820068,
"learning_rate": 4.2081974848625996e-05,
"loss": 3.2372,
"step": 51000
},
{
"epoch": 7.9956528489365,
"grad_norm": 4.413294792175293,
"learning_rate": 4.20043471510635e-05,
"loss": 3.2131,
"step": 51500
},
{
"epoch": 8.07328054649899,
"grad_norm": 4.42469596862793,
"learning_rate": 4.192671945350101e-05,
"loss": 3.1377,
"step": 52000
},
{
"epoch": 8.150908244061482,
"grad_norm": 4.906301498413086,
"learning_rate": 4.184909175593852e-05,
"loss": 3.1072,
"step": 52500
},
{
"epoch": 8.22853594162397,
"grad_norm": 5.0347900390625,
"learning_rate": 4.177146405837603e-05,
"loss": 3.1374,
"step": 53000
},
{
"epoch": 8.306163639186462,
"grad_norm": 5.217957496643066,
"learning_rate": 4.169383636081354e-05,
"loss": 3.1124,
"step": 53500
},
{
"epoch": 8.383791336748953,
"grad_norm": 4.475755214691162,
"learning_rate": 4.161620866325105e-05,
"loss": 3.1194,
"step": 54000
},
{
"epoch": 8.461419034311442,
"grad_norm": 5.22430419921875,
"learning_rate": 4.153858096568856e-05,
"loss": 3.1201,
"step": 54500
},
{
"epoch": 8.539046731873933,
"grad_norm": 6.327775955200195,
"learning_rate": 4.146095326812607e-05,
"loss": 3.1031,
"step": 55000
},
{
"epoch": 8.616674429436422,
"grad_norm": 4.703291893005371,
"learning_rate": 4.138332557056358e-05,
"loss": 3.1043,
"step": 55500
},
{
"epoch": 8.694302126998913,
"grad_norm": 5.288379192352295,
"learning_rate": 4.1305697873001085e-05,
"loss": 3.1024,
"step": 56000
},
{
"epoch": 8.771929824561404,
"grad_norm": 4.9670090675354,
"learning_rate": 4.12280701754386e-05,
"loss": 3.0797,
"step": 56500
},
{
"epoch": 8.849557522123893,
"grad_norm": 4.910192012786865,
"learning_rate": 4.115044247787611e-05,
"loss": 3.0869,
"step": 57000
},
{
"epoch": 8.927185219686384,
"grad_norm": 4.804894924163818,
"learning_rate": 4.1072814780313615e-05,
"loss": 3.0885,
"step": 57500
},
{
"epoch": 9.004812917248874,
"grad_norm": 5.052229404449463,
"learning_rate": 4.099518708275113e-05,
"loss": 3.0821,
"step": 58000
},
{
"epoch": 9.082440614811365,
"grad_norm": 5.419916152954102,
"learning_rate": 4.0917559385188637e-05,
"loss": 2.9879,
"step": 58500
},
{
"epoch": 9.160068312373856,
"grad_norm": 5.0662078857421875,
"learning_rate": 4.083993168762615e-05,
"loss": 2.9825,
"step": 59000
},
{
"epoch": 9.237696009936345,
"grad_norm": 4.776367664337158,
"learning_rate": 4.076230399006365e-05,
"loss": 2.977,
"step": 59500
},
{
"epoch": 9.315323707498836,
"grad_norm": 4.7674031257629395,
"learning_rate": 4.0684676292501166e-05,
"loss": 2.9971,
"step": 60000
},
{
"epoch": 9.392951405061325,
"grad_norm": 4.947634696960449,
"learning_rate": 4.0607048594938673e-05,
"loss": 2.9651,
"step": 60500
},
{
"epoch": 9.470579102623816,
"grad_norm": 4.943103790283203,
"learning_rate": 4.052942089737619e-05,
"loss": 2.9781,
"step": 61000
},
{
"epoch": 9.548206800186307,
"grad_norm": 5.14945125579834,
"learning_rate": 4.0451793199813695e-05,
"loss": 2.9702,
"step": 61500
},
{
"epoch": 9.625834497748796,
"grad_norm": 5.054744243621826,
"learning_rate": 4.03741655022512e-05,
"loss": 2.9553,
"step": 62000
},
{
"epoch": 9.703462195311287,
"grad_norm": 5.338235855102539,
"learning_rate": 4.029653780468872e-05,
"loss": 2.9489,
"step": 62500
},
{
"epoch": 9.781089892873778,
"grad_norm": 4.819457530975342,
"learning_rate": 4.0218910107126225e-05,
"loss": 2.9676,
"step": 63000
},
{
"epoch": 9.858717590436267,
"grad_norm": 4.814851760864258,
"learning_rate": 4.014128240956373e-05,
"loss": 2.9374,
"step": 63500
},
{
"epoch": 9.936345287998758,
"grad_norm": 4.723858833312988,
"learning_rate": 4.006365471200124e-05,
"loss": 2.9474,
"step": 64000
},
{
"epoch": 10.013972985561248,
"grad_norm": 4.435904026031494,
"learning_rate": 3.9986027014438754e-05,
"loss": 2.9094,
"step": 64500
},
{
"epoch": 10.091600683123739,
"grad_norm": 4.80678129196167,
"learning_rate": 3.990839931687627e-05,
"loss": 2.8467,
"step": 65000
},
{
"epoch": 10.16922838068623,
"grad_norm": 5.187747001647949,
"learning_rate": 3.983077161931377e-05,
"loss": 2.8237,
"step": 65500
},
{
"epoch": 10.246856078248719,
"grad_norm": 4.363202095031738,
"learning_rate": 3.9753143921751284e-05,
"loss": 2.8334,
"step": 66000
},
{
"epoch": 10.32448377581121,
"grad_norm": 5.085516929626465,
"learning_rate": 3.967551622418879e-05,
"loss": 2.8284,
"step": 66500
},
{
"epoch": 10.402111473373699,
"grad_norm": 4.973574638366699,
"learning_rate": 3.9597888526626306e-05,
"loss": 2.8194,
"step": 67000
},
{
"epoch": 10.47973917093619,
"grad_norm": 4.629599094390869,
"learning_rate": 3.952026082906381e-05,
"loss": 2.8284,
"step": 67500
},
{
"epoch": 10.557366868498681,
"grad_norm": 4.970963001251221,
"learning_rate": 3.944263313150132e-05,
"loss": 2.8285,
"step": 68000
},
{
"epoch": 10.63499456606117,
"grad_norm": 4.869990348815918,
"learning_rate": 3.936500543393883e-05,
"loss": 2.8048,
"step": 68500
},
{
"epoch": 10.712622263623661,
"grad_norm": 5.26320743560791,
"learning_rate": 3.928737773637634e-05,
"loss": 2.803,
"step": 69000
},
{
"epoch": 10.79024996118615,
"grad_norm": 4.8318352699279785,
"learning_rate": 3.920975003881385e-05,
"loss": 2.7984,
"step": 69500
},
{
"epoch": 10.867877658748641,
"grad_norm": 4.917919158935547,
"learning_rate": 3.913212234125136e-05,
"loss": 2.8091,
"step": 70000
},
{
"epoch": 10.945505356311132,
"grad_norm": 4.485991954803467,
"learning_rate": 3.905449464368887e-05,
"loss": 2.7917,
"step": 70500
},
{
"epoch": 11.023133053873622,
"grad_norm": 4.8984246253967285,
"learning_rate": 3.897686694612638e-05,
"loss": 2.7501,
"step": 71000
},
{
"epoch": 11.100760751436113,
"grad_norm": 4.431053161621094,
"learning_rate": 3.889923924856389e-05,
"loss": 2.6896,
"step": 71500
},
{
"epoch": 11.178388448998602,
"grad_norm": 4.597928524017334,
"learning_rate": 3.8821611551001395e-05,
"loss": 2.6874,
"step": 72000
},
{
"epoch": 11.256016146561093,
"grad_norm": 4.701462268829346,
"learning_rate": 3.874398385343891e-05,
"loss": 2.679,
"step": 72500
},
{
"epoch": 11.333643844123584,
"grad_norm": 4.706751346588135,
"learning_rate": 3.866635615587642e-05,
"loss": 2.6799,
"step": 73000
},
{
"epoch": 11.411271541686073,
"grad_norm": 4.8909430503845215,
"learning_rate": 3.858872845831393e-05,
"loss": 2.6779,
"step": 73500
},
{
"epoch": 11.488899239248564,
"grad_norm": 4.814470291137695,
"learning_rate": 3.851110076075144e-05,
"loss": 2.6723,
"step": 74000
},
{
"epoch": 11.566526936811055,
"grad_norm": 4.277644157409668,
"learning_rate": 3.8433473063188946e-05,
"loss": 2.6787,
"step": 74500
},
{
"epoch": 11.644154634373544,
"grad_norm": 4.709313869476318,
"learning_rate": 3.835584536562646e-05,
"loss": 2.6672,
"step": 75000
},
{
"epoch": 11.721782331936035,
"grad_norm": 4.462389945983887,
"learning_rate": 3.827821766806397e-05,
"loss": 2.66,
"step": 75500
},
{
"epoch": 11.799410029498524,
"grad_norm": 4.836484909057617,
"learning_rate": 3.8200589970501475e-05,
"loss": 2.6646,
"step": 76000
},
{
"epoch": 11.877037727061015,
"grad_norm": 4.758359909057617,
"learning_rate": 3.812296227293899e-05,
"loss": 2.6561,
"step": 76500
},
{
"epoch": 11.954665424623506,
"grad_norm": 4.208640098571777,
"learning_rate": 3.80453345753765e-05,
"loss": 2.6659,
"step": 77000
},
{
"epoch": 12.032293122185996,
"grad_norm": 4.91511344909668,
"learning_rate": 3.7967706877814005e-05,
"loss": 2.5897,
"step": 77500
},
{
"epoch": 12.109920819748487,
"grad_norm": 4.086484909057617,
"learning_rate": 3.789007918025151e-05,
"loss": 2.5594,
"step": 78000
},
{
"epoch": 12.187548517310976,
"grad_norm": 4.583057880401611,
"learning_rate": 3.781245148268903e-05,
"loss": 2.5543,
"step": 78500
},
{
"epoch": 12.265176214873467,
"grad_norm": 4.570094585418701,
"learning_rate": 3.7734823785126534e-05,
"loss": 2.5503,
"step": 79000
},
{
"epoch": 12.342803912435958,
"grad_norm": 4.889599800109863,
"learning_rate": 3.765719608756404e-05,
"loss": 2.5416,
"step": 79500
},
{
"epoch": 12.420431609998447,
"grad_norm": 4.4805426597595215,
"learning_rate": 3.757956839000155e-05,
"loss": 2.5589,
"step": 80000
},
{
"epoch": 12.498059307560938,
"grad_norm": 4.407408237457275,
"learning_rate": 3.7501940692439064e-05,
"loss": 2.5315,
"step": 80500
},
{
"epoch": 12.575687005123427,
"grad_norm": 4.637092113494873,
"learning_rate": 3.742431299487658e-05,
"loss": 2.5454,
"step": 81000
},
{
"epoch": 12.653314702685918,
"grad_norm": 4.7181854248046875,
"learning_rate": 3.7346685297314085e-05,
"loss": 2.5383,
"step": 81500
},
{
"epoch": 12.73094240024841,
"grad_norm": 4.588499546051025,
"learning_rate": 3.726905759975159e-05,
"loss": 2.5267,
"step": 82000
},
{
"epoch": 12.808570097810899,
"grad_norm": 4.137992858886719,
"learning_rate": 3.71914299021891e-05,
"loss": 2.5345,
"step": 82500
},
{
"epoch": 12.88619779537339,
"grad_norm": 4.400317668914795,
"learning_rate": 3.7113802204626615e-05,
"loss": 2.5259,
"step": 83000
},
{
"epoch": 12.963825492935879,
"grad_norm": 4.139917850494385,
"learning_rate": 3.703617450706412e-05,
"loss": 2.5335,
"step": 83500
},
{
"epoch": 13.04145319049837,
"grad_norm": 4.182736396789551,
"learning_rate": 3.695854680950163e-05,
"loss": 2.4574,
"step": 84000
},
{
"epoch": 13.11908088806086,
"grad_norm": 4.659245491027832,
"learning_rate": 3.6880919111939144e-05,
"loss": 2.4193,
"step": 84500
},
{
"epoch": 13.19670858562335,
"grad_norm": 4.163915157318115,
"learning_rate": 3.680329141437665e-05,
"loss": 2.4169,
"step": 85000
},
{
"epoch": 13.274336283185841,
"grad_norm": 4.518395900726318,
"learning_rate": 3.672566371681416e-05,
"loss": 2.4161,
"step": 85500
},
{
"epoch": 13.35196398074833,
"grad_norm": 4.277214050292969,
"learning_rate": 3.664803601925167e-05,
"loss": 2.4169,
"step": 86000
},
{
"epoch": 13.429591678310821,
"grad_norm": 4.701220989227295,
"learning_rate": 3.657040832168918e-05,
"loss": 2.424,
"step": 86500
},
{
"epoch": 13.507219375873312,
"grad_norm": 4.375713348388672,
"learning_rate": 3.649278062412669e-05,
"loss": 2.4193,
"step": 87000
},
{
"epoch": 13.584847073435801,
"grad_norm": 4.191773891448975,
"learning_rate": 3.64151529265642e-05,
"loss": 2.4188,
"step": 87500
},
{
"epoch": 13.662474770998292,
"grad_norm": 4.385691165924072,
"learning_rate": 3.633752522900171e-05,
"loss": 2.4149,
"step": 88000
},
{
"epoch": 13.740102468560782,
"grad_norm": 4.488534927368164,
"learning_rate": 3.625989753143922e-05,
"loss": 2.3998,
"step": 88500
},
{
"epoch": 13.817730166123273,
"grad_norm": 4.578937530517578,
"learning_rate": 3.618226983387673e-05,
"loss": 2.4065,
"step": 89000
},
{
"epoch": 13.895357863685764,
"grad_norm": 4.423867702484131,
"learning_rate": 3.610464213631424e-05,
"loss": 2.4004,
"step": 89500
},
{
"epoch": 13.972985561248253,
"grad_norm": 4.474419116973877,
"learning_rate": 3.602701443875175e-05,
"loss": 2.4044,
"step": 90000
},
{
"epoch": 14.050613258810744,
"grad_norm": 4.806559085845947,
"learning_rate": 3.5949386741189255e-05,
"loss": 2.3339,
"step": 90500
},
{
"epoch": 14.128240956373235,
"grad_norm": 4.276415824890137,
"learning_rate": 3.587175904362677e-05,
"loss": 2.2801,
"step": 91000
},
{
"epoch": 14.205868653935724,
"grad_norm": 4.825454235076904,
"learning_rate": 3.579413134606428e-05,
"loss": 2.297,
"step": 91500
},
{
"epoch": 14.283496351498215,
"grad_norm": 4.838090896606445,
"learning_rate": 3.5716503648501785e-05,
"loss": 2.299,
"step": 92000
},
{
"epoch": 14.361124049060704,
"grad_norm": 4.015684604644775,
"learning_rate": 3.56388759509393e-05,
"loss": 2.2892,
"step": 92500
},
{
"epoch": 14.438751746623195,
"grad_norm": 4.386364459991455,
"learning_rate": 3.5561248253376807e-05,
"loss": 2.3058,
"step": 93000
},
{
"epoch": 14.516379444185686,
"grad_norm": 4.3224968910217285,
"learning_rate": 3.548362055581432e-05,
"loss": 2.3027,
"step": 93500
},
{
"epoch": 14.594007141748175,
"grad_norm": 4.265476226806641,
"learning_rate": 3.540599285825182e-05,
"loss": 2.2993,
"step": 94000
},
{
"epoch": 14.671634839310666,
"grad_norm": 4.053600311279297,
"learning_rate": 3.5328365160689336e-05,
"loss": 2.2942,
"step": 94500
},
{
"epoch": 14.749262536873156,
"grad_norm": 4.602315902709961,
"learning_rate": 3.5250737463126844e-05,
"loss": 2.2906,
"step": 95000
},
{
"epoch": 14.826890234435647,
"grad_norm": 4.402678489685059,
"learning_rate": 3.517310976556436e-05,
"loss": 2.2702,
"step": 95500
},
{
"epoch": 14.904517931998138,
"grad_norm": 4.164185523986816,
"learning_rate": 3.5095482068001865e-05,
"loss": 2.2815,
"step": 96000
},
{
"epoch": 14.982145629560627,
"grad_norm": 3.9488399028778076,
"learning_rate": 3.501785437043937e-05,
"loss": 2.2949,
"step": 96500
},
{
"epoch": 15.059773327123118,
"grad_norm": 4.283924102783203,
"learning_rate": 3.494022667287689e-05,
"loss": 2.2053,
"step": 97000
},
{
"epoch": 15.137401024685607,
"grad_norm": 4.1038923263549805,
"learning_rate": 3.4862598975314395e-05,
"loss": 2.1718,
"step": 97500
},
{
"epoch": 15.215028722248098,
"grad_norm": 3.826446533203125,
"learning_rate": 3.47849712777519e-05,
"loss": 2.1859,
"step": 98000
},
{
"epoch": 15.292656419810589,
"grad_norm": 4.282005310058594,
"learning_rate": 3.470734358018941e-05,
"loss": 2.1854,
"step": 98500
},
{
"epoch": 15.370284117373078,
"grad_norm": 4.259530067443848,
"learning_rate": 3.4629715882626924e-05,
"loss": 2.188,
"step": 99000
},
{
"epoch": 15.44791181493557,
"grad_norm": 4.105893135070801,
"learning_rate": 3.455208818506443e-05,
"loss": 2.1824,
"step": 99500
},
{
"epoch": 15.52553951249806,
"grad_norm": 4.21387243270874,
"learning_rate": 3.447446048750194e-05,
"loss": 2.1729,
"step": 100000
},
{
"epoch": 15.60316721006055,
"grad_norm": 4.400328636169434,
"learning_rate": 3.4396832789939454e-05,
"loss": 2.1831,
"step": 100500
},
{
"epoch": 15.68079490762304,
"grad_norm": 4.224130153656006,
"learning_rate": 3.431920509237696e-05,
"loss": 2.1936,
"step": 101000
},
{
"epoch": 15.75842260518553,
"grad_norm": 3.9993326663970947,
"learning_rate": 3.4241577394814476e-05,
"loss": 2.1838,
"step": 101500
},
{
"epoch": 15.83605030274802,
"grad_norm": 4.2306671142578125,
"learning_rate": 3.4163949697251976e-05,
"loss": 2.1838,
"step": 102000
},
{
"epoch": 15.913678000310512,
"grad_norm": 4.4622368812561035,
"learning_rate": 3.408632199968949e-05,
"loss": 2.1836,
"step": 102500
},
{
"epoch": 15.991305697873,
"grad_norm": 4.376685619354248,
"learning_rate": 3.4008694302127005e-05,
"loss": 2.1779,
"step": 103000
},
{
"epoch": 16.06893339543549,
"grad_norm": 4.104698657989502,
"learning_rate": 3.393106660456451e-05,
"loss": 2.0854,
"step": 103500
},
{
"epoch": 16.14656109299798,
"grad_norm": 3.761953353881836,
"learning_rate": 3.385343890700202e-05,
"loss": 2.0603,
"step": 104000
},
{
"epoch": 16.224188790560472,
"grad_norm": 4.365135192871094,
"learning_rate": 3.377581120943953e-05,
"loss": 2.0572,
"step": 104500
},
{
"epoch": 16.301816488122963,
"grad_norm": 4.137313365936279,
"learning_rate": 3.369818351187704e-05,
"loss": 2.0691,
"step": 105000
},
{
"epoch": 16.379444185685454,
"grad_norm": 4.869952201843262,
"learning_rate": 3.362055581431455e-05,
"loss": 2.0935,
"step": 105500
},
{
"epoch": 16.45707188324794,
"grad_norm": 4.275235652923584,
"learning_rate": 3.354292811675206e-05,
"loss": 2.077,
"step": 106000
},
{
"epoch": 16.534699580810432,
"grad_norm": 4.092933177947998,
"learning_rate": 3.3465300419189565e-05,
"loss": 2.0977,
"step": 106500
},
{
"epoch": 16.612327278372923,
"grad_norm": 3.9494364261627197,
"learning_rate": 3.338767272162708e-05,
"loss": 2.095,
"step": 107000
},
{
"epoch": 16.689954975935414,
"grad_norm": 3.6660993099212646,
"learning_rate": 3.331004502406459e-05,
"loss": 2.0867,
"step": 107500
},
{
"epoch": 16.767582673497905,
"grad_norm": 4.6808977127075195,
"learning_rate": 3.3232417326502094e-05,
"loss": 2.0856,
"step": 108000
},
{
"epoch": 16.845210371060393,
"grad_norm": 3.951265335083008,
"learning_rate": 3.315478962893961e-05,
"loss": 2.0786,
"step": 108500
},
{
"epoch": 16.922838068622884,
"grad_norm": 3.390282392501831,
"learning_rate": 3.3077161931377116e-05,
"loss": 2.0756,
"step": 109000
},
{
"epoch": 17.000465766185375,
"grad_norm": 3.9212212562561035,
"learning_rate": 3.299953423381463e-05,
"loss": 2.0858,
"step": 109500
},
{
"epoch": 17.078093463747866,
"grad_norm": 4.350470542907715,
"learning_rate": 3.292190653625213e-05,
"loss": 1.969,
"step": 110000
},
{
"epoch": 17.155721161310357,
"grad_norm": 4.253689765930176,
"learning_rate": 3.2844278838689645e-05,
"loss": 1.9756,
"step": 110500
},
{
"epoch": 17.233348858872844,
"grad_norm": 4.202712059020996,
"learning_rate": 3.276665114112716e-05,
"loss": 1.9793,
"step": 111000
},
{
"epoch": 17.310976556435335,
"grad_norm": 4.103579998016357,
"learning_rate": 3.268902344356467e-05,
"loss": 1.9825,
"step": 111500
},
{
"epoch": 17.388604253997826,
"grad_norm": 4.335016250610352,
"learning_rate": 3.2611395746002175e-05,
"loss": 1.978,
"step": 112000
},
{
"epoch": 17.466231951560317,
"grad_norm": 4.291495323181152,
"learning_rate": 3.253376804843968e-05,
"loss": 1.9884,
"step": 112500
},
{
"epoch": 17.54385964912281,
"grad_norm": 4.035206317901611,
"learning_rate": 3.24561403508772e-05,
"loss": 2.0041,
"step": 113000
},
{
"epoch": 17.621487346685296,
"grad_norm": 3.9616289138793945,
"learning_rate": 3.2378512653314704e-05,
"loss": 1.9928,
"step": 113500
},
{
"epoch": 17.699115044247787,
"grad_norm": 4.101945400238037,
"learning_rate": 3.230088495575221e-05,
"loss": 1.9906,
"step": 114000
},
{
"epoch": 17.776742741810278,
"grad_norm": 4.0245490074157715,
"learning_rate": 3.2223257258189726e-05,
"loss": 1.9873,
"step": 114500
},
{
"epoch": 17.85437043937277,
"grad_norm": 4.1350908279418945,
"learning_rate": 3.2145629560627234e-05,
"loss": 1.9917,
"step": 115000
},
{
"epoch": 17.93199813693526,
"grad_norm": 4.366165637969971,
"learning_rate": 3.206800186306475e-05,
"loss": 1.9897,
"step": 115500
},
{
"epoch": 18.009625834497747,
"grad_norm": 4.272118091583252,
"learning_rate": 3.199037416550225e-05,
"loss": 1.9837,
"step": 116000
},
{
"epoch": 18.087253532060238,
"grad_norm": 4.427468776702881,
"learning_rate": 3.191274646793976e-05,
"loss": 1.8798,
"step": 116500
},
{
"epoch": 18.16488122962273,
"grad_norm": 4.1292033195495605,
"learning_rate": 3.183511877037727e-05,
"loss": 1.8857,
"step": 117000
},
{
"epoch": 18.24250892718522,
"grad_norm": 4.270112037658691,
"learning_rate": 3.1757491072814785e-05,
"loss": 1.8921,
"step": 117500
},
{
"epoch": 18.32013662474771,
"grad_norm": 4.079245567321777,
"learning_rate": 3.1679863375252286e-05,
"loss": 1.8984,
"step": 118000
},
{
"epoch": 18.3977643223102,
"grad_norm": 3.783048391342163,
"learning_rate": 3.16022356776898e-05,
"loss": 1.9001,
"step": 118500
},
{
"epoch": 18.47539201987269,
"grad_norm": 3.9977831840515137,
"learning_rate": 3.1524607980127314e-05,
"loss": 1.9026,
"step": 119000
},
{
"epoch": 18.55301971743518,
"grad_norm": 5.004773139953613,
"learning_rate": 3.144698028256482e-05,
"loss": 1.9027,
"step": 119500
},
{
"epoch": 18.63064741499767,
"grad_norm": 4.3422417640686035,
"learning_rate": 3.136935258500233e-05,
"loss": 1.9084,
"step": 120000
},
{
"epoch": 18.708275112560163,
"grad_norm": 3.9378857612609863,
"learning_rate": 3.129172488743984e-05,
"loss": 1.9038,
"step": 120500
},
{
"epoch": 18.78590281012265,
"grad_norm": 4.138620853424072,
"learning_rate": 3.121409718987735e-05,
"loss": 1.9133,
"step": 121000
},
{
"epoch": 18.86353050768514,
"grad_norm": 4.3769659996032715,
"learning_rate": 3.113646949231486e-05,
"loss": 1.9109,
"step": 121500
},
{
"epoch": 18.941158205247632,
"grad_norm": 3.955392837524414,
"learning_rate": 3.1058841794752366e-05,
"loss": 1.913,
"step": 122000
},
{
"epoch": 19.018785902810123,
"grad_norm": 4.047823905944824,
"learning_rate": 3.098121409718988e-05,
"loss": 1.8897,
"step": 122500
},
{
"epoch": 19.096413600372614,
"grad_norm": 4.446326732635498,
"learning_rate": 3.090358639962739e-05,
"loss": 1.7936,
"step": 123000
},
{
"epoch": 19.174041297935105,
"grad_norm": 3.9434542655944824,
"learning_rate": 3.08259587020649e-05,
"loss": 1.8065,
"step": 123500
},
{
"epoch": 19.251668995497592,
"grad_norm": 4.108802318572998,
"learning_rate": 3.0748331004502403e-05,
"loss": 1.8157,
"step": 124000
},
{
"epoch": 19.329296693060083,
"grad_norm": 4.374671459197998,
"learning_rate": 3.067070330693992e-05,
"loss": 1.8276,
"step": 124500
},
{
"epoch": 19.406924390622574,
"grad_norm": 3.985368013381958,
"learning_rate": 3.0593075609377425e-05,
"loss": 1.8246,
"step": 125000
},
{
"epoch": 19.484552088185065,
"grad_norm": 3.956395149230957,
"learning_rate": 3.0515447911814936e-05,
"loss": 1.8263,
"step": 125500
},
{
"epoch": 19.562179785747556,
"grad_norm": 3.358553886413574,
"learning_rate": 3.043782021425245e-05,
"loss": 1.8227,
"step": 126000
},
{
"epoch": 19.639807483310044,
"grad_norm": 4.203612804412842,
"learning_rate": 3.0360192516689955e-05,
"loss": 1.8225,
"step": 126500
},
{
"epoch": 19.717435180872535,
"grad_norm": 3.790905714035034,
"learning_rate": 3.028256481912747e-05,
"loss": 1.8433,
"step": 127000
},
{
"epoch": 19.795062878435026,
"grad_norm": 4.040520191192627,
"learning_rate": 3.0204937121564973e-05,
"loss": 1.8336,
"step": 127500
},
{
"epoch": 19.872690575997517,
"grad_norm": 4.027768135070801,
"learning_rate": 3.0127309424002488e-05,
"loss": 1.8314,
"step": 128000
},
{
"epoch": 19.950318273560008,
"grad_norm": 3.8109354972839355,
"learning_rate": 3.0049681726439992e-05,
"loss": 1.8425,
"step": 128500
},
{
"epoch": 20.027945971122495,
"grad_norm": 3.751999855041504,
"learning_rate": 2.9972054028877506e-05,
"loss": 1.7967,
"step": 129000
},
{
"epoch": 20.105573668684986,
"grad_norm": 3.9639225006103516,
"learning_rate": 2.9894426331315014e-05,
"loss": 1.7213,
"step": 129500
},
{
"epoch": 20.183201366247477,
"grad_norm": 4.027946472167969,
"learning_rate": 2.9816798633752525e-05,
"loss": 1.7408,
"step": 130000
},
{
"epoch": 20.260829063809968,
"grad_norm": 4.050852298736572,
"learning_rate": 2.9739170936190035e-05,
"loss": 1.7305,
"step": 130500
},
{
"epoch": 20.33845676137246,
"grad_norm": 4.3804216384887695,
"learning_rate": 2.9661543238627543e-05,
"loss": 1.7499,
"step": 131000
},
{
"epoch": 20.416084458934947,
"grad_norm": 4.021152019500732,
"learning_rate": 2.9583915541065054e-05,
"loss": 1.7484,
"step": 131500
},
{
"epoch": 20.493712156497438,
"grad_norm": 3.7631611824035645,
"learning_rate": 2.950628784350256e-05,
"loss": 1.7531,
"step": 132000
},
{
"epoch": 20.57133985405993,
"grad_norm": 4.4973249435424805,
"learning_rate": 2.9428660145940072e-05,
"loss": 1.767,
"step": 132500
},
{
"epoch": 20.64896755162242,
"grad_norm": 4.386341571807861,
"learning_rate": 2.935103244837758e-05,
"loss": 1.7621,
"step": 133000
},
{
"epoch": 20.72659524918491,
"grad_norm": 4.0129499435424805,
"learning_rate": 2.927340475081509e-05,
"loss": 1.7637,
"step": 133500
},
{
"epoch": 20.804222946747398,
"grad_norm": 4.22186279296875,
"learning_rate": 2.9195777053252605e-05,
"loss": 1.7643,
"step": 134000
},
{
"epoch": 20.88185064430989,
"grad_norm": 4.511717319488525,
"learning_rate": 2.911814935569011e-05,
"loss": 1.7761,
"step": 134500
},
{
"epoch": 20.95947834187238,
"grad_norm": 4.100383281707764,
"learning_rate": 2.9040521658127624e-05,
"loss": 1.7625,
"step": 135000
},
{
"epoch": 21.03710603943487,
"grad_norm": 4.241291046142578,
"learning_rate": 2.8962893960565128e-05,
"loss": 1.7083,
"step": 135500
},
{
"epoch": 21.114733736997362,
"grad_norm": 3.8240482807159424,
"learning_rate": 2.8885266263002642e-05,
"loss": 1.6514,
"step": 136000
},
{
"epoch": 21.19236143455985,
"grad_norm": 3.9241297245025635,
"learning_rate": 2.880763856544015e-05,
"loss": 1.662,
"step": 136500
},
{
"epoch": 21.26998913212234,
"grad_norm": 3.836834669113159,
"learning_rate": 2.873001086787766e-05,
"loss": 1.6674,
"step": 137000
},
{
"epoch": 21.34761682968483,
"grad_norm": 4.176065921783447,
"learning_rate": 2.865238317031517e-05,
"loss": 1.6754,
"step": 137500
},
{
"epoch": 21.425244527247322,
"grad_norm": 4.702647686004639,
"learning_rate": 2.857475547275268e-05,
"loss": 1.6841,
"step": 138000
},
{
"epoch": 21.502872224809813,
"grad_norm": 3.71679425239563,
"learning_rate": 2.849712777519019e-05,
"loss": 1.6918,
"step": 138500
},
{
"epoch": 21.5804999223723,
"grad_norm": 4.379159450531006,
"learning_rate": 2.8419500077627698e-05,
"loss": 1.6845,
"step": 139000
},
{
"epoch": 21.658127619934792,
"grad_norm": 3.984041213989258,
"learning_rate": 2.834187238006521e-05,
"loss": 1.7042,
"step": 139500
},
{
"epoch": 21.735755317497283,
"grad_norm": 4.80483865737915,
"learning_rate": 2.8264244682502716e-05,
"loss": 1.7063,
"step": 140000
},
{
"epoch": 21.813383015059774,
"grad_norm": 3.897512674331665,
"learning_rate": 2.8186616984940227e-05,
"loss": 1.697,
"step": 140500
},
{
"epoch": 21.891010712622265,
"grad_norm": 3.8755526542663574,
"learning_rate": 2.8108989287377735e-05,
"loss": 1.6936,
"step": 141000
},
{
"epoch": 21.968638410184752,
"grad_norm": 4.30952262878418,
"learning_rate": 2.8031361589815246e-05,
"loss": 1.7112,
"step": 141500
},
{
"epoch": 22.046266107747243,
"grad_norm": 4.38576602935791,
"learning_rate": 2.795373389225276e-05,
"loss": 1.644,
"step": 142000
},
{
"epoch": 22.123893805309734,
"grad_norm": 4.09429931640625,
"learning_rate": 2.7876106194690264e-05,
"loss": 1.6035,
"step": 142500
},
{
"epoch": 22.201521502872225,
"grad_norm": 4.038272857666016,
"learning_rate": 2.779847849712778e-05,
"loss": 1.6024,
"step": 143000
},
{
"epoch": 22.279149200434716,
"grad_norm": 4.369879245758057,
"learning_rate": 2.7720850799565286e-05,
"loss": 1.6185,
"step": 143500
},
{
"epoch": 22.356776897997204,
"grad_norm": 4.589230537414551,
"learning_rate": 2.7643223102002797e-05,
"loss": 1.6199,
"step": 144000
},
{
"epoch": 22.434404595559695,
"grad_norm": 4.705469608306885,
"learning_rate": 2.7565595404440304e-05,
"loss": 1.6101,
"step": 144500
},
{
"epoch": 22.512032293122186,
"grad_norm": 4.487303256988525,
"learning_rate": 2.7487967706877815e-05,
"loss": 1.6163,
"step": 145000
},
{
"epoch": 22.589659990684677,
"grad_norm": 3.795254945755005,
"learning_rate": 2.7410340009315326e-05,
"loss": 1.6382,
"step": 145500
},
{
"epoch": 22.667287688247168,
"grad_norm": 3.8786396980285645,
"learning_rate": 2.7332712311752834e-05,
"loss": 1.6223,
"step": 146000
},
{
"epoch": 22.744915385809655,
"grad_norm": 4.308375835418701,
"learning_rate": 2.7255084614190345e-05,
"loss": 1.6447,
"step": 146500
},
{
"epoch": 22.822543083372146,
"grad_norm": 4.034188747406006,
"learning_rate": 2.7177456916627852e-05,
"loss": 1.6351,
"step": 147000
},
{
"epoch": 22.900170780934637,
"grad_norm": 4.602024555206299,
"learning_rate": 2.7099829219065363e-05,
"loss": 1.6344,
"step": 147500
},
{
"epoch": 22.977798478497128,
"grad_norm": 4.131753921508789,
"learning_rate": 2.702220152150287e-05,
"loss": 1.6437,
"step": 148000
},
{
"epoch": 23.05542617605962,
"grad_norm": 3.612490177154541,
"learning_rate": 2.6944573823940382e-05,
"loss": 1.5592,
"step": 148500
},
{
"epoch": 23.13305387362211,
"grad_norm": 4.134332656860352,
"learning_rate": 2.6866946126377896e-05,
"loss": 1.5415,
"step": 149000
},
{
"epoch": 23.210681571184598,
"grad_norm": 4.3021321296691895,
"learning_rate": 2.67893184288154e-05,
"loss": 1.5512,
"step": 149500
},
{
"epoch": 23.28830926874709,
"grad_norm": 4.436678886413574,
"learning_rate": 2.6711690731252915e-05,
"loss": 1.5472,
"step": 150000
},
{
"epoch": 23.36593696630958,
"grad_norm": 4.172628402709961,
"learning_rate": 2.6634063033690422e-05,
"loss": 1.5494,
"step": 150500
},
{
"epoch": 23.44356466387207,
"grad_norm": 4.578736782073975,
"learning_rate": 2.6556435336127933e-05,
"loss": 1.5561,
"step": 151000
},
{
"epoch": 23.52119236143456,
"grad_norm": 4.1252336502075195,
"learning_rate": 2.647880763856544e-05,
"loss": 1.5626,
"step": 151500
},
{
"epoch": 23.59882005899705,
"grad_norm": 3.929494619369507,
"learning_rate": 2.640117994100295e-05,
"loss": 1.5769,
"step": 152000
},
{
"epoch": 23.67644775655954,
"grad_norm": 4.310312271118164,
"learning_rate": 2.6323552243440463e-05,
"loss": 1.5716,
"step": 152500
},
{
"epoch": 23.75407545412203,
"grad_norm": 3.970519781112671,
"learning_rate": 2.624592454587797e-05,
"loss": 1.5764,
"step": 153000
},
{
"epoch": 23.831703151684522,
"grad_norm": 3.880556583404541,
"learning_rate": 2.616829684831548e-05,
"loss": 1.5871,
"step": 153500
},
{
"epoch": 23.909330849247013,
"grad_norm": 4.146645545959473,
"learning_rate": 2.609066915075299e-05,
"loss": 1.5869,
"step": 154000
},
{
"epoch": 23.9869585468095,
"grad_norm": 4.036287784576416,
"learning_rate": 2.60130414531905e-05,
"loss": 1.583,
"step": 154500
},
{
"epoch": 24.06458624437199,
"grad_norm": 4.351132869720459,
"learning_rate": 2.5935413755628007e-05,
"loss": 1.4982,
"step": 155000
},
{
"epoch": 24.142213941934482,
"grad_norm": 4.366822242736816,
"learning_rate": 2.5857786058065518e-05,
"loss": 1.4897,
"step": 155500
},
{
"epoch": 24.219841639496973,
"grad_norm": 4.432433128356934,
"learning_rate": 2.5780158360503026e-05,
"loss": 1.4969,
"step": 156000
},
{
"epoch": 24.297469337059464,
"grad_norm": 4.0283613204956055,
"learning_rate": 2.570253066294054e-05,
"loss": 1.4992,
"step": 156500
},
{
"epoch": 24.37509703462195,
"grad_norm": 4.035061359405518,
"learning_rate": 2.562490296537805e-05,
"loss": 1.4968,
"step": 157000
},
{
"epoch": 24.452724732184443,
"grad_norm": 3.834836006164551,
"learning_rate": 2.554727526781556e-05,
"loss": 1.5156,
"step": 157500
},
{
"epoch": 24.530352429746934,
"grad_norm": 4.057690143585205,
"learning_rate": 2.546964757025307e-05,
"loss": 1.5052,
"step": 158000
},
{
"epoch": 24.607980127309425,
"grad_norm": 4.63842248916626,
"learning_rate": 2.5392019872690577e-05,
"loss": 1.5107,
"step": 158500
},
{
"epoch": 24.685607824871916,
"grad_norm": 3.624314069747925,
"learning_rate": 2.5314392175128088e-05,
"loss": 1.5185,
"step": 159000
},
{
"epoch": 24.763235522434403,
"grad_norm": 4.338582515716553,
"learning_rate": 2.5236764477565595e-05,
"loss": 1.5187,
"step": 159500
},
{
"epoch": 24.840863219996894,
"grad_norm": 3.9074742794036865,
"learning_rate": 2.5159136780003106e-05,
"loss": 1.524,
"step": 160000
},
{
"epoch": 24.918490917559385,
"grad_norm": 3.97880482673645,
"learning_rate": 2.5081509082440617e-05,
"loss": 1.5278,
"step": 160500
},
{
"epoch": 24.996118615121876,
"grad_norm": 4.298096656799316,
"learning_rate": 2.5003881384878125e-05,
"loss": 1.5267,
"step": 161000
},
{
"epoch": 25.073746312684367,
"grad_norm": 3.85455322265625,
"learning_rate": 2.4926253687315636e-05,
"loss": 1.442,
"step": 161500
},
{
"epoch": 25.151374010246855,
"grad_norm": 3.907085418701172,
"learning_rate": 2.4848625989753147e-05,
"loss": 1.4262,
"step": 162000
},
{
"epoch": 25.229001707809346,
"grad_norm": 4.488945484161377,
"learning_rate": 2.4770998292190654e-05,
"loss": 1.4391,
"step": 162500
},
{
"epoch": 25.306629405371837,
"grad_norm": 4.565778732299805,
"learning_rate": 2.4693370594628165e-05,
"loss": 1.447,
"step": 163000
},
{
"epoch": 25.384257102934328,
"grad_norm": 4.2508015632629395,
"learning_rate": 2.4615742897065676e-05,
"loss": 1.4442,
"step": 163500
},
{
"epoch": 25.46188480049682,
"grad_norm": 4.572117328643799,
"learning_rate": 2.4538115199503184e-05,
"loss": 1.4495,
"step": 164000
},
{
"epoch": 25.539512498059306,
"grad_norm": 4.516686916351318,
"learning_rate": 2.4460487501940695e-05,
"loss": 1.4625,
"step": 164500
},
{
"epoch": 25.617140195621797,
"grad_norm": 4.200167655944824,
"learning_rate": 2.4382859804378202e-05,
"loss": 1.4614,
"step": 165000
},
{
"epoch": 25.694767893184288,
"grad_norm": 3.777397632598877,
"learning_rate": 2.4305232106815713e-05,
"loss": 1.4632,
"step": 165500
},
{
"epoch": 25.77239559074678,
"grad_norm": 4.383970737457275,
"learning_rate": 2.4227604409253224e-05,
"loss": 1.4773,
"step": 166000
},
{
"epoch": 25.85002328830927,
"grad_norm": 4.216927528381348,
"learning_rate": 2.4149976711690735e-05,
"loss": 1.4794,
"step": 166500
},
{
"epoch": 25.927650985871757,
"grad_norm": 5.53390645980835,
"learning_rate": 2.4072349014128243e-05,
"loss": 1.4685,
"step": 167000
},
{
"epoch": 26.00527868343425,
"grad_norm": 3.9746012687683105,
"learning_rate": 2.3994721316565753e-05,
"loss": 1.4873,
"step": 167500
},
{
"epoch": 26.08290638099674,
"grad_norm": 4.278408527374268,
"learning_rate": 2.391709361900326e-05,
"loss": 1.3877,
"step": 168000
},
{
"epoch": 26.16053407855923,
"grad_norm": 4.082756042480469,
"learning_rate": 2.3839465921440772e-05,
"loss": 1.3938,
"step": 168500
},
{
"epoch": 26.23816177612172,
"grad_norm": 3.929353713989258,
"learning_rate": 2.376183822387828e-05,
"loss": 1.3903,
"step": 169000
},
{
"epoch": 26.31578947368421,
"grad_norm": 4.400444030761719,
"learning_rate": 2.368421052631579e-05,
"loss": 1.4032,
"step": 169500
},
{
"epoch": 26.3934171712467,
"grad_norm": 4.266624450683594,
"learning_rate": 2.36065828287533e-05,
"loss": 1.4028,
"step": 170000
},
{
"epoch": 26.47104486880919,
"grad_norm": 4.547267913818359,
"learning_rate": 2.3528955131190812e-05,
"loss": 1.4043,
"step": 170500
},
{
"epoch": 26.548672566371682,
"grad_norm": 4.04599666595459,
"learning_rate": 2.345132743362832e-05,
"loss": 1.4047,
"step": 171000
},
{
"epoch": 26.626300263934173,
"grad_norm": 4.308363437652588,
"learning_rate": 2.337369973606583e-05,
"loss": 1.4154,
"step": 171500
},
{
"epoch": 26.70392796149666,
"grad_norm": 3.774397373199463,
"learning_rate": 2.329607203850334e-05,
"loss": 1.4127,
"step": 172000
},
{
"epoch": 26.78155565905915,
"grad_norm": 4.222719669342041,
"learning_rate": 2.321844434094085e-05,
"loss": 1.4149,
"step": 172500
},
{
"epoch": 26.859183356621642,
"grad_norm": 4.3920135498046875,
"learning_rate": 2.3140816643378357e-05,
"loss": 1.4238,
"step": 173000
},
{
"epoch": 26.936811054184133,
"grad_norm": 4.5161213874816895,
"learning_rate": 2.306318894581587e-05,
"loss": 1.4232,
"step": 173500
},
{
"epoch": 27.014438751746624,
"grad_norm": 4.091419696807861,
"learning_rate": 2.298556124825338e-05,
"loss": 1.412,
"step": 174000
},
{
"epoch": 27.092066449309115,
"grad_norm": 4.063779830932617,
"learning_rate": 2.290793355069089e-05,
"loss": 1.3344,
"step": 174500
},
{
"epoch": 27.169694146871603,
"grad_norm": 4.165656089782715,
"learning_rate": 2.2830305853128397e-05,
"loss": 1.3348,
"step": 175000
},
{
"epoch": 27.247321844434094,
"grad_norm": 4.288286209106445,
"learning_rate": 2.2752678155565908e-05,
"loss": 1.3389,
"step": 175500
},
{
"epoch": 27.324949541996585,
"grad_norm": 4.2835211753845215,
"learning_rate": 2.2675050458003416e-05,
"loss": 1.3493,
"step": 176000
},
{
"epoch": 27.402577239559076,
"grad_norm": 4.381802558898926,
"learning_rate": 2.2597422760440927e-05,
"loss": 1.358,
"step": 176500
},
{
"epoch": 27.480204937121567,
"grad_norm": 4.263532638549805,
"learning_rate": 2.2519795062878434e-05,
"loss": 1.3632,
"step": 177000
},
{
"epoch": 27.557832634684054,
"grad_norm": 4.2341742515563965,
"learning_rate": 2.244216736531595e-05,
"loss": 1.3734,
"step": 177500
},
{
"epoch": 27.635460332246545,
"grad_norm": 3.9163522720336914,
"learning_rate": 2.2364539667753456e-05,
"loss": 1.3658,
"step": 178000
},
{
"epoch": 27.713088029809036,
"grad_norm": 4.0479841232299805,
"learning_rate": 2.2286911970190967e-05,
"loss": 1.3593,
"step": 178500
},
{
"epoch": 27.790715727371527,
"grad_norm": 5.027287483215332,
"learning_rate": 2.2209284272628475e-05,
"loss": 1.3869,
"step": 179000
},
{
"epoch": 27.868343424934018,
"grad_norm": 4.199400424957275,
"learning_rate": 2.2131656575065985e-05,
"loss": 1.3882,
"step": 179500
},
{
"epoch": 27.945971122496506,
"grad_norm": 3.9147210121154785,
"learning_rate": 2.2054028877503493e-05,
"loss": 1.3781,
"step": 180000
},
{
"epoch": 28.023598820058996,
"grad_norm": 4.450961112976074,
"learning_rate": 2.1976401179941004e-05,
"loss": 1.3514,
"step": 180500
},
{
"epoch": 28.101226517621487,
"grad_norm": 4.467356204986572,
"learning_rate": 2.189877348237851e-05,
"loss": 1.2839,
"step": 181000
},
{
"epoch": 28.17885421518398,
"grad_norm": 4.179466247558594,
"learning_rate": 2.1821145784816026e-05,
"loss": 1.3017,
"step": 181500
},
{
"epoch": 28.25648191274647,
"grad_norm": 3.7988483905792236,
"learning_rate": 2.1743518087253533e-05,
"loss": 1.3177,
"step": 182000
},
{
"epoch": 28.334109610308957,
"grad_norm": 3.9721014499664307,
"learning_rate": 2.1665890389691044e-05,
"loss": 1.302,
"step": 182500
},
{
"epoch": 28.411737307871448,
"grad_norm": 4.474249362945557,
"learning_rate": 2.1588262692128552e-05,
"loss": 1.3053,
"step": 183000
},
{
"epoch": 28.48936500543394,
"grad_norm": 4.546684741973877,
"learning_rate": 2.1510634994566063e-05,
"loss": 1.3231,
"step": 183500
},
{
"epoch": 28.56699270299643,
"grad_norm": 4.715445518493652,
"learning_rate": 2.143300729700357e-05,
"loss": 1.3305,
"step": 184000
},
{
"epoch": 28.64462040055892,
"grad_norm": 4.777371406555176,
"learning_rate": 2.135537959944108e-05,
"loss": 1.3231,
"step": 184500
},
{
"epoch": 28.72224809812141,
"grad_norm": 4.404980182647705,
"learning_rate": 2.1277751901878592e-05,
"loss": 1.3266,
"step": 185000
},
{
"epoch": 28.7998757956839,
"grad_norm": 4.121158599853516,
"learning_rate": 2.1200124204316103e-05,
"loss": 1.3326,
"step": 185500
},
{
"epoch": 28.87750349324639,
"grad_norm": 4.212721824645996,
"learning_rate": 2.112249650675361e-05,
"loss": 1.3239,
"step": 186000
},
{
"epoch": 28.95513119080888,
"grad_norm": 3.941192626953125,
"learning_rate": 2.104486880919112e-05,
"loss": 1.337,
"step": 186500
},
{
"epoch": 29.032758888371372,
"grad_norm": 4.226070404052734,
"learning_rate": 2.096724111162863e-05,
"loss": 1.2999,
"step": 187000
},
{
"epoch": 29.11038658593386,
"grad_norm": 4.37491512298584,
"learning_rate": 2.088961341406614e-05,
"loss": 1.2449,
"step": 187500
},
{
"epoch": 29.18801428349635,
"grad_norm": 4.1313347816467285,
"learning_rate": 2.0811985716503648e-05,
"loss": 1.2655,
"step": 188000
},
{
"epoch": 29.26564198105884,
"grad_norm": 4.144821643829346,
"learning_rate": 2.073435801894116e-05,
"loss": 1.2701,
"step": 188500
},
{
"epoch": 29.343269678621333,
"grad_norm": 4.262469291687012,
"learning_rate": 2.065673032137867e-05,
"loss": 1.2671,
"step": 189000
},
{
"epoch": 29.420897376183824,
"grad_norm": 4.0824761390686035,
"learning_rate": 2.057910262381618e-05,
"loss": 1.2757,
"step": 189500
},
{
"epoch": 29.49852507374631,
"grad_norm": 4.00981330871582,
"learning_rate": 2.0501474926253688e-05,
"loss": 1.275,
"step": 190000
},
{
"epoch": 29.576152771308802,
"grad_norm": 4.502607822418213,
"learning_rate": 2.04238472286912e-05,
"loss": 1.278,
"step": 190500
},
{
"epoch": 29.653780468871293,
"grad_norm": 4.623337745666504,
"learning_rate": 2.0346219531128707e-05,
"loss": 1.2805,
"step": 191000
},
{
"epoch": 29.731408166433784,
"grad_norm": 4.471139430999756,
"learning_rate": 2.0268591833566218e-05,
"loss": 1.2761,
"step": 191500
},
{
"epoch": 29.809035863996275,
"grad_norm": 4.283520698547363,
"learning_rate": 2.0190964136003725e-05,
"loss": 1.2907,
"step": 192000
},
{
"epoch": 29.886663561558763,
"grad_norm": 4.755760192871094,
"learning_rate": 2.011333643844124e-05,
"loss": 1.2887,
"step": 192500
},
{
"epoch": 29.964291259121254,
"grad_norm": 4.386314392089844,
"learning_rate": 2.0035708740878747e-05,
"loss": 1.2949,
"step": 193000
},
{
"epoch": 30.041918956683745,
"grad_norm": 4.468728542327881,
"learning_rate": 1.9958081043316258e-05,
"loss": 1.2377,
"step": 193500
},
{
"epoch": 30.119546654246236,
"grad_norm": 4.082640171051025,
"learning_rate": 1.9880453345753765e-05,
"loss": 1.2118,
"step": 194000
},
{
"epoch": 30.197174351808727,
"grad_norm": 4.6380205154418945,
"learning_rate": 1.9802825648191276e-05,
"loss": 1.2211,
"step": 194500
},
{
"epoch": 30.274802049371214,
"grad_norm": 4.422779083251953,
"learning_rate": 1.9725197950628784e-05,
"loss": 1.2255,
"step": 195000
},
{
"epoch": 30.352429746933705,
"grad_norm": 4.414443016052246,
"learning_rate": 1.9647570253066295e-05,
"loss": 1.2277,
"step": 195500
},
{
"epoch": 30.430057444496196,
"grad_norm": 4.212508201599121,
"learning_rate": 1.9569942555503802e-05,
"loss": 1.236,
"step": 196000
},
{
"epoch": 30.507685142058687,
"grad_norm": 4.3478803634643555,
"learning_rate": 1.9492314857941317e-05,
"loss": 1.2387,
"step": 196500
},
{
"epoch": 30.585312839621178,
"grad_norm": 5.213949203491211,
"learning_rate": 1.9414687160378824e-05,
"loss": 1.2434,
"step": 197000
},
{
"epoch": 30.662940537183665,
"grad_norm": 3.907501459121704,
"learning_rate": 1.9337059462816335e-05,
"loss": 1.2415,
"step": 197500
},
{
"epoch": 30.740568234746156,
"grad_norm": 4.092105865478516,
"learning_rate": 1.9259431765253843e-05,
"loss": 1.2515,
"step": 198000
},
{
"epoch": 30.818195932308647,
"grad_norm": 4.422701835632324,
"learning_rate": 1.9181804067691354e-05,
"loss": 1.2554,
"step": 198500
},
{
"epoch": 30.89582362987114,
"grad_norm": 4.132325172424316,
"learning_rate": 1.910417637012886e-05,
"loss": 1.2607,
"step": 199000
},
{
"epoch": 30.97345132743363,
"grad_norm": 4.294840335845947,
"learning_rate": 1.9026548672566372e-05,
"loss": 1.2457,
"step": 199500
},
{
"epoch": 31.05107902499612,
"grad_norm": 4.593545913696289,
"learning_rate": 1.894892097500388e-05,
"loss": 1.1969,
"step": 200000
},
{
"epoch": 31.128706722558608,
"grad_norm": 3.965829610824585,
"learning_rate": 1.8871293277441394e-05,
"loss": 1.1812,
"step": 200500
},
{
"epoch": 31.2063344201211,
"grad_norm": 4.391860008239746,
"learning_rate": 1.87936655798789e-05,
"loss": 1.1764,
"step": 201000
},
{
"epoch": 31.28396211768359,
"grad_norm": 4.370110511779785,
"learning_rate": 1.8716037882316413e-05,
"loss": 1.1804,
"step": 201500
},
{
"epoch": 31.36158981524608,
"grad_norm": 4.167665958404541,
"learning_rate": 1.863841018475392e-05,
"loss": 1.1993,
"step": 202000
},
{
"epoch": 31.439217512808572,
"grad_norm": 4.17106294631958,
"learning_rate": 1.856078248719143e-05,
"loss": 1.1915,
"step": 202500
},
{
"epoch": 31.51684521037106,
"grad_norm": 4.328006267547607,
"learning_rate": 1.848315478962894e-05,
"loss": 1.2023,
"step": 203000
},
{
"epoch": 31.59447290793355,
"grad_norm": 4.033382415771484,
"learning_rate": 1.840552709206645e-05,
"loss": 1.2049,
"step": 203500
},
{
"epoch": 31.67210060549604,
"grad_norm": 4.497017860412598,
"learning_rate": 1.832789939450396e-05,
"loss": 1.2005,
"step": 204000
},
{
"epoch": 31.749728303058532,
"grad_norm": 4.34217643737793,
"learning_rate": 1.825027169694147e-05,
"loss": 1.1972,
"step": 204500
},
{
"epoch": 31.827356000621023,
"grad_norm": 4.198293209075928,
"learning_rate": 1.817264399937898e-05,
"loss": 1.2119,
"step": 205000
},
{
"epoch": 31.90498369818351,
"grad_norm": 4.584846019744873,
"learning_rate": 1.809501630181649e-05,
"loss": 1.2265,
"step": 205500
},
{
"epoch": 31.982611395746,
"grad_norm": 4.147974014282227,
"learning_rate": 1.8017388604253997e-05,
"loss": 1.231,
"step": 206000
},
{
"epoch": 32.06023909330849,
"grad_norm": 4.133516311645508,
"learning_rate": 1.793976090669151e-05,
"loss": 1.1624,
"step": 206500
},
{
"epoch": 32.13786679087098,
"grad_norm": 3.903019905090332,
"learning_rate": 1.7862133209129016e-05,
"loss": 1.1447,
"step": 207000
},
{
"epoch": 32.21549448843347,
"grad_norm": 4.349834442138672,
"learning_rate": 1.7784505511566527e-05,
"loss": 1.1472,
"step": 207500
},
{
"epoch": 32.29312218599596,
"grad_norm": 5.044727325439453,
"learning_rate": 1.7706877814004038e-05,
"loss": 1.1497,
"step": 208000
},
{
"epoch": 32.37074988355845,
"grad_norm": 4.564863681793213,
"learning_rate": 1.762925011644155e-05,
"loss": 1.1568,
"step": 208500
},
{
"epoch": 32.448377581120944,
"grad_norm": 4.659034252166748,
"learning_rate": 1.7551622418879056e-05,
"loss": 1.1652,
"step": 209000
},
{
"epoch": 32.526005278683435,
"grad_norm": 4.484036445617676,
"learning_rate": 1.7473994721316567e-05,
"loss": 1.1689,
"step": 209500
},
{
"epoch": 32.603632976245926,
"grad_norm": 3.8715898990631104,
"learning_rate": 1.7396367023754075e-05,
"loss": 1.1625,
"step": 210000
},
{
"epoch": 32.68126067380842,
"grad_norm": 4.791990280151367,
"learning_rate": 1.7318739326191586e-05,
"loss": 1.1649,
"step": 210500
},
{
"epoch": 32.75888837137091,
"grad_norm": 4.657315254211426,
"learning_rate": 1.7241111628629093e-05,
"loss": 1.1658,
"step": 211000
},
{
"epoch": 32.83651606893339,
"grad_norm": 4.780379295349121,
"learning_rate": 1.7163483931066604e-05,
"loss": 1.1789,
"step": 211500
},
{
"epoch": 32.91414376649588,
"grad_norm": 4.298798561096191,
"learning_rate": 1.7085856233504115e-05,
"loss": 1.1873,
"step": 212000
},
{
"epoch": 32.991771464058374,
"grad_norm": 4.570270538330078,
"learning_rate": 1.7008228535941626e-05,
"loss": 1.1736,
"step": 212500
},
{
"epoch": 33.069399161620865,
"grad_norm": 4.421665191650391,
"learning_rate": 1.6930600838379134e-05,
"loss": 1.1079,
"step": 213000
},
{
"epoch": 33.147026859183356,
"grad_norm": 4.232321739196777,
"learning_rate": 1.6852973140816645e-05,
"loss": 1.0986,
"step": 213500
},
{
"epoch": 33.22465455674585,
"grad_norm": 4.439553737640381,
"learning_rate": 1.6775345443254152e-05,
"loss": 1.114,
"step": 214000
},
{
"epoch": 33.30228225430834,
"grad_norm": 3.9282166957855225,
"learning_rate": 1.6697717745691663e-05,
"loss": 1.1229,
"step": 214500
},
{
"epoch": 33.37990995187083,
"grad_norm": 4.5075907707214355,
"learning_rate": 1.662009004812917e-05,
"loss": 1.1298,
"step": 215000
},
{
"epoch": 33.45753764943332,
"grad_norm": 4.296872138977051,
"learning_rate": 1.6542462350566685e-05,
"loss": 1.1271,
"step": 215500
},
{
"epoch": 33.53516534699581,
"grad_norm": 3.8833069801330566,
"learning_rate": 1.6464834653004193e-05,
"loss": 1.1334,
"step": 216000
},
{
"epoch": 33.6127930445583,
"grad_norm": 4.518033027648926,
"learning_rate": 1.6387206955441703e-05,
"loss": 1.1251,
"step": 216500
},
{
"epoch": 33.690420742120786,
"grad_norm": 4.618717670440674,
"learning_rate": 1.630957925787921e-05,
"loss": 1.137,
"step": 217000
},
{
"epoch": 33.76804843968328,
"grad_norm": 4.346001148223877,
"learning_rate": 1.6231951560316722e-05,
"loss": 1.1439,
"step": 217500
},
{
"epoch": 33.84567613724577,
"grad_norm": 4.203965663909912,
"learning_rate": 1.615432386275423e-05,
"loss": 1.1424,
"step": 218000
},
{
"epoch": 33.92330383480826,
"grad_norm": 4.829082489013672,
"learning_rate": 1.607669616519174e-05,
"loss": 1.1476,
"step": 218500
},
{
"epoch": 34.00093153237075,
"grad_norm": 4.414132118225098,
"learning_rate": 1.5999068467629248e-05,
"loss": 1.1452,
"step": 219000
},
{
"epoch": 34.07855922993324,
"grad_norm": 4.220102787017822,
"learning_rate": 1.5921440770066762e-05,
"loss": 1.0785,
"step": 219500
},
{
"epoch": 34.15618692749573,
"grad_norm": 4.156444549560547,
"learning_rate": 1.584381307250427e-05,
"loss": 1.0781,
"step": 220000
},
{
"epoch": 34.23381462505822,
"grad_norm": 3.997420072555542,
"learning_rate": 1.576618537494178e-05,
"loss": 1.0911,
"step": 220500
},
{
"epoch": 34.311442322620714,
"grad_norm": 4.4925537109375,
"learning_rate": 1.568855767737929e-05,
"loss": 1.0861,
"step": 221000
},
{
"epoch": 34.389070020183205,
"grad_norm": 4.4098615646362305,
"learning_rate": 1.56109299798168e-05,
"loss": 1.0984,
"step": 221500
},
{
"epoch": 34.46669771774569,
"grad_norm": 4.235119819641113,
"learning_rate": 1.5533302282254307e-05,
"loss": 1.0945,
"step": 222000
},
{
"epoch": 34.54432541530818,
"grad_norm": 4.796499729156494,
"learning_rate": 1.5455674584691818e-05,
"loss": 1.0973,
"step": 222500
},
{
"epoch": 34.62195311287067,
"grad_norm": 4.959954261779785,
"learning_rate": 1.537804688712933e-05,
"loss": 1.0978,
"step": 223000
},
{
"epoch": 34.69958081043316,
"grad_norm": 4.675489902496338,
"learning_rate": 1.530041918956684e-05,
"loss": 1.1047,
"step": 223500
},
{
"epoch": 34.77720850799565,
"grad_norm": 4.466859340667725,
"learning_rate": 1.5222791492004349e-05,
"loss": 1.093,
"step": 224000
},
{
"epoch": 34.85483620555814,
"grad_norm": 4.607345104217529,
"learning_rate": 1.5145163794441858e-05,
"loss": 1.1098,
"step": 224500
},
{
"epoch": 34.932463903120635,
"grad_norm": 3.9733870029449463,
"learning_rate": 1.5067536096879367e-05,
"loss": 1.1199,
"step": 225000
},
{
"epoch": 35.010091600683126,
"grad_norm": 4.052885055541992,
"learning_rate": 1.4989908399316877e-05,
"loss": 1.1009,
"step": 225500
},
{
"epoch": 35.08771929824562,
"grad_norm": 4.508426189422607,
"learning_rate": 1.4912280701754386e-05,
"loss": 1.0394,
"step": 226000
},
{
"epoch": 35.16534699580811,
"grad_norm": 4.186591148376465,
"learning_rate": 1.4834653004191895e-05,
"loss": 1.0526,
"step": 226500
},
{
"epoch": 35.24297469337059,
"grad_norm": 4.583897590637207,
"learning_rate": 1.4757025306629408e-05,
"loss": 1.0492,
"step": 227000
},
{
"epoch": 35.32060239093308,
"grad_norm": 4.202432155609131,
"learning_rate": 1.4679397609066917e-05,
"loss": 1.0575,
"step": 227500
},
{
"epoch": 35.39823008849557,
"grad_norm": 4.248536586761475,
"learning_rate": 1.4601769911504426e-05,
"loss": 1.0694,
"step": 228000
},
{
"epoch": 35.475857786058064,
"grad_norm": 4.490120887756348,
"learning_rate": 1.4524142213941935e-05,
"loss": 1.0661,
"step": 228500
},
{
"epoch": 35.553485483620555,
"grad_norm": 4.558992862701416,
"learning_rate": 1.4446514516379445e-05,
"loss": 1.0683,
"step": 229000
},
{
"epoch": 35.631113181183046,
"grad_norm": 4.340649127960205,
"learning_rate": 1.4368886818816954e-05,
"loss": 1.0733,
"step": 229500
},
{
"epoch": 35.70874087874554,
"grad_norm": 4.814639091491699,
"learning_rate": 1.4291259121254463e-05,
"loss": 1.0699,
"step": 230000
},
{
"epoch": 35.78636857630803,
"grad_norm": 5.107011795043945,
"learning_rate": 1.4213631423691972e-05,
"loss": 1.0785,
"step": 230500
},
{
"epoch": 35.86399627387052,
"grad_norm": 4.92033052444458,
"learning_rate": 1.4136003726129485e-05,
"loss": 1.0779,
"step": 231000
},
{
"epoch": 35.94162397143301,
"grad_norm": 5.033237457275391,
"learning_rate": 1.4058376028566994e-05,
"loss": 1.0863,
"step": 231500
},
{
"epoch": 36.019251668995494,
"grad_norm": 4.0776591300964355,
"learning_rate": 1.3980748331004504e-05,
"loss": 1.0703,
"step": 232000
},
{
"epoch": 36.096879366557985,
"grad_norm": 4.491557598114014,
"learning_rate": 1.3903120633442013e-05,
"loss": 1.0207,
"step": 232500
},
{
"epoch": 36.174507064120476,
"grad_norm": 4.444462299346924,
"learning_rate": 1.3825492935879522e-05,
"loss": 1.0357,
"step": 233000
},
{
"epoch": 36.25213476168297,
"grad_norm": 4.559656143188477,
"learning_rate": 1.3747865238317031e-05,
"loss": 1.0295,
"step": 233500
},
{
"epoch": 36.32976245924546,
"grad_norm": 4.09979248046875,
"learning_rate": 1.367023754075454e-05,
"loss": 1.0142,
"step": 234000
},
{
"epoch": 36.40739015680795,
"grad_norm": 4.5045084953308105,
"learning_rate": 1.3592609843192053e-05,
"loss": 1.0292,
"step": 234500
},
{
"epoch": 36.48501785437044,
"grad_norm": 5.544869422912598,
"learning_rate": 1.3514982145629562e-05,
"loss": 1.0371,
"step": 235000
},
{
"epoch": 36.56264555193293,
"grad_norm": 4.618766784667969,
"learning_rate": 1.3437354448067072e-05,
"loss": 1.0376,
"step": 235500
},
{
"epoch": 36.64027324949542,
"grad_norm": 4.791065216064453,
"learning_rate": 1.3359726750504581e-05,
"loss": 1.0438,
"step": 236000
},
{
"epoch": 36.71790094705791,
"grad_norm": 4.122102737426758,
"learning_rate": 1.328209905294209e-05,
"loss": 1.0462,
"step": 236500
},
{
"epoch": 36.7955286446204,
"grad_norm": 4.137369632720947,
"learning_rate": 1.32044713553796e-05,
"loss": 1.0444,
"step": 237000
},
{
"epoch": 36.87315634218289,
"grad_norm": 4.59998083114624,
"learning_rate": 1.3126843657817109e-05,
"loss": 1.0508,
"step": 237500
},
{
"epoch": 36.95078403974538,
"grad_norm": 4.751966953277588,
"learning_rate": 1.3049215960254618e-05,
"loss": 1.0474,
"step": 238000
},
{
"epoch": 37.02841173730787,
"grad_norm": 4.363110065460205,
"learning_rate": 1.297158826269213e-05,
"loss": 1.026,
"step": 238500
},
{
"epoch": 37.10603943487036,
"grad_norm": 5.005125045776367,
"learning_rate": 1.289396056512964e-05,
"loss": 0.9971,
"step": 239000
},
{
"epoch": 37.18366713243285,
"grad_norm": 4.143869400024414,
"learning_rate": 1.2816332867567149e-05,
"loss": 0.9877,
"step": 239500
},
{
"epoch": 37.26129482999534,
"grad_norm": 4.527329444885254,
"learning_rate": 1.2738705170004658e-05,
"loss": 0.9914,
"step": 240000
},
{
"epoch": 37.338922527557834,
"grad_norm": 3.8393781185150146,
"learning_rate": 1.2661077472442168e-05,
"loss": 1.0098,
"step": 240500
},
{
"epoch": 37.416550225120325,
"grad_norm": 4.1036295890808105,
"learning_rate": 1.2583449774879677e-05,
"loss": 1.0058,
"step": 241000
},
{
"epoch": 37.494177922682816,
"grad_norm": 4.97705078125,
"learning_rate": 1.2505822077317186e-05,
"loss": 1.0098,
"step": 241500
},
{
"epoch": 37.57180562024531,
"grad_norm": 4.289205074310303,
"learning_rate": 1.2428194379754697e-05,
"loss": 1.0117,
"step": 242000
},
{
"epoch": 37.64943331780779,
"grad_norm": 4.353816509246826,
"learning_rate": 1.2350566682192206e-05,
"loss": 1.0162,
"step": 242500
},
{
"epoch": 37.72706101537028,
"grad_norm": 4.447281837463379,
"learning_rate": 1.2272938984629717e-05,
"loss": 1.0202,
"step": 243000
},
{
"epoch": 37.80468871293277,
"grad_norm": 4.254565715789795,
"learning_rate": 1.2195311287067226e-05,
"loss": 1.0252,
"step": 243500
},
{
"epoch": 37.882316410495264,
"grad_norm": 4.382399559020996,
"learning_rate": 1.2117683589504736e-05,
"loss": 1.023,
"step": 244000
},
{
"epoch": 37.959944108057755,
"grad_norm": 4.591485977172852,
"learning_rate": 1.2040055891942245e-05,
"loss": 1.024,
"step": 244500
},
{
"epoch": 38.037571805620246,
"grad_norm": 4.238889217376709,
"learning_rate": 1.1962428194379756e-05,
"loss": 0.996,
"step": 245000
},
{
"epoch": 38.11519950318274,
"grad_norm": 5.276005268096924,
"learning_rate": 1.1884800496817265e-05,
"loss": 0.97,
"step": 245500
},
{
"epoch": 38.19282720074523,
"grad_norm": 4.318702697753906,
"learning_rate": 1.1807172799254774e-05,
"loss": 0.9679,
"step": 246000
},
{
"epoch": 38.27045489830772,
"grad_norm": 4.6534504890441895,
"learning_rate": 1.1729545101692284e-05,
"loss": 0.9754,
"step": 246500
},
{
"epoch": 38.34808259587021,
"grad_norm": 4.487671375274658,
"learning_rate": 1.1651917404129794e-05,
"loss": 0.9771,
"step": 247000
},
{
"epoch": 38.425710293432694,
"grad_norm": 4.206161975860596,
"learning_rate": 1.1574289706567304e-05,
"loss": 0.9824,
"step": 247500
},
{
"epoch": 38.503337990995185,
"grad_norm": 4.533993721008301,
"learning_rate": 1.1496662009004813e-05,
"loss": 0.98,
"step": 248000
},
{
"epoch": 38.580965688557676,
"grad_norm": 4.58768892288208,
"learning_rate": 1.1419034311442322e-05,
"loss": 0.9891,
"step": 248500
},
{
"epoch": 38.65859338612017,
"grad_norm": 4.578085422515869,
"learning_rate": 1.1341406613879833e-05,
"loss": 0.9912,
"step": 249000
},
{
"epoch": 38.73622108368266,
"grad_norm": 4.549184799194336,
"learning_rate": 1.1263778916317342e-05,
"loss": 0.998,
"step": 249500
},
{
"epoch": 38.81384878124515,
"grad_norm": 4.277008056640625,
"learning_rate": 1.1186151218754852e-05,
"loss": 0.9872,
"step": 250000
},
{
"epoch": 38.89147647880764,
"grad_norm": 4.436850070953369,
"learning_rate": 1.1108523521192361e-05,
"loss": 0.9902,
"step": 250500
},
{
"epoch": 38.96910417637013,
"grad_norm": 4.574080944061279,
"learning_rate": 1.1030895823629872e-05,
"loss": 1.0062,
"step": 251000
},
{
"epoch": 39.04673187393262,
"grad_norm": 4.431211471557617,
"learning_rate": 1.0953268126067381e-05,
"loss": 0.9653,
"step": 251500
},
{
"epoch": 39.12435957149511,
"grad_norm": 4.642630100250244,
"learning_rate": 1.087564042850489e-05,
"loss": 0.9415,
"step": 252000
},
{
"epoch": 39.2019872690576,
"grad_norm": 4.911776065826416,
"learning_rate": 1.0798012730942401e-05,
"loss": 0.9479,
"step": 252500
},
{
"epoch": 39.27961496662009,
"grad_norm": 4.803096771240234,
"learning_rate": 1.072038503337991e-05,
"loss": 0.9548,
"step": 253000
},
{
"epoch": 39.35724266418258,
"grad_norm": 4.382226943969727,
"learning_rate": 1.064275733581742e-05,
"loss": 0.9501,
"step": 253500
},
{
"epoch": 39.43487036174507,
"grad_norm": 4.663143634796143,
"learning_rate": 1.0565129638254929e-05,
"loss": 0.9671,
"step": 254000
},
{
"epoch": 39.51249805930756,
"grad_norm": 4.334278106689453,
"learning_rate": 1.048750194069244e-05,
"loss": 0.9637,
"step": 254500
},
{
"epoch": 39.59012575687005,
"grad_norm": 4.499300956726074,
"learning_rate": 1.040987424312995e-05,
"loss": 0.959,
"step": 255000
},
{
"epoch": 39.66775345443254,
"grad_norm": 4.04175329208374,
"learning_rate": 1.0332246545567458e-05,
"loss": 0.9625,
"step": 255500
},
{
"epoch": 39.74538115199503,
"grad_norm": 4.483138084411621,
"learning_rate": 1.0254618848004968e-05,
"loss": 0.9654,
"step": 256000
},
{
"epoch": 39.823008849557525,
"grad_norm": 4.5711140632629395,
"learning_rate": 1.0176991150442479e-05,
"loss": 0.9705,
"step": 256500
},
{
"epoch": 39.900636547120016,
"grad_norm": 4.339575290679932,
"learning_rate": 1.0099363452879988e-05,
"loss": 0.971,
"step": 257000
},
{
"epoch": 39.9782642446825,
"grad_norm": 4.528174877166748,
"learning_rate": 1.0021735755317497e-05,
"loss": 0.9714,
"step": 257500
},
{
"epoch": 40.05589194224499,
"grad_norm": 4.42559289932251,
"learning_rate": 9.944108057755006e-06,
"loss": 0.9325,
"step": 258000
},
{
"epoch": 40.13351963980748,
"grad_norm": 4.588589191436768,
"learning_rate": 9.866480360192517e-06,
"loss": 0.9248,
"step": 258500
},
{
"epoch": 40.21114733736997,
"grad_norm": 5.253052711486816,
"learning_rate": 9.788852662630027e-06,
"loss": 0.9285,
"step": 259000
},
{
"epoch": 40.28877503493246,
"grad_norm": 4.5551042556762695,
"learning_rate": 9.711224965067536e-06,
"loss": 0.9384,
"step": 259500
},
{
"epoch": 40.366402732494954,
"grad_norm": 4.9546990394592285,
"learning_rate": 9.633597267505045e-06,
"loss": 0.9332,
"step": 260000
},
{
"epoch": 40.444030430057445,
"grad_norm": 4.840395450592041,
"learning_rate": 9.555969569942556e-06,
"loss": 0.9321,
"step": 260500
},
{
"epoch": 40.521658127619936,
"grad_norm": 4.765369415283203,
"learning_rate": 9.478341872380065e-06,
"loss": 0.9366,
"step": 261000
},
{
"epoch": 40.59928582518243,
"grad_norm": 4.869214057922363,
"learning_rate": 9.400714174817574e-06,
"loss": 0.9419,
"step": 261500
},
{
"epoch": 40.67691352274492,
"grad_norm": 4.868770599365234,
"learning_rate": 9.323086477255084e-06,
"loss": 0.9431,
"step": 262000
},
{
"epoch": 40.7545412203074,
"grad_norm": 5.142333030700684,
"learning_rate": 9.245458779692595e-06,
"loss": 0.9455,
"step": 262500
},
{
"epoch": 40.83216891786989,
"grad_norm": 4.263994216918945,
"learning_rate": 9.167831082130104e-06,
"loss": 0.9497,
"step": 263000
},
{
"epoch": 40.909796615432384,
"grad_norm": 4.486149311065674,
"learning_rate": 9.090203384567613e-06,
"loss": 0.9484,
"step": 263500
},
{
"epoch": 40.987424312994875,
"grad_norm": 4.359130859375,
"learning_rate": 9.012575687005124e-06,
"loss": 0.9441,
"step": 264000
},
{
"epoch": 41.065052010557366,
"grad_norm": 4.38929557800293,
"learning_rate": 8.934947989442633e-06,
"loss": 0.9057,
"step": 264500
},
{
"epoch": 41.14267970811986,
"grad_norm": 4.379587650299072,
"learning_rate": 8.857320291880143e-06,
"loss": 0.9024,
"step": 265000
},
{
"epoch": 41.22030740568235,
"grad_norm": 4.549973964691162,
"learning_rate": 8.779692594317652e-06,
"loss": 0.9116,
"step": 265500
},
{
"epoch": 41.29793510324484,
"grad_norm": 4.387326240539551,
"learning_rate": 8.702064896755163e-06,
"loss": 0.9132,
"step": 266000
},
{
"epoch": 41.37556280080733,
"grad_norm": 4.824013710021973,
"learning_rate": 8.624437199192672e-06,
"loss": 0.9048,
"step": 266500
},
{
"epoch": 41.45319049836982,
"grad_norm": 4.79560661315918,
"learning_rate": 8.546809501630181e-06,
"loss": 0.9142,
"step": 267000
},
{
"epoch": 41.53081819593231,
"grad_norm": 4.503738880157471,
"learning_rate": 8.46918180406769e-06,
"loss": 0.92,
"step": 267500
},
{
"epoch": 41.608445893494796,
"grad_norm": 4.430568218231201,
"learning_rate": 8.391554106505201e-06,
"loss": 0.9258,
"step": 268000
},
{
"epoch": 41.68607359105729,
"grad_norm": 4.630665302276611,
"learning_rate": 8.31392640894271e-06,
"loss": 0.9226,
"step": 268500
},
{
"epoch": 41.76370128861978,
"grad_norm": 4.298410415649414,
"learning_rate": 8.23629871138022e-06,
"loss": 0.9264,
"step": 269000
},
{
"epoch": 41.84132898618227,
"grad_norm": 4.575562000274658,
"learning_rate": 8.15867101381773e-06,
"loss": 0.9194,
"step": 269500
},
{
"epoch": 41.91895668374476,
"grad_norm": 4.254932880401611,
"learning_rate": 8.08104331625524e-06,
"loss": 0.9339,
"step": 270000
},
{
"epoch": 41.99658438130725,
"grad_norm": 4.799808502197266,
"learning_rate": 8.00341561869275e-06,
"loss": 0.9262,
"step": 270500
},
{
"epoch": 42.07421207886974,
"grad_norm": 4.432214260101318,
"learning_rate": 7.925787921130259e-06,
"loss": 0.8875,
"step": 271000
},
{
"epoch": 42.15183977643223,
"grad_norm": 4.276678085327148,
"learning_rate": 7.84816022356777e-06,
"loss": 0.8923,
"step": 271500
},
{
"epoch": 42.229467473994724,
"grad_norm": 5.178389072418213,
"learning_rate": 7.770532526005279e-06,
"loss": 0.8835,
"step": 272000
},
{
"epoch": 42.307095171557215,
"grad_norm": 4.696712017059326,
"learning_rate": 7.692904828442788e-06,
"loss": 0.8872,
"step": 272500
},
{
"epoch": 42.3847228691197,
"grad_norm": 4.507452011108398,
"learning_rate": 7.615277130880298e-06,
"loss": 0.892,
"step": 273000
},
{
"epoch": 42.46235056668219,
"grad_norm": 4.397420883178711,
"learning_rate": 7.537649433317809e-06,
"loss": 0.9004,
"step": 273500
},
{
"epoch": 42.53997826424468,
"grad_norm": 4.42085599899292,
"learning_rate": 7.460021735755318e-06,
"loss": 0.9006,
"step": 274000
},
{
"epoch": 42.61760596180717,
"grad_norm": 4.6971306800842285,
"learning_rate": 7.3823940381928275e-06,
"loss": 0.8923,
"step": 274500
},
{
"epoch": 42.69523365936966,
"grad_norm": 4.580519199371338,
"learning_rate": 7.304766340630337e-06,
"loss": 0.8984,
"step": 275000
},
{
"epoch": 42.772861356932154,
"grad_norm": 4.263189315795898,
"learning_rate": 7.227138643067848e-06,
"loss": 0.9049,
"step": 275500
},
{
"epoch": 42.850489054494645,
"grad_norm": 4.588529586791992,
"learning_rate": 7.149510945505357e-06,
"loss": 0.9078,
"step": 276000
},
{
"epoch": 42.928116752057136,
"grad_norm": 4.9102559089660645,
"learning_rate": 7.071883247942866e-06,
"loss": 0.9073,
"step": 276500
},
{
"epoch": 43.00574444961963,
"grad_norm": 4.7918853759765625,
"learning_rate": 6.994255550380375e-06,
"loss": 0.9072,
"step": 277000
},
{
"epoch": 43.08337214718212,
"grad_norm": 3.824863910675049,
"learning_rate": 6.916627852817886e-06,
"loss": 0.8697,
"step": 277500
},
{
"epoch": 43.1609998447446,
"grad_norm": 4.692780017852783,
"learning_rate": 6.839000155255396e-06,
"loss": 0.8758,
"step": 278000
},
{
"epoch": 43.23862754230709,
"grad_norm": 5.024048805236816,
"learning_rate": 6.761372457692905e-06,
"loss": 0.8725,
"step": 278500
},
{
"epoch": 43.316255239869584,
"grad_norm": 4.9430975914001465,
"learning_rate": 6.683744760130414e-06,
"loss": 0.8739,
"step": 279000
},
{
"epoch": 43.393882937432075,
"grad_norm": 4.70835542678833,
"learning_rate": 6.606117062567925e-06,
"loss": 0.8774,
"step": 279500
},
{
"epoch": 43.471510634994566,
"grad_norm": 4.474407196044922,
"learning_rate": 6.528489365005434e-06,
"loss": 0.8788,
"step": 280000
},
{
"epoch": 43.54913833255706,
"grad_norm": 4.508847713470459,
"learning_rate": 6.4508616674429435e-06,
"loss": 0.8812,
"step": 280500
},
{
"epoch": 43.62676603011955,
"grad_norm": 4.584230422973633,
"learning_rate": 6.373233969880453e-06,
"loss": 0.8787,
"step": 281000
},
{
"epoch": 43.70439372768204,
"grad_norm": 4.892379283905029,
"learning_rate": 6.295606272317964e-06,
"loss": 0.8883,
"step": 281500
},
{
"epoch": 43.78202142524453,
"grad_norm": 4.759417533874512,
"learning_rate": 6.217978574755473e-06,
"loss": 0.885,
"step": 282000
},
{
"epoch": 43.85964912280702,
"grad_norm": 4.658566474914551,
"learning_rate": 6.140350877192982e-06,
"loss": 0.8799,
"step": 282500
},
{
"epoch": 43.937276820369505,
"grad_norm": 4.660683631896973,
"learning_rate": 6.062723179630492e-06,
"loss": 0.8899,
"step": 283000
},
{
"epoch": 44.014904517931996,
"grad_norm": 4.208764553070068,
"learning_rate": 5.985095482068002e-06,
"loss": 0.8801,
"step": 283500
},
{
"epoch": 44.09253221549449,
"grad_norm": 4.277160167694092,
"learning_rate": 5.907467784505512e-06,
"loss": 0.854,
"step": 284000
},
{
"epoch": 44.17015991305698,
"grad_norm": 4.98652982711792,
"learning_rate": 5.829840086943022e-06,
"loss": 0.8548,
"step": 284500
},
{
"epoch": 44.24778761061947,
"grad_norm": 4.677061557769775,
"learning_rate": 5.752212389380531e-06,
"loss": 0.8661,
"step": 285000
},
{
"epoch": 44.32541530818196,
"grad_norm": 4.650174617767334,
"learning_rate": 5.674584691818041e-06,
"loss": 0.8626,
"step": 285500
},
{
"epoch": 44.40304300574445,
"grad_norm": 4.145635604858398,
"learning_rate": 5.59695699425555e-06,
"loss": 0.8635,
"step": 286000
},
{
"epoch": 44.48067070330694,
"grad_norm": 4.334202766418457,
"learning_rate": 5.51932929669306e-06,
"loss": 0.8633,
"step": 286500
},
{
"epoch": 44.55829840086943,
"grad_norm": 4.45126485824585,
"learning_rate": 5.44170159913057e-06,
"loss": 0.863,
"step": 287000
},
{
"epoch": 44.63592609843192,
"grad_norm": 4.916016578674316,
"learning_rate": 5.36407390156808e-06,
"loss": 0.8687,
"step": 287500
},
{
"epoch": 44.71355379599441,
"grad_norm": 4.656139373779297,
"learning_rate": 5.286446204005589e-06,
"loss": 0.8665,
"step": 288000
},
{
"epoch": 44.7911814935569,
"grad_norm": 4.845007419586182,
"learning_rate": 5.208818506443099e-06,
"loss": 0.8681,
"step": 288500
},
{
"epoch": 44.86880919111939,
"grad_norm": 4.315593242645264,
"learning_rate": 5.131190808880608e-06,
"loss": 0.863,
"step": 289000
},
{
"epoch": 44.94643688868188,
"grad_norm": 4.265692710876465,
"learning_rate": 5.053563111318118e-06,
"loss": 0.8605,
"step": 289500
},
{
"epoch": 45.02406458624437,
"grad_norm": 4.859785079956055,
"learning_rate": 4.975935413755628e-06,
"loss": 0.8637,
"step": 290000
},
{
"epoch": 45.10169228380686,
"grad_norm": 4.233875751495361,
"learning_rate": 4.898307716193138e-06,
"loss": 0.8408,
"step": 290500
},
{
"epoch": 45.17931998136935,
"grad_norm": 4.796300411224365,
"learning_rate": 4.820680018630647e-06,
"loss": 0.85,
"step": 291000
},
{
"epoch": 45.256947678931844,
"grad_norm": 4.32379150390625,
"learning_rate": 4.743052321068157e-06,
"loss": 0.8455,
"step": 291500
},
{
"epoch": 45.334575376494335,
"grad_norm": 4.826063632965088,
"learning_rate": 4.665424623505667e-06,
"loss": 0.853,
"step": 292000
},
{
"epoch": 45.412203074056826,
"grad_norm": 4.197807312011719,
"learning_rate": 4.587796925943176e-06,
"loss": 0.8563,
"step": 292500
},
{
"epoch": 45.48983077161931,
"grad_norm": 4.949887275695801,
"learning_rate": 4.5101692283806865e-06,
"loss": 0.8478,
"step": 293000
},
{
"epoch": 45.5674584691818,
"grad_norm": 4.073297023773193,
"learning_rate": 4.432541530818196e-06,
"loss": 0.8502,
"step": 293500
},
{
"epoch": 45.64508616674429,
"grad_norm": 4.890108108520508,
"learning_rate": 4.354913833255706e-06,
"loss": 0.8482,
"step": 294000
},
{
"epoch": 45.72271386430678,
"grad_norm": 4.2948079109191895,
"learning_rate": 4.277286135693216e-06,
"loss": 0.847,
"step": 294500
},
{
"epoch": 45.800341561869274,
"grad_norm": 4.1356425285339355,
"learning_rate": 4.199658438130725e-06,
"loss": 0.8543,
"step": 295000
},
{
"epoch": 45.877969259431765,
"grad_norm": 4.8358001708984375,
"learning_rate": 4.122030740568235e-06,
"loss": 0.8519,
"step": 295500
},
{
"epoch": 45.955596956994256,
"grad_norm": 4.316599369049072,
"learning_rate": 4.0444030430057445e-06,
"loss": 0.8518,
"step": 296000
},
{
"epoch": 46.03322465455675,
"grad_norm": 5.166982173919678,
"learning_rate": 3.9667753454432546e-06,
"loss": 0.837,
"step": 296500
},
{
"epoch": 46.11085235211924,
"grad_norm": 5.095579624176025,
"learning_rate": 3.889147647880765e-06,
"loss": 0.8304,
"step": 297000
},
{
"epoch": 46.18848004968173,
"grad_norm": 4.376230716705322,
"learning_rate": 3.8115199503182735e-06,
"loss": 0.8317,
"step": 297500
},
{
"epoch": 46.26610774724422,
"grad_norm": 4.394167900085449,
"learning_rate": 3.7338922527557836e-06,
"loss": 0.8334,
"step": 298000
},
{
"epoch": 46.343735444806704,
"grad_norm": 4.203426361083984,
"learning_rate": 3.656264555193293e-06,
"loss": 0.8282,
"step": 298500
},
{
"epoch": 46.421363142369195,
"grad_norm": 4.700695991516113,
"learning_rate": 3.578636857630803e-06,
"loss": 0.8387,
"step": 299000
},
{
"epoch": 46.498990839931686,
"grad_norm": 4.512545585632324,
"learning_rate": 3.501009160068312e-06,
"loss": 0.8371,
"step": 299500
},
{
"epoch": 46.57661853749418,
"grad_norm": 4.69306755065918,
"learning_rate": 3.4233814625058222e-06,
"loss": 0.8328,
"step": 300000
},
{
"epoch": 46.65424623505667,
"grad_norm": 4.748707294464111,
"learning_rate": 3.3457537649433315e-06,
"loss": 0.8387,
"step": 300500
},
{
"epoch": 46.73187393261916,
"grad_norm": 4.850402355194092,
"learning_rate": 3.2681260673808416e-06,
"loss": 0.8433,
"step": 301000
},
{
"epoch": 46.80950163018165,
"grad_norm": 4.6922197341918945,
"learning_rate": 3.1904983698183512e-06,
"loss": 0.8437,
"step": 301500
},
{
"epoch": 46.88712932774414,
"grad_norm": 4.400567054748535,
"learning_rate": 3.112870672255861e-06,
"loss": 0.8395,
"step": 302000
},
{
"epoch": 46.96475702530663,
"grad_norm": 4.891355037689209,
"learning_rate": 3.0352429746933706e-06,
"loss": 0.8376,
"step": 302500
},
{
"epoch": 47.04238472286912,
"grad_norm": 4.655758857727051,
"learning_rate": 2.9576152771308803e-06,
"loss": 0.8284,
"step": 303000
},
{
"epoch": 47.12001242043161,
"grad_norm": 4.718132972717285,
"learning_rate": 2.87998757956839e-06,
"loss": 0.8187,
"step": 303500
},
{
"epoch": 47.1976401179941,
"grad_norm": 4.415502071380615,
"learning_rate": 2.8023598820059e-06,
"loss": 0.8213,
"step": 304000
},
{
"epoch": 47.27526781555659,
"grad_norm": 5.419862270355225,
"learning_rate": 2.7247321844434097e-06,
"loss": 0.8256,
"step": 304500
},
{
"epoch": 47.35289551311908,
"grad_norm": 4.600099563598633,
"learning_rate": 2.6471044868809193e-06,
"loss": 0.8259,
"step": 305000
},
{
"epoch": 47.43052321068157,
"grad_norm": 5.056214332580566,
"learning_rate": 2.569476789318429e-06,
"loss": 0.8232,
"step": 305500
},
{
"epoch": 47.50815090824406,
"grad_norm": 4.458391189575195,
"learning_rate": 2.4918490917559387e-06,
"loss": 0.8297,
"step": 306000
},
{
"epoch": 47.58577860580655,
"grad_norm": 4.724514961242676,
"learning_rate": 2.4142213941934484e-06,
"loss": 0.8257,
"step": 306500
},
{
"epoch": 47.663406303369044,
"grad_norm": 4.462941646575928,
"learning_rate": 2.336593696630958e-06,
"loss": 0.8265,
"step": 307000
},
{
"epoch": 47.741034000931535,
"grad_norm": 4.594760417938232,
"learning_rate": 2.2589659990684677e-06,
"loss": 0.8285,
"step": 307500
},
{
"epoch": 47.818661698494026,
"grad_norm": 4.6404032707214355,
"learning_rate": 2.1813383015059778e-06,
"loss": 0.8261,
"step": 308000
},
{
"epoch": 47.89628939605651,
"grad_norm": 3.944291830062866,
"learning_rate": 2.1037106039434874e-06,
"loss": 0.834,
"step": 308500
},
{
"epoch": 47.973917093619,
"grad_norm": 4.836678504943848,
"learning_rate": 2.026082906380997e-06,
"loss": 0.827,
"step": 309000
},
{
"epoch": 48.05154479118149,
"grad_norm": 4.680452823638916,
"learning_rate": 1.9484552088185068e-06,
"loss": 0.8142,
"step": 309500
},
{
"epoch": 48.12917248874398,
"grad_norm": 5.229122161865234,
"learning_rate": 1.8708275112560162e-06,
"loss": 0.8151,
"step": 310000
},
{
"epoch": 48.206800186306474,
"grad_norm": 4.585724353790283,
"learning_rate": 1.7931998136935261e-06,
"loss": 0.8188,
"step": 310500
},
{
"epoch": 48.284427883868965,
"grad_norm": 4.325538158416748,
"learning_rate": 1.7155721161310358e-06,
"loss": 0.8115,
"step": 311000
},
{
"epoch": 48.362055581431456,
"grad_norm": 4.884690761566162,
"learning_rate": 1.6379444185685455e-06,
"loss": 0.8105,
"step": 311500
},
{
"epoch": 48.43968327899395,
"grad_norm": 4.815389633178711,
"learning_rate": 1.5603167210060551e-06,
"loss": 0.814,
"step": 312000
},
{
"epoch": 48.51731097655644,
"grad_norm": 4.258877277374268,
"learning_rate": 1.4826890234435648e-06,
"loss": 0.814,
"step": 312500
},
{
"epoch": 48.59493867411893,
"grad_norm": 4.596804618835449,
"learning_rate": 1.4050613258810745e-06,
"loss": 0.8168,
"step": 313000
},
{
"epoch": 48.67256637168141,
"grad_norm": 4.754199504852295,
"learning_rate": 1.3274336283185841e-06,
"loss": 0.8205,
"step": 313500
},
{
"epoch": 48.7501940692439,
"grad_norm": 4.652686595916748,
"learning_rate": 1.2498059307560938e-06,
"loss": 0.818,
"step": 314000
},
{
"epoch": 48.827821766806395,
"grad_norm": 4.778179168701172,
"learning_rate": 1.1721782331936035e-06,
"loss": 0.8215,
"step": 314500
},
{
"epoch": 48.905449464368886,
"grad_norm": 4.835714817047119,
"learning_rate": 1.0945505356311131e-06,
"loss": 0.8184,
"step": 315000
},
{
"epoch": 48.98307716193138,
"grad_norm": 4.331784725189209,
"learning_rate": 1.016922838068623e-06,
"loss": 0.8149,
"step": 315500
},
{
"epoch": 49.06070485949387,
"grad_norm": 4.657207012176514,
"learning_rate": 9.392951405061327e-07,
"loss": 0.8137,
"step": 316000
},
{
"epoch": 49.13833255705636,
"grad_norm": 4.450284481048584,
"learning_rate": 8.616674429436423e-07,
"loss": 0.8115,
"step": 316500
},
{
"epoch": 49.21596025461885,
"grad_norm": 3.921935558319092,
"learning_rate": 7.84039745381152e-07,
"loss": 0.8102,
"step": 317000
},
{
"epoch": 49.29358795218134,
"grad_norm": 4.742419719696045,
"learning_rate": 7.064120478186618e-07,
"loss": 0.8069,
"step": 317500
},
{
"epoch": 49.37121564974383,
"grad_norm": 4.7592387199401855,
"learning_rate": 6.287843502561715e-07,
"loss": 0.8111,
"step": 318000
},
{
"epoch": 49.44884334730632,
"grad_norm": 4.364270210266113,
"learning_rate": 5.511566526936811e-07,
"loss": 0.8044,
"step": 318500
},
{
"epoch": 49.526471044868806,
"grad_norm": 4.5575337409973145,
"learning_rate": 4.735289551311908e-07,
"loss": 0.8007,
"step": 319000
},
{
"epoch": 49.6040987424313,
"grad_norm": 4.399910926818848,
"learning_rate": 3.9590125756870057e-07,
"loss": 0.8097,
"step": 319500
},
{
"epoch": 49.68172643999379,
"grad_norm": 4.863783836364746,
"learning_rate": 3.1827356000621023e-07,
"loss": 0.8093,
"step": 320000
},
{
"epoch": 49.75935413755628,
"grad_norm": 4.700865745544434,
"learning_rate": 2.4064586244371996e-07,
"loss": 0.812,
"step": 320500
},
{
"epoch": 49.83698183511877,
"grad_norm": 4.929879188537598,
"learning_rate": 1.6301816488122962e-07,
"loss": 0.8121,
"step": 321000
},
{
"epoch": 49.91460953268126,
"grad_norm": 4.459561347961426,
"learning_rate": 8.539046731873933e-08,
"loss": 0.8108,
"step": 321500
},
{
"epoch": 49.99223723024375,
"grad_norm": 4.53715181350708,
"learning_rate": 7.76276975624903e-09,
"loss": 0.8126,
"step": 322000
},
{
"epoch": 50.0,
"step": 322050,
"total_flos": 9.94521893679661e+17,
"train_loss": 1.8893472661618176,
"train_runtime": 93675.3384,
"train_samples_per_second": 110.014,
"train_steps_per_second": 3.438
}
],
"logging_steps": 500,
"max_steps": 322050,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.94521893679661e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}