inter-play-sim-user-sft / trainer_state.json
jeromeramos's picture
Model save
906f965 verified
raw
history blame contribute delete
51.4 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 1446,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0013831258644536654,
"grad_norm": 133.08836364746094,
"learning_rate": 0.0,
"loss": 1.6747,
"step": 1
},
{
"epoch": 0.006915629322268326,
"grad_norm": 10.979754447937012,
"learning_rate": 5.517241379310345e-06,
"loss": 1.5287,
"step": 5
},
{
"epoch": 0.013831258644536652,
"grad_norm": 5.598673343658447,
"learning_rate": 1.2413793103448277e-05,
"loss": 1.1371,
"step": 10
},
{
"epoch": 0.02074688796680498,
"grad_norm": 4.170475959777832,
"learning_rate": 1.9310344827586207e-05,
"loss": 1.0279,
"step": 15
},
{
"epoch": 0.027662517289073305,
"grad_norm": 3.909233331680298,
"learning_rate": 2.620689655172414e-05,
"loss": 0.9947,
"step": 20
},
{
"epoch": 0.034578146611341634,
"grad_norm": 3.94997501373291,
"learning_rate": 3.310344827586207e-05,
"loss": 0.9337,
"step": 25
},
{
"epoch": 0.04149377593360996,
"grad_norm": 3.896550178527832,
"learning_rate": 4e-05,
"loss": 0.8761,
"step": 30
},
{
"epoch": 0.048409405255878286,
"grad_norm": 3.581073522567749,
"learning_rate": 4.689655172413793e-05,
"loss": 0.8054,
"step": 35
},
{
"epoch": 0.05532503457814661,
"grad_norm": 2.294735908508301,
"learning_rate": 5.379310344827586e-05,
"loss": 0.7511,
"step": 40
},
{
"epoch": 0.06224066390041494,
"grad_norm": 1.9514353275299072,
"learning_rate": 6.068965517241379e-05,
"loss": 0.7493,
"step": 45
},
{
"epoch": 0.06915629322268327,
"grad_norm": 2.1982648372650146,
"learning_rate": 6.758620689655173e-05,
"loss": 0.8087,
"step": 50
},
{
"epoch": 0.07607192254495158,
"grad_norm": 2.2902512550354004,
"learning_rate": 7.448275862068966e-05,
"loss": 0.7906,
"step": 55
},
{
"epoch": 0.08298755186721991,
"grad_norm": 1.9472178220748901,
"learning_rate": 8.137931034482759e-05,
"loss": 0.7872,
"step": 60
},
{
"epoch": 0.08990318118948824,
"grad_norm": 1.7574316263198853,
"learning_rate": 8.827586206896552e-05,
"loss": 0.7898,
"step": 65
},
{
"epoch": 0.09681881051175657,
"grad_norm": 1.8785020112991333,
"learning_rate": 9.517241379310345e-05,
"loss": 0.8166,
"step": 70
},
{
"epoch": 0.1037344398340249,
"grad_norm": 1.5703641176223755,
"learning_rate": 0.0001020689655172414,
"loss": 0.7987,
"step": 75
},
{
"epoch": 0.11065006915629322,
"grad_norm": 1.4895148277282715,
"learning_rate": 0.00010896551724137931,
"loss": 0.8054,
"step": 80
},
{
"epoch": 0.11756569847856155,
"grad_norm": 1.7236794233322144,
"learning_rate": 0.00011586206896551725,
"loss": 0.8111,
"step": 85
},
{
"epoch": 0.12448132780082988,
"grad_norm": 1.788547396659851,
"learning_rate": 0.00012275862068965518,
"loss": 0.8067,
"step": 90
},
{
"epoch": 0.1313969571230982,
"grad_norm": 1.6218682527542114,
"learning_rate": 0.0001296551724137931,
"loss": 0.8175,
"step": 95
},
{
"epoch": 0.13831258644536654,
"grad_norm": 1.438413381576538,
"learning_rate": 0.00013655172413793104,
"loss": 0.8485,
"step": 100
},
{
"epoch": 0.14522821576763487,
"grad_norm": 1.6937086582183838,
"learning_rate": 0.00014344827586206896,
"loss": 0.8562,
"step": 105
},
{
"epoch": 0.15214384508990317,
"grad_norm": 1.4097000360488892,
"learning_rate": 0.0001503448275862069,
"loss": 0.851,
"step": 110
},
{
"epoch": 0.1590594744121715,
"grad_norm": 2.706479072570801,
"learning_rate": 0.00015724137931034485,
"loss": 0.8461,
"step": 115
},
{
"epoch": 0.16597510373443983,
"grad_norm": 2.0034353733062744,
"learning_rate": 0.00016413793103448276,
"loss": 0.8548,
"step": 120
},
{
"epoch": 0.17289073305670816,
"grad_norm": 1.3242019414901733,
"learning_rate": 0.0001710344827586207,
"loss": 0.8553,
"step": 125
},
{
"epoch": 0.1798063623789765,
"grad_norm": 2.6091411113739014,
"learning_rate": 0.00017793103448275862,
"loss": 0.848,
"step": 130
},
{
"epoch": 0.18672199170124482,
"grad_norm": 1.3679989576339722,
"learning_rate": 0.00018482758620689654,
"loss": 0.8716,
"step": 135
},
{
"epoch": 0.19363762102351315,
"grad_norm": 1.323454737663269,
"learning_rate": 0.0001917241379310345,
"loss": 0.8488,
"step": 140
},
{
"epoch": 0.20055325034578148,
"grad_norm": 2.7075045108795166,
"learning_rate": 0.00019862068965517243,
"loss": 0.9631,
"step": 145
},
{
"epoch": 0.2074688796680498,
"grad_norm": 2.2683520317077637,
"learning_rate": 0.0001999953352135947,
"loss": 0.9498,
"step": 150
},
{
"epoch": 0.2143845089903181,
"grad_norm": 8.38696002960205,
"learning_rate": 0.0001999763852647035,
"loss": 0.891,
"step": 155
},
{
"epoch": 0.22130013831258644,
"grad_norm": 24.307083129882812,
"learning_rate": 0.00019994286136445976,
"loss": 0.9308,
"step": 160
},
{
"epoch": 0.22821576763485477,
"grad_norm": 2.0212178230285645,
"learning_rate": 0.0001998947683997744,
"loss": 0.8901,
"step": 165
},
{
"epoch": 0.2351313969571231,
"grad_norm": 1.2989600896835327,
"learning_rate": 0.00019983211338134828,
"loss": 0.8546,
"step": 170
},
{
"epoch": 0.24204702627939143,
"grad_norm": 1.2097629308700562,
"learning_rate": 0.00019975490544265012,
"loss": 0.8543,
"step": 175
},
{
"epoch": 0.24896265560165975,
"grad_norm": 30.946725845336914,
"learning_rate": 0.00019966315583858516,
"loss": 1.6915,
"step": 180
},
{
"epoch": 0.25587828492392806,
"grad_norm": 2.118316173553467,
"learning_rate": 0.0001995568779438545,
"loss": 1.0665,
"step": 185
},
{
"epoch": 0.2627939142461964,
"grad_norm": 3.086249589920044,
"learning_rate": 0.00019943608725100532,
"loss": 0.9988,
"step": 190
},
{
"epoch": 0.2697095435684647,
"grad_norm": 1.1766245365142822,
"learning_rate": 0.00019930080136817255,
"loss": 0.9133,
"step": 195
},
{
"epoch": 0.2766251728907331,
"grad_norm": 1.33064603805542,
"learning_rate": 0.00019915104001651203,
"loss": 0.8809,
"step": 200
},
{
"epoch": 0.2835408022130014,
"grad_norm": 1.6439151763916016,
"learning_rate": 0.00019898682502732568,
"loss": 0.8967,
"step": 205
},
{
"epoch": 0.29045643153526973,
"grad_norm": 1.3271840810775757,
"learning_rate": 0.00019880818033887916,
"loss": 0.8866,
"step": 210
},
{
"epoch": 0.29737206085753803,
"grad_norm": 1.1745598316192627,
"learning_rate": 0.0001986151319929121,
"loss": 0.8598,
"step": 215
},
{
"epoch": 0.30428769017980634,
"grad_norm": 1.0390416383743286,
"learning_rate": 0.00019840770813084205,
"loss": 0.8434,
"step": 220
},
{
"epoch": 0.3112033195020747,
"grad_norm": 1.0289931297302246,
"learning_rate": 0.00019818593898966212,
"loss": 0.8765,
"step": 225
},
{
"epoch": 0.318118948824343,
"grad_norm": 1.1883145570755005,
"learning_rate": 0.00019794985689753337,
"loss": 0.8859,
"step": 230
},
{
"epoch": 0.32503457814661135,
"grad_norm": 1.2715511322021484,
"learning_rate": 0.00019769949626907186,
"loss": 0.8626,
"step": 235
},
{
"epoch": 0.33195020746887965,
"grad_norm": 1.287985920906067,
"learning_rate": 0.00019743489360033231,
"loss": 0.8805,
"step": 240
},
{
"epoch": 0.338865836791148,
"grad_norm": 1.1524231433868408,
"learning_rate": 0.00019715608746348763,
"loss": 0.8588,
"step": 245
},
{
"epoch": 0.3457814661134163,
"grad_norm": 1.0479854345321655,
"learning_rate": 0.00019686311850120625,
"loss": 0.858,
"step": 250
},
{
"epoch": 0.35269709543568467,
"grad_norm": 0.9751342535018921,
"learning_rate": 0.0001965560294207274,
"loss": 0.8558,
"step": 255
},
{
"epoch": 0.359612724757953,
"grad_norm": 0.9835847020149231,
"learning_rate": 0.00019623486498763555,
"loss": 0.8755,
"step": 260
},
{
"epoch": 0.3665283540802213,
"grad_norm": 3.7135465145111084,
"learning_rate": 0.00019589967201933471,
"loss": 0.8584,
"step": 265
},
{
"epoch": 0.37344398340248963,
"grad_norm": 1.0380736589431763,
"learning_rate": 0.00019555049937822384,
"loss": 0.8544,
"step": 270
},
{
"epoch": 0.38035961272475793,
"grad_norm": 1.0324746370315552,
"learning_rate": 0.00019518739796457366,
"loss": 0.8673,
"step": 275
},
{
"epoch": 0.3872752420470263,
"grad_norm": 1.0108332633972168,
"learning_rate": 0.00019481042070910705,
"loss": 0.8443,
"step": 280
},
{
"epoch": 0.3941908713692946,
"grad_norm": 0.8845415711402893,
"learning_rate": 0.00019441962256528292,
"loss": 0.857,
"step": 285
},
{
"epoch": 0.40110650069156295,
"grad_norm": 0.8270080089569092,
"learning_rate": 0.00019401506050128556,
"loss": 0.8583,
"step": 290
},
{
"epoch": 0.40802213001383125,
"grad_norm": 0.8553124070167542,
"learning_rate": 0.00019359679349172004,
"loss": 0.8291,
"step": 295
},
{
"epoch": 0.4149377593360996,
"grad_norm": 0.9521012306213379,
"learning_rate": 0.00019316488250901534,
"loss": 0.8486,
"step": 300
},
{
"epoch": 0.4218533886583679,
"grad_norm": 0.8840038776397705,
"learning_rate": 0.00019271939051453612,
"loss": 0.8258,
"step": 305
},
{
"epoch": 0.4287690179806362,
"grad_norm": 1.223691463470459,
"learning_rate": 0.00019226038244940464,
"loss": 0.8142,
"step": 310
},
{
"epoch": 0.43568464730290457,
"grad_norm": 0.8988921642303467,
"learning_rate": 0.00019178792522503394,
"loss": 0.8611,
"step": 315
},
{
"epoch": 0.4426002766251729,
"grad_norm": 0.813004732131958,
"learning_rate": 0.000191302087713374,
"loss": 0.8208,
"step": 320
},
{
"epoch": 0.44951590594744123,
"grad_norm": 0.8524804711341858,
"learning_rate": 0.00019080294073687193,
"loss": 0.8393,
"step": 325
},
{
"epoch": 0.45643153526970953,
"grad_norm": 0.9394051432609558,
"learning_rate": 0.000190290557058148,
"loss": 0.8567,
"step": 330
},
{
"epoch": 0.4633471645919779,
"grad_norm": 0.9042031168937683,
"learning_rate": 0.00018976501136938864,
"loss": 0.8387,
"step": 335
},
{
"epoch": 0.4702627939142462,
"grad_norm": 0.7205588221549988,
"learning_rate": 0.00018922638028145828,
"loss": 0.8231,
"step": 340
},
{
"epoch": 0.47717842323651455,
"grad_norm": 0.798876941204071,
"learning_rate": 0.0001886747423127316,
"loss": 0.8254,
"step": 345
},
{
"epoch": 0.48409405255878285,
"grad_norm": 0.7704493403434753,
"learning_rate": 0.00018811017787764747,
"loss": 0.8244,
"step": 350
},
{
"epoch": 0.49100968188105115,
"grad_norm": 0.8552532196044922,
"learning_rate": 0.00018753276927498659,
"loss": 0.8347,
"step": 355
},
{
"epoch": 0.4979253112033195,
"grad_norm": 0.7718790173530579,
"learning_rate": 0.00018694260067587463,
"loss": 0.7962,
"step": 360
},
{
"epoch": 0.5048409405255878,
"grad_norm": 0.888421356678009,
"learning_rate": 0.00018633975811151223,
"loss": 0.8284,
"step": 365
},
{
"epoch": 0.5117565698478561,
"grad_norm": 0.8525059819221497,
"learning_rate": 0.00018572432946063367,
"loss": 0.8241,
"step": 370
},
{
"epoch": 0.5186721991701245,
"grad_norm": 0.6882660388946533,
"learning_rate": 0.00018509640443669682,
"loss": 0.8001,
"step": 375
},
{
"epoch": 0.5255878284923928,
"grad_norm": 0.7356276512145996,
"learning_rate": 0.00018445607457480493,
"loss": 0.8177,
"step": 380
},
{
"epoch": 0.5325034578146611,
"grad_norm": 0.7074828147888184,
"learning_rate": 0.0001838034332183634,
"loss": 0.837,
"step": 385
},
{
"epoch": 0.5394190871369294,
"grad_norm": 0.7650839686393738,
"learning_rate": 0.0001831385755054726,
"loss": 0.8219,
"step": 390
},
{
"epoch": 0.5463347164591977,
"grad_norm": 0.6335296630859375,
"learning_rate": 0.00018246159835505932,
"loss": 0.8146,
"step": 395
},
{
"epoch": 0.5532503457814661,
"grad_norm": 0.8082166314125061,
"learning_rate": 0.0001817726004527485,
"loss": 0.8083,
"step": 400
},
{
"epoch": 0.5601659751037344,
"grad_norm": 0.6119678616523743,
"learning_rate": 0.0001810716822364774,
"loss": 0.7853,
"step": 405
},
{
"epoch": 0.5670816044260027,
"grad_norm": 0.7896009683609009,
"learning_rate": 0.00018035894588185438,
"loss": 0.7868,
"step": 410
},
{
"epoch": 0.573997233748271,
"grad_norm": 0.7086299657821655,
"learning_rate": 0.0001796344952872643,
"loss": 0.8234,
"step": 415
},
{
"epoch": 0.5809128630705395,
"grad_norm": 0.689249575138092,
"learning_rate": 0.00017889843605872305,
"loss": 0.7917,
"step": 420
},
{
"epoch": 0.5878284923928078,
"grad_norm": 0.6687580943107605,
"learning_rate": 0.0001781508754944827,
"loss": 0.7956,
"step": 425
},
{
"epoch": 0.5947441217150761,
"grad_norm": 0.7054949998855591,
"learning_rate": 0.0001773919225693903,
"loss": 0.7979,
"step": 430
},
{
"epoch": 0.6016597510373444,
"grad_norm": 0.8244301676750183,
"learning_rate": 0.00017662168791900232,
"loss": 0.7949,
"step": 435
},
{
"epoch": 0.6085753803596127,
"grad_norm": 0.7373748421669006,
"learning_rate": 0.00017584028382345654,
"loss": 0.7742,
"step": 440
},
{
"epoch": 0.6154910096818811,
"grad_norm": 0.716790497303009,
"learning_rate": 0.00017504782419110497,
"loss": 0.8082,
"step": 445
},
{
"epoch": 0.6224066390041494,
"grad_norm": 0.7063632011413574,
"learning_rate": 0.00017424442454190862,
"loss": 0.7859,
"step": 450
},
{
"epoch": 0.6293222683264177,
"grad_norm": 0.8756738901138306,
"learning_rate": 0.00017343020199059783,
"loss": 0.791,
"step": 455
},
{
"epoch": 0.636237897648686,
"grad_norm": 0.667121946811676,
"learning_rate": 0.0001726052752296001,
"loss": 0.8044,
"step": 460
},
{
"epoch": 0.6431535269709544,
"grad_norm": 0.7018240690231323,
"learning_rate": 0.00017176976451173758,
"loss": 0.7829,
"step": 465
},
{
"epoch": 0.6500691562932227,
"grad_norm": 0.6935915350914001,
"learning_rate": 0.00017092379163269764,
"loss": 0.7975,
"step": 470
},
{
"epoch": 0.656984785615491,
"grad_norm": 0.6710845232009888,
"learning_rate": 0.00017006747991327796,
"loss": 0.7777,
"step": 475
},
{
"epoch": 0.6639004149377593,
"grad_norm": 0.7099134922027588,
"learning_rate": 0.00016920095418140977,
"loss": 0.7755,
"step": 480
},
{
"epoch": 0.6708160442600276,
"grad_norm": 0.652553915977478,
"learning_rate": 0.00016832434075396101,
"loss": 0.7802,
"step": 485
},
{
"epoch": 0.677731673582296,
"grad_norm": 0.6347463130950928,
"learning_rate": 0.00016743776741832292,
"loss": 0.7814,
"step": 490
},
{
"epoch": 0.6846473029045643,
"grad_norm": 0.5948991179466248,
"learning_rate": 0.00016654136341378157,
"loss": 0.7704,
"step": 495
},
{
"epoch": 0.6915629322268326,
"grad_norm": 0.6586953401565552,
"learning_rate": 0.00016563525941267845,
"loss": 0.7781,
"step": 500
},
{
"epoch": 0.6984785615491009,
"grad_norm": 0.6398725509643555,
"learning_rate": 0.00016471958750136176,
"loss": 0.7707,
"step": 505
},
{
"epoch": 0.7053941908713693,
"grad_norm": 0.6144907474517822,
"learning_rate": 0.00016379448116093156,
"loss": 0.7714,
"step": 510
},
{
"epoch": 0.7123098201936376,
"grad_norm": 0.6330462694168091,
"learning_rate": 0.00016286007524778185,
"loss": 0.7653,
"step": 515
},
{
"epoch": 0.719225449515906,
"grad_norm": 0.5762836933135986,
"learning_rate": 0.00016191650597394198,
"loss": 0.7715,
"step": 520
},
{
"epoch": 0.7261410788381742,
"grad_norm": 0.578916609287262,
"learning_rate": 0.00016096391088722047,
"loss": 0.785,
"step": 525
},
{
"epoch": 0.7330567081604425,
"grad_norm": 0.5828131437301636,
"learning_rate": 0.0001600024288511541,
"loss": 0.7424,
"step": 530
},
{
"epoch": 0.739972337482711,
"grad_norm": 0.5606003999710083,
"learning_rate": 0.00015903220002476515,
"loss": 0.7782,
"step": 535
},
{
"epoch": 0.7468879668049793,
"grad_norm": 0.5754717588424683,
"learning_rate": 0.0001580533658421302,
"loss": 0.7865,
"step": 540
},
{
"epoch": 0.7538035961272476,
"grad_norm": 0.680016279220581,
"learning_rate": 0.0001570660689917623,
"loss": 0.7637,
"step": 545
},
{
"epoch": 0.7607192254495159,
"grad_norm": 0.6306326985359192,
"learning_rate": 0.00015607045339581096,
"loss": 0.7528,
"step": 550
},
{
"epoch": 0.7676348547717843,
"grad_norm": 0.5506445169448853,
"learning_rate": 0.00015506666418908203,
"loss": 0.767,
"step": 555
},
{
"epoch": 0.7745504840940526,
"grad_norm": 0.6151379346847534,
"learning_rate": 0.00015405484769788073,
"loss": 0.7511,
"step": 560
},
{
"epoch": 0.7814661134163209,
"grad_norm": 0.6520763635635376,
"learning_rate": 0.00015303515141868116,
"loss": 0.7529,
"step": 565
},
{
"epoch": 0.7883817427385892,
"grad_norm": 0.5401438474655151,
"learning_rate": 0.00015200772399662514,
"loss": 0.7754,
"step": 570
},
{
"epoch": 0.7952973720608575,
"grad_norm": 0.6025466322898865,
"learning_rate": 0.00015097271520385366,
"loss": 0.7577,
"step": 575
},
{
"epoch": 0.8022130013831259,
"grad_norm": 0.6601846218109131,
"learning_rate": 0.00014993027591767396,
"loss": 0.7406,
"step": 580
},
{
"epoch": 0.8091286307053942,
"grad_norm": 0.6068835854530334,
"learning_rate": 0.0001488805580985655,
"loss": 0.764,
"step": 585
},
{
"epoch": 0.8160442600276625,
"grad_norm": 0.586649477481842,
"learning_rate": 0.00014782371476802824,
"loss": 0.7547,
"step": 590
},
{
"epoch": 0.8229598893499308,
"grad_norm": 0.4942811131477356,
"learning_rate": 0.00014675989998627598,
"loss": 0.7539,
"step": 595
},
{
"epoch": 0.8298755186721992,
"grad_norm": 0.5191232562065125,
"learning_rate": 0.00014568926882977832,
"loss": 0.75,
"step": 600
},
{
"epoch": 0.8367911479944675,
"grad_norm": 0.5124319195747375,
"learning_rate": 0.00014461197736865481,
"loss": 0.7207,
"step": 605
},
{
"epoch": 0.8437067773167358,
"grad_norm": 0.5561709403991699,
"learning_rate": 0.00014352818264392364,
"loss": 0.7504,
"step": 610
},
{
"epoch": 0.8506224066390041,
"grad_norm": 0.5596902370452881,
"learning_rate": 0.00014243804264460957,
"loss": 0.7634,
"step": 615
},
{
"epoch": 0.8575380359612724,
"grad_norm": 0.5537763237953186,
"learning_rate": 0.00014134171628471276,
"loss": 0.7293,
"step": 620
},
{
"epoch": 0.8644536652835408,
"grad_norm": 0.6073982119560242,
"learning_rate": 0.00014023936338004373,
"loss": 0.7358,
"step": 625
},
{
"epoch": 0.8713692946058091,
"grad_norm": 0.5265364050865173,
"learning_rate": 0.00013913114462492601,
"loss": 0.7415,
"step": 630
},
{
"epoch": 0.8782849239280774,
"grad_norm": 0.5465463399887085,
"learning_rate": 0.00013801722156877143,
"loss": 0.7351,
"step": 635
},
{
"epoch": 0.8852005532503457,
"grad_norm": 0.5381340384483337,
"learning_rate": 0.00013689775659253006,
"loss": 0.7403,
"step": 640
},
{
"epoch": 0.8921161825726142,
"grad_norm": 0.5682520270347595,
"learning_rate": 0.00013577291288501952,
"loss": 0.7299,
"step": 645
},
{
"epoch": 0.8990318118948825,
"grad_norm": 0.538758397102356,
"learning_rate": 0.00013464285441913636,
"loss": 0.7501,
"step": 650
},
{
"epoch": 0.9059474412171508,
"grad_norm": 0.6069577932357788,
"learning_rate": 0.00013350774592795292,
"loss": 0.7373,
"step": 655
},
{
"epoch": 0.9128630705394191,
"grad_norm": 0.5356409549713135,
"learning_rate": 0.0001323677528807036,
"loss": 0.7343,
"step": 660
},
{
"epoch": 0.9197786998616874,
"grad_norm": 0.5176509618759155,
"learning_rate": 0.00013122304145866381,
"loss": 0.7298,
"step": 665
},
{
"epoch": 0.9266943291839558,
"grad_norm": 0.6070849895477295,
"learning_rate": 0.00013007377853092503,
"loss": 0.7352,
"step": 670
},
{
"epoch": 0.9336099585062241,
"grad_norm": 0.5752330422401428,
"learning_rate": 0.00012892013163006962,
"loss": 0.7323,
"step": 675
},
{
"epoch": 0.9405255878284924,
"grad_norm": 0.550092339515686,
"learning_rate": 0.00012776226892774903,
"loss": 0.7437,
"step": 680
},
{
"epoch": 0.9474412171507607,
"grad_norm": 0.5363165736198425,
"learning_rate": 0.00012660035921016854,
"loss": 0.7199,
"step": 685
},
{
"epoch": 0.9543568464730291,
"grad_norm": 0.5486807823181152,
"learning_rate": 0.00012543457185348298,
"loss": 0.7159,
"step": 690
},
{
"epoch": 0.9612724757952974,
"grad_norm": 0.6245793104171753,
"learning_rate": 0.00012426507679910576,
"loss": 0.7295,
"step": 695
},
{
"epoch": 0.9681881051175657,
"grad_norm": 0.5306342244148254,
"learning_rate": 0.00012309204452893606,
"loss": 0.7239,
"step": 700
},
{
"epoch": 0.975103734439834,
"grad_norm": 0.5488132238388062,
"learning_rate": 0.00012191564604050683,
"loss": 0.7027,
"step": 705
},
{
"epoch": 0.9820193637621023,
"grad_norm": 0.5506658554077148,
"learning_rate": 0.00012073605282205802,
"loss": 0.7397,
"step": 710
},
{
"epoch": 0.9889349930843707,
"grad_norm": 0.5438380241394043,
"learning_rate": 0.00011955343682753794,
"loss": 0.7241,
"step": 715
},
{
"epoch": 0.995850622406639,
"grad_norm": 0.5309740900993347,
"learning_rate": 0.0001183679704515368,
"loss": 0.7209,
"step": 720
},
{
"epoch": 1.0027662517289073,
"grad_norm": 0.6508172750473022,
"learning_rate": 0.00011717982650415624,
"loss": 0.6672,
"step": 725
},
{
"epoch": 1.0096818810511756,
"grad_norm": 0.6566136479377747,
"learning_rate": 0.00011598917818581791,
"loss": 0.5846,
"step": 730
},
{
"epoch": 1.016597510373444,
"grad_norm": 0.5107012987136841,
"learning_rate": 0.00011479619906201557,
"loss": 0.5685,
"step": 735
},
{
"epoch": 1.0235131396957122,
"grad_norm": 0.48931655287742615,
"learning_rate": 0.00011360106303801364,
"loss": 0.5846,
"step": 740
},
{
"epoch": 1.0304287690179805,
"grad_norm": 0.5219966769218445,
"learning_rate": 0.00011240394433349637,
"loss": 0.5588,
"step": 745
},
{
"epoch": 1.037344398340249,
"grad_norm": 0.5277289152145386,
"learning_rate": 0.00011120501745717112,
"loss": 0.5727,
"step": 750
},
{
"epoch": 1.0442600276625174,
"grad_norm": 0.5424395799636841,
"learning_rate": 0.00011000445718132966,
"loss": 0.566,
"step": 755
},
{
"epoch": 1.0511756569847857,
"grad_norm": 0.5607298612594604,
"learning_rate": 0.00010880243851637078,
"loss": 0.57,
"step": 760
},
{
"epoch": 1.058091286307054,
"grad_norm": 0.5789066553115845,
"learning_rate": 0.00010759913668528841,
"loss": 0.5659,
"step": 765
},
{
"epoch": 1.0650069156293223,
"grad_norm": 0.5418556928634644,
"learning_rate": 0.00010639472709812861,
"loss": 0.573,
"step": 770
},
{
"epoch": 1.0719225449515906,
"grad_norm": 0.49530017375946045,
"learning_rate": 0.0001051893853264195,
"loss": 0.5695,
"step": 775
},
{
"epoch": 1.0788381742738589,
"grad_norm": 0.5037497878074646,
"learning_rate": 0.00010398328707757738,
"loss": 0.5651,
"step": 780
},
{
"epoch": 1.0857538035961272,
"grad_norm": 0.5208268761634827,
"learning_rate": 0.00010277660816929313,
"loss": 0.5883,
"step": 785
},
{
"epoch": 1.0926694329183957,
"grad_norm": 0.5543485283851624,
"learning_rate": 0.00010156952450390269,
"loss": 0.5537,
"step": 790
},
{
"epoch": 1.099585062240664,
"grad_norm": 0.6337606310844421,
"learning_rate": 0.00010036221204274512,
"loss": 0.5649,
"step": 795
},
{
"epoch": 1.1065006915629323,
"grad_norm": 0.4707985818386078,
"learning_rate": 9.915484678051175e-05,
"loss": 0.5471,
"step": 800
},
{
"epoch": 1.1134163208852006,
"grad_norm": 0.5109753608703613,
"learning_rate": 9.794760471959116e-05,
"loss": 0.5663,
"step": 805
},
{
"epoch": 1.120331950207469,
"grad_norm": 0.5100296139717102,
"learning_rate": 9.674066184441221e-05,
"loss": 0.5713,
"step": 810
},
{
"epoch": 1.1272475795297372,
"grad_norm": 0.5414464473724365,
"learning_rate": 9.553419409579035e-05,
"loss": 0.5611,
"step": 815
},
{
"epoch": 1.1341632088520055,
"grad_norm": 0.5717695355415344,
"learning_rate": 9.432837734527995e-05,
"loss": 0.5817,
"step": 820
},
{
"epoch": 1.1410788381742738,
"grad_norm": 0.5524587035179138,
"learning_rate": 9.312338736953683e-05,
"loss": 0.5757,
"step": 825
},
{
"epoch": 1.147994467496542,
"grad_norm": 0.5250436663627625,
"learning_rate": 9.191939982469458e-05,
"loss": 0.569,
"step": 830
},
{
"epoch": 1.1549100968188104,
"grad_norm": 0.4710783064365387,
"learning_rate": 9.071659022075849e-05,
"loss": 0.565,
"step": 835
},
{
"epoch": 1.161825726141079,
"grad_norm": 0.4436459541320801,
"learning_rate": 8.951513389602076e-05,
"loss": 0.5666,
"step": 840
},
{
"epoch": 1.1687413554633472,
"grad_norm": 0.5180997252464294,
"learning_rate": 8.831520599150083e-05,
"loss": 0.5595,
"step": 845
},
{
"epoch": 1.1756569847856155,
"grad_norm": 0.49515190720558167,
"learning_rate": 8.71169814254142e-05,
"loss": 0.584,
"step": 850
},
{
"epoch": 1.1825726141078838,
"grad_norm": 0.5082345008850098,
"learning_rate": 8.592063486767406e-05,
"loss": 0.5687,
"step": 855
},
{
"epoch": 1.1894882434301521,
"grad_norm": 0.5054699182510376,
"learning_rate": 8.472634071442896e-05,
"loss": 0.5796,
"step": 860
},
{
"epoch": 1.1964038727524204,
"grad_norm": 0.4742473065853119,
"learning_rate": 8.353427306264032e-05,
"loss": 0.5575,
"step": 865
},
{
"epoch": 1.2033195020746887,
"grad_norm": 0.46908038854599,
"learning_rate": 8.23446056847037e-05,
"loss": 0.5654,
"step": 870
},
{
"epoch": 1.210235131396957,
"grad_norm": 0.4599679708480835,
"learning_rate": 8.115751200311725e-05,
"loss": 0.5452,
"step": 875
},
{
"epoch": 1.2171507607192256,
"grad_norm": 0.48915913701057434,
"learning_rate": 7.99731650652013e-05,
"loss": 0.5667,
"step": 880
},
{
"epoch": 1.2240663900414939,
"grad_norm": 0.5099295377731323,
"learning_rate": 7.879173751787259e-05,
"loss": 0.5673,
"step": 885
},
{
"epoch": 1.2309820193637622,
"grad_norm": 0.49911564588546753,
"learning_rate": 7.761340158247674e-05,
"loss": 0.5695,
"step": 890
},
{
"epoch": 1.2378976486860305,
"grad_norm": 0.4832319915294647,
"learning_rate": 7.64383290296829e-05,
"loss": 0.5477,
"step": 895
},
{
"epoch": 1.2448132780082988,
"grad_norm": 0.4508809447288513,
"learning_rate": 7.526669115444414e-05,
"loss": 0.5738,
"step": 900
},
{
"epoch": 1.251728907330567,
"grad_norm": 0.5219544768333435,
"learning_rate": 7.409865875102704e-05,
"loss": 0.5581,
"step": 905
},
{
"epoch": 1.2586445366528354,
"grad_norm": 0.4636399745941162,
"learning_rate": 7.293440208811435e-05,
"loss": 0.5593,
"step": 910
},
{
"epoch": 1.2655601659751037,
"grad_norm": 0.4862774908542633,
"learning_rate": 7.177409088398425e-05,
"loss": 0.5481,
"step": 915
},
{
"epoch": 1.272475795297372,
"grad_norm": 0.5330405235290527,
"learning_rate": 7.06178942817699e-05,
"loss": 0.5595,
"step": 920
},
{
"epoch": 1.2793914246196403,
"grad_norm": 0.49150362610816956,
"learning_rate": 6.946598082480268e-05,
"loss": 0.5429,
"step": 925
},
{
"epoch": 1.2863070539419086,
"grad_norm": 0.5385686755180359,
"learning_rate": 6.831851843204308e-05,
"loss": 0.5692,
"step": 930
},
{
"epoch": 1.293222683264177,
"grad_norm": 0.487090528011322,
"learning_rate": 6.71756743736024e-05,
"loss": 0.5484,
"step": 935
},
{
"epoch": 1.3001383125864454,
"grad_norm": 0.5285480618476868,
"learning_rate": 6.603761524635914e-05,
"loss": 0.5563,
"step": 940
},
{
"epoch": 1.3070539419087137,
"grad_norm": 0.48829564452171326,
"learning_rate": 6.490450694967358e-05,
"loss": 0.5606,
"step": 945
},
{
"epoch": 1.313969571230982,
"grad_norm": 0.4818781912326813,
"learning_rate": 6.377651466120391e-05,
"loss": 0.5538,
"step": 950
},
{
"epoch": 1.3208852005532503,
"grad_norm": 0.47652438282966614,
"learning_rate": 6.265380281282762e-05,
"loss": 0.5584,
"step": 955
},
{
"epoch": 1.3278008298755186,
"grad_norm": 0.47311243414878845,
"learning_rate": 6.15365350666718e-05,
"loss": 0.5409,
"step": 960
},
{
"epoch": 1.334716459197787,
"grad_norm": 0.46478375792503357,
"learning_rate": 6.042487429125516e-05,
"loss": 0.5554,
"step": 965
},
{
"epoch": 1.3416320885200554,
"grad_norm": 0.4783475399017334,
"learning_rate": 5.931898253774628e-05,
"loss": 0.5465,
"step": 970
},
{
"epoch": 1.3485477178423237,
"grad_norm": 0.48976966738700867,
"learning_rate": 5.821902101634069e-05,
"loss": 0.5563,
"step": 975
},
{
"epoch": 1.355463347164592,
"grad_norm": 0.4838036298751831,
"learning_rate": 5.7125150072760635e-05,
"loss": 0.5628,
"step": 980
},
{
"epoch": 1.3623789764868603,
"grad_norm": 0.46755191683769226,
"learning_rate": 5.603752916488085e-05,
"loss": 0.5472,
"step": 985
},
{
"epoch": 1.3692946058091287,
"grad_norm": 0.50596022605896,
"learning_rate": 5.4956316839483734e-05,
"loss": 0.5632,
"step": 990
},
{
"epoch": 1.376210235131397,
"grad_norm": 0.49392423033714294,
"learning_rate": 5.388167070914738e-05,
"loss": 0.5363,
"step": 995
},
{
"epoch": 1.3831258644536653,
"grad_norm": 0.4692226052284241,
"learning_rate": 5.281374742926987e-05,
"loss": 0.536,
"step": 1000
},
{
"epoch": 1.3900414937759336,
"grad_norm": 0.5283923745155334,
"learning_rate": 5.175270267523278e-05,
"loss": 0.5553,
"step": 1005
},
{
"epoch": 1.3969571230982019,
"grad_norm": 0.47653043270111084,
"learning_rate": 5.069869111970793e-05,
"loss": 0.5492,
"step": 1010
},
{
"epoch": 1.4038727524204702,
"grad_norm": 0.45839253067970276,
"learning_rate": 4.965186641011013e-05,
"loss": 0.5297,
"step": 1015
},
{
"epoch": 1.4107883817427385,
"grad_norm": 0.4624306261539459,
"learning_rate": 4.861238114619929e-05,
"loss": 0.5384,
"step": 1020
},
{
"epoch": 1.417704011065007,
"grad_norm": 0.48706522583961487,
"learning_rate": 4.75803868578355e-05,
"loss": 0.5369,
"step": 1025
},
{
"epoch": 1.4246196403872753,
"grad_norm": 0.4311310350894928,
"learning_rate": 4.655603398288979e-05,
"loss": 0.5276,
"step": 1030
},
{
"epoch": 1.4315352697095436,
"grad_norm": 0.47203701734542847,
"learning_rate": 4.5539471845314304e-05,
"loss": 0.5347,
"step": 1035
},
{
"epoch": 1.438450899031812,
"grad_norm": 0.46674981713294983,
"learning_rate": 4.453084863337471e-05,
"loss": 0.5299,
"step": 1040
},
{
"epoch": 1.4453665283540802,
"grad_norm": 0.42725497484207153,
"learning_rate": 4.353031137804821e-05,
"loss": 0.5369,
"step": 1045
},
{
"epoch": 1.4522821576763485,
"grad_norm": 0.4412887990474701,
"learning_rate": 4.253800593159029e-05,
"loss": 0.5418,
"step": 1050
},
{
"epoch": 1.4591977869986168,
"grad_norm": 0.45071831345558167,
"learning_rate": 4.155407694627322e-05,
"loss": 0.5551,
"step": 1055
},
{
"epoch": 1.4661134163208853,
"grad_norm": 0.44687119126319885,
"learning_rate": 4.057866785329959e-05,
"loss": 0.5424,
"step": 1060
},
{
"epoch": 1.4730290456431536,
"grad_norm": 0.45058682560920715,
"learning_rate": 3.96119208418937e-05,
"loss": 0.5524,
"step": 1065
},
{
"epoch": 1.479944674965422,
"grad_norm": 0.5045320391654968,
"learning_rate": 3.8653976838574104e-05,
"loss": 0.5403,
"step": 1070
},
{
"epoch": 1.4868603042876902,
"grad_norm": 0.4632035791873932,
"learning_rate": 3.770497548661021e-05,
"loss": 0.5346,
"step": 1075
},
{
"epoch": 1.4937759336099585,
"grad_norm": 0.4648893177509308,
"learning_rate": 3.676505512566597e-05,
"loss": 0.5423,
"step": 1080
},
{
"epoch": 1.5006915629322268,
"grad_norm": 0.4647049605846405,
"learning_rate": 3.5834352771633475e-05,
"loss": 0.5319,
"step": 1085
},
{
"epoch": 1.5076071922544951,
"grad_norm": 0.45878130197525024,
"learning_rate": 3.491300409665963e-05,
"loss": 0.533,
"step": 1090
},
{
"epoch": 1.5145228215767634,
"grad_norm": 0.4348316192626953,
"learning_rate": 3.4001143409368773e-05,
"loss": 0.5111,
"step": 1095
},
{
"epoch": 1.5214384508990317,
"grad_norm": 0.47867557406425476,
"learning_rate": 3.309890363528386e-05,
"loss": 0.527,
"step": 1100
},
{
"epoch": 1.5283540802213,
"grad_norm": 0.47667166590690613,
"learning_rate": 3.220641629744947e-05,
"loss": 0.5184,
"step": 1105
},
{
"epoch": 1.5352697095435683,
"grad_norm": 0.4480649530887604,
"learning_rate": 3.132381149725916e-05,
"loss": 0.5218,
"step": 1110
},
{
"epoch": 1.5421853388658366,
"grad_norm": 0.4330606460571289,
"learning_rate": 3.0451217895489992e-05,
"loss": 0.5317,
"step": 1115
},
{
"epoch": 1.5491009681881052,
"grad_norm": 0.45356714725494385,
"learning_rate": 2.9588762693547355e-05,
"loss": 0.5307,
"step": 1120
},
{
"epoch": 1.5560165975103735,
"grad_norm": 0.46869832277297974,
"learning_rate": 2.8736571614922046e-05,
"loss": 0.5231,
"step": 1125
},
{
"epoch": 1.5629322268326418,
"grad_norm": 0.5013293623924255,
"learning_rate": 2.7894768886863233e-05,
"loss": 0.5272,
"step": 1130
},
{
"epoch": 1.56984785615491,
"grad_norm": 0.44263824820518494,
"learning_rate": 2.7063477222269306e-05,
"loss": 0.53,
"step": 1135
},
{
"epoch": 1.5767634854771784,
"grad_norm": 0.45062586665153503,
"learning_rate": 2.6242817801799557e-05,
"loss": 0.5116,
"step": 1140
},
{
"epoch": 1.583679114799447,
"grad_norm": 0.4699437618255615,
"learning_rate": 2.5432910256209187e-05,
"loss": 0.5314,
"step": 1145
},
{
"epoch": 1.5905947441217152,
"grad_norm": 0.45320945978164673,
"learning_rate": 2.4633872648910252e-05,
"loss": 0.5244,
"step": 1150
},
{
"epoch": 1.5975103734439835,
"grad_norm": 0.45921820402145386,
"learning_rate": 2.3845821458761063e-05,
"loss": 0.5265,
"step": 1155
},
{
"epoch": 1.6044260027662518,
"grad_norm": 0.4836861789226532,
"learning_rate": 2.3068871563086757e-05,
"loss": 0.5397,
"step": 1160
},
{
"epoch": 1.61134163208852,
"grad_norm": 0.4829198122024536,
"learning_rate": 2.230313622093295e-05,
"loss": 0.5247,
"step": 1165
},
{
"epoch": 1.6182572614107884,
"grad_norm": 0.42497992515563965,
"learning_rate": 2.154872705655566e-05,
"loss": 0.4998,
"step": 1170
},
{
"epoch": 1.6251728907330567,
"grad_norm": 0.4427035450935364,
"learning_rate": 2.0805754043149394e-05,
"loss": 0.52,
"step": 1175
},
{
"epoch": 1.632088520055325,
"grad_norm": 0.4457587003707886,
"learning_rate": 2.0074325486815883e-05,
"loss": 0.4931,
"step": 1180
},
{
"epoch": 1.6390041493775933,
"grad_norm": 0.46790486574172974,
"learning_rate": 1.9354548010775896e-05,
"loss": 0.529,
"step": 1185
},
{
"epoch": 1.6459197786998616,
"grad_norm": 0.4801090955734253,
"learning_rate": 1.864652653982636e-05,
"loss": 0.5188,
"step": 1190
},
{
"epoch": 1.65283540802213,
"grad_norm": 0.4337711036205292,
"learning_rate": 1.7950364285044996e-05,
"loss": 0.5105,
"step": 1195
},
{
"epoch": 1.6597510373443982,
"grad_norm": 0.45500195026397705,
"learning_rate": 1.7266162728744993e-05,
"loss": 0.5049,
"step": 1200
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.4910541772842407,
"learning_rate": 1.6594021609681344e-05,
"loss": 0.5079,
"step": 1205
},
{
"epoch": 1.673582295988935,
"grad_norm": 0.4468669295310974,
"learning_rate": 1.5934038908511616e-05,
"loss": 0.5135,
"step": 1210
},
{
"epoch": 1.6804979253112033,
"grad_norm": 0.4427061975002289,
"learning_rate": 1.5286310833512963e-05,
"loss": 0.5019,
"step": 1215
},
{
"epoch": 1.6874135546334716,
"grad_norm": 0.44807228446006775,
"learning_rate": 1.4650931806557389e-05,
"loss": 0.5121,
"step": 1220
},
{
"epoch": 1.69432918395574,
"grad_norm": 0.45542779564857483,
"learning_rate": 1.402799444934757e-05,
"loss": 0.5183,
"step": 1225
},
{
"epoch": 1.7012448132780082,
"grad_norm": 0.44287553429603577,
"learning_rate": 1.3417589569914978e-05,
"loss": 0.4962,
"step": 1230
},
{
"epoch": 1.7081604426002768,
"grad_norm": 0.43979722261428833,
"learning_rate": 1.2819806149382441e-05,
"loss": 0.5065,
"step": 1235
},
{
"epoch": 1.715076071922545,
"grad_norm": 0.4374731481075287,
"learning_rate": 1.2234731328993055e-05,
"loss": 0.5144,
"step": 1240
},
{
"epoch": 1.7219917012448134,
"grad_norm": 0.4652646481990814,
"learning_rate": 1.1662450397407188e-05,
"loss": 0.5017,
"step": 1245
},
{
"epoch": 1.7289073305670817,
"grad_norm": 0.4413568377494812,
"learning_rate": 1.1103046778269687e-05,
"loss": 0.5109,
"step": 1250
},
{
"epoch": 1.73582295988935,
"grad_norm": 0.4495951235294342,
"learning_rate": 1.0556602018048866e-05,
"loss": 0.502,
"step": 1255
},
{
"epoch": 1.7427385892116183,
"grad_norm": 0.4346306025981903,
"learning_rate": 1.0023195774149119e-05,
"loss": 0.4963,
"step": 1260
},
{
"epoch": 1.7496542185338866,
"grad_norm": 0.4612555503845215,
"learning_rate": 9.502905803299e-06,
"loss": 0.5006,
"step": 1265
},
{
"epoch": 1.7565698478561549,
"grad_norm": 0.46448782086372375,
"learning_rate": 8.995807950216262e-06,
"loss": 0.5195,
"step": 1270
},
{
"epoch": 1.7634854771784232,
"grad_norm": 0.44382619857788086,
"learning_rate": 8.501976136551749e-06,
"loss": 0.5061,
"step": 1275
},
{
"epoch": 1.7704011065006915,
"grad_norm": 0.45043641328811646,
"learning_rate": 8.021482350113474e-06,
"loss": 0.5254,
"step": 1280
},
{
"epoch": 1.7773167358229598,
"grad_norm": 0.44958069920539856,
"learning_rate": 7.554396634372707e-06,
"loss": 0.5311,
"step": 1285
},
{
"epoch": 1.784232365145228,
"grad_norm": 0.433776319026947,
"learning_rate": 7.100787078253446e-06,
"loss": 0.486,
"step": 1290
},
{
"epoch": 1.7911479944674964,
"grad_norm": 0.41777148842811584,
"learning_rate": 6.660719806206839e-06,
"loss": 0.5108,
"step": 1295
},
{
"epoch": 1.798063623789765,
"grad_norm": 0.4487934112548828,
"learning_rate": 6.234258968571971e-06,
"loss": 0.5006,
"step": 1300
},
{
"epoch": 1.8049792531120332,
"grad_norm": 0.4544650614261627,
"learning_rate": 5.821466732224412e-06,
"loss": 0.5037,
"step": 1305
},
{
"epoch": 1.8118948824343015,
"grad_norm": 0.41929152607917786,
"learning_rate": 5.422403271513854e-06,
"loss": 0.4978,
"step": 1310
},
{
"epoch": 1.8188105117565698,
"grad_norm": 0.4613235294818878,
"learning_rate": 5.0371267594923834e-06,
"loss": 0.5209,
"step": 1315
},
{
"epoch": 1.8257261410788381,
"grad_norm": 0.44448205828666687,
"learning_rate": 4.66569335943422e-06,
"loss": 0.5014,
"step": 1320
},
{
"epoch": 1.8326417704011067,
"grad_norm": 0.43924444913864136,
"learning_rate": 4.3081572166486675e-06,
"loss": 0.5247,
"step": 1325
},
{
"epoch": 1.839557399723375,
"grad_norm": 0.44769856333732605,
"learning_rate": 3.964570450587113e-06,
"loss": 0.5005,
"step": 1330
},
{
"epoch": 1.8464730290456433,
"grad_norm": 0.4331699311733246,
"learning_rate": 3.6349831472453743e-06,
"loss": 0.5041,
"step": 1335
},
{
"epoch": 1.8533886583679116,
"grad_norm": 0.43336230516433716,
"learning_rate": 3.3194433518624614e-06,
"loss": 0.5162,
"step": 1340
},
{
"epoch": 1.8603042876901799,
"grad_norm": 0.4407755434513092,
"learning_rate": 3.017997061916833e-06,
"loss": 0.501,
"step": 1345
},
{
"epoch": 1.8672199170124482,
"grad_norm": 0.43583956360816956,
"learning_rate": 2.7306882204211626e-06,
"loss": 0.5009,
"step": 1350
},
{
"epoch": 1.8741355463347165,
"grad_norm": 0.42576801776885986,
"learning_rate": 2.4575587095166054e-06,
"loss": 0.5128,
"step": 1355
},
{
"epoch": 1.8810511756569848,
"grad_norm": 0.42050647735595703,
"learning_rate": 2.198648344367449e-06,
"loss": 0.5069,
"step": 1360
},
{
"epoch": 1.887966804979253,
"grad_norm": 0.4313651919364929,
"learning_rate": 1.953994867357134e-06,
"loss": 0.5055,
"step": 1365
},
{
"epoch": 1.8948824343015214,
"grad_norm": 0.43365243077278137,
"learning_rate": 1.7236339425863446e-06,
"loss": 0.5145,
"step": 1370
},
{
"epoch": 1.9017980636237897,
"grad_norm": 0.4598560035228729,
"learning_rate": 1.507599150674177e-06,
"loss": 0.5191,
"step": 1375
},
{
"epoch": 1.908713692946058,
"grad_norm": 0.4304969310760498,
"learning_rate": 1.3059219838629234e-06,
"loss": 0.501,
"step": 1380
},
{
"epoch": 1.9156293222683263,
"grad_norm": 0.44647660851478577,
"learning_rate": 1.11863184142732e-06,
"loss": 0.5063,
"step": 1385
},
{
"epoch": 1.9225449515905948,
"grad_norm": 0.40728816390037537,
"learning_rate": 9.457560253889219e-07,
"loss": 0.5009,
"step": 1390
},
{
"epoch": 1.929460580912863,
"grad_norm": 0.4395773410797119,
"learning_rate": 7.873197365361407e-07,
"loss": 0.4993,
"step": 1395
},
{
"epoch": 1.9363762102351314,
"grad_norm": 0.44724011421203613,
"learning_rate": 6.433460707506722e-07,
"loss": 0.5065,
"step": 1400
},
{
"epoch": 1.9432918395573997,
"grad_norm": 0.41976112127304077,
"learning_rate": 5.138560156407124e-07,
"loss": 0.5058,
"step": 1405
},
{
"epoch": 1.950207468879668,
"grad_norm": 0.43243739008903503,
"learning_rate": 3.988684474814819e-07,
"loss": 0.4919,
"step": 1410
},
{
"epoch": 1.9571230982019365,
"grad_norm": 0.4788164496421814,
"learning_rate": 2.984001284635496e-07,
"loss": 0.5128,
"step": 1415
},
{
"epoch": 1.9640387275242048,
"grad_norm": 0.42695632576942444,
"learning_rate": 2.1246570424940936e-07,
"loss": 0.5116,
"step": 1420
},
{
"epoch": 1.9709543568464731,
"grad_norm": 0.40510523319244385,
"learning_rate": 1.4107770183845458e-07,
"loss": 0.5058,
"step": 1425
},
{
"epoch": 1.9778699861687414,
"grad_norm": 0.4062838554382324,
"learning_rate": 8.424652774089436e-08,
"loss": 0.5128,
"step": 1430
},
{
"epoch": 1.9847856154910097,
"grad_norm": 0.4485253393650055,
"learning_rate": 4.198046646075593e-08,
"loss": 0.5062,
"step": 1435
},
{
"epoch": 1.991701244813278,
"grad_norm": 0.43608593940734863,
"learning_rate": 1.4285679288228437e-08,
"loss": 0.4919,
"step": 1440
},
{
"epoch": 1.9986168741355463,
"grad_norm": 0.4265099763870239,
"learning_rate": 1.166203401481436e-09,
"loss": 0.5001,
"step": 1445
},
{
"epoch": 2.0,
"step": 1446,
"total_flos": 6.011530730704732e+18,
"train_loss": 0.6795949197244182,
"train_runtime": 13742.7497,
"train_samples_per_second": 6.734,
"train_steps_per_second": 0.105
}
],
"logging_steps": 5,
"max_steps": 1446,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.011530730704732e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}