{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9969604863221884,
"eval_steps": 500,
"global_step": 2466,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0121580547112462,
"grad_norm": 5.922801494598389,
"learning_rate": 3.6437246963562754e-07,
"loss": 1.1863,
"step": 10
},
{
"epoch": 0.0243161094224924,
"grad_norm": 6.66644811630249,
"learning_rate": 7.692307692307694e-07,
"loss": 1.1376,
"step": 20
},
{
"epoch": 0.0364741641337386,
"grad_norm": 7.5532402992248535,
"learning_rate": 1.174089068825911e-06,
"loss": 1.1034,
"step": 30
},
{
"epoch": 0.0486322188449848,
"grad_norm": 4.451807498931885,
"learning_rate": 1.5789473684210526e-06,
"loss": 1.0824,
"step": 40
},
{
"epoch": 0.060790273556231005,
"grad_norm": 4.282721996307373,
"learning_rate": 1.9838056680161946e-06,
"loss": 0.9437,
"step": 50
},
{
"epoch": 0.0729483282674772,
"grad_norm": 3.476186990737915,
"learning_rate": 2.3886639676113362e-06,
"loss": 0.8906,
"step": 60
},
{
"epoch": 0.0851063829787234,
"grad_norm": 2.9487781524658203,
"learning_rate": 2.7935222672064783e-06,
"loss": 0.8558,
"step": 70
},
{
"epoch": 0.0972644376899696,
"grad_norm": 3.981879949569702,
"learning_rate": 3.19838056680162e-06,
"loss": 0.7978,
"step": 80
},
{
"epoch": 0.1094224924012158,
"grad_norm": 3.0333662033081055,
"learning_rate": 3.6032388663967615e-06,
"loss": 0.7652,
"step": 90
},
{
"epoch": 0.12158054711246201,
"grad_norm": 2.985710859298706,
"learning_rate": 4.008097165991903e-06,
"loss": 0.7671,
"step": 100
},
{
"epoch": 0.1337386018237082,
"grad_norm": 2.512871026992798,
"learning_rate": 4.412955465587045e-06,
"loss": 0.7848,
"step": 110
},
{
"epoch": 0.1458966565349544,
"grad_norm": 3.5035741329193115,
"learning_rate": 4.817813765182186e-06,
"loss": 0.7419,
"step": 120
},
{
"epoch": 0.1580547112462006,
"grad_norm": 4.36132287979126,
"learning_rate": 5.222672064777329e-06,
"loss": 0.7379,
"step": 130
},
{
"epoch": 0.1702127659574468,
"grad_norm": 3.3608834743499756,
"learning_rate": 5.6275303643724695e-06,
"loss": 0.7135,
"step": 140
},
{
"epoch": 0.182370820668693,
"grad_norm": 3.3592512607574463,
"learning_rate": 6.0323886639676124e-06,
"loss": 0.7427,
"step": 150
},
{
"epoch": 0.1945288753799392,
"grad_norm": 2.913890838623047,
"learning_rate": 6.437246963562754e-06,
"loss": 0.6847,
"step": 160
},
{
"epoch": 0.2066869300911854,
"grad_norm": 2.6773297786712646,
"learning_rate": 6.842105263157896e-06,
"loss": 0.7409,
"step": 170
},
{
"epoch": 0.2188449848024316,
"grad_norm": 2.984339714050293,
"learning_rate": 7.246963562753037e-06,
"loss": 0.6755,
"step": 180
},
{
"epoch": 0.23100303951367782,
"grad_norm": 3.006441593170166,
"learning_rate": 7.651821862348178e-06,
"loss": 0.7105,
"step": 190
},
{
"epoch": 0.24316109422492402,
"grad_norm": 3.7500977516174316,
"learning_rate": 8.056680161943322e-06,
"loss": 0.6796,
"step": 200
},
{
"epoch": 0.2553191489361702,
"grad_norm": 3.6839091777801514,
"learning_rate": 8.461538461538462e-06,
"loss": 0.6716,
"step": 210
},
{
"epoch": 0.2674772036474164,
"grad_norm": 3.265120267868042,
"learning_rate": 8.866396761133604e-06,
"loss": 0.6728,
"step": 220
},
{
"epoch": 0.2796352583586626,
"grad_norm": 3.2442452907562256,
"learning_rate": 9.271255060728746e-06,
"loss": 0.6807,
"step": 230
},
{
"epoch": 0.2917933130699088,
"grad_norm": 3.7613420486450195,
"learning_rate": 9.676113360323888e-06,
"loss": 0.674,
"step": 240
},
{
"epoch": 0.303951367781155,
"grad_norm": 4.201897621154785,
"learning_rate": 9.999979955978923e-06,
"loss": 0.6987,
"step": 250
},
{
"epoch": 0.3161094224924012,
"grad_norm": 3.1414012908935547,
"learning_rate": 9.999278432115106e-06,
"loss": 0.6584,
"step": 260
},
{
"epoch": 0.3282674772036474,
"grad_norm": 3.2702925205230713,
"learning_rate": 9.99757486789673e-06,
"loss": 0.683,
"step": 270
},
{
"epoch": 0.3404255319148936,
"grad_norm": 2.9207000732421875,
"learning_rate": 9.9948696047811e-06,
"loss": 0.6895,
"step": 280
},
{
"epoch": 0.3525835866261398,
"grad_norm": 3.319397211074829,
"learning_rate": 9.991163185003028e-06,
"loss": 0.66,
"step": 290
},
{
"epoch": 0.364741641337386,
"grad_norm": 2.8080954551696777,
"learning_rate": 9.98645635146616e-06,
"loss": 0.6914,
"step": 300
},
{
"epoch": 0.3768996960486322,
"grad_norm": 4.024887561798096,
"learning_rate": 9.980750047594076e-06,
"loss": 0.6511,
"step": 310
},
{
"epoch": 0.3890577507598784,
"grad_norm": 2.6032838821411133,
"learning_rate": 9.974045417141186e-06,
"loss": 0.6522,
"step": 320
},
{
"epoch": 0.4012158054711246,
"grad_norm": 2.4978721141815186,
"learning_rate": 9.966343803963481e-06,
"loss": 0.6517,
"step": 330
},
{
"epoch": 0.4133738601823708,
"grad_norm": 3.0864603519439697,
"learning_rate": 9.957646751749178e-06,
"loss": 0.662,
"step": 340
},
{
"epoch": 0.425531914893617,
"grad_norm": 3.713203191757202,
"learning_rate": 9.947956003709301e-06,
"loss": 0.6728,
"step": 350
},
{
"epoch": 0.4376899696048632,
"grad_norm": 2.8905911445617676,
"learning_rate": 9.937273502228283e-06,
"loss": 0.6905,
"step": 360
},
{
"epoch": 0.44984802431610943,
"grad_norm": 3.0076723098754883,
"learning_rate": 9.925601388474637e-06,
"loss": 0.6955,
"step": 370
},
{
"epoch": 0.46200607902735563,
"grad_norm": 3.2814724445343018,
"learning_rate": 9.912942001971792e-06,
"loss": 0.6176,
"step": 380
},
{
"epoch": 0.47416413373860183,
"grad_norm": 2.9065964221954346,
"learning_rate": 9.899297880129156e-06,
"loss": 0.6768,
"step": 390
},
{
"epoch": 0.48632218844984804,
"grad_norm": 2.724536657333374,
"learning_rate": 9.884671757733534e-06,
"loss": 0.6382,
"step": 400
},
{
"epoch": 0.49848024316109424,
"grad_norm": 2.8882269859313965,
"learning_rate": 9.869066566400975e-06,
"loss": 0.6603,
"step": 410
},
{
"epoch": 0.5106382978723404,
"grad_norm": 2.5395541191101074,
"learning_rate": 9.852485433989158e-06,
"loss": 0.642,
"step": 420
},
{
"epoch": 0.5227963525835866,
"grad_norm": 2.759591579437256,
"learning_rate": 9.834931683970468e-06,
"loss": 0.6424,
"step": 430
},
{
"epoch": 0.5349544072948328,
"grad_norm": 2.4878828525543213,
"learning_rate": 9.816408834765838e-06,
"loss": 0.6435,
"step": 440
},
{
"epoch": 0.547112462006079,
"grad_norm": 2.7571871280670166,
"learning_rate": 9.796920599039536e-06,
"loss": 0.6766,
"step": 450
},
{
"epoch": 0.5592705167173252,
"grad_norm": 2.585879325866699,
"learning_rate": 9.776470882954998e-06,
"loss": 0.6082,
"step": 460
},
{
"epoch": 0.5714285714285714,
"grad_norm": 2.5847771167755127,
"learning_rate": 9.7550637853919e-06,
"loss": 0.6551,
"step": 470
},
{
"epoch": 0.5835866261398176,
"grad_norm": 2.795027732849121,
"learning_rate": 9.732703597124586e-06,
"loss": 0.6429,
"step": 480
},
{
"epoch": 0.5957446808510638,
"grad_norm": 2.377835273742676,
"learning_rate": 9.709394799962038e-06,
"loss": 0.6386,
"step": 490
},
{
"epoch": 0.60790273556231,
"grad_norm": 3.254490852355957,
"learning_rate": 9.685142065849556e-06,
"loss": 0.5844,
"step": 500
},
{
"epoch": 0.6200607902735562,
"grad_norm": 2.866058588027954,
"learning_rate": 9.659950255932324e-06,
"loss": 0.6079,
"step": 510
},
{
"epoch": 0.6322188449848024,
"grad_norm": 2.680647373199463,
"learning_rate": 9.633824419581069e-06,
"loss": 0.6294,
"step": 520
},
{
"epoch": 0.6443768996960486,
"grad_norm": 3.1423895359039307,
"learning_rate": 9.60676979337996e-06,
"loss": 0.6311,
"step": 530
},
{
"epoch": 0.6565349544072948,
"grad_norm": 2.480421304702759,
"learning_rate": 9.578791800077021e-06,
"loss": 0.6395,
"step": 540
},
{
"epoch": 0.668693009118541,
"grad_norm": 3.06001877784729,
"learning_rate": 9.549896047497202e-06,
"loss": 0.6613,
"step": 550
},
{
"epoch": 0.6808510638297872,
"grad_norm": 2.8360841274261475,
"learning_rate": 9.520088327418371e-06,
"loss": 0.6161,
"step": 560
},
{
"epoch": 0.6930091185410334,
"grad_norm": 3.1503562927246094,
"learning_rate": 9.489374614410413e-06,
"loss": 0.6137,
"step": 570
},
{
"epoch": 0.7051671732522796,
"grad_norm": 2.579737663269043,
"learning_rate": 9.457761064637727e-06,
"loss": 0.6068,
"step": 580
},
{
"epoch": 0.7173252279635258,
"grad_norm": 3.200549602508545,
"learning_rate": 9.425254014625278e-06,
"loss": 0.6436,
"step": 590
},
{
"epoch": 0.729483282674772,
"grad_norm": 2.9303839206695557,
"learning_rate": 9.391859979988546e-06,
"loss": 0.6062,
"step": 600
},
{
"epoch": 0.7416413373860182,
"grad_norm": 3.0887465476989746,
"learning_rate": 9.35758565412754e-06,
"loss": 0.6244,
"step": 610
},
{
"epoch": 0.7537993920972644,
"grad_norm": 3.22578763961792,
"learning_rate": 9.322437906885199e-06,
"loss": 0.6544,
"step": 620
},
{
"epoch": 0.7659574468085106,
"grad_norm": 3.8590402603149414,
"learning_rate": 9.28642378317042e-06,
"loss": 0.6369,
"step": 630
},
{
"epoch": 0.7781155015197568,
"grad_norm": 3.001377820968628,
"learning_rate": 9.249550501545998e-06,
"loss": 0.6556,
"step": 640
},
{
"epoch": 0.790273556231003,
"grad_norm": 2.9286224842071533,
"learning_rate": 9.211825452781762e-06,
"loss": 0.599,
"step": 650
},
{
"epoch": 0.8024316109422492,
"grad_norm": 3.0069026947021484,
"learning_rate": 9.173256198373185e-06,
"loss": 0.6284,
"step": 660
},
{
"epoch": 0.8145896656534954,
"grad_norm": 3.2098565101623535,
"learning_rate": 9.133850469025786e-06,
"loss": 0.6047,
"step": 670
},
{
"epoch": 0.8267477203647416,
"grad_norm": 2.4998652935028076,
"learning_rate": 9.093616163105609e-06,
"loss": 0.6233,
"step": 680
},
{
"epoch": 0.8389057750759878,
"grad_norm": 2.6951255798339844,
"learning_rate": 9.052561345056095e-06,
"loss": 0.6288,
"step": 690
},
{
"epoch": 0.851063829787234,
"grad_norm": 2.79081654548645,
"learning_rate": 9.010694243781671e-06,
"loss": 0.6248,
"step": 700
},
{
"epoch": 0.8632218844984803,
"grad_norm": 3.0113465785980225,
"learning_rate": 8.96802325099838e-06,
"loss": 0.6262,
"step": 710
},
{
"epoch": 0.8753799392097265,
"grad_norm": 2.396683692932129,
"learning_rate": 8.924556919551863e-06,
"loss": 0.6154,
"step": 720
},
{
"epoch": 0.8875379939209727,
"grad_norm": 2.604168176651001,
"learning_rate": 8.880303961703048e-06,
"loss": 0.6044,
"step": 730
},
{
"epoch": 0.8996960486322189,
"grad_norm": 2.6339340209960938,
"learning_rate": 8.835273247381903e-06,
"loss": 0.6367,
"step": 740
},
{
"epoch": 0.9118541033434651,
"grad_norm": 2.699192762374878,
"learning_rate": 8.789473802409565e-06,
"loss": 0.6598,
"step": 750
},
{
"epoch": 0.9240121580547113,
"grad_norm": 3.0326826572418213,
"learning_rate": 8.742914806689234e-06,
"loss": 0.596,
"step": 760
},
{
"epoch": 0.9361702127659575,
"grad_norm": 3.168898582458496,
"learning_rate": 8.695605592366184e-06,
"loss": 0.5843,
"step": 770
},
{
"epoch": 0.9483282674772037,
"grad_norm": 2.6533005237579346,
"learning_rate": 8.647555641957243e-06,
"loss": 0.598,
"step": 780
},
{
"epoch": 0.9604863221884499,
"grad_norm": 3.4256536960601807,
"learning_rate": 8.59877458645017e-06,
"loss": 0.588,
"step": 790
},
{
"epoch": 0.9726443768996961,
"grad_norm": 2.429436445236206,
"learning_rate": 8.54927220337322e-06,
"loss": 0.5955,
"step": 800
},
{
"epoch": 0.9848024316109423,
"grad_norm": 2.6898012161254883,
"learning_rate": 8.499058414835389e-06,
"loss": 0.6068,
"step": 810
},
{
"epoch": 0.9969604863221885,
"grad_norm": 2.064389944076538,
"learning_rate": 8.448143285537645e-06,
"loss": 0.5694,
"step": 820
},
{
"epoch": 1.0085106382978724,
"grad_norm": 2.412479877471924,
"learning_rate": 8.396537020755588e-06,
"loss": 0.4937,
"step": 830
},
{
"epoch": 1.0206686930091184,
"grad_norm": 2.439929246902466,
"learning_rate": 8.344249964293942e-06,
"loss": 0.4945,
"step": 840
},
{
"epoch": 1.0328267477203648,
"grad_norm": 2.665113687515259,
"learning_rate": 8.291292596413272e-06,
"loss": 0.499,
"step": 850
},
{
"epoch": 1.0449848024316108,
"grad_norm": 2.39530086517334,
"learning_rate": 8.237675531729345e-06,
"loss": 0.4825,
"step": 860
},
{
"epoch": 1.0571428571428572,
"grad_norm": 2.943648099899292,
"learning_rate": 8.18340951708558e-06,
"loss": 0.4683,
"step": 870
},
{
"epoch": 1.0693009118541033,
"grad_norm": 2.766900062561035,
"learning_rate": 8.128505429398976e-06,
"loss": 0.477,
"step": 880
},
{
"epoch": 1.0814589665653496,
"grad_norm": 2.389639139175415,
"learning_rate": 8.072974273479972e-06,
"loss": 0.5606,
"step": 890
},
{
"epoch": 1.0936170212765957,
"grad_norm": 2.3432369232177734,
"learning_rate": 8.016827179826685e-06,
"loss": 0.4753,
"step": 900
},
{
"epoch": 1.105775075987842,
"grad_norm": 2.27811598777771,
"learning_rate": 7.960075402393937e-06,
"loss": 0.4733,
"step": 910
},
{
"epoch": 1.117933130699088,
"grad_norm": 2.5332465171813965,
"learning_rate": 7.902730316337556e-06,
"loss": 0.4444,
"step": 920
},
{
"epoch": 1.1300911854103344,
"grad_norm": 2.445450782775879,
"learning_rate": 7.844803415734368e-06,
"loss": 0.4694,
"step": 930
},
{
"epoch": 1.1422492401215805,
"grad_norm": 2.309290647506714,
"learning_rate": 7.786306311278354e-06,
"loss": 0.4617,
"step": 940
},
{
"epoch": 1.1544072948328268,
"grad_norm": 2.7566754817962646,
"learning_rate": 7.727250727953445e-06,
"loss": 0.5046,
"step": 950
},
{
"epoch": 1.1665653495440729,
"grad_norm": 2.475858211517334,
"learning_rate": 7.667648502683406e-06,
"loss": 0.5175,
"step": 960
},
{
"epoch": 1.1787234042553192,
"grad_norm": 2.5813775062561035,
"learning_rate": 7.607511581959261e-06,
"loss": 0.4656,
"step": 970
},
{
"epoch": 1.1908814589665653,
"grad_norm": 2.9376819133758545,
"learning_rate": 7.5468520194447925e-06,
"loss": 0.4945,
"step": 980
},
{
"epoch": 1.2030395136778116,
"grad_norm": 2.2708747386932373,
"learning_rate": 7.485681973560532e-06,
"loss": 0.4931,
"step": 990
},
{
"epoch": 1.2151975683890577,
"grad_norm": 2.87481427192688,
"learning_rate": 7.4240137050467635e-06,
"loss": 0.4713,
"step": 1000
},
{
"epoch": 1.227355623100304,
"grad_norm": 3.1325576305389404,
"learning_rate": 7.361859574506017e-06,
"loss": 0.4775,
"step": 1010
},
{
"epoch": 1.23951367781155,
"grad_norm": 2.519160509109497,
"learning_rate": 7.299232039925552e-06,
"loss": 0.4747,
"step": 1020
},
{
"epoch": 1.2516717325227964,
"grad_norm": 3.1819024085998535,
"learning_rate": 7.236143654180311e-06,
"loss": 0.4836,
"step": 1030
},
{
"epoch": 1.2638297872340425,
"grad_norm": 2.5699877738952637,
"learning_rate": 7.172607062516856e-06,
"loss": 0.4471,
"step": 1040
},
{
"epoch": 1.2759878419452888,
"grad_norm": 3.718123435974121,
"learning_rate": 7.108635000018802e-06,
"loss": 0.5022,
"step": 1050
},
{
"epoch": 1.288145896656535,
"grad_norm": 3.3233118057250977,
"learning_rate": 7.044240289054227e-06,
"loss": 0.4829,
"step": 1060
},
{
"epoch": 1.3003039513677812,
"grad_norm": 2.362790584564209,
"learning_rate": 6.979435836705602e-06,
"loss": 0.4801,
"step": 1070
},
{
"epoch": 1.3124620060790273,
"grad_norm": 2.5460381507873535,
"learning_rate": 6.9142346321827246e-06,
"loss": 0.4922,
"step": 1080
},
{
"epoch": 1.3246200607902736,
"grad_norm": 3.0161983966827393,
"learning_rate": 6.84864974421921e-06,
"loss": 0.4734,
"step": 1090
},
{
"epoch": 1.3367781155015197,
"grad_norm": 2.387080669403076,
"learning_rate": 6.782694318453033e-06,
"loss": 0.4924,
"step": 1100
},
{
"epoch": 1.348936170212766,
"grad_norm": 3.0172154903411865,
"learning_rate": 6.716381574791648e-06,
"loss": 0.4669,
"step": 1110
},
{
"epoch": 1.361094224924012,
"grad_norm": 3.0593836307525635,
"learning_rate": 6.649724804762236e-06,
"loss": 0.4689,
"step": 1120
},
{
"epoch": 1.3732522796352584,
"grad_norm": 2.0975329875946045,
"learning_rate": 6.5827373688475925e-06,
"loss": 0.501,
"step": 1130
},
{
"epoch": 1.3854103343465045,
"grad_norm": 2.6222572326660156,
"learning_rate": 6.5154326938081866e-06,
"loss": 0.487,
"step": 1140
},
{
"epoch": 1.3975683890577508,
"grad_norm": 2.6791985034942627,
"learning_rate": 6.447824269990947e-06,
"loss": 0.4589,
"step": 1150
},
{
"epoch": 1.409726443768997,
"grad_norm": 3.0119071006774902,
"learning_rate": 6.3799256486252945e-06,
"loss": 0.4839,
"step": 1160
},
{
"epoch": 1.4218844984802432,
"grad_norm": 3.6317973136901855,
"learning_rate": 6.311750439106976e-06,
"loss": 0.4391,
"step": 1170
},
{
"epoch": 1.4340425531914893,
"grad_norm": 1.9667277336120605,
"learning_rate": 6.243312306270235e-06,
"loss": 0.4379,
"step": 1180
},
{
"epoch": 1.4462006079027356,
"grad_norm": 2.5585360527038574,
"learning_rate": 6.174624967648877e-06,
"loss": 0.4954,
"step": 1190
},
{
"epoch": 1.4583586626139817,
"grad_norm": 2.421276330947876,
"learning_rate": 6.105702190726765e-06,
"loss": 0.4558,
"step": 1200
},
{
"epoch": 1.470516717325228,
"grad_norm": 2.7117786407470703,
"learning_rate": 6.03655779017831e-06,
"loss": 0.488,
"step": 1210
},
{
"epoch": 1.4826747720364741,
"grad_norm": 2.3945164680480957,
"learning_rate": 5.967205625099496e-06,
"loss": 0.4849,
"step": 1220
},
{
"epoch": 1.4948328267477204,
"grad_norm": 2.291707754135132,
"learning_rate": 5.897659596230003e-06,
"loss": 0.4614,
"step": 1230
},
{
"epoch": 1.5069908814589665,
"grad_norm": 2.583559036254883,
"learning_rate": 5.827933643166993e-06,
"loss": 0.4626,
"step": 1240
},
{
"epoch": 1.5191489361702128,
"grad_norm": 2.1779534816741943,
"learning_rate": 5.758041741571088e-06,
"loss": 0.4774,
"step": 1250
},
{
"epoch": 1.531306990881459,
"grad_norm": 3.738179922103882,
"learning_rate": 5.687997900365134e-06,
"loss": 0.4487,
"step": 1260
},
{
"epoch": 1.543465045592705,
"grad_norm": 2.461461305618286,
"learning_rate": 5.617816158926303e-06,
"loss": 0.4851,
"step": 1270
},
{
"epoch": 1.5556231003039513,
"grad_norm": 2.784620523452759,
"learning_rate": 5.547510584272069e-06,
"loss": 0.5079,
"step": 1280
},
{
"epoch": 1.5677811550151977,
"grad_norm": 2.8999369144439697,
"learning_rate": 5.477095268240669e-06,
"loss": 0.4596,
"step": 1290
},
{
"epoch": 1.5799392097264437,
"grad_norm": 2.786457061767578,
"learning_rate": 5.406584324666565e-06,
"loss": 0.4226,
"step": 1300
},
{
"epoch": 1.5920972644376898,
"grad_norm": 3.0281615257263184,
"learning_rate": 5.335991886551526e-06,
"loss": 0.4826,
"step": 1310
},
{
"epoch": 1.6042553191489362,
"grad_norm": 2.60740327835083,
"learning_rate": 5.2653321032318315e-06,
"loss": 0.5185,
"step": 1320
},
{
"epoch": 1.6164133738601825,
"grad_norm": 2.150132179260254,
"learning_rate": 5.194619137542241e-06,
"loss": 0.511,
"step": 1330
},
{
"epoch": 1.6285714285714286,
"grad_norm": 2.7613778114318848,
"learning_rate": 5.123867162977224e-06,
"loss": 0.4653,
"step": 1340
},
{
"epoch": 1.6407294832826747,
"grad_norm": 2.5103297233581543,
"learning_rate": 5.053090360850072e-06,
"loss": 0.4206,
"step": 1350
},
{
"epoch": 1.652887537993921,
"grad_norm": 2.6275055408477783,
"learning_rate": 4.9823029174504335e-06,
"loss": 0.4727,
"step": 1360
},
{
"epoch": 1.6650455927051673,
"grad_norm": 2.6730258464813232,
"learning_rate": 4.9115190212008745e-06,
"loss": 0.4616,
"step": 1370
},
{
"epoch": 1.6772036474164134,
"grad_norm": 2.8876965045928955,
"learning_rate": 4.840752859812972e-06,
"loss": 0.4868,
"step": 1380
},
{
"epoch": 1.6893617021276595,
"grad_norm": 2.7904891967773438,
"learning_rate": 4.770018617443578e-06,
"loss": 0.4453,
"step": 1390
},
{
"epoch": 1.7015197568389058,
"grad_norm": 2.7088356018066406,
"learning_rate": 4.699330471851798e-06,
"loss": 0.4708,
"step": 1400
},
{
"epoch": 1.713677811550152,
"grad_norm": 2.1268298625946045,
"learning_rate": 4.628702591557237e-06,
"loss": 0.4901,
"step": 1410
},
{
"epoch": 1.7258358662613982,
"grad_norm": 2.3978848457336426,
"learning_rate": 4.558149133000104e-06,
"loss": 0.5164,
"step": 1420
},
{
"epoch": 1.7379939209726443,
"grad_norm": 1.8490489721298218,
"learning_rate": 4.487684237703734e-06,
"loss": 0.4342,
"step": 1430
},
{
"epoch": 1.7501519756838906,
"grad_norm": 2.7367939949035645,
"learning_rate": 4.417322029440119e-06,
"loss": 0.4887,
"step": 1440
},
{
"epoch": 1.762310030395137,
"grad_norm": 2.217988967895508,
"learning_rate": 4.347076611398961e-06,
"loss": 0.46,
"step": 1450
},
{
"epoch": 1.774468085106383,
"grad_norm": 2.8989672660827637,
"learning_rate": 4.2769620633608835e-06,
"loss": 0.4524,
"step": 1460
},
{
"epoch": 1.786626139817629,
"grad_norm": 2.5125184059143066,
"learning_rate": 4.206992438875318e-06,
"loss": 0.4346,
"step": 1470
},
{
"epoch": 1.7987841945288754,
"grad_norm": 2.9049086570739746,
"learning_rate": 4.137181762443658e-06,
"loss": 0.3629,
"step": 1480
},
{
"epoch": 1.8109422492401217,
"grad_norm": 2.5007903575897217,
"learning_rate": 4.0675440267082236e-06,
"loss": 0.4943,
"step": 1490
},
{
"epoch": 1.8231003039513678,
"grad_norm": 2.90285325050354,
"learning_rate": 3.998093189647622e-06,
"loss": 0.4331,
"step": 1500
},
{
"epoch": 1.8352583586626139,
"grad_norm": 2.7467432022094727,
"learning_rate": 3.928843171779051e-06,
"loss": 0.4826,
"step": 1510
},
{
"epoch": 1.8474164133738602,
"grad_norm": 2.5184881687164307,
"learning_rate": 3.859807853368112e-06,
"loss": 0.4204,
"step": 1520
},
{
"epoch": 1.8595744680851065,
"grad_norm": 2.262559652328491,
"learning_rate": 3.791001071646695e-06,
"loss": 0.455,
"step": 1530
},
{
"epoch": 1.8717325227963526,
"grad_norm": 2.8905036449432373,
"learning_rate": 3.72243661803948e-06,
"loss": 0.4584,
"step": 1540
},
{
"epoch": 1.8838905775075987,
"grad_norm": 3.1106526851654053,
"learning_rate": 3.6541282353996275e-06,
"loss": 0.4559,
"step": 1550
},
{
"epoch": 1.896048632218845,
"grad_norm": 2.7985055446624756,
"learning_rate": 3.5860896152542013e-06,
"loss": 0.4452,
"step": 1560
},
{
"epoch": 1.9082066869300913,
"grad_norm": 2.4769771099090576,
"learning_rate": 3.5183343950598825e-06,
"loss": 0.463,
"step": 1570
},
{
"epoch": 1.9203647416413374,
"grad_norm": 2.9029109477996826,
"learning_rate": 3.450876155469518e-06,
"loss": 0.4377,
"step": 1580
},
{
"epoch": 1.9325227963525835,
"grad_norm": 2.444227457046509,
"learning_rate": 3.3837284176100543e-06,
"loss": 0.4559,
"step": 1590
},
{
"epoch": 1.9446808510638298,
"grad_norm": 1.8885170221328735,
"learning_rate": 3.3169046403724004e-06,
"loss": 0.4315,
"step": 1600
},
{
"epoch": 1.9568389057750761,
"grad_norm": 2.370542526245117,
"learning_rate": 3.250418217713771e-06,
"loss": 0.4496,
"step": 1610
},
{
"epoch": 1.9689969604863222,
"grad_norm": 2.281675338745117,
"learning_rate": 3.1842824759730518e-06,
"loss": 0.4651,
"step": 1620
},
{
"epoch": 1.9811550151975683,
"grad_norm": 2.4642460346221924,
"learning_rate": 3.1185106711996848e-06,
"loss": 0.4492,
"step": 1630
},
{
"epoch": 1.9933130699088146,
"grad_norm": 2.898085355758667,
"learning_rate": 3.0531159864966885e-06,
"loss": 0.4217,
"step": 1640
},
{
"epoch": 2.0048632218844986,
"grad_norm": 2.5043299198150635,
"learning_rate": 2.9881115293782638e-06,
"loss": 0.4087,
"step": 1650
},
{
"epoch": 2.0170212765957447,
"grad_norm": 2.5139353275299072,
"learning_rate": 2.923510329142568e-06,
"loss": 0.3166,
"step": 1660
},
{
"epoch": 2.029179331306991,
"grad_norm": 2.334691047668457,
"learning_rate": 2.8593253342601557e-06,
"loss": 0.2967,
"step": 1670
},
{
"epoch": 2.041337386018237,
"grad_norm": 2.763742685317993,
"learning_rate": 2.795569409778639e-06,
"loss": 0.3263,
"step": 1680
},
{
"epoch": 2.0534954407294834,
"grad_norm": 2.539466142654419,
"learning_rate": 2.7322553347440368e-06,
"loss": 0.2964,
"step": 1690
},
{
"epoch": 2.0656534954407295,
"grad_norm": 3.0033152103424072,
"learning_rate": 2.6693957996393984e-06,
"loss": 0.3157,
"step": 1700
},
{
"epoch": 2.0778115501519756,
"grad_norm": 2.8573410511016846,
"learning_rate": 2.6070034038411553e-06,
"loss": 0.3542,
"step": 1710
},
{
"epoch": 2.0899696048632217,
"grad_norm": 2.31459641456604,
"learning_rate": 2.545090653093738e-06,
"loss": 0.2965,
"step": 1720
},
{
"epoch": 2.1021276595744682,
"grad_norm": 2.5029146671295166,
"learning_rate": 2.4836699570029623e-06,
"loss": 0.295,
"step": 1730
},
{
"epoch": 2.1142857142857143,
"grad_norm": 2.295614004135132,
"learning_rate": 2.4227536265486885e-06,
"loss": 0.3075,
"step": 1740
},
{
"epoch": 2.1264437689969604,
"grad_norm": 2.5719516277313232,
"learning_rate": 2.3623538716172394e-06,
"loss": 0.3397,
"step": 1750
},
{
"epoch": 2.1386018237082065,
"grad_norm": 2.6095409393310547,
"learning_rate": 2.302482798554096e-06,
"loss": 0.324,
"step": 1760
},
{
"epoch": 2.150759878419453,
"grad_norm": 2.429758310317993,
"learning_rate": 2.2431524077373314e-06,
"loss": 0.2939,
"step": 1770
},
{
"epoch": 2.162917933130699,
"grad_norm": 2.9179751873016357,
"learning_rate": 2.1843745911722937e-06,
"loss": 0.308,
"step": 1780
},
{
"epoch": 2.1750759878419452,
"grad_norm": 3.281049966812134,
"learning_rate": 2.1261611301080063e-06,
"loss": 0.3229,
"step": 1790
},
{
"epoch": 2.1872340425531913,
"grad_norm": 2.931143283843994,
"learning_rate": 2.068523692675772e-06,
"loss": 0.3107,
"step": 1800
},
{
"epoch": 2.199392097264438,
"grad_norm": 2.432403802871704,
"learning_rate": 2.0114738315504505e-06,
"loss": 0.2925,
"step": 1810
},
{
"epoch": 2.211550151975684,
"grad_norm": 3.296288251876831,
"learning_rate": 1.955022981634863e-06,
"loss": 0.3115,
"step": 1820
},
{
"epoch": 2.22370820668693,
"grad_norm": 2.6482620239257812,
"learning_rate": 1.8991824577678269e-06,
"loss": 0.3423,
"step": 1830
},
{
"epoch": 2.235866261398176,
"grad_norm": 2.5689730644226074,
"learning_rate": 1.8439634524562423e-06,
"loss": 0.344,
"step": 1840
},
{
"epoch": 2.2480243161094227,
"grad_norm": 2.4932291507720947,
"learning_rate": 1.7893770336316928e-06,
"loss": 0.3052,
"step": 1850
},
{
"epoch": 2.2601823708206688,
"grad_norm": 2.985034704208374,
"learning_rate": 1.7354341424320286e-06,
"loss": 0.3056,
"step": 1860
},
{
"epoch": 2.272340425531915,
"grad_norm": 2.558969020843506,
"learning_rate": 1.6821455910083535e-06,
"loss": 0.2883,
"step": 1870
},
{
"epoch": 2.284498480243161,
"grad_norm": 2.731433868408203,
"learning_rate": 1.6295220603578727e-06,
"loss": 0.3017,
"step": 1880
},
{
"epoch": 2.2966565349544075,
"grad_norm": 2.7767181396484375,
"learning_rate": 1.5775740981830262e-06,
"loss": 0.3348,
"step": 1890
},
{
"epoch": 2.3088145896656536,
"grad_norm": 2.289393424987793,
"learning_rate": 1.526312116777336e-06,
"loss": 0.3377,
"step": 1900
},
{
"epoch": 2.3209726443768997,
"grad_norm": 2.3055357933044434,
"learning_rate": 1.475746390938399e-06,
"loss": 0.3111,
"step": 1910
},
{
"epoch": 2.3331306990881457,
"grad_norm": 3.120150089263916,
"learning_rate": 1.4258870559084387e-06,
"loss": 0.3172,
"step": 1920
},
{
"epoch": 2.3452887537993923,
"grad_norm": 2.520087718963623,
"learning_rate": 1.3767441053428244e-06,
"loss": 0.3071,
"step": 1930
},
{
"epoch": 2.3574468085106384,
"grad_norm": 2.533917188644409,
"learning_rate": 1.328327389306977e-06,
"loss": 0.328,
"step": 1940
},
{
"epoch": 2.3696048632218845,
"grad_norm": 2.611781120300293,
"learning_rate": 1.2806466123020479e-06,
"loss": 0.2373,
"step": 1950
},
{
"epoch": 2.3817629179331306,
"grad_norm": 2.3127501010894775,
"learning_rate": 1.2337113313197813e-06,
"loss": 0.3226,
"step": 1960
},
{
"epoch": 2.393920972644377,
"grad_norm": 3.0540225505828857,
"learning_rate": 1.1875309539269332e-06,
"loss": 0.3181,
"step": 1970
},
{
"epoch": 2.406079027355623,
"grad_norm": 2.2503840923309326,
"learning_rate": 1.1421147363796547e-06,
"loss": 0.2918,
"step": 1980
},
{
"epoch": 2.4182370820668693,
"grad_norm": 2.5295841693878174,
"learning_rate": 1.097471781768194e-06,
"loss": 0.2941,
"step": 1990
},
{
"epoch": 2.4303951367781154,
"grad_norm": 2.7835609912872314,
"learning_rate": 1.053611038192296e-06,
"loss": 0.2901,
"step": 2000
},
{
"epoch": 2.4425531914893615,
"grad_norm": 2.6751868724823,
"learning_rate": 1.0105412969676758e-06,
"loss": 0.335,
"step": 2010
},
{
"epoch": 2.454711246200608,
"grad_norm": 2.333569049835205,
"learning_rate": 9.682711908639137e-07,
"loss": 0.2967,
"step": 2020
},
{
"epoch": 2.466869300911854,
"grad_norm": 2.4557461738586426,
"learning_rate": 9.268091923741246e-07,
"loss": 0.2856,
"step": 2030
},
{
"epoch": 2.4790273556231,
"grad_norm": 2.8368921279907227,
"learning_rate": 8.861636120167632e-07,
"loss": 0.3396,
"step": 2040
},
{
"epoch": 2.4911854103343467,
"grad_norm": 2.238975763320923,
"learning_rate": 8.463425966698857e-07,
"loss": 0.3138,
"step": 2050
},
{
"epoch": 2.503343465045593,
"grad_norm": 2.7118637561798096,
"learning_rate": 8.073541279382135e-07,
"loss": 0.3397,
"step": 2060
},
{
"epoch": 2.515501519756839,
"grad_norm": 2.528172731399536,
"learning_rate": 7.69206020553323e-07,
"loss": 0.3273,
"step": 2070
},
{
"epoch": 2.527659574468085,
"grad_norm": 2.3029799461364746,
"learning_rate": 7.319059208072909e-07,
"loss": 0.3238,
"step": 2080
},
{
"epoch": 2.539817629179331,
"grad_norm": 2.535954475402832,
"learning_rate": 6.954613050200859e-07,
"loss": 0.328,
"step": 2090
},
{
"epoch": 2.5519756838905776,
"grad_norm": 2.4356179237365723,
"learning_rate": 6.5987947804104e-07,
"loss": 0.3081,
"step": 2100
},
{
"epoch": 2.5641337386018237,
"grad_norm": 2.039999008178711,
"learning_rate": 6.251675717846905e-07,
"loss": 0.311,
"step": 2110
},
{
"epoch": 2.57629179331307,
"grad_norm": 2.296266794204712,
"learning_rate": 5.913325438012773e-07,
"loss": 0.2815,
"step": 2120
},
{
"epoch": 2.5884498480243163,
"grad_norm": 2.9788355827331543,
"learning_rate": 5.583811758821916e-07,
"loss": 0.3346,
"step": 2130
},
{
"epoch": 2.6006079027355624,
"grad_norm": 3.0497772693634033,
"learning_rate": 5.263200727006568e-07,
"loss": 0.2976,
"step": 2140
},
{
"epoch": 2.6127659574468085,
"grad_norm": 2.972386121749878,
"learning_rate": 4.951556604879049e-07,
"loss": 0.272,
"step": 2150
},
{
"epoch": 2.6249240121580546,
"grad_norm": 3.3032212257385254,
"learning_rate": 4.648941857451228e-07,
"loss": 0.2989,
"step": 2160
},
{
"epoch": 2.6370820668693007,
"grad_norm": 2.078023672103882,
"learning_rate": 4.355417139914242e-07,
"loss": 0.3353,
"step": 2170
},
{
"epoch": 2.6492401215805472,
"grad_norm": 2.5029313564300537,
"learning_rate": 4.0710412854809255e-07,
"loss": 0.3413,
"step": 2180
},
{
"epoch": 2.6613981762917933,
"grad_norm": 2.516641616821289,
"learning_rate": 3.7958712935934726e-07,
"loss": 0.347,
"step": 2190
},
{
"epoch": 2.6735562310030394,
"grad_norm": 2.115417718887329,
"learning_rate": 3.5299623184986366e-07,
"loss": 0.2955,
"step": 2200
},
{
"epoch": 2.685714285714286,
"grad_norm": 2.419532299041748,
"learning_rate": 3.273367658192778e-07,
"loss": 0.285,
"step": 2210
},
{
"epoch": 2.697872340425532,
"grad_norm": 2.186316967010498,
"learning_rate": 3.0261387437389766e-07,
"loss": 0.3091,
"step": 2220
},
{
"epoch": 2.710030395136778,
"grad_norm": 2.725886583328247,
"learning_rate": 2.7883251289583467e-07,
"loss": 0.317,
"step": 2230
},
{
"epoch": 2.722188449848024,
"grad_norm": 2.355541944503784,
"learning_rate": 2.5599744804975956e-07,
"loss": 0.3093,
"step": 2240
},
{
"epoch": 2.7343465045592703,
"grad_norm": 2.3325600624084473,
"learning_rate": 2.3411325682748843e-07,
"loss": 0.2784,
"step": 2250
},
{
"epoch": 2.746504559270517,
"grad_norm": 2.1935389041900635,
"learning_rate": 2.1318432563058765e-07,
"loss": 0.2835,
"step": 2260
},
{
"epoch": 2.758662613981763,
"grad_norm": 2.6569478511810303,
"learning_rate": 1.9321484939116843e-07,
"loss": 0.2821,
"step": 2270
},
{
"epoch": 2.770820668693009,
"grad_norm": 2.2891359329223633,
"learning_rate": 1.742088307310741e-07,
"loss": 0.3362,
"step": 2280
},
{
"epoch": 2.7829787234042556,
"grad_norm": 2.387622833251953,
"learning_rate": 1.561700791596038e-07,
"loss": 0.2973,
"step": 2290
},
{
"epoch": 2.7951367781155017,
"grad_norm": 2.550020217895508,
"learning_rate": 1.3910221030994764e-07,
"loss": 0.279,
"step": 2300
},
{
"epoch": 2.8072948328267477,
"grad_norm": 2.9916679859161377,
"learning_rate": 1.2300864521447575e-07,
"loss": 0.3318,
"step": 2310
},
{
"epoch": 2.819452887537994,
"grad_norm": 2.5227696895599365,
"learning_rate": 1.0789260961904357e-07,
"loss": 0.2887,
"step": 2320
},
{
"epoch": 2.83161094224924,
"grad_norm": 2.750169038772583,
"learning_rate": 9.375713333642677e-08,
"loss": 0.3087,
"step": 2330
},
{
"epoch": 2.8437689969604865,
"grad_norm": 2.2595717906951904,
"learning_rate": 8.060504963903815e-08,
"loss": 0.28,
"step": 2340
},
{
"epoch": 2.8559270516717326,
"grad_norm": 2.5719687938690186,
"learning_rate": 6.843899469103521e-08,
"loss": 0.3007,
"step": 2350
},
{
"epoch": 2.8680851063829786,
"grad_norm": 2.3983314037323,
"learning_rate": 5.726140701993288e-08,
"loss": 0.3049,
"step": 2360
},
{
"epoch": 2.880243161094225,
"grad_norm": 2.910645008087158,
"learning_rate": 4.707452702783388e-08,
"loss": 0.3006,
"step": 2370
},
{
"epoch": 2.8924012158054713,
"grad_norm": 2.546748161315918,
"learning_rate": 3.7880396542369635e-08,
"loss": 0.3431,
"step": 2380
},
{
"epoch": 2.9045592705167174,
"grad_norm": 2.052919387817383,
"learning_rate": 2.9680858407441503e-08,
"loss": 0.2987,
"step": 2390
},
{
"epoch": 2.9167173252279635,
"grad_norm": 2.4598567485809326,
"learning_rate": 2.24775561138485e-08,
"loss": 0.3207,
"step": 2400
},
{
"epoch": 2.9288753799392095,
"grad_norm": 2.4281694889068604,
"learning_rate": 1.627193346986744e-08,
"loss": 0.2847,
"step": 2410
},
{
"epoch": 2.941033434650456,
"grad_norm": 2.2293615341186523,
"learning_rate": 1.1065234311864459e-08,
"loss": 0.2952,
"step": 2420
},
{
"epoch": 2.953191489361702,
"grad_norm": 2.4415087699890137,
"learning_rate": 6.858502254981081e-09,
"loss": 0.2756,
"step": 2430
},
{
"epoch": 2.9653495440729483,
"grad_norm": 2.482046604156494,
"learning_rate": 3.652580483956558e-09,
"loss": 0.3162,
"step": 2440
},
{
"epoch": 2.977507598784195,
"grad_norm": 2.2831978797912598,
"learning_rate": 1.4481115841230574e-09,
"loss": 0.2904,
"step": 2450
},
{
"epoch": 2.989665653495441,
"grad_norm": 2.147287368774414,
"learning_rate": 2.4553741260535667e-10,
"loss": 0.3062,
"step": 2460
}
],
"logging_steps": 10,
"max_steps": 2466,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.0307400421054874e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}