{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9969604863221884, "eval_steps": 500, "global_step": 2466, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0121580547112462, "grad_norm": 5.922801494598389, "learning_rate": 3.6437246963562754e-07, "loss": 1.1863, "step": 10 }, { "epoch": 0.0243161094224924, "grad_norm": 6.66644811630249, "learning_rate": 7.692307692307694e-07, "loss": 1.1376, "step": 20 }, { "epoch": 0.0364741641337386, "grad_norm": 7.5532402992248535, "learning_rate": 1.174089068825911e-06, "loss": 1.1034, "step": 30 }, { "epoch": 0.0486322188449848, "grad_norm": 4.451807498931885, "learning_rate": 1.5789473684210526e-06, "loss": 1.0824, "step": 40 }, { "epoch": 0.060790273556231005, "grad_norm": 4.282721996307373, "learning_rate": 1.9838056680161946e-06, "loss": 0.9437, "step": 50 }, { "epoch": 0.0729483282674772, "grad_norm": 3.476186990737915, "learning_rate": 2.3886639676113362e-06, "loss": 0.8906, "step": 60 }, { "epoch": 0.0851063829787234, "grad_norm": 2.9487781524658203, "learning_rate": 2.7935222672064783e-06, "loss": 0.8558, "step": 70 }, { "epoch": 0.0972644376899696, "grad_norm": 3.981879949569702, "learning_rate": 3.19838056680162e-06, "loss": 0.7978, "step": 80 }, { "epoch": 0.1094224924012158, "grad_norm": 3.0333662033081055, "learning_rate": 3.6032388663967615e-06, "loss": 0.7652, "step": 90 }, { "epoch": 0.12158054711246201, "grad_norm": 2.985710859298706, "learning_rate": 4.008097165991903e-06, "loss": 0.7671, "step": 100 }, { "epoch": 0.1337386018237082, "grad_norm": 2.512871026992798, "learning_rate": 4.412955465587045e-06, "loss": 0.7848, "step": 110 }, { "epoch": 0.1458966565349544, "grad_norm": 3.5035741329193115, "learning_rate": 4.817813765182186e-06, "loss": 0.7419, "step": 120 }, { "epoch": 0.1580547112462006, "grad_norm": 4.36132287979126, "learning_rate": 5.222672064777329e-06, "loss": 0.7379, "step": 130 }, { "epoch": 0.1702127659574468, "grad_norm": 3.3608834743499756, "learning_rate": 5.6275303643724695e-06, "loss": 0.7135, "step": 140 }, { "epoch": 0.182370820668693, "grad_norm": 3.3592512607574463, "learning_rate": 6.0323886639676124e-06, "loss": 0.7427, "step": 150 }, { "epoch": 0.1945288753799392, "grad_norm": 2.913890838623047, "learning_rate": 6.437246963562754e-06, "loss": 0.6847, "step": 160 }, { "epoch": 0.2066869300911854, "grad_norm": 2.6773297786712646, "learning_rate": 6.842105263157896e-06, "loss": 0.7409, "step": 170 }, { "epoch": 0.2188449848024316, "grad_norm": 2.984339714050293, "learning_rate": 7.246963562753037e-06, "loss": 0.6755, "step": 180 }, { "epoch": 0.23100303951367782, "grad_norm": 3.006441593170166, "learning_rate": 7.651821862348178e-06, "loss": 0.7105, "step": 190 }, { "epoch": 0.24316109422492402, "grad_norm": 3.7500977516174316, "learning_rate": 8.056680161943322e-06, "loss": 0.6796, "step": 200 }, { "epoch": 0.2553191489361702, "grad_norm": 3.6839091777801514, "learning_rate": 8.461538461538462e-06, "loss": 0.6716, "step": 210 }, { "epoch": 0.2674772036474164, "grad_norm": 3.265120267868042, "learning_rate": 8.866396761133604e-06, "loss": 0.6728, "step": 220 }, { "epoch": 0.2796352583586626, "grad_norm": 3.2442452907562256, "learning_rate": 9.271255060728746e-06, "loss": 0.6807, "step": 230 }, { "epoch": 0.2917933130699088, "grad_norm": 3.7613420486450195, "learning_rate": 9.676113360323888e-06, "loss": 0.674, "step": 240 }, { "epoch": 0.303951367781155, "grad_norm": 4.201897621154785, "learning_rate": 9.999979955978923e-06, "loss": 0.6987, "step": 250 }, { "epoch": 0.3161094224924012, "grad_norm": 3.1414012908935547, "learning_rate": 9.999278432115106e-06, "loss": 0.6584, "step": 260 }, { "epoch": 0.3282674772036474, "grad_norm": 3.2702925205230713, "learning_rate": 9.99757486789673e-06, "loss": 0.683, "step": 270 }, { "epoch": 0.3404255319148936, "grad_norm": 2.9207000732421875, "learning_rate": 9.9948696047811e-06, "loss": 0.6895, "step": 280 }, { "epoch": 0.3525835866261398, "grad_norm": 3.319397211074829, "learning_rate": 9.991163185003028e-06, "loss": 0.66, "step": 290 }, { "epoch": 0.364741641337386, "grad_norm": 2.8080954551696777, "learning_rate": 9.98645635146616e-06, "loss": 0.6914, "step": 300 }, { "epoch": 0.3768996960486322, "grad_norm": 4.024887561798096, "learning_rate": 9.980750047594076e-06, "loss": 0.6511, "step": 310 }, { "epoch": 0.3890577507598784, "grad_norm": 2.6032838821411133, "learning_rate": 9.974045417141186e-06, "loss": 0.6522, "step": 320 }, { "epoch": 0.4012158054711246, "grad_norm": 2.4978721141815186, "learning_rate": 9.966343803963481e-06, "loss": 0.6517, "step": 330 }, { "epoch": 0.4133738601823708, "grad_norm": 3.0864603519439697, "learning_rate": 9.957646751749178e-06, "loss": 0.662, "step": 340 }, { "epoch": 0.425531914893617, "grad_norm": 3.713203191757202, "learning_rate": 9.947956003709301e-06, "loss": 0.6728, "step": 350 }, { "epoch": 0.4376899696048632, "grad_norm": 2.8905911445617676, "learning_rate": 9.937273502228283e-06, "loss": 0.6905, "step": 360 }, { "epoch": 0.44984802431610943, "grad_norm": 3.0076723098754883, "learning_rate": 9.925601388474637e-06, "loss": 0.6955, "step": 370 }, { "epoch": 0.46200607902735563, "grad_norm": 3.2814724445343018, "learning_rate": 9.912942001971792e-06, "loss": 0.6176, "step": 380 }, { "epoch": 0.47416413373860183, "grad_norm": 2.9065964221954346, "learning_rate": 9.899297880129156e-06, "loss": 0.6768, "step": 390 }, { "epoch": 0.48632218844984804, "grad_norm": 2.724536657333374, "learning_rate": 9.884671757733534e-06, "loss": 0.6382, "step": 400 }, { "epoch": 0.49848024316109424, "grad_norm": 2.8882269859313965, "learning_rate": 9.869066566400975e-06, "loss": 0.6603, "step": 410 }, { "epoch": 0.5106382978723404, "grad_norm": 2.5395541191101074, "learning_rate": 9.852485433989158e-06, "loss": 0.642, "step": 420 }, { "epoch": 0.5227963525835866, "grad_norm": 2.759591579437256, "learning_rate": 9.834931683970468e-06, "loss": 0.6424, "step": 430 }, { "epoch": 0.5349544072948328, "grad_norm": 2.4878828525543213, "learning_rate": 9.816408834765838e-06, "loss": 0.6435, "step": 440 }, { "epoch": 0.547112462006079, "grad_norm": 2.7571871280670166, "learning_rate": 9.796920599039536e-06, "loss": 0.6766, "step": 450 }, { "epoch": 0.5592705167173252, "grad_norm": 2.585879325866699, "learning_rate": 9.776470882954998e-06, "loss": 0.6082, "step": 460 }, { "epoch": 0.5714285714285714, "grad_norm": 2.5847771167755127, "learning_rate": 9.7550637853919e-06, "loss": 0.6551, "step": 470 }, { "epoch": 0.5835866261398176, "grad_norm": 2.795027732849121, "learning_rate": 9.732703597124586e-06, "loss": 0.6429, "step": 480 }, { "epoch": 0.5957446808510638, "grad_norm": 2.377835273742676, "learning_rate": 9.709394799962038e-06, "loss": 0.6386, "step": 490 }, { "epoch": 0.60790273556231, "grad_norm": 3.254490852355957, "learning_rate": 9.685142065849556e-06, "loss": 0.5844, "step": 500 }, { "epoch": 0.6200607902735562, "grad_norm": 2.866058588027954, "learning_rate": 9.659950255932324e-06, "loss": 0.6079, "step": 510 }, { "epoch": 0.6322188449848024, "grad_norm": 2.680647373199463, "learning_rate": 9.633824419581069e-06, "loss": 0.6294, "step": 520 }, { "epoch": 0.6443768996960486, "grad_norm": 3.1423895359039307, "learning_rate": 9.60676979337996e-06, "loss": 0.6311, "step": 530 }, { "epoch": 0.6565349544072948, "grad_norm": 2.480421304702759, "learning_rate": 9.578791800077021e-06, "loss": 0.6395, "step": 540 }, { "epoch": 0.668693009118541, "grad_norm": 3.06001877784729, "learning_rate": 9.549896047497202e-06, "loss": 0.6613, "step": 550 }, { "epoch": 0.6808510638297872, "grad_norm": 2.8360841274261475, "learning_rate": 9.520088327418371e-06, "loss": 0.6161, "step": 560 }, { "epoch": 0.6930091185410334, "grad_norm": 3.1503562927246094, "learning_rate": 9.489374614410413e-06, "loss": 0.6137, "step": 570 }, { "epoch": 0.7051671732522796, "grad_norm": 2.579737663269043, "learning_rate": 9.457761064637727e-06, "loss": 0.6068, "step": 580 }, { "epoch": 0.7173252279635258, "grad_norm": 3.200549602508545, "learning_rate": 9.425254014625278e-06, "loss": 0.6436, "step": 590 }, { "epoch": 0.729483282674772, "grad_norm": 2.9303839206695557, "learning_rate": 9.391859979988546e-06, "loss": 0.6062, "step": 600 }, { "epoch": 0.7416413373860182, "grad_norm": 3.0887465476989746, "learning_rate": 9.35758565412754e-06, "loss": 0.6244, "step": 610 }, { "epoch": 0.7537993920972644, "grad_norm": 3.22578763961792, "learning_rate": 9.322437906885199e-06, "loss": 0.6544, "step": 620 }, { "epoch": 0.7659574468085106, "grad_norm": 3.8590402603149414, "learning_rate": 9.28642378317042e-06, "loss": 0.6369, "step": 630 }, { "epoch": 0.7781155015197568, "grad_norm": 3.001377820968628, "learning_rate": 9.249550501545998e-06, "loss": 0.6556, "step": 640 }, { "epoch": 0.790273556231003, "grad_norm": 2.9286224842071533, "learning_rate": 9.211825452781762e-06, "loss": 0.599, "step": 650 }, { "epoch": 0.8024316109422492, "grad_norm": 3.0069026947021484, "learning_rate": 9.173256198373185e-06, "loss": 0.6284, "step": 660 }, { "epoch": 0.8145896656534954, "grad_norm": 3.2098565101623535, "learning_rate": 9.133850469025786e-06, "loss": 0.6047, "step": 670 }, { "epoch": 0.8267477203647416, "grad_norm": 2.4998652935028076, "learning_rate": 9.093616163105609e-06, "loss": 0.6233, "step": 680 }, { "epoch": 0.8389057750759878, "grad_norm": 2.6951255798339844, "learning_rate": 9.052561345056095e-06, "loss": 0.6288, "step": 690 }, { "epoch": 0.851063829787234, "grad_norm": 2.79081654548645, "learning_rate": 9.010694243781671e-06, "loss": 0.6248, "step": 700 }, { "epoch": 0.8632218844984803, "grad_norm": 3.0113465785980225, "learning_rate": 8.96802325099838e-06, "loss": 0.6262, "step": 710 }, { "epoch": 0.8753799392097265, "grad_norm": 2.396683692932129, "learning_rate": 8.924556919551863e-06, "loss": 0.6154, "step": 720 }, { "epoch": 0.8875379939209727, "grad_norm": 2.604168176651001, "learning_rate": 8.880303961703048e-06, "loss": 0.6044, "step": 730 }, { "epoch": 0.8996960486322189, "grad_norm": 2.6339340209960938, "learning_rate": 8.835273247381903e-06, "loss": 0.6367, "step": 740 }, { "epoch": 0.9118541033434651, "grad_norm": 2.699192762374878, "learning_rate": 8.789473802409565e-06, "loss": 0.6598, "step": 750 }, { "epoch": 0.9240121580547113, "grad_norm": 3.0326826572418213, "learning_rate": 8.742914806689234e-06, "loss": 0.596, "step": 760 }, { "epoch": 0.9361702127659575, "grad_norm": 3.168898582458496, "learning_rate": 8.695605592366184e-06, "loss": 0.5843, "step": 770 }, { "epoch": 0.9483282674772037, "grad_norm": 2.6533005237579346, "learning_rate": 8.647555641957243e-06, "loss": 0.598, "step": 780 }, { "epoch": 0.9604863221884499, "grad_norm": 3.4256536960601807, "learning_rate": 8.59877458645017e-06, "loss": 0.588, "step": 790 }, { "epoch": 0.9726443768996961, "grad_norm": 2.429436445236206, "learning_rate": 8.54927220337322e-06, "loss": 0.5955, "step": 800 }, { "epoch": 0.9848024316109423, "grad_norm": 2.6898012161254883, "learning_rate": 8.499058414835389e-06, "loss": 0.6068, "step": 810 }, { "epoch": 0.9969604863221885, "grad_norm": 2.064389944076538, "learning_rate": 8.448143285537645e-06, "loss": 0.5694, "step": 820 }, { "epoch": 1.0085106382978724, "grad_norm": 2.412479877471924, "learning_rate": 8.396537020755588e-06, "loss": 0.4937, "step": 830 }, { "epoch": 1.0206686930091184, "grad_norm": 2.439929246902466, "learning_rate": 8.344249964293942e-06, "loss": 0.4945, "step": 840 }, { "epoch": 1.0328267477203648, "grad_norm": 2.665113687515259, "learning_rate": 8.291292596413272e-06, "loss": 0.499, "step": 850 }, { "epoch": 1.0449848024316108, "grad_norm": 2.39530086517334, "learning_rate": 8.237675531729345e-06, "loss": 0.4825, "step": 860 }, { "epoch": 1.0571428571428572, "grad_norm": 2.943648099899292, "learning_rate": 8.18340951708558e-06, "loss": 0.4683, "step": 870 }, { "epoch": 1.0693009118541033, "grad_norm": 2.766900062561035, "learning_rate": 8.128505429398976e-06, "loss": 0.477, "step": 880 }, { "epoch": 1.0814589665653496, "grad_norm": 2.389639139175415, "learning_rate": 8.072974273479972e-06, "loss": 0.5606, "step": 890 }, { "epoch": 1.0936170212765957, "grad_norm": 2.3432369232177734, "learning_rate": 8.016827179826685e-06, "loss": 0.4753, "step": 900 }, { "epoch": 1.105775075987842, "grad_norm": 2.27811598777771, "learning_rate": 7.960075402393937e-06, "loss": 0.4733, "step": 910 }, { "epoch": 1.117933130699088, "grad_norm": 2.5332465171813965, "learning_rate": 7.902730316337556e-06, "loss": 0.4444, "step": 920 }, { "epoch": 1.1300911854103344, "grad_norm": 2.445450782775879, "learning_rate": 7.844803415734368e-06, "loss": 0.4694, "step": 930 }, { "epoch": 1.1422492401215805, "grad_norm": 2.309290647506714, "learning_rate": 7.786306311278354e-06, "loss": 0.4617, "step": 940 }, { "epoch": 1.1544072948328268, "grad_norm": 2.7566754817962646, "learning_rate": 7.727250727953445e-06, "loss": 0.5046, "step": 950 }, { "epoch": 1.1665653495440729, "grad_norm": 2.475858211517334, "learning_rate": 7.667648502683406e-06, "loss": 0.5175, "step": 960 }, { "epoch": 1.1787234042553192, "grad_norm": 2.5813775062561035, "learning_rate": 7.607511581959261e-06, "loss": 0.4656, "step": 970 }, { "epoch": 1.1908814589665653, "grad_norm": 2.9376819133758545, "learning_rate": 7.5468520194447925e-06, "loss": 0.4945, "step": 980 }, { "epoch": 1.2030395136778116, "grad_norm": 2.2708747386932373, "learning_rate": 7.485681973560532e-06, "loss": 0.4931, "step": 990 }, { "epoch": 1.2151975683890577, "grad_norm": 2.87481427192688, "learning_rate": 7.4240137050467635e-06, "loss": 0.4713, "step": 1000 }, { "epoch": 1.227355623100304, "grad_norm": 3.1325576305389404, "learning_rate": 7.361859574506017e-06, "loss": 0.4775, "step": 1010 }, { "epoch": 1.23951367781155, "grad_norm": 2.519160509109497, "learning_rate": 7.299232039925552e-06, "loss": 0.4747, "step": 1020 }, { "epoch": 1.2516717325227964, "grad_norm": 3.1819024085998535, "learning_rate": 7.236143654180311e-06, "loss": 0.4836, "step": 1030 }, { "epoch": 1.2638297872340425, "grad_norm": 2.5699877738952637, "learning_rate": 7.172607062516856e-06, "loss": 0.4471, "step": 1040 }, { "epoch": 1.2759878419452888, "grad_norm": 3.718123435974121, "learning_rate": 7.108635000018802e-06, "loss": 0.5022, "step": 1050 }, { "epoch": 1.288145896656535, "grad_norm": 3.3233118057250977, "learning_rate": 7.044240289054227e-06, "loss": 0.4829, "step": 1060 }, { "epoch": 1.3003039513677812, "grad_norm": 2.362790584564209, "learning_rate": 6.979435836705602e-06, "loss": 0.4801, "step": 1070 }, { "epoch": 1.3124620060790273, "grad_norm": 2.5460381507873535, "learning_rate": 6.9142346321827246e-06, "loss": 0.4922, "step": 1080 }, { "epoch": 1.3246200607902736, "grad_norm": 3.0161983966827393, "learning_rate": 6.84864974421921e-06, "loss": 0.4734, "step": 1090 }, { "epoch": 1.3367781155015197, "grad_norm": 2.387080669403076, "learning_rate": 6.782694318453033e-06, "loss": 0.4924, "step": 1100 }, { "epoch": 1.348936170212766, "grad_norm": 3.0172154903411865, "learning_rate": 6.716381574791648e-06, "loss": 0.4669, "step": 1110 }, { "epoch": 1.361094224924012, "grad_norm": 3.0593836307525635, "learning_rate": 6.649724804762236e-06, "loss": 0.4689, "step": 1120 }, { "epoch": 1.3732522796352584, "grad_norm": 2.0975329875946045, "learning_rate": 6.5827373688475925e-06, "loss": 0.501, "step": 1130 }, { "epoch": 1.3854103343465045, "grad_norm": 2.6222572326660156, "learning_rate": 6.5154326938081866e-06, "loss": 0.487, "step": 1140 }, { "epoch": 1.3975683890577508, "grad_norm": 2.6791985034942627, "learning_rate": 6.447824269990947e-06, "loss": 0.4589, "step": 1150 }, { "epoch": 1.409726443768997, "grad_norm": 3.0119071006774902, "learning_rate": 6.3799256486252945e-06, "loss": 0.4839, "step": 1160 }, { "epoch": 1.4218844984802432, "grad_norm": 3.6317973136901855, "learning_rate": 6.311750439106976e-06, "loss": 0.4391, "step": 1170 }, { "epoch": 1.4340425531914893, "grad_norm": 1.9667277336120605, "learning_rate": 6.243312306270235e-06, "loss": 0.4379, "step": 1180 }, { "epoch": 1.4462006079027356, "grad_norm": 2.5585360527038574, "learning_rate": 6.174624967648877e-06, "loss": 0.4954, "step": 1190 }, { "epoch": 1.4583586626139817, "grad_norm": 2.421276330947876, "learning_rate": 6.105702190726765e-06, "loss": 0.4558, "step": 1200 }, { "epoch": 1.470516717325228, "grad_norm": 2.7117786407470703, "learning_rate": 6.03655779017831e-06, "loss": 0.488, "step": 1210 }, { "epoch": 1.4826747720364741, "grad_norm": 2.3945164680480957, "learning_rate": 5.967205625099496e-06, "loss": 0.4849, "step": 1220 }, { "epoch": 1.4948328267477204, "grad_norm": 2.291707754135132, "learning_rate": 5.897659596230003e-06, "loss": 0.4614, "step": 1230 }, { "epoch": 1.5069908814589665, "grad_norm": 2.583559036254883, "learning_rate": 5.827933643166993e-06, "loss": 0.4626, "step": 1240 }, { "epoch": 1.5191489361702128, "grad_norm": 2.1779534816741943, "learning_rate": 5.758041741571088e-06, "loss": 0.4774, "step": 1250 }, { "epoch": 1.531306990881459, "grad_norm": 3.738179922103882, "learning_rate": 5.687997900365134e-06, "loss": 0.4487, "step": 1260 }, { "epoch": 1.543465045592705, "grad_norm": 2.461461305618286, "learning_rate": 5.617816158926303e-06, "loss": 0.4851, "step": 1270 }, { "epoch": 1.5556231003039513, "grad_norm": 2.784620523452759, "learning_rate": 5.547510584272069e-06, "loss": 0.5079, "step": 1280 }, { "epoch": 1.5677811550151977, "grad_norm": 2.8999369144439697, "learning_rate": 5.477095268240669e-06, "loss": 0.4596, "step": 1290 }, { "epoch": 1.5799392097264437, "grad_norm": 2.786457061767578, "learning_rate": 5.406584324666565e-06, "loss": 0.4226, "step": 1300 }, { "epoch": 1.5920972644376898, "grad_norm": 3.0281615257263184, "learning_rate": 5.335991886551526e-06, "loss": 0.4826, "step": 1310 }, { "epoch": 1.6042553191489362, "grad_norm": 2.60740327835083, "learning_rate": 5.2653321032318315e-06, "loss": 0.5185, "step": 1320 }, { "epoch": 1.6164133738601825, "grad_norm": 2.150132179260254, "learning_rate": 5.194619137542241e-06, "loss": 0.511, "step": 1330 }, { "epoch": 1.6285714285714286, "grad_norm": 2.7613778114318848, "learning_rate": 5.123867162977224e-06, "loss": 0.4653, "step": 1340 }, { "epoch": 1.6407294832826747, "grad_norm": 2.5103297233581543, "learning_rate": 5.053090360850072e-06, "loss": 0.4206, "step": 1350 }, { "epoch": 1.652887537993921, "grad_norm": 2.6275055408477783, "learning_rate": 4.9823029174504335e-06, "loss": 0.4727, "step": 1360 }, { "epoch": 1.6650455927051673, "grad_norm": 2.6730258464813232, "learning_rate": 4.9115190212008745e-06, "loss": 0.4616, "step": 1370 }, { "epoch": 1.6772036474164134, "grad_norm": 2.8876965045928955, "learning_rate": 4.840752859812972e-06, "loss": 0.4868, "step": 1380 }, { "epoch": 1.6893617021276595, "grad_norm": 2.7904891967773438, "learning_rate": 4.770018617443578e-06, "loss": 0.4453, "step": 1390 }, { "epoch": 1.7015197568389058, "grad_norm": 2.7088356018066406, "learning_rate": 4.699330471851798e-06, "loss": 0.4708, "step": 1400 }, { "epoch": 1.713677811550152, "grad_norm": 2.1268298625946045, "learning_rate": 4.628702591557237e-06, "loss": 0.4901, "step": 1410 }, { "epoch": 1.7258358662613982, "grad_norm": 2.3978848457336426, "learning_rate": 4.558149133000104e-06, "loss": 0.5164, "step": 1420 }, { "epoch": 1.7379939209726443, "grad_norm": 1.8490489721298218, "learning_rate": 4.487684237703734e-06, "loss": 0.4342, "step": 1430 }, { "epoch": 1.7501519756838906, "grad_norm": 2.7367939949035645, "learning_rate": 4.417322029440119e-06, "loss": 0.4887, "step": 1440 }, { "epoch": 1.762310030395137, "grad_norm": 2.217988967895508, "learning_rate": 4.347076611398961e-06, "loss": 0.46, "step": 1450 }, { "epoch": 1.774468085106383, "grad_norm": 2.8989672660827637, "learning_rate": 4.2769620633608835e-06, "loss": 0.4524, "step": 1460 }, { "epoch": 1.786626139817629, "grad_norm": 2.5125184059143066, "learning_rate": 4.206992438875318e-06, "loss": 0.4346, "step": 1470 }, { "epoch": 1.7987841945288754, "grad_norm": 2.9049086570739746, "learning_rate": 4.137181762443658e-06, "loss": 0.3629, "step": 1480 }, { "epoch": 1.8109422492401217, "grad_norm": 2.5007903575897217, "learning_rate": 4.0675440267082236e-06, "loss": 0.4943, "step": 1490 }, { "epoch": 1.8231003039513678, "grad_norm": 2.90285325050354, "learning_rate": 3.998093189647622e-06, "loss": 0.4331, "step": 1500 }, { "epoch": 1.8352583586626139, "grad_norm": 2.7467432022094727, "learning_rate": 3.928843171779051e-06, "loss": 0.4826, "step": 1510 }, { "epoch": 1.8474164133738602, "grad_norm": 2.5184881687164307, "learning_rate": 3.859807853368112e-06, "loss": 0.4204, "step": 1520 }, { "epoch": 1.8595744680851065, "grad_norm": 2.262559652328491, "learning_rate": 3.791001071646695e-06, "loss": 0.455, "step": 1530 }, { "epoch": 1.8717325227963526, "grad_norm": 2.8905036449432373, "learning_rate": 3.72243661803948e-06, "loss": 0.4584, "step": 1540 }, { "epoch": 1.8838905775075987, "grad_norm": 3.1106526851654053, "learning_rate": 3.6541282353996275e-06, "loss": 0.4559, "step": 1550 }, { "epoch": 1.896048632218845, "grad_norm": 2.7985055446624756, "learning_rate": 3.5860896152542013e-06, "loss": 0.4452, "step": 1560 }, { "epoch": 1.9082066869300913, "grad_norm": 2.4769771099090576, "learning_rate": 3.5183343950598825e-06, "loss": 0.463, "step": 1570 }, { "epoch": 1.9203647416413374, "grad_norm": 2.9029109477996826, "learning_rate": 3.450876155469518e-06, "loss": 0.4377, "step": 1580 }, { "epoch": 1.9325227963525835, "grad_norm": 2.444227457046509, "learning_rate": 3.3837284176100543e-06, "loss": 0.4559, "step": 1590 }, { "epoch": 1.9446808510638298, "grad_norm": 1.8885170221328735, "learning_rate": 3.3169046403724004e-06, "loss": 0.4315, "step": 1600 }, { "epoch": 1.9568389057750761, "grad_norm": 2.370542526245117, "learning_rate": 3.250418217713771e-06, "loss": 0.4496, "step": 1610 }, { "epoch": 1.9689969604863222, "grad_norm": 2.281675338745117, "learning_rate": 3.1842824759730518e-06, "loss": 0.4651, "step": 1620 }, { "epoch": 1.9811550151975683, "grad_norm": 2.4642460346221924, "learning_rate": 3.1185106711996848e-06, "loss": 0.4492, "step": 1630 }, { "epoch": 1.9933130699088146, "grad_norm": 2.898085355758667, "learning_rate": 3.0531159864966885e-06, "loss": 0.4217, "step": 1640 }, { "epoch": 2.0048632218844986, "grad_norm": 2.5043299198150635, "learning_rate": 2.9881115293782638e-06, "loss": 0.4087, "step": 1650 }, { "epoch": 2.0170212765957447, "grad_norm": 2.5139353275299072, "learning_rate": 2.923510329142568e-06, "loss": 0.3166, "step": 1660 }, { "epoch": 2.029179331306991, "grad_norm": 2.334691047668457, "learning_rate": 2.8593253342601557e-06, "loss": 0.2967, "step": 1670 }, { "epoch": 2.041337386018237, "grad_norm": 2.763742685317993, "learning_rate": 2.795569409778639e-06, "loss": 0.3263, "step": 1680 }, { "epoch": 2.0534954407294834, "grad_norm": 2.539466142654419, "learning_rate": 2.7322553347440368e-06, "loss": 0.2964, "step": 1690 }, { "epoch": 2.0656534954407295, "grad_norm": 3.0033152103424072, "learning_rate": 2.6693957996393984e-06, "loss": 0.3157, "step": 1700 }, { "epoch": 2.0778115501519756, "grad_norm": 2.8573410511016846, "learning_rate": 2.6070034038411553e-06, "loss": 0.3542, "step": 1710 }, { "epoch": 2.0899696048632217, "grad_norm": 2.31459641456604, "learning_rate": 2.545090653093738e-06, "loss": 0.2965, "step": 1720 }, { "epoch": 2.1021276595744682, "grad_norm": 2.5029146671295166, "learning_rate": 2.4836699570029623e-06, "loss": 0.295, "step": 1730 }, { "epoch": 2.1142857142857143, "grad_norm": 2.295614004135132, "learning_rate": 2.4227536265486885e-06, "loss": 0.3075, "step": 1740 }, { "epoch": 2.1264437689969604, "grad_norm": 2.5719516277313232, "learning_rate": 2.3623538716172394e-06, "loss": 0.3397, "step": 1750 }, { "epoch": 2.1386018237082065, "grad_norm": 2.6095409393310547, "learning_rate": 2.302482798554096e-06, "loss": 0.324, "step": 1760 }, { "epoch": 2.150759878419453, "grad_norm": 2.429758310317993, "learning_rate": 2.2431524077373314e-06, "loss": 0.2939, "step": 1770 }, { "epoch": 2.162917933130699, "grad_norm": 2.9179751873016357, "learning_rate": 2.1843745911722937e-06, "loss": 0.308, "step": 1780 }, { "epoch": 2.1750759878419452, "grad_norm": 3.281049966812134, "learning_rate": 2.1261611301080063e-06, "loss": 0.3229, "step": 1790 }, { "epoch": 2.1872340425531913, "grad_norm": 2.931143283843994, "learning_rate": 2.068523692675772e-06, "loss": 0.3107, "step": 1800 }, { "epoch": 2.199392097264438, "grad_norm": 2.432403802871704, "learning_rate": 2.0114738315504505e-06, "loss": 0.2925, "step": 1810 }, { "epoch": 2.211550151975684, "grad_norm": 3.296288251876831, "learning_rate": 1.955022981634863e-06, "loss": 0.3115, "step": 1820 }, { "epoch": 2.22370820668693, "grad_norm": 2.6482620239257812, "learning_rate": 1.8991824577678269e-06, "loss": 0.3423, "step": 1830 }, { "epoch": 2.235866261398176, "grad_norm": 2.5689730644226074, "learning_rate": 1.8439634524562423e-06, "loss": 0.344, "step": 1840 }, { "epoch": 2.2480243161094227, "grad_norm": 2.4932291507720947, "learning_rate": 1.7893770336316928e-06, "loss": 0.3052, "step": 1850 }, { "epoch": 2.2601823708206688, "grad_norm": 2.985034704208374, "learning_rate": 1.7354341424320286e-06, "loss": 0.3056, "step": 1860 }, { "epoch": 2.272340425531915, "grad_norm": 2.558969020843506, "learning_rate": 1.6821455910083535e-06, "loss": 0.2883, "step": 1870 }, { "epoch": 2.284498480243161, "grad_norm": 2.731433868408203, "learning_rate": 1.6295220603578727e-06, "loss": 0.3017, "step": 1880 }, { "epoch": 2.2966565349544075, "grad_norm": 2.7767181396484375, "learning_rate": 1.5775740981830262e-06, "loss": 0.3348, "step": 1890 }, { "epoch": 2.3088145896656536, "grad_norm": 2.289393424987793, "learning_rate": 1.526312116777336e-06, "loss": 0.3377, "step": 1900 }, { "epoch": 2.3209726443768997, "grad_norm": 2.3055357933044434, "learning_rate": 1.475746390938399e-06, "loss": 0.3111, "step": 1910 }, { "epoch": 2.3331306990881457, "grad_norm": 3.120150089263916, "learning_rate": 1.4258870559084387e-06, "loss": 0.3172, "step": 1920 }, { "epoch": 2.3452887537993923, "grad_norm": 2.520087718963623, "learning_rate": 1.3767441053428244e-06, "loss": 0.3071, "step": 1930 }, { "epoch": 2.3574468085106384, "grad_norm": 2.533917188644409, "learning_rate": 1.328327389306977e-06, "loss": 0.328, "step": 1940 }, { "epoch": 2.3696048632218845, "grad_norm": 2.611781120300293, "learning_rate": 1.2806466123020479e-06, "loss": 0.2373, "step": 1950 }, { "epoch": 2.3817629179331306, "grad_norm": 2.3127501010894775, "learning_rate": 1.2337113313197813e-06, "loss": 0.3226, "step": 1960 }, { "epoch": 2.393920972644377, "grad_norm": 3.0540225505828857, "learning_rate": 1.1875309539269332e-06, "loss": 0.3181, "step": 1970 }, { "epoch": 2.406079027355623, "grad_norm": 2.2503840923309326, "learning_rate": 1.1421147363796547e-06, "loss": 0.2918, "step": 1980 }, { "epoch": 2.4182370820668693, "grad_norm": 2.5295841693878174, "learning_rate": 1.097471781768194e-06, "loss": 0.2941, "step": 1990 }, { "epoch": 2.4303951367781154, "grad_norm": 2.7835609912872314, "learning_rate": 1.053611038192296e-06, "loss": 0.2901, "step": 2000 }, { "epoch": 2.4425531914893615, "grad_norm": 2.6751868724823, "learning_rate": 1.0105412969676758e-06, "loss": 0.335, "step": 2010 }, { "epoch": 2.454711246200608, "grad_norm": 2.333569049835205, "learning_rate": 9.682711908639137e-07, "loss": 0.2967, "step": 2020 }, { "epoch": 2.466869300911854, "grad_norm": 2.4557461738586426, "learning_rate": 9.268091923741246e-07, "loss": 0.2856, "step": 2030 }, { "epoch": 2.4790273556231, "grad_norm": 2.8368921279907227, "learning_rate": 8.861636120167632e-07, "loss": 0.3396, "step": 2040 }, { "epoch": 2.4911854103343467, "grad_norm": 2.238975763320923, "learning_rate": 8.463425966698857e-07, "loss": 0.3138, "step": 2050 }, { "epoch": 2.503343465045593, "grad_norm": 2.7118637561798096, "learning_rate": 8.073541279382135e-07, "loss": 0.3397, "step": 2060 }, { "epoch": 2.515501519756839, "grad_norm": 2.528172731399536, "learning_rate": 7.69206020553323e-07, "loss": 0.3273, "step": 2070 }, { "epoch": 2.527659574468085, "grad_norm": 2.3029799461364746, "learning_rate": 7.319059208072909e-07, "loss": 0.3238, "step": 2080 }, { "epoch": 2.539817629179331, "grad_norm": 2.535954475402832, "learning_rate": 6.954613050200859e-07, "loss": 0.328, "step": 2090 }, { "epoch": 2.5519756838905776, "grad_norm": 2.4356179237365723, "learning_rate": 6.5987947804104e-07, "loss": 0.3081, "step": 2100 }, { "epoch": 2.5641337386018237, "grad_norm": 2.039999008178711, "learning_rate": 6.251675717846905e-07, "loss": 0.311, "step": 2110 }, { "epoch": 2.57629179331307, "grad_norm": 2.296266794204712, "learning_rate": 5.913325438012773e-07, "loss": 0.2815, "step": 2120 }, { "epoch": 2.5884498480243163, "grad_norm": 2.9788355827331543, "learning_rate": 5.583811758821916e-07, "loss": 0.3346, "step": 2130 }, { "epoch": 2.6006079027355624, "grad_norm": 3.0497772693634033, "learning_rate": 5.263200727006568e-07, "loss": 0.2976, "step": 2140 }, { "epoch": 2.6127659574468085, "grad_norm": 2.972386121749878, "learning_rate": 4.951556604879049e-07, "loss": 0.272, "step": 2150 }, { "epoch": 2.6249240121580546, "grad_norm": 3.3032212257385254, "learning_rate": 4.648941857451228e-07, "loss": 0.2989, "step": 2160 }, { "epoch": 2.6370820668693007, "grad_norm": 2.078023672103882, "learning_rate": 4.355417139914242e-07, "loss": 0.3353, "step": 2170 }, { "epoch": 2.6492401215805472, "grad_norm": 2.5029313564300537, "learning_rate": 4.0710412854809255e-07, "loss": 0.3413, "step": 2180 }, { "epoch": 2.6613981762917933, "grad_norm": 2.516641616821289, "learning_rate": 3.7958712935934726e-07, "loss": 0.347, "step": 2190 }, { "epoch": 2.6735562310030394, "grad_norm": 2.115417718887329, "learning_rate": 3.5299623184986366e-07, "loss": 0.2955, "step": 2200 }, { "epoch": 2.685714285714286, "grad_norm": 2.419532299041748, "learning_rate": 3.273367658192778e-07, "loss": 0.285, "step": 2210 }, { "epoch": 2.697872340425532, "grad_norm": 2.186316967010498, "learning_rate": 3.0261387437389766e-07, "loss": 0.3091, "step": 2220 }, { "epoch": 2.710030395136778, "grad_norm": 2.725886583328247, "learning_rate": 2.7883251289583467e-07, "loss": 0.317, "step": 2230 }, { "epoch": 2.722188449848024, "grad_norm": 2.355541944503784, "learning_rate": 2.5599744804975956e-07, "loss": 0.3093, "step": 2240 }, { "epoch": 2.7343465045592703, "grad_norm": 2.3325600624084473, "learning_rate": 2.3411325682748843e-07, "loss": 0.2784, "step": 2250 }, { "epoch": 2.746504559270517, "grad_norm": 2.1935389041900635, "learning_rate": 2.1318432563058765e-07, "loss": 0.2835, "step": 2260 }, { "epoch": 2.758662613981763, "grad_norm": 2.6569478511810303, "learning_rate": 1.9321484939116843e-07, "loss": 0.2821, "step": 2270 }, { "epoch": 2.770820668693009, "grad_norm": 2.2891359329223633, "learning_rate": 1.742088307310741e-07, "loss": 0.3362, "step": 2280 }, { "epoch": 2.7829787234042556, "grad_norm": 2.387622833251953, "learning_rate": 1.561700791596038e-07, "loss": 0.2973, "step": 2290 }, { "epoch": 2.7951367781155017, "grad_norm": 2.550020217895508, "learning_rate": 1.3910221030994764e-07, "loss": 0.279, "step": 2300 }, { "epoch": 2.8072948328267477, "grad_norm": 2.9916679859161377, "learning_rate": 1.2300864521447575e-07, "loss": 0.3318, "step": 2310 }, { "epoch": 2.819452887537994, "grad_norm": 2.5227696895599365, "learning_rate": 1.0789260961904357e-07, "loss": 0.2887, "step": 2320 }, { "epoch": 2.83161094224924, "grad_norm": 2.750169038772583, "learning_rate": 9.375713333642677e-08, "loss": 0.3087, "step": 2330 }, { "epoch": 2.8437689969604865, "grad_norm": 2.2595717906951904, "learning_rate": 8.060504963903815e-08, "loss": 0.28, "step": 2340 }, { "epoch": 2.8559270516717326, "grad_norm": 2.5719687938690186, "learning_rate": 6.843899469103521e-08, "loss": 0.3007, "step": 2350 }, { "epoch": 2.8680851063829786, "grad_norm": 2.3983314037323, "learning_rate": 5.726140701993288e-08, "loss": 0.3049, "step": 2360 }, { "epoch": 2.880243161094225, "grad_norm": 2.910645008087158, "learning_rate": 4.707452702783388e-08, "loss": 0.3006, "step": 2370 }, { "epoch": 2.8924012158054713, "grad_norm": 2.546748161315918, "learning_rate": 3.7880396542369635e-08, "loss": 0.3431, "step": 2380 }, { "epoch": 2.9045592705167174, "grad_norm": 2.052919387817383, "learning_rate": 2.9680858407441503e-08, "loss": 0.2987, "step": 2390 }, { "epoch": 2.9167173252279635, "grad_norm": 2.4598567485809326, "learning_rate": 2.24775561138485e-08, "loss": 0.3207, "step": 2400 }, { "epoch": 2.9288753799392095, "grad_norm": 2.4281694889068604, "learning_rate": 1.627193346986744e-08, "loss": 0.2847, "step": 2410 }, { "epoch": 2.941033434650456, "grad_norm": 2.2293615341186523, "learning_rate": 1.1065234311864459e-08, "loss": 0.2952, "step": 2420 }, { "epoch": 2.953191489361702, "grad_norm": 2.4415087699890137, "learning_rate": 6.858502254981081e-09, "loss": 0.2756, "step": 2430 }, { "epoch": 2.9653495440729483, "grad_norm": 2.482046604156494, "learning_rate": 3.652580483956558e-09, "loss": 0.3162, "step": 2440 }, { "epoch": 2.977507598784195, "grad_norm": 2.2831978797912598, "learning_rate": 1.4481115841230574e-09, "loss": 0.2904, "step": 2450 }, { "epoch": 2.989665653495441, "grad_norm": 2.147287368774414, "learning_rate": 2.4553741260535667e-10, "loss": 0.3062, "step": 2460 } ], "logging_steps": 10, "max_steps": 2466, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.0307400421054874e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }