|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9969604863221884, |
|
"eval_steps": 500, |
|
"global_step": 2466, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0121580547112462, |
|
"grad_norm": 5.922801494598389, |
|
"learning_rate": 3.6437246963562754e-07, |
|
"loss": 1.1863, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0243161094224924, |
|
"grad_norm": 6.66644811630249, |
|
"learning_rate": 7.692307692307694e-07, |
|
"loss": 1.1376, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0364741641337386, |
|
"grad_norm": 7.5532402992248535, |
|
"learning_rate": 1.174089068825911e-06, |
|
"loss": 1.1034, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0486322188449848, |
|
"grad_norm": 4.451807498931885, |
|
"learning_rate": 1.5789473684210526e-06, |
|
"loss": 1.0824, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.060790273556231005, |
|
"grad_norm": 4.282721996307373, |
|
"learning_rate": 1.9838056680161946e-06, |
|
"loss": 0.9437, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0729483282674772, |
|
"grad_norm": 3.476186990737915, |
|
"learning_rate": 2.3886639676113362e-06, |
|
"loss": 0.8906, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0851063829787234, |
|
"grad_norm": 2.9487781524658203, |
|
"learning_rate": 2.7935222672064783e-06, |
|
"loss": 0.8558, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0972644376899696, |
|
"grad_norm": 3.981879949569702, |
|
"learning_rate": 3.19838056680162e-06, |
|
"loss": 0.7978, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.1094224924012158, |
|
"grad_norm": 3.0333662033081055, |
|
"learning_rate": 3.6032388663967615e-06, |
|
"loss": 0.7652, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.12158054711246201, |
|
"grad_norm": 2.985710859298706, |
|
"learning_rate": 4.008097165991903e-06, |
|
"loss": 0.7671, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1337386018237082, |
|
"grad_norm": 2.512871026992798, |
|
"learning_rate": 4.412955465587045e-06, |
|
"loss": 0.7848, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.1458966565349544, |
|
"grad_norm": 3.5035741329193115, |
|
"learning_rate": 4.817813765182186e-06, |
|
"loss": 0.7419, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.1580547112462006, |
|
"grad_norm": 4.36132287979126, |
|
"learning_rate": 5.222672064777329e-06, |
|
"loss": 0.7379, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.1702127659574468, |
|
"grad_norm": 3.3608834743499756, |
|
"learning_rate": 5.6275303643724695e-06, |
|
"loss": 0.7135, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.182370820668693, |
|
"grad_norm": 3.3592512607574463, |
|
"learning_rate": 6.0323886639676124e-06, |
|
"loss": 0.7427, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.1945288753799392, |
|
"grad_norm": 2.913890838623047, |
|
"learning_rate": 6.437246963562754e-06, |
|
"loss": 0.6847, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.2066869300911854, |
|
"grad_norm": 2.6773297786712646, |
|
"learning_rate": 6.842105263157896e-06, |
|
"loss": 0.7409, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.2188449848024316, |
|
"grad_norm": 2.984339714050293, |
|
"learning_rate": 7.246963562753037e-06, |
|
"loss": 0.6755, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.23100303951367782, |
|
"grad_norm": 3.006441593170166, |
|
"learning_rate": 7.651821862348178e-06, |
|
"loss": 0.7105, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.24316109422492402, |
|
"grad_norm": 3.7500977516174316, |
|
"learning_rate": 8.056680161943322e-06, |
|
"loss": 0.6796, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2553191489361702, |
|
"grad_norm": 3.6839091777801514, |
|
"learning_rate": 8.461538461538462e-06, |
|
"loss": 0.6716, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.2674772036474164, |
|
"grad_norm": 3.265120267868042, |
|
"learning_rate": 8.866396761133604e-06, |
|
"loss": 0.6728, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.2796352583586626, |
|
"grad_norm": 3.2442452907562256, |
|
"learning_rate": 9.271255060728746e-06, |
|
"loss": 0.6807, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.2917933130699088, |
|
"grad_norm": 3.7613420486450195, |
|
"learning_rate": 9.676113360323888e-06, |
|
"loss": 0.674, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.303951367781155, |
|
"grad_norm": 4.201897621154785, |
|
"learning_rate": 9.999979955978923e-06, |
|
"loss": 0.6987, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.3161094224924012, |
|
"grad_norm": 3.1414012908935547, |
|
"learning_rate": 9.999278432115106e-06, |
|
"loss": 0.6584, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.3282674772036474, |
|
"grad_norm": 3.2702925205230713, |
|
"learning_rate": 9.99757486789673e-06, |
|
"loss": 0.683, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.3404255319148936, |
|
"grad_norm": 2.9207000732421875, |
|
"learning_rate": 9.9948696047811e-06, |
|
"loss": 0.6895, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.3525835866261398, |
|
"grad_norm": 3.319397211074829, |
|
"learning_rate": 9.991163185003028e-06, |
|
"loss": 0.66, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.364741641337386, |
|
"grad_norm": 2.8080954551696777, |
|
"learning_rate": 9.98645635146616e-06, |
|
"loss": 0.6914, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3768996960486322, |
|
"grad_norm": 4.024887561798096, |
|
"learning_rate": 9.980750047594076e-06, |
|
"loss": 0.6511, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.3890577507598784, |
|
"grad_norm": 2.6032838821411133, |
|
"learning_rate": 9.974045417141186e-06, |
|
"loss": 0.6522, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.4012158054711246, |
|
"grad_norm": 2.4978721141815186, |
|
"learning_rate": 9.966343803963481e-06, |
|
"loss": 0.6517, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.4133738601823708, |
|
"grad_norm": 3.0864603519439697, |
|
"learning_rate": 9.957646751749178e-06, |
|
"loss": 0.662, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.425531914893617, |
|
"grad_norm": 3.713203191757202, |
|
"learning_rate": 9.947956003709301e-06, |
|
"loss": 0.6728, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.4376899696048632, |
|
"grad_norm": 2.8905911445617676, |
|
"learning_rate": 9.937273502228283e-06, |
|
"loss": 0.6905, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.44984802431610943, |
|
"grad_norm": 3.0076723098754883, |
|
"learning_rate": 9.925601388474637e-06, |
|
"loss": 0.6955, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.46200607902735563, |
|
"grad_norm": 3.2814724445343018, |
|
"learning_rate": 9.912942001971792e-06, |
|
"loss": 0.6176, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.47416413373860183, |
|
"grad_norm": 2.9065964221954346, |
|
"learning_rate": 9.899297880129156e-06, |
|
"loss": 0.6768, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.48632218844984804, |
|
"grad_norm": 2.724536657333374, |
|
"learning_rate": 9.884671757733534e-06, |
|
"loss": 0.6382, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.49848024316109424, |
|
"grad_norm": 2.8882269859313965, |
|
"learning_rate": 9.869066566400975e-06, |
|
"loss": 0.6603, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.5106382978723404, |
|
"grad_norm": 2.5395541191101074, |
|
"learning_rate": 9.852485433989158e-06, |
|
"loss": 0.642, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.5227963525835866, |
|
"grad_norm": 2.759591579437256, |
|
"learning_rate": 9.834931683970468e-06, |
|
"loss": 0.6424, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.5349544072948328, |
|
"grad_norm": 2.4878828525543213, |
|
"learning_rate": 9.816408834765838e-06, |
|
"loss": 0.6435, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.547112462006079, |
|
"grad_norm": 2.7571871280670166, |
|
"learning_rate": 9.796920599039536e-06, |
|
"loss": 0.6766, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5592705167173252, |
|
"grad_norm": 2.585879325866699, |
|
"learning_rate": 9.776470882954998e-06, |
|
"loss": 0.6082, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 2.5847771167755127, |
|
"learning_rate": 9.7550637853919e-06, |
|
"loss": 0.6551, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.5835866261398176, |
|
"grad_norm": 2.795027732849121, |
|
"learning_rate": 9.732703597124586e-06, |
|
"loss": 0.6429, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.5957446808510638, |
|
"grad_norm": 2.377835273742676, |
|
"learning_rate": 9.709394799962038e-06, |
|
"loss": 0.6386, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.60790273556231, |
|
"grad_norm": 3.254490852355957, |
|
"learning_rate": 9.685142065849556e-06, |
|
"loss": 0.5844, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6200607902735562, |
|
"grad_norm": 2.866058588027954, |
|
"learning_rate": 9.659950255932324e-06, |
|
"loss": 0.6079, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.6322188449848024, |
|
"grad_norm": 2.680647373199463, |
|
"learning_rate": 9.633824419581069e-06, |
|
"loss": 0.6294, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.6443768996960486, |
|
"grad_norm": 3.1423895359039307, |
|
"learning_rate": 9.60676979337996e-06, |
|
"loss": 0.6311, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.6565349544072948, |
|
"grad_norm": 2.480421304702759, |
|
"learning_rate": 9.578791800077021e-06, |
|
"loss": 0.6395, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.668693009118541, |
|
"grad_norm": 3.06001877784729, |
|
"learning_rate": 9.549896047497202e-06, |
|
"loss": 0.6613, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.6808510638297872, |
|
"grad_norm": 2.8360841274261475, |
|
"learning_rate": 9.520088327418371e-06, |
|
"loss": 0.6161, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.6930091185410334, |
|
"grad_norm": 3.1503562927246094, |
|
"learning_rate": 9.489374614410413e-06, |
|
"loss": 0.6137, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.7051671732522796, |
|
"grad_norm": 2.579737663269043, |
|
"learning_rate": 9.457761064637727e-06, |
|
"loss": 0.6068, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.7173252279635258, |
|
"grad_norm": 3.200549602508545, |
|
"learning_rate": 9.425254014625278e-06, |
|
"loss": 0.6436, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.729483282674772, |
|
"grad_norm": 2.9303839206695557, |
|
"learning_rate": 9.391859979988546e-06, |
|
"loss": 0.6062, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7416413373860182, |
|
"grad_norm": 3.0887465476989746, |
|
"learning_rate": 9.35758565412754e-06, |
|
"loss": 0.6244, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.7537993920972644, |
|
"grad_norm": 3.22578763961792, |
|
"learning_rate": 9.322437906885199e-06, |
|
"loss": 0.6544, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.7659574468085106, |
|
"grad_norm": 3.8590402603149414, |
|
"learning_rate": 9.28642378317042e-06, |
|
"loss": 0.6369, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.7781155015197568, |
|
"grad_norm": 3.001377820968628, |
|
"learning_rate": 9.249550501545998e-06, |
|
"loss": 0.6556, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.790273556231003, |
|
"grad_norm": 2.9286224842071533, |
|
"learning_rate": 9.211825452781762e-06, |
|
"loss": 0.599, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.8024316109422492, |
|
"grad_norm": 3.0069026947021484, |
|
"learning_rate": 9.173256198373185e-06, |
|
"loss": 0.6284, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.8145896656534954, |
|
"grad_norm": 3.2098565101623535, |
|
"learning_rate": 9.133850469025786e-06, |
|
"loss": 0.6047, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.8267477203647416, |
|
"grad_norm": 2.4998652935028076, |
|
"learning_rate": 9.093616163105609e-06, |
|
"loss": 0.6233, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.8389057750759878, |
|
"grad_norm": 2.6951255798339844, |
|
"learning_rate": 9.052561345056095e-06, |
|
"loss": 0.6288, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.851063829787234, |
|
"grad_norm": 2.79081654548645, |
|
"learning_rate": 9.010694243781671e-06, |
|
"loss": 0.6248, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8632218844984803, |
|
"grad_norm": 3.0113465785980225, |
|
"learning_rate": 8.96802325099838e-06, |
|
"loss": 0.6262, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.8753799392097265, |
|
"grad_norm": 2.396683692932129, |
|
"learning_rate": 8.924556919551863e-06, |
|
"loss": 0.6154, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.8875379939209727, |
|
"grad_norm": 2.604168176651001, |
|
"learning_rate": 8.880303961703048e-06, |
|
"loss": 0.6044, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.8996960486322189, |
|
"grad_norm": 2.6339340209960938, |
|
"learning_rate": 8.835273247381903e-06, |
|
"loss": 0.6367, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.9118541033434651, |
|
"grad_norm": 2.699192762374878, |
|
"learning_rate": 8.789473802409565e-06, |
|
"loss": 0.6598, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.9240121580547113, |
|
"grad_norm": 3.0326826572418213, |
|
"learning_rate": 8.742914806689234e-06, |
|
"loss": 0.596, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.9361702127659575, |
|
"grad_norm": 3.168898582458496, |
|
"learning_rate": 8.695605592366184e-06, |
|
"loss": 0.5843, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.9483282674772037, |
|
"grad_norm": 2.6533005237579346, |
|
"learning_rate": 8.647555641957243e-06, |
|
"loss": 0.598, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.9604863221884499, |
|
"grad_norm": 3.4256536960601807, |
|
"learning_rate": 8.59877458645017e-06, |
|
"loss": 0.588, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.9726443768996961, |
|
"grad_norm": 2.429436445236206, |
|
"learning_rate": 8.54927220337322e-06, |
|
"loss": 0.5955, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.9848024316109423, |
|
"grad_norm": 2.6898012161254883, |
|
"learning_rate": 8.499058414835389e-06, |
|
"loss": 0.6068, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.9969604863221885, |
|
"grad_norm": 2.064389944076538, |
|
"learning_rate": 8.448143285537645e-06, |
|
"loss": 0.5694, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.0085106382978724, |
|
"grad_norm": 2.412479877471924, |
|
"learning_rate": 8.396537020755588e-06, |
|
"loss": 0.4937, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.0206686930091184, |
|
"grad_norm": 2.439929246902466, |
|
"learning_rate": 8.344249964293942e-06, |
|
"loss": 0.4945, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.0328267477203648, |
|
"grad_norm": 2.665113687515259, |
|
"learning_rate": 8.291292596413272e-06, |
|
"loss": 0.499, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.0449848024316108, |
|
"grad_norm": 2.39530086517334, |
|
"learning_rate": 8.237675531729345e-06, |
|
"loss": 0.4825, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.0571428571428572, |
|
"grad_norm": 2.943648099899292, |
|
"learning_rate": 8.18340951708558e-06, |
|
"loss": 0.4683, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.0693009118541033, |
|
"grad_norm": 2.766900062561035, |
|
"learning_rate": 8.128505429398976e-06, |
|
"loss": 0.477, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.0814589665653496, |
|
"grad_norm": 2.389639139175415, |
|
"learning_rate": 8.072974273479972e-06, |
|
"loss": 0.5606, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.0936170212765957, |
|
"grad_norm": 2.3432369232177734, |
|
"learning_rate": 8.016827179826685e-06, |
|
"loss": 0.4753, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.105775075987842, |
|
"grad_norm": 2.27811598777771, |
|
"learning_rate": 7.960075402393937e-06, |
|
"loss": 0.4733, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.117933130699088, |
|
"grad_norm": 2.5332465171813965, |
|
"learning_rate": 7.902730316337556e-06, |
|
"loss": 0.4444, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.1300911854103344, |
|
"grad_norm": 2.445450782775879, |
|
"learning_rate": 7.844803415734368e-06, |
|
"loss": 0.4694, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.1422492401215805, |
|
"grad_norm": 2.309290647506714, |
|
"learning_rate": 7.786306311278354e-06, |
|
"loss": 0.4617, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.1544072948328268, |
|
"grad_norm": 2.7566754817962646, |
|
"learning_rate": 7.727250727953445e-06, |
|
"loss": 0.5046, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.1665653495440729, |
|
"grad_norm": 2.475858211517334, |
|
"learning_rate": 7.667648502683406e-06, |
|
"loss": 0.5175, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.1787234042553192, |
|
"grad_norm": 2.5813775062561035, |
|
"learning_rate": 7.607511581959261e-06, |
|
"loss": 0.4656, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.1908814589665653, |
|
"grad_norm": 2.9376819133758545, |
|
"learning_rate": 7.5468520194447925e-06, |
|
"loss": 0.4945, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.2030395136778116, |
|
"grad_norm": 2.2708747386932373, |
|
"learning_rate": 7.485681973560532e-06, |
|
"loss": 0.4931, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.2151975683890577, |
|
"grad_norm": 2.87481427192688, |
|
"learning_rate": 7.4240137050467635e-06, |
|
"loss": 0.4713, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.227355623100304, |
|
"grad_norm": 3.1325576305389404, |
|
"learning_rate": 7.361859574506017e-06, |
|
"loss": 0.4775, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.23951367781155, |
|
"grad_norm": 2.519160509109497, |
|
"learning_rate": 7.299232039925552e-06, |
|
"loss": 0.4747, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.2516717325227964, |
|
"grad_norm": 3.1819024085998535, |
|
"learning_rate": 7.236143654180311e-06, |
|
"loss": 0.4836, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.2638297872340425, |
|
"grad_norm": 2.5699877738952637, |
|
"learning_rate": 7.172607062516856e-06, |
|
"loss": 0.4471, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.2759878419452888, |
|
"grad_norm": 3.718123435974121, |
|
"learning_rate": 7.108635000018802e-06, |
|
"loss": 0.5022, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.288145896656535, |
|
"grad_norm": 3.3233118057250977, |
|
"learning_rate": 7.044240289054227e-06, |
|
"loss": 0.4829, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.3003039513677812, |
|
"grad_norm": 2.362790584564209, |
|
"learning_rate": 6.979435836705602e-06, |
|
"loss": 0.4801, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.3124620060790273, |
|
"grad_norm": 2.5460381507873535, |
|
"learning_rate": 6.9142346321827246e-06, |
|
"loss": 0.4922, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.3246200607902736, |
|
"grad_norm": 3.0161983966827393, |
|
"learning_rate": 6.84864974421921e-06, |
|
"loss": 0.4734, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.3367781155015197, |
|
"grad_norm": 2.387080669403076, |
|
"learning_rate": 6.782694318453033e-06, |
|
"loss": 0.4924, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.348936170212766, |
|
"grad_norm": 3.0172154903411865, |
|
"learning_rate": 6.716381574791648e-06, |
|
"loss": 0.4669, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.361094224924012, |
|
"grad_norm": 3.0593836307525635, |
|
"learning_rate": 6.649724804762236e-06, |
|
"loss": 0.4689, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.3732522796352584, |
|
"grad_norm": 2.0975329875946045, |
|
"learning_rate": 6.5827373688475925e-06, |
|
"loss": 0.501, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.3854103343465045, |
|
"grad_norm": 2.6222572326660156, |
|
"learning_rate": 6.5154326938081866e-06, |
|
"loss": 0.487, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.3975683890577508, |
|
"grad_norm": 2.6791985034942627, |
|
"learning_rate": 6.447824269990947e-06, |
|
"loss": 0.4589, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.409726443768997, |
|
"grad_norm": 3.0119071006774902, |
|
"learning_rate": 6.3799256486252945e-06, |
|
"loss": 0.4839, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.4218844984802432, |
|
"grad_norm": 3.6317973136901855, |
|
"learning_rate": 6.311750439106976e-06, |
|
"loss": 0.4391, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.4340425531914893, |
|
"grad_norm": 1.9667277336120605, |
|
"learning_rate": 6.243312306270235e-06, |
|
"loss": 0.4379, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.4462006079027356, |
|
"grad_norm": 2.5585360527038574, |
|
"learning_rate": 6.174624967648877e-06, |
|
"loss": 0.4954, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.4583586626139817, |
|
"grad_norm": 2.421276330947876, |
|
"learning_rate": 6.105702190726765e-06, |
|
"loss": 0.4558, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.470516717325228, |
|
"grad_norm": 2.7117786407470703, |
|
"learning_rate": 6.03655779017831e-06, |
|
"loss": 0.488, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.4826747720364741, |
|
"grad_norm": 2.3945164680480957, |
|
"learning_rate": 5.967205625099496e-06, |
|
"loss": 0.4849, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.4948328267477204, |
|
"grad_norm": 2.291707754135132, |
|
"learning_rate": 5.897659596230003e-06, |
|
"loss": 0.4614, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.5069908814589665, |
|
"grad_norm": 2.583559036254883, |
|
"learning_rate": 5.827933643166993e-06, |
|
"loss": 0.4626, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.5191489361702128, |
|
"grad_norm": 2.1779534816741943, |
|
"learning_rate": 5.758041741571088e-06, |
|
"loss": 0.4774, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.531306990881459, |
|
"grad_norm": 3.738179922103882, |
|
"learning_rate": 5.687997900365134e-06, |
|
"loss": 0.4487, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.543465045592705, |
|
"grad_norm": 2.461461305618286, |
|
"learning_rate": 5.617816158926303e-06, |
|
"loss": 0.4851, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.5556231003039513, |
|
"grad_norm": 2.784620523452759, |
|
"learning_rate": 5.547510584272069e-06, |
|
"loss": 0.5079, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.5677811550151977, |
|
"grad_norm": 2.8999369144439697, |
|
"learning_rate": 5.477095268240669e-06, |
|
"loss": 0.4596, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.5799392097264437, |
|
"grad_norm": 2.786457061767578, |
|
"learning_rate": 5.406584324666565e-06, |
|
"loss": 0.4226, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.5920972644376898, |
|
"grad_norm": 3.0281615257263184, |
|
"learning_rate": 5.335991886551526e-06, |
|
"loss": 0.4826, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.6042553191489362, |
|
"grad_norm": 2.60740327835083, |
|
"learning_rate": 5.2653321032318315e-06, |
|
"loss": 0.5185, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.6164133738601825, |
|
"grad_norm": 2.150132179260254, |
|
"learning_rate": 5.194619137542241e-06, |
|
"loss": 0.511, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.6285714285714286, |
|
"grad_norm": 2.7613778114318848, |
|
"learning_rate": 5.123867162977224e-06, |
|
"loss": 0.4653, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.6407294832826747, |
|
"grad_norm": 2.5103297233581543, |
|
"learning_rate": 5.053090360850072e-06, |
|
"loss": 0.4206, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.652887537993921, |
|
"grad_norm": 2.6275055408477783, |
|
"learning_rate": 4.9823029174504335e-06, |
|
"loss": 0.4727, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.6650455927051673, |
|
"grad_norm": 2.6730258464813232, |
|
"learning_rate": 4.9115190212008745e-06, |
|
"loss": 0.4616, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.6772036474164134, |
|
"grad_norm": 2.8876965045928955, |
|
"learning_rate": 4.840752859812972e-06, |
|
"loss": 0.4868, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.6893617021276595, |
|
"grad_norm": 2.7904891967773438, |
|
"learning_rate": 4.770018617443578e-06, |
|
"loss": 0.4453, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.7015197568389058, |
|
"grad_norm": 2.7088356018066406, |
|
"learning_rate": 4.699330471851798e-06, |
|
"loss": 0.4708, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.713677811550152, |
|
"grad_norm": 2.1268298625946045, |
|
"learning_rate": 4.628702591557237e-06, |
|
"loss": 0.4901, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.7258358662613982, |
|
"grad_norm": 2.3978848457336426, |
|
"learning_rate": 4.558149133000104e-06, |
|
"loss": 0.5164, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.7379939209726443, |
|
"grad_norm": 1.8490489721298218, |
|
"learning_rate": 4.487684237703734e-06, |
|
"loss": 0.4342, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.7501519756838906, |
|
"grad_norm": 2.7367939949035645, |
|
"learning_rate": 4.417322029440119e-06, |
|
"loss": 0.4887, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.762310030395137, |
|
"grad_norm": 2.217988967895508, |
|
"learning_rate": 4.347076611398961e-06, |
|
"loss": 0.46, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.774468085106383, |
|
"grad_norm": 2.8989672660827637, |
|
"learning_rate": 4.2769620633608835e-06, |
|
"loss": 0.4524, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.786626139817629, |
|
"grad_norm": 2.5125184059143066, |
|
"learning_rate": 4.206992438875318e-06, |
|
"loss": 0.4346, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.7987841945288754, |
|
"grad_norm": 2.9049086570739746, |
|
"learning_rate": 4.137181762443658e-06, |
|
"loss": 0.3629, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.8109422492401217, |
|
"grad_norm": 2.5007903575897217, |
|
"learning_rate": 4.0675440267082236e-06, |
|
"loss": 0.4943, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.8231003039513678, |
|
"grad_norm": 2.90285325050354, |
|
"learning_rate": 3.998093189647622e-06, |
|
"loss": 0.4331, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.8352583586626139, |
|
"grad_norm": 2.7467432022094727, |
|
"learning_rate": 3.928843171779051e-06, |
|
"loss": 0.4826, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.8474164133738602, |
|
"grad_norm": 2.5184881687164307, |
|
"learning_rate": 3.859807853368112e-06, |
|
"loss": 0.4204, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.8595744680851065, |
|
"grad_norm": 2.262559652328491, |
|
"learning_rate": 3.791001071646695e-06, |
|
"loss": 0.455, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.8717325227963526, |
|
"grad_norm": 2.8905036449432373, |
|
"learning_rate": 3.72243661803948e-06, |
|
"loss": 0.4584, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.8838905775075987, |
|
"grad_norm": 3.1106526851654053, |
|
"learning_rate": 3.6541282353996275e-06, |
|
"loss": 0.4559, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.896048632218845, |
|
"grad_norm": 2.7985055446624756, |
|
"learning_rate": 3.5860896152542013e-06, |
|
"loss": 0.4452, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.9082066869300913, |
|
"grad_norm": 2.4769771099090576, |
|
"learning_rate": 3.5183343950598825e-06, |
|
"loss": 0.463, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.9203647416413374, |
|
"grad_norm": 2.9029109477996826, |
|
"learning_rate": 3.450876155469518e-06, |
|
"loss": 0.4377, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.9325227963525835, |
|
"grad_norm": 2.444227457046509, |
|
"learning_rate": 3.3837284176100543e-06, |
|
"loss": 0.4559, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.9446808510638298, |
|
"grad_norm": 1.8885170221328735, |
|
"learning_rate": 3.3169046403724004e-06, |
|
"loss": 0.4315, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.9568389057750761, |
|
"grad_norm": 2.370542526245117, |
|
"learning_rate": 3.250418217713771e-06, |
|
"loss": 0.4496, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.9689969604863222, |
|
"grad_norm": 2.281675338745117, |
|
"learning_rate": 3.1842824759730518e-06, |
|
"loss": 0.4651, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.9811550151975683, |
|
"grad_norm": 2.4642460346221924, |
|
"learning_rate": 3.1185106711996848e-06, |
|
"loss": 0.4492, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.9933130699088146, |
|
"grad_norm": 2.898085355758667, |
|
"learning_rate": 3.0531159864966885e-06, |
|
"loss": 0.4217, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.0048632218844986, |
|
"grad_norm": 2.5043299198150635, |
|
"learning_rate": 2.9881115293782638e-06, |
|
"loss": 0.4087, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.0170212765957447, |
|
"grad_norm": 2.5139353275299072, |
|
"learning_rate": 2.923510329142568e-06, |
|
"loss": 0.3166, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 2.029179331306991, |
|
"grad_norm": 2.334691047668457, |
|
"learning_rate": 2.8593253342601557e-06, |
|
"loss": 0.2967, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 2.041337386018237, |
|
"grad_norm": 2.763742685317993, |
|
"learning_rate": 2.795569409778639e-06, |
|
"loss": 0.3263, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.0534954407294834, |
|
"grad_norm": 2.539466142654419, |
|
"learning_rate": 2.7322553347440368e-06, |
|
"loss": 0.2964, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 2.0656534954407295, |
|
"grad_norm": 3.0033152103424072, |
|
"learning_rate": 2.6693957996393984e-06, |
|
"loss": 0.3157, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.0778115501519756, |
|
"grad_norm": 2.8573410511016846, |
|
"learning_rate": 2.6070034038411553e-06, |
|
"loss": 0.3542, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 2.0899696048632217, |
|
"grad_norm": 2.31459641456604, |
|
"learning_rate": 2.545090653093738e-06, |
|
"loss": 0.2965, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 2.1021276595744682, |
|
"grad_norm": 2.5029146671295166, |
|
"learning_rate": 2.4836699570029623e-06, |
|
"loss": 0.295, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 2.1142857142857143, |
|
"grad_norm": 2.295614004135132, |
|
"learning_rate": 2.4227536265486885e-06, |
|
"loss": 0.3075, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 2.1264437689969604, |
|
"grad_norm": 2.5719516277313232, |
|
"learning_rate": 2.3623538716172394e-06, |
|
"loss": 0.3397, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.1386018237082065, |
|
"grad_norm": 2.6095409393310547, |
|
"learning_rate": 2.302482798554096e-06, |
|
"loss": 0.324, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 2.150759878419453, |
|
"grad_norm": 2.429758310317993, |
|
"learning_rate": 2.2431524077373314e-06, |
|
"loss": 0.2939, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 2.162917933130699, |
|
"grad_norm": 2.9179751873016357, |
|
"learning_rate": 2.1843745911722937e-06, |
|
"loss": 0.308, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 2.1750759878419452, |
|
"grad_norm": 3.281049966812134, |
|
"learning_rate": 2.1261611301080063e-06, |
|
"loss": 0.3229, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 2.1872340425531913, |
|
"grad_norm": 2.931143283843994, |
|
"learning_rate": 2.068523692675772e-06, |
|
"loss": 0.3107, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.199392097264438, |
|
"grad_norm": 2.432403802871704, |
|
"learning_rate": 2.0114738315504505e-06, |
|
"loss": 0.2925, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 2.211550151975684, |
|
"grad_norm": 3.296288251876831, |
|
"learning_rate": 1.955022981634863e-06, |
|
"loss": 0.3115, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 2.22370820668693, |
|
"grad_norm": 2.6482620239257812, |
|
"learning_rate": 1.8991824577678269e-06, |
|
"loss": 0.3423, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 2.235866261398176, |
|
"grad_norm": 2.5689730644226074, |
|
"learning_rate": 1.8439634524562423e-06, |
|
"loss": 0.344, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 2.2480243161094227, |
|
"grad_norm": 2.4932291507720947, |
|
"learning_rate": 1.7893770336316928e-06, |
|
"loss": 0.3052, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.2601823708206688, |
|
"grad_norm": 2.985034704208374, |
|
"learning_rate": 1.7354341424320286e-06, |
|
"loss": 0.3056, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 2.272340425531915, |
|
"grad_norm": 2.558969020843506, |
|
"learning_rate": 1.6821455910083535e-06, |
|
"loss": 0.2883, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 2.284498480243161, |
|
"grad_norm": 2.731433868408203, |
|
"learning_rate": 1.6295220603578727e-06, |
|
"loss": 0.3017, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 2.2966565349544075, |
|
"grad_norm": 2.7767181396484375, |
|
"learning_rate": 1.5775740981830262e-06, |
|
"loss": 0.3348, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 2.3088145896656536, |
|
"grad_norm": 2.289393424987793, |
|
"learning_rate": 1.526312116777336e-06, |
|
"loss": 0.3377, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.3209726443768997, |
|
"grad_norm": 2.3055357933044434, |
|
"learning_rate": 1.475746390938399e-06, |
|
"loss": 0.3111, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 2.3331306990881457, |
|
"grad_norm": 3.120150089263916, |
|
"learning_rate": 1.4258870559084387e-06, |
|
"loss": 0.3172, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 2.3452887537993923, |
|
"grad_norm": 2.520087718963623, |
|
"learning_rate": 1.3767441053428244e-06, |
|
"loss": 0.3071, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 2.3574468085106384, |
|
"grad_norm": 2.533917188644409, |
|
"learning_rate": 1.328327389306977e-06, |
|
"loss": 0.328, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 2.3696048632218845, |
|
"grad_norm": 2.611781120300293, |
|
"learning_rate": 1.2806466123020479e-06, |
|
"loss": 0.2373, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.3817629179331306, |
|
"grad_norm": 2.3127501010894775, |
|
"learning_rate": 1.2337113313197813e-06, |
|
"loss": 0.3226, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 2.393920972644377, |
|
"grad_norm": 3.0540225505828857, |
|
"learning_rate": 1.1875309539269332e-06, |
|
"loss": 0.3181, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 2.406079027355623, |
|
"grad_norm": 2.2503840923309326, |
|
"learning_rate": 1.1421147363796547e-06, |
|
"loss": 0.2918, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 2.4182370820668693, |
|
"grad_norm": 2.5295841693878174, |
|
"learning_rate": 1.097471781768194e-06, |
|
"loss": 0.2941, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 2.4303951367781154, |
|
"grad_norm": 2.7835609912872314, |
|
"learning_rate": 1.053611038192296e-06, |
|
"loss": 0.2901, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.4425531914893615, |
|
"grad_norm": 2.6751868724823, |
|
"learning_rate": 1.0105412969676758e-06, |
|
"loss": 0.335, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 2.454711246200608, |
|
"grad_norm": 2.333569049835205, |
|
"learning_rate": 9.682711908639137e-07, |
|
"loss": 0.2967, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 2.466869300911854, |
|
"grad_norm": 2.4557461738586426, |
|
"learning_rate": 9.268091923741246e-07, |
|
"loss": 0.2856, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 2.4790273556231, |
|
"grad_norm": 2.8368921279907227, |
|
"learning_rate": 8.861636120167632e-07, |
|
"loss": 0.3396, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 2.4911854103343467, |
|
"grad_norm": 2.238975763320923, |
|
"learning_rate": 8.463425966698857e-07, |
|
"loss": 0.3138, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.503343465045593, |
|
"grad_norm": 2.7118637561798096, |
|
"learning_rate": 8.073541279382135e-07, |
|
"loss": 0.3397, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 2.515501519756839, |
|
"grad_norm": 2.528172731399536, |
|
"learning_rate": 7.69206020553323e-07, |
|
"loss": 0.3273, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 2.527659574468085, |
|
"grad_norm": 2.3029799461364746, |
|
"learning_rate": 7.319059208072909e-07, |
|
"loss": 0.3238, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 2.539817629179331, |
|
"grad_norm": 2.535954475402832, |
|
"learning_rate": 6.954613050200859e-07, |
|
"loss": 0.328, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 2.5519756838905776, |
|
"grad_norm": 2.4356179237365723, |
|
"learning_rate": 6.5987947804104e-07, |
|
"loss": 0.3081, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.5641337386018237, |
|
"grad_norm": 2.039999008178711, |
|
"learning_rate": 6.251675717846905e-07, |
|
"loss": 0.311, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 2.57629179331307, |
|
"grad_norm": 2.296266794204712, |
|
"learning_rate": 5.913325438012773e-07, |
|
"loss": 0.2815, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 2.5884498480243163, |
|
"grad_norm": 2.9788355827331543, |
|
"learning_rate": 5.583811758821916e-07, |
|
"loss": 0.3346, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 2.6006079027355624, |
|
"grad_norm": 3.0497772693634033, |
|
"learning_rate": 5.263200727006568e-07, |
|
"loss": 0.2976, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 2.6127659574468085, |
|
"grad_norm": 2.972386121749878, |
|
"learning_rate": 4.951556604879049e-07, |
|
"loss": 0.272, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.6249240121580546, |
|
"grad_norm": 3.3032212257385254, |
|
"learning_rate": 4.648941857451228e-07, |
|
"loss": 0.2989, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 2.6370820668693007, |
|
"grad_norm": 2.078023672103882, |
|
"learning_rate": 4.355417139914242e-07, |
|
"loss": 0.3353, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 2.6492401215805472, |
|
"grad_norm": 2.5029313564300537, |
|
"learning_rate": 4.0710412854809255e-07, |
|
"loss": 0.3413, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 2.6613981762917933, |
|
"grad_norm": 2.516641616821289, |
|
"learning_rate": 3.7958712935934726e-07, |
|
"loss": 0.347, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 2.6735562310030394, |
|
"grad_norm": 2.115417718887329, |
|
"learning_rate": 3.5299623184986366e-07, |
|
"loss": 0.2955, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.685714285714286, |
|
"grad_norm": 2.419532299041748, |
|
"learning_rate": 3.273367658192778e-07, |
|
"loss": 0.285, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 2.697872340425532, |
|
"grad_norm": 2.186316967010498, |
|
"learning_rate": 3.0261387437389766e-07, |
|
"loss": 0.3091, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 2.710030395136778, |
|
"grad_norm": 2.725886583328247, |
|
"learning_rate": 2.7883251289583467e-07, |
|
"loss": 0.317, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 2.722188449848024, |
|
"grad_norm": 2.355541944503784, |
|
"learning_rate": 2.5599744804975956e-07, |
|
"loss": 0.3093, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 2.7343465045592703, |
|
"grad_norm": 2.3325600624084473, |
|
"learning_rate": 2.3411325682748843e-07, |
|
"loss": 0.2784, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.746504559270517, |
|
"grad_norm": 2.1935389041900635, |
|
"learning_rate": 2.1318432563058765e-07, |
|
"loss": 0.2835, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 2.758662613981763, |
|
"grad_norm": 2.6569478511810303, |
|
"learning_rate": 1.9321484939116843e-07, |
|
"loss": 0.2821, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 2.770820668693009, |
|
"grad_norm": 2.2891359329223633, |
|
"learning_rate": 1.742088307310741e-07, |
|
"loss": 0.3362, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 2.7829787234042556, |
|
"grad_norm": 2.387622833251953, |
|
"learning_rate": 1.561700791596038e-07, |
|
"loss": 0.2973, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 2.7951367781155017, |
|
"grad_norm": 2.550020217895508, |
|
"learning_rate": 1.3910221030994764e-07, |
|
"loss": 0.279, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.8072948328267477, |
|
"grad_norm": 2.9916679859161377, |
|
"learning_rate": 1.2300864521447575e-07, |
|
"loss": 0.3318, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 2.819452887537994, |
|
"grad_norm": 2.5227696895599365, |
|
"learning_rate": 1.0789260961904357e-07, |
|
"loss": 0.2887, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 2.83161094224924, |
|
"grad_norm": 2.750169038772583, |
|
"learning_rate": 9.375713333642677e-08, |
|
"loss": 0.3087, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 2.8437689969604865, |
|
"grad_norm": 2.2595717906951904, |
|
"learning_rate": 8.060504963903815e-08, |
|
"loss": 0.28, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 2.8559270516717326, |
|
"grad_norm": 2.5719687938690186, |
|
"learning_rate": 6.843899469103521e-08, |
|
"loss": 0.3007, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.8680851063829786, |
|
"grad_norm": 2.3983314037323, |
|
"learning_rate": 5.726140701993288e-08, |
|
"loss": 0.3049, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 2.880243161094225, |
|
"grad_norm": 2.910645008087158, |
|
"learning_rate": 4.707452702783388e-08, |
|
"loss": 0.3006, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 2.8924012158054713, |
|
"grad_norm": 2.546748161315918, |
|
"learning_rate": 3.7880396542369635e-08, |
|
"loss": 0.3431, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 2.9045592705167174, |
|
"grad_norm": 2.052919387817383, |
|
"learning_rate": 2.9680858407441503e-08, |
|
"loss": 0.2987, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 2.9167173252279635, |
|
"grad_norm": 2.4598567485809326, |
|
"learning_rate": 2.24775561138485e-08, |
|
"loss": 0.3207, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.9288753799392095, |
|
"grad_norm": 2.4281694889068604, |
|
"learning_rate": 1.627193346986744e-08, |
|
"loss": 0.2847, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 2.941033434650456, |
|
"grad_norm": 2.2293615341186523, |
|
"learning_rate": 1.1065234311864459e-08, |
|
"loss": 0.2952, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 2.953191489361702, |
|
"grad_norm": 2.4415087699890137, |
|
"learning_rate": 6.858502254981081e-09, |
|
"loss": 0.2756, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 2.9653495440729483, |
|
"grad_norm": 2.482046604156494, |
|
"learning_rate": 3.652580483956558e-09, |
|
"loss": 0.3162, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 2.977507598784195, |
|
"grad_norm": 2.2831978797912598, |
|
"learning_rate": 1.4481115841230574e-09, |
|
"loss": 0.2904, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.989665653495441, |
|
"grad_norm": 2.147287368774414, |
|
"learning_rate": 2.4553741260535667e-10, |
|
"loss": 0.3062, |
|
"step": 2460 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2466, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.0307400421054874e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|