{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.998269896193771,
  "eval_steps": 500,
  "global_step": 2889,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01730103806228374,
      "grad_norm": 7.182409286499023,
      "learning_rate": 1.3793103448275863e-05,
      "loss": 0.9301,
      "step": 10
    },
    {
      "epoch": 0.03460207612456748,
      "grad_norm": 5.011022090911865,
      "learning_rate": 2.7586206896551727e-05,
      "loss": 0.6074,
      "step": 20
    },
    {
      "epoch": 0.05190311418685121,
      "grad_norm": 2.866461992263794,
      "learning_rate": 4.1379310344827587e-05,
      "loss": 0.3194,
      "step": 30
    },
    {
      "epoch": 0.06920415224913495,
      "grad_norm": 1.375663161277771,
      "learning_rate": 5.517241379310345e-05,
      "loss": 0.2251,
      "step": 40
    },
    {
      "epoch": 0.08650519031141868,
      "grad_norm": 2.0214180946350098,
      "learning_rate": 6.896551724137931e-05,
      "loss": 0.1793,
      "step": 50
    },
    {
      "epoch": 0.10380622837370242,
      "grad_norm": 1.3289449214935303,
      "learning_rate": 8.275862068965517e-05,
      "loss": 0.1596,
      "step": 60
    },
    {
      "epoch": 0.12110726643598616,
      "grad_norm": 1.1882271766662598,
      "learning_rate": 9.655172413793105e-05,
      "loss": 0.1548,
      "step": 70
    },
    {
      "epoch": 0.1384083044982699,
      "grad_norm": 1.2069551944732666,
      "learning_rate": 0.0001103448275862069,
      "loss": 0.1448,
      "step": 80
    },
    {
      "epoch": 0.15570934256055363,
      "grad_norm": 0.7576473355293274,
      "learning_rate": 0.00012413793103448277,
      "loss": 0.119,
      "step": 90
    },
    {
      "epoch": 0.17301038062283736,
      "grad_norm": 1.3533903360366821,
      "learning_rate": 0.00013793103448275863,
      "loss": 0.1164,
      "step": 100
    },
    {
      "epoch": 0.1903114186851211,
      "grad_norm": 1.136509656906128,
      "learning_rate": 0.00015172413793103449,
      "loss": 0.1099,
      "step": 110
    },
    {
      "epoch": 0.20761245674740483,
      "grad_norm": 1.2311946153640747,
      "learning_rate": 0.00016551724137931035,
      "loss": 0.1293,
      "step": 120
    },
    {
      "epoch": 0.22491349480968859,
      "grad_norm": 1.382206678390503,
      "learning_rate": 0.0001793103448275862,
      "loss": 0.1049,
      "step": 130
    },
    {
      "epoch": 0.2422145328719723,
      "grad_norm": 1.087380290031433,
      "learning_rate": 0.0001931034482758621,
      "loss": 0.1011,
      "step": 140
    },
    {
      "epoch": 0.25951557093425603,
      "grad_norm": 1.564366340637207,
      "learning_rate": 0.0001999983615229662,
      "loss": 0.1222,
      "step": 150
    },
    {
      "epoch": 0.2768166089965398,
      "grad_norm": 0.7164430618286133,
      "learning_rate": 0.00019998525402884653,
      "loss": 0.1028,
      "step": 160
    },
    {
      "epoch": 0.29411764705882354,
      "grad_norm": 0.96863853931427,
      "learning_rate": 0.0001999590407586994,
      "loss": 0.0966,
      "step": 170
    },
    {
      "epoch": 0.31141868512110726,
      "grad_norm": 0.5991038084030151,
      "learning_rate": 0.000199919725148484,
      "loss": 0.0912,
      "step": 180
    },
    {
      "epoch": 0.328719723183391,
      "grad_norm": 0.7860184907913208,
      "learning_rate": 0.00019986731235157592,
      "loss": 0.0987,
      "step": 190
    },
    {
      "epoch": 0.3460207612456747,
      "grad_norm": 1.4145984649658203,
      "learning_rate": 0.00019980180923809214,
      "loss": 0.1068,
      "step": 200
    },
    {
      "epoch": 0.3633217993079585,
      "grad_norm": 0.9435092806816101,
      "learning_rate": 0.00019972322439399,
      "loss": 0.0907,
      "step": 210
    },
    {
      "epoch": 0.3806228373702422,
      "grad_norm": 0.521682858467102,
      "learning_rate": 0.00019963156811994215,
      "loss": 0.0955,
      "step": 220
    },
    {
      "epoch": 0.39792387543252594,
      "grad_norm": 0.5504732131958008,
      "learning_rate": 0.0001995268524299861,
      "loss": 0.0821,
      "step": 230
    },
    {
      "epoch": 0.41522491349480967,
      "grad_norm": 0.867936909198761,
      "learning_rate": 0.00019940909104994973,
      "loss": 0.073,
      "step": 240
    },
    {
      "epoch": 0.43252595155709345,
      "grad_norm": 0.9123047590255737,
      "learning_rate": 0.00019927829941565186,
      "loss": 0.0746,
      "step": 250
    },
    {
      "epoch": 0.44982698961937717,
      "grad_norm": 0.5221778750419617,
      "learning_rate": 0.00019913449467087916,
      "loss": 0.0718,
      "step": 260
    },
    {
      "epoch": 0.4671280276816609,
      "grad_norm": 0.7056018114089966,
      "learning_rate": 0.00019897769566513897,
      "loss": 0.0649,
      "step": 270
    },
    {
      "epoch": 0.4844290657439446,
      "grad_norm": 0.8671623468399048,
      "learning_rate": 0.00019880792295118852,
      "loss": 0.085,
      "step": 280
    },
    {
      "epoch": 0.5017301038062284,
      "grad_norm": 0.9353134632110596,
      "learning_rate": 0.00019862519878234084,
      "loss": 0.0635,
      "step": 290
    },
    {
      "epoch": 0.5190311418685121,
      "grad_norm": 0.669700026512146,
      "learning_rate": 0.00019842954710954812,
      "loss": 0.0704,
      "step": 300
    },
    {
      "epoch": 0.5363321799307958,
      "grad_norm": 0.6092180013656616,
      "learning_rate": 0.000198220993578262,
      "loss": 0.0616,
      "step": 310
    },
    {
      "epoch": 0.5536332179930796,
      "grad_norm": 0.44123342633247375,
      "learning_rate": 0.00019799956552507233,
      "loss": 0.0746,
      "step": 320
    },
    {
      "epoch": 0.5709342560553633,
      "grad_norm": 0.9491882920265198,
      "learning_rate": 0.00019776529197412362,
      "loss": 0.0814,
      "step": 330
    },
    {
      "epoch": 0.5882352941176471,
      "grad_norm": 0.7369952201843262,
      "learning_rate": 0.00019751820363331097,
      "loss": 0.0629,
      "step": 340
    },
    {
      "epoch": 0.6055363321799307,
      "grad_norm": 0.5006421208381653,
      "learning_rate": 0.00019725833289025476,
      "loss": 0.0616,
      "step": 350
    },
    {
      "epoch": 0.6228373702422145,
      "grad_norm": 0.7486262917518616,
      "learning_rate": 0.00019698571380805552,
      "loss": 0.0658,
      "step": 360
    },
    {
      "epoch": 0.6401384083044983,
      "grad_norm": 0.4780927896499634,
      "learning_rate": 0.00019670038212082886,
      "loss": 0.0492,
      "step": 370
    },
    {
      "epoch": 0.657439446366782,
      "grad_norm": 0.6117326617240906,
      "learning_rate": 0.00019640237522902174,
      "loss": 0.0603,
      "step": 380
    },
    {
      "epoch": 0.6747404844290658,
      "grad_norm": 0.4989817440509796,
      "learning_rate": 0.00019609173219450998,
      "loss": 0.0635,
      "step": 390
    },
    {
      "epoch": 0.6920415224913494,
      "grad_norm": 0.32066667079925537,
      "learning_rate": 0.0001957684937354782,
      "loss": 0.0462,
      "step": 400
    },
    {
      "epoch": 0.7093425605536332,
      "grad_norm": 0.7751433253288269,
      "learning_rate": 0.00019543270222108268,
      "loss": 0.0556,
      "step": 410
    },
    {
      "epoch": 0.726643598615917,
      "grad_norm": 0.4977259635925293,
      "learning_rate": 0.00019508440166589753,
      "loss": 0.0651,
      "step": 420
    },
    {
      "epoch": 0.7439446366782007,
      "grad_norm": 0.6962701082229614,
      "learning_rate": 0.00019472363772414563,
      "loss": 0.0554,
      "step": 430
    },
    {
      "epoch": 0.7612456747404844,
      "grad_norm": 0.5899763107299805,
      "learning_rate": 0.00019435045768371415,
      "loss": 0.0536,
      "step": 440
    },
    {
      "epoch": 0.7785467128027682,
      "grad_norm": 0.6964747905731201,
      "learning_rate": 0.00019396491045995648,
      "loss": 0.0677,
      "step": 450
    },
    {
      "epoch": 0.7958477508650519,
      "grad_norm": 0.59555584192276,
      "learning_rate": 0.00019356704658928035,
      "loss": 0.0515,
      "step": 460
    },
    {
      "epoch": 0.8131487889273357,
      "grad_norm": 0.5397696495056152,
      "learning_rate": 0.00019315691822252362,
      "loss": 0.0587,
      "step": 470
    },
    {
      "epoch": 0.8304498269896193,
      "grad_norm": 0.4341135025024414,
      "learning_rate": 0.0001927345791181187,
      "loss": 0.0512,
      "step": 480
    },
    {
      "epoch": 0.8477508650519031,
      "grad_norm": 0.3942558467388153,
      "learning_rate": 0.00019230008463504595,
      "loss": 0.0494,
      "step": 490
    },
    {
      "epoch": 0.8650519031141869,
      "grad_norm": 0.47972944378852844,
      "learning_rate": 0.00019185349172557724,
      "loss": 0.0439,
      "step": 500
    },
    {
      "epoch": 0.8823529411764706,
      "grad_norm": 0.6018040180206299,
      "learning_rate": 0.00019139485892781118,
      "loss": 0.05,
      "step": 510
    },
    {
      "epoch": 0.8996539792387543,
      "grad_norm": 0.7354676127433777,
      "learning_rate": 0.00019092424635799962,
      "loss": 0.0418,
      "step": 520
    },
    {
      "epoch": 0.916955017301038,
      "grad_norm": 0.5309942960739136,
      "learning_rate": 0.0001904417157026683,
      "loss": 0.0474,
      "step": 530
    },
    {
      "epoch": 0.9342560553633218,
      "grad_norm": 0.4427996873855591,
      "learning_rate": 0.00018994733021053076,
      "loss": 0.0615,
      "step": 540
    },
    {
      "epoch": 0.9515570934256056,
      "grad_norm": 0.47377070784568787,
      "learning_rate": 0.00018944115468419809,
      "loss": 0.0426,
      "step": 550
    },
    {
      "epoch": 0.9688581314878892,
      "grad_norm": 0.4655269682407379,
      "learning_rate": 0.00018892325547168473,
      "loss": 0.0511,
      "step": 560
    },
    {
      "epoch": 0.986159169550173,
      "grad_norm": 0.6078908443450928,
      "learning_rate": 0.00018839370045771182,
      "loss": 0.061,
      "step": 570
    },
    {
      "epoch": 1.0034602076124568,
      "grad_norm": 0.6679680943489075,
      "learning_rate": 0.00018785255905480897,
      "loss": 0.0539,
      "step": 580
    },
    {
      "epoch": 1.0207612456747406,
      "grad_norm": 0.752947986125946,
      "learning_rate": 0.00018729990219421594,
      "loss": 0.0415,
      "step": 590
    },
    {
      "epoch": 1.0380622837370241,
      "grad_norm": 0.7484295964241028,
      "learning_rate": 0.0001867358023165851,
      "loss": 0.0506,
      "step": 600
    },
    {
      "epoch": 1.055363321799308,
      "grad_norm": 0.6325497031211853,
      "learning_rate": 0.00018616033336248632,
      "loss": 0.0521,
      "step": 610
    },
    {
      "epoch": 1.0726643598615917,
      "grad_norm": 0.6303852796554565,
      "learning_rate": 0.00018557357076271475,
      "loss": 0.0566,
      "step": 620
    },
    {
      "epoch": 1.0899653979238755,
      "grad_norm": 0.49704891443252563,
      "learning_rate": 0.0001849755914284039,
      "loss": 0.0432,
      "step": 630
    },
    {
      "epoch": 1.1072664359861593,
      "grad_norm": 0.5623544454574585,
      "learning_rate": 0.00018436647374094406,
      "loss": 0.0377,
      "step": 640
    },
    {
      "epoch": 1.1245674740484428,
      "grad_norm": 0.35646283626556396,
      "learning_rate": 0.00018374629754170854,
      "loss": 0.0477,
      "step": 650
    },
    {
      "epoch": 1.1418685121107266,
      "grad_norm": 0.5910556316375732,
      "learning_rate": 0.00018311514412158806,
      "loss": 0.0447,
      "step": 660
    },
    {
      "epoch": 1.1591695501730104,
      "grad_norm": 0.6531932353973389,
      "learning_rate": 0.0001824730962103356,
      "loss": 0.0397,
      "step": 670
    },
    {
      "epoch": 1.1764705882352942,
      "grad_norm": 0.5327192544937134,
      "learning_rate": 0.0001818202379657222,
      "loss": 0.0609,
      "step": 680
    },
    {
      "epoch": 1.193771626297578,
      "grad_norm": 0.5592876076698303,
      "learning_rate": 0.0001811566549625061,
      "loss": 0.0445,
      "step": 690
    },
    {
      "epoch": 1.2110726643598615,
      "grad_norm": 0.5012805461883545,
      "learning_rate": 0.00018048243418121551,
      "loss": 0.0467,
      "step": 700
    },
    {
      "epoch": 1.2283737024221453,
      "grad_norm": 0.6008819341659546,
      "learning_rate": 0.00017979766399674776,
      "loss": 0.0475,
      "step": 710
    },
    {
      "epoch": 1.245674740484429,
      "grad_norm": 0.3577163815498352,
      "learning_rate": 0.00017910243416678512,
      "loss": 0.0407,
      "step": 720
    },
    {
      "epoch": 1.2629757785467128,
      "grad_norm": 0.3694828152656555,
      "learning_rate": 0.00017839683582002982,
      "loss": 0.0498,
      "step": 730
    },
    {
      "epoch": 1.2802768166089966,
      "grad_norm": 0.3278524577617645,
      "learning_rate": 0.00017768096144425902,
      "loss": 0.0392,
      "step": 740
    },
    {
      "epoch": 1.2975778546712804,
      "grad_norm": 0.5506361126899719,
      "learning_rate": 0.00017695490487420194,
      "loss": 0.043,
      "step": 750
    },
    {
      "epoch": 1.314878892733564,
      "grad_norm": 0.3072315752506256,
      "learning_rate": 0.0001762187612792401,
      "loss": 0.0379,
      "step": 760
    },
    {
      "epoch": 1.3321799307958477,
      "grad_norm": 0.6110448837280273,
      "learning_rate": 0.00017547262715093291,
      "loss": 0.0475,
      "step": 770
    },
    {
      "epoch": 1.3494809688581315,
      "grad_norm": 0.3272629678249359,
      "learning_rate": 0.00017471660029036987,
      "loss": 0.0471,
      "step": 780
    },
    {
      "epoch": 1.3667820069204153,
      "grad_norm": 0.7174297571182251,
      "learning_rate": 0.00017395077979535088,
      "loss": 0.0386,
      "step": 790
    },
    {
      "epoch": 1.3840830449826989,
      "grad_norm": 0.5086835026741028,
      "learning_rate": 0.00017317526604739708,
      "loss": 0.0508,
      "step": 800
    },
    {
      "epoch": 1.4013840830449826,
      "grad_norm": 0.3599720597267151,
      "learning_rate": 0.0001723901606985929,
      "loss": 0.0377,
      "step": 810
    },
    {
      "epoch": 1.4186851211072664,
      "grad_norm": 0.4809829294681549,
      "learning_rate": 0.00017159556665826195,
      "loss": 0.0575,
      "step": 820
    },
    {
      "epoch": 1.4359861591695502,
      "grad_norm": 0.5327945947647095,
      "learning_rate": 0.0001707915880794778,
      "loss": 0.0427,
      "step": 830
    },
    {
      "epoch": 1.453287197231834,
      "grad_norm": 0.6132643818855286,
      "learning_rate": 0.0001699783303454121,
      "loss": 0.0379,
      "step": 840
    },
    {
      "epoch": 1.4705882352941178,
      "grad_norm": 0.44660812616348267,
      "learning_rate": 0.00016915590005552118,
      "loss": 0.0395,
      "step": 850
    },
    {
      "epoch": 1.4878892733564013,
      "grad_norm": 0.7750672698020935,
      "learning_rate": 0.00016832440501157313,
      "loss": 0.0433,
      "step": 860
    },
    {
      "epoch": 1.505190311418685,
      "grad_norm": 0.3744714856147766,
      "learning_rate": 0.0001674839542035178,
      "loss": 0.0398,
      "step": 870
    },
    {
      "epoch": 1.5224913494809689,
      "grad_norm": 0.4120178818702698,
      "learning_rate": 0.0001666346577952004,
      "loss": 0.0393,
      "step": 880
    },
    {
      "epoch": 1.5397923875432526,
      "grad_norm": 0.5227100849151611,
      "learning_rate": 0.00016577662710992174,
      "loss": 0.0441,
      "step": 890
    },
    {
      "epoch": 1.5570934256055362,
      "grad_norm": 0.6994874477386475,
      "learning_rate": 0.00016490997461584617,
      "loss": 0.0388,
      "step": 900
    },
    {
      "epoch": 1.57439446366782,
      "grad_norm": 0.5372064113616943,
      "learning_rate": 0.00016403481391125973,
      "loss": 0.0379,
      "step": 910
    },
    {
      "epoch": 1.5916955017301038,
      "grad_norm": 0.3524293601512909,
      "learning_rate": 0.00016315125970967978,
      "loss": 0.0477,
      "step": 920
    },
    {
      "epoch": 1.6089965397923875,
      "grad_norm": 0.40851327776908875,
      "learning_rate": 0.000162259427824819,
      "loss": 0.0522,
      "step": 930
    },
    {
      "epoch": 1.6262975778546713,
      "grad_norm": 0.3339862525463104,
      "learning_rate": 0.00016135943515540455,
      "loss": 0.0341,
      "step": 940
    },
    {
      "epoch": 1.643598615916955,
      "grad_norm": 0.5174134373664856,
      "learning_rate": 0.0001604513996698556,
      "loss": 0.0402,
      "step": 950
    },
    {
      "epoch": 1.6608996539792389,
      "grad_norm": 0.3898767828941345,
      "learning_rate": 0.00015953544039082012,
      "loss": 0.0451,
      "step": 960
    },
    {
      "epoch": 1.6782006920415224,
      "grad_norm": 0.5794402956962585,
      "learning_rate": 0.00015861167737957397,
      "loss": 0.0327,
      "step": 970
    },
    {
      "epoch": 1.6955017301038062,
      "grad_norm": 0.42949485778808594,
      "learning_rate": 0.00015768023172028342,
      "loss": 0.0478,
      "step": 980
    },
    {
      "epoch": 1.71280276816609,
      "grad_norm": 0.48415741324424744,
      "learning_rate": 0.00015674122550413396,
      "loss": 0.0389,
      "step": 990
    },
    {
      "epoch": 1.7301038062283736,
      "grad_norm": 0.4169400632381439,
      "learning_rate": 0.00015579478181332684,
      "loss": 0.0373,
      "step": 1000
    },
    {
      "epoch": 1.7474048442906573,
      "grad_norm": 0.30931493639945984,
      "learning_rate": 0.00015484102470494576,
      "loss": 0.0498,
      "step": 1010
    },
    {
      "epoch": 1.7647058823529411,
      "grad_norm": 0.4326338469982147,
      "learning_rate": 0.00015388007919469603,
      "loss": 0.033,
      "step": 1020
    },
    {
      "epoch": 1.782006920415225,
      "grad_norm": 0.37110018730163574,
      "learning_rate": 0.0001529120712405177,
      "loss": 0.0391,
      "step": 1030
    },
    {
      "epoch": 1.7993079584775087,
      "grad_norm": 0.33888983726501465,
      "learning_rate": 0.00015193712772607537,
      "loss": 0.036,
      "step": 1040
    },
    {
      "epoch": 1.8166089965397925,
      "grad_norm": 0.2806644141674042,
      "learning_rate": 0.0001509553764441267,
      "loss": 0.0308,
      "step": 1050
    },
    {
      "epoch": 1.8339100346020762,
      "grad_norm": 0.3501754105091095,
      "learning_rate": 0.00014996694607977176,
      "loss": 0.0415,
      "step": 1060
    },
    {
      "epoch": 1.85121107266436,
      "grad_norm": 0.37800803780555725,
      "learning_rate": 0.00014897196619358526,
      "loss": 0.032,
      "step": 1070
    },
    {
      "epoch": 1.8685121107266436,
      "grad_norm": 0.4112405478954315,
      "learning_rate": 0.0001479705672046341,
      "loss": 0.0362,
      "step": 1080
    },
    {
      "epoch": 1.8858131487889274,
      "grad_norm": 0.42961543798446655,
      "learning_rate": 0.00014696288037338256,
      "loss": 0.0408,
      "step": 1090
    },
    {
      "epoch": 1.903114186851211,
      "grad_norm": 0.22437059879302979,
      "learning_rate": 0.00014594903778448705,
      "loss": 0.0375,
      "step": 1100
    },
    {
      "epoch": 1.9204152249134947,
      "grad_norm": 0.3606659770011902,
      "learning_rate": 0.00014492917232948263,
      "loss": 0.0309,
      "step": 1110
    },
    {
      "epoch": 1.9377162629757785,
      "grad_norm": 0.45117369294166565,
      "learning_rate": 0.00014390341768936413,
      "loss": 0.0319,
      "step": 1120
    },
    {
      "epoch": 1.9550173010380623,
      "grad_norm": 0.4739942252635956,
      "learning_rate": 0.00014287190831706372,
      "loss": 0.032,
      "step": 1130
    },
    {
      "epoch": 1.972318339100346,
      "grad_norm": 0.3133346140384674,
      "learning_rate": 0.00014183477941982704,
      "loss": 0.0279,
      "step": 1140
    },
    {
      "epoch": 1.9896193771626298,
      "grad_norm": 0.3473515212535858,
      "learning_rate": 0.00014079216694149076,
      "loss": 0.0326,
      "step": 1150
    },
    {
      "epoch": 2.0069204152249136,
      "grad_norm": 0.3447270095348358,
      "learning_rate": 0.00013974420754466328,
      "loss": 0.0291,
      "step": 1160
    },
    {
      "epoch": 2.0242214532871974,
      "grad_norm": 0.36638471484184265,
      "learning_rate": 0.00013869103859281165,
      "loss": 0.0336,
      "step": 1170
    },
    {
      "epoch": 2.041522491349481,
      "grad_norm": 0.558734118938446,
      "learning_rate": 0.0001376327981322561,
      "loss": 0.0345,
      "step": 1180
    },
    {
      "epoch": 2.0588235294117645,
      "grad_norm": 0.44841858744621277,
      "learning_rate": 0.0001365696248740756,
      "loss": 0.0377,
      "step": 1190
    },
    {
      "epoch": 2.0761245674740483,
      "grad_norm": 0.6627900004386902,
      "learning_rate": 0.0001355016581759257,
      "loss": 0.0394,
      "step": 1200
    },
    {
      "epoch": 2.093425605536332,
      "grad_norm": 0.4195360541343689,
      "learning_rate": 0.00013442903802377226,
      "loss": 0.0387,
      "step": 1210
    },
    {
      "epoch": 2.110726643598616,
      "grad_norm": 0.4721520245075226,
      "learning_rate": 0.00013335190501354227,
      "loss": 0.0363,
      "step": 1220
    },
    {
      "epoch": 2.1280276816608996,
      "grad_norm": 0.3159089684486389,
      "learning_rate": 0.000132270400332695,
      "loss": 0.03,
      "step": 1230
    },
    {
      "epoch": 2.1453287197231834,
      "grad_norm": 0.43079933524131775,
      "learning_rate": 0.00013118466574171564,
      "loss": 0.0324,
      "step": 1240
    },
    {
      "epoch": 2.162629757785467,
      "grad_norm": 0.4005158841609955,
      "learning_rate": 0.00013009484355553364,
      "loss": 0.0351,
      "step": 1250
    },
    {
      "epoch": 2.179930795847751,
      "grad_norm": 0.5328086614608765,
      "learning_rate": 0.00012900107662486857,
      "loss": 0.0255,
      "step": 1260
    },
    {
      "epoch": 2.1972318339100347,
      "grad_norm": 0.5238893628120422,
      "learning_rate": 0.00012790350831750556,
      "loss": 0.0281,
      "step": 1270
    },
    {
      "epoch": 2.2145328719723185,
      "grad_norm": 0.5240843892097473,
      "learning_rate": 0.0001268022824995032,
      "loss": 0.0368,
      "step": 1280
    },
    {
      "epoch": 2.2318339100346023,
      "grad_norm": 0.2866148054599762,
      "learning_rate": 0.0001256975435163359,
      "loss": 0.0295,
      "step": 1290
    },
    {
      "epoch": 2.2491349480968856,
      "grad_norm": 0.46996742486953735,
      "learning_rate": 0.00012458943617397344,
      "loss": 0.0331,
      "step": 1300
    },
    {
      "epoch": 2.2664359861591694,
      "grad_norm": 0.36567744612693787,
      "learning_rate": 0.00012347810571990055,
      "loss": 0.0285,
      "step": 1310
    },
    {
      "epoch": 2.283737024221453,
      "grad_norm": 0.4469076693058014,
      "learning_rate": 0.00012236369782407783,
      "loss": 0.0256,
      "step": 1320
    },
    {
      "epoch": 2.301038062283737,
      "grad_norm": 0.5179305076599121,
      "learning_rate": 0.0001212463585598481,
      "loss": 0.0367,
      "step": 1330
    },
    {
      "epoch": 2.3183391003460208,
      "grad_norm": 0.4950549602508545,
      "learning_rate": 0.00012012623438478931,
      "loss": 0.0368,
      "step": 1340
    },
    {
      "epoch": 2.3356401384083045,
      "grad_norm": 0.23521831631660461,
      "learning_rate": 0.0001190034721215176,
      "loss": 0.0341,
      "step": 1350
    },
    {
      "epoch": 2.3529411764705883,
      "grad_norm": 0.44627803564071655,
      "learning_rate": 0.00011787821893844189,
      "loss": 0.0366,
      "step": 1360
    },
    {
      "epoch": 2.370242214532872,
      "grad_norm": 0.4281276762485504,
      "learning_rate": 0.00011675062233047364,
      "loss": 0.0341,
      "step": 1370
    },
    {
      "epoch": 2.387543252595156,
      "grad_norm": 0.5153332948684692,
      "learning_rate": 0.00011562083009969366,
      "loss": 0.0302,
      "step": 1380
    },
    {
      "epoch": 2.404844290657439,
      "grad_norm": 0.3502303957939148,
      "learning_rate": 0.00011448899033597855,
      "loss": 0.0372,
      "step": 1390
    },
    {
      "epoch": 2.422145328719723,
      "grad_norm": 0.20930013060569763,
      "learning_rate": 0.00011335525139758962,
      "loss": 0.032,
      "step": 1400
    },
    {
      "epoch": 2.4394463667820068,
      "grad_norm": 0.40936240553855896,
      "learning_rate": 0.00011221976189172644,
      "loss": 0.0369,
      "step": 1410
    },
    {
      "epoch": 2.4567474048442905,
      "grad_norm": 0.33430635929107666,
      "learning_rate": 0.0001110826706550479,
      "loss": 0.0329,
      "step": 1420
    },
    {
      "epoch": 2.4740484429065743,
      "grad_norm": 0.5178155303001404,
      "learning_rate": 0.00010994412673416303,
      "loss": 0.0326,
      "step": 1430
    },
    {
      "epoch": 2.491349480968858,
      "grad_norm": 0.5524935722351074,
      "learning_rate": 0.00010880427936609455,
      "loss": 0.0259,
      "step": 1440
    },
    {
      "epoch": 2.508650519031142,
      "grad_norm": 0.32262513041496277,
      "learning_rate": 0.0001076632779587172,
      "loss": 0.0338,
      "step": 1450
    },
    {
      "epoch": 2.5259515570934257,
      "grad_norm": 0.5853790640830994,
      "learning_rate": 0.00010652127207117386,
      "loss": 0.0309,
      "step": 1460
    },
    {
      "epoch": 2.5432525951557095,
      "grad_norm": 0.45327532291412354,
      "learning_rate": 0.00010537841139427178,
      "loss": 0.0194,
      "step": 1470
    },
    {
      "epoch": 2.5605536332179932,
      "grad_norm": 0.319289892911911,
      "learning_rate": 0.00010423484573086138,
      "loss": 0.028,
      "step": 1480
    },
    {
      "epoch": 2.577854671280277,
      "grad_norm": 0.5092198848724365,
      "learning_rate": 0.00010309072497620081,
      "loss": 0.0267,
      "step": 1490
    },
    {
      "epoch": 2.595155709342561,
      "grad_norm": 0.29407837986946106,
      "learning_rate": 0.00010194619909830787,
      "loss": 0.0345,
      "step": 1500
    },
    {
      "epoch": 2.612456747404844,
      "grad_norm": 0.5686324238777161,
      "learning_rate": 0.00010080141811830277,
      "loss": 0.026,
      "step": 1510
    },
    {
      "epoch": 2.629757785467128,
      "grad_norm": 0.3031514585018158,
      "learning_rate": 9.965653209074378e-05,
      "loss": 0.027,
      "step": 1520
    },
    {
      "epoch": 2.6470588235294117,
      "grad_norm": 0.29397931694984436,
      "learning_rate": 9.851169108395842e-05,
      "loss": 0.0284,
      "step": 1530
    },
    {
      "epoch": 2.6643598615916955,
      "grad_norm": 0.2728932201862335,
      "learning_rate": 9.736704516037317e-05,
      "loss": 0.0244,
      "step": 1540
    },
    {
      "epoch": 2.6816608996539792,
      "grad_norm": 0.23780933022499084,
      "learning_rate": 9.622274435684334e-05,
      "loss": 0.0359,
      "step": 1550
    },
    {
      "epoch": 2.698961937716263,
      "grad_norm": 0.3995797336101532,
      "learning_rate": 9.507893866498714e-05,
      "loss": 0.023,
      "step": 1560
    },
    {
      "epoch": 2.716262975778547,
      "grad_norm": 0.2033807337284088,
      "learning_rate": 9.393577801152486e-05,
      "loss": 0.0232,
      "step": 1570
    },
    {
      "epoch": 2.7335640138408306,
      "grad_norm": 0.1671728491783142,
      "learning_rate": 9.279341223862705e-05,
      "loss": 0.027,
      "step": 1580
    },
    {
      "epoch": 2.750865051903114,
      "grad_norm": 0.3328063189983368,
      "learning_rate": 9.165199108427364e-05,
      "loss": 0.0308,
      "step": 1590
    },
    {
      "epoch": 2.7681660899653977,
      "grad_norm": 0.5018717646598816,
      "learning_rate": 9.051166416262673e-05,
      "loss": 0.0266,
      "step": 1600
    },
    {
      "epoch": 2.7854671280276815,
      "grad_norm": 0.44454413652420044,
      "learning_rate": 8.937258094441953e-05,
      "loss": 0.0264,
      "step": 1610
    },
    {
      "epoch": 2.8027681660899653,
      "grad_norm": 0.29583296179771423,
      "learning_rate": 8.823489073736429e-05,
      "loss": 0.029,
      "step": 1620
    },
    {
      "epoch": 2.820069204152249,
      "grad_norm": 0.2867840826511383,
      "learning_rate": 8.70987426665814e-05,
      "loss": 0.031,
      "step": 1630
    },
    {
      "epoch": 2.837370242214533,
      "grad_norm": 0.17943017184734344,
      "learning_rate": 8.596428565505245e-05,
      "loss": 0.0235,
      "step": 1640
    },
    {
      "epoch": 2.8546712802768166,
      "grad_norm": 0.3584960699081421,
      "learning_rate": 8.483166840409995e-05,
      "loss": 0.0255,
      "step": 1650
    },
    {
      "epoch": 2.8719723183391004,
      "grad_norm": 0.2367885261774063,
      "learning_rate": 8.370103937389595e-05,
      "loss": 0.0297,
      "step": 1660
    },
    {
      "epoch": 2.889273356401384,
      "grad_norm": 0.3012569546699524,
      "learning_rate": 8.257254676400237e-05,
      "loss": 0.0241,
      "step": 1670
    },
    {
      "epoch": 2.906574394463668,
      "grad_norm": 0.3492811322212219,
      "learning_rate": 8.144633849394527e-05,
      "loss": 0.0245,
      "step": 1680
    },
    {
      "epoch": 2.9238754325259517,
      "grad_norm": 0.5457948446273804,
      "learning_rate": 8.032256218382618e-05,
      "loss": 0.0417,
      "step": 1690
    },
    {
      "epoch": 2.9411764705882355,
      "grad_norm": 0.26424387097358704,
      "learning_rate": 7.920136513497232e-05,
      "loss": 0.0275,
      "step": 1700
    },
    {
      "epoch": 2.9584775086505193,
      "grad_norm": 0.27352163195610046,
      "learning_rate": 7.808289431062892e-05,
      "loss": 0.0256,
      "step": 1710
    },
    {
      "epoch": 2.9757785467128026,
      "grad_norm": 0.47394421696662903,
      "learning_rate": 7.696729631669564e-05,
      "loss": 0.0287,
      "step": 1720
    },
    {
      "epoch": 2.9930795847750864,
      "grad_norm": 0.2788919508457184,
      "learning_rate": 7.585471738250984e-05,
      "loss": 0.0258,
      "step": 1730
    },
    {
      "epoch": 3.01038062283737,
      "grad_norm": 0.46854037046432495,
      "learning_rate": 7.474530334167935e-05,
      "loss": 0.0268,
      "step": 1740
    },
    {
      "epoch": 3.027681660899654,
      "grad_norm": 0.3487790822982788,
      "learning_rate": 7.363919961296699e-05,
      "loss": 0.0207,
      "step": 1750
    },
    {
      "epoch": 3.0449826989619377,
      "grad_norm": 0.3000977337360382,
      "learning_rate": 7.253655118122948e-05,
      "loss": 0.0229,
      "step": 1760
    },
    {
      "epoch": 3.0622837370242215,
      "grad_norm": 0.2896054685115814,
      "learning_rate": 7.143750257841333e-05,
      "loss": 0.0224,
      "step": 1770
    },
    {
      "epoch": 3.0795847750865053,
      "grad_norm": 0.24790829420089722,
      "learning_rate": 7.034219786460987e-05,
      "loss": 0.021,
      "step": 1780
    },
    {
      "epoch": 3.096885813148789,
      "grad_norm": 0.3710263967514038,
      "learning_rate": 6.925078060917245e-05,
      "loss": 0.0244,
      "step": 1790
    },
    {
      "epoch": 3.114186851211073,
      "grad_norm": 0.28580549359321594,
      "learning_rate": 6.816339387189763e-05,
      "loss": 0.0298,
      "step": 1800
    },
    {
      "epoch": 3.131487889273356,
      "grad_norm": 0.2744482457637787,
      "learning_rate": 6.708018018427343e-05,
      "loss": 0.0199,
      "step": 1810
    },
    {
      "epoch": 3.14878892733564,
      "grad_norm": 0.26155564188957214,
      "learning_rate": 6.600128153079661e-05,
      "loss": 0.0166,
      "step": 1820
    },
    {
      "epoch": 3.1660899653979238,
      "grad_norm": 0.2101258784532547,
      "learning_rate": 6.492683933036183e-05,
      "loss": 0.0229,
      "step": 1830
    },
    {
      "epoch": 3.1833910034602075,
      "grad_norm": 0.37706953287124634,
      "learning_rate": 6.38569944177249e-05,
      "loss": 0.0271,
      "step": 1840
    },
    {
      "epoch": 3.2006920415224913,
      "grad_norm": 0.18298964202404022,
      "learning_rate": 6.279188702504252e-05,
      "loss": 0.0204,
      "step": 1850
    },
    {
      "epoch": 3.217993079584775,
      "grad_norm": 0.23048189282417297,
      "learning_rate": 6.173165676349103e-05,
      "loss": 0.0202,
      "step": 1860
    },
    {
      "epoch": 3.235294117647059,
      "grad_norm": 0.25672975182533264,
      "learning_rate": 6.0676442604966654e-05,
      "loss": 0.0154,
      "step": 1870
    },
    {
      "epoch": 3.2525951557093427,
      "grad_norm": 0.1890515834093094,
      "learning_rate": 5.9626382863869414e-05,
      "loss": 0.0255,
      "step": 1880
    },
    {
      "epoch": 3.2698961937716264,
      "grad_norm": 0.41655805706977844,
      "learning_rate": 5.8581615178973274e-05,
      "loss": 0.023,
      "step": 1890
    },
    {
      "epoch": 3.28719723183391,
      "grad_norm": 0.302661269903183,
      "learning_rate": 5.754227649538497e-05,
      "loss": 0.0277,
      "step": 1900
    },
    {
      "epoch": 3.304498269896194,
      "grad_norm": 0.5044915676116943,
      "learning_rate": 5.6508503046593484e-05,
      "loss": 0.0192,
      "step": 1910
    },
    {
      "epoch": 3.3217993079584773,
      "grad_norm": 0.32608142495155334,
      "learning_rate": 5.548043033661297e-05,
      "loss": 0.016,
      "step": 1920
    },
    {
      "epoch": 3.339100346020761,
      "grad_norm": 0.2057291716337204,
      "learning_rate": 5.44581931222214e-05,
      "loss": 0.0225,
      "step": 1930
    },
    {
      "epoch": 3.356401384083045,
      "grad_norm": 0.36772650480270386,
      "learning_rate": 5.3441925395297065e-05,
      "loss": 0.0152,
      "step": 1940
    },
    {
      "epoch": 3.3737024221453287,
      "grad_norm": 0.25624701380729675,
      "learning_rate": 5.243176036525499e-05,
      "loss": 0.0244,
      "step": 1950
    },
    {
      "epoch": 3.3910034602076125,
      "grad_norm": 0.2545239329338074,
      "learning_rate": 5.142783044158668e-05,
      "loss": 0.019,
      "step": 1960
    },
    {
      "epoch": 3.4083044982698962,
      "grad_norm": 0.3163670003414154,
      "learning_rate": 5.043026721650388e-05,
      "loss": 0.0284,
      "step": 1970
    },
    {
      "epoch": 3.42560553633218,
      "grad_norm": 0.18014021217823029,
      "learning_rate": 4.943920144769013e-05,
      "loss": 0.0245,
      "step": 1980
    },
    {
      "epoch": 3.442906574394464,
      "grad_norm": 0.3292304575443268,
      "learning_rate": 4.845476304116132e-05,
      "loss": 0.0198,
      "step": 1990
    },
    {
      "epoch": 3.4602076124567476,
      "grad_norm": 0.30157071352005005,
      "learning_rate": 4.74770810342379e-05,
      "loss": 0.0215,
      "step": 2000
    },
    {
      "epoch": 3.477508650519031,
      "grad_norm": 0.1187843605875969,
      "learning_rate": 4.650628357863113e-05,
      "loss": 0.0209,
      "step": 2010
    },
    {
      "epoch": 3.4948096885813147,
      "grad_norm": 0.17467384040355682,
      "learning_rate": 4.5542497923645456e-05,
      "loss": 0.0176,
      "step": 2020
    },
    {
      "epoch": 3.5121107266435985,
      "grad_norm": 0.3263590335845947,
      "learning_rate": 4.458585039949874e-05,
      "loss": 0.019,
      "step": 2030
    },
    {
      "epoch": 3.5294117647058822,
      "grad_norm": 0.3304053544998169,
      "learning_rate": 4.363646640076355e-05,
      "loss": 0.0197,
      "step": 2040
    },
    {
      "epoch": 3.546712802768166,
      "grad_norm": 0.23609571158885956,
      "learning_rate": 4.2694470369930697e-05,
      "loss": 0.0173,
      "step": 2050
    },
    {
      "epoch": 3.56401384083045,
      "grad_norm": 0.13446663320064545,
      "learning_rate": 4.175998578109756e-05,
      "loss": 0.019,
      "step": 2060
    },
    {
      "epoch": 3.5813148788927336,
      "grad_norm": 0.23198455572128296,
      "learning_rate": 4.0833135123783683e-05,
      "loss": 0.0226,
      "step": 2070
    },
    {
      "epoch": 3.5986159169550174,
      "grad_norm": 0.2981939911842346,
      "learning_rate": 3.991403988687499e-05,
      "loss": 0.0203,
      "step": 2080
    },
    {
      "epoch": 3.615916955017301,
      "grad_norm": 0.18665547668933868,
      "learning_rate": 3.900282054269954e-05,
      "loss": 0.0199,
      "step": 2090
    },
    {
      "epoch": 3.633217993079585,
      "grad_norm": 0.25699618458747864,
      "learning_rate": 3.8099596531236357e-05,
      "loss": 0.0163,
      "step": 2100
    },
    {
      "epoch": 3.6505190311418687,
      "grad_norm": 0.35227668285369873,
      "learning_rate": 3.7204486244459334e-05,
      "loss": 0.022,
      "step": 2110
    },
    {
      "epoch": 3.6678200692041525,
      "grad_norm": 0.4024980664253235,
      "learning_rate": 3.631760701081913e-05,
      "loss": 0.0278,
      "step": 2120
    },
    {
      "epoch": 3.685121107266436,
      "grad_norm": 0.37001362442970276,
      "learning_rate": 3.5439075079863913e-05,
      "loss": 0.0243,
      "step": 2130
    },
    {
      "epoch": 3.7024221453287196,
      "grad_norm": 0.3014512360095978,
      "learning_rate": 3.456900560700158e-05,
      "loss": 0.0154,
      "step": 2140
    },
    {
      "epoch": 3.7197231833910034,
      "grad_norm": 0.24404117465019226,
      "learning_rate": 3.370751263840581e-05,
      "loss": 0.0165,
      "step": 2150
    },
    {
      "epoch": 3.737024221453287,
      "grad_norm": 0.16157107055187225,
      "learning_rate": 3.285470909606696e-05,
      "loss": 0.0214,
      "step": 2160
    },
    {
      "epoch": 3.754325259515571,
      "grad_norm": 0.2038198709487915,
      "learning_rate": 3.2010706762990736e-05,
      "loss": 0.0146,
      "step": 2170
    },
    {
      "epoch": 3.7716262975778547,
      "grad_norm": 0.21232356131076813,
      "learning_rate": 3.117561626854601e-05,
      "loss": 0.0127,
      "step": 2180
    },
    {
      "epoch": 3.7889273356401385,
      "grad_norm": 0.32062217593193054,
      "learning_rate": 3.0349547073963693e-05,
      "loss": 0.0193,
      "step": 2190
    },
    {
      "epoch": 3.8062283737024223,
      "grad_norm": 0.1892966628074646,
      "learning_rate": 2.953260745798898e-05,
      "loss": 0.0196,
      "step": 2200
    },
    {
      "epoch": 3.8235294117647056,
      "grad_norm": 0.23055952787399292,
      "learning_rate": 2.8724904502688566e-05,
      "loss": 0.0197,
      "step": 2210
    },
    {
      "epoch": 3.8408304498269894,
      "grad_norm": 0.25780200958251953,
      "learning_rate": 2.792654407941444e-05,
      "loss": 0.0186,
      "step": 2220
    },
    {
      "epoch": 3.858131487889273,
      "grad_norm": 0.4138200283050537,
      "learning_rate": 2.7137630834926788e-05,
      "loss": 0.0207,
      "step": 2230
    },
    {
      "epoch": 3.875432525951557,
      "grad_norm": 0.31212252378463745,
      "learning_rate": 2.635826817767708e-05,
      "loss": 0.0194,
      "step": 2240
    },
    {
      "epoch": 3.8927335640138407,
      "grad_norm": 0.23651407659053802,
      "learning_rate": 2.5588558264253547e-05,
      "loss": 0.0159,
      "step": 2250
    },
    {
      "epoch": 3.9100346020761245,
      "grad_norm": 0.1716080605983734,
      "learning_rate": 2.4828601985990983e-05,
      "loss": 0.0277,
      "step": 2260
    },
    {
      "epoch": 3.9273356401384083,
      "grad_norm": 0.2992190420627594,
      "learning_rate": 2.407849895574592e-05,
      "loss": 0.0229,
      "step": 2270
    },
    {
      "epoch": 3.944636678200692,
      "grad_norm": 0.32485488057136536,
      "learning_rate": 2.3338347494839997e-05,
      "loss": 0.024,
      "step": 2280
    },
    {
      "epoch": 3.961937716262976,
      "grad_norm": 0.15837271511554718,
      "learning_rate": 2.260824462017195e-05,
      "loss": 0.0179,
      "step": 2290
    },
    {
      "epoch": 3.9792387543252596,
      "grad_norm": 0.18397711217403412,
      "learning_rate": 2.1888286031501216e-05,
      "loss": 0.0195,
      "step": 2300
    },
    {
      "epoch": 3.9965397923875434,
      "grad_norm": 0.30589598417282104,
      "learning_rate": 2.1178566098903674e-05,
      "loss": 0.0165,
      "step": 2310
    },
    {
      "epoch": 4.013840830449827,
      "grad_norm": 0.20331960916519165,
      "learning_rate": 2.047917785040202e-05,
      "loss": 0.0173,
      "step": 2320
    },
    {
      "epoch": 4.031141868512111,
      "grad_norm": 0.17372752726078033,
      "learning_rate": 1.9790212959771815e-05,
      "loss": 0.0153,
      "step": 2330
    },
    {
      "epoch": 4.048442906574395,
      "grad_norm": 0.45032989978790283,
      "learning_rate": 1.911176173452529e-05,
      "loss": 0.0183,
      "step": 2340
    },
    {
      "epoch": 4.0657439446366785,
      "grad_norm": 0.381754994392395,
      "learning_rate": 1.8443913104073983e-05,
      "loss": 0.0191,
      "step": 2350
    },
    {
      "epoch": 4.083044982698962,
      "grad_norm": 0.2898407578468323,
      "learning_rate": 1.7786754608072154e-05,
      "loss": 0.0164,
      "step": 2360
    },
    {
      "epoch": 4.100346020761246,
      "grad_norm": 0.32434627413749695,
      "learning_rate": 1.7140372384942427e-05,
      "loss": 0.0202,
      "step": 2370
    },
    {
      "epoch": 4.117647058823529,
      "grad_norm": 0.19729329645633698,
      "learning_rate": 1.6504851160584854e-05,
      "loss": 0.0135,
      "step": 2380
    },
    {
      "epoch": 4.134948096885813,
      "grad_norm": 0.18434198200702667,
      "learning_rate": 1.5880274237271442e-05,
      "loss": 0.0204,
      "step": 2390
    },
    {
      "epoch": 4.1522491349480966,
      "grad_norm": 0.2019858956336975,
      "learning_rate": 1.5266723482727075e-05,
      "loss": 0.0131,
      "step": 2400
    },
    {
      "epoch": 4.16955017301038,
      "grad_norm": 0.2418513149023056,
      "learning_rate": 1.4664279319398566e-05,
      "loss": 0.0151,
      "step": 2410
    },
    {
      "epoch": 4.186851211072664,
      "grad_norm": 0.1929568648338318,
      "learning_rate": 1.4073020713912987e-05,
      "loss": 0.0201,
      "step": 2420
    },
    {
      "epoch": 4.204152249134948,
      "grad_norm": 0.20280393958091736,
      "learning_rate": 1.349302516672717e-05,
      "loss": 0.0184,
      "step": 2430
    },
    {
      "epoch": 4.221453287197232,
      "grad_norm": 0.11360491067171097,
      "learning_rate": 1.2924368701968936e-05,
      "loss": 0.0135,
      "step": 2440
    },
    {
      "epoch": 4.2387543252595155,
      "grad_norm": 0.20200875401496887,
      "learning_rate": 1.2367125857472283e-05,
      "loss": 0.0224,
      "step": 2450
    },
    {
      "epoch": 4.256055363321799,
      "grad_norm": 0.1830940693616867,
      "learning_rate": 1.1821369675007076e-05,
      "loss": 0.0186,
      "step": 2460
    },
    {
      "epoch": 4.273356401384083,
      "grad_norm": 0.21644911170005798,
      "learning_rate": 1.1287171690704923e-05,
      "loss": 0.0142,
      "step": 2470
    },
    {
      "epoch": 4.290657439446367,
      "grad_norm": 0.2183384746313095,
      "learning_rate": 1.076460192568246e-05,
      "loss": 0.0154,
      "step": 2480
    },
    {
      "epoch": 4.307958477508651,
      "grad_norm": 0.2756160795688629,
      "learning_rate": 1.0253728876863255e-05,
      "loss": 0.0218,
      "step": 2490
    },
    {
      "epoch": 4.325259515570934,
      "grad_norm": 0.23770609498023987,
      "learning_rate": 9.754619507999286e-06,
      "loss": 0.0217,
      "step": 2500
    },
    {
      "epoch": 4.342560553633218,
      "grad_norm": 0.2103499174118042,
      "learning_rate": 9.26733924089369e-06,
      "loss": 0.0161,
      "step": 2510
    },
    {
      "epoch": 4.359861591695502,
      "grad_norm": 0.13652902841567993,
      "learning_rate": 8.791951946825305e-06,
      "loss": 0.0136,
      "step": 2520
    },
    {
      "epoch": 4.377162629757786,
      "grad_norm": 0.15833500027656555,
      "learning_rate": 8.328519938176737e-06,
      "loss": 0.0137,
      "step": 2530
    },
    {
      "epoch": 4.3944636678200695,
      "grad_norm": 0.251105934381485,
      "learning_rate": 7.877103960266574e-06,
      "loss": 0.0207,
      "step": 2540
    },
    {
      "epoch": 4.411764705882353,
      "grad_norm": 0.2944389581680298,
      "learning_rate": 7.437763183387048e-06,
      "loss": 0.0164,
      "step": 2550
    },
    {
      "epoch": 4.429065743944637,
      "grad_norm": 0.40459296107292175,
      "learning_rate": 7.010555195048241e-06,
      "loss": 0.0223,
      "step": 2560
    },
    {
      "epoch": 4.446366782006921,
      "grad_norm": 0.211450457572937,
      "learning_rate": 6.59553599242958e-06,
      "loss": 0.0169,
      "step": 2570
    },
    {
      "epoch": 4.463667820069205,
      "grad_norm": 0.22154732048511505,
      "learning_rate": 6.1927599750399634e-06,
      "loss": 0.0185,
      "step": 2580
    },
    {
      "epoch": 4.4809688581314875,
      "grad_norm": 0.15479212999343872,
      "learning_rate": 5.802279937587218e-06,
      "loss": 0.0121,
      "step": 2590
    },
    {
      "epoch": 4.498269896193771,
      "grad_norm": 0.16314175724983215,
      "learning_rate": 5.424147063057938e-06,
      "loss": 0.0208,
      "step": 2600
    },
    {
      "epoch": 4.515570934256055,
      "grad_norm": 0.20242264866828918,
      "learning_rate": 5.058410916008494e-06,
      "loss": 0.0152,
      "step": 2610
    },
    {
      "epoch": 4.532871972318339,
      "grad_norm": 0.2115008533000946,
      "learning_rate": 4.70511943606835e-06,
      "loss": 0.0156,
      "step": 2620
    },
    {
      "epoch": 4.550173010380623,
      "grad_norm": 0.10006527602672577,
      "learning_rate": 4.364318931656186e-06,
      "loss": 0.0168,
      "step": 2630
    },
    {
      "epoch": 4.567474048442906,
      "grad_norm": 0.2213827222585678,
      "learning_rate": 4.0360540739100335e-06,
      "loss": 0.0191,
      "step": 2640
    },
    {
      "epoch": 4.58477508650519,
      "grad_norm": 0.27389973402023315,
      "learning_rate": 3.7203678908318327e-06,
      "loss": 0.0188,
      "step": 2650
    },
    {
      "epoch": 4.602076124567474,
      "grad_norm": 0.18938276171684265,
      "learning_rate": 3.417301761647429e-06,
      "loss": 0.0118,
      "step": 2660
    },
    {
      "epoch": 4.619377162629758,
      "grad_norm": 0.36289745569229126,
      "learning_rate": 3.1268954113827798e-06,
      "loss": 0.0224,
      "step": 2670
    },
    {
      "epoch": 4.6366782006920415,
      "grad_norm": 0.32742252945899963,
      "learning_rate": 2.8491869056568643e-06,
      "loss": 0.0163,
      "step": 2680
    },
    {
      "epoch": 4.653979238754325,
      "grad_norm": 0.15712793171405792,
      "learning_rate": 2.5842126456921633e-06,
      "loss": 0.0141,
      "step": 2690
    },
    {
      "epoch": 4.671280276816609,
      "grad_norm": 0.18424645066261292,
      "learning_rate": 2.3320073635432984e-06,
      "loss": 0.0152,
      "step": 2700
    },
    {
      "epoch": 4.688581314878893,
      "grad_norm": 0.2283022552728653,
      "learning_rate": 2.092604117544461e-06,
      "loss": 0.0125,
      "step": 2710
    },
    {
      "epoch": 4.705882352941177,
      "grad_norm": 0.19734638929367065,
      "learning_rate": 1.8660342879761817e-06,
      "loss": 0.0165,
      "step": 2720
    },
    {
      "epoch": 4.72318339100346,
      "grad_norm": 0.2148875743150711,
      "learning_rate": 1.6523275729521615e-06,
      "loss": 0.0105,
      "step": 2730
    },
    {
      "epoch": 4.740484429065744,
      "grad_norm": 0.20573090016841888,
      "learning_rate": 1.4515119845264658e-06,
      "loss": 0.011,
      "step": 2740
    },
    {
      "epoch": 4.757785467128028,
      "grad_norm": 0.24224159121513367,
      "learning_rate": 1.2636138450218382e-06,
      "loss": 0.0172,
      "step": 2750
    },
    {
      "epoch": 4.775086505190312,
      "grad_norm": 0.10488854348659515,
      "learning_rate": 1.0886577835793831e-06,
      "loss": 0.014,
      "step": 2760
    },
    {
      "epoch": 4.7923875432525955,
      "grad_norm": 0.16592086851596832,
      "learning_rate": 9.26666732930348e-07,
      "loss": 0.0114,
      "step": 2770
    },
    {
      "epoch": 4.809688581314878,
      "grad_norm": 0.12251131981611252,
      "learning_rate": 7.776619263900387e-07,
      "loss": 0.0135,
      "step": 2780
    },
    {
      "epoch": 4.826989619377162,
      "grad_norm": 0.16953104734420776,
      "learning_rate": 6.416628950747461e-07,
      "loss": 0.0156,
      "step": 2790
    },
    {
      "epoch": 4.844290657439446,
      "grad_norm": 0.3001477122306824,
      "learning_rate": 5.186874653415718e-07,
      "loss": 0.0165,
      "step": 2800
    },
    {
      "epoch": 4.86159169550173,
      "grad_norm": 0.14295035600662231,
      "learning_rate": 4.087517564518528e-07,
      "loss": 0.0117,
      "step": 2810
    },
    {
      "epoch": 4.8788927335640135,
      "grad_norm": 0.14125655591487885,
      "learning_rate": 3.1187017845827337e-07,
      "loss": 0.0137,
      "step": 2820
    },
    {
      "epoch": 4.896193771626297,
      "grad_norm": 0.20176155865192413,
      "learning_rate": 2.2805543031604314e-07,
      "loss": 0.0217,
      "step": 2830
    },
    {
      "epoch": 4.913494809688581,
      "grad_norm": 0.24088892340660095,
      "learning_rate": 1.5731849821833954e-07,
      "loss": 0.0159,
      "step": 2840
    },
    {
      "epoch": 4.930795847750865,
      "grad_norm": 0.27673816680908203,
      "learning_rate": 9.966865415631521e-08,
      "loss": 0.0164,
      "step": 2850
    },
    {
      "epoch": 4.948096885813149,
      "grad_norm": 0.18399952352046967,
      "learning_rate": 5.5113454703692445e-08,
      "loss": 0.013,
      "step": 2860
    },
    {
      "epoch": 4.965397923875432,
      "grad_norm": 0.24507218599319458,
      "learning_rate": 2.3658740026311077e-08,
      "loss": 0.0202,
      "step": 2870
    },
    {
      "epoch": 4.982698961937716,
      "grad_norm": 0.3336757719516754,
      "learning_rate": 5.3086331166074535e-09,
      "loss": 0.0137,
      "step": 2880
    },
    {
      "epoch": 4.998269896193771,
      "step": 2889,
      "total_flos": 1.012715054916768e+17,
      "train_loss": 0.04389128585140695,
      "train_runtime": 1270.1086,
      "train_samples_per_second": 36.394,
      "train_steps_per_second": 2.275
    }
  ],
  "logging_steps": 10,
  "max_steps": 2889,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.012715054916768e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}