{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 19.944055944055943,
  "eval_steps": 500,
  "global_step": 2852,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.06993006993006994,
      "grad_norm": 30.581405639648438,
      "learning_rate": 1.3986013986013988e-05,
      "loss": 1.1014,
      "step": 10
    },
    {
      "epoch": 0.13986013986013987,
      "grad_norm": 2.1117727756500244,
      "learning_rate": 2.7972027972027976e-05,
      "loss": 0.693,
      "step": 20
    },
    {
      "epoch": 0.2097902097902098,
      "grad_norm": 1.6114064455032349,
      "learning_rate": 4.195804195804196e-05,
      "loss": 0.3677,
      "step": 30
    },
    {
      "epoch": 0.27972027972027974,
      "grad_norm": 0.6671944260597229,
      "learning_rate": 5.594405594405595e-05,
      "loss": 0.2686,
      "step": 40
    },
    {
      "epoch": 0.34965034965034963,
      "grad_norm": 0.8836167454719543,
      "learning_rate": 6.993006993006993e-05,
      "loss": 0.2072,
      "step": 50
    },
    {
      "epoch": 0.4195804195804196,
      "grad_norm": 0.974694013595581,
      "learning_rate": 8.391608391608392e-05,
      "loss": 0.1783,
      "step": 60
    },
    {
      "epoch": 0.48951048951048953,
      "grad_norm": 0.8012842535972595,
      "learning_rate": 9.790209790209791e-05,
      "loss": 0.1542,
      "step": 70
    },
    {
      "epoch": 0.5594405594405595,
      "grad_norm": 0.5671571493148804,
      "learning_rate": 0.0001118881118881119,
      "loss": 0.1271,
      "step": 80
    },
    {
      "epoch": 0.6293706293706294,
      "grad_norm": 0.6660890579223633,
      "learning_rate": 0.00012587412587412587,
      "loss": 0.1197,
      "step": 90
    },
    {
      "epoch": 0.6993006993006993,
      "grad_norm": 0.4433947503566742,
      "learning_rate": 0.00013986013986013986,
      "loss": 0.1032,
      "step": 100
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 0.36532077193260193,
      "learning_rate": 0.00015384615384615385,
      "loss": 0.1052,
      "step": 110
    },
    {
      "epoch": 0.8391608391608392,
      "grad_norm": 0.48936668038368225,
      "learning_rate": 0.00016783216783216784,
      "loss": 0.0878,
      "step": 120
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 0.6362347602844238,
      "learning_rate": 0.00018181818181818183,
      "loss": 0.0865,
      "step": 130
    },
    {
      "epoch": 0.9790209790209791,
      "grad_norm": 0.4699188470840454,
      "learning_rate": 0.00019580419580419583,
      "loss": 0.0904,
      "step": 140
    },
    {
      "epoch": 1.048951048951049,
      "grad_norm": 0.42141956090927124,
      "learning_rate": 0.00019999670507574947,
      "loss": 0.0842,
      "step": 150
    },
    {
      "epoch": 1.118881118881119,
      "grad_norm": 0.3909319043159485,
      "learning_rate": 0.00019998056719395973,
      "loss": 0.0808,
      "step": 160
    },
    {
      "epoch": 1.1888111888111887,
      "grad_norm": 0.5473119616508484,
      "learning_rate": 0.00019995098333206742,
      "loss": 0.0713,
      "step": 170
    },
    {
      "epoch": 1.2587412587412588,
      "grad_norm": 0.37541836500167847,
      "learning_rate": 0.00019990795746868583,
      "loss": 0.0644,
      "step": 180
    },
    {
      "epoch": 1.3286713286713288,
      "grad_norm": 0.34557509422302246,
      "learning_rate": 0.00019985149539018855,
      "loss": 0.0707,
      "step": 190
    },
    {
      "epoch": 1.3986013986013985,
      "grad_norm": 0.29241743683815,
      "learning_rate": 0.00019978160468993094,
      "loss": 0.0612,
      "step": 200
    },
    {
      "epoch": 1.4685314685314685,
      "grad_norm": 0.481886625289917,
      "learning_rate": 0.00019969829476722923,
      "loss": 0.065,
      "step": 210
    },
    {
      "epoch": 1.5384615384615383,
      "grad_norm": 0.30471473932266235,
      "learning_rate": 0.00019960157682609632,
      "loss": 0.0596,
      "step": 220
    },
    {
      "epoch": 1.6083916083916083,
      "grad_norm": 0.30594855546951294,
      "learning_rate": 0.00019949146387373493,
      "loss": 0.06,
      "step": 230
    },
    {
      "epoch": 1.6783216783216783,
      "grad_norm": 0.31589317321777344,
      "learning_rate": 0.00019936797071878854,
      "loss": 0.0643,
      "step": 240
    },
    {
      "epoch": 1.7482517482517483,
      "grad_norm": 0.293947696685791,
      "learning_rate": 0.00019923111396934957,
      "loss": 0.0568,
      "step": 250
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 0.25768476724624634,
      "learning_rate": 0.00019908091203072598,
      "loss": 0.0556,
      "step": 260
    },
    {
      "epoch": 1.8881118881118881,
      "grad_norm": 0.258080393075943,
      "learning_rate": 0.00019891738510296602,
      "loss": 0.0543,
      "step": 270
    },
    {
      "epoch": 1.958041958041958,
      "grad_norm": 0.27586156129837036,
      "learning_rate": 0.0001987405551781415,
      "loss": 0.051,
      "step": 280
    },
    {
      "epoch": 2.027972027972028,
      "grad_norm": 0.26486584544181824,
      "learning_rate": 0.0001985504460373903,
      "loss": 0.0521,
      "step": 290
    },
    {
      "epoch": 2.097902097902098,
      "grad_norm": 0.2695034146308899,
      "learning_rate": 0.00019834708324771797,
      "loss": 0.0531,
      "step": 300
    },
    {
      "epoch": 2.167832167832168,
      "grad_norm": 0.30032461881637573,
      "learning_rate": 0.00019813049415855964,
      "loss": 0.0541,
      "step": 310
    },
    {
      "epoch": 2.237762237762238,
      "grad_norm": 0.3316197395324707,
      "learning_rate": 0.00019790070789810145,
      "loss": 0.0458,
      "step": 320
    },
    {
      "epoch": 2.3076923076923075,
      "grad_norm": 0.3239120543003082,
      "learning_rate": 0.00019765775536936367,
      "loss": 0.0533,
      "step": 330
    },
    {
      "epoch": 2.3776223776223775,
      "grad_norm": 0.20758603513240814,
      "learning_rate": 0.00019740166924604431,
      "loss": 0.0527,
      "step": 340
    },
    {
      "epoch": 2.4475524475524475,
      "grad_norm": 0.31676483154296875,
      "learning_rate": 0.00019713248396812524,
      "loss": 0.0521,
      "step": 350
    },
    {
      "epoch": 2.5174825174825175,
      "grad_norm": 0.25248146057128906,
      "learning_rate": 0.00019685023573724037,
      "loss": 0.0518,
      "step": 360
    },
    {
      "epoch": 2.5874125874125875,
      "grad_norm": 0.26571494340896606,
      "learning_rate": 0.0001965549625118071,
      "loss": 0.045,
      "step": 370
    },
    {
      "epoch": 2.6573426573426575,
      "grad_norm": 0.2656267285346985,
      "learning_rate": 0.00019624670400192126,
      "loss": 0.0434,
      "step": 380
    },
    {
      "epoch": 2.7272727272727275,
      "grad_norm": 0.30044880509376526,
      "learning_rate": 0.00019592550166401695,
      "loss": 0.0443,
      "step": 390
    },
    {
      "epoch": 2.797202797202797,
      "grad_norm": 0.27586933970451355,
      "learning_rate": 0.00019559139869529103,
      "loss": 0.044,
      "step": 400
    },
    {
      "epoch": 2.867132867132867,
      "grad_norm": 0.34137022495269775,
      "learning_rate": 0.00019524444002789383,
      "loss": 0.0473,
      "step": 410
    },
    {
      "epoch": 2.937062937062937,
      "grad_norm": 0.2440134882926941,
      "learning_rate": 0.0001948846723228862,
      "loss": 0.0438,
      "step": 420
    },
    {
      "epoch": 3.006993006993007,
      "grad_norm": 0.23091812431812286,
      "learning_rate": 0.00019451214396396454,
      "loss": 0.0463,
      "step": 430
    },
    {
      "epoch": 3.076923076923077,
      "grad_norm": 0.15170010924339294,
      "learning_rate": 0.00019412690505095365,
      "loss": 0.0416,
      "step": 440
    },
    {
      "epoch": 3.1468531468531467,
      "grad_norm": 0.25807687640190125,
      "learning_rate": 0.00019372900739306908,
      "loss": 0.0419,
      "step": 450
    },
    {
      "epoch": 3.2167832167832167,
      "grad_norm": 0.38740265369415283,
      "learning_rate": 0.00019331850450194957,
      "loss": 0.0379,
      "step": 460
    },
    {
      "epoch": 3.2867132867132867,
      "grad_norm": 0.24318642914295197,
      "learning_rate": 0.00019289545158446045,
      "loss": 0.0391,
      "step": 470
    },
    {
      "epoch": 3.3566433566433567,
      "grad_norm": 0.2211398035287857,
      "learning_rate": 0.00019245990553526905,
      "loss": 0.043,
      "step": 480
    },
    {
      "epoch": 3.4265734265734267,
      "grad_norm": 0.3056994378566742,
      "learning_rate": 0.00019201192492919317,
      "loss": 0.0376,
      "step": 490
    },
    {
      "epoch": 3.4965034965034967,
      "grad_norm": 0.18653175234794617,
      "learning_rate": 0.00019155157001332374,
      "loss": 0.0442,
      "step": 500
    },
    {
      "epoch": 3.5664335664335667,
      "grad_norm": 0.21762531995773315,
      "learning_rate": 0.00019107890269892214,
      "loss": 0.0392,
      "step": 510
    },
    {
      "epoch": 3.6363636363636362,
      "grad_norm": 0.24727760255336761,
      "learning_rate": 0.0001905939865530944,
      "loss": 0.0413,
      "step": 520
    },
    {
      "epoch": 3.7062937062937062,
      "grad_norm": 0.34239208698272705,
      "learning_rate": 0.0001900968867902419,
      "loss": 0.0417,
      "step": 530
    },
    {
      "epoch": 3.7762237762237763,
      "grad_norm": 0.23154211044311523,
      "learning_rate": 0.0001895876702632913,
      "loss": 0.0391,
      "step": 540
    },
    {
      "epoch": 3.8461538461538463,
      "grad_norm": 0.25248488783836365,
      "learning_rate": 0.00018906640545470355,
      "loss": 0.0367,
      "step": 550
    },
    {
      "epoch": 3.916083916083916,
      "grad_norm": 0.2887849807739258,
      "learning_rate": 0.000188533162467264,
      "loss": 0.0394,
      "step": 560
    },
    {
      "epoch": 3.986013986013986,
      "grad_norm": 0.30540716648101807,
      "learning_rate": 0.0001879880130146547,
      "loss": 0.0435,
      "step": 570
    },
    {
      "epoch": 4.055944055944056,
      "grad_norm": 0.2462405562400818,
      "learning_rate": 0.0001874310304118096,
      "loss": 0.0354,
      "step": 580
    },
    {
      "epoch": 4.125874125874126,
      "grad_norm": 0.2431563138961792,
      "learning_rate": 0.00018686228956505516,
      "loss": 0.0409,
      "step": 590
    },
    {
      "epoch": 4.195804195804196,
      "grad_norm": 0.2564241886138916,
      "learning_rate": 0.00018628186696203612,
      "loss": 0.0397,
      "step": 600
    },
    {
      "epoch": 4.265734265734266,
      "grad_norm": 0.24190612137317657,
      "learning_rate": 0.00018568984066142917,
      "loss": 0.0377,
      "step": 610
    },
    {
      "epoch": 4.335664335664336,
      "grad_norm": 0.23087257146835327,
      "learning_rate": 0.00018508629028244519,
      "loss": 0.0412,
      "step": 620
    },
    {
      "epoch": 4.405594405594406,
      "grad_norm": 0.4008491039276123,
      "learning_rate": 0.00018447129699412142,
      "loss": 0.039,
      "step": 630
    },
    {
      "epoch": 4.475524475524476,
      "grad_norm": 0.27394184470176697,
      "learning_rate": 0.00018384494350440553,
      "loss": 0.038,
      "step": 640
    },
    {
      "epoch": 4.545454545454545,
      "grad_norm": 0.20550896227359772,
      "learning_rate": 0.0001832073140490325,
      "loss": 0.0404,
      "step": 650
    },
    {
      "epoch": 4.615384615384615,
      "grad_norm": 0.2519960403442383,
      "learning_rate": 0.00018255849438019608,
      "loss": 0.0378,
      "step": 660
    },
    {
      "epoch": 4.685314685314685,
      "grad_norm": 0.3346613645553589,
      "learning_rate": 0.00018189857175501635,
      "loss": 0.035,
      "step": 670
    },
    {
      "epoch": 4.755244755244755,
      "grad_norm": 0.32892099022865295,
      "learning_rate": 0.00018122763492380486,
      "loss": 0.0325,
      "step": 680
    },
    {
      "epoch": 4.825174825174825,
      "grad_norm": 0.24375475943088531,
      "learning_rate": 0.00018054577411812895,
      "loss": 0.0318,
      "step": 690
    },
    {
      "epoch": 4.895104895104895,
      "grad_norm": 0.19709643721580505,
      "learning_rate": 0.00017985308103867688,
      "loss": 0.0368,
      "step": 700
    },
    {
      "epoch": 4.965034965034965,
      "grad_norm": 0.25721314549446106,
      "learning_rate": 0.00017914964884292544,
      "loss": 0.0377,
      "step": 710
    },
    {
      "epoch": 5.034965034965035,
      "grad_norm": 0.3517259955406189,
      "learning_rate": 0.00017843557213261142,
      "loss": 0.0359,
      "step": 720
    },
    {
      "epoch": 5.104895104895105,
      "grad_norm": 0.2421354055404663,
      "learning_rate": 0.00017771094694100925,
      "loss": 0.0375,
      "step": 730
    },
    {
      "epoch": 5.174825174825175,
      "grad_norm": 0.22226205468177795,
      "learning_rate": 0.00017697587072001557,
      "loss": 0.0376,
      "step": 740
    },
    {
      "epoch": 5.244755244755245,
      "grad_norm": 0.2880263924598694,
      "learning_rate": 0.0001762304423270436,
      "loss": 0.0377,
      "step": 750
    },
    {
      "epoch": 5.314685314685315,
      "grad_norm": 0.2628781795501709,
      "learning_rate": 0.00017547476201172808,
      "loss": 0.0366,
      "step": 760
    },
    {
      "epoch": 5.384615384615385,
      "grad_norm": 0.21351908147335052,
      "learning_rate": 0.00017470893140244303,
      "loss": 0.0346,
      "step": 770
    },
    {
      "epoch": 5.454545454545454,
      "grad_norm": 0.23355896770954132,
      "learning_rate": 0.00017393305349263434,
      "loss": 0.0367,
      "step": 780
    },
    {
      "epoch": 5.524475524475524,
      "grad_norm": 0.17863404750823975,
      "learning_rate": 0.00017314723262696848,
      "loss": 0.0319,
      "step": 790
    },
    {
      "epoch": 5.594405594405594,
      "grad_norm": 0.2492302805185318,
      "learning_rate": 0.00017235157448729967,
      "loss": 0.0345,
      "step": 800
    },
    {
      "epoch": 5.664335664335664,
      "grad_norm": 0.1684318482875824,
      "learning_rate": 0.00017154618607845702,
      "loss": 0.0325,
      "step": 810
    },
    {
      "epoch": 5.734265734265734,
      "grad_norm": 0.19033940136432648,
      "learning_rate": 0.00017073117571385414,
      "loss": 0.0325,
      "step": 820
    },
    {
      "epoch": 5.804195804195804,
      "grad_norm": 0.1792939007282257,
      "learning_rate": 0.00016990665300092224,
      "loss": 0.037,
      "step": 830
    },
    {
      "epoch": 5.874125874125874,
      "grad_norm": 0.17368149757385254,
      "learning_rate": 0.00016907272882636968,
      "loss": 0.0329,
      "step": 840
    },
    {
      "epoch": 5.944055944055944,
      "grad_norm": 0.20785829424858093,
      "learning_rate": 0.0001682295153412691,
      "loss": 0.0316,
      "step": 850
    },
    {
      "epoch": 6.013986013986014,
      "grad_norm": 0.19825145602226257,
      "learning_rate": 0.00016737712594597483,
      "loss": 0.0274,
      "step": 860
    },
    {
      "epoch": 6.083916083916084,
      "grad_norm": 0.15773798525333405,
      "learning_rate": 0.00016651567527487204,
      "loss": 0.0347,
      "step": 870
    },
    {
      "epoch": 6.153846153846154,
      "grad_norm": 0.26358160376548767,
      "learning_rate": 0.00016564527918096005,
      "loss": 0.0299,
      "step": 880
    },
    {
      "epoch": 6.223776223776224,
      "grad_norm": 0.3937130272388458,
      "learning_rate": 0.00016476605472027172,
      "loss": 0.0309,
      "step": 890
    },
    {
      "epoch": 6.293706293706293,
      "grad_norm": 0.13225628435611725,
      "learning_rate": 0.00016387812013613103,
      "loss": 0.035,
      "step": 900
    },
    {
      "epoch": 6.363636363636363,
      "grad_norm": 0.19665835797786713,
      "learning_rate": 0.00016298159484325118,
      "loss": 0.0316,
      "step": 910
    },
    {
      "epoch": 6.433566433566433,
      "grad_norm": 0.21388469636440277,
      "learning_rate": 0.00016207659941167485,
      "loss": 0.0272,
      "step": 920
    },
    {
      "epoch": 6.503496503496503,
      "grad_norm": 0.47336331009864807,
      "learning_rate": 0.00016116325555055915,
      "loss": 0.0293,
      "step": 930
    },
    {
      "epoch": 6.573426573426573,
      "grad_norm": 0.1514803022146225,
      "learning_rate": 0.00016024168609180757,
      "loss": 0.0316,
      "step": 940
    },
    {
      "epoch": 6.643356643356643,
      "grad_norm": 0.20940926671028137,
      "learning_rate": 0.00015931201497355088,
      "loss": 0.0282,
      "step": 950
    },
    {
      "epoch": 6.713286713286713,
      "grad_norm": 0.22871293127536774,
      "learning_rate": 0.000158374367223479,
      "loss": 0.0349,
      "step": 960
    },
    {
      "epoch": 6.783216783216783,
      "grad_norm": 0.18799573183059692,
      "learning_rate": 0.00015742886894202674,
      "loss": 0.0295,
      "step": 970
    },
    {
      "epoch": 6.853146853146853,
      "grad_norm": 0.17343267798423767,
      "learning_rate": 0.00015647564728541485,
      "loss": 0.0299,
      "step": 980
    },
    {
      "epoch": 6.923076923076923,
      "grad_norm": 0.21930481493473053,
      "learning_rate": 0.00015551483044854954,
      "loss": 0.0285,
      "step": 990
    },
    {
      "epoch": 6.993006993006993,
      "grad_norm": 0.15558023750782013,
      "learning_rate": 0.00015454654764778187,
      "loss": 0.03,
      "step": 1000
    },
    {
      "epoch": 7.062937062937063,
      "grad_norm": 0.39069393277168274,
      "learning_rate": 0.00015357092910353001,
      "loss": 0.0263,
      "step": 1010
    },
    {
      "epoch": 7.1328671328671325,
      "grad_norm": 0.22350826859474182,
      "learning_rate": 0.00015258810602276654,
      "loss": 0.0341,
      "step": 1020
    },
    {
      "epoch": 7.2027972027972025,
      "grad_norm": 0.16752642393112183,
      "learning_rate": 0.00015159821058137278,
      "loss": 0.0279,
      "step": 1030
    },
    {
      "epoch": 7.2727272727272725,
      "grad_norm": 0.17457683384418488,
      "learning_rate": 0.00015060137590636318,
      "loss": 0.0268,
      "step": 1040
    },
    {
      "epoch": 7.3426573426573425,
      "grad_norm": 0.1841355264186859,
      "learning_rate": 0.00014959773605798145,
      "loss": 0.0248,
      "step": 1050
    },
    {
      "epoch": 7.4125874125874125,
      "grad_norm": 0.18273039162158966,
      "learning_rate": 0.0001485874260116714,
      "loss": 0.0256,
      "step": 1060
    },
    {
      "epoch": 7.4825174825174825,
      "grad_norm": 0.21696248650550842,
      "learning_rate": 0.00014757058163992464,
      "loss": 0.026,
      "step": 1070
    },
    {
      "epoch": 7.5524475524475525,
      "grad_norm": 0.23004591464996338,
      "learning_rate": 0.0001465473396940078,
      "loss": 0.0305,
      "step": 1080
    },
    {
      "epoch": 7.6223776223776225,
      "grad_norm": 0.17306514084339142,
      "learning_rate": 0.000145517837785571,
      "loss": 0.0247,
      "step": 1090
    },
    {
      "epoch": 7.6923076923076925,
      "grad_norm": 0.2152336984872818,
      "learning_rate": 0.0001444822143681415,
      "loss": 0.0274,
      "step": 1100
    },
    {
      "epoch": 7.7622377622377625,
      "grad_norm": 0.1681831181049347,
      "learning_rate": 0.00014344060871850325,
      "loss": 0.0247,
      "step": 1110
    },
    {
      "epoch": 7.8321678321678325,
      "grad_norm": 0.1374141126871109,
      "learning_rate": 0.00014239316091796647,
      "loss": 0.0285,
      "step": 1120
    },
    {
      "epoch": 7.902097902097902,
      "grad_norm": 0.3081527352333069,
      "learning_rate": 0.00014134001183352832,
      "loss": 0.0246,
      "step": 1130
    },
    {
      "epoch": 7.972027972027972,
      "grad_norm": 0.4359898865222931,
      "learning_rate": 0.0001402813030989286,
      "loss": 0.0284,
      "step": 1140
    },
    {
      "epoch": 8.041958041958042,
      "grad_norm": 0.15682591497898102,
      "learning_rate": 0.00013921717709560182,
      "loss": 0.0261,
      "step": 1150
    },
    {
      "epoch": 8.111888111888112,
      "grad_norm": 0.23213213682174683,
      "learning_rate": 0.000138147776933529,
      "loss": 0.0265,
      "step": 1160
    },
    {
      "epoch": 8.181818181818182,
      "grad_norm": 0.1675710529088974,
      "learning_rate": 0.00013707324643199114,
      "loss": 0.0297,
      "step": 1170
    },
    {
      "epoch": 8.251748251748252,
      "grad_norm": 0.1905348300933838,
      "learning_rate": 0.00013599373010022794,
      "loss": 0.0282,
      "step": 1180
    },
    {
      "epoch": 8.321678321678322,
      "grad_norm": 0.22106771171092987,
      "learning_rate": 0.0001349093731180031,
      "loss": 0.0265,
      "step": 1190
    },
    {
      "epoch": 8.391608391608392,
      "grad_norm": 0.21079835295677185,
      "learning_rate": 0.00013382032131607966,
      "loss": 0.0288,
      "step": 1200
    },
    {
      "epoch": 8.461538461538462,
      "grad_norm": 0.3102055788040161,
      "learning_rate": 0.00013272672115660796,
      "loss": 0.0226,
      "step": 1210
    },
    {
      "epoch": 8.531468531468532,
      "grad_norm": 0.16320601105690002,
      "learning_rate": 0.00013162871971342837,
      "loss": 0.026,
      "step": 1220
    },
    {
      "epoch": 8.601398601398602,
      "grad_norm": 0.2458457201719284,
      "learning_rate": 0.00013052646465229207,
      "loss": 0.0257,
      "step": 1230
    },
    {
      "epoch": 8.671328671328672,
      "grad_norm": 0.21947446465492249,
      "learning_rate": 0.00012942010421100207,
      "loss": 0.0255,
      "step": 1240
    },
    {
      "epoch": 8.741258741258742,
      "grad_norm": 0.1916748285293579,
      "learning_rate": 0.00012830978717947718,
      "loss": 0.0239,
      "step": 1250
    },
    {
      "epoch": 8.811188811188812,
      "grad_norm": 0.14072898030281067,
      "learning_rate": 0.00012719566287974204,
      "loss": 0.0261,
      "step": 1260
    },
    {
      "epoch": 8.881118881118882,
      "grad_norm": 0.1773282140493393,
      "learning_rate": 0.00012607788114584522,
      "loss": 0.023,
      "step": 1270
    },
    {
      "epoch": 8.951048951048952,
      "grad_norm": 0.13442933559417725,
      "learning_rate": 0.0001249565923037088,
      "loss": 0.0251,
      "step": 1280
    },
    {
      "epoch": 9.020979020979022,
      "grad_norm": 0.17046289145946503,
      "learning_rate": 0.00012383194715091163,
      "loss": 0.0253,
      "step": 1290
    },
    {
      "epoch": 9.090909090909092,
      "grad_norm": 0.25058501958847046,
      "learning_rate": 0.00012270409693640905,
      "loss": 0.0262,
      "step": 1300
    },
    {
      "epoch": 9.16083916083916,
      "grad_norm": 0.22269104421138763,
      "learning_rate": 0.00012157319334019219,
      "loss": 0.0235,
      "step": 1310
    },
    {
      "epoch": 9.23076923076923,
      "grad_norm": 0.25462645292282104,
      "learning_rate": 0.00012043938845288904,
      "loss": 0.0256,
      "step": 1320
    },
    {
      "epoch": 9.3006993006993,
      "grad_norm": 0.18311487138271332,
      "learning_rate": 0.00011930283475531048,
      "loss": 0.0259,
      "step": 1330
    },
    {
      "epoch": 9.37062937062937,
      "grad_norm": 0.16915184259414673,
      "learning_rate": 0.00011816368509794364,
      "loss": 0.0259,
      "step": 1340
    },
    {
      "epoch": 9.44055944055944,
      "grad_norm": 0.16743628680706024,
      "learning_rate": 0.00011702209268039581,
      "loss": 0.0229,
      "step": 1350
    },
    {
      "epoch": 9.51048951048951,
      "grad_norm": 0.16608262062072754,
      "learning_rate": 0.00011587821103079111,
      "loss": 0.0238,
      "step": 1360
    },
    {
      "epoch": 9.58041958041958,
      "grad_norm": 0.14811694622039795,
      "learning_rate": 0.00011473219398512316,
      "loss": 0.0226,
      "step": 1370
    },
    {
      "epoch": 9.65034965034965,
      "grad_norm": 0.1586376130580902,
      "learning_rate": 0.00011358419566656642,
      "loss": 0.0196,
      "step": 1380
    },
    {
      "epoch": 9.72027972027972,
      "grad_norm": 0.13793979585170746,
      "learning_rate": 0.00011243437046474853,
      "loss": 0.0209,
      "step": 1390
    },
    {
      "epoch": 9.79020979020979,
      "grad_norm": 0.207151859998703,
      "learning_rate": 0.00011128287301498739,
      "loss": 0.0235,
      "step": 1400
    },
    {
      "epoch": 9.86013986013986,
      "grad_norm": 0.48499253392219543,
      "learning_rate": 0.00011012985817749463,
      "loss": 0.0217,
      "step": 1410
    },
    {
      "epoch": 9.93006993006993,
      "grad_norm": 0.11960924416780472,
      "learning_rate": 0.00010897548101654926,
      "loss": 0.0217,
      "step": 1420
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.12925837934017181,
      "learning_rate": 0.00010781989677964355,
      "loss": 0.0207,
      "step": 1430
    },
    {
      "epoch": 10.06993006993007,
      "grad_norm": 0.1686684936285019,
      "learning_rate": 0.00010666326087660458,
      "loss": 0.0204,
      "step": 1440
    },
    {
      "epoch": 10.13986013986014,
      "grad_norm": 0.15361668169498444,
      "learning_rate": 0.00010550572885869367,
      "loss": 0.0197,
      "step": 1450
    },
    {
      "epoch": 10.20979020979021,
      "grad_norm": 0.2281733602285385,
      "learning_rate": 0.00010434745639768705,
      "loss": 0.0252,
      "step": 1460
    },
    {
      "epoch": 10.27972027972028,
      "grad_norm": 0.14611436426639557,
      "learning_rate": 0.00010318859926494014,
      "loss": 0.02,
      "step": 1470
    },
    {
      "epoch": 10.34965034965035,
      "grad_norm": 0.3116054832935333,
      "learning_rate": 0.00010202931331043839,
      "loss": 0.0217,
      "step": 1480
    },
    {
      "epoch": 10.41958041958042,
      "grad_norm": 0.19651482999324799,
      "learning_rate": 0.00010086975444183782,
      "loss": 0.0232,
      "step": 1490
    },
    {
      "epoch": 10.48951048951049,
      "grad_norm": 0.18464644253253937,
      "learning_rate": 9.971007860349756e-05,
      "loss": 0.02,
      "step": 1500
    },
    {
      "epoch": 10.55944055944056,
      "grad_norm": 0.18407182395458221,
      "learning_rate": 9.855044175550756e-05,
      "loss": 0.0232,
      "step": 1510
    },
    {
      "epoch": 10.62937062937063,
      "grad_norm": 0.12513527274131775,
      "learning_rate": 9.739099985271394e-05,
      "loss": 0.0219,
      "step": 1520
    },
    {
      "epoch": 10.6993006993007,
      "grad_norm": 0.13158084452152252,
      "learning_rate": 9.623190882374564e-05,
      "loss": 0.0224,
      "step": 1530
    },
    {
      "epoch": 10.76923076923077,
      "grad_norm": 0.17640575766563416,
      "learning_rate": 9.507332455004395e-05,
      "loss": 0.0226,
      "step": 1540
    },
    {
      "epoch": 10.83916083916084,
      "grad_norm": 0.30499282479286194,
      "learning_rate": 9.391540284489862e-05,
      "loss": 0.0216,
      "step": 1550
    },
    {
      "epoch": 10.909090909090908,
      "grad_norm": 0.1940823644399643,
      "learning_rate": 9.275829943249334e-05,
      "loss": 0.0241,
      "step": 1560
    },
    {
      "epoch": 10.979020979020978,
      "grad_norm": 0.159086212515831,
      "learning_rate": 9.160216992696286e-05,
      "loss": 0.022,
      "step": 1570
    },
    {
      "epoch": 11.048951048951048,
      "grad_norm": 0.19389483332633972,
      "learning_rate": 9.044716981146526e-05,
      "loss": 0.022,
      "step": 1580
    },
    {
      "epoch": 11.118881118881118,
      "grad_norm": 0.12246847152709961,
      "learning_rate": 8.929345441727142e-05,
      "loss": 0.0205,
      "step": 1590
    },
    {
      "epoch": 11.188811188811188,
      "grad_norm": 0.16194018721580505,
      "learning_rate": 8.814117890287538e-05,
      "loss": 0.0207,
      "step": 1600
    },
    {
      "epoch": 11.258741258741258,
      "grad_norm": 0.17201407253742218,
      "learning_rate": 8.699049823312748e-05,
      "loss": 0.0216,
      "step": 1610
    },
    {
      "epoch": 11.328671328671328,
      "grad_norm": 0.12679952383041382,
      "learning_rate": 8.584156715839401e-05,
      "loss": 0.0194,
      "step": 1620
    },
    {
      "epoch": 11.398601398601398,
      "grad_norm": 0.19168208539485931,
      "learning_rate": 8.469454019374531e-05,
      "loss": 0.016,
      "step": 1630
    },
    {
      "epoch": 11.468531468531468,
      "grad_norm": 0.20202843844890594,
      "learning_rate": 8.354957159817561e-05,
      "loss": 0.0227,
      "step": 1640
    },
    {
      "epoch": 11.538461538461538,
      "grad_norm": 0.14431844651699066,
      "learning_rate": 8.240681535385757e-05,
      "loss": 0.0177,
      "step": 1650
    },
    {
      "epoch": 11.608391608391608,
      "grad_norm": 0.16581477224826813,
      "learning_rate": 8.126642514543359e-05,
      "loss": 0.017,
      "step": 1660
    },
    {
      "epoch": 11.678321678321678,
      "grad_norm": 0.15920375287532806,
      "learning_rate": 8.012855433934765e-05,
      "loss": 0.02,
      "step": 1670
    },
    {
      "epoch": 11.748251748251748,
      "grad_norm": 0.15303653478622437,
      "learning_rate": 7.899335596321944e-05,
      "loss": 0.0207,
      "step": 1680
    },
    {
      "epoch": 11.818181818181818,
      "grad_norm": 0.15237000584602356,
      "learning_rate": 7.786098268526448e-05,
      "loss": 0.0225,
      "step": 1690
    },
    {
      "epoch": 11.888111888111888,
      "grad_norm": 0.13649679720401764,
      "learning_rate": 7.673158679376234e-05,
      "loss": 0.0204,
      "step": 1700
    },
    {
      "epoch": 11.958041958041958,
      "grad_norm": 0.13087141513824463,
      "learning_rate": 7.560532017657585e-05,
      "loss": 0.0211,
      "step": 1710
    },
    {
      "epoch": 12.027972027972028,
      "grad_norm": 0.13551278412342072,
      "learning_rate": 7.448233430072466e-05,
      "loss": 0.0192,
      "step": 1720
    },
    {
      "epoch": 12.097902097902098,
      "grad_norm": 0.19065268337726593,
      "learning_rate": 7.336278019201462e-05,
      "loss": 0.0174,
      "step": 1730
    },
    {
      "epoch": 12.167832167832168,
      "grad_norm": 0.16062477231025696,
      "learning_rate": 7.224680841472741e-05,
      "loss": 0.0209,
      "step": 1740
    },
    {
      "epoch": 12.237762237762238,
      "grad_norm": 0.3261907696723938,
      "learning_rate": 7.113456905137132e-05,
      "loss": 0.0204,
      "step": 1750
    },
    {
      "epoch": 12.307692307692308,
      "grad_norm": 0.36956819891929626,
      "learning_rate": 7.002621168249759e-05,
      "loss": 0.0235,
      "step": 1760
    },
    {
      "epoch": 12.377622377622378,
      "grad_norm": 0.16811484098434448,
      "learning_rate": 6.892188536658369e-05,
      "loss": 0.0201,
      "step": 1770
    },
    {
      "epoch": 12.447552447552448,
      "grad_norm": 0.14103484153747559,
      "learning_rate": 6.782173861998726e-05,
      "loss": 0.0226,
      "step": 1780
    },
    {
      "epoch": 12.517482517482517,
      "grad_norm": 0.17502908408641815,
      "learning_rate": 6.672591939697261e-05,
      "loss": 0.0204,
      "step": 1790
    },
    {
      "epoch": 12.587412587412587,
      "grad_norm": 0.13043367862701416,
      "learning_rate": 6.563457506981297e-05,
      "loss": 0.0201,
      "step": 1800
    },
    {
      "epoch": 12.657342657342657,
      "grad_norm": 0.13106204569339752,
      "learning_rate": 6.454785240897112e-05,
      "loss": 0.0199,
      "step": 1810
    },
    {
      "epoch": 12.727272727272727,
      "grad_norm": 0.30531424283981323,
      "learning_rate": 6.34658975633605e-05,
      "loss": 0.0184,
      "step": 1820
    },
    {
      "epoch": 12.797202797202797,
      "grad_norm": 0.16103509068489075,
      "learning_rate": 6.238885604069075e-05,
      "loss": 0.0212,
      "step": 1830
    },
    {
      "epoch": 12.867132867132867,
      "grad_norm": 0.12883076071739197,
      "learning_rate": 6.131687268789838e-05,
      "loss": 0.0166,
      "step": 1840
    },
    {
      "epoch": 12.937062937062937,
      "grad_norm": 0.1775168776512146,
      "learning_rate": 6.0250091671667484e-05,
      "loss": 0.022,
      "step": 1850
    },
    {
      "epoch": 13.006993006993007,
      "grad_norm": 0.14147868752479553,
      "learning_rate": 5.9188656459040837e-05,
      "loss": 0.024,
      "step": 1860
    },
    {
      "epoch": 13.076923076923077,
      "grad_norm": 0.10535666346549988,
      "learning_rate": 5.813270979812589e-05,
      "loss": 0.0187,
      "step": 1870
    },
    {
      "epoch": 13.146853146853147,
      "grad_norm": 0.18340010941028595,
      "learning_rate": 5.7082393698897166e-05,
      "loss": 0.0201,
      "step": 1880
    },
    {
      "epoch": 13.216783216783217,
      "grad_norm": 0.12412499636411667,
      "learning_rate": 5.60378494140976e-05,
      "loss": 0.0189,
      "step": 1890
    },
    {
      "epoch": 13.286713286713287,
      "grad_norm": 0.21642570197582245,
      "learning_rate": 5.4999217420242576e-05,
      "loss": 0.0178,
      "step": 1900
    },
    {
      "epoch": 13.356643356643357,
      "grad_norm": 0.17489974200725555,
      "learning_rate": 5.396663739872725e-05,
      "loss": 0.0173,
      "step": 1910
    },
    {
      "epoch": 13.426573426573427,
      "grad_norm": 0.12585236132144928,
      "learning_rate": 5.294024821704172e-05,
      "loss": 0.0206,
      "step": 1920
    },
    {
      "epoch": 13.496503496503497,
      "grad_norm": 0.12454655766487122,
      "learning_rate": 5.192018791009521e-05,
      "loss": 0.0172,
      "step": 1930
    },
    {
      "epoch": 13.566433566433567,
      "grad_norm": 0.1352427452802658,
      "learning_rate": 5.090659366165227e-05,
      "loss": 0.0185,
      "step": 1940
    },
    {
      "epoch": 13.636363636363637,
      "grad_norm": 0.15033438801765442,
      "learning_rate": 4.989960178588357e-05,
      "loss": 0.0177,
      "step": 1950
    },
    {
      "epoch": 13.706293706293707,
      "grad_norm": 0.2707054018974304,
      "learning_rate": 4.889934770903336e-05,
      "loss": 0.0165,
      "step": 1960
    },
    {
      "epoch": 13.776223776223777,
      "grad_norm": 0.19302873313426971,
      "learning_rate": 4.790596595120699e-05,
      "loss": 0.0158,
      "step": 1970
    },
    {
      "epoch": 13.846153846153847,
      "grad_norm": 0.14991451799869537,
      "learning_rate": 4.6919590108279254e-05,
      "loss": 0.0191,
      "step": 1980
    },
    {
      "epoch": 13.916083916083917,
      "grad_norm": 0.1627596765756607,
      "learning_rate": 4.594035283392815e-05,
      "loss": 0.0178,
      "step": 1990
    },
    {
      "epoch": 13.986013986013987,
      "grad_norm": 0.1320677548646927,
      "learning_rate": 4.49683858217944e-05,
      "loss": 0.0169,
      "step": 2000
    },
    {
      "epoch": 14.055944055944057,
      "grad_norm": 0.25264090299606323,
      "learning_rate": 4.4003819787770964e-05,
      "loss": 0.0179,
      "step": 2010
    },
    {
      "epoch": 14.125874125874127,
      "grad_norm": 0.11774495244026184,
      "learning_rate": 4.304678445242309e-05,
      "loss": 0.0152,
      "step": 2020
    },
    {
      "epoch": 14.195804195804195,
      "grad_norm": 0.17087212204933167,
      "learning_rate": 4.209740852354313e-05,
      "loss": 0.017,
      "step": 2030
    },
    {
      "epoch": 14.265734265734265,
      "grad_norm": 0.1586153358221054,
      "learning_rate": 4.115581967884094e-05,
      "loss": 0.0199,
      "step": 2040
    },
    {
      "epoch": 14.335664335664335,
      "grad_norm": 0.15853182971477509,
      "learning_rate": 4.022214454877305e-05,
      "loss": 0.0148,
      "step": 2050
    },
    {
      "epoch": 14.405594405594405,
      "grad_norm": 0.131947860121727,
      "learning_rate": 3.929650869951278e-05,
      "loss": 0.0178,
      "step": 2060
    },
    {
      "epoch": 14.475524475524475,
      "grad_norm": 0.26362699270248413,
      "learning_rate": 3.8379036616063066e-05,
      "loss": 0.0173,
      "step": 2070
    },
    {
      "epoch": 14.545454545454545,
      "grad_norm": 0.12325582653284073,
      "learning_rate": 3.746985168551532e-05,
      "loss": 0.0167,
      "step": 2080
    },
    {
      "epoch": 14.615384615384615,
      "grad_norm": 0.1346844732761383,
      "learning_rate": 3.65690761804554e-05,
      "loss": 0.0161,
      "step": 2090
    },
    {
      "epoch": 14.685314685314685,
      "grad_norm": 0.17594227194786072,
      "learning_rate": 3.567683124251972e-05,
      "loss": 0.0198,
      "step": 2100
    },
    {
      "epoch": 14.755244755244755,
      "grad_norm": 0.11365789920091629,
      "learning_rate": 3.4793236866103294e-05,
      "loss": 0.0138,
      "step": 2110
    },
    {
      "epoch": 14.825174825174825,
      "grad_norm": 0.11247463524341583,
      "learning_rate": 3.391841188222246e-05,
      "loss": 0.0137,
      "step": 2120
    },
    {
      "epoch": 14.895104895104895,
      "grad_norm": 0.17104235291481018,
      "learning_rate": 3.305247394253349e-05,
      "loss": 0.0171,
      "step": 2130
    },
    {
      "epoch": 14.965034965034965,
      "grad_norm": 0.11906550079584122,
      "learning_rate": 3.2195539503510164e-05,
      "loss": 0.0195,
      "step": 2140
    },
    {
      "epoch": 15.034965034965035,
      "grad_norm": 0.13180206716060638,
      "learning_rate": 3.1347723810782134e-05,
      "loss": 0.0147,
      "step": 2150
    },
    {
      "epoch": 15.104895104895105,
      "grad_norm": 0.27480393648147583,
      "learning_rate": 3.05091408836359e-05,
      "loss": 0.0142,
      "step": 2160
    },
    {
      "epoch": 15.174825174825175,
      "grad_norm": 0.19437533617019653,
      "learning_rate": 2.967990349968086e-05,
      "loss": 0.0159,
      "step": 2170
    },
    {
      "epoch": 15.244755244755245,
      "grad_norm": 0.10018286108970642,
      "learning_rate": 2.8860123179682242e-05,
      "loss": 0.0139,
      "step": 2180
    },
    {
      "epoch": 15.314685314685315,
      "grad_norm": 0.10660770535469055,
      "learning_rate": 2.8049910172563188e-05,
      "loss": 0.0138,
      "step": 2190
    },
    {
      "epoch": 15.384615384615385,
      "grad_norm": 0.19103875756263733,
      "learning_rate": 2.7249373440577963e-05,
      "loss": 0.0148,
      "step": 2200
    },
    {
      "epoch": 15.454545454545455,
      "grad_norm": 0.11443736404180527,
      "learning_rate": 2.6458620644657693e-05,
      "loss": 0.0193,
      "step": 2210
    },
    {
      "epoch": 15.524475524475525,
      "grad_norm": 0.1157824918627739,
      "learning_rate": 2.567775812993186e-05,
      "loss": 0.0179,
      "step": 2220
    },
    {
      "epoch": 15.594405594405595,
      "grad_norm": 0.09878715127706528,
      "learning_rate": 2.4906890911426208e-05,
      "loss": 0.0177,
      "step": 2230
    },
    {
      "epoch": 15.664335664335665,
      "grad_norm": 0.18489457666873932,
      "learning_rate": 2.4146122659939686e-05,
      "loss": 0.0127,
      "step": 2240
    },
    {
      "epoch": 15.734265734265735,
      "grad_norm": 0.15683647990226746,
      "learning_rate": 2.339555568810221e-05,
      "loss": 0.0152,
      "step": 2250
    },
    {
      "epoch": 15.804195804195803,
      "grad_norm": 0.13426269590854645,
      "learning_rate": 2.2655290936615093e-05,
      "loss": 0.0163,
      "step": 2260
    },
    {
      "epoch": 15.874125874125873,
      "grad_norm": 0.10421716421842575,
      "learning_rate": 2.1925427960675894e-05,
      "loss": 0.0148,
      "step": 2270
    },
    {
      "epoch": 15.944055944055943,
      "grad_norm": 0.20967915654182434,
      "learning_rate": 2.120606491658966e-05,
      "loss": 0.0128,
      "step": 2280
    },
    {
      "epoch": 16.013986013986013,
      "grad_norm": 0.11200631409883499,
      "learning_rate": 2.049729854856832e-05,
      "loss": 0.0133,
      "step": 2290
    },
    {
      "epoch": 16.083916083916083,
      "grad_norm": 0.10998646169900894,
      "learning_rate": 1.9799224175719767e-05,
      "loss": 0.015,
      "step": 2300
    },
    {
      "epoch": 16.153846153846153,
      "grad_norm": 0.1218709722161293,
      "learning_rate": 1.9111935679229142e-05,
      "loss": 0.0155,
      "step": 2310
    },
    {
      "epoch": 16.223776223776223,
      "grad_norm": 0.11821790039539337,
      "learning_rate": 1.843552548973272e-05,
      "loss": 0.0144,
      "step": 2320
    },
    {
      "epoch": 16.293706293706293,
      "grad_norm": 0.1463775783777237,
      "learning_rate": 1.7770084574887567e-05,
      "loss": 0.0117,
      "step": 2330
    },
    {
      "epoch": 16.363636363636363,
      "grad_norm": 0.15775668621063232,
      "learning_rate": 1.7115702427137616e-05,
      "loss": 0.0174,
      "step": 2340
    },
    {
      "epoch": 16.433566433566433,
      "grad_norm": 0.1515921652317047,
      "learning_rate": 1.647246705167812e-05,
      "loss": 0.0137,
      "step": 2350
    },
    {
      "epoch": 16.503496503496503,
      "grad_norm": 0.16316431760787964,
      "learning_rate": 1.5840464954620206e-05,
      "loss": 0.016,
      "step": 2360
    },
    {
      "epoch": 16.573426573426573,
      "grad_norm": 0.07553193718194962,
      "learning_rate": 1.5219781131357103e-05,
      "loss": 0.0162,
      "step": 2370
    },
    {
      "epoch": 16.643356643356643,
      "grad_norm": 0.1977609395980835,
      "learning_rate": 1.4610499055133375e-05,
      "loss": 0.014,
      "step": 2380
    },
    {
      "epoch": 16.713286713286713,
      "grad_norm": 0.2475104033946991,
      "learning_rate": 1.401270066581899e-05,
      "loss": 0.0157,
      "step": 2390
    },
    {
      "epoch": 16.783216783216783,
      "grad_norm": 0.20461061596870422,
      "learning_rate": 1.3426466358889545e-05,
      "loss": 0.0154,
      "step": 2400
    },
    {
      "epoch": 16.853146853146853,
      "grad_norm": 0.17063836753368378,
      "learning_rate": 1.2851874974614097e-05,
      "loss": 0.0132,
      "step": 2410
    },
    {
      "epoch": 16.923076923076923,
      "grad_norm": 0.16239117085933685,
      "learning_rate": 1.2289003787452557e-05,
      "loss": 0.0142,
      "step": 2420
    },
    {
      "epoch": 16.993006993006993,
      "grad_norm": 0.09241970628499985,
      "learning_rate": 1.1737928495662964e-05,
      "loss": 0.0179,
      "step": 2430
    },
    {
      "epoch": 17.062937062937063,
      "grad_norm": 0.104482501745224,
      "learning_rate": 1.1198723211121442e-05,
      "loss": 0.0142,
      "step": 2440
    },
    {
      "epoch": 17.132867132867133,
      "grad_norm": 0.10794669389724731,
      "learning_rate": 1.0671460449355075e-05,
      "loss": 0.014,
      "step": 2450
    },
    {
      "epoch": 17.202797202797203,
      "grad_norm": 0.11316874623298645,
      "learning_rate": 1.0156211119789583e-05,
      "loss": 0.0152,
      "step": 2460
    },
    {
      "epoch": 17.272727272727273,
      "grad_norm": 0.0924522653222084,
      "learning_rate": 9.65304451621304e-06,
      "loss": 0.0113,
      "step": 2470
    },
    {
      "epoch": 17.342657342657343,
      "grad_norm": 0.08180312067270279,
      "learning_rate": 9.162028307456771e-06,
      "loss": 0.0144,
      "step": 2480
    },
    {
      "epoch": 17.412587412587413,
      "grad_norm": 0.19703391194343567,
      "learning_rate": 8.683228528294929e-06,
      "loss": 0.0151,
      "step": 2490
    },
    {
      "epoch": 17.482517482517483,
      "grad_norm": 0.10454503446817398,
      "learning_rate": 8.216709570563685e-06,
      "loss": 0.0144,
      "step": 2500
    },
    {
      "epoch": 17.552447552447553,
      "grad_norm": 0.12390058487653732,
      "learning_rate": 7.76253417450149e-06,
      "loss": 0.0129,
      "step": 2510
    },
    {
      "epoch": 17.622377622377623,
      "grad_norm": 0.0972568616271019,
      "learning_rate": 7.320763420311261e-06,
      "loss": 0.0153,
      "step": 2520
    },
    {
      "epoch": 17.692307692307693,
      "grad_norm": 0.14459727704524994,
      "learning_rate": 6.891456719946188e-06,
      "loss": 0.0151,
      "step": 2530
    },
    {
      "epoch": 17.762237762237763,
      "grad_norm": 0.15339726209640503,
      "learning_rate": 6.4746718091194254e-06,
      "loss": 0.0125,
      "step": 2540
    },
    {
      "epoch": 17.832167832167833,
      "grad_norm": 0.2148740440607071,
      "learning_rate": 6.07046473953955e-06,
      "loss": 0.0126,
      "step": 2550
    },
    {
      "epoch": 17.902097902097903,
      "grad_norm": 0.08716779202222824,
      "learning_rate": 5.678889871372428e-06,
      "loss": 0.0143,
      "step": 2560
    },
    {
      "epoch": 17.972027972027973,
      "grad_norm": 0.2405886948108673,
      "learning_rate": 5.299999865930505e-06,
      "loss": 0.0159,
      "step": 2570
    },
    {
      "epoch": 18.041958041958043,
      "grad_norm": 0.10744497925043106,
      "learning_rate": 4.933845678590587e-06,
      "loss": 0.0121,
      "step": 2580
    },
    {
      "epoch": 18.111888111888113,
      "grad_norm": 0.24450422823429108,
      "learning_rate": 4.580476551941037e-06,
      "loss": 0.0119,
      "step": 2590
    },
    {
      "epoch": 18.181818181818183,
      "grad_norm": 0.13196663558483124,
      "learning_rate": 4.2399400091594154e-06,
      "loss": 0.0146,
      "step": 2600
    },
    {
      "epoch": 18.251748251748253,
      "grad_norm": 0.0889667496085167,
      "learning_rate": 3.912281847621213e-06,
      "loss": 0.0157,
      "step": 2610
    },
    {
      "epoch": 18.32167832167832,
      "grad_norm": 0.10608735680580139,
      "learning_rate": 3.597546132740792e-06,
      "loss": 0.0143,
      "step": 2620
    },
    {
      "epoch": 18.39160839160839,
      "grad_norm": 0.0715111494064331,
      "learning_rate": 3.295775192045181e-06,
      "loss": 0.012,
      "step": 2630
    },
    {
      "epoch": 18.46153846153846,
      "grad_norm": 0.11866133660078049,
      "learning_rate": 3.0070096094816037e-06,
      "loss": 0.0136,
      "step": 2640
    },
    {
      "epoch": 18.53146853146853,
      "grad_norm": 0.1221456378698349,
      "learning_rate": 2.7312882199595826e-06,
      "loss": 0.0143,
      "step": 2650
    },
    {
      "epoch": 18.6013986013986,
      "grad_norm": 0.11350057274103165,
      "learning_rate": 2.4686481041280574e-06,
      "loss": 0.0119,
      "step": 2660
    },
    {
      "epoch": 18.67132867132867,
      "grad_norm": 0.10500749945640564,
      "learning_rate": 2.2191245833886987e-06,
      "loss": 0.0131,
      "step": 2670
    },
    {
      "epoch": 18.74125874125874,
      "grad_norm": 0.208559051156044,
      "learning_rate": 1.9827512151456173e-06,
      "loss": 0.0137,
      "step": 2680
    },
    {
      "epoch": 18.81118881118881,
      "grad_norm": 0.10356497019529343,
      "learning_rate": 1.7595597882923309e-06,
      "loss": 0.0142,
      "step": 2690
    },
    {
      "epoch": 18.88111888111888,
      "grad_norm": 0.12341023981571198,
      "learning_rate": 1.549580318936672e-06,
      "loss": 0.0125,
      "step": 2700
    },
    {
      "epoch": 18.95104895104895,
      "grad_norm": 0.19302833080291748,
      "learning_rate": 1.3528410463639728e-06,
      "loss": 0.0135,
      "step": 2710
    },
    {
      "epoch": 19.02097902097902,
      "grad_norm": 0.18865394592285156,
      "learning_rate": 1.1693684292393704e-06,
      "loss": 0.0122,
      "step": 2720
    },
    {
      "epoch": 19.09090909090909,
      "grad_norm": 0.08378203958272934,
      "learning_rate": 9.991871420493736e-07,
      "loss": 0.0128,
      "step": 2730
    },
    {
      "epoch": 19.16083916083916,
      "grad_norm": 0.12866666913032532,
      "learning_rate": 8.423200717835977e-07,
      "loss": 0.014,
      "step": 2740
    },
    {
      "epoch": 19.23076923076923,
      "grad_norm": 0.13293299078941345,
      "learning_rate": 6.987883148567131e-07,
      "loss": 0.0112,
      "step": 2750
    },
    {
      "epoch": 19.3006993006993,
      "grad_norm": 0.12522312998771667,
      "learning_rate": 5.686111742713162e-07,
      "loss": 0.0119,
      "step": 2760
    },
    {
      "epoch": 19.37062937062937,
      "grad_norm": 0.10424693673849106,
      "learning_rate": 4.5180615702192783e-07,
      "loss": 0.0119,
      "step": 2770
    },
    {
      "epoch": 19.44055944055944,
      "grad_norm": 0.11369384080171585,
      "learning_rate": 3.4838897174055417e-07,
      "loss": 0.0144,
      "step": 2780
    },
    {
      "epoch": 19.51048951048951,
      "grad_norm": 0.279256671667099,
      "learning_rate": 2.583735265840992e-07,
      "loss": 0.0114,
      "step": 2790
    },
    {
      "epoch": 19.58041958041958,
      "grad_norm": 0.12756973505020142,
      "learning_rate": 1.8177192736390515e-07,
      "loss": 0.0124,
      "step": 2800
    },
    {
      "epoch": 19.65034965034965,
      "grad_norm": 0.13745011389255524,
      "learning_rate": 1.1859447591769934e-07,
      "loss": 0.0163,
      "step": 2810
    },
    {
      "epoch": 19.72027972027972,
      "grad_norm": 0.14598555862903595,
      "learning_rate": 6.884966872412468e-08,
      "loss": 0.0127,
      "step": 2820
    },
    {
      "epoch": 19.79020979020979,
      "grad_norm": 0.08950311690568924,
      "learning_rate": 3.254419576012024e-08,
      "loss": 0.0154,
      "step": 2830
    },
    {
      "epoch": 19.86013986013986,
      "grad_norm": 0.09730339050292969,
      "learning_rate": 9.682939601185492e-09,
      "loss": 0.0126,
      "step": 2840
    },
    {
      "epoch": 19.93006993006993,
      "grad_norm": 0.07458403706550598,
      "learning_rate": 2.689747647166563e-10,
      "loss": 0.0124,
      "step": 2850
    },
    {
      "epoch": 19.944055944055943,
      "step": 2852,
      "total_flos": 3.989050496377632e+17,
      "train_loss": 0.03802360156629376,
      "train_runtime": 3245.0123,
      "train_samples_per_second": 56.249,
      "train_steps_per_second": 0.879
    }
  ],
  "logging_steps": 10,
  "max_steps": 2852,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 10000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.989050496377632e+17,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}