{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 2735,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.018281535648994516,
      "grad_norm": 16.712358474731445,
      "learning_rate": 1.45985401459854e-05,
      "loss": 1.4822,
      "step": 10
    },
    {
      "epoch": 0.03656307129798903,
      "grad_norm": 7.676208019256592,
      "learning_rate": 2.91970802919708e-05,
      "loss": 0.7397,
      "step": 20
    },
    {
      "epoch": 0.054844606946983544,
      "grad_norm": 2.2206971645355225,
      "learning_rate": 4.379562043795621e-05,
      "loss": 0.4701,
      "step": 30
    },
    {
      "epoch": 0.07312614259597806,
      "grad_norm": 1.7638039588928223,
      "learning_rate": 5.83941605839416e-05,
      "loss": 0.2966,
      "step": 40
    },
    {
      "epoch": 0.09140767824497258,
      "grad_norm": 1.6052724123001099,
      "learning_rate": 7.299270072992701e-05,
      "loss": 0.2162,
      "step": 50
    },
    {
      "epoch": 0.10968921389396709,
      "grad_norm": 2.617760181427002,
      "learning_rate": 8.759124087591242e-05,
      "loss": 0.2019,
      "step": 60
    },
    {
      "epoch": 0.12797074954296161,
      "grad_norm": 1.7860541343688965,
      "learning_rate": 0.00010218978102189782,
      "loss": 0.1457,
      "step": 70
    },
    {
      "epoch": 0.14625228519195613,
      "grad_norm": 1.4474908113479614,
      "learning_rate": 0.0001167883211678832,
      "loss": 0.1387,
      "step": 80
    },
    {
      "epoch": 0.16453382084095064,
      "grad_norm": 1.5035394430160522,
      "learning_rate": 0.0001313868613138686,
      "loss": 0.1363,
      "step": 90
    },
    {
      "epoch": 0.18281535648994515,
      "grad_norm": 1.4598884582519531,
      "learning_rate": 0.00014598540145985403,
      "loss": 0.1124,
      "step": 100
    },
    {
      "epoch": 0.20109689213893966,
      "grad_norm": 1.7308577299118042,
      "learning_rate": 0.00016058394160583942,
      "loss": 0.1215,
      "step": 110
    },
    {
      "epoch": 0.21937842778793418,
      "grad_norm": 1.5704491138458252,
      "learning_rate": 0.00017518248175182484,
      "loss": 0.1332,
      "step": 120
    },
    {
      "epoch": 0.2376599634369287,
      "grad_norm": 0.7519080638885498,
      "learning_rate": 0.00018978102189781023,
      "loss": 0.1015,
      "step": 130
    },
    {
      "epoch": 0.25594149908592323,
      "grad_norm": 1.0302314758300781,
      "learning_rate": 0.00019999934198849153,
      "loss": 0.1043,
      "step": 140
    },
    {
      "epoch": 0.2742230347349177,
      "grad_norm": 1.1439878940582275,
      "learning_rate": 0.00019998764424701714,
      "loss": 0.1105,
      "step": 150
    },
    {
      "epoch": 0.29250457038391225,
      "grad_norm": 0.8649179935455322,
      "learning_rate": 0.00019996132599641746,
      "loss": 0.0969,
      "step": 160
    },
    {
      "epoch": 0.31078610603290674,
      "grad_norm": 0.9194239377975464,
      "learning_rate": 0.00019992039108503024,
      "loss": 0.097,
      "step": 170
    },
    {
      "epoch": 0.3290676416819013,
      "grad_norm": 0.6259992718696594,
      "learning_rate": 0.00019986484549848745,
      "loss": 0.0853,
      "step": 180
    },
    {
      "epoch": 0.3473491773308958,
      "grad_norm": 1.0033239126205444,
      "learning_rate": 0.00019979469735884026,
      "loss": 0.0944,
      "step": 190
    },
    {
      "epoch": 0.3656307129798903,
      "grad_norm": 1.261385440826416,
      "learning_rate": 0.00019970995692337114,
      "loss": 0.1078,
      "step": 200
    },
    {
      "epoch": 0.38391224862888484,
      "grad_norm": 0.9231658577919006,
      "learning_rate": 0.00019961063658309418,
      "loss": 0.0821,
      "step": 210
    },
    {
      "epoch": 0.40219378427787933,
      "grad_norm": 0.996103048324585,
      "learning_rate": 0.00019949675086094326,
      "loss": 0.0911,
      "step": 220
    },
    {
      "epoch": 0.42047531992687387,
      "grad_norm": 0.9832742810249329,
      "learning_rate": 0.0001993683164096483,
      "loss": 0.0692,
      "step": 230
    },
    {
      "epoch": 0.43875685557586835,
      "grad_norm": 0.6472922563552856,
      "learning_rate": 0.00019922535200930046,
      "loss": 0.0706,
      "step": 240
    },
    {
      "epoch": 0.4570383912248629,
      "grad_norm": 0.5999054312705994,
      "learning_rate": 0.00019906787856460581,
      "loss": 0.0731,
      "step": 250
    },
    {
      "epoch": 0.4753199268738574,
      "grad_norm": 0.667738139629364,
      "learning_rate": 0.00019889591910182876,
      "loss": 0.0708,
      "step": 260
    },
    {
      "epoch": 0.4936014625228519,
      "grad_norm": 0.554964542388916,
      "learning_rate": 0.0001987094987654251,
      "loss": 0.0591,
      "step": 270
    },
    {
      "epoch": 0.5118829981718465,
      "grad_norm": 1.1600011587142944,
      "learning_rate": 0.00019850864481436514,
      "loss": 0.0795,
      "step": 280
    },
    {
      "epoch": 0.5301645338208409,
      "grad_norm": 0.6419970393180847,
      "learning_rate": 0.00019829338661814797,
      "loss": 0.0659,
      "step": 290
    },
    {
      "epoch": 0.5484460694698354,
      "grad_norm": 0.735856831073761,
      "learning_rate": 0.00019806375565250685,
      "loss": 0.0724,
      "step": 300
    },
    {
      "epoch": 0.56672760511883,
      "grad_norm": 0.5395373106002808,
      "learning_rate": 0.00019781978549480682,
      "loss": 0.0626,
      "step": 310
    },
    {
      "epoch": 0.5850091407678245,
      "grad_norm": 0.8947715759277344,
      "learning_rate": 0.00019756151181913483,
      "loss": 0.0601,
      "step": 320
    },
    {
      "epoch": 0.603290676416819,
      "grad_norm": 0.5075414180755615,
      "learning_rate": 0.00019728897239108342,
      "loss": 0.0691,
      "step": 330
    },
    {
      "epoch": 0.6215722120658135,
      "grad_norm": 1.3236219882965088,
      "learning_rate": 0.00019700220706222858,
      "loss": 0.0488,
      "step": 340
    },
    {
      "epoch": 0.6398537477148081,
      "grad_norm": 0.9153704047203064,
      "learning_rate": 0.00019670125776430228,
      "loss": 0.0622,
      "step": 350
    },
    {
      "epoch": 0.6581352833638026,
      "grad_norm": 0.6496918797492981,
      "learning_rate": 0.00019638616850306133,
      "loss": 0.0572,
      "step": 360
    },
    {
      "epoch": 0.676416819012797,
      "grad_norm": 0.6905117034912109,
      "learning_rate": 0.00019605698535185266,
      "loss": 0.0506,
      "step": 370
    },
    {
      "epoch": 0.6946983546617916,
      "grad_norm": 0.6502402424812317,
      "learning_rate": 0.00019571375644487625,
      "loss": 0.0528,
      "step": 380
    },
    {
      "epoch": 0.7129798903107861,
      "grad_norm": 0.7400691509246826,
      "learning_rate": 0.0001953565319701469,
      "loss": 0.0674,
      "step": 390
    },
    {
      "epoch": 0.7312614259597806,
      "grad_norm": 0.5896055698394775,
      "learning_rate": 0.0001949853641621555,
      "loss": 0.0471,
      "step": 400
    },
    {
      "epoch": 0.7495429616087751,
      "grad_norm": 0.4026470482349396,
      "learning_rate": 0.00019460030729423114,
      "loss": 0.0512,
      "step": 410
    },
    {
      "epoch": 0.7678244972577697,
      "grad_norm": 0.47957828640937805,
      "learning_rate": 0.0001942014176706052,
      "loss": 0.0629,
      "step": 420
    },
    {
      "epoch": 0.7861060329067642,
      "grad_norm": 0.4520862400531769,
      "learning_rate": 0.00019378875361817817,
      "loss": 0.0533,
      "step": 430
    },
    {
      "epoch": 0.8043875685557587,
      "grad_norm": 0.4732885956764221,
      "learning_rate": 0.00019336237547799108,
      "loss": 0.058,
      "step": 440
    },
    {
      "epoch": 0.8226691042047533,
      "grad_norm": 0.7703008651733398,
      "learning_rate": 0.0001929223455964022,
      "loss": 0.0532,
      "step": 450
    },
    {
      "epoch": 0.8409506398537477,
      "grad_norm": 0.45097994804382324,
      "learning_rate": 0.00019246872831597055,
      "loss": 0.0465,
      "step": 460
    },
    {
      "epoch": 0.8592321755027422,
      "grad_norm": 0.5736098289489746,
      "learning_rate": 0.00019200158996604753,
      "loss": 0.0487,
      "step": 470
    },
    {
      "epoch": 0.8775137111517367,
      "grad_norm": 0.7237376570701599,
      "learning_rate": 0.0001915209988530779,
      "loss": 0.0551,
      "step": 480
    },
    {
      "epoch": 0.8957952468007313,
      "grad_norm": 0.4645770192146301,
      "learning_rate": 0.00019102702525061207,
      "loss": 0.0495,
      "step": 490
    },
    {
      "epoch": 0.9140767824497258,
      "grad_norm": 0.5169672966003418,
      "learning_rate": 0.00019051974138903027,
      "loss": 0.0433,
      "step": 500
    },
    {
      "epoch": 0.9323583180987203,
      "grad_norm": 0.7457365989685059,
      "learning_rate": 0.00018999922144498084,
      "loss": 0.0518,
      "step": 510
    },
    {
      "epoch": 0.9506398537477148,
      "grad_norm": 0.5059699416160583,
      "learning_rate": 0.00018946554153053395,
      "loss": 0.0474,
      "step": 520
    },
    {
      "epoch": 0.9689213893967094,
      "grad_norm": 0.8174113035202026,
      "learning_rate": 0.00018891877968205213,
      "loss": 0.0517,
      "step": 530
    },
    {
      "epoch": 0.9872029250457038,
      "grad_norm": 0.5508332252502441,
      "learning_rate": 0.00018835901584877973,
      "loss": 0.0709,
      "step": 540
    },
    {
      "epoch": 1.0054844606946984,
      "grad_norm": 0.5709052681922913,
      "learning_rate": 0.00018778633188115223,
      "loss": 0.0484,
      "step": 550
    },
    {
      "epoch": 1.023765996343693,
      "grad_norm": 0.4354308247566223,
      "learning_rate": 0.0001872008115188281,
      "loss": 0.0544,
      "step": 560
    },
    {
      "epoch": 1.0420475319926874,
      "grad_norm": 0.535977303981781,
      "learning_rate": 0.00018660254037844388,
      "loss": 0.0562,
      "step": 570
    },
    {
      "epoch": 1.0603290676416819,
      "grad_norm": 0.2939574420452118,
      "learning_rate": 0.00018599160594109522,
      "loss": 0.0489,
      "step": 580
    },
    {
      "epoch": 1.0786106032906764,
      "grad_norm": 0.3677907884120941,
      "learning_rate": 0.000185368097539545,
      "loss": 0.0358,
      "step": 590
    },
    {
      "epoch": 1.0968921389396709,
      "grad_norm": 0.5382636785507202,
      "learning_rate": 0.0001847321063451609,
      "loss": 0.0395,
      "step": 600
    },
    {
      "epoch": 1.1151736745886653,
      "grad_norm": 0.457963764667511,
      "learning_rate": 0.00018408372535458397,
      "loss": 0.0523,
      "step": 610
    },
    {
      "epoch": 1.13345521023766,
      "grad_norm": 0.5560534000396729,
      "learning_rate": 0.00018342304937613032,
      "loss": 0.0531,
      "step": 620
    },
    {
      "epoch": 1.1517367458866545,
      "grad_norm": 0.6328279376029968,
      "learning_rate": 0.00018275017501592818,
      "loss": 0.0452,
      "step": 630
    },
    {
      "epoch": 1.170018281535649,
      "grad_norm": 0.45685553550720215,
      "learning_rate": 0.0001820652006637915,
      "loss": 0.0402,
      "step": 640
    },
    {
      "epoch": 1.1882998171846435,
      "grad_norm": 0.21566231548786163,
      "learning_rate": 0.0001813682264788334,
      "loss": 0.0401,
      "step": 650
    },
    {
      "epoch": 1.206581352833638,
      "grad_norm": 0.36770665645599365,
      "learning_rate": 0.00018065935437482037,
      "loss": 0.04,
      "step": 660
    },
    {
      "epoch": 1.2248628884826325,
      "grad_norm": 0.4096185863018036,
      "learning_rate": 0.0001799386880052703,
      "loss": 0.0352,
      "step": 670
    },
    {
      "epoch": 1.2431444241316272,
      "grad_norm": 0.4246453642845154,
      "learning_rate": 0.00017920633274829575,
      "loss": 0.045,
      "step": 680
    },
    {
      "epoch": 1.2614259597806217,
      "grad_norm": 0.4160013496875763,
      "learning_rate": 0.00017846239569119528,
      "loss": 0.0357,
      "step": 690
    },
    {
      "epoch": 1.2797074954296161,
      "grad_norm": 0.5409733653068542,
      "learning_rate": 0.00017770698561479496,
      "loss": 0.0376,
      "step": 700
    },
    {
      "epoch": 1.2979890310786106,
      "grad_norm": 0.22224466502666473,
      "learning_rate": 0.00017694021297754188,
      "loss": 0.041,
      "step": 710
    },
    {
      "epoch": 1.3162705667276051,
      "grad_norm": 0.5606803894042969,
      "learning_rate": 0.00017616218989935272,
      "loss": 0.0367,
      "step": 720
    },
    {
      "epoch": 1.3345521023765996,
      "grad_norm": 0.3131175935268402,
      "learning_rate": 0.00017537303014521918,
      "loss": 0.0466,
      "step": 730
    },
    {
      "epoch": 1.352833638025594,
      "grad_norm": 0.37444230914115906,
      "learning_rate": 0.0001745728491085728,
      "loss": 0.0401,
      "step": 740
    },
    {
      "epoch": 1.3711151736745886,
      "grad_norm": 0.6337727308273315,
      "learning_rate": 0.0001737617637944119,
      "loss": 0.0505,
      "step": 750
    },
    {
      "epoch": 1.389396709323583,
      "grad_norm": 0.5669440627098083,
      "learning_rate": 0.00017293989280219274,
      "loss": 0.0372,
      "step": 760
    },
    {
      "epoch": 1.4076782449725778,
      "grad_norm": 0.388346791267395,
      "learning_rate": 0.00017210735630848745,
      "loss": 0.035,
      "step": 770
    },
    {
      "epoch": 1.4259597806215722,
      "grad_norm": 0.5280373096466064,
      "learning_rate": 0.00017126427604941148,
      "loss": 0.0466,
      "step": 780
    },
    {
      "epoch": 1.4442413162705667,
      "grad_norm": 0.565298855304718,
      "learning_rate": 0.00017041077530282294,
      "loss": 0.0365,
      "step": 790
    },
    {
      "epoch": 1.4625228519195612,
      "grad_norm": 0.35680803656578064,
      "learning_rate": 0.00016954697887029655,
      "loss": 0.0383,
      "step": 800
    },
    {
      "epoch": 1.4808043875685557,
      "grad_norm": 0.42788997292518616,
      "learning_rate": 0.00016867301305887474,
      "loss": 0.0337,
      "step": 810
    },
    {
      "epoch": 1.4990859232175504,
      "grad_norm": 0.43233945965766907,
      "learning_rate": 0.00016778900566259865,
      "loss": 0.0505,
      "step": 820
    },
    {
      "epoch": 1.517367458866545,
      "grad_norm": 0.4589940905570984,
      "learning_rate": 0.0001668950859438216,
      "loss": 0.0438,
      "step": 830
    },
    {
      "epoch": 1.5356489945155394,
      "grad_norm": 0.48594310879707336,
      "learning_rate": 0.00016599138461430814,
      "loss": 0.0323,
      "step": 840
    },
    {
      "epoch": 1.5539305301645339,
      "grad_norm": 0.31333279609680176,
      "learning_rate": 0.00016507803381612076,
      "loss": 0.0393,
      "step": 850
    },
    {
      "epoch": 1.5722120658135283,
      "grad_norm": 0.49847719073295593,
      "learning_rate": 0.00016415516710229766,
      "loss": 0.0453,
      "step": 860
    },
    {
      "epoch": 1.5904936014625228,
      "grad_norm": 0.4276566505432129,
      "learning_rate": 0.00016322291941732442,
      "loss": 0.0362,
      "step": 870
    },
    {
      "epoch": 1.6087751371115173,
      "grad_norm": 0.47734275460243225,
      "learning_rate": 0.0001622814270774018,
      "loss": 0.0349,
      "step": 880
    },
    {
      "epoch": 1.6270566727605118,
      "grad_norm": 0.24307364225387573,
      "learning_rate": 0.00016133082775051313,
      "loss": 0.0365,
      "step": 890
    },
    {
      "epoch": 1.6453382084095063,
      "grad_norm": 0.4327755272388458,
      "learning_rate": 0.00016037126043629422,
      "loss": 0.0318,
      "step": 900
    },
    {
      "epoch": 1.6636197440585008,
      "grad_norm": 0.2253831923007965,
      "learning_rate": 0.0001594028654457083,
      "loss": 0.0324,
      "step": 910
    },
    {
      "epoch": 1.6819012797074955,
      "grad_norm": 0.42007511854171753,
      "learning_rate": 0.0001584257843805293,
      "loss": 0.0387,
      "step": 920
    },
    {
      "epoch": 1.70018281535649,
      "grad_norm": 0.5654010772705078,
      "learning_rate": 0.00015744016011263638,
      "loss": 0.0461,
      "step": 930
    },
    {
      "epoch": 1.7184643510054844,
      "grad_norm": 0.5979740619659424,
      "learning_rate": 0.00015644613676312288,
      "loss": 0.0288,
      "step": 940
    },
    {
      "epoch": 1.736745886654479,
      "grad_norm": 0.6250779628753662,
      "learning_rate": 0.00015544385968122227,
      "loss": 0.0339,
      "step": 950
    },
    {
      "epoch": 1.7550274223034736,
      "grad_norm": 0.4420310854911804,
      "learning_rate": 0.00015443347542305484,
      "loss": 0.0446,
      "step": 960
    },
    {
      "epoch": 1.7733089579524681,
      "grad_norm": 0.4242953956127167,
      "learning_rate": 0.0001534151317301979,
      "loss": 0.0402,
      "step": 970
    },
    {
      "epoch": 1.7915904936014626,
      "grad_norm": 0.2853521406650543,
      "learning_rate": 0.00015238897750808242,
      "loss": 0.0367,
      "step": 980
    },
    {
      "epoch": 1.809872029250457,
      "grad_norm": 0.5415486693382263,
      "learning_rate": 0.00015135516280421945,
      "loss": 0.0312,
      "step": 990
    },
    {
      "epoch": 1.8281535648994516,
      "grad_norm": 0.3944428265094757,
      "learning_rate": 0.00015031383878626016,
      "loss": 0.0293,
      "step": 1000
    },
    {
      "epoch": 1.846435100548446,
      "grad_norm": 0.42964455485343933,
      "learning_rate": 0.00014926515771989104,
      "loss": 0.0462,
      "step": 1010
    },
    {
      "epoch": 1.8647166361974405,
      "grad_norm": 0.3574308454990387,
      "learning_rate": 0.00014820927294656973,
      "loss": 0.0358,
      "step": 1020
    },
    {
      "epoch": 1.882998171846435,
      "grad_norm": 0.38193315267562866,
      "learning_rate": 0.00014714633886110242,
      "loss": 0.0393,
      "step": 1030
    },
    {
      "epoch": 1.9012797074954295,
      "grad_norm": 0.4956030249595642,
      "learning_rate": 0.00014607651088906809,
      "loss": 0.0312,
      "step": 1040
    },
    {
      "epoch": 1.919561243144424,
      "grad_norm": 0.4244064688682556,
      "learning_rate": 0.00014499994546409152,
      "loss": 0.031,
      "step": 1050
    },
    {
      "epoch": 1.9378427787934185,
      "grad_norm": 0.46385011076927185,
      "learning_rate": 0.00014391680000496932,
      "loss": 0.0424,
      "step": 1060
    },
    {
      "epoch": 1.9561243144424132,
      "grad_norm": 0.5440361499786377,
      "learning_rate": 0.0001428272328926512,
      "loss": 0.0328,
      "step": 1070
    },
    {
      "epoch": 1.9744058500914077,
      "grad_norm": 0.3221015930175781,
      "learning_rate": 0.00014173140344708152,
      "loss": 0.0424,
      "step": 1080
    },
    {
      "epoch": 1.9926873857404022,
      "grad_norm": 0.520367443561554,
      "learning_rate": 0.00014062947190390262,
      "loss": 0.0396,
      "step": 1090
    },
    {
      "epoch": 2.010968921389397,
      "grad_norm": 0.29480573534965515,
      "learning_rate": 0.0001395215993910249,
      "loss": 0.0351,
      "step": 1100
    },
    {
      "epoch": 2.0292504570383914,
      "grad_norm": 0.35179761052131653,
      "learning_rate": 0.00013840794790506616,
      "loss": 0.0271,
      "step": 1110
    },
    {
      "epoch": 2.047531992687386,
      "grad_norm": 0.377270370721817,
      "learning_rate": 0.00013728868028766377,
      "loss": 0.0311,
      "step": 1120
    },
    {
      "epoch": 2.0658135283363803,
      "grad_norm": 0.4772701859474182,
      "learning_rate": 0.0001361639602016637,
      "loss": 0.0372,
      "step": 1130
    },
    {
      "epoch": 2.084095063985375,
      "grad_norm": 0.30298906564712524,
      "learning_rate": 0.000135033952107189,
      "loss": 0.0255,
      "step": 1140
    },
    {
      "epoch": 2.1023765996343693,
      "grad_norm": 0.39370113611221313,
      "learning_rate": 0.00013389882123759206,
      "loss": 0.0327,
      "step": 1150
    },
    {
      "epoch": 2.1206581352833638,
      "grad_norm": 0.2912181317806244,
      "learning_rate": 0.00013275873357529368,
      "loss": 0.0268,
      "step": 1160
    },
    {
      "epoch": 2.1389396709323583,
      "grad_norm": 0.29357820749282837,
      "learning_rate": 0.00013161385582751247,
      "loss": 0.0273,
      "step": 1170
    },
    {
      "epoch": 2.1572212065813527,
      "grad_norm": 0.3242945075035095,
      "learning_rate": 0.00013046435540188848,
      "loss": 0.0296,
      "step": 1180
    },
    {
      "epoch": 2.1755027422303472,
      "grad_norm": 1.168150544166565,
      "learning_rate": 0.00012931040038200435,
      "loss": 0.0416,
      "step": 1190
    },
    {
      "epoch": 2.1937842778793417,
      "grad_norm": 0.3501128852367401,
      "learning_rate": 0.00012815215950280753,
      "loss": 0.0379,
      "step": 1200
    },
    {
      "epoch": 2.212065813528336,
      "grad_norm": 0.46127256751060486,
      "learning_rate": 0.0001269898021259373,
      "loss": 0.0372,
      "step": 1210
    },
    {
      "epoch": 2.2303473491773307,
      "grad_norm": 0.4480052888393402,
      "learning_rate": 0.0001258234982149604,
      "loss": 0.0366,
      "step": 1220
    },
    {
      "epoch": 2.2486288848263256,
      "grad_norm": 0.38535383343696594,
      "learning_rate": 0.0001246534183105181,
      "loss": 0.0289,
      "step": 1230
    },
    {
      "epoch": 2.26691042047532,
      "grad_norm": 0.39918404817581177,
      "learning_rate": 0.00012347973350538936,
      "loss": 0.029,
      "step": 1240
    },
    {
      "epoch": 2.2851919561243146,
      "grad_norm": 0.27928563952445984,
      "learning_rate": 0.00012230261541947316,
      "loss": 0.0262,
      "step": 1250
    },
    {
      "epoch": 2.303473491773309,
      "grad_norm": 0.43867453932762146,
      "learning_rate": 0.00012112223617469372,
      "loss": 0.0227,
      "step": 1260
    },
    {
      "epoch": 2.3217550274223036,
      "grad_norm": 0.3848976194858551,
      "learning_rate": 0.00011993876836983198,
      "loss": 0.0251,
      "step": 1270
    },
    {
      "epoch": 2.340036563071298,
      "grad_norm": 0.3365519046783447,
      "learning_rate": 0.0001187523850552881,
      "loss": 0.0345,
      "step": 1280
    },
    {
      "epoch": 2.3583180987202925,
      "grad_norm": 0.3406737446784973,
      "learning_rate": 0.00011756325970777717,
      "loss": 0.0273,
      "step": 1290
    },
    {
      "epoch": 2.376599634369287,
      "grad_norm": 0.28142690658569336,
      "learning_rate": 0.00011637156620496308,
      "loss": 0.0275,
      "step": 1300
    },
    {
      "epoch": 2.3948811700182815,
      "grad_norm": 0.36976391077041626,
      "learning_rate": 0.00011517747880003335,
      "loss": 0.0243,
      "step": 1310
    },
    {
      "epoch": 2.413162705667276,
      "grad_norm": 0.22825664281845093,
      "learning_rate": 0.00011398117209621966,
      "loss": 0.0278,
      "step": 1320
    },
    {
      "epoch": 2.4314442413162705,
      "grad_norm": 0.3394540548324585,
      "learning_rate": 0.00011278282102126633,
      "loss": 0.0357,
      "step": 1330
    },
    {
      "epoch": 2.449725776965265,
      "grad_norm": 0.26682886481285095,
      "learning_rate": 0.00011158260080185226,
      "loss": 0.0407,
      "step": 1340
    },
    {
      "epoch": 2.4680073126142594,
      "grad_norm": 0.23459388315677643,
      "learning_rate": 0.00011038068693796846,
      "loss": 0.0263,
      "step": 1350
    },
    {
      "epoch": 2.4862888482632544,
      "grad_norm": 0.32797005772590637,
      "learning_rate": 0.00010917725517725608,
      "loss": 0.0354,
      "step": 1360
    },
    {
      "epoch": 2.504570383912249,
      "grad_norm": 0.2672847509384155,
      "learning_rate": 0.00010797248148930783,
      "loss": 0.0203,
      "step": 1370
    },
    {
      "epoch": 2.5228519195612433,
      "grad_norm": 0.34542036056518555,
      "learning_rate": 0.00010676654203993732,
      "loss": 0.0246,
      "step": 1380
    },
    {
      "epoch": 2.541133455210238,
      "grad_norm": 0.5064176321029663,
      "learning_rate": 0.00010555961316541946,
      "loss": 0.0276,
      "step": 1390
    },
    {
      "epoch": 2.5594149908592323,
      "grad_norm": 0.34542617201805115,
      "learning_rate": 0.00010435187134670607,
      "loss": 0.0238,
      "step": 1400
    },
    {
      "epoch": 2.577696526508227,
      "grad_norm": 0.3336438238620758,
      "learning_rate": 0.00010314349318362015,
      "loss": 0.0353,
      "step": 1410
    },
    {
      "epoch": 2.5959780621572213,
      "grad_norm": 0.22887752950191498,
      "learning_rate": 0.00010193465536903307,
      "loss": 0.028,
      "step": 1420
    },
    {
      "epoch": 2.6142595978062158,
      "grad_norm": 0.1399448662996292,
      "learning_rate": 0.00010072553466302784,
      "loss": 0.028,
      "step": 1430
    },
    {
      "epoch": 2.6325411334552102,
      "grad_norm": 0.36335644125938416,
      "learning_rate": 9.951630786705279e-05,
      "loss": 0.0196,
      "step": 1440
    },
    {
      "epoch": 2.6508226691042047,
      "grad_norm": 0.22947153449058533,
      "learning_rate": 9.830715179806905e-05,
      "loss": 0.0275,
      "step": 1450
    },
    {
      "epoch": 2.669104204753199,
      "grad_norm": 0.21563003957271576,
      "learning_rate": 9.709824326269576e-05,
      "loss": 0.0216,
      "step": 1460
    },
    {
      "epoch": 2.6873857404021937,
      "grad_norm": 0.3260309100151062,
      "learning_rate": 9.5889759031357e-05,
      "loss": 0.018,
      "step": 1470
    },
    {
      "epoch": 2.705667276051188,
      "grad_norm": 0.15418443083763123,
      "learning_rate": 9.468187581243378e-05,
      "loss": 0.0244,
      "step": 1480
    },
    {
      "epoch": 2.7239488117001827,
      "grad_norm": 0.2873231768608093,
      "learning_rate": 9.347477022642503e-05,
      "loss": 0.0186,
      "step": 1490
    },
    {
      "epoch": 2.742230347349177,
      "grad_norm": 0.2715139091014862,
      "learning_rate": 9.226861878012197e-05,
      "loss": 0.0273,
      "step": 1500
    },
    {
      "epoch": 2.7605118829981716,
      "grad_norm": 0.17074620723724365,
      "learning_rate": 9.106359784079832e-05,
      "loss": 0.0174,
      "step": 1510
    },
    {
      "epoch": 2.778793418647166,
      "grad_norm": 0.2897492051124573,
      "learning_rate": 8.985988361042153e-05,
      "loss": 0.0283,
      "step": 1520
    },
    {
      "epoch": 2.797074954296161,
      "grad_norm": 0.5155644416809082,
      "learning_rate": 8.8657652099888e-05,
      "loss": 0.0216,
      "step": 1530
    },
    {
      "epoch": 2.8153564899451555,
      "grad_norm": 0.33276352286338806,
      "learning_rate": 8.745707910328615e-05,
      "loss": 0.0245,
      "step": 1540
    },
    {
      "epoch": 2.83363802559415,
      "grad_norm": 0.4756206274032593,
      "learning_rate": 8.625834017219113e-05,
      "loss": 0.0303,
      "step": 1550
    },
    {
      "epoch": 2.8519195612431445,
      "grad_norm": 0.2755451202392578,
      "learning_rate": 8.506161058999541e-05,
      "loss": 0.0199,
      "step": 1560
    },
    {
      "epoch": 2.870201096892139,
      "grad_norm": 0.26369351148605347,
      "learning_rate": 8.386706534627805e-05,
      "loss": 0.0204,
      "step": 1570
    },
    {
      "epoch": 2.8884826325411335,
      "grad_norm": 0.2358650118112564,
      "learning_rate": 8.267487911121715e-05,
      "loss": 0.0211,
      "step": 1580
    },
    {
      "epoch": 2.906764168190128,
      "grad_norm": 0.22182169556617737,
      "learning_rate": 8.148522621004926e-05,
      "loss": 0.0233,
      "step": 1590
    },
    {
      "epoch": 2.9250457038391224,
      "grad_norm": 0.30960527062416077,
      "learning_rate": 8.029828059757875e-05,
      "loss": 0.0243,
      "step": 1600
    },
    {
      "epoch": 2.943327239488117,
      "grad_norm": 0.38207757472991943,
      "learning_rate": 7.91142158327417e-05,
      "loss": 0.0295,
      "step": 1610
    },
    {
      "epoch": 2.9616087751371114,
      "grad_norm": 0.24521781504154205,
      "learning_rate": 7.793320505322761e-05,
      "loss": 0.0206,
      "step": 1620
    },
    {
      "epoch": 2.979890310786106,
      "grad_norm": 0.3253994286060333,
      "learning_rate": 7.675542095016256e-05,
      "loss": 0.026,
      "step": 1630
    },
    {
      "epoch": 2.998171846435101,
      "grad_norm": 0.3253840208053589,
      "learning_rate": 7.558103574285779e-05,
      "loss": 0.0219,
      "step": 1640
    },
    {
      "epoch": 3.016453382084095,
      "grad_norm": 0.2342890352010727,
      "learning_rate": 7.441022115362729e-05,
      "loss": 0.0181,
      "step": 1650
    },
    {
      "epoch": 3.03473491773309,
      "grad_norm": 0.2249564677476883,
      "learning_rate": 7.324314838267796e-05,
      "loss": 0.0228,
      "step": 1660
    },
    {
      "epoch": 3.0530164533820843,
      "grad_norm": 0.24722999334335327,
      "learning_rate": 7.207998808307628e-05,
      "loss": 0.018,
      "step": 1670
    },
    {
      "epoch": 3.0712979890310788,
      "grad_norm": 0.22779327630996704,
      "learning_rate": 7.092091033579475e-05,
      "loss": 0.0193,
      "step": 1680
    },
    {
      "epoch": 3.0895795246800732,
      "grad_norm": 0.34452179074287415,
      "learning_rate": 6.976608462484226e-05,
      "loss": 0.0327,
      "step": 1690
    },
    {
      "epoch": 3.1078610603290677,
      "grad_norm": 0.30508124828338623,
      "learning_rate": 6.861567981248142e-05,
      "loss": 0.0261,
      "step": 1700
    },
    {
      "epoch": 3.126142595978062,
      "grad_norm": 0.319670706987381,
      "learning_rate": 6.746986411453717e-05,
      "loss": 0.0189,
      "step": 1710
    },
    {
      "epoch": 3.1444241316270567,
      "grad_norm": 0.35580283403396606,
      "learning_rate": 6.632880507579957e-05,
      "loss": 0.0242,
      "step": 1720
    },
    {
      "epoch": 3.162705667276051,
      "grad_norm": 0.3020285964012146,
      "learning_rate": 6.519266954552502e-05,
      "loss": 0.0176,
      "step": 1730
    },
    {
      "epoch": 3.1809872029250457,
      "grad_norm": 0.27105554938316345,
      "learning_rate": 6.406162365303882e-05,
      "loss": 0.0268,
      "step": 1740
    },
    {
      "epoch": 3.19926873857404,
      "grad_norm": 0.20928241312503815,
      "learning_rate": 6.293583278344361e-05,
      "loss": 0.0206,
      "step": 1750
    },
    {
      "epoch": 3.2175502742230346,
      "grad_norm": 0.2314785271883011,
      "learning_rate": 6.181546155343579e-05,
      "loss": 0.0198,
      "step": 1760
    },
    {
      "epoch": 3.235831809872029,
      "grad_norm": 0.2732461988925934,
      "learning_rate": 6.070067378723501e-05,
      "loss": 0.0177,
      "step": 1770
    },
    {
      "epoch": 3.2541133455210236,
      "grad_norm": 0.17697979509830475,
      "learning_rate": 5.959163249262913e-05,
      "loss": 0.0155,
      "step": 1780
    },
    {
      "epoch": 3.272394881170018,
      "grad_norm": 0.24429567158222198,
      "learning_rate": 5.848849983713894e-05,
      "loss": 0.0212,
      "step": 1790
    },
    {
      "epoch": 3.2906764168190126,
      "grad_norm": 0.36660096049308777,
      "learning_rate": 5.739143712430521e-05,
      "loss": 0.0281,
      "step": 1800
    },
    {
      "epoch": 3.3089579524680075,
      "grad_norm": 0.2895634174346924,
      "learning_rate": 5.630060477010253e-05,
      "loss": 0.018,
      "step": 1810
    },
    {
      "epoch": 3.327239488117002,
      "grad_norm": 0.3412606418132782,
      "learning_rate": 5.5216162279482964e-05,
      "loss": 0.0134,
      "step": 1820
    },
    {
      "epoch": 3.3455210237659965,
      "grad_norm": 0.22716091573238373,
      "learning_rate": 5.4138268223052326e-05,
      "loss": 0.016,
      "step": 1830
    },
    {
      "epoch": 3.363802559414991,
      "grad_norm": 0.24945920705795288,
      "learning_rate": 5.306708021388378e-05,
      "loss": 0.0208,
      "step": 1840
    },
    {
      "epoch": 3.3820840950639854,
      "grad_norm": 0.24487105011940002,
      "learning_rate": 5.200275488447104e-05,
      "loss": 0.018,
      "step": 1850
    },
    {
      "epoch": 3.40036563071298,
      "grad_norm": 0.24816852807998657,
      "learning_rate": 5.094544786382522e-05,
      "loss": 0.0159,
      "step": 1860
    },
    {
      "epoch": 3.4186471663619744,
      "grad_norm": 0.1848219782114029,
      "learning_rate": 4.989531375471805e-05,
      "loss": 0.0142,
      "step": 1870
    },
    {
      "epoch": 3.436928702010969,
      "grad_norm": 0.19923894107341766,
      "learning_rate": 4.885250611107558e-05,
      "loss": 0.0214,
      "step": 1880
    },
    {
      "epoch": 3.4552102376599634,
      "grad_norm": 0.1752861738204956,
      "learning_rate": 4.7817177415524796e-05,
      "loss": 0.0198,
      "step": 1890
    },
    {
      "epoch": 3.473491773308958,
      "grad_norm": 0.3053307831287384,
      "learning_rate": 4.678947905709744e-05,
      "loss": 0.0225,
      "step": 1900
    },
    {
      "epoch": 3.4917733089579523,
      "grad_norm": 0.19800381362438202,
      "learning_rate": 4.576956130909317e-05,
      "loss": 0.016,
      "step": 1910
    },
    {
      "epoch": 3.510054844606947,
      "grad_norm": 0.1873503029346466,
      "learning_rate": 4.475757330710621e-05,
      "loss": 0.0144,
      "step": 1920
    },
    {
      "epoch": 3.5283363802559418,
      "grad_norm": 0.23367895185947418,
      "learning_rate": 4.375366302721825e-05,
      "loss": 0.0161,
      "step": 1930
    },
    {
      "epoch": 3.5466179159049362,
      "grad_norm": 0.17103944718837738,
      "learning_rate": 4.2757977264361046e-05,
      "loss": 0.0146,
      "step": 1940
    },
    {
      "epoch": 3.5648994515539307,
      "grad_norm": 0.2473006546497345,
      "learning_rate": 4.177066161085148e-05,
      "loss": 0.0184,
      "step": 1950
    },
    {
      "epoch": 3.583180987202925,
      "grad_norm": 0.31398236751556396,
      "learning_rate": 4.0791860435102524e-05,
      "loss": 0.0146,
      "step": 1960
    },
    {
      "epoch": 3.6014625228519197,
      "grad_norm": 0.33835136890411377,
      "learning_rate": 3.982171686051334e-05,
      "loss": 0.021,
      "step": 1970
    },
    {
      "epoch": 3.619744058500914,
      "grad_norm": 0.16258537769317627,
      "learning_rate": 3.8860372744541407e-05,
      "loss": 0.0196,
      "step": 1980
    },
    {
      "epoch": 3.6380255941499087,
      "grad_norm": 0.3083174228668213,
      "learning_rate": 3.790796865795947e-05,
      "loss": 0.0152,
      "step": 1990
    },
    {
      "epoch": 3.656307129798903,
      "grad_norm": 0.21282333135604858,
      "learning_rate": 3.696464386430093e-05,
      "loss": 0.0215,
      "step": 2000
    },
    {
      "epoch": 3.6745886654478976,
      "grad_norm": 0.20800185203552246,
      "learning_rate": 3.6030536299496395e-05,
      "loss": 0.0155,
      "step": 2010
    },
    {
      "epoch": 3.692870201096892,
      "grad_norm": 0.251663476228714,
      "learning_rate": 3.5105782551704145e-05,
      "loss": 0.0222,
      "step": 2020
    },
    {
      "epoch": 3.7111517367458866,
      "grad_norm": 0.24097998440265656,
      "learning_rate": 3.419051784133773e-05,
      "loss": 0.0142,
      "step": 2030
    },
    {
      "epoch": 3.729433272394881,
      "grad_norm": 0.18417520821094513,
      "learning_rate": 3.328487600129371e-05,
      "loss": 0.0147,
      "step": 2040
    },
    {
      "epoch": 3.7477148080438756,
      "grad_norm": 0.18106205761432648,
      "learning_rate": 3.2388989457382126e-05,
      "loss": 0.0125,
      "step": 2050
    },
    {
      "epoch": 3.76599634369287,
      "grad_norm": 0.14622414112091064,
      "learning_rate": 3.1502989208962855e-05,
      "loss": 0.0151,
      "step": 2060
    },
    {
      "epoch": 3.7842778793418645,
      "grad_norm": 0.29628556966781616,
      "learning_rate": 3.062700480979046e-05,
      "loss": 0.0206,
      "step": 2070
    },
    {
      "epoch": 3.802559414990859,
      "grad_norm": 0.26881730556488037,
      "learning_rate": 2.9761164349070315e-05,
      "loss": 0.0176,
      "step": 2080
    },
    {
      "epoch": 3.8208409506398535,
      "grad_norm": 0.4180646240711212,
      "learning_rate": 2.8905594432729055e-05,
      "loss": 0.0179,
      "step": 2090
    },
    {
      "epoch": 3.839122486288848,
      "grad_norm": 0.25500163435935974,
      "learning_rate": 2.8060420164902012e-05,
      "loss": 0.0142,
      "step": 2100
    },
    {
      "epoch": 3.857404021937843,
      "grad_norm": 0.21968974173069,
      "learning_rate": 2.7225765129639836e-05,
      "loss": 0.0161,
      "step": 2110
    },
    {
      "epoch": 3.8756855575868374,
      "grad_norm": 0.24668078124523163,
      "learning_rate": 2.6401751372837813e-05,
      "loss": 0.0217,
      "step": 2120
    },
    {
      "epoch": 3.893967093235832,
      "grad_norm": 0.2258848249912262,
      "learning_rate": 2.5588499384389865e-05,
      "loss": 0.0178,
      "step": 2130
    },
    {
      "epoch": 3.9122486288848264,
      "grad_norm": 0.1784961074590683,
      "learning_rate": 2.478612808057018e-05,
      "loss": 0.0114,
      "step": 2140
    },
    {
      "epoch": 3.930530164533821,
      "grad_norm": 0.28832298517227173,
      "learning_rate": 2.3994754786644923e-05,
      "loss": 0.0109,
      "step": 2150
    },
    {
      "epoch": 3.9488117001828154,
      "grad_norm": 0.12029292434453964,
      "learning_rate": 2.3214495219716436e-05,
      "loss": 0.0211,
      "step": 2160
    },
    {
      "epoch": 3.96709323583181,
      "grad_norm": 0.19231897592544556,
      "learning_rate": 2.2445463471802785e-05,
      "loss": 0.0098,
      "step": 2170
    },
    {
      "epoch": 3.9853747714808043,
      "grad_norm": 0.08982887864112854,
      "learning_rate": 2.1687771993155004e-05,
      "loss": 0.0077,
      "step": 2180
    },
    {
      "epoch": 4.003656307129799,
      "grad_norm": 0.21466206014156342,
      "learning_rate": 2.0941531575813988e-05,
      "loss": 0.0159,
      "step": 2190
    },
    {
      "epoch": 4.021937842778794,
      "grad_norm": 0.17917244136333466,
      "learning_rate": 2.0206851337410415e-05,
      "loss": 0.0139,
      "step": 2200
    },
    {
      "epoch": 4.040219378427788,
      "grad_norm": 0.08883915841579437,
      "learning_rate": 1.9483838705209012e-05,
      "loss": 0.0152,
      "step": 2210
    },
    {
      "epoch": 4.058500914076783,
      "grad_norm": 0.16674405336380005,
      "learning_rate": 1.8772599400400258e-05,
      "loss": 0.0196,
      "step": 2220
    },
    {
      "epoch": 4.076782449725777,
      "grad_norm": 0.10342701524496078,
      "learning_rate": 1.807323742264162e-05,
      "loss": 0.0161,
      "step": 2230
    },
    {
      "epoch": 4.095063985374772,
      "grad_norm": 0.1896440088748932,
      "learning_rate": 1.7385855034850184e-05,
      "loss": 0.0122,
      "step": 2240
    },
    {
      "epoch": 4.113345521023766,
      "grad_norm": 0.16498374938964844,
      "learning_rate": 1.6710552748249598e-05,
      "loss": 0.0133,
      "step": 2250
    },
    {
      "epoch": 4.131627056672761,
      "grad_norm": 0.17953291535377502,
      "learning_rate": 1.604742930767298e-05,
      "loss": 0.0219,
      "step": 2260
    },
    {
      "epoch": 4.149908592321755,
      "grad_norm": 0.18694134056568146,
      "learning_rate": 1.5396581677124124e-05,
      "loss": 0.0169,
      "step": 2270
    },
    {
      "epoch": 4.16819012797075,
      "grad_norm": 0.17348328232765198,
      "learning_rate": 1.4758105025599068e-05,
      "loss": 0.0159,
      "step": 2280
    },
    {
      "epoch": 4.186471663619744,
      "grad_norm": 0.16517849266529083,
      "learning_rate": 1.4132092713170242e-05,
      "loss": 0.0137,
      "step": 2290
    },
    {
      "epoch": 4.204753199268739,
      "grad_norm": 0.13645470142364502,
      "learning_rate": 1.3518636277335084e-05,
      "loss": 0.0149,
      "step": 2300
    },
    {
      "epoch": 4.223034734917733,
      "grad_norm": 0.14027458429336548,
      "learning_rate": 1.291782541963107e-05,
      "loss": 0.0147,
      "step": 2310
    },
    {
      "epoch": 4.2413162705667276,
      "grad_norm": 0.11632464081048965,
      "learning_rate": 1.2329747992519269e-05,
      "loss": 0.0137,
      "step": 2320
    },
    {
      "epoch": 4.259597806215722,
      "grad_norm": 0.2426212579011917,
      "learning_rate": 1.1754489986538419e-05,
      "loss": 0.0117,
      "step": 2330
    },
    {
      "epoch": 4.2778793418647165,
      "grad_norm": 0.20155277848243713,
      "learning_rate": 1.1192135517730884e-05,
      "loss": 0.0147,
      "step": 2340
    },
    {
      "epoch": 4.296160877513711,
      "grad_norm": 0.14590322971343994,
      "learning_rate": 1.0642766815343196e-05,
      "loss": 0.0119,
      "step": 2350
    },
    {
      "epoch": 4.3144424131627055,
      "grad_norm": 0.17194287478923798,
      "learning_rate": 1.0106464209802013e-05,
      "loss": 0.0115,
      "step": 2360
    },
    {
      "epoch": 4.3327239488117,
      "grad_norm": 0.243038609623909,
      "learning_rate": 9.583306120968072e-06,
      "loss": 0.0153,
      "step": 2370
    },
    {
      "epoch": 4.3510054844606945,
      "grad_norm": 0.29729005694389343,
      "learning_rate": 9.0733690466694e-06,
      "loss": 0.0136,
      "step": 2380
    },
    {
      "epoch": 4.369287020109689,
      "grad_norm": 0.1595577597618103,
      "learning_rate": 8.576727551515474e-06,
      "loss": 0.0156,
      "step": 2390
    },
    {
      "epoch": 4.387568555758683,
      "grad_norm": 0.12783007323741913,
      "learning_rate": 8.093454255994248e-06,
      "loss": 0.0122,
      "step": 2400
    },
    {
      "epoch": 4.405850091407678,
      "grad_norm": 0.26608356833457947,
      "learning_rate": 7.6236198258532675e-06,
      "loss": 0.0136,
      "step": 2410
    },
    {
      "epoch": 4.424131627056672,
      "grad_norm": 0.1889527142047882,
      "learning_rate": 7.167292961766725e-06,
      "loss": 0.015,
      "step": 2420
    },
    {
      "epoch": 4.442413162705667,
      "grad_norm": 0.2580418884754181,
      "learning_rate": 6.724540389289913e-06,
      "loss": 0.0132,
      "step": 2430
    },
    {
      "epoch": 4.460694698354661,
      "grad_norm": 0.22082190215587616,
      "learning_rate": 6.295426849102271e-06,
      "loss": 0.0113,
      "step": 2440
    },
    {
      "epoch": 4.478976234003657,
      "grad_norm": 0.11176195740699768,
      "learning_rate": 5.8800150875408574e-06,
      "loss": 0.0141,
      "step": 2450
    },
    {
      "epoch": 4.497257769652651,
      "grad_norm": 0.1779015064239502,
      "learning_rate": 5.478365847425449e-06,
      "loss": 0.0113,
      "step": 2460
    },
    {
      "epoch": 4.515539305301646,
      "grad_norm": 0.15661382675170898,
      "learning_rate": 5.090537859176425e-06,
      "loss": 0.0102,
      "step": 2470
    },
    {
      "epoch": 4.53382084095064,
      "grad_norm": 0.21932142972946167,
      "learning_rate": 4.716587832227071e-06,
      "loss": 0.0147,
      "step": 2480
    },
    {
      "epoch": 4.552102376599635,
      "grad_norm": 0.30200353264808655,
      "learning_rate": 4.356570446731356e-06,
      "loss": 0.0152,
      "step": 2490
    },
    {
      "epoch": 4.570383912248629,
      "grad_norm": 0.11431296914815903,
      "learning_rate": 4.010538345568371e-06,
      "loss": 0.017,
      "step": 2500
    },
    {
      "epoch": 4.588665447897624,
      "grad_norm": 0.2187824845314026,
      "learning_rate": 3.678542126644813e-06,
      "loss": 0.0168,
      "step": 2510
    },
    {
      "epoch": 4.606946983546618,
      "grad_norm": 0.12425347417593002,
      "learning_rate": 3.360630335496362e-06,
      "loss": 0.0113,
      "step": 2520
    },
    {
      "epoch": 4.625228519195613,
      "grad_norm": 0.17450736463069916,
      "learning_rate": 3.056849458189115e-06,
      "loss": 0.015,
      "step": 2530
    },
    {
      "epoch": 4.643510054844607,
      "grad_norm": 0.2220509946346283,
      "learning_rate": 2.7672439145223773e-06,
      "loss": 0.0196,
      "step": 2540
    },
    {
      "epoch": 4.661791590493602,
      "grad_norm": 0.2917903959751129,
      "learning_rate": 2.491856051533392e-06,
      "loss": 0.0165,
      "step": 2550
    },
    {
      "epoch": 4.680073126142596,
      "grad_norm": 0.22880949079990387,
      "learning_rate": 2.230726137305206e-06,
      "loss": 0.0165,
      "step": 2560
    },
    {
      "epoch": 4.698354661791591,
      "grad_norm": 0.2307160645723343,
      "learning_rate": 1.983892355078587e-06,
      "loss": 0.0129,
      "step": 2570
    },
    {
      "epoch": 4.716636197440585,
      "grad_norm": 0.1975175142288208,
      "learning_rate": 1.7513907976687283e-06,
      "loss": 0.016,
      "step": 2580
    },
    {
      "epoch": 4.7349177330895795,
      "grad_norm": 0.23436793684959412,
      "learning_rate": 1.533255462187666e-06,
      "loss": 0.0108,
      "step": 2590
    },
    {
      "epoch": 4.753199268738574,
      "grad_norm": 0.14805355668067932,
      "learning_rate": 1.329518245073047e-06,
      "loss": 0.0182,
      "step": 2600
    },
    {
      "epoch": 4.7714808043875685,
      "grad_norm": 0.1988326609134674,
      "learning_rate": 1.1402089374242365e-06,
      "loss": 0.0119,
      "step": 2610
    },
    {
      "epoch": 4.789762340036563,
      "grad_norm": 0.12207505851984024,
      "learning_rate": 9.65355220646036e-07,
      "loss": 0.0128,
      "step": 2620
    },
    {
      "epoch": 4.8080438756855575,
      "grad_norm": 0.1775001883506775,
      "learning_rate": 8.049826624011881e-07,
      "loss": 0.0166,
      "step": 2630
    },
    {
      "epoch": 4.826325411334552,
      "grad_norm": 0.2577812075614929,
      "learning_rate": 6.591147128716224e-07,
      "loss": 0.0191,
      "step": 2640
    },
    {
      "epoch": 4.844606946983546,
      "grad_norm": 0.1870380938053131,
      "learning_rate": 5.277727013296097e-07,
      "loss": 0.0125,
      "step": 2650
    },
    {
      "epoch": 4.862888482632541,
      "grad_norm": 0.22090613842010498,
      "learning_rate": 4.1097583301888954e-07,
      "loss": 0.009,
      "step": 2660
    },
    {
      "epoch": 4.881170018281535,
      "grad_norm": 0.25381821393966675,
      "learning_rate": 3.0874118634640626e-07,
      "loss": 0.0158,
      "step": 2670
    },
    {
      "epoch": 4.89945155393053,
      "grad_norm": 0.25577688217163086,
      "learning_rate": 2.210837103850949e-07,
      "loss": 0.0074,
      "step": 2680
    },
    {
      "epoch": 4.917733089579524,
      "grad_norm": 0.1378592997789383,
      "learning_rate": 1.4801622268791892e-07,
      "loss": 0.0104,
      "step": 2690
    },
    {
      "epoch": 4.936014625228519,
      "grad_norm": 0.1672651469707489,
      "learning_rate": 8.954940741369155e-08,
      "loss": 0.0126,
      "step": 2700
    },
    {
      "epoch": 4.954296160877513,
      "grad_norm": 0.10131768137216568,
      "learning_rate": 4.5691813764803247e-08,
      "loss": 0.0093,
      "step": 2710
    },
    {
      "epoch": 4.972577696526509,
      "grad_norm": 0.19686748087406158,
      "learning_rate": 1.644985473709948e-08,
      "loss": 0.0132,
      "step": 2720
    },
    {
      "epoch": 4.990859232175502,
      "grad_norm": 0.16079658269882202,
      "learning_rate": 1.8278061821863646e-09,
      "loss": 0.0096,
      "step": 2730
    },
    {
      "epoch": 5.0,
      "step": 2735,
      "total_flos": 9.752547304210464e+16,
      "train_loss": 0.045771664261164136,
      "train_runtime": 1237.486,
      "train_samples_per_second": 35.362,
      "train_steps_per_second": 2.21
    }
  ],
  "logging_steps": 10,
  "max_steps": 2735,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.752547304210464e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}