{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.4995098039215686, "eval_steps": 500, "global_step": 5099, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004901960784313725, "grad_norm": 4.5476274490356445, "learning_rate": 7.84313725490196e-06, "loss": 0.6369, "step": 10 }, { "epoch": 0.00980392156862745, "grad_norm": 2.52302885055542, "learning_rate": 1.568627450980392e-05, "loss": 0.484, "step": 20 }, { "epoch": 0.014705882352941176, "grad_norm": 3.1443543434143066, "learning_rate": 2.3529411764705884e-05, "loss": 0.3252, "step": 30 }, { "epoch": 0.0196078431372549, "grad_norm": 2.1440389156341553, "learning_rate": 3.137254901960784e-05, "loss": 0.2779, "step": 40 }, { "epoch": 0.024509803921568627, "grad_norm": 1.7569645643234253, "learning_rate": 3.9215686274509805e-05, "loss": 0.2387, "step": 50 }, { "epoch": 0.029411764705882353, "grad_norm": 1.7137173414230347, "learning_rate": 4.705882352941177e-05, "loss": 0.2146, "step": 60 }, { "epoch": 0.03431372549019608, "grad_norm": 1.0686582326889038, "learning_rate": 5.490196078431373e-05, "loss": 0.1638, "step": 70 }, { "epoch": 0.0392156862745098, "grad_norm": 1.834192156791687, "learning_rate": 6.274509803921569e-05, "loss": 0.1594, "step": 80 }, { "epoch": 0.04411764705882353, "grad_norm": 2.2320666313171387, "learning_rate": 7.058823529411765e-05, "loss": 0.156, "step": 90 }, { "epoch": 0.049019607843137254, "grad_norm": 1.2987866401672363, "learning_rate": 7.843137254901961e-05, "loss": 0.1447, "step": 100 }, { "epoch": 0.05392156862745098, "grad_norm": 1.5711545944213867, "learning_rate": 8.627450980392158e-05, "loss": 0.1449, "step": 110 }, { "epoch": 0.058823529411764705, "grad_norm": 0.8892576098442078, "learning_rate": 9.411764705882353e-05, "loss": 0.1513, "step": 120 }, { "epoch": 0.06372549019607843, "grad_norm": 1.1401337385177612, "learning_rate": 0.00010196078431372549, "loss": 0.1128, "step": 130 }, { "epoch": 0.06862745098039216, "grad_norm": 1.375543475151062, "learning_rate": 0.00010980392156862746, "loss": 0.1186, "step": 140 }, { "epoch": 0.07352941176470588, "grad_norm": 1.6076676845550537, "learning_rate": 0.00011764705882352942, "loss": 0.1396, "step": 150 }, { "epoch": 0.0784313725490196, "grad_norm": 1.2637161016464233, "learning_rate": 0.00012549019607843137, "loss": 0.1187, "step": 160 }, { "epoch": 0.08333333333333333, "grad_norm": 0.5812987685203552, "learning_rate": 0.00013333333333333334, "loss": 0.1155, "step": 170 }, { "epoch": 0.08823529411764706, "grad_norm": 0.7302483916282654, "learning_rate": 0.0001411764705882353, "loss": 0.1068, "step": 180 }, { "epoch": 0.09313725490196079, "grad_norm": 0.6003187894821167, "learning_rate": 0.00014901960784313728, "loss": 0.1084, "step": 190 }, { "epoch": 0.09803921568627451, "grad_norm": 1.3157514333724976, "learning_rate": 0.00015686274509803922, "loss": 0.1127, "step": 200 }, { "epoch": 0.10294117647058823, "grad_norm": 0.8480639457702637, "learning_rate": 0.0001647058823529412, "loss": 0.1025, "step": 210 }, { "epoch": 0.10784313725490197, "grad_norm": 1.0640238523483276, "learning_rate": 0.00017254901960784316, "loss": 0.1214, "step": 220 }, { "epoch": 0.11274509803921569, "grad_norm": 0.7853420972824097, "learning_rate": 0.0001803921568627451, "loss": 0.1103, "step": 230 }, { "epoch": 0.11764705882352941, "grad_norm": 0.760675847530365, "learning_rate": 0.00018823529411764707, "loss": 0.1071, "step": 240 }, { "epoch": 0.12254901960784313, "grad_norm": 1.1404098272323608, "learning_rate": 0.000196078431372549, "loss": 0.1004, "step": 250 }, { "epoch": 0.12745098039215685, "grad_norm": 0.6359620690345764, "learning_rate": 0.0001999994742235753, "loss": 0.1065, "step": 260 }, { "epoch": 0.1323529411764706, "grad_norm": 0.7933241724967957, "learning_rate": 0.00019999526804535039, "loss": 0.0943, "step": 270 }, { "epoch": 0.13725490196078433, "grad_norm": 1.599077582359314, "learning_rate": 0.00019998685586582082, "loss": 0.1304, "step": 280 }, { "epoch": 0.14215686274509803, "grad_norm": 0.8844221234321594, "learning_rate": 0.00019997423803881975, "loss": 0.0917, "step": 290 }, { "epoch": 0.14705882352941177, "grad_norm": 1.2456647157669067, "learning_rate": 0.00019995741509507825, "loss": 0.111, "step": 300 }, { "epoch": 0.15196078431372548, "grad_norm": 0.6590626239776611, "learning_rate": 0.00019993638774220307, "loss": 0.1022, "step": 310 }, { "epoch": 0.1568627450980392, "grad_norm": 0.7061448693275452, "learning_rate": 0.00019991115686464675, "loss": 0.0938, "step": 320 }, { "epoch": 0.16176470588235295, "grad_norm": 1.0512727499008179, "learning_rate": 0.00019988172352367056, "loss": 0.1059, "step": 330 }, { "epoch": 0.16666666666666666, "grad_norm": 0.6884363889694214, "learning_rate": 0.00019984808895729978, "loss": 0.0801, "step": 340 }, { "epoch": 0.1715686274509804, "grad_norm": 0.8961064219474792, "learning_rate": 0.00019981025458027169, "loss": 0.0872, "step": 350 }, { "epoch": 0.17647058823529413, "grad_norm": 0.7410668730735779, "learning_rate": 0.00019976822198397595, "loss": 0.0935, "step": 360 }, { "epoch": 0.18137254901960784, "grad_norm": 0.8089532256126404, "learning_rate": 0.00019972199293638777, "loss": 0.0806, "step": 370 }, { "epoch": 0.18627450980392157, "grad_norm": 0.6644020676612854, "learning_rate": 0.00019967156938199355, "loss": 0.0885, "step": 380 }, { "epoch": 0.19117647058823528, "grad_norm": 0.8422799110412598, "learning_rate": 0.00019961695344170895, "loss": 0.0952, "step": 390 }, { "epoch": 0.19607843137254902, "grad_norm": 0.8162615299224854, "learning_rate": 0.00019955814741278986, "loss": 0.0802, "step": 400 }, { "epoch": 0.20098039215686275, "grad_norm": 0.7302709221839905, "learning_rate": 0.0001994951537687357, "loss": 0.0884, "step": 410 }, { "epoch": 0.20588235294117646, "grad_norm": 0.7032344937324524, "learning_rate": 0.00019942797515918527, "loss": 0.0896, "step": 420 }, { "epoch": 0.2107843137254902, "grad_norm": 0.8042428493499756, "learning_rate": 0.00019935661440980554, "loss": 0.0811, "step": 430 }, { "epoch": 0.21568627450980393, "grad_norm": 0.6656658053398132, "learning_rate": 0.00019928107452217255, "loss": 0.0856, "step": 440 }, { "epoch": 0.22058823529411764, "grad_norm": 1.2202825546264648, "learning_rate": 0.00019920135867364534, "loss": 0.0895, "step": 450 }, { "epoch": 0.22549019607843138, "grad_norm": 0.8210168480873108, "learning_rate": 0.00019911747021723216, "loss": 0.0807, "step": 460 }, { "epoch": 0.23039215686274508, "grad_norm": 0.7217456102371216, "learning_rate": 0.0001990294126814496, "loss": 0.0814, "step": 470 }, { "epoch": 0.23529411764705882, "grad_norm": 0.743126392364502, "learning_rate": 0.00019893718977017402, "loss": 0.0887, "step": 480 }, { "epoch": 0.24019607843137256, "grad_norm": 0.7190248370170593, "learning_rate": 0.00019884080536248578, "loss": 0.0859, "step": 490 }, { "epoch": 0.24509803921568626, "grad_norm": 0.5253967046737671, "learning_rate": 0.00019874026351250623, "loss": 0.0678, "step": 500 }, { "epoch": 0.25, "grad_norm": 0.8271141052246094, "learning_rate": 0.00019863556844922696, "loss": 0.0762, "step": 510 }, { "epoch": 0.2549019607843137, "grad_norm": 0.7656545639038086, "learning_rate": 0.0001985267245763321, "loss": 0.0724, "step": 520 }, { "epoch": 0.25980392156862747, "grad_norm": 0.6673869490623474, "learning_rate": 0.00019841373647201297, "loss": 0.0817, "step": 530 }, { "epoch": 0.2647058823529412, "grad_norm": 0.880395770072937, "learning_rate": 0.00019829660888877565, "loss": 0.0897, "step": 540 }, { "epoch": 0.2696078431372549, "grad_norm": 0.7278539538383484, "learning_rate": 0.00019817534675324093, "loss": 0.0808, "step": 550 }, { "epoch": 0.27450980392156865, "grad_norm": 0.5380986928939819, "learning_rate": 0.00019804995516593712, "loss": 0.077, "step": 560 }, { "epoch": 0.27941176470588236, "grad_norm": 0.9306485652923584, "learning_rate": 0.00019792043940108564, "loss": 0.0883, "step": 570 }, { "epoch": 0.28431372549019607, "grad_norm": 0.9304268956184387, "learning_rate": 0.00019778680490637902, "loss": 0.0899, "step": 580 }, { "epoch": 0.28921568627450983, "grad_norm": 0.7331838607788086, "learning_rate": 0.00019764905730275184, "loss": 0.0709, "step": 590 }, { "epoch": 0.29411764705882354, "grad_norm": 0.47580260038375854, "learning_rate": 0.00019750720238414425, "loss": 0.0857, "step": 600 }, { "epoch": 0.29901960784313725, "grad_norm": 0.5752102732658386, "learning_rate": 0.0001973612461172583, "loss": 0.0838, "step": 610 }, { "epoch": 0.30392156862745096, "grad_norm": 0.4644894599914551, "learning_rate": 0.00019721119464130707, "loss": 0.0851, "step": 620 }, { "epoch": 0.3088235294117647, "grad_norm": 0.7036497592926025, "learning_rate": 0.00019705705426775616, "loss": 0.0741, "step": 630 }, { "epoch": 0.3137254901960784, "grad_norm": 0.499897301197052, "learning_rate": 0.0001968988314800585, "loss": 0.0718, "step": 640 }, { "epoch": 0.31862745098039214, "grad_norm": 0.7211794853210449, "learning_rate": 0.0001967365329333816, "loss": 0.0798, "step": 650 }, { "epoch": 0.3235294117647059, "grad_norm": 0.6176502108573914, "learning_rate": 0.0001965701654543274, "loss": 0.0695, "step": 660 }, { "epoch": 0.3284313725490196, "grad_norm": 0.7395395636558533, "learning_rate": 0.0001963997360406454, "loss": 0.0581, "step": 670 }, { "epoch": 0.3333333333333333, "grad_norm": 0.5304160714149475, "learning_rate": 0.00019622525186093818, "loss": 0.0826, "step": 680 }, { "epoch": 0.3382352941176471, "grad_norm": 0.46235349774360657, "learning_rate": 0.0001960467202543599, "loss": 0.056, "step": 690 }, { "epoch": 0.3431372549019608, "grad_norm": 0.5242049098014832, "learning_rate": 0.00019586414873030758, "loss": 0.0728, "step": 700 }, { "epoch": 0.3480392156862745, "grad_norm": 0.552486777305603, "learning_rate": 0.00019567754496810534, "loss": 0.0806, "step": 710 }, { "epoch": 0.35294117647058826, "grad_norm": 0.5002785325050354, "learning_rate": 0.0001954869168166812, "loss": 0.0643, "step": 720 }, { "epoch": 0.35784313725490197, "grad_norm": 0.47353097796440125, "learning_rate": 0.00019529227229423717, "loss": 0.0838, "step": 730 }, { "epoch": 0.3627450980392157, "grad_norm": 0.4200286865234375, "learning_rate": 0.00019509361958791174, "loss": 0.0776, "step": 740 }, { "epoch": 0.36764705882352944, "grad_norm": 0.6603316068649292, "learning_rate": 0.00019489096705343578, "loss": 0.0705, "step": 750 }, { "epoch": 0.37254901960784315, "grad_norm": 0.37562692165374756, "learning_rate": 0.0001946843232147809, "loss": 0.072, "step": 760 }, { "epoch": 0.37745098039215685, "grad_norm": 0.6199838519096375, "learning_rate": 0.0001944736967638009, "loss": 0.0649, "step": 770 }, { "epoch": 0.38235294117647056, "grad_norm": 0.7614375948905945, "learning_rate": 0.0001942590965598663, "loss": 0.0735, "step": 780 }, { "epoch": 0.3872549019607843, "grad_norm": 0.671489953994751, "learning_rate": 0.00019404053162949155, "loss": 0.065, "step": 790 }, { "epoch": 0.39215686274509803, "grad_norm": 0.5170246362686157, "learning_rate": 0.0001938180111659556, "loss": 0.078, "step": 800 }, { "epoch": 0.39705882352941174, "grad_norm": 0.5392031073570251, "learning_rate": 0.00019359154452891483, "loss": 0.063, "step": 810 }, { "epoch": 0.4019607843137255, "grad_norm": 0.6858069896697998, "learning_rate": 0.00019336114124400978, "loss": 0.0783, "step": 820 }, { "epoch": 0.4068627450980392, "grad_norm": 0.7257099151611328, "learning_rate": 0.0001931268110024642, "loss": 0.0798, "step": 830 }, { "epoch": 0.4117647058823529, "grad_norm": 0.7296270132064819, "learning_rate": 0.00019288856366067746, "loss": 0.0619, "step": 840 }, { "epoch": 0.4166666666666667, "grad_norm": 0.6048017740249634, "learning_rate": 0.0001926464092398101, "loss": 0.0634, "step": 850 }, { "epoch": 0.4215686274509804, "grad_norm": 0.3223126232624054, "learning_rate": 0.00019240035792536216, "loss": 0.0755, "step": 860 }, { "epoch": 0.4264705882352941, "grad_norm": 0.45046138763427734, "learning_rate": 0.0001921504200667449, "loss": 0.0661, "step": 870 }, { "epoch": 0.43137254901960786, "grad_norm": 0.609027624130249, "learning_rate": 0.00019189660617684537, "loss": 0.0711, "step": 880 }, { "epoch": 0.4362745098039216, "grad_norm": 0.4166688323020935, "learning_rate": 0.00019163892693158425, "loss": 0.0644, "step": 890 }, { "epoch": 0.4411764705882353, "grad_norm": 0.6250641345977783, "learning_rate": 0.00019137739316946685, "loss": 0.0674, "step": 900 }, { "epoch": 0.44607843137254904, "grad_norm": 0.6781248450279236, "learning_rate": 0.00019111201589112718, "loss": 0.0657, "step": 910 }, { "epoch": 0.45098039215686275, "grad_norm": 0.9098891615867615, "learning_rate": 0.00019084280625886516, "loss": 0.0765, "step": 920 }, { "epoch": 0.45588235294117646, "grad_norm": 0.5926252603530884, "learning_rate": 0.00019056977559617731, "loss": 0.0896, "step": 930 }, { "epoch": 0.46078431372549017, "grad_norm": 0.6467915773391724, "learning_rate": 0.0001902929353872803, "loss": 0.0595, "step": 940 }, { "epoch": 0.46568627450980393, "grad_norm": 0.4950433671474457, "learning_rate": 0.0001900122972766279, "loss": 0.0651, "step": 950 }, { "epoch": 0.47058823529411764, "grad_norm": 0.6317784190177917, "learning_rate": 0.0001897278730684213, "loss": 0.08, "step": 960 }, { "epoch": 0.47549019607843135, "grad_norm": 0.47558578848838806, "learning_rate": 0.0001894396747261125, "loss": 0.0622, "step": 970 }, { "epoch": 0.4803921568627451, "grad_norm": 0.5610472559928894, "learning_rate": 0.0001891477143719012, "loss": 0.0667, "step": 980 }, { "epoch": 0.4852941176470588, "grad_norm": 0.7227151989936829, "learning_rate": 0.00018885200428622474, "loss": 0.0648, "step": 990 }, { "epoch": 0.49019607843137253, "grad_norm": 0.49453797936439514, "learning_rate": 0.0001885525569072418, "loss": 0.0663, "step": 1000 }, { "epoch": 0.4950980392156863, "grad_norm": 0.4297734200954437, "learning_rate": 0.000188249384830309, "loss": 0.0779, "step": 1010 }, { "epoch": 0.5, "grad_norm": 0.39416739344596863, "learning_rate": 0.00018794250080745136, "loss": 0.0577, "step": 1020 }, { "epoch": 0.5049019607843137, "grad_norm": 0.6955050230026245, "learning_rate": 0.0001876319177468256, "loss": 0.0579, "step": 1030 }, { "epoch": 0.5098039215686274, "grad_norm": 0.5533928871154785, "learning_rate": 0.00018731764871217753, "loss": 0.0583, "step": 1040 }, { "epoch": 0.5147058823529411, "grad_norm": 0.4718644618988037, "learning_rate": 0.00018699970692229233, "loss": 0.0609, "step": 1050 }, { "epoch": 0.5196078431372549, "grad_norm": 0.39921796321868896, "learning_rate": 0.00018667810575043864, "loss": 0.0612, "step": 1060 }, { "epoch": 0.5245098039215687, "grad_norm": 0.34913963079452515, "learning_rate": 0.0001863528587238061, "loss": 0.0522, "step": 1070 }, { "epoch": 0.5294117647058824, "grad_norm": 0.5829554796218872, "learning_rate": 0.00018602397952293618, "loss": 0.0651, "step": 1080 }, { "epoch": 0.5343137254901961, "grad_norm": 0.7142338156700134, "learning_rate": 0.00018569148198114695, "loss": 0.0643, "step": 1090 }, { "epoch": 0.5392156862745098, "grad_norm": 0.24581728875637054, "learning_rate": 0.00018535538008395124, "loss": 0.0537, "step": 1100 }, { "epoch": 0.5441176470588235, "grad_norm": 0.41139382123947144, "learning_rate": 0.0001850156879684681, "loss": 0.0631, "step": 1110 }, { "epoch": 0.5490196078431373, "grad_norm": 0.4532317519187927, "learning_rate": 0.00018467241992282843, "loss": 0.0573, "step": 1120 }, { "epoch": 0.553921568627451, "grad_norm": 0.45865532755851746, "learning_rate": 0.00018432559038557397, "loss": 0.053, "step": 1130 }, { "epoch": 0.5588235294117647, "grad_norm": 0.3976840078830719, "learning_rate": 0.00018397521394504995, "loss": 0.0529, "step": 1140 }, { "epoch": 0.5637254901960784, "grad_norm": 0.47105035185813904, "learning_rate": 0.00018362130533879133, "loss": 0.0671, "step": 1150 }, { "epoch": 0.5686274509803921, "grad_norm": 0.5433268547058105, "learning_rate": 0.00018326387945290313, "loss": 0.0529, "step": 1160 }, { "epoch": 0.5735294117647058, "grad_norm": 0.6220820546150208, "learning_rate": 0.00018290295132143415, "loss": 0.0697, "step": 1170 }, { "epoch": 0.5784313725490197, "grad_norm": 0.38451075553894043, "learning_rate": 0.00018253853612574473, "loss": 0.0621, "step": 1180 }, { "epoch": 0.5833333333333334, "grad_norm": 0.49342110753059387, "learning_rate": 0.00018217064919386807, "loss": 0.0603, "step": 1190 }, { "epoch": 0.5882352941176471, "grad_norm": 0.5240128636360168, "learning_rate": 0.00018179930599986554, "loss": 0.0614, "step": 1200 }, { "epoch": 0.5931372549019608, "grad_norm": 0.5015797019004822, "learning_rate": 0.0001814245221631758, "loss": 0.0651, "step": 1210 }, { "epoch": 0.5980392156862745, "grad_norm": 0.7029892802238464, "learning_rate": 0.0001810463134479579, "loss": 0.0598, "step": 1220 }, { "epoch": 0.6029411764705882, "grad_norm": 0.3569225072860718, "learning_rate": 0.00018066469576242806, "loss": 0.0479, "step": 1230 }, { "epoch": 0.6078431372549019, "grad_norm": 0.4940333068370819, "learning_rate": 0.00018027968515819072, "loss": 0.055, "step": 1240 }, { "epoch": 0.6127450980392157, "grad_norm": 0.5233299732208252, "learning_rate": 0.00017989129782956323, "loss": 0.0555, "step": 1250 }, { "epoch": 0.6176470588235294, "grad_norm": 0.35107848048210144, "learning_rate": 0.00017949955011289465, "loss": 0.0472, "step": 1260 }, { "epoch": 0.6225490196078431, "grad_norm": 0.42003870010375977, "learning_rate": 0.00017910445848587885, "loss": 0.0454, "step": 1270 }, { "epoch": 0.6274509803921569, "grad_norm": 0.24233393371105194, "learning_rate": 0.00017870603956686117, "loss": 0.0631, "step": 1280 }, { "epoch": 0.6323529411764706, "grad_norm": 0.5557372570037842, "learning_rate": 0.0001783043101141395, "loss": 0.0628, "step": 1290 }, { "epoch": 0.6372549019607843, "grad_norm": 0.33980950713157654, "learning_rate": 0.00017789928702525952, "loss": 0.0591, "step": 1300 }, { "epoch": 0.6421568627450981, "grad_norm": 0.2716699242591858, "learning_rate": 0.00017749098733630368, "loss": 0.0584, "step": 1310 }, { "epoch": 0.6470588235294118, "grad_norm": 0.42181700468063354, "learning_rate": 0.00017707942822117495, "loss": 0.0572, "step": 1320 }, { "epoch": 0.6519607843137255, "grad_norm": 0.46250826120376587, "learning_rate": 0.00017666462699087422, "loss": 0.0614, "step": 1330 }, { "epoch": 0.6568627450980392, "grad_norm": 0.7147281169891357, "learning_rate": 0.00017624660109277223, "loss": 0.0666, "step": 1340 }, { "epoch": 0.6617647058823529, "grad_norm": 0.6105577945709229, "learning_rate": 0.00017582536810987576, "loss": 0.0508, "step": 1350 }, { "epoch": 0.6666666666666666, "grad_norm": 0.5564696788787842, "learning_rate": 0.00017540094576008796, "loss": 0.0581, "step": 1360 }, { "epoch": 0.6715686274509803, "grad_norm": 0.4955359399318695, "learning_rate": 0.00017497335189546308, "loss": 0.0569, "step": 1370 }, { "epoch": 0.6764705882352942, "grad_norm": 0.40812528133392334, "learning_rate": 0.0001745426045014558, "loss": 0.065, "step": 1380 }, { "epoch": 0.6813725490196079, "grad_norm": 0.31670013070106506, "learning_rate": 0.00017410872169616447, "loss": 0.0632, "step": 1390 }, { "epoch": 0.6862745098039216, "grad_norm": 0.5076479911804199, "learning_rate": 0.00017367172172956906, "loss": 0.0558, "step": 1400 }, { "epoch": 0.6911764705882353, "grad_norm": 0.5511890053749084, "learning_rate": 0.0001732316229827637, "loss": 0.0669, "step": 1410 }, { "epoch": 0.696078431372549, "grad_norm": 0.32073989510536194, "learning_rate": 0.00017278844396718336, "loss": 0.0543, "step": 1420 }, { "epoch": 0.7009803921568627, "grad_norm": 0.5955519080162048, "learning_rate": 0.00017234220332382528, "loss": 0.0594, "step": 1430 }, { "epoch": 0.7058823529411765, "grad_norm": 0.5735410451889038, "learning_rate": 0.00017189291982246493, "loss": 0.0498, "step": 1440 }, { "epoch": 0.7107843137254902, "grad_norm": 0.4336249530315399, "learning_rate": 0.0001714406123608665, "loss": 0.0577, "step": 1450 }, { "epoch": 0.7156862745098039, "grad_norm": 0.2760525047779083, "learning_rate": 0.00017098529996398796, "loss": 0.05, "step": 1460 }, { "epoch": 0.7205882352941176, "grad_norm": 0.466795951128006, "learning_rate": 0.00017052700178318088, "loss": 0.0435, "step": 1470 }, { "epoch": 0.7254901960784313, "grad_norm": 0.2780659794807434, "learning_rate": 0.00017006573709538492, "loss": 0.0516, "step": 1480 }, { "epoch": 0.7303921568627451, "grad_norm": 0.31002551317214966, "learning_rate": 0.00016960152530231696, "loss": 0.0494, "step": 1490 }, { "epoch": 0.7352941176470589, "grad_norm": 0.4070112407207489, "learning_rate": 0.00016913438592965497, "loss": 0.0594, "step": 1500 }, { "epoch": 0.7401960784313726, "grad_norm": 0.5858240127563477, "learning_rate": 0.00016866433862621692, "loss": 0.0421, "step": 1510 }, { "epoch": 0.7450980392156863, "grad_norm": 0.2468300759792328, "learning_rate": 0.00016819140316313397, "loss": 0.0499, "step": 1520 }, { "epoch": 0.75, "grad_norm": 0.4881417155265808, "learning_rate": 0.00016771559943301926, "loss": 0.0557, "step": 1530 }, { "epoch": 0.7549019607843137, "grad_norm": 0.428586483001709, "learning_rate": 0.00016723694744913087, "loss": 0.0547, "step": 1540 }, { "epoch": 0.7598039215686274, "grad_norm": 0.4626677334308624, "learning_rate": 0.0001667554673445302, "loss": 0.0577, "step": 1550 }, { "epoch": 0.7647058823529411, "grad_norm": 0.44555196166038513, "learning_rate": 0.000166271179371235, "loss": 0.0498, "step": 1560 }, { "epoch": 0.7696078431372549, "grad_norm": 0.4426194131374359, "learning_rate": 0.0001657841038993677, "loss": 0.0491, "step": 1570 }, { "epoch": 0.7745098039215687, "grad_norm": 0.3198953866958618, "learning_rate": 0.00016529426141629843, "loss": 0.0472, "step": 1580 }, { "epoch": 0.7794117647058824, "grad_norm": 0.2759292423725128, "learning_rate": 0.0001648016725257834, "loss": 0.0508, "step": 1590 }, { "epoch": 0.7843137254901961, "grad_norm": 0.33216890692710876, "learning_rate": 0.00016430635794709817, "loss": 0.0516, "step": 1600 }, { "epoch": 0.7892156862745098, "grad_norm": 0.40735071897506714, "learning_rate": 0.0001638083385141662, "loss": 0.0549, "step": 1610 }, { "epoch": 0.7941176470588235, "grad_norm": 0.3739156424999237, "learning_rate": 0.0001633076351746827, "loss": 0.0543, "step": 1620 }, { "epoch": 0.7990196078431373, "grad_norm": 0.2865970730781555, "learning_rate": 0.0001628042689892331, "loss": 0.0557, "step": 1630 }, { "epoch": 0.803921568627451, "grad_norm": 0.6009504795074463, "learning_rate": 0.00016229826113040767, "loss": 0.0481, "step": 1640 }, { "epoch": 0.8088235294117647, "grad_norm": 0.2973253130912781, "learning_rate": 0.00016178963288191072, "loss": 0.0465, "step": 1650 }, { "epoch": 0.8137254901960784, "grad_norm": 0.36205539107322693, "learning_rate": 0.00016127840563766527, "loss": 0.0676, "step": 1660 }, { "epoch": 0.8186274509803921, "grad_norm": 0.41221827268600464, "learning_rate": 0.0001607646009009135, "loss": 0.0544, "step": 1670 }, { "epoch": 0.8235294117647058, "grad_norm": 0.7448700666427612, "learning_rate": 0.00016024824028331195, "loss": 0.0544, "step": 1680 }, { "epoch": 0.8284313725490197, "grad_norm": 0.5625671744346619, "learning_rate": 0.0001597293455040227, "loss": 0.0659, "step": 1690 }, { "epoch": 0.8333333333333334, "grad_norm": 0.5039217472076416, "learning_rate": 0.00015920793838879966, "loss": 0.0522, "step": 1700 }, { "epoch": 0.8382352941176471, "grad_norm": 0.6583822965621948, "learning_rate": 0.00015868404086907077, "loss": 0.0473, "step": 1710 }, { "epoch": 0.8431372549019608, "grad_norm": 0.4799802899360657, "learning_rate": 0.00015815767498101522, "loss": 0.0502, "step": 1720 }, { "epoch": 0.8480392156862745, "grad_norm": 0.38270649313926697, "learning_rate": 0.00015762886286463683, "loss": 0.0488, "step": 1730 }, { "epoch": 0.8529411764705882, "grad_norm": 0.5844431519508362, "learning_rate": 0.0001570976267628326, "loss": 0.0506, "step": 1740 }, { "epoch": 0.8578431372549019, "grad_norm": 0.3698306977748871, "learning_rate": 0.00015656398902045727, "loss": 0.0474, "step": 1750 }, { "epoch": 0.8627450980392157, "grad_norm": 0.4788702428340912, "learning_rate": 0.00015602797208338337, "loss": 0.0452, "step": 1760 }, { "epoch": 0.8676470588235294, "grad_norm": 0.47195127606391907, "learning_rate": 0.00015548959849755715, "loss": 0.0497, "step": 1770 }, { "epoch": 0.8725490196078431, "grad_norm": 0.3890342712402344, "learning_rate": 0.00015494889090805018, "loss": 0.0466, "step": 1780 }, { "epoch": 0.8774509803921569, "grad_norm": 0.24695785343647003, "learning_rate": 0.00015440587205810692, "loss": 0.0525, "step": 1790 }, { "epoch": 0.8823529411764706, "grad_norm": 0.41040509939193726, "learning_rate": 0.00015386056478818814, "loss": 0.0556, "step": 1800 }, { "epoch": 0.8872549019607843, "grad_norm": 0.5128910541534424, "learning_rate": 0.00015331299203501, "loss": 0.0485, "step": 1810 }, { "epoch": 0.8921568627450981, "grad_norm": 0.3270062804222107, "learning_rate": 0.0001527631768305796, "loss": 0.0423, "step": 1820 }, { "epoch": 0.8970588235294118, "grad_norm": 0.43017369508743286, "learning_rate": 0.00015221114230122584, "loss": 0.0461, "step": 1830 }, { "epoch": 0.9019607843137255, "grad_norm": 0.501888632774353, "learning_rate": 0.00015165691166662705, "loss": 0.0472, "step": 1840 }, { "epoch": 0.9068627450980392, "grad_norm": 0.286864310503006, "learning_rate": 0.00015110050823883406, "loss": 0.0418, "step": 1850 }, { "epoch": 0.9117647058823529, "grad_norm": 0.5109400153160095, "learning_rate": 0.00015054195542128968, "loss": 0.0426, "step": 1860 }, { "epoch": 0.9166666666666666, "grad_norm": 0.32924002408981323, "learning_rate": 0.00014998127670784448, "loss": 0.0389, "step": 1870 }, { "epoch": 0.9215686274509803, "grad_norm": 0.21681684255599976, "learning_rate": 0.0001494184956817684, "loss": 0.0487, "step": 1880 }, { "epoch": 0.9264705882352942, "grad_norm": 0.20006486773490906, "learning_rate": 0.00014885363601475888, "loss": 0.0521, "step": 1890 }, { "epoch": 0.9313725490196079, "grad_norm": 0.6207095980644226, "learning_rate": 0.00014828672146594511, "loss": 0.0542, "step": 1900 }, { "epoch": 0.9362745098039216, "grad_norm": 0.5559191703796387, "learning_rate": 0.00014771777588088884, "loss": 0.0446, "step": 1910 }, { "epoch": 0.9411764705882353, "grad_norm": 0.3139231503009796, "learning_rate": 0.00014714682319058112, "loss": 0.0403, "step": 1920 }, { "epoch": 0.946078431372549, "grad_norm": 0.3132689893245697, "learning_rate": 0.00014657388741043606, "loss": 0.0398, "step": 1930 }, { "epoch": 0.9509803921568627, "grad_norm": 0.2745281457901001, "learning_rate": 0.00014599899263928028, "loss": 0.0358, "step": 1940 }, { "epoch": 0.9558823529411765, "grad_norm": 0.2483411729335785, "learning_rate": 0.00014542216305833968, "loss": 0.0506, "step": 1950 }, { "epoch": 0.9607843137254902, "grad_norm": 0.3583033084869385, "learning_rate": 0.000144843422930222, "loss": 0.0442, "step": 1960 }, { "epoch": 0.9656862745098039, "grad_norm": 0.2526845932006836, "learning_rate": 0.00014426279659789651, "loss": 0.0458, "step": 1970 }, { "epoch": 0.9705882352941176, "grad_norm": 0.3186612129211426, "learning_rate": 0.00014368030848367, "loss": 0.052, "step": 1980 }, { "epoch": 0.9754901960784313, "grad_norm": 0.3088064193725586, "learning_rate": 0.00014309598308815945, "loss": 0.0453, "step": 1990 }, { "epoch": 0.9803921568627451, "grad_norm": 0.44008663296699524, "learning_rate": 0.00014250984498926167, "loss": 0.0449, "step": 2000 }, { "epoch": 0.9852941176470589, "grad_norm": 0.5169370174407959, "learning_rate": 0.0001419219188411194, "loss": 0.0411, "step": 2010 }, { "epoch": 0.9901960784313726, "grad_norm": 0.32191041111946106, "learning_rate": 0.0001413322293730842, "loss": 0.0406, "step": 2020 }, { "epoch": 0.9950980392156863, "grad_norm": 0.300047904253006, "learning_rate": 0.00014074080138867654, "loss": 0.0432, "step": 2030 }, { "epoch": 1.0, "grad_norm": 0.6571453809738159, "learning_rate": 0.00014014765976454231, "loss": 0.0421, "step": 2040 }, { "epoch": 1.0049019607843137, "grad_norm": 0.43792489171028137, "learning_rate": 0.00013955282944940652, "loss": 0.0389, "step": 2050 }, { "epoch": 1.0098039215686274, "grad_norm": 0.40551769733428955, "learning_rate": 0.0001389563354630239, "loss": 0.0418, "step": 2060 }, { "epoch": 1.0147058823529411, "grad_norm": 0.3299430012702942, "learning_rate": 0.0001383582028951265, "loss": 0.0465, "step": 2070 }, { "epoch": 1.0196078431372548, "grad_norm": 0.2800670266151428, "learning_rate": 0.00013775845690436848, "loss": 0.0443, "step": 2080 }, { "epoch": 1.0245098039215685, "grad_norm": 0.49185529351234436, "learning_rate": 0.00013715712271726772, "loss": 0.0415, "step": 2090 }, { "epoch": 1.0294117647058822, "grad_norm": 0.6726065278053284, "learning_rate": 0.0001365542256271448, "loss": 0.038, "step": 2100 }, { "epoch": 1.0343137254901962, "grad_norm": 0.5443005561828613, "learning_rate": 0.00013594979099305928, "loss": 0.0407, "step": 2110 }, { "epoch": 1.0392156862745099, "grad_norm": 0.3882359564304352, "learning_rate": 0.00013534384423874272, "loss": 0.0479, "step": 2120 }, { "epoch": 1.0441176470588236, "grad_norm": 0.3460078239440918, "learning_rate": 0.00013473641085152957, "loss": 0.0472, "step": 2130 }, { "epoch": 1.0490196078431373, "grad_norm": 0.3264780342578888, "learning_rate": 0.00013412751638128503, "loss": 0.0374, "step": 2140 }, { "epoch": 1.053921568627451, "grad_norm": 0.28663188219070435, "learning_rate": 0.0001335171864393304, "loss": 0.0386, "step": 2150 }, { "epoch": 1.0588235294117647, "grad_norm": 0.22994215786457062, "learning_rate": 0.00013290544669736576, "loss": 0.0492, "step": 2160 }, { "epoch": 1.0637254901960784, "grad_norm": 0.2018628716468811, "learning_rate": 0.0001322923228863902, "loss": 0.0367, "step": 2170 }, { "epoch": 1.0686274509803921, "grad_norm": 0.20426690578460693, "learning_rate": 0.0001316778407956196, "loss": 0.0322, "step": 2180 }, { "epoch": 1.0735294117647058, "grad_norm": 0.32902026176452637, "learning_rate": 0.00013106202627140163, "loss": 0.0321, "step": 2190 }, { "epoch": 1.0784313725490196, "grad_norm": 0.3210899233818054, "learning_rate": 0.00013044490521612904, "loss": 0.0405, "step": 2200 }, { "epoch": 1.0833333333333333, "grad_norm": 0.23476989567279816, "learning_rate": 0.00012982650358714967, "loss": 0.0416, "step": 2210 }, { "epoch": 1.088235294117647, "grad_norm": 0.3046343922615051, "learning_rate": 0.000129206847395675, "loss": 0.0394, "step": 2220 }, { "epoch": 1.093137254901961, "grad_norm": 0.27315208315849304, "learning_rate": 0.0001285859627056858, "loss": 0.0439, "step": 2230 }, { "epoch": 1.0980392156862746, "grad_norm": 0.34966346621513367, "learning_rate": 0.00012796387563283605, "loss": 0.0387, "step": 2240 }, { "epoch": 1.1029411764705883, "grad_norm": 0.24096551537513733, "learning_rate": 0.00012734061234335434, "loss": 0.0412, "step": 2250 }, { "epoch": 1.107843137254902, "grad_norm": 0.26461055874824524, "learning_rate": 0.00012671619905294326, "loss": 0.0494, "step": 2260 }, { "epoch": 1.1127450980392157, "grad_norm": 0.24669981002807617, "learning_rate": 0.0001260906620256767, "loss": 0.0396, "step": 2270 }, { "epoch": 1.1176470588235294, "grad_norm": 0.37667712569236755, "learning_rate": 0.00012546402757289532, "loss": 0.0426, "step": 2280 }, { "epoch": 1.1225490196078431, "grad_norm": 0.3720461130142212, "learning_rate": 0.00012483632205209953, "loss": 0.042, "step": 2290 }, { "epoch": 1.1274509803921569, "grad_norm": 0.3975540101528168, "learning_rate": 0.0001242075718658411, "loss": 0.0464, "step": 2300 }, { "epoch": 1.1323529411764706, "grad_norm": 0.34910279512405396, "learning_rate": 0.00012357780346061256, "loss": 0.0412, "step": 2310 }, { "epoch": 1.1372549019607843, "grad_norm": 0.34874585270881653, "learning_rate": 0.00012294704332573462, "loss": 0.0458, "step": 2320 }, { "epoch": 1.142156862745098, "grad_norm": 0.24936801195144653, "learning_rate": 0.0001223153179922423, "loss": 0.0437, "step": 2330 }, { "epoch": 1.1470588235294117, "grad_norm": 0.39299267530441284, "learning_rate": 0.00012168265403176864, "loss": 0.0419, "step": 2340 }, { "epoch": 1.1519607843137254, "grad_norm": 0.44494786858558655, "learning_rate": 0.0001210490780554274, "loss": 0.0391, "step": 2350 }, { "epoch": 1.156862745098039, "grad_norm": 0.26526331901550293, "learning_rate": 0.00012041461671269337, "loss": 0.0338, "step": 2360 }, { "epoch": 1.161764705882353, "grad_norm": 0.4189550280570984, "learning_rate": 0.00011977929669028174, "loss": 0.0441, "step": 2370 }, { "epoch": 1.1666666666666667, "grad_norm": 0.2872006595134735, "learning_rate": 0.00011914314471102545, "loss": 0.0427, "step": 2380 }, { "epoch": 1.1715686274509804, "grad_norm": 0.2823413908481598, "learning_rate": 0.0001185061875327512, "loss": 0.0443, "step": 2390 }, { "epoch": 1.1764705882352942, "grad_norm": 0.26944294571876526, "learning_rate": 0.00011786845194715403, "loss": 0.0387, "step": 2400 }, { "epoch": 1.1813725490196079, "grad_norm": 0.3672547936439514, "learning_rate": 0.00011722996477867026, "loss": 0.0397, "step": 2410 }, { "epoch": 1.1862745098039216, "grad_norm": 0.2237931340932846, "learning_rate": 0.00011659075288334938, "loss": 0.0444, "step": 2420 }, { "epoch": 1.1911764705882353, "grad_norm": 0.2790263295173645, "learning_rate": 0.00011595084314772429, "loss": 0.0358, "step": 2430 }, { "epoch": 1.196078431372549, "grad_norm": 0.2825845777988434, "learning_rate": 0.00011531026248768048, "loss": 0.0368, "step": 2440 }, { "epoch": 1.2009803921568627, "grad_norm": 0.27560582756996155, "learning_rate": 0.00011466903784732381, "loss": 0.0474, "step": 2450 }, { "epoch": 1.2058823529411764, "grad_norm": 0.4237360656261444, "learning_rate": 0.00011402719619784734, "loss": 0.0375, "step": 2460 }, { "epoch": 1.2107843137254901, "grad_norm": 0.3769036531448364, "learning_rate": 0.00011338476453639666, "loss": 0.0308, "step": 2470 }, { "epoch": 1.215686274509804, "grad_norm": 0.32824084162712097, "learning_rate": 0.00011274176988493454, "loss": 0.0386, "step": 2480 }, { "epoch": 1.2205882352941178, "grad_norm": 0.30048561096191406, "learning_rate": 0.0001120982392891042, "loss": 0.0391, "step": 2490 }, { "epoch": 1.2254901960784315, "grad_norm": 0.30690962076187134, "learning_rate": 0.00011145419981709169, "loss": 0.0443, "step": 2500 }, { "epoch": 1.2303921568627452, "grad_norm": 0.4496728479862213, "learning_rate": 0.00011080967855848755, "loss": 0.0447, "step": 2510 }, { "epoch": 1.2352941176470589, "grad_norm": 0.25722137093544006, "learning_rate": 0.00011016470262314707, "loss": 0.0333, "step": 2520 }, { "epoch": 1.2401960784313726, "grad_norm": 0.32415884733200073, "learning_rate": 0.00010951929914005033, "loss": 0.0375, "step": 2530 }, { "epoch": 1.2450980392156863, "grad_norm": 0.33738696575164795, "learning_rate": 0.00010887349525616075, "loss": 0.0408, "step": 2540 }, { "epoch": 1.25, "grad_norm": 0.3490372896194458, "learning_rate": 0.00010822731813528354, "loss": 0.0337, "step": 2550 }, { "epoch": 1.2549019607843137, "grad_norm": 0.42863723635673523, "learning_rate": 0.00010758079495692294, "loss": 0.0442, "step": 2560 }, { "epoch": 1.2598039215686274, "grad_norm": 0.29293495416641235, "learning_rate": 0.00010693395291513908, "loss": 0.0408, "step": 2570 }, { "epoch": 1.2647058823529411, "grad_norm": 0.27024197578430176, "learning_rate": 0.00010628681921740414, "loss": 0.0377, "step": 2580 }, { "epoch": 1.2696078431372548, "grad_norm": 0.35880324244499207, "learning_rate": 0.00010563942108345785, "loss": 0.0364, "step": 2590 }, { "epoch": 1.2745098039215685, "grad_norm": 0.3253026306629181, "learning_rate": 0.0001049917857441628, "loss": 0.0374, "step": 2600 }, { "epoch": 1.2794117647058822, "grad_norm": 0.33250367641448975, "learning_rate": 0.00010434394044035878, "loss": 0.0384, "step": 2610 }, { "epoch": 1.284313725490196, "grad_norm": 0.34700486063957214, "learning_rate": 0.00010369591242171719, "loss": 0.0369, "step": 2620 }, { "epoch": 1.2892156862745099, "grad_norm": 0.3287598788738251, "learning_rate": 0.00010304772894559475, "loss": 0.0419, "step": 2630 }, { "epoch": 1.2941176470588236, "grad_norm": 0.3363092243671417, "learning_rate": 0.00010239941727588707, "loss": 0.0419, "step": 2640 }, { "epoch": 1.2990196078431373, "grad_norm": 0.32619622349739075, "learning_rate": 0.0001017510046818817, "loss": 0.0353, "step": 2650 }, { "epoch": 1.303921568627451, "grad_norm": 0.28630563616752625, "learning_rate": 0.00010110251843711149, "loss": 0.0317, "step": 2660 }, { "epoch": 1.3088235294117647, "grad_norm": 0.470198392868042, "learning_rate": 0.00010045398581820702, "loss": 0.0397, "step": 2670 }, { "epoch": 1.3137254901960784, "grad_norm": 0.34590962529182434, "learning_rate": 9.98054341037495e-05, "loss": 0.0336, "step": 2680 }, { "epoch": 1.3186274509803921, "grad_norm": 0.18714579939842224, "learning_rate": 9.91568905731234e-05, "loss": 0.0268, "step": 2690 }, { "epoch": 1.3235294117647058, "grad_norm": 0.3055776357650757, "learning_rate": 9.850838250536885e-05, "loss": 0.0384, "step": 2700 }, { "epoch": 1.3284313725490196, "grad_norm": 0.33092889189720154, "learning_rate": 9.785993717803445e-05, "loss": 0.0323, "step": 2710 }, { "epoch": 1.3333333333333333, "grad_norm": 0.26138371229171753, "learning_rate": 9.721158186602979e-05, "loss": 0.0391, "step": 2720 }, { "epoch": 1.3382352941176472, "grad_norm": 0.2703782320022583, "learning_rate": 9.656334384047812e-05, "loss": 0.0268, "step": 2730 }, { "epoch": 1.343137254901961, "grad_norm": 0.24206914007663727, "learning_rate": 9.591525036756952e-05, "loss": 0.032, "step": 2740 }, { "epoch": 1.3480392156862746, "grad_norm": 0.3793281018733978, "learning_rate": 9.526732870741386e-05, "loss": 0.0399, "step": 2750 }, { "epoch": 1.3529411764705883, "grad_norm": 0.3143618404865265, "learning_rate": 9.46196061128942e-05, "loss": 0.0365, "step": 2760 }, { "epoch": 1.357843137254902, "grad_norm": 0.23540951311588287, "learning_rate": 9.397210982852053e-05, "loss": 0.0328, "step": 2770 }, { "epoch": 1.3627450980392157, "grad_norm": 0.2023368775844574, "learning_rate": 9.332486708928373e-05, "loss": 0.0316, "step": 2780 }, { "epoch": 1.3676470588235294, "grad_norm": 0.26689231395721436, "learning_rate": 9.267790511951015e-05, "loss": 0.0326, "step": 2790 }, { "epoch": 1.3725490196078431, "grad_norm": 0.2792396545410156, "learning_rate": 9.203125113171631e-05, "loss": 0.0336, "step": 2800 }, { "epoch": 1.3774509803921569, "grad_norm": 0.21045692265033722, "learning_rate": 9.13849323254645e-05, "loss": 0.0296, "step": 2810 }, { "epoch": 1.3823529411764706, "grad_norm": 0.26224854588508606, "learning_rate": 9.073897588621853e-05, "loss": 0.0311, "step": 2820 }, { "epoch": 1.3872549019607843, "grad_norm": 0.30219170451164246, "learning_rate": 9.009340898420029e-05, "loss": 0.0379, "step": 2830 }, { "epoch": 1.392156862745098, "grad_norm": 0.19660678505897522, "learning_rate": 8.944825877324708e-05, "loss": 0.035, "step": 2840 }, { "epoch": 1.3970588235294117, "grad_norm": 0.2348472774028778, "learning_rate": 8.880355238966923e-05, "loss": 0.0366, "step": 2850 }, { "epoch": 1.4019607843137254, "grad_norm": 0.44347622990608215, "learning_rate": 8.815931695110885e-05, "loss": 0.0333, "step": 2860 }, { "epoch": 1.406862745098039, "grad_norm": 0.34309887886047363, "learning_rate": 8.751557955539915e-05, "loss": 0.0394, "step": 2870 }, { "epoch": 1.4117647058823528, "grad_norm": 0.3023855984210968, "learning_rate": 8.687236727942465e-05, "loss": 0.0308, "step": 2880 }, { "epoch": 1.4166666666666667, "grad_norm": 0.25127673149108887, "learning_rate": 8.622970717798227e-05, "loss": 0.0384, "step": 2890 }, { "epoch": 1.4215686274509804, "grad_norm": 0.17014305293560028, "learning_rate": 8.558762628264345e-05, "loss": 0.0331, "step": 2900 }, { "epoch": 1.4264705882352942, "grad_norm": 0.32725852727890015, "learning_rate": 8.494615160061694e-05, "loss": 0.0326, "step": 2910 }, { "epoch": 1.4313725490196079, "grad_norm": 0.2895604968070984, "learning_rate": 8.430531011361298e-05, "loss": 0.0319, "step": 2920 }, { "epoch": 1.4362745098039216, "grad_norm": 0.3882890045642853, "learning_rate": 8.366512877670842e-05, "loss": 0.0331, "step": 2930 }, { "epoch": 1.4411764705882353, "grad_norm": 0.27492624521255493, "learning_rate": 8.302563451721282e-05, "loss": 0.0384, "step": 2940 }, { "epoch": 1.446078431372549, "grad_norm": 0.23316094279289246, "learning_rate": 8.238685423353588e-05, "loss": 0.0426, "step": 2950 }, { "epoch": 1.4509803921568627, "grad_norm": 0.395353227853775, "learning_rate": 8.174881479405607e-05, "loss": 0.0347, "step": 2960 }, { "epoch": 1.4558823529411764, "grad_norm": 0.5179559588432312, "learning_rate": 8.111154303599049e-05, "loss": 0.0371, "step": 2970 }, { "epoch": 1.4607843137254901, "grad_norm": 0.22947251796722412, "learning_rate": 8.047506576426596e-05, "loss": 0.03, "step": 2980 }, { "epoch": 1.465686274509804, "grad_norm": 0.3831785023212433, "learning_rate": 7.983940975039166e-05, "loss": 0.0346, "step": 2990 }, { "epoch": 1.4705882352941178, "grad_norm": 0.34753093123435974, "learning_rate": 7.920460173133304e-05, "loss": 0.0464, "step": 3000 }, { "epoch": 1.4754901960784315, "grad_norm": 0.24026577174663544, "learning_rate": 7.85706684083871e-05, "loss": 0.0334, "step": 3010 }, { "epoch": 1.4803921568627452, "grad_norm": 0.32264992594718933, "learning_rate": 7.793763644605947e-05, "loss": 0.0329, "step": 3020 }, { "epoch": 1.4852941176470589, "grad_norm": 0.25292444229125977, "learning_rate": 7.730553247094266e-05, "loss": 0.0306, "step": 3030 }, { "epoch": 1.4901960784313726, "grad_norm": 0.21395047008991241, "learning_rate": 7.667438307059627e-05, "loss": 0.0346, "step": 3040 }, { "epoch": 1.4950980392156863, "grad_norm": 0.29967001080513, "learning_rate": 7.604421479242846e-05, "loss": 0.0403, "step": 3050 }, { "epoch": 1.5, "grad_norm": 0.2828430235385895, "learning_rate": 7.541505414257959e-05, "loss": 0.0327, "step": 3060 }, { "epoch": 1.5049019607843137, "grad_norm": 0.43027809262275696, "learning_rate": 7.478692758480698e-05, "loss": 0.0331, "step": 3070 }, { "epoch": 1.5098039215686274, "grad_norm": 0.34473538398742676, "learning_rate": 7.415986153937202e-05, "loss": 0.0444, "step": 3080 }, { "epoch": 1.5147058823529411, "grad_norm": 0.3717981278896332, "learning_rate": 7.353388238192892e-05, "loss": 0.042, "step": 3090 }, { "epoch": 1.5196078431372548, "grad_norm": 0.15127846598625183, "learning_rate": 7.29090164424151e-05, "loss": 0.0296, "step": 3100 }, { "epoch": 1.5245098039215685, "grad_norm": 0.33397403359413147, "learning_rate": 7.22852900039438e-05, "loss": 0.0314, "step": 3110 }, { "epoch": 1.5294117647058822, "grad_norm": 0.25943371653556824, "learning_rate": 7.166272930169861e-05, "loss": 0.0342, "step": 3120 }, { "epoch": 1.534313725490196, "grad_norm": 0.3120077848434448, "learning_rate": 7.104136052182992e-05, "loss": 0.0317, "step": 3130 }, { "epoch": 1.5392156862745097, "grad_norm": 0.24377594888210297, "learning_rate": 7.042120980035346e-05, "loss": 0.0284, "step": 3140 }, { "epoch": 1.5441176470588234, "grad_norm": 0.19070957601070404, "learning_rate": 6.980230322205099e-05, "loss": 0.0343, "step": 3150 }, { "epoch": 1.5490196078431373, "grad_norm": 0.22296807169914246, "learning_rate": 6.918466681937308e-05, "loss": 0.0299, "step": 3160 }, { "epoch": 1.553921568627451, "grad_norm": 0.2279416173696518, "learning_rate": 6.856832657134424e-05, "loss": 0.0333, "step": 3170 }, { "epoch": 1.5588235294117647, "grad_norm": 0.2242182493209839, "learning_rate": 6.795330840247006e-05, "loss": 0.0331, "step": 3180 }, { "epoch": 1.5637254901960784, "grad_norm": 0.2774062752723694, "learning_rate": 6.733963818164686e-05, "loss": 0.0266, "step": 3190 }, { "epoch": 1.5686274509803921, "grad_norm": 0.37312522530555725, "learning_rate": 6.672734172107354e-05, "loss": 0.0376, "step": 3200 }, { "epoch": 1.5735294117647058, "grad_norm": 0.23322941362857819, "learning_rate": 6.611644477516595e-05, "loss": 0.0282, "step": 3210 }, { "epoch": 1.5784313725490198, "grad_norm": 0.32735109329223633, "learning_rate": 6.550697303947345e-05, "loss": 0.0294, "step": 3220 }, { "epoch": 1.5833333333333335, "grad_norm": 0.21853038668632507, "learning_rate": 6.489895214959828e-05, "loss": 0.0259, "step": 3230 }, { "epoch": 1.5882352941176472, "grad_norm": 0.3016158640384674, "learning_rate": 6.429240768011719e-05, "loss": 0.028, "step": 3240 }, { "epoch": 1.593137254901961, "grad_norm": 0.20449745655059814, "learning_rate": 6.368736514350568e-05, "loss": 0.0303, "step": 3250 }, { "epoch": 1.5980392156862746, "grad_norm": 0.2439008206129074, "learning_rate": 6.308384998906506e-05, "loss": 0.027, "step": 3260 }, { "epoch": 1.6029411764705883, "grad_norm": 0.28377825021743774, "learning_rate": 6.248188760185173e-05, "loss": 0.0302, "step": 3270 }, { "epoch": 1.607843137254902, "grad_norm": 0.32138168811798096, "learning_rate": 6.188150330160971e-05, "loss": 0.0255, "step": 3280 }, { "epoch": 1.6127450980392157, "grad_norm": 0.23931661248207092, "learning_rate": 6.128272234170547e-05, "loss": 0.0284, "step": 3290 }, { "epoch": 1.6176470588235294, "grad_norm": 0.26550066471099854, "learning_rate": 6.068556990806579e-05, "loss": 0.039, "step": 3300 }, { "epoch": 1.6225490196078431, "grad_norm": 0.20831480622291565, "learning_rate": 6.0090071118118355e-05, "loss": 0.0248, "step": 3310 }, { "epoch": 1.6274509803921569, "grad_norm": 0.26879703998565674, "learning_rate": 5.949625101973527e-05, "loss": 0.0303, "step": 3320 }, { "epoch": 1.6323529411764706, "grad_norm": 0.34190261363983154, "learning_rate": 5.890413459017958e-05, "loss": 0.0296, "step": 3330 }, { "epoch": 1.6372549019607843, "grad_norm": 0.3451369106769562, "learning_rate": 5.8313746735054544e-05, "loss": 0.0274, "step": 3340 }, { "epoch": 1.642156862745098, "grad_norm": 0.2692447304725647, "learning_rate": 5.77251122872561e-05, "loss": 0.0303, "step": 3350 }, { "epoch": 1.6470588235294117, "grad_norm": 0.28057631850242615, "learning_rate": 5.713825600592841e-05, "loss": 0.0335, "step": 3360 }, { "epoch": 1.6519607843137254, "grad_norm": 0.20118731260299683, "learning_rate": 5.6553202575422385e-05, "loss": 0.0339, "step": 3370 }, { "epoch": 1.656862745098039, "grad_norm": 0.27384528517723083, "learning_rate": 5.596997660425746e-05, "loss": 0.0296, "step": 3380 }, { "epoch": 1.6617647058823528, "grad_norm": 0.27839264273643494, "learning_rate": 5.538860262408632e-05, "loss": 0.0306, "step": 3390 }, { "epoch": 1.6666666666666665, "grad_norm": 0.26187360286712646, "learning_rate": 5.480910508866333e-05, "loss": 0.0327, "step": 3400 }, { "epoch": 1.6715686274509802, "grad_norm": 0.16635389626026154, "learning_rate": 5.423150837281585e-05, "loss": 0.0268, "step": 3410 }, { "epoch": 1.6764705882352942, "grad_norm": 0.3123128414154053, "learning_rate": 5.365583677141883e-05, "loss": 0.0345, "step": 3420 }, { "epoch": 1.6813725490196079, "grad_norm": 0.30305176973342896, "learning_rate": 5.308211449837315e-05, "loss": 0.0264, "step": 3430 }, { "epoch": 1.6862745098039216, "grad_norm": 0.19436487555503845, "learning_rate": 5.2510365685587026e-05, "loss": 0.0318, "step": 3440 }, { "epoch": 1.6911764705882353, "grad_norm": 0.2048874795436859, "learning_rate": 5.1940614381961004e-05, "loss": 0.0296, "step": 3450 }, { "epoch": 1.696078431372549, "grad_norm": 0.32843217253685, "learning_rate": 5.137288455237627e-05, "loss": 0.0288, "step": 3460 }, { "epoch": 1.7009803921568627, "grad_norm": 0.26734060049057007, "learning_rate": 5.080720007668689e-05, "loss": 0.0309, "step": 3470 }, { "epoch": 1.7058823529411766, "grad_norm": 0.2050999402999878, "learning_rate": 5.0243584748715235e-05, "loss": 0.0283, "step": 3480 }, { "epoch": 1.7107843137254903, "grad_norm": 0.32800912857055664, "learning_rate": 4.968206227525111e-05, "loss": 0.0356, "step": 3490 }, { "epoch": 1.715686274509804, "grad_norm": 0.2212320864200592, "learning_rate": 4.912265627505468e-05, "loss": 0.0278, "step": 3500 }, { "epoch": 1.7205882352941178, "grad_norm": 0.22088485956192017, "learning_rate": 4.856539027786305e-05, "loss": 0.0315, "step": 3510 }, { "epoch": 1.7254901960784315, "grad_norm": 0.2626785635948181, "learning_rate": 4.8010287723400494e-05, "loss": 0.0395, "step": 3520 }, { "epoch": 1.7303921568627452, "grad_norm": 0.343022882938385, "learning_rate": 4.745737196039259e-05, "loss": 0.0235, "step": 3530 }, { "epoch": 1.7352941176470589, "grad_norm": 0.4321844279766083, "learning_rate": 4.6906666245583965e-05, "loss": 0.0393, "step": 3540 }, { "epoch": 1.7401960784313726, "grad_norm": 0.34605053067207336, "learning_rate": 4.6358193742760305e-05, "loss": 0.0214, "step": 3550 }, { "epoch": 1.7450980392156863, "grad_norm": 0.2320283055305481, "learning_rate": 4.5811977521773906e-05, "loss": 0.0331, "step": 3560 }, { "epoch": 1.75, "grad_norm": 0.21285474300384521, "learning_rate": 4.526804055757328e-05, "loss": 0.0333, "step": 3570 }, { "epoch": 1.7549019607843137, "grad_norm": 0.185288667678833, "learning_rate": 4.472640572923687e-05, "loss": 0.0269, "step": 3580 }, { "epoch": 1.7598039215686274, "grad_norm": 0.2527756989002228, "learning_rate": 4.4187095819010674e-05, "loss": 0.0296, "step": 3590 }, { "epoch": 1.7647058823529411, "grad_norm": 0.1697828769683838, "learning_rate": 4.365013351135001e-05, "loss": 0.0272, "step": 3600 }, { "epoch": 1.7696078431372548, "grad_norm": 0.215946227312088, "learning_rate": 4.311554139196522e-05, "loss": 0.0262, "step": 3610 }, { "epoch": 1.7745098039215685, "grad_norm": 0.41971561312675476, "learning_rate": 4.258334194687188e-05, "loss": 0.0282, "step": 3620 }, { "epoch": 1.7794117647058822, "grad_norm": 0.18381306529045105, "learning_rate": 4.205355756144489e-05, "loss": 0.0293, "step": 3630 }, { "epoch": 1.784313725490196, "grad_norm": 0.24697428941726685, "learning_rate": 4.152621051947682e-05, "loss": 0.0205, "step": 3640 }, { "epoch": 1.7892156862745097, "grad_norm": 0.23640835285186768, "learning_rate": 4.1001323002240754e-05, "loss": 0.0304, "step": 3650 }, { "epoch": 1.7941176470588234, "grad_norm": 0.21955536305904388, "learning_rate": 4.047891708755724e-05, "loss": 0.0281, "step": 3660 }, { "epoch": 1.7990196078431373, "grad_norm": 0.34594038128852844, "learning_rate": 3.995901474886568e-05, "loss": 0.0344, "step": 3670 }, { "epoch": 1.803921568627451, "grad_norm": 0.4010615646839142, "learning_rate": 3.944163785429992e-05, "loss": 0.0241, "step": 3680 }, { "epoch": 1.8088235294117647, "grad_norm": 0.24570715427398682, "learning_rate": 3.8926808165768715e-05, "loss": 0.0269, "step": 3690 }, { "epoch": 1.8137254901960784, "grad_norm": 0.2714114785194397, "learning_rate": 3.841454733804016e-05, "loss": 0.0287, "step": 3700 }, { "epoch": 1.8186274509803921, "grad_norm": 0.29776889085769653, "learning_rate": 3.790487691783099e-05, "loss": 0.0325, "step": 3710 }, { "epoch": 1.8235294117647058, "grad_norm": 0.20955297350883484, "learning_rate": 3.739781834290006e-05, "loss": 0.0319, "step": 3720 }, { "epoch": 1.8284313725490198, "grad_norm": 0.2985910475254059, "learning_rate": 3.689339294114692e-05, "loss": 0.0244, "step": 3730 }, { "epoch": 1.8333333333333335, "grad_norm": 0.33746784925460815, "learning_rate": 3.639162192971457e-05, "loss": 0.0272, "step": 3740 }, { "epoch": 1.8382352941176472, "grad_norm": 0.2537771761417389, "learning_rate": 3.5892526414096925e-05, "loss": 0.0317, "step": 3750 }, { "epoch": 1.843137254901961, "grad_norm": 0.30214208364486694, "learning_rate": 3.53961273872513e-05, "loss": 0.0207, "step": 3760 }, { "epoch": 1.8480392156862746, "grad_norm": 0.3083361089229584, "learning_rate": 3.490244572871524e-05, "loss": 0.0216, "step": 3770 }, { "epoch": 1.8529411764705883, "grad_norm": 0.24969086050987244, "learning_rate": 3.44115022037284e-05, "loss": 0.0254, "step": 3780 }, { "epoch": 1.857843137254902, "grad_norm": 0.2084352970123291, "learning_rate": 3.3923317462358905e-05, "loss": 0.0305, "step": 3790 }, { "epoch": 1.8627450980392157, "grad_norm": 0.21671414375305176, "learning_rate": 3.3437912038635056e-05, "loss": 0.0303, "step": 3800 }, { "epoch": 1.8676470588235294, "grad_norm": 0.2956879436969757, "learning_rate": 3.295530634968147e-05, "loss": 0.0298, "step": 3810 }, { "epoch": 1.8725490196078431, "grad_norm": 0.29368528723716736, "learning_rate": 3.24755206948602e-05, "loss": 0.0261, "step": 3820 }, { "epoch": 1.8774509803921569, "grad_norm": 0.13201627135276794, "learning_rate": 3.199857525491714e-05, "loss": 0.0217, "step": 3830 }, { "epoch": 1.8823529411764706, "grad_norm": 0.21656860411167145, "learning_rate": 3.1524490091133e-05, "loss": 0.0288, "step": 3840 }, { "epoch": 1.8872549019607843, "grad_norm": 0.20571519434452057, "learning_rate": 3.105328514447957e-05, "loss": 0.0254, "step": 3850 }, { "epoch": 1.892156862745098, "grad_norm": 0.2538784444332123, "learning_rate": 3.0584980234780916e-05, "loss": 0.0325, "step": 3860 }, { "epoch": 1.8970588235294117, "grad_norm": 0.282520592212677, "learning_rate": 3.0119595059879678e-05, "loss": 0.0292, "step": 3870 }, { "epoch": 1.9019607843137254, "grad_norm": 0.2727642357349396, "learning_rate": 2.965714919480872e-05, "loss": 0.0264, "step": 3880 }, { "epoch": 1.906862745098039, "grad_norm": 0.1925729215145111, "learning_rate": 2.9197662090967625e-05, "loss": 0.0282, "step": 3890 }, { "epoch": 1.9117647058823528, "grad_norm": 0.23374304175376892, "learning_rate": 2.8741153075304438e-05, "loss": 0.0266, "step": 3900 }, { "epoch": 1.9166666666666665, "grad_norm": 0.24565072357654572, "learning_rate": 2.828764134950297e-05, "loss": 0.0198, "step": 3910 }, { "epoch": 1.9215686274509802, "grad_norm": 0.1271701604127884, "learning_rate": 2.7837145989174974e-05, "loss": 0.0206, "step": 3920 }, { "epoch": 1.9264705882352942, "grad_norm": 0.25117677450180054, "learning_rate": 2.7389685943057852e-05, "loss": 0.0249, "step": 3930 }, { "epoch": 1.9313725490196079, "grad_norm": 0.33383405208587646, "learning_rate": 2.6945280032217535e-05, "loss": 0.0298, "step": 3940 }, { "epoch": 1.9362745098039216, "grad_norm": 0.22882990539073944, "learning_rate": 2.6503946949256974e-05, "loss": 0.0273, "step": 3950 }, { "epoch": 1.9411764705882353, "grad_norm": 0.2781018316745758, "learning_rate": 2.6065705257529848e-05, "loss": 0.0345, "step": 3960 }, { "epoch": 1.946078431372549, "grad_norm": 0.23163281381130219, "learning_rate": 2.5630573390359624e-05, "loss": 0.024, "step": 3970 }, { "epoch": 1.9509803921568627, "grad_norm": 0.30135810375213623, "learning_rate": 2.5198569650264403e-05, "loss": 0.0245, "step": 3980 }, { "epoch": 1.9558823529411766, "grad_norm": 0.2169611006975174, "learning_rate": 2.4769712208186967e-05, "loss": 0.0217, "step": 3990 }, { "epoch": 1.9607843137254903, "grad_norm": 0.2509106397628784, "learning_rate": 2.4344019102730542e-05, "loss": 0.0225, "step": 4000 }, { "epoch": 1.965686274509804, "grad_norm": 0.3170604407787323, "learning_rate": 2.3921508239399913e-05, "loss": 0.027, "step": 4010 }, { "epoch": 1.9705882352941178, "grad_norm": 0.1460844874382019, "learning_rate": 2.350219738984849e-05, "loss": 0.021, "step": 4020 }, { "epoch": 1.9754901960784315, "grad_norm": 0.27143651247024536, "learning_rate": 2.3086104191130643e-05, "loss": 0.0262, "step": 4030 }, { "epoch": 1.9803921568627452, "grad_norm": 0.2494950294494629, "learning_rate": 2.2673246144959935e-05, "loss": 0.0249, "step": 4040 }, { "epoch": 1.9852941176470589, "grad_norm": 0.26637178659439087, "learning_rate": 2.226364061697287e-05, "loss": 0.0325, "step": 4050 }, { "epoch": 1.9901960784313726, "grad_norm": 0.2871919274330139, "learning_rate": 2.185730483599856e-05, "loss": 0.0286, "step": 4060 }, { "epoch": 1.9950980392156863, "grad_norm": 0.2098357230424881, "learning_rate": 2.1454255893334064e-05, "loss": 0.0332, "step": 4070 }, { "epoch": 2.0, "grad_norm": 0.2278250902891159, "learning_rate": 2.10545107420253e-05, "loss": 0.0234, "step": 4080 }, { "epoch": 2.0049019607843137, "grad_norm": 0.266368567943573, "learning_rate": 2.0658086196154236e-05, "loss": 0.0226, "step": 4090 }, { "epoch": 2.0098039215686274, "grad_norm": 0.10192721337080002, "learning_rate": 2.026499893013144e-05, "loss": 0.0217, "step": 4100 }, { "epoch": 2.014705882352941, "grad_norm": 0.2021161913871765, "learning_rate": 1.9875265477994875e-05, "loss": 0.0219, "step": 4110 }, { "epoch": 2.019607843137255, "grad_norm": 0.18436592817306519, "learning_rate": 1.9488902232714267e-05, "loss": 0.0216, "step": 4120 }, { "epoch": 2.0245098039215685, "grad_norm": 0.18808206915855408, "learning_rate": 1.9105925445501794e-05, "loss": 0.0184, "step": 4130 }, { "epoch": 2.0294117647058822, "grad_norm": 0.2091018408536911, "learning_rate": 1.87263512251284e-05, "loss": 0.0237, "step": 4140 }, { "epoch": 2.034313725490196, "grad_norm": 0.29868388175964355, "learning_rate": 1.8350195537246184e-05, "loss": 0.0251, "step": 4150 }, { "epoch": 2.0392156862745097, "grad_norm": 0.15926848351955414, "learning_rate": 1.797747420371699e-05, "loss": 0.0214, "step": 4160 }, { "epoch": 2.0441176470588234, "grad_norm": 0.24016976356506348, "learning_rate": 1.7608202901946826e-05, "loss": 0.0206, "step": 4170 }, { "epoch": 2.049019607843137, "grad_norm": 0.18072175979614258, "learning_rate": 1.7242397164226452e-05, "loss": 0.0192, "step": 4180 }, { "epoch": 2.053921568627451, "grad_norm": 0.21760503947734833, "learning_rate": 1.6880072377078026e-05, "loss": 0.0237, "step": 4190 }, { "epoch": 2.0588235294117645, "grad_norm": 0.28834259510040283, "learning_rate": 1.6521243780607974e-05, "loss": 0.0185, "step": 4200 }, { "epoch": 2.063725490196078, "grad_norm": 0.14096632599830627, "learning_rate": 1.616592646786599e-05, "loss": 0.0184, "step": 4210 }, { "epoch": 2.0686274509803924, "grad_norm": 0.15555402636528015, "learning_rate": 1.5814135384210026e-05, "loss": 0.02, "step": 4220 }, { "epoch": 2.073529411764706, "grad_norm": 0.09052202850580215, "learning_rate": 1.5465885326677897e-05, "loss": 0.019, "step": 4230 }, { "epoch": 2.0784313725490198, "grad_norm": 0.1283917874097824, "learning_rate": 1.512119094336466e-05, "loss": 0.0194, "step": 4240 }, { "epoch": 2.0833333333333335, "grad_norm": 0.268288791179657, "learning_rate": 1.4780066732806663e-05, "loss": 0.024, "step": 4250 }, { "epoch": 2.088235294117647, "grad_norm": 0.1498635858297348, "learning_rate": 1.4442527043371622e-05, "loss": 0.0226, "step": 4260 }, { "epoch": 2.093137254901961, "grad_norm": 0.31013911962509155, "learning_rate": 1.4108586072655062e-05, "loss": 0.0198, "step": 4270 }, { "epoch": 2.0980392156862746, "grad_norm": 0.19611553847789764, "learning_rate": 1.377825786688326e-05, "loss": 0.029, "step": 4280 }, { "epoch": 2.1029411764705883, "grad_norm": 0.15280510485172272, "learning_rate": 1.3451556320322344e-05, "loss": 0.0305, "step": 4290 }, { "epoch": 2.107843137254902, "grad_norm": 0.1233508512377739, "learning_rate": 1.3128495174693833e-05, "loss": 0.0214, "step": 4300 }, { "epoch": 2.1127450980392157, "grad_norm": 0.23667648434638977, "learning_rate": 1.280908801859676e-05, "loss": 0.017, "step": 4310 }, { "epoch": 2.1176470588235294, "grad_norm": 0.2285485714673996, "learning_rate": 1.2493348286936013e-05, "loss": 0.019, "step": 4320 }, { "epoch": 2.122549019607843, "grad_norm": 0.17249974608421326, "learning_rate": 1.2181289260357265e-05, "loss": 0.0233, "step": 4330 }, { "epoch": 2.127450980392157, "grad_norm": 0.18264269828796387, "learning_rate": 1.1872924064688328e-05, "loss": 0.0176, "step": 4340 }, { "epoch": 2.1323529411764706, "grad_norm": 0.23498280346393585, "learning_rate": 1.1568265670387125e-05, "loss": 0.0216, "step": 4350 }, { "epoch": 2.1372549019607843, "grad_norm": 0.20697841048240662, "learning_rate": 1.12673268919961e-05, "loss": 0.0221, "step": 4360 }, { "epoch": 2.142156862745098, "grad_norm": 0.20883601903915405, "learning_rate": 1.0970120387603122e-05, "loss": 0.0211, "step": 4370 }, { "epoch": 2.1470588235294117, "grad_norm": 0.2525753676891327, "learning_rate": 1.0676658658309225e-05, "loss": 0.0182, "step": 4380 }, { "epoch": 2.1519607843137254, "grad_norm": 0.20986422896385193, "learning_rate": 1.0386954047702646e-05, "loss": 0.0222, "step": 4390 }, { "epoch": 2.156862745098039, "grad_norm": 0.18922965228557587, "learning_rate": 1.010101874133973e-05, "loss": 0.0215, "step": 4400 }, { "epoch": 2.161764705882353, "grad_norm": 0.23508426547050476, "learning_rate": 9.81886476623226e-06, "loss": 0.0158, "step": 4410 }, { "epoch": 2.1666666666666665, "grad_norm": 0.17150916159152985, "learning_rate": 9.540503990341743e-06, "loss": 0.0204, "step": 4420 }, { "epoch": 2.1715686274509802, "grad_norm": 0.12821370363235474, "learning_rate": 9.265948122080048e-06, "loss": 0.0162, "step": 4430 }, { "epoch": 2.176470588235294, "grad_norm": 0.14045512676239014, "learning_rate": 8.995208709817071e-06, "loss": 0.0231, "step": 4440 }, { "epoch": 2.1813725490196076, "grad_norm": 0.24774880707263947, "learning_rate": 8.728297141394858e-06, "loss": 0.0205, "step": 4450 }, { "epoch": 2.186274509803922, "grad_norm": 0.20708513259887695, "learning_rate": 8.465224643648728e-06, "loss": 0.0217, "step": 4460 }, { "epoch": 2.1911764705882355, "grad_norm": 0.1988290697336197, "learning_rate": 8.206002281934977e-06, "loss": 0.0181, "step": 4470 }, { "epoch": 2.196078431372549, "grad_norm": 0.24248534440994263, "learning_rate": 7.950640959665457e-06, "loss": 0.014, "step": 4480 }, { "epoch": 2.200980392156863, "grad_norm": 0.199833944439888, "learning_rate": 7.69915141784896e-06, "loss": 0.0177, "step": 4490 }, { "epoch": 2.2058823529411766, "grad_norm": 0.18880146741867065, "learning_rate": 7.451544234639473e-06, "loss": 0.0309, "step": 4500 }, { "epoch": 2.2107843137254903, "grad_norm": 0.19817057251930237, "learning_rate": 7.207829824891199e-06, "loss": 0.0135, "step": 4510 }, { "epoch": 2.215686274509804, "grad_norm": 0.22702986001968384, "learning_rate": 6.968018439720414e-06, "loss": 0.0242, "step": 4520 }, { "epoch": 2.2205882352941178, "grad_norm": 0.23108519613742828, "learning_rate": 6.732120166074441e-06, "loss": 0.0266, "step": 4530 }, { "epoch": 2.2254901960784315, "grad_norm": 0.2683473229408264, "learning_rate": 6.500144926307295e-06, "loss": 0.0138, "step": 4540 }, { "epoch": 2.230392156862745, "grad_norm": 0.3906085193157196, "learning_rate": 6.272102477762254e-06, "loss": 0.0257, "step": 4550 }, { "epoch": 2.235294117647059, "grad_norm": 0.16685569286346436, "learning_rate": 6.048002412361598e-06, "loss": 0.0197, "step": 4560 }, { "epoch": 2.2401960784313726, "grad_norm": 0.19442564249038696, "learning_rate": 5.827854156203017e-06, "loss": 0.0183, "step": 4570 }, { "epoch": 2.2450980392156863, "grad_norm": 0.22825686633586884, "learning_rate": 5.611666969163243e-06, "loss": 0.0192, "step": 4580 }, { "epoch": 2.25, "grad_norm": 0.13165044784545898, "learning_rate": 5.399449944508439e-06, "loss": 0.0181, "step": 4590 }, { "epoch": 2.2549019607843137, "grad_norm": 0.32770487666130066, "learning_rate": 5.1912120085118365e-06, "loss": 0.0194, "step": 4600 }, { "epoch": 2.2598039215686274, "grad_norm": 0.20789536833763123, "learning_rate": 4.986961920078204e-06, "loss": 0.0274, "step": 4610 }, { "epoch": 2.264705882352941, "grad_norm": 0.3107619881629944, "learning_rate": 4.786708270375462e-06, "loss": 0.0244, "step": 4620 }, { "epoch": 2.269607843137255, "grad_norm": 0.11234085261821747, "learning_rate": 4.590459482473286e-06, "loss": 0.0125, "step": 4630 }, { "epoch": 2.2745098039215685, "grad_norm": 0.2838551700115204, "learning_rate": 4.398223810988866e-06, "loss": 0.0209, "step": 4640 }, { "epoch": 2.2794117647058822, "grad_norm": 0.2697765529155731, "learning_rate": 4.2100093417396845e-06, "loss": 0.0245, "step": 4650 }, { "epoch": 2.284313725490196, "grad_norm": 0.1846655011177063, "learning_rate": 4.0258239914033765e-06, "loss": 0.0271, "step": 4660 }, { "epoch": 2.2892156862745097, "grad_norm": 0.20711906254291534, "learning_rate": 3.8456755071847765e-06, "loss": 0.0262, "step": 4670 }, { "epoch": 2.2941176470588234, "grad_norm": 0.23577173054218292, "learning_rate": 3.6695714664900293e-06, "loss": 0.0147, "step": 4680 }, { "epoch": 2.299019607843137, "grad_norm": 0.21056267619132996, "learning_rate": 3.49751927660793e-06, "loss": 0.0242, "step": 4690 }, { "epoch": 2.303921568627451, "grad_norm": 0.1903896927833557, "learning_rate": 3.329526174398223e-06, "loss": 0.0199, "step": 4700 }, { "epoch": 2.3088235294117645, "grad_norm": 0.1740642637014389, "learning_rate": 3.165599225987381e-06, "loss": 0.0241, "step": 4710 }, { "epoch": 2.313725490196078, "grad_norm": 0.23523923754692078, "learning_rate": 3.005745326471254e-06, "loss": 0.0196, "step": 4720 }, { "epoch": 2.318627450980392, "grad_norm": 0.17898762226104736, "learning_rate": 2.849971199625112e-06, "loss": 0.0178, "step": 4730 }, { "epoch": 2.323529411764706, "grad_norm": 0.18050484359264374, "learning_rate": 2.6982833976208043e-06, "loss": 0.0264, "step": 4740 }, { "epoch": 2.3284313725490198, "grad_norm": 0.14259324967861176, "learning_rate": 2.5506883007511695e-06, "loss": 0.0163, "step": 4750 }, { "epoch": 2.3333333333333335, "grad_norm": 0.25935426354408264, "learning_rate": 2.407192117161683e-06, "loss": 0.0215, "step": 4760 }, { "epoch": 2.338235294117647, "grad_norm": 0.24692723155021667, "learning_rate": 2.2678008825893106e-06, "loss": 0.0185, "step": 4770 }, { "epoch": 2.343137254901961, "grad_norm": 0.22854046523571014, "learning_rate": 2.1325204601086222e-06, "loss": 0.0235, "step": 4780 }, { "epoch": 2.3480392156862746, "grad_norm": 0.31760168075561523, "learning_rate": 2.001356539885213e-06, "loss": 0.0279, "step": 4790 }, { "epoch": 2.3529411764705883, "grad_norm": 0.19233286380767822, "learning_rate": 1.8743146389363474e-06, "loss": 0.0174, "step": 4800 }, { "epoch": 2.357843137254902, "grad_norm": 0.28652098774909973, "learning_rate": 1.7514001008988923e-06, "loss": 0.0193, "step": 4810 }, { "epoch": 2.3627450980392157, "grad_norm": 0.16845420002937317, "learning_rate": 1.6326180958045502e-06, "loss": 0.0177, "step": 4820 }, { "epoch": 2.3676470588235294, "grad_norm": 0.12303224951028824, "learning_rate": 1.517973619862445e-06, "loss": 0.0205, "step": 4830 }, { "epoch": 2.372549019607843, "grad_norm": 0.1442280411720276, "learning_rate": 1.4074714952489132e-06, "loss": 0.0215, "step": 4840 }, { "epoch": 2.377450980392157, "grad_norm": 0.18173760175704956, "learning_rate": 1.3011163699046758e-06, "loss": 0.0166, "step": 4850 }, { "epoch": 2.3823529411764706, "grad_norm": 0.2543584406375885, "learning_rate": 1.1989127173393955e-06, "loss": 0.0195, "step": 4860 }, { "epoch": 2.3872549019607843, "grad_norm": 0.14100182056427002, "learning_rate": 1.1008648364434493e-06, "loss": 0.0252, "step": 4870 }, { "epoch": 2.392156862745098, "grad_norm": 0.22381238639354706, "learning_rate": 1.0069768513071287e-06, "loss": 0.0184, "step": 4880 }, { "epoch": 2.3970588235294117, "grad_norm": 0.1522032916545868, "learning_rate": 9.172527110472007e-07, "loss": 0.0226, "step": 4890 }, { "epoch": 2.4019607843137254, "grad_norm": 0.151298388838768, "learning_rate": 8.316961896407293e-07, "loss": 0.0161, "step": 4900 }, { "epoch": 2.406862745098039, "grad_norm": 0.10205589234828949, "learning_rate": 7.503108857664476e-07, "loss": 0.0221, "step": 4910 }, { "epoch": 2.411764705882353, "grad_norm": 0.29145070910453796, "learning_rate": 6.731002226532557e-07, "loss": 0.0256, "step": 4920 }, { "epoch": 2.4166666666666665, "grad_norm": 0.11820173263549805, "learning_rate": 6.000674479363366e-07, "loss": 0.0192, "step": 4930 }, { "epoch": 2.4215686274509802, "grad_norm": 0.34024494886398315, "learning_rate": 5.312156335205098e-07, "loss": 0.0184, "step": 4940 }, { "epoch": 2.426470588235294, "grad_norm": 0.2437254935503006, "learning_rate": 4.665476754510234e-07, "loss": 0.0172, "step": 4950 }, { "epoch": 2.431372549019608, "grad_norm": 0.22836612164974213, "learning_rate": 4.0606629379175143e-07, "loss": 0.0236, "step": 4960 }, { "epoch": 2.436274509803922, "grad_norm": 0.30892133712768555, "learning_rate": 3.497740325107746e-07, "loss": 0.021, "step": 4970 }, { "epoch": 2.4411764705882355, "grad_norm": 0.1466980129480362, "learning_rate": 2.9767325937338775e-07, "loss": 0.0158, "step": 4980 }, { "epoch": 2.446078431372549, "grad_norm": 0.17119024693965912, "learning_rate": 2.497661658424688e-07, "loss": 0.0211, "step": 4990 }, { "epoch": 2.450980392156863, "grad_norm": 0.24735775589942932, "learning_rate": 2.0605476698636328e-07, "loss": 0.0198, "step": 5000 }, { "epoch": 2.4558823529411766, "grad_norm": 0.22227314114570618, "learning_rate": 1.6654090139408551e-07, "loss": 0.0222, "step": 5010 }, { "epoch": 2.4607843137254903, "grad_norm": 0.2925474941730499, "learning_rate": 1.3122623109795839e-07, "loss": 0.0212, "step": 5020 }, { "epoch": 2.465686274509804, "grad_norm": 0.11938751488924026, "learning_rate": 1.0011224150379139e-07, "loss": 0.0206, "step": 5030 }, { "epoch": 2.4705882352941178, "grad_norm": 0.18019632995128632, "learning_rate": 7.320024132829729e-08, "loss": 0.0175, "step": 5040 }, { "epoch": 2.4754901960784315, "grad_norm": 0.1969996839761734, "learning_rate": 5.049136254413611e-08, "loss": 0.0196, "step": 5050 }, { "epoch": 2.480392156862745, "grad_norm": 0.26969224214553833, "learning_rate": 3.1986560332242234e-08, "loss": 0.0159, "step": 5060 }, { "epoch": 2.485294117647059, "grad_norm": 0.24808961153030396, "learning_rate": 1.768661304166752e-08, "loss": 0.0235, "step": 5070 }, { "epoch": 2.4901960784313726, "grad_norm": 0.08070988208055496, "learning_rate": 7.592122156829806e-09, "loss": 0.0196, "step": 5080 }, { "epoch": 2.4950980392156863, "grad_norm": 0.3003765642642975, "learning_rate": 1.7035122722663943e-09, "loss": 0.0214, "step": 5090 }, { "epoch": 2.4995098039215686, "step": 5099, "total_flos": 1.0010895665363712e+17, "train_loss": 0.049077389520344766, "train_runtime": 1734.3659, "train_samples_per_second": 47.04, "train_steps_per_second": 2.94 } ], "logging_steps": 10, "max_steps": 5099, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0010895665363712e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }