{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9912030727295255, "eval_steps": 100, "global_step": 8750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 2.8405508995056152, "learning_rate": 0.0001, "loss": 21.3118, "step": 1 }, { "epoch": 0.0, "grad_norm": 4.61581563949585, "learning_rate": 0.0001, "loss": 22.0118, "step": 10 }, { "epoch": 0.0, "grad_norm": 8.54037094116211, "learning_rate": 0.0001, "loss": 19.0155, "step": 20 }, { "epoch": 0.0, "grad_norm": 8.802764892578125, "learning_rate": 0.0001, "loss": 13.4865, "step": 30 }, { "epoch": 0.0, "grad_norm": 2.1297531127929688, "learning_rate": 0.0001, "loss": 9.139, "step": 40 }, { "epoch": 0.01, "grad_norm": 1.3078622817993164, "learning_rate": 0.0001, "loss": 8.3294, "step": 50 }, { "epoch": 0.01, "grad_norm": 1.6661334037780762, "learning_rate": 0.0001, "loss": 7.2759, "step": 60 }, { "epoch": 0.01, "grad_norm": 1.7333922386169434, "learning_rate": 0.0001, "loss": 6.2603, "step": 70 }, { "epoch": 0.01, "grad_norm": 1.2136372327804565, "learning_rate": 0.0001, "loss": 4.9664, "step": 80 }, { "epoch": 0.01, "grad_norm": 0.9162020683288574, "learning_rate": 0.0001, "loss": 4.2368, "step": 90 }, { "epoch": 0.01, "grad_norm": 0.4164468050003052, "learning_rate": 0.0001, "loss": 3.8171, "step": 100 }, { "epoch": 0.01, "grad_norm": 0.27735579013824463, "learning_rate": 0.0001, "loss": 3.6958, "step": 110 }, { "epoch": 0.01, "grad_norm": 0.2131226658821106, "learning_rate": 0.0001, "loss": 3.601, "step": 120 }, { "epoch": 0.01, "grad_norm": 0.27249181270599365, "learning_rate": 0.0001, "loss": 3.5588, "step": 130 }, { "epoch": 0.02, "grad_norm": 0.14784656465053558, "learning_rate": 0.0001, "loss": 3.4919, "step": 140 }, { "epoch": 0.02, "grad_norm": 0.06692438572645187, "learning_rate": 0.0001, "loss": 3.5923, "step": 150 }, { "epoch": 0.02, "grad_norm": 0.05319567397236824, "learning_rate": 0.0001, "loss": 3.5652, "step": 160 }, { "epoch": 0.02, "grad_norm": 0.1252022683620453, "learning_rate": 0.0001, "loss": 3.5525, "step": 170 }, { "epoch": 0.02, "grad_norm": 0.07382280379533768, "learning_rate": 0.0001, "loss": 3.406, "step": 180 }, { "epoch": 0.02, "grad_norm": 0.09082331508398056, "learning_rate": 0.0001, "loss": 3.5842, "step": 190 }, { "epoch": 0.02, "grad_norm": 0.056070588529109955, "learning_rate": 0.0001, "loss": 3.582, "step": 200 }, { "epoch": 0.02, "grad_norm": 0.055600181221961975, "learning_rate": 0.0001, "loss": 3.6051, "step": 210 }, { "epoch": 0.02, "grad_norm": 0.042016271501779556, "learning_rate": 0.0001, "loss": 3.5155, "step": 220 }, { "epoch": 0.03, "grad_norm": 0.038780346512794495, "learning_rate": 0.0001, "loss": 3.6078, "step": 230 }, { "epoch": 0.03, "grad_norm": 0.06440210342407227, "learning_rate": 0.0001, "loss": 3.5627, "step": 240 }, { "epoch": 0.03, "grad_norm": 0.14532776176929474, "learning_rate": 0.0001, "loss": 3.5536, "step": 250 }, { "epoch": 0.03, "grad_norm": 0.04661674052476883, "learning_rate": 0.0001, "loss": 3.564, "step": 260 }, { "epoch": 0.03, "grad_norm": 0.04813728481531143, "learning_rate": 0.0001, "loss": 3.5477, "step": 270 }, { "epoch": 0.03, "grad_norm": 0.04259462654590607, "learning_rate": 0.0001, "loss": 3.6083, "step": 280 }, { "epoch": 0.03, "grad_norm": 0.030389174818992615, "learning_rate": 0.0001, "loss": 3.5148, "step": 290 }, { "epoch": 0.03, "grad_norm": 0.03971175476908684, "learning_rate": 0.0001, "loss": 3.4379, "step": 300 }, { "epoch": 0.04, "grad_norm": 
0.029011182487010956, "learning_rate": 0.0001, "loss": 3.512, "step": 310 }, { "epoch": 0.04, "grad_norm": 0.02479287050664425, "learning_rate": 0.0001, "loss": 3.5507, "step": 320 }, { "epoch": 0.04, "grad_norm": 0.04145807772874832, "learning_rate": 0.0001, "loss": 3.5462, "step": 330 }, { "epoch": 0.04, "grad_norm": 0.022611157968640327, "learning_rate": 0.0001, "loss": 3.5233, "step": 340 }, { "epoch": 0.04, "grad_norm": 0.02720930241048336, "learning_rate": 0.0001, "loss": 3.4831, "step": 350 }, { "epoch": 0.04, "grad_norm": 0.038539037108421326, "learning_rate": 0.0001, "loss": 3.5607, "step": 360 }, { "epoch": 0.04, "grad_norm": 0.023368462920188904, "learning_rate": 0.0001, "loss": 3.5138, "step": 370 }, { "epoch": 0.04, "grad_norm": 0.03408854454755783, "learning_rate": 0.0001, "loss": 3.5124, "step": 380 }, { "epoch": 0.04, "grad_norm": 0.01943507231771946, "learning_rate": 0.0001, "loss": 3.549, "step": 390 }, { "epoch": 0.05, "grad_norm": 0.01919199526309967, "learning_rate": 0.0001, "loss": 3.4256, "step": 400 }, { "epoch": 0.05, "grad_norm": 0.031397439539432526, "learning_rate": 0.0001, "loss": 3.4571, "step": 410 }, { "epoch": 0.05, "grad_norm": 0.022817950695753098, "learning_rate": 0.0001, "loss": 3.5159, "step": 420 }, { "epoch": 0.05, "grad_norm": 0.03026602230966091, "learning_rate": 0.0001, "loss": 3.5338, "step": 430 }, { "epoch": 0.05, "grad_norm": 0.022035052999854088, "learning_rate": 0.0001, "loss": 3.3941, "step": 440 }, { "epoch": 0.05, "grad_norm": 0.027268126606941223, "learning_rate": 0.0001, "loss": 3.6083, "step": 450 }, { "epoch": 0.05, "grad_norm": 0.0269852876663208, "learning_rate": 0.0001, "loss": 3.4366, "step": 460 }, { "epoch": 0.05, "grad_norm": 0.026059171184897423, "learning_rate": 0.0001, "loss": 3.5842, "step": 470 }, { "epoch": 0.05, "grad_norm": 0.023993048816919327, "learning_rate": 0.0001, "loss": 3.5072, "step": 480 }, { "epoch": 0.06, "grad_norm": 0.018095767125487328, "learning_rate": 0.0001, "loss": 3.4734, "step": 490 }, { "epoch": 0.06, "grad_norm": 0.020179396495223045, "learning_rate": 0.0001, "loss": 3.6047, "step": 500 }, { "epoch": 0.06, "grad_norm": 0.031518757343292236, "learning_rate": 0.0001, "loss": 3.4966, "step": 510 }, { "epoch": 0.06, "grad_norm": 0.020205531269311905, "learning_rate": 0.0001, "loss": 3.5573, "step": 520 }, { "epoch": 0.06, "grad_norm": 0.024177301675081253, "learning_rate": 0.0001, "loss": 3.6163, "step": 530 }, { "epoch": 0.06, "grad_norm": 0.024001598358154297, "learning_rate": 0.0001, "loss": 3.5187, "step": 540 }, { "epoch": 0.06, "grad_norm": 0.022960787639021873, "learning_rate": 0.0001, "loss": 3.4689, "step": 550 }, { "epoch": 0.06, "grad_norm": 0.018788259476423264, "learning_rate": 0.0001, "loss": 3.539, "step": 560 }, { "epoch": 0.06, "grad_norm": 0.022858906537294388, "learning_rate": 0.0001, "loss": 3.5878, "step": 570 }, { "epoch": 0.07, "grad_norm": 0.019678635522723198, "learning_rate": 0.0001, "loss": 3.4743, "step": 580 }, { "epoch": 0.07, "grad_norm": 0.034229714423418045, "learning_rate": 0.0001, "loss": 3.5307, "step": 590 }, { "epoch": 0.07, "grad_norm": 0.027546105906367302, "learning_rate": 0.0001, "loss": 3.51, "step": 600 }, { "epoch": 0.07, "grad_norm": 0.022891221567988396, "learning_rate": 0.0001, "loss": 3.4627, "step": 610 }, { "epoch": 0.07, "grad_norm": 0.0207009669393301, "learning_rate": 0.0001, "loss": 3.551, "step": 620 }, { "epoch": 0.07, "grad_norm": 0.01737021468579769, "learning_rate": 0.0001, "loss": 3.5684, "step": 630 }, { "epoch": 0.07, "grad_norm": 
0.013728939928114414, "learning_rate": 0.0001, "loss": 3.4772, "step": 640 }, { "epoch": 0.07, "grad_norm": 0.014542106539011002, "learning_rate": 0.0001, "loss": 3.5362, "step": 650 }, { "epoch": 0.07, "grad_norm": 0.026827527210116386, "learning_rate": 0.0001, "loss": 3.4488, "step": 660 }, { "epoch": 0.08, "grad_norm": 0.023264160379767418, "learning_rate": 0.0001, "loss": 3.4934, "step": 670 }, { "epoch": 0.08, "grad_norm": 0.016662942245602608, "learning_rate": 0.0001, "loss": 3.5064, "step": 680 }, { "epoch": 0.08, "grad_norm": 0.015615697018802166, "learning_rate": 0.0001, "loss": 3.3728, "step": 690 }, { "epoch": 0.08, "grad_norm": 0.022345753386616707, "learning_rate": 0.0001, "loss": 3.4144, "step": 700 }, { "epoch": 0.08, "grad_norm": 0.032291021198034286, "learning_rate": 0.0001, "loss": 3.5969, "step": 710 }, { "epoch": 0.08, "grad_norm": 0.02001849189400673, "learning_rate": 0.0001, "loss": 3.6065, "step": 720 }, { "epoch": 0.08, "grad_norm": 0.021190255880355835, "learning_rate": 0.0001, "loss": 3.4794, "step": 730 }, { "epoch": 0.08, "grad_norm": 0.017116906121373177, "learning_rate": 0.0001, "loss": 3.5791, "step": 740 }, { "epoch": 0.08, "grad_norm": 0.014048201031982899, "learning_rate": 0.0001, "loss": 3.4301, "step": 750 }, { "epoch": 0.09, "grad_norm": 0.012558774091303349, "learning_rate": 0.0001, "loss": 3.5019, "step": 760 }, { "epoch": 0.09, "grad_norm": 0.015401260927319527, "learning_rate": 0.0001, "loss": 3.4723, "step": 770 }, { "epoch": 0.09, "grad_norm": 0.015502150170505047, "learning_rate": 0.0001, "loss": 3.5413, "step": 780 }, { "epoch": 0.09, "grad_norm": 0.01253562793135643, "learning_rate": 0.0001, "loss": 3.5216, "step": 790 }, { "epoch": 0.09, "grad_norm": 0.0138285793364048, "learning_rate": 0.0001, "loss": 3.5217, "step": 800 }, { "epoch": 0.09, "grad_norm": 0.02025364711880684, "learning_rate": 0.0001, "loss": 3.429, "step": 810 }, { "epoch": 0.09, "grad_norm": 0.016753552481532097, "learning_rate": 0.0001, "loss": 3.4932, "step": 820 }, { "epoch": 0.09, "grad_norm": 0.013180260546505451, "learning_rate": 0.0001, "loss": 3.4905, "step": 830 }, { "epoch": 0.1, "grad_norm": 0.01835305243730545, "learning_rate": 0.0001, "loss": 3.5897, "step": 840 }, { "epoch": 0.1, "grad_norm": 0.016174038872122765, "learning_rate": 0.0001, "loss": 3.4206, "step": 850 }, { "epoch": 0.1, "grad_norm": 0.010979576967656612, "learning_rate": 0.0001, "loss": 3.5301, "step": 860 }, { "epoch": 0.1, "grad_norm": 0.016643410548567772, "learning_rate": 0.0001, "loss": 3.5247, "step": 870 }, { "epoch": 0.1, "grad_norm": 0.01616811566054821, "learning_rate": 0.0001, "loss": 3.6033, "step": 880 }, { "epoch": 0.1, "grad_norm": 0.012211025692522526, "learning_rate": 0.0001, "loss": 3.4928, "step": 890 }, { "epoch": 0.1, "grad_norm": 0.015387449413537979, "learning_rate": 0.0001, "loss": 3.6303, "step": 900 }, { "epoch": 0.1, "grad_norm": 0.014718925580382347, "learning_rate": 0.0001, "loss": 3.5415, "step": 910 }, { "epoch": 0.1, "grad_norm": 0.028245406225323677, "learning_rate": 0.0001, "loss": 3.5769, "step": 920 }, { "epoch": 0.11, "grad_norm": 0.0184182021766901, "learning_rate": 0.0001, "loss": 3.5676, "step": 930 }, { "epoch": 0.11, "grad_norm": 0.01640215329825878, "learning_rate": 0.0001, "loss": 3.5217, "step": 940 }, { "epoch": 0.11, "grad_norm": 0.019277973100543022, "learning_rate": 0.0001, "loss": 3.345, "step": 950 }, { "epoch": 0.11, "grad_norm": 0.013479188084602356, "learning_rate": 0.0001, "loss": 3.5102, "step": 960 }, { "epoch": 0.11, "grad_norm": 
0.014891591854393482, "learning_rate": 0.0001, "loss": 3.4637, "step": 970 }, { "epoch": 0.11, "grad_norm": 0.01347111351788044, "learning_rate": 0.0001, "loss": 3.5347, "step": 980 }, { "epoch": 0.11, "grad_norm": 0.012612530030310154, "learning_rate": 0.0001, "loss": 3.5144, "step": 990 }, { "epoch": 0.11, "grad_norm": 0.011820956133306026, "learning_rate": 0.0001, "loss": 3.5304, "step": 1000 }, { "epoch": 0.11, "grad_norm": 0.01737663522362709, "learning_rate": 0.0001, "loss": 3.4648, "step": 1010 }, { "epoch": 0.12, "grad_norm": 0.010161484591662884, "learning_rate": 0.0001, "loss": 3.5593, "step": 1020 }, { "epoch": 0.12, "grad_norm": 0.012214502319693565, "learning_rate": 0.0001, "loss": 3.4612, "step": 1030 }, { "epoch": 0.12, "grad_norm": 0.018603697419166565, "learning_rate": 0.0001, "loss": 3.5296, "step": 1040 }, { "epoch": 0.12, "grad_norm": 0.011940198950469494, "learning_rate": 0.0001, "loss": 3.5477, "step": 1050 }, { "epoch": 0.12, "grad_norm": 0.00909626018255949, "learning_rate": 0.0001, "loss": 3.474, "step": 1060 }, { "epoch": 0.12, "grad_norm": 0.012007161974906921, "learning_rate": 0.0001, "loss": 3.5067, "step": 1070 }, { "epoch": 0.12, "grad_norm": 0.016640914604067802, "learning_rate": 0.0001, "loss": 3.5673, "step": 1080 }, { "epoch": 0.12, "grad_norm": 0.01336682215332985, "learning_rate": 0.0001, "loss": 3.4554, "step": 1090 }, { "epoch": 0.12, "grad_norm": 0.016610946506261826, "learning_rate": 0.0001, "loss": 3.4339, "step": 1100 }, { "epoch": 0.13, "grad_norm": 0.015559005551040173, "learning_rate": 0.0001, "loss": 3.5063, "step": 1110 }, { "epoch": 0.13, "grad_norm": 0.016315005719661713, "learning_rate": 0.0001, "loss": 3.5319, "step": 1120 }, { "epoch": 0.13, "grad_norm": 0.0162800345569849, "learning_rate": 0.0001, "loss": 3.5185, "step": 1130 }, { "epoch": 0.13, "grad_norm": 0.01528267189860344, "learning_rate": 0.0001, "loss": 3.403, "step": 1140 }, { "epoch": 0.13, "grad_norm": 0.013532821089029312, "learning_rate": 0.0001, "loss": 3.451, "step": 1150 }, { "epoch": 0.13, "grad_norm": 0.01414052676409483, "learning_rate": 0.0001, "loss": 3.4627, "step": 1160 }, { "epoch": 0.13, "grad_norm": 0.011125383898615837, "learning_rate": 0.0001, "loss": 3.533, "step": 1170 }, { "epoch": 0.13, "grad_norm": 0.014148293063044548, "learning_rate": 0.0001, "loss": 3.5109, "step": 1180 }, { "epoch": 0.13, "grad_norm": 0.01400547195225954, "learning_rate": 0.0001, "loss": 3.4628, "step": 1190 }, { "epoch": 0.14, "grad_norm": 0.015002534724771976, "learning_rate": 0.0001, "loss": 3.4342, "step": 1200 }, { "epoch": 0.14, "grad_norm": 0.010034149512648582, "learning_rate": 0.0001, "loss": 3.5285, "step": 1210 }, { "epoch": 0.14, "grad_norm": 0.01355247013270855, "learning_rate": 0.0001, "loss": 3.4329, "step": 1220 }, { "epoch": 0.14, "grad_norm": 0.014688360504806042, "learning_rate": 0.0001, "loss": 3.5577, "step": 1230 }, { "epoch": 0.14, "grad_norm": 0.03996114060282707, "learning_rate": 0.0001, "loss": 3.5291, "step": 1240 }, { "epoch": 0.14, "grad_norm": 0.01624971814453602, "learning_rate": 0.0001, "loss": 3.4896, "step": 1250 }, { "epoch": 0.14, "grad_norm": 0.013721343129873276, "learning_rate": 0.0001, "loss": 3.4983, "step": 1260 }, { "epoch": 0.14, "grad_norm": 0.018142051994800568, "learning_rate": 0.0001, "loss": 3.5087, "step": 1270 }, { "epoch": 0.14, "grad_norm": 0.017276529222726822, "learning_rate": 0.0001, "loss": 3.5721, "step": 1280 }, { "epoch": 0.15, "grad_norm": 0.012962603941559792, "learning_rate": 0.0001, "loss": 3.4973, "step": 1290 }, { 
"epoch": 0.15, "grad_norm": 0.01329217478632927, "learning_rate": 0.0001, "loss": 3.4465, "step": 1300 }, { "epoch": 0.15, "grad_norm": 0.00778600201010704, "learning_rate": 0.0001, "loss": 3.5594, "step": 1310 }, { "epoch": 0.15, "grad_norm": 0.013329190202057362, "learning_rate": 0.0001, "loss": 3.5248, "step": 1320 }, { "epoch": 0.15, "grad_norm": 0.011055481620132923, "learning_rate": 0.0001, "loss": 3.4527, "step": 1330 }, { "epoch": 0.15, "grad_norm": 0.014390667900443077, "learning_rate": 0.0001, "loss": 3.4961, "step": 1340 }, { "epoch": 0.15, "grad_norm": 0.012227430008351803, "learning_rate": 0.0001, "loss": 3.5495, "step": 1350 }, { "epoch": 0.15, "grad_norm": 0.01374130416661501, "learning_rate": 0.0001, "loss": 3.4924, "step": 1360 }, { "epoch": 0.16, "grad_norm": 0.010804954916238785, "learning_rate": 0.0001, "loss": 3.5196, "step": 1370 }, { "epoch": 0.16, "grad_norm": 0.020522750914096832, "learning_rate": 0.0001, "loss": 3.5616, "step": 1380 }, { "epoch": 0.16, "grad_norm": 0.011031219735741615, "learning_rate": 0.0001, "loss": 3.5738, "step": 1390 }, { "epoch": 0.16, "grad_norm": 0.011859984137117863, "learning_rate": 0.0001, "loss": 3.4882, "step": 1400 }, { "epoch": 0.16, "grad_norm": 0.008377199992537498, "learning_rate": 0.0001, "loss": 3.5082, "step": 1410 }, { "epoch": 0.16, "grad_norm": 0.012245368212461472, "learning_rate": 0.0001, "loss": 3.4637, "step": 1420 }, { "epoch": 0.16, "grad_norm": 0.010553339496254921, "learning_rate": 0.0001, "loss": 3.4667, "step": 1430 }, { "epoch": 0.16, "grad_norm": 0.013036780059337616, "learning_rate": 0.0001, "loss": 3.4815, "step": 1440 }, { "epoch": 0.16, "grad_norm": 0.012668167240917683, "learning_rate": 0.0001, "loss": 3.4446, "step": 1450 }, { "epoch": 0.17, "grad_norm": 0.012890839949250221, "learning_rate": 0.0001, "loss": 3.5298, "step": 1460 }, { "epoch": 0.17, "grad_norm": 0.011748247779905796, "learning_rate": 0.0001, "loss": 3.4647, "step": 1470 }, { "epoch": 0.17, "grad_norm": 0.013375749811530113, "learning_rate": 0.0001, "loss": 3.5846, "step": 1480 }, { "epoch": 0.17, "grad_norm": 0.013845915906131268, "learning_rate": 0.0001, "loss": 3.4289, "step": 1490 }, { "epoch": 0.17, "grad_norm": 0.013674955815076828, "learning_rate": 0.0001, "loss": 3.5698, "step": 1500 }, { "epoch": 0.17, "grad_norm": 0.008059272542595863, "learning_rate": 0.0001, "loss": 3.5305, "step": 1510 }, { "epoch": 0.17, "grad_norm": 0.012349562719464302, "learning_rate": 0.0001, "loss": 3.4922, "step": 1520 }, { "epoch": 0.17, "grad_norm": 0.01913044974207878, "learning_rate": 0.0001, "loss": 3.5982, "step": 1530 }, { "epoch": 0.17, "grad_norm": 0.012789727188646793, "learning_rate": 0.0001, "loss": 3.6413, "step": 1540 }, { "epoch": 0.18, "grad_norm": 0.011001868173480034, "learning_rate": 0.0001, "loss": 3.5716, "step": 1550 }, { "epoch": 0.18, "grad_norm": 0.018423104658722878, "learning_rate": 0.0001, "loss": 3.4756, "step": 1560 }, { "epoch": 0.18, "grad_norm": 0.017299488186836243, "learning_rate": 0.0001, "loss": 3.5651, "step": 1570 }, { "epoch": 0.18, "grad_norm": 0.01379362028092146, "learning_rate": 0.0001, "loss": 3.5084, "step": 1580 }, { "epoch": 0.18, "grad_norm": 0.01530790701508522, "learning_rate": 0.0001, "loss": 3.5467, "step": 1590 }, { "epoch": 0.18, "grad_norm": 0.012769973836839199, "learning_rate": 0.0001, "loss": 3.515, "step": 1600 }, { "epoch": 0.18, "grad_norm": 0.013851183466613293, "learning_rate": 0.0001, "loss": 3.6471, "step": 1610 }, { "epoch": 0.18, "grad_norm": 0.014289574697613716, "learning_rate": 
0.0001, "loss": 3.4631, "step": 1620 }, { "epoch": 0.18, "grad_norm": 0.011917946860194206, "learning_rate": 0.0001, "loss": 3.517, "step": 1630 }, { "epoch": 0.19, "grad_norm": 0.010024646297097206, "learning_rate": 0.0001, "loss": 3.444, "step": 1640 }, { "epoch": 0.19, "grad_norm": 0.01034802570939064, "learning_rate": 0.0001, "loss": 3.3565, "step": 1650 }, { "epoch": 0.19, "grad_norm": 0.00995150487869978, "learning_rate": 0.0001, "loss": 3.4406, "step": 1660 }, { "epoch": 0.19, "grad_norm": 0.011516589671373367, "learning_rate": 0.0001, "loss": 3.485, "step": 1670 }, { "epoch": 0.19, "grad_norm": 0.012601302936673164, "learning_rate": 0.0001, "loss": 3.4708, "step": 1680 }, { "epoch": 0.19, "grad_norm": 0.01430091354995966, "learning_rate": 0.0001, "loss": 3.5655, "step": 1690 }, { "epoch": 0.19, "grad_norm": 0.011478858068585396, "learning_rate": 0.0001, "loss": 3.4743, "step": 1700 }, { "epoch": 0.19, "grad_norm": 0.01450322661548853, "learning_rate": 0.0001, "loss": 3.5334, "step": 1710 }, { "epoch": 0.19, "grad_norm": 0.020822227001190186, "learning_rate": 0.0001, "loss": 3.5136, "step": 1720 }, { "epoch": 0.2, "grad_norm": 0.011523684486746788, "learning_rate": 0.0001, "loss": 3.5822, "step": 1730 }, { "epoch": 0.2, "grad_norm": 0.015746811404824257, "learning_rate": 0.0001, "loss": 3.4671, "step": 1740 }, { "epoch": 0.2, "grad_norm": 0.014710924588143826, "learning_rate": 0.0001, "loss": 3.5318, "step": 1750 }, { "epoch": 0.2, "grad_norm": 0.013147206045687199, "learning_rate": 0.0001, "loss": 3.6272, "step": 1760 }, { "epoch": 0.2, "grad_norm": 0.012628714554011822, "learning_rate": 0.0001, "loss": 3.5732, "step": 1770 }, { "epoch": 0.2, "grad_norm": 0.011829860508441925, "learning_rate": 0.0001, "loss": 3.4658, "step": 1780 }, { "epoch": 0.2, "grad_norm": 0.012807231396436691, "learning_rate": 0.0001, "loss": 3.4699, "step": 1790 }, { "epoch": 0.2, "grad_norm": 0.01245660986751318, "learning_rate": 0.0001, "loss": 3.4397, "step": 1800 }, { "epoch": 0.21, "grad_norm": 0.01004470232874155, "learning_rate": 0.0001, "loss": 3.5441, "step": 1810 }, { "epoch": 0.21, "grad_norm": 0.014064277522265911, "learning_rate": 0.0001, "loss": 3.5148, "step": 1820 }, { "epoch": 0.21, "grad_norm": 0.01796138659119606, "learning_rate": 0.0001, "loss": 3.4614, "step": 1830 }, { "epoch": 0.21, "grad_norm": 0.017156125977635384, "learning_rate": 0.0001, "loss": 3.518, "step": 1840 }, { "epoch": 0.21, "grad_norm": 0.013531261123716831, "learning_rate": 0.0001, "loss": 3.5567, "step": 1850 }, { "epoch": 0.21, "grad_norm": 0.010274061933159828, "learning_rate": 0.0001, "loss": 3.5332, "step": 1860 }, { "epoch": 0.21, "grad_norm": 0.019586782902479172, "learning_rate": 0.0001, "loss": 3.4796, "step": 1870 }, { "epoch": 0.21, "grad_norm": 0.012124857865273952, "learning_rate": 0.0001, "loss": 3.4592, "step": 1880 }, { "epoch": 0.21, "grad_norm": 0.010372117161750793, "learning_rate": 0.0001, "loss": 3.5007, "step": 1890 }, { "epoch": 0.22, "grad_norm": 0.009132340550422668, "learning_rate": 0.0001, "loss": 3.606, "step": 1900 }, { "epoch": 0.22, "grad_norm": 0.011178999207913876, "learning_rate": 0.0001, "loss": 3.4883, "step": 1910 }, { "epoch": 0.22, "grad_norm": 0.010463099926710129, "learning_rate": 0.0001, "loss": 3.5241, "step": 1920 }, { "epoch": 0.22, "grad_norm": 0.012110692448914051, "learning_rate": 0.0001, "loss": 3.4607, "step": 1930 }, { "epoch": 0.22, "grad_norm": 0.014725590124726295, "learning_rate": 0.0001, "loss": 3.6237, "step": 1940 }, { "epoch": 0.22, "grad_norm": 
0.012522481381893158, "learning_rate": 0.0001, "loss": 3.4572, "step": 1950 }, { "epoch": 0.22, "grad_norm": 0.008732055313885212, "learning_rate": 0.0001, "loss": 3.564, "step": 1960 }, { "epoch": 0.22, "grad_norm": 0.01115155965089798, "learning_rate": 0.0001, "loss": 3.4383, "step": 1970 }, { "epoch": 0.22, "grad_norm": 0.0173783078789711, "learning_rate": 0.0001, "loss": 3.6288, "step": 1980 }, { "epoch": 0.23, "grad_norm": 0.01445601787418127, "learning_rate": 0.0001, "loss": 3.4926, "step": 1990 }, { "epoch": 0.23, "grad_norm": 0.014484569430351257, "learning_rate": 0.0001, "loss": 3.6023, "step": 2000 }, { "epoch": 0.23, "grad_norm": 0.010751591064035892, "learning_rate": 0.0001, "loss": 3.5822, "step": 2010 }, { "epoch": 0.23, "grad_norm": 0.017609944567084312, "learning_rate": 0.0001, "loss": 3.5565, "step": 2020 }, { "epoch": 0.23, "grad_norm": 0.013758196495473385, "learning_rate": 0.0001, "loss": 3.5089, "step": 2030 }, { "epoch": 0.23, "grad_norm": 0.011685581877827644, "learning_rate": 0.0001, "loss": 3.4922, "step": 2040 }, { "epoch": 0.23, "grad_norm": 0.015950001776218414, "learning_rate": 0.0001, "loss": 3.4915, "step": 2050 }, { "epoch": 0.23, "grad_norm": 0.01576152816414833, "learning_rate": 0.0001, "loss": 3.4526, "step": 2060 }, { "epoch": 0.23, "grad_norm": 0.014144735410809517, "learning_rate": 0.0001, "loss": 3.4778, "step": 2070 }, { "epoch": 0.24, "grad_norm": 0.019585562869906425, "learning_rate": 0.0001, "loss": 3.5104, "step": 2080 }, { "epoch": 0.24, "grad_norm": 0.011967881582677364, "learning_rate": 0.0001, "loss": 3.5368, "step": 2090 }, { "epoch": 0.24, "grad_norm": 0.01672540418803692, "learning_rate": 0.0001, "loss": 3.5015, "step": 2100 }, { "epoch": 0.24, "grad_norm": 0.01712319441139698, "learning_rate": 0.0001, "loss": 3.5989, "step": 2110 }, { "epoch": 0.24, "grad_norm": 0.013890121132135391, "learning_rate": 0.0001, "loss": 3.5115, "step": 2120 }, { "epoch": 0.24, "grad_norm": 0.015169600024819374, "learning_rate": 0.0001, "loss": 3.4137, "step": 2130 }, { "epoch": 0.24, "grad_norm": 0.012502512894570827, "learning_rate": 0.0001, "loss": 3.4326, "step": 2140 }, { "epoch": 0.24, "grad_norm": 0.01436410192400217, "learning_rate": 0.0001, "loss": 3.5123, "step": 2150 }, { "epoch": 0.24, "grad_norm": 0.009717009961605072, "learning_rate": 0.0001, "loss": 3.3847, "step": 2160 }, { "epoch": 0.25, "grad_norm": 0.014872290194034576, "learning_rate": 0.0001, "loss": 3.4862, "step": 2170 }, { "epoch": 0.25, "grad_norm": 0.017303990200161934, "learning_rate": 0.0001, "loss": 3.6064, "step": 2180 }, { "epoch": 0.25, "grad_norm": 0.01628187857568264, "learning_rate": 0.0001, "loss": 3.519, "step": 2190 }, { "epoch": 0.25, "grad_norm": 0.018905622884631157, "learning_rate": 0.0001, "loss": 3.5375, "step": 2200 }, { "epoch": 0.25, "grad_norm": 0.012672685086727142, "learning_rate": 0.0001, "loss": 3.6318, "step": 2210 }, { "epoch": 0.25, "grad_norm": 0.015055770985782146, "learning_rate": 0.0001, "loss": 3.5242, "step": 2220 }, { "epoch": 0.25, "grad_norm": 0.023709069937467575, "learning_rate": 0.0001, "loss": 3.4862, "step": 2230 }, { "epoch": 0.25, "grad_norm": 0.016241637989878654, "learning_rate": 0.0001, "loss": 3.4665, "step": 2240 }, { "epoch": 0.25, "grad_norm": 0.015929441899061203, "learning_rate": 0.0001, "loss": 3.4275, "step": 2250 }, { "epoch": 0.26, "grad_norm": 0.01856895349919796, "learning_rate": 0.0001, "loss": 3.5531, "step": 2260 }, { "epoch": 0.26, "grad_norm": 0.016707396134734154, "learning_rate": 0.0001, "loss": 3.4781, "step": 2270 }, 
{ "epoch": 0.26, "grad_norm": 0.016373319551348686, "learning_rate": 0.0001, "loss": 3.5315, "step": 2280 }, { "epoch": 0.26, "grad_norm": 0.019585154950618744, "learning_rate": 0.0001, "loss": 3.6315, "step": 2290 }, { "epoch": 0.26, "grad_norm": 0.016674255952239037, "learning_rate": 0.0001, "loss": 3.5199, "step": 2300 }, { "epoch": 0.26, "grad_norm": 0.015188581310212612, "learning_rate": 0.0001, "loss": 3.4688, "step": 2310 }, { "epoch": 0.26, "grad_norm": 0.0196946132928133, "learning_rate": 0.0001, "loss": 3.3733, "step": 2320 }, { "epoch": 0.26, "grad_norm": 0.017355434596538544, "learning_rate": 0.0001, "loss": 3.5795, "step": 2330 }, { "epoch": 0.27, "grad_norm": 0.025467930361628532, "learning_rate": 0.0001, "loss": 3.4954, "step": 2340 }, { "epoch": 0.27, "grad_norm": 0.02046274207532406, "learning_rate": 0.0001, "loss": 3.4384, "step": 2350 }, { "epoch": 0.27, "grad_norm": 0.016698865219950676, "learning_rate": 0.0001, "loss": 3.6532, "step": 2360 }, { "epoch": 0.27, "grad_norm": 0.016054237261414528, "learning_rate": 0.0001, "loss": 3.5716, "step": 2370 }, { "epoch": 0.27, "grad_norm": 0.019108062610030174, "learning_rate": 0.0001, "loss": 3.4454, "step": 2380 }, { "epoch": 0.27, "grad_norm": 0.016826612874865532, "learning_rate": 0.0001, "loss": 3.6658, "step": 2390 }, { "epoch": 0.27, "grad_norm": 0.01868477091193199, "learning_rate": 0.0001, "loss": 3.5084, "step": 2400 }, { "epoch": 0.27, "grad_norm": 0.013858865946531296, "learning_rate": 0.0001, "loss": 3.5561, "step": 2410 }, { "epoch": 0.27, "grad_norm": 0.021234555169939995, "learning_rate": 0.0001, "loss": 3.5905, "step": 2420 }, { "epoch": 0.28, "grad_norm": 0.017526492476463318, "learning_rate": 0.0001, "loss": 3.5135, "step": 2430 }, { "epoch": 0.28, "grad_norm": 0.01538402121514082, "learning_rate": 0.0001, "loss": 3.5682, "step": 2440 }, { "epoch": 0.28, "grad_norm": 0.014927823096513748, "learning_rate": 0.0001, "loss": 3.5264, "step": 2450 }, { "epoch": 0.28, "grad_norm": 0.01917092315852642, "learning_rate": 0.0001, "loss": 3.5484, "step": 2460 }, { "epoch": 0.28, "grad_norm": 0.018008895218372345, "learning_rate": 0.0001, "loss": 3.4739, "step": 2470 }, { "epoch": 0.28, "grad_norm": 0.022217359393835068, "learning_rate": 0.0001, "loss": 3.5085, "step": 2480 }, { "epoch": 0.28, "grad_norm": 0.023347290232777596, "learning_rate": 0.0001, "loss": 3.4921, "step": 2490 }, { "epoch": 0.28, "grad_norm": 0.017970876768231392, "learning_rate": 0.0001, "loss": 3.4403, "step": 2500 }, { "epoch": 0.28, "grad_norm": 0.01640474982559681, "learning_rate": 0.0001, "loss": 3.5114, "step": 2510 }, { "epoch": 0.29, "grad_norm": 0.018040059134364128, "learning_rate": 0.0001, "loss": 3.5927, "step": 2520 }, { "epoch": 0.29, "grad_norm": 0.01956881582736969, "learning_rate": 0.0001, "loss": 3.5449, "step": 2530 }, { "epoch": 0.29, "grad_norm": 0.017009075731039047, "learning_rate": 0.0001, "loss": 3.5136, "step": 2540 }, { "epoch": 0.29, "grad_norm": 0.02923855185508728, "learning_rate": 0.0001, "loss": 3.5114, "step": 2550 }, { "epoch": 0.29, "grad_norm": 0.03892824053764343, "learning_rate": 0.0001, "loss": 3.4507, "step": 2560 }, { "epoch": 0.29, "grad_norm": 0.022937944158911705, "learning_rate": 0.0001, "loss": 3.5325, "step": 2570 }, { "epoch": 0.29, "grad_norm": 0.02040509507060051, "learning_rate": 0.0001, "loss": 3.4488, "step": 2580 }, { "epoch": 0.29, "grad_norm": 0.014985262416303158, "learning_rate": 0.0001, "loss": 3.477, "step": 2590 }, { "epoch": 0.29, "grad_norm": 0.016432279720902443, "learning_rate": 0.0001, 
"loss": 3.5045, "step": 2600 }, { "epoch": 0.3, "grad_norm": 0.01811091974377632, "learning_rate": 0.0001, "loss": 3.485, "step": 2610 }, { "epoch": 0.3, "grad_norm": 0.01788695901632309, "learning_rate": 0.0001, "loss": 3.4977, "step": 2620 }, { "epoch": 0.3, "grad_norm": 0.01897617243230343, "learning_rate": 0.0001, "loss": 3.4494, "step": 2630 }, { "epoch": 0.3, "grad_norm": 0.017073579132556915, "learning_rate": 0.0001, "loss": 3.4476, "step": 2640 }, { "epoch": 0.3, "grad_norm": 0.026845784857869148, "learning_rate": 0.0001, "loss": 3.436, "step": 2650 }, { "epoch": 0.3, "grad_norm": 0.029439518228173256, "learning_rate": 0.0001, "loss": 3.4882, "step": 2660 }, { "epoch": 0.3, "grad_norm": 0.021677788347005844, "learning_rate": 0.0001, "loss": 3.487, "step": 2670 }, { "epoch": 0.3, "grad_norm": 0.09971503913402557, "learning_rate": 0.0001, "loss": 3.4806, "step": 2680 }, { "epoch": 0.3, "grad_norm": 0.03712121769785881, "learning_rate": 0.0001, "loss": 3.5279, "step": 2690 }, { "epoch": 0.31, "grad_norm": 0.04873380437493324, "learning_rate": 0.0001, "loss": 3.4583, "step": 2700 }, { "epoch": 0.31, "grad_norm": 0.02550162747502327, "learning_rate": 0.0001, "loss": 3.5315, "step": 2710 }, { "epoch": 0.31, "grad_norm": 0.01897108368575573, "learning_rate": 0.0001, "loss": 3.4546, "step": 2720 }, { "epoch": 0.31, "grad_norm": 0.01844639889895916, "learning_rate": 0.0001, "loss": 3.4877, "step": 2730 }, { "epoch": 0.31, "grad_norm": 0.016689693555235863, "learning_rate": 0.0001, "loss": 3.5164, "step": 2740 }, { "epoch": 0.31, "grad_norm": 0.02138620987534523, "learning_rate": 0.0001, "loss": 3.5577, "step": 2750 }, { "epoch": 0.31, "grad_norm": 0.0179847814142704, "learning_rate": 0.0001, "loss": 3.5134, "step": 2760 }, { "epoch": 0.31, "grad_norm": 0.03954233601689339, "learning_rate": 0.0001, "loss": 3.5993, "step": 2770 }, { "epoch": 0.31, "grad_norm": 0.022799063473939896, "learning_rate": 0.0001, "loss": 3.5345, "step": 2780 }, { "epoch": 0.32, "grad_norm": 0.03264794126152992, "learning_rate": 0.0001, "loss": 3.4537, "step": 2790 }, { "epoch": 0.32, "grad_norm": 0.01976439170539379, "learning_rate": 0.0001, "loss": 3.5259, "step": 2800 }, { "epoch": 0.32, "grad_norm": 0.028939945623278618, "learning_rate": 0.0001, "loss": 3.5778, "step": 2810 }, { "epoch": 0.32, "grad_norm": 0.02064042165875435, "learning_rate": 0.0001, "loss": 3.5139, "step": 2820 }, { "epoch": 0.32, "grad_norm": 0.015877893194556236, "learning_rate": 0.0001, "loss": 3.518, "step": 2830 }, { "epoch": 0.32, "grad_norm": 0.017917482182383537, "learning_rate": 0.0001, "loss": 3.4562, "step": 2840 }, { "epoch": 0.32, "grad_norm": 0.022430971264839172, "learning_rate": 0.0001, "loss": 3.4535, "step": 2850 }, { "epoch": 0.32, "grad_norm": 0.01940193586051464, "learning_rate": 0.0001, "loss": 3.5439, "step": 2860 }, { "epoch": 0.33, "grad_norm": 0.023416908457875252, "learning_rate": 0.0001, "loss": 3.5676, "step": 2870 }, { "epoch": 0.33, "grad_norm": 0.02338486537337303, "learning_rate": 0.0001, "loss": 3.4261, "step": 2880 }, { "epoch": 0.33, "grad_norm": 0.017554474994540215, "learning_rate": 0.0001, "loss": 3.4995, "step": 2890 }, { "epoch": 0.33, "grad_norm": 0.025802314281463623, "learning_rate": 0.0001, "loss": 3.4978, "step": 2900 }, { "epoch": 0.33, "grad_norm": 0.03155793622136116, "learning_rate": 0.0001, "loss": 3.5695, "step": 2910 }, { "epoch": 0.33, "grad_norm": 0.02897248975932598, "learning_rate": 0.0001, "loss": 3.5113, "step": 2920 }, { "epoch": 0.33, "grad_norm": 0.016502562910318375, 
"learning_rate": 0.0001, "loss": 3.4861, "step": 2930 }, { "epoch": 0.33, "grad_norm": 0.014018010348081589, "learning_rate": 0.0001, "loss": 3.4917, "step": 2940 }, { "epoch": 0.33, "grad_norm": 0.01745024509727955, "learning_rate": 0.0001, "loss": 3.5143, "step": 2950 }, { "epoch": 0.34, "grad_norm": 0.027601344510912895, "learning_rate": 0.0001, "loss": 3.5134, "step": 2960 }, { "epoch": 0.34, "grad_norm": 0.019520936533808708, "learning_rate": 0.0001, "loss": 3.4828, "step": 2970 }, { "epoch": 0.34, "grad_norm": 0.014931585639715195, "learning_rate": 0.0001, "loss": 3.4935, "step": 2980 }, { "epoch": 0.34, "grad_norm": 0.01630624197423458, "learning_rate": 0.0001, "loss": 3.6453, "step": 2990 }, { "epoch": 0.34, "grad_norm": 0.019842753186821938, "learning_rate": 0.0001, "loss": 3.5771, "step": 3000 }, { "epoch": 0.34, "grad_norm": 0.019877957180142403, "learning_rate": 0.0001, "loss": 3.5369, "step": 3010 }, { "epoch": 0.34, "grad_norm": 0.024922620505094528, "learning_rate": 0.0001, "loss": 3.4612, "step": 3020 }, { "epoch": 0.34, "grad_norm": 0.01488751731812954, "learning_rate": 0.0001, "loss": 3.5582, "step": 3030 }, { "epoch": 0.34, "grad_norm": 0.015327321365475655, "learning_rate": 0.0001, "loss": 3.5483, "step": 3040 }, { "epoch": 0.35, "grad_norm": 0.01309808436781168, "learning_rate": 0.0001, "loss": 3.4811, "step": 3050 }, { "epoch": 0.35, "grad_norm": 0.019352128729224205, "learning_rate": 0.0001, "loss": 3.4227, "step": 3060 }, { "epoch": 0.35, "grad_norm": 0.014425340108573437, "learning_rate": 0.0001, "loss": 3.5572, "step": 3070 }, { "epoch": 0.35, "grad_norm": 0.01205863244831562, "learning_rate": 0.0001, "loss": 3.5665, "step": 3080 }, { "epoch": 0.35, "grad_norm": 0.023006176576018333, "learning_rate": 0.0001, "loss": 3.5593, "step": 3090 }, { "epoch": 0.35, "grad_norm": 0.019858254119753838, "learning_rate": 0.0001, "loss": 3.5486, "step": 3100 }, { "epoch": 0.35, "grad_norm": 0.01903071068227291, "learning_rate": 0.0001, "loss": 3.4419, "step": 3110 }, { "epoch": 0.35, "grad_norm": 0.01240404974669218, "learning_rate": 0.0001, "loss": 3.3157, "step": 3120 }, { "epoch": 0.35, "grad_norm": 0.014579487033188343, "learning_rate": 0.0001, "loss": 3.5221, "step": 3130 }, { "epoch": 0.36, "grad_norm": 0.015120552852749825, "learning_rate": 0.0001, "loss": 3.5088, "step": 3140 }, { "epoch": 0.36, "grad_norm": 0.017975907772779465, "learning_rate": 0.0001, "loss": 3.4567, "step": 3150 }, { "epoch": 0.36, "grad_norm": 0.02290377952158451, "learning_rate": 0.0001, "loss": 3.5371, "step": 3160 }, { "epoch": 0.36, "grad_norm": 0.02394021861255169, "learning_rate": 0.0001, "loss": 3.5216, "step": 3170 }, { "epoch": 0.36, "grad_norm": 0.01812875084578991, "learning_rate": 0.0001, "loss": 3.5377, "step": 3180 }, { "epoch": 0.36, "grad_norm": 0.019379355013370514, "learning_rate": 0.0001, "loss": 3.4379, "step": 3190 }, { "epoch": 0.36, "grad_norm": 0.014420004561543465, "learning_rate": 0.0001, "loss": 3.5185, "step": 3200 }, { "epoch": 0.36, "grad_norm": 0.012561817653477192, "learning_rate": 0.0001, "loss": 3.4231, "step": 3210 }, { "epoch": 0.36, "grad_norm": 0.018877729773521423, "learning_rate": 0.0001, "loss": 3.5479, "step": 3220 }, { "epoch": 0.37, "grad_norm": 0.01132214069366455, "learning_rate": 0.0001, "loss": 3.4648, "step": 3230 }, { "epoch": 0.37, "grad_norm": 0.022680630907416344, "learning_rate": 0.0001, "loss": 3.5728, "step": 3240 }, { "epoch": 0.37, "grad_norm": 0.031161852180957794, "learning_rate": 0.0001, "loss": 3.4955, "step": 3250 }, { "epoch": 0.37, 
"grad_norm": 0.012343869544565678, "learning_rate": 0.0001, "loss": 3.5017, "step": 3260 }, { "epoch": 0.37, "grad_norm": 0.010025468654930592, "learning_rate": 0.0001, "loss": 3.5499, "step": 3270 }, { "epoch": 0.37, "grad_norm": 0.011847359128296375, "learning_rate": 0.0001, "loss": 3.5514, "step": 3280 }, { "epoch": 0.37, "grad_norm": 0.015538055449724197, "learning_rate": 0.0001, "loss": 3.576, "step": 3290 }, { "epoch": 0.37, "grad_norm": 0.021085917949676514, "learning_rate": 0.0001, "loss": 3.4432, "step": 3300 }, { "epoch": 0.37, "grad_norm": 0.010648920200765133, "learning_rate": 0.0001, "loss": 3.5216, "step": 3310 }, { "epoch": 0.38, "grad_norm": 0.02155137062072754, "learning_rate": 0.0001, "loss": 3.5368, "step": 3320 }, { "epoch": 0.38, "grad_norm": 0.013663026504218578, "learning_rate": 0.0001, "loss": 3.3732, "step": 3330 }, { "epoch": 0.38, "grad_norm": 0.010358215309679508, "learning_rate": 0.0001, "loss": 3.4383, "step": 3340 }, { "epoch": 0.38, "grad_norm": 0.017488796263933182, "learning_rate": 0.0001, "loss": 3.5315, "step": 3350 }, { "epoch": 0.38, "grad_norm": 0.015512356534600258, "learning_rate": 0.0001, "loss": 3.5108, "step": 3360 }, { "epoch": 0.38, "grad_norm": 0.014322979375720024, "learning_rate": 0.0001, "loss": 3.6455, "step": 3370 }, { "epoch": 0.38, "grad_norm": 0.015895500779151917, "learning_rate": 0.0001, "loss": 3.4859, "step": 3380 }, { "epoch": 0.38, "grad_norm": 0.021278714761137962, "learning_rate": 0.0001, "loss": 3.4974, "step": 3390 }, { "epoch": 0.39, "grad_norm": 0.007831936702132225, "learning_rate": 0.0001, "loss": 3.4516, "step": 3400 }, { "epoch": 0.39, "grad_norm": 0.01164664514362812, "learning_rate": 0.0001, "loss": 3.5799, "step": 3410 }, { "epoch": 0.39, "grad_norm": 0.009992453269660473, "learning_rate": 0.0001, "loss": 3.4685, "step": 3420 }, { "epoch": 0.39, "grad_norm": 0.016423575580120087, "learning_rate": 0.0001, "loss": 3.5346, "step": 3430 }, { "epoch": 0.39, "grad_norm": 0.00949979666620493, "learning_rate": 0.0001, "loss": 3.4413, "step": 3440 }, { "epoch": 0.39, "grad_norm": 0.008454454131424427, "learning_rate": 0.0001, "loss": 3.558, "step": 3450 }, { "epoch": 0.39, "grad_norm": 0.010379428043961525, "learning_rate": 0.0001, "loss": 3.4974, "step": 3460 }, { "epoch": 0.39, "grad_norm": 0.013533222489058971, "learning_rate": 0.0001, "loss": 3.4392, "step": 3470 }, { "epoch": 0.39, "grad_norm": 0.010838096030056477, "learning_rate": 0.0001, "loss": 3.4614, "step": 3480 }, { "epoch": 0.4, "grad_norm": 0.011237652972340584, "learning_rate": 0.0001, "loss": 3.5319, "step": 3490 }, { "epoch": 0.4, "grad_norm": 0.020445378497242928, "learning_rate": 0.0001, "loss": 3.4498, "step": 3500 }, { "epoch": 0.4, "grad_norm": 0.006660849787294865, "learning_rate": 0.0001, "loss": 3.5694, "step": 3510 }, { "epoch": 0.4, "grad_norm": 0.007946416735649109, "learning_rate": 0.0001, "loss": 3.6128, "step": 3520 }, { "epoch": 0.4, "grad_norm": 0.015461256727576256, "learning_rate": 0.0001, "loss": 3.4204, "step": 3530 }, { "epoch": 0.4, "grad_norm": 0.02071903459727764, "learning_rate": 0.0001, "loss": 3.4094, "step": 3540 }, { "epoch": 0.4, "grad_norm": 0.012966283597052097, "learning_rate": 0.0001, "loss": 3.4852, "step": 3550 }, { "epoch": 0.4, "grad_norm": 0.010020465590059757, "learning_rate": 0.0001, "loss": 3.5771, "step": 3560 }, { "epoch": 0.4, "grad_norm": 0.011365647427737713, "learning_rate": 0.0001, "loss": 3.458, "step": 3570 }, { "epoch": 0.41, "grad_norm": 0.018414277583360672, "learning_rate": 0.0001, "loss": 3.4535, 
"step": 3580 }, { "epoch": 0.41, "grad_norm": 0.009047456085681915, "learning_rate": 0.0001, "loss": 3.4625, "step": 3590 }, { "epoch": 0.41, "grad_norm": 0.012175729498267174, "learning_rate": 0.0001, "loss": 3.5695, "step": 3600 }, { "epoch": 0.41, "grad_norm": 0.006391238421201706, "learning_rate": 0.0001, "loss": 3.4605, "step": 3610 }, { "epoch": 0.41, "grad_norm": 0.007453025784343481, "learning_rate": 0.0001, "loss": 3.6365, "step": 3620 }, { "epoch": 0.41, "grad_norm": 0.005637998227030039, "learning_rate": 0.0001, "loss": 3.5826, "step": 3630 }, { "epoch": 0.41, "grad_norm": 0.006138278637081385, "learning_rate": 0.0001, "loss": 3.4715, "step": 3640 }, { "epoch": 0.41, "grad_norm": 0.01041551772505045, "learning_rate": 0.0001, "loss": 3.4853, "step": 3650 }, { "epoch": 0.41, "grad_norm": 0.007091097068041563, "learning_rate": 0.0001, "loss": 3.5304, "step": 3660 }, { "epoch": 0.42, "grad_norm": 0.008768292143940926, "learning_rate": 0.0001, "loss": 3.5464, "step": 3670 }, { "epoch": 0.42, "grad_norm": 0.008567243814468384, "learning_rate": 0.0001, "loss": 3.5145, "step": 3680 }, { "epoch": 0.42, "grad_norm": 0.007764735724776983, "learning_rate": 0.0001, "loss": 3.4253, "step": 3690 }, { "epoch": 0.42, "grad_norm": 0.01701386459171772, "learning_rate": 0.0001, "loss": 3.4243, "step": 3700 }, { "epoch": 0.42, "grad_norm": 0.008088070899248123, "learning_rate": 0.0001, "loss": 3.4444, "step": 3710 }, { "epoch": 0.42, "grad_norm": 0.009699873626232147, "learning_rate": 0.0001, "loss": 3.5932, "step": 3720 }, { "epoch": 0.42, "grad_norm": 0.008919463492929935, "learning_rate": 0.0001, "loss": 3.5274, "step": 3730 }, { "epoch": 0.42, "grad_norm": 0.007431438192725182, "learning_rate": 0.0001, "loss": 3.5754, "step": 3740 }, { "epoch": 0.42, "grad_norm": 0.005464049056172371, "learning_rate": 0.0001, "loss": 3.5621, "step": 3750 }, { "epoch": 0.43, "grad_norm": 0.010427163913846016, "learning_rate": 0.0001, "loss": 3.5187, "step": 3760 }, { "epoch": 0.43, "grad_norm": 0.0065625510178506374, "learning_rate": 0.0001, "loss": 3.5243, "step": 3770 }, { "epoch": 0.43, "grad_norm": 0.005589956883341074, "learning_rate": 0.0001, "loss": 3.505, "step": 3780 }, { "epoch": 0.43, "grad_norm": 0.006880622357130051, "learning_rate": 0.0001, "loss": 3.5579, "step": 3790 }, { "epoch": 0.43, "grad_norm": 0.006927428301423788, "learning_rate": 0.0001, "loss": 3.531, "step": 3800 }, { "epoch": 0.43, "grad_norm": 0.004907497204840183, "learning_rate": 0.0001, "loss": 3.4542, "step": 3810 }, { "epoch": 0.43, "grad_norm": 0.015784764662384987, "learning_rate": 0.0001, "loss": 3.5553, "step": 3820 }, { "epoch": 0.43, "grad_norm": 0.005843338090926409, "learning_rate": 0.0001, "loss": 3.5229, "step": 3830 }, { "epoch": 0.43, "grad_norm": 0.03485801815986633, "learning_rate": 0.0001, "loss": 3.5625, "step": 3840 }, { "epoch": 0.44, "grad_norm": 0.010843326337635517, "learning_rate": 0.0001, "loss": 3.5401, "step": 3850 }, { "epoch": 0.44, "grad_norm": 0.005655507557094097, "learning_rate": 0.0001, "loss": 3.5277, "step": 3860 }, { "epoch": 0.44, "grad_norm": 0.007940387353301048, "learning_rate": 0.0001, "loss": 3.5436, "step": 3870 }, { "epoch": 0.44, "grad_norm": 0.023078449070453644, "learning_rate": 0.0001, "loss": 3.5451, "step": 3880 }, { "epoch": 0.44, "grad_norm": 0.022735636681318283, "learning_rate": 0.0001, "loss": 3.4816, "step": 3890 }, { "epoch": 0.44, "grad_norm": 0.02266281098127365, "learning_rate": 0.0001, "loss": 3.5027, "step": 3900 }, { "epoch": 0.44, "grad_norm": 0.03039221465587616, 
"learning_rate": 0.0001, "loss": 3.4869, "step": 3910 }, { "epoch": 0.44, "grad_norm": 0.010447009466588497, "learning_rate": 0.0001, "loss": 3.4708, "step": 3920 }, { "epoch": 0.45, "grad_norm": 0.011993595398962498, "learning_rate": 0.0001, "loss": 3.5296, "step": 3930 }, { "epoch": 0.45, "grad_norm": 0.009197100065648556, "learning_rate": 0.0001, "loss": 3.4425, "step": 3940 }, { "epoch": 0.45, "grad_norm": 0.009633350186049938, "learning_rate": 0.0001, "loss": 3.4983, "step": 3950 }, { "epoch": 0.45, "grad_norm": 0.0069016763009130955, "learning_rate": 0.0001, "loss": 3.5074, "step": 3960 }, { "epoch": 0.45, "grad_norm": 0.009412202052772045, "learning_rate": 0.0001, "loss": 3.5389, "step": 3970 }, { "epoch": 0.45, "grad_norm": 0.0066176713444292545, "learning_rate": 0.0001, "loss": 3.5366, "step": 3980 }, { "epoch": 0.45, "grad_norm": 0.006833420135080814, "learning_rate": 0.0001, "loss": 3.4753, "step": 3990 }, { "epoch": 0.45, "grad_norm": 0.008369448594748974, "learning_rate": 0.0001, "loss": 3.4061, "step": 4000 }, { "epoch": 0.45, "grad_norm": 0.007651700172573328, "learning_rate": 0.0001, "loss": 3.467, "step": 4010 }, { "epoch": 0.46, "grad_norm": 0.0064993686974048615, "learning_rate": 0.0001, "loss": 3.5594, "step": 4020 }, { "epoch": 0.46, "grad_norm": 0.0061715105548501015, "learning_rate": 0.0001, "loss": 3.5804, "step": 4030 }, { "epoch": 0.46, "grad_norm": 0.006790760438889265, "learning_rate": 0.0001, "loss": 3.4945, "step": 4040 }, { "epoch": 0.46, "grad_norm": 0.0047807940281927586, "learning_rate": 0.0001, "loss": 3.4802, "step": 4050 }, { "epoch": 0.46, "grad_norm": 0.007049499545246363, "learning_rate": 0.0001, "loss": 3.5403, "step": 4060 }, { "epoch": 0.46, "grad_norm": 0.00626937672495842, "learning_rate": 0.0001, "loss": 3.4693, "step": 4070 }, { "epoch": 0.46, "grad_norm": 0.004158223047852516, "learning_rate": 0.0001, "loss": 3.5802, "step": 4080 }, { "epoch": 0.46, "grad_norm": 0.005880121607333422, "learning_rate": 0.0001, "loss": 3.5306, "step": 4090 }, { "epoch": 0.46, "grad_norm": 0.004719247575849295, "learning_rate": 0.0001, "loss": 3.5452, "step": 4100 }, { "epoch": 0.47, "grad_norm": 0.007311057299375534, "learning_rate": 0.0001, "loss": 3.4807, "step": 4110 }, { "epoch": 0.47, "grad_norm": 0.0060755410231649876, "learning_rate": 0.0001, "loss": 3.5494, "step": 4120 }, { "epoch": 0.47, "grad_norm": 0.004775831010192633, "learning_rate": 0.0001, "loss": 3.4815, "step": 4130 }, { "epoch": 0.47, "grad_norm": 0.003588103223592043, "learning_rate": 0.0001, "loss": 3.5617, "step": 4140 }, { "epoch": 0.47, "grad_norm": 0.004304265137761831, "learning_rate": 0.0001, "loss": 3.5688, "step": 4150 }, { "epoch": 0.47, "grad_norm": 0.005103942472487688, "learning_rate": 0.0001, "loss": 3.4562, "step": 4160 }, { "epoch": 0.47, "grad_norm": 0.006716585252434015, "learning_rate": 0.0001, "loss": 3.5336, "step": 4170 }, { "epoch": 0.47, "grad_norm": 0.006754782982170582, "learning_rate": 0.0001, "loss": 3.5743, "step": 4180 }, { "epoch": 0.47, "grad_norm": 0.00561788072809577, "learning_rate": 0.0001, "loss": 3.5539, "step": 4190 }, { "epoch": 0.48, "grad_norm": 0.004580601584166288, "learning_rate": 0.0001, "loss": 3.6548, "step": 4200 }, { "epoch": 0.48, "grad_norm": 0.009967143647372723, "learning_rate": 0.0001, "loss": 3.5058, "step": 4210 }, { "epoch": 0.48, "grad_norm": 0.005966213531792164, "learning_rate": 0.0001, "loss": 3.587, "step": 4220 }, { "epoch": 0.48, "grad_norm": 0.005688393488526344, "learning_rate": 0.0001, "loss": 3.4741, "step": 4230 }, { 
"epoch": 0.48, "grad_norm": 0.0053281961008906364, "learning_rate": 0.0001, "loss": 3.5961, "step": 4240 }, { "epoch": 0.48, "grad_norm": 0.0052569652907550335, "learning_rate": 0.0001, "loss": 3.4757, "step": 4250 }, { "epoch": 0.48, "grad_norm": 0.006128863897174597, "learning_rate": 0.0001, "loss": 3.4782, "step": 4260 }, { "epoch": 0.48, "grad_norm": 0.009978166781365871, "learning_rate": 0.0001, "loss": 3.5193, "step": 4270 }, { "epoch": 0.48, "grad_norm": 0.011794301681220531, "learning_rate": 0.0001, "loss": 3.4534, "step": 4280 }, { "epoch": 0.49, "grad_norm": 0.0062560117803514, "learning_rate": 0.0001, "loss": 3.4635, "step": 4290 }, { "epoch": 0.49, "grad_norm": 0.004440414719283581, "learning_rate": 0.0001, "loss": 3.53, "step": 4300 }, { "epoch": 0.49, "grad_norm": 0.00489009078592062, "learning_rate": 0.0001, "loss": 3.5021, "step": 4310 }, { "epoch": 0.49, "grad_norm": 0.007063284981995821, "learning_rate": 0.0001, "loss": 3.5614, "step": 4320 }, { "epoch": 0.49, "grad_norm": 0.004356670659035444, "learning_rate": 0.0001, "loss": 3.4157, "step": 4330 }, { "epoch": 0.49, "grad_norm": 0.01129516214132309, "learning_rate": 0.0001, "loss": 3.5085, "step": 4340 }, { "epoch": 0.49, "grad_norm": 0.004313796758651733, "learning_rate": 0.0001, "loss": 3.4649, "step": 4350 }, { "epoch": 0.49, "grad_norm": 0.004005118273198605, "learning_rate": 0.0001, "loss": 3.547, "step": 4360 }, { "epoch": 0.5, "grad_norm": 0.0036425346042960882, "learning_rate": 0.0001, "loss": 3.5366, "step": 4370 }, { "epoch": 0.5, "grad_norm": 0.04148484393954277, "learning_rate": 0.0001, "loss": 3.5006, "step": 4380 }, { "epoch": 0.5, "grad_norm": 0.00812317244708538, "learning_rate": 0.0001, "loss": 3.5786, "step": 4390 }, { "epoch": 0.5, "grad_norm": 0.0076829190365970135, "learning_rate": 0.0001, "loss": 3.5302, "step": 4400 }, { "epoch": 0.5, "grad_norm": 0.013333278708159924, "learning_rate": 0.0001, "loss": 3.5847, "step": 4410 }, { "epoch": 0.5, "grad_norm": 0.003991179633885622, "learning_rate": 0.0001, "loss": 3.4896, "step": 4420 }, { "epoch": 0.5, "grad_norm": 0.004840311128646135, "learning_rate": 0.0001, "loss": 3.5633, "step": 4430 }, { "epoch": 0.5, "grad_norm": 0.007540411315858364, "learning_rate": 0.0001, "loss": 3.445, "step": 4440 }, { "epoch": 0.5, "grad_norm": 0.003558919532224536, "learning_rate": 0.0001, "loss": 3.4732, "step": 4450 }, { "epoch": 0.51, "grad_norm": 0.004998435731977224, "learning_rate": 0.0001, "loss": 3.5524, "step": 4460 }, { "epoch": 0.51, "grad_norm": 0.005125248804688454, "learning_rate": 0.0001, "loss": 3.5482, "step": 4470 }, { "epoch": 0.51, "grad_norm": 0.0038299821317195892, "learning_rate": 0.0001, "loss": 3.5663, "step": 4480 }, { "epoch": 0.51, "grad_norm": 0.003909745253622532, "learning_rate": 0.0001, "loss": 3.5524, "step": 4490 }, { "epoch": 0.51, "grad_norm": 0.003921572584658861, "learning_rate": 0.0001, "loss": 3.4442, "step": 4500 }, { "epoch": 0.51, "grad_norm": 0.004062464460730553, "learning_rate": 0.0001, "loss": 3.4952, "step": 4510 }, { "epoch": 0.51, "grad_norm": 0.006176056805998087, "learning_rate": 0.0001, "loss": 3.5, "step": 4520 }, { "epoch": 0.51, "grad_norm": 0.004539003595709801, "learning_rate": 0.0001, "loss": 3.4264, "step": 4530 }, { "epoch": 0.51, "grad_norm": 0.0047151786275208, "learning_rate": 0.0001, "loss": 3.5237, "step": 4540 }, { "epoch": 0.52, "grad_norm": 0.003135774051770568, "learning_rate": 0.0001, "loss": 3.5331, "step": 4550 }, { "epoch": 0.52, "grad_norm": 0.003931379411369562, "learning_rate": 0.0001, "loss": 
3.5101, "step": 4560 }, { "epoch": 0.52, "grad_norm": 0.005207459907978773, "learning_rate": 0.0001, "loss": 3.6127, "step": 4570 }, { "epoch": 0.52, "grad_norm": 0.011244265362620354, "learning_rate": 0.0001, "loss": 3.5931, "step": 4580 }, { "epoch": 0.52, "grad_norm": 0.01563664898276329, "learning_rate": 0.0001, "loss": 3.4802, "step": 4590 }, { "epoch": 0.52, "grad_norm": 0.009943433105945587, "learning_rate": 0.0001, "loss": 3.4633, "step": 4600 }, { "epoch": 0.52, "grad_norm": 0.018843140453100204, "learning_rate": 0.0001, "loss": 3.5562, "step": 4610 }, { "epoch": 0.52, "grad_norm": 0.006936580408364534, "learning_rate": 0.0001, "loss": 3.4169, "step": 4620 }, { "epoch": 0.52, "grad_norm": 0.0038798069581389427, "learning_rate": 0.0001, "loss": 3.4782, "step": 4630 }, { "epoch": 0.53, "grad_norm": 0.003797045908868313, "learning_rate": 0.0001, "loss": 3.4863, "step": 4640 }, { "epoch": 0.53, "grad_norm": 0.003531055059283972, "learning_rate": 0.0001, "loss": 3.5611, "step": 4650 }, { "epoch": 0.53, "grad_norm": 0.0029926279094070196, "learning_rate": 0.0001, "loss": 3.511, "step": 4660 }, { "epoch": 0.53, "grad_norm": 0.0031156709883362055, "learning_rate": 0.0001, "loss": 3.5306, "step": 4670 }, { "epoch": 0.53, "grad_norm": 0.0032562497071921825, "learning_rate": 0.0001, "loss": 3.5593, "step": 4680 }, { "epoch": 0.53, "grad_norm": 0.0027861979324370623, "learning_rate": 0.0001, "loss": 3.5274, "step": 4690 }, { "epoch": 0.53, "grad_norm": 0.0038752038963139057, "learning_rate": 0.0001, "loss": 3.4485, "step": 4700 }, { "epoch": 0.53, "grad_norm": 0.0028181427624076605, "learning_rate": 0.0001, "loss": 3.451, "step": 4710 }, { "epoch": 0.53, "grad_norm": 0.00272397231310606, "learning_rate": 0.0001, "loss": 3.5355, "step": 4720 }, { "epoch": 0.54, "grad_norm": 0.0031912873964756727, "learning_rate": 0.0001, "loss": 3.5292, "step": 4730 }, { "epoch": 0.54, "grad_norm": 0.002808883087709546, "learning_rate": 0.0001, "loss": 3.5226, "step": 4740 }, { "epoch": 0.54, "grad_norm": 0.003119220957159996, "learning_rate": 0.0001, "loss": 3.4446, "step": 4750 }, { "epoch": 0.54, "grad_norm": 0.0029981997795403004, "learning_rate": 0.0001, "loss": 3.5747, "step": 4760 }, { "epoch": 0.54, "grad_norm": 0.007231538183987141, "learning_rate": 0.0001, "loss": 3.4832, "step": 4770 }, { "epoch": 0.54, "grad_norm": 0.004966236650943756, "learning_rate": 0.0001, "loss": 3.4248, "step": 4780 }, { "epoch": 0.54, "grad_norm": 0.010362996719777584, "learning_rate": 0.0001, "loss": 3.4359, "step": 4790 }, { "epoch": 0.54, "grad_norm": 0.006567921489477158, "learning_rate": 0.0001, "loss": 3.5201, "step": 4800 }, { "epoch": 0.54, "grad_norm": 0.006164238788187504, "learning_rate": 0.0001, "loss": 3.5051, "step": 4810 }, { "epoch": 0.55, "grad_norm": 0.006149006076157093, "learning_rate": 0.0001, "loss": 3.5282, "step": 4820 }, { "epoch": 0.55, "grad_norm": 0.0034897231962531805, "learning_rate": 0.0001, "loss": 3.5177, "step": 4830 }, { "epoch": 0.55, "grad_norm": 0.0044860756024718285, "learning_rate": 0.0001, "loss": 3.5561, "step": 4840 }, { "epoch": 0.55, "grad_norm": 0.003326837904751301, "learning_rate": 0.0001, "loss": 3.5037, "step": 4850 }, { "epoch": 0.55, "grad_norm": 0.0026917154900729656, "learning_rate": 0.0001, "loss": 3.5622, "step": 4860 }, { "epoch": 0.55, "grad_norm": 0.005890505854040384, "learning_rate": 0.0001, "loss": 3.6115, "step": 4870 }, { "epoch": 0.55, "grad_norm": 0.0035702097229659557, "learning_rate": 0.0001, "loss": 3.4598, "step": 4880 }, { "epoch": 0.55, "grad_norm": 
0.004115230869501829, "learning_rate": 0.0001, "loss": 3.5296, "step": 4890 }, { "epoch": 0.56, "grad_norm": 0.005481770262122154, "learning_rate": 0.0001, "loss": 3.4853, "step": 4900 }, { "epoch": 0.56, "grad_norm": 0.003967753611505032, "learning_rate": 0.0001, "loss": 3.5014, "step": 4910 }, { "epoch": 0.56, "grad_norm": 0.0041259825229644775, "learning_rate": 0.0001, "loss": 3.469, "step": 4920 }, { "epoch": 0.56, "grad_norm": 0.002385517815127969, "learning_rate": 0.0001, "loss": 3.4925, "step": 4930 }, { "epoch": 0.56, "grad_norm": 0.0032553309574723244, "learning_rate": 0.0001, "loss": 3.5526, "step": 4940 }, { "epoch": 0.56, "grad_norm": 0.003530130721628666, "learning_rate": 0.0001, "loss": 3.6186, "step": 4950 }, { "epoch": 0.56, "grad_norm": 0.005288898013532162, "learning_rate": 0.0001, "loss": 3.5498, "step": 4960 }, { "epoch": 0.56, "grad_norm": 0.004701194819062948, "learning_rate": 0.0001, "loss": 3.4999, "step": 4970 }, { "epoch": 0.56, "grad_norm": 0.0058403704315423965, "learning_rate": 0.0001, "loss": 3.5369, "step": 4980 }, { "epoch": 0.57, "grad_norm": 0.0035412381403148174, "learning_rate": 0.0001, "loss": 3.4257, "step": 4990 }, { "epoch": 0.57, "grad_norm": 0.00541894044727087, "learning_rate": 0.0001, "loss": 3.5031, "step": 5000 }, { "epoch": 0.57, "grad_norm": 0.0033005536533892155, "learning_rate": 0.0001, "loss": 3.5676, "step": 5010 }, { "epoch": 0.57, "grad_norm": 0.0037940237671136856, "learning_rate": 0.0001, "loss": 3.4441, "step": 5020 }, { "epoch": 0.57, "grad_norm": 0.0028874778654426336, "learning_rate": 0.0001, "loss": 3.5744, "step": 5030 }, { "epoch": 0.57, "grad_norm": 0.002617127262055874, "learning_rate": 0.0001, "loss": 3.455, "step": 5040 }, { "epoch": 0.57, "grad_norm": 0.002719414420425892, "learning_rate": 0.0001, "loss": 3.4812, "step": 5050 }, { "epoch": 0.57, "grad_norm": 0.00237189419567585, "learning_rate": 0.0001, "loss": 3.6061, "step": 5060 }, { "epoch": 0.57, "grad_norm": 0.007999159395694733, "learning_rate": 0.0001, "loss": 3.4401, "step": 5070 }, { "epoch": 0.58, "grad_norm": 0.0032298292499035597, "learning_rate": 0.0001, "loss": 3.4186, "step": 5080 }, { "epoch": 0.58, "grad_norm": 0.003447327297180891, "learning_rate": 0.0001, "loss": 3.3732, "step": 5090 }, { "epoch": 0.58, "grad_norm": 0.002567682880908251, "learning_rate": 0.0001, "loss": 3.4513, "step": 5100 }, { "epoch": 0.58, "grad_norm": 0.005258865188807249, "learning_rate": 0.0001, "loss": 3.4067, "step": 5110 }, { "epoch": 0.58, "grad_norm": 0.0055098384618759155, "learning_rate": 0.0001, "loss": 3.4873, "step": 5120 }, { "epoch": 0.58, "grad_norm": 0.007869357243180275, "learning_rate": 0.0001, "loss": 3.5824, "step": 5130 }, { "epoch": 0.58, "grad_norm": 0.007011805661022663, "learning_rate": 0.0001, "loss": 3.5192, "step": 5140 }, { "epoch": 0.58, "grad_norm": 0.006085286848247051, "learning_rate": 0.0001, "loss": 3.4534, "step": 5150 }, { "epoch": 0.58, "grad_norm": 0.004661299753934145, "learning_rate": 0.0001, "loss": 3.5288, "step": 5160 }, { "epoch": 0.59, "grad_norm": 0.0029236809350550175, "learning_rate": 0.0001, "loss": 3.4763, "step": 5170 }, { "epoch": 0.59, "grad_norm": 0.004299537744373083, "learning_rate": 0.0001, "loss": 3.3701, "step": 5180 }, { "epoch": 0.59, "grad_norm": 0.003542792983353138, "learning_rate": 0.0001, "loss": 3.5394, "step": 5190 }, { "epoch": 0.59, "grad_norm": 0.0032088463194668293, "learning_rate": 0.0001, "loss": 3.5032, "step": 5200 }, { "epoch": 0.59, "grad_norm": 0.0028244787827134132, "learning_rate": 0.0001, "loss": 
3.5015, "step": 5210 }, { "epoch": 0.59, "grad_norm": 0.0023988205939531326, "learning_rate": 0.0001, "loss": 3.5653, "step": 5220 }, { "epoch": 0.59, "grad_norm": 0.005232866387814283, "learning_rate": 0.0001, "loss": 3.5328, "step": 5230 }, { "epoch": 0.59, "grad_norm": 0.002708225278183818, "learning_rate": 0.0001, "loss": 3.4703, "step": 5240 }, { "epoch": 0.59, "grad_norm": 0.0029032945167273283, "learning_rate": 0.0001, "loss": 3.4976, "step": 5250 }, { "epoch": 0.6, "grad_norm": 0.002627847483381629, "learning_rate": 0.0001, "loss": 3.5896, "step": 5260 }, { "epoch": 0.6, "grad_norm": 0.0021770005114376545, "learning_rate": 0.0001, "loss": 3.4624, "step": 5270 }, { "epoch": 0.6, "grad_norm": 0.0029759714379906654, "learning_rate": 0.0001, "loss": 3.5221, "step": 5280 }, { "epoch": 0.6, "grad_norm": 0.00346871349029243, "learning_rate": 0.0001, "loss": 3.4821, "step": 5290 }, { "epoch": 0.6, "grad_norm": 0.0032139397226274014, "learning_rate": 0.0001, "loss": 3.4923, "step": 5300 }, { "epoch": 0.6, "grad_norm": 0.0029621082358062267, "learning_rate": 0.0001, "loss": 3.5431, "step": 5310 }, { "epoch": 0.6, "grad_norm": 0.004221236798912287, "learning_rate": 0.0001, "loss": 3.5232, "step": 5320 }, { "epoch": 0.6, "grad_norm": 0.003510931273922324, "learning_rate": 0.0001, "loss": 3.559, "step": 5330 }, { "epoch": 0.6, "grad_norm": 0.003262583166360855, "learning_rate": 0.0001, "loss": 3.5515, "step": 5340 }, { "epoch": 0.61, "grad_norm": 0.0026294193230569363, "learning_rate": 0.0001, "loss": 3.5166, "step": 5350 }, { "epoch": 0.61, "grad_norm": 0.0037345760501921177, "learning_rate": 0.0001, "loss": 3.4799, "step": 5360 }, { "epoch": 0.61, "grad_norm": 0.002509775571525097, "learning_rate": 0.0001, "loss": 3.5291, "step": 5370 }, { "epoch": 0.61, "grad_norm": 0.002384202554821968, "learning_rate": 0.0001, "loss": 3.5312, "step": 5380 }, { "epoch": 0.61, "grad_norm": 0.0033852660562843084, "learning_rate": 0.0001, "loss": 3.5023, "step": 5390 }, { "epoch": 0.61, "grad_norm": 0.0026892900932580233, "learning_rate": 0.0001, "loss": 3.5452, "step": 5400 }, { "epoch": 0.61, "grad_norm": 0.002228036755695939, "learning_rate": 0.0001, "loss": 3.5714, "step": 5410 }, { "epoch": 0.61, "grad_norm": 0.0019552321173250675, "learning_rate": 0.0001, "loss": 3.6076, "step": 5420 }, { "epoch": 0.62, "grad_norm": 0.0023829012643545866, "learning_rate": 0.0001, "loss": 3.4082, "step": 5430 }, { "epoch": 0.62, "grad_norm": 0.0022175966296344995, "learning_rate": 0.0001, "loss": 3.404, "step": 5440 }, { "epoch": 0.62, "grad_norm": 0.0036344940308481455, "learning_rate": 0.0001, "loss": 3.5012, "step": 5450 }, { "epoch": 0.62, "grad_norm": 0.0034123938530683517, "learning_rate": 0.0001, "loss": 3.4459, "step": 5460 }, { "epoch": 0.62, "grad_norm": 0.0023325171787291765, "learning_rate": 0.0001, "loss": 3.5585, "step": 5470 }, { "epoch": 0.62, "grad_norm": 0.004763777367770672, "learning_rate": 0.0001, "loss": 3.4468, "step": 5480 }, { "epoch": 0.62, "grad_norm": 0.0035664106253534555, "learning_rate": 0.0001, "loss": 3.5413, "step": 5490 }, { "epoch": 0.62, "grad_norm": 0.055835988372564316, "learning_rate": 0.0001, "loss": 3.5339, "step": 5500 }, { "epoch": 0.62, "grad_norm": 0.02960539050400257, "learning_rate": 0.0001, "loss": 3.5052, "step": 5510 }, { "epoch": 0.63, "grad_norm": 0.04090074077248573, "learning_rate": 0.0001, "loss": 3.5967, "step": 5520 }, { "epoch": 0.63, "grad_norm": 0.029112419113516808, "learning_rate": 0.0001, "loss": 3.5398, "step": 5530 }, { "epoch": 0.63, "grad_norm": 
0.010812697932124138, "learning_rate": 0.0001, "loss": 3.5084, "step": 5540 }, { "epoch": 0.63, "grad_norm": 0.009114735759794712, "learning_rate": 0.0001, "loss": 3.5803, "step": 5550 }, { "epoch": 0.63, "grad_norm": 0.006859294138848782, "learning_rate": 0.0001, "loss": 3.4868, "step": 5560 }, { "epoch": 0.63, "grad_norm": 0.008377332240343094, "learning_rate": 0.0001, "loss": 3.5998, "step": 5570 }, { "epoch": 0.63, "grad_norm": 0.00438141543418169, "learning_rate": 0.0001, "loss": 3.5486, "step": 5580 }, { "epoch": 0.63, "grad_norm": 0.0028850266244262457, "learning_rate": 0.0001, "loss": 3.5473, "step": 5590 }, { "epoch": 0.63, "grad_norm": 0.014922870323061943, "learning_rate": 0.0001, "loss": 3.6008, "step": 5600 }, { "epoch": 0.64, "grad_norm": 0.029176076874136925, "learning_rate": 0.0001, "loss": 3.4432, "step": 5610 }, { "epoch": 0.64, "grad_norm": 0.012344942428171635, "learning_rate": 0.0001, "loss": 3.5845, "step": 5620 }, { "epoch": 0.64, "grad_norm": 0.004864950198680162, "learning_rate": 0.0001, "loss": 3.5763, "step": 5630 }, { "epoch": 0.64, "grad_norm": 0.003967669326812029, "learning_rate": 0.0001, "loss": 3.4425, "step": 5640 }, { "epoch": 0.64, "grad_norm": 0.006634435150772333, "learning_rate": 0.0001, "loss": 3.4812, "step": 5650 }, { "epoch": 0.64, "grad_norm": 0.0033314877655357122, "learning_rate": 0.0001, "loss": 3.5244, "step": 5660 }, { "epoch": 0.64, "grad_norm": 0.0033424210269004107, "learning_rate": 0.0001, "loss": 3.4927, "step": 5670 }, { "epoch": 0.64, "grad_norm": 0.02300536260008812, "learning_rate": 0.0001, "loss": 3.4715, "step": 5680 }, { "epoch": 0.64, "grad_norm": 0.020596669986844063, "learning_rate": 0.0001, "loss": 3.4376, "step": 5690 }, { "epoch": 0.65, "grad_norm": 0.012730974704027176, "learning_rate": 0.0001, "loss": 3.4749, "step": 5700 }, { "epoch": 0.65, "grad_norm": 0.005509315058588982, "learning_rate": 0.0001, "loss": 3.46, "step": 5710 }, { "epoch": 0.65, "grad_norm": 0.004550841636955738, "learning_rate": 0.0001, "loss": 3.5683, "step": 5720 }, { "epoch": 0.65, "grad_norm": 0.00411552470177412, "learning_rate": 0.0001, "loss": 3.3635, "step": 5730 }, { "epoch": 0.65, "grad_norm": 0.0026570293121039867, "learning_rate": 0.0001, "loss": 3.5694, "step": 5740 }, { "epoch": 0.65, "grad_norm": 0.0036943620070815086, "learning_rate": 0.0001, "loss": 3.4722, "step": 5750 }, { "epoch": 0.65, "grad_norm": 0.0028080944903194904, "learning_rate": 0.0001, "loss": 3.5155, "step": 5760 }, { "epoch": 0.65, "grad_norm": 0.0027384727727621794, "learning_rate": 0.0001, "loss": 3.4185, "step": 5770 }, { "epoch": 0.65, "grad_norm": 0.002460733288899064, "learning_rate": 0.0001, "loss": 3.5207, "step": 5780 }, { "epoch": 0.66, "grad_norm": 0.00212690606713295, "learning_rate": 0.0001, "loss": 3.4825, "step": 5790 }, { "epoch": 0.66, "grad_norm": 0.0024670776911079884, "learning_rate": 0.0001, "loss": 3.5629, "step": 5800 }, { "epoch": 0.66, "grad_norm": 0.0034288140013813972, "learning_rate": 0.0001, "loss": 3.4807, "step": 5810 }, { "epoch": 0.66, "grad_norm": 0.010083252564072609, "learning_rate": 0.0001, "loss": 3.3923, "step": 5820 }, { "epoch": 0.66, "grad_norm": 0.0026547396555542946, "learning_rate": 0.0001, "loss": 3.4753, "step": 5830 }, { "epoch": 0.66, "grad_norm": 0.00223186775110662, "learning_rate": 0.0001, "loss": 3.5195, "step": 5840 }, { "epoch": 0.66, "grad_norm": 0.0022819829173386097, "learning_rate": 0.0001, "loss": 3.5642, "step": 5850 }, { "epoch": 0.66, "grad_norm": 0.002137158066034317, "learning_rate": 0.0001, "loss": 3.4124, 
"step": 5860 }, { "epoch": 0.66, "grad_norm": 0.0021879777777940035, "learning_rate": 0.0001, "loss": 3.4699, "step": 5870 }, { "epoch": 0.67, "grad_norm": 0.0020602773874998093, "learning_rate": 0.0001, "loss": 3.5854, "step": 5880 }, { "epoch": 0.67, "grad_norm": 0.0021237293258309364, "learning_rate": 0.0001, "loss": 3.512, "step": 5890 }, { "epoch": 0.67, "grad_norm": 0.0022292216308414936, "learning_rate": 0.0001, "loss": 3.4991, "step": 5900 }, { "epoch": 0.67, "grad_norm": 0.0022322346922010183, "learning_rate": 0.0001, "loss": 3.561, "step": 5910 }, { "epoch": 0.67, "grad_norm": 0.0039224689826369286, "learning_rate": 0.0001, "loss": 3.4681, "step": 5920 }, { "epoch": 0.67, "grad_norm": 0.0025937852915376425, "learning_rate": 0.0001, "loss": 3.5209, "step": 5930 }, { "epoch": 0.67, "grad_norm": 0.005112130660563707, "learning_rate": 0.0001, "loss": 3.5241, "step": 5940 }, { "epoch": 0.67, "grad_norm": 0.0027209343388676643, "learning_rate": 0.0001, "loss": 3.485, "step": 5950 }, { "epoch": 0.68, "grad_norm": 0.0019430328393355012, "learning_rate": 0.0001, "loss": 3.5187, "step": 5960 }, { "epoch": 0.68, "grad_norm": 0.0026707707438617945, "learning_rate": 0.0001, "loss": 3.55, "step": 5970 }, { "epoch": 0.68, "grad_norm": 0.001779652200639248, "learning_rate": 0.0001, "loss": 3.5599, "step": 5980 }, { "epoch": 0.68, "grad_norm": 0.002639837795868516, "learning_rate": 0.0001, "loss": 3.5733, "step": 5990 }, { "epoch": 0.68, "grad_norm": 0.0017955248476937413, "learning_rate": 0.0001, "loss": 3.5608, "step": 6000 }, { "epoch": 0.68, "grad_norm": 0.002939866855740547, "learning_rate": 0.0001, "loss": 3.4963, "step": 6010 }, { "epoch": 0.68, "grad_norm": 0.0017706416547298431, "learning_rate": 0.0001, "loss": 3.5619, "step": 6020 }, { "epoch": 0.68, "grad_norm": 0.0017689632950350642, "learning_rate": 0.0001, "loss": 3.4732, "step": 6030 }, { "epoch": 0.68, "grad_norm": 0.002011792967095971, "learning_rate": 0.0001, "loss": 3.4312, "step": 6040 }, { "epoch": 0.69, "grad_norm": 0.0026646656915545464, "learning_rate": 0.0001, "loss": 3.5369, "step": 6050 }, { "epoch": 0.69, "grad_norm": 0.0035712404642254114, "learning_rate": 0.0001, "loss": 3.498, "step": 6060 }, { "epoch": 0.69, "grad_norm": 0.0017177624395117164, "learning_rate": 0.0001, "loss": 3.531, "step": 6070 }, { "epoch": 0.69, "grad_norm": 0.002107428153976798, "learning_rate": 0.0001, "loss": 3.5261, "step": 6080 }, { "epoch": 0.69, "grad_norm": 0.001888448721729219, "learning_rate": 0.0001, "loss": 3.5391, "step": 6090 }, { "epoch": 0.69, "grad_norm": 0.0021331310272216797, "learning_rate": 0.0001, "loss": 3.4185, "step": 6100 }, { "epoch": 0.69, "grad_norm": 0.0015360262477770448, "learning_rate": 0.0001, "loss": 3.4601, "step": 6110 }, { "epoch": 0.69, "grad_norm": 0.004883873276412487, "learning_rate": 0.0001, "loss": 3.4866, "step": 6120 }, { "epoch": 0.69, "grad_norm": 0.0035849157720804214, "learning_rate": 0.0001, "loss": 3.5602, "step": 6130 }, { "epoch": 0.7, "grad_norm": 0.004911376163363457, "learning_rate": 0.0001, "loss": 3.5617, "step": 6140 }, { "epoch": 0.7, "grad_norm": 0.0022321329452097416, "learning_rate": 0.0001, "loss": 3.5117, "step": 6150 }, { "epoch": 0.7, "grad_norm": 0.002402201760560274, "learning_rate": 0.0001, "loss": 3.4076, "step": 6160 }, { "epoch": 0.7, "grad_norm": 0.002760558854788542, "learning_rate": 0.0001, "loss": 3.4474, "step": 6170 }, { "epoch": 0.7, "grad_norm": 0.002066223882138729, "learning_rate": 0.0001, "loss": 3.5238, "step": 6180 }, { "epoch": 0.7, "grad_norm": 
0.0021176980808377266, "learning_rate": 0.0001, "loss": 3.4709, "step": 6190 }, { "epoch": 0.7, "grad_norm": 0.0021869249176234007, "learning_rate": 0.0001, "loss": 3.515, "step": 6200 }, { "epoch": 0.7, "grad_norm": 0.0026778511237353086, "learning_rate": 0.0001, "loss": 3.5087, "step": 6210 }, { "epoch": 0.7, "grad_norm": 0.0038428560364991426, "learning_rate": 0.0001, "loss": 3.5096, "step": 6220 }, { "epoch": 0.71, "grad_norm": 0.002261572750285268, "learning_rate": 0.0001, "loss": 3.5225, "step": 6230 }, { "epoch": 0.71, "grad_norm": 0.002262414200231433, "learning_rate": 0.0001, "loss": 3.4115, "step": 6240 }, { "epoch": 0.71, "grad_norm": 0.0023483489640057087, "learning_rate": 0.0001, "loss": 3.626, "step": 6250 }, { "epoch": 0.71, "grad_norm": 0.0020373782608658075, "learning_rate": 0.0001, "loss": 3.4309, "step": 6260 }, { "epoch": 0.71, "grad_norm": 0.001937716151587665, "learning_rate": 0.0001, "loss": 3.4485, "step": 6270 }, { "epoch": 0.71, "grad_norm": 0.0025742771103978157, "learning_rate": 0.0001, "loss": 3.4844, "step": 6280 }, { "epoch": 0.71, "grad_norm": 0.00254641892388463, "learning_rate": 0.0001, "loss": 3.5622, "step": 6290 }, { "epoch": 0.71, "grad_norm": 0.002595453057438135, "learning_rate": 0.0001, "loss": 3.5518, "step": 6300 }, { "epoch": 0.71, "grad_norm": 0.0020915817003697157, "learning_rate": 0.0001, "loss": 3.5254, "step": 6310 }, { "epoch": 0.72, "grad_norm": 0.0017456605564802885, "learning_rate": 0.0001, "loss": 3.5666, "step": 6320 }, { "epoch": 0.72, "grad_norm": 0.002608217764645815, "learning_rate": 0.0001, "loss": 3.5222, "step": 6330 }, { "epoch": 0.72, "grad_norm": 0.004172658082097769, "learning_rate": 0.0001, "loss": 3.4997, "step": 6340 }, { "epoch": 0.72, "grad_norm": 0.0027178700547665358, "learning_rate": 0.0001, "loss": 3.4445, "step": 6350 }, { "epoch": 0.72, "grad_norm": 0.004284811671823263, "learning_rate": 0.0001, "loss": 3.5405, "step": 6360 }, { "epoch": 0.72, "grad_norm": 0.007599606644362211, "learning_rate": 0.0001, "loss": 3.5489, "step": 6370 }, { "epoch": 0.72, "grad_norm": 0.009692888706922531, "learning_rate": 0.0001, "loss": 3.5764, "step": 6380 }, { "epoch": 0.72, "grad_norm": 0.006543254014104605, "learning_rate": 0.0001, "loss": 3.5205, "step": 6390 }, { "epoch": 0.72, "grad_norm": 0.003983669448643923, "learning_rate": 0.0001, "loss": 3.5557, "step": 6400 }, { "epoch": 0.73, "grad_norm": 0.003046205500140786, "learning_rate": 0.0001, "loss": 3.5775, "step": 6410 }, { "epoch": 0.73, "grad_norm": 0.0024033929221332073, "learning_rate": 0.0001, "loss": 3.4985, "step": 6420 }, { "epoch": 0.73, "grad_norm": 0.0022712298668920994, "learning_rate": 0.0001, "loss": 3.5098, "step": 6430 }, { "epoch": 0.73, "grad_norm": 0.003945188596844673, "learning_rate": 0.0001, "loss": 3.4616, "step": 6440 }, { "epoch": 0.73, "grad_norm": 0.0021597538143396378, "learning_rate": 0.0001, "loss": 3.5493, "step": 6450 }, { "epoch": 0.73, "grad_norm": 0.0028619160875678062, "learning_rate": 0.0001, "loss": 3.4671, "step": 6460 }, { "epoch": 0.73, "grad_norm": 0.002111823996528983, "learning_rate": 0.0001, "loss": 3.5653, "step": 6470 }, { "epoch": 0.73, "grad_norm": 0.005053882487118244, "learning_rate": 0.0001, "loss": 3.6688, "step": 6480 }, { "epoch": 0.74, "grad_norm": 0.002283085137605667, "learning_rate": 0.0001, "loss": 3.5123, "step": 6490 }, { "epoch": 0.74, "grad_norm": 0.0021745525300502777, "learning_rate": 0.0001, "loss": 3.4865, "step": 6500 }, { "epoch": 0.74, "grad_norm": 0.0036136761773377657, "learning_rate": 0.0001, "loss": 
3.4717, "step": 6510 }, { "epoch": 0.74, "grad_norm": 0.002270336728543043, "learning_rate": 0.0001, "loss": 3.5267, "step": 6520 }, { "epoch": 0.74, "grad_norm": 0.002152936765924096, "learning_rate": 0.0001, "loss": 3.5606, "step": 6530 }, { "epoch": 0.74, "grad_norm": 0.0016581750242039561, "learning_rate": 0.0001, "loss": 3.4797, "step": 6540 }, { "epoch": 0.74, "grad_norm": 0.0018523556645959616, "learning_rate": 0.0001, "loss": 3.4933, "step": 6550 }, { "epoch": 0.74, "grad_norm": 0.0033584292978048325, "learning_rate": 0.0001, "loss": 3.586, "step": 6560 }, { "epoch": 0.74, "grad_norm": 0.003546541789546609, "learning_rate": 0.0001, "loss": 3.4627, "step": 6570 }, { "epoch": 0.75, "grad_norm": 0.003265738720074296, "learning_rate": 0.0001, "loss": 3.51, "step": 6580 }, { "epoch": 0.75, "grad_norm": 0.0039695920422673225, "learning_rate": 0.0001, "loss": 3.4609, "step": 6590 }, { "epoch": 0.75, "grad_norm": 0.002952682552859187, "learning_rate": 0.0001, "loss": 3.5163, "step": 6600 }, { "epoch": 0.75, "grad_norm": 0.004474209621548653, "learning_rate": 0.0001, "loss": 3.6306, "step": 6610 }, { "epoch": 0.75, "grad_norm": 0.003792545525357127, "learning_rate": 0.0001, "loss": 3.5607, "step": 6620 }, { "epoch": 0.75, "grad_norm": 0.001925045857205987, "learning_rate": 0.0001, "loss": 3.4413, "step": 6630 }, { "epoch": 0.75, "grad_norm": 0.007960534654557705, "learning_rate": 0.0001, "loss": 3.531, "step": 6640 }, { "epoch": 0.75, "grad_norm": 0.0032272860407829285, "learning_rate": 0.0001, "loss": 3.5038, "step": 6650 }, { "epoch": 0.75, "grad_norm": 0.004008360207080841, "learning_rate": 0.0001, "loss": 3.461, "step": 6660 }, { "epoch": 0.76, "grad_norm": 0.0027241287752985954, "learning_rate": 0.0001, "loss": 3.5522, "step": 6670 }, { "epoch": 0.76, "grad_norm": 0.0031653789337724447, "learning_rate": 0.0001, "loss": 3.581, "step": 6680 }, { "epoch": 0.76, "grad_norm": 0.002086544642224908, "learning_rate": 0.0001, "loss": 3.4335, "step": 6690 }, { "epoch": 0.76, "grad_norm": 0.003233155468478799, "learning_rate": 0.0001, "loss": 3.6119, "step": 6700 }, { "epoch": 0.76, "grad_norm": 0.002152677858248353, "learning_rate": 0.0001, "loss": 3.4723, "step": 6710 }, { "epoch": 0.76, "grad_norm": 0.0016281877178698778, "learning_rate": 0.0001, "loss": 3.4861, "step": 6720 }, { "epoch": 0.76, "grad_norm": 0.0015240154461935163, "learning_rate": 0.0001, "loss": 3.5635, "step": 6730 }, { "epoch": 0.76, "grad_norm": 0.003363180672749877, "learning_rate": 0.0001, "loss": 3.4608, "step": 6740 }, { "epoch": 0.76, "grad_norm": 0.002496914705261588, "learning_rate": 0.0001, "loss": 3.4728, "step": 6750 }, { "epoch": 0.77, "grad_norm": 0.002102766651660204, "learning_rate": 0.0001, "loss": 3.4712, "step": 6760 }, { "epoch": 0.77, "grad_norm": 0.00232588779181242, "learning_rate": 0.0001, "loss": 3.5, "step": 6770 }, { "epoch": 0.77, "grad_norm": 0.0021653014700859785, "learning_rate": 0.0001, "loss": 3.5763, "step": 6780 }, { "epoch": 0.77, "grad_norm": 0.0022148117423057556, "learning_rate": 0.0001, "loss": 3.526, "step": 6790 }, { "epoch": 0.77, "grad_norm": 0.002361193997785449, "learning_rate": 0.0001, "loss": 3.5583, "step": 6800 }, { "epoch": 0.77, "grad_norm": 0.002198055386543274, "learning_rate": 0.0001, "loss": 3.6021, "step": 6810 }, { "epoch": 0.77, "grad_norm": 0.002976398915052414, "learning_rate": 0.0001, "loss": 3.5281, "step": 6820 }, { "epoch": 0.77, "grad_norm": 0.002503900555893779, "learning_rate": 0.0001, "loss": 3.5366, "step": 6830 }, { "epoch": 0.77, "grad_norm": 
0.001769381109625101, "learning_rate": 0.0001, "loss": 3.5557, "step": 6840 }, { "epoch": 0.78, "grad_norm": 0.002877818187698722, "learning_rate": 0.0001, "loss": 3.5773, "step": 6850 }, { "epoch": 0.78, "grad_norm": 0.005399429239332676, "learning_rate": 0.0001, "loss": 3.6085, "step": 6860 }, { "epoch": 0.78, "grad_norm": 0.0037224770057946444, "learning_rate": 0.0001, "loss": 3.4699, "step": 6870 }, { "epoch": 0.78, "grad_norm": 0.004061304964125156, "learning_rate": 0.0001, "loss": 3.5677, "step": 6880 }, { "epoch": 0.78, "grad_norm": 0.0026404671370983124, "learning_rate": 0.0001, "loss": 3.5213, "step": 6890 }, { "epoch": 0.78, "grad_norm": 0.005314365029335022, "learning_rate": 0.0001, "loss": 3.5424, "step": 6900 }, { "epoch": 0.78, "grad_norm": 0.005096503533422947, "learning_rate": 0.0001, "loss": 3.5514, "step": 6910 }, { "epoch": 0.78, "grad_norm": 0.0021523437462747097, "learning_rate": 0.0001, "loss": 3.5454, "step": 6920 }, { "epoch": 0.79, "grad_norm": 0.0026106773875653744, "learning_rate": 0.0001, "loss": 3.6114, "step": 6930 }, { "epoch": 0.79, "grad_norm": 0.002172949491068721, "learning_rate": 0.0001, "loss": 3.5143, "step": 6940 }, { "epoch": 0.79, "grad_norm": 0.001509875524789095, "learning_rate": 0.0001, "loss": 3.5963, "step": 6950 }, { "epoch": 0.79, "grad_norm": 0.0025583370588719845, "learning_rate": 0.0001, "loss": 3.578, "step": 6960 }, { "epoch": 0.79, "grad_norm": 0.0016860014293342829, "learning_rate": 0.0001, "loss": 3.4862, "step": 6970 }, { "epoch": 0.79, "grad_norm": 0.0014592402148991823, "learning_rate": 0.0001, "loss": 3.4456, "step": 6980 }, { "epoch": 0.79, "grad_norm": 0.002015444915741682, "learning_rate": 0.0001, "loss": 3.5774, "step": 6990 }, { "epoch": 0.79, "grad_norm": 0.0019285731250420213, "learning_rate": 0.0001, "loss": 3.5576, "step": 7000 }, { "epoch": 0.79, "grad_norm": 0.0014443027321249247, "learning_rate": 0.0001, "loss": 3.603, "step": 7010 }, { "epoch": 0.8, "grad_norm": 0.0016567070269957185, "learning_rate": 0.0001, "loss": 3.5527, "step": 7020 }, { "epoch": 0.8, "grad_norm": 0.0014556662645190954, "learning_rate": 0.0001, "loss": 3.4599, "step": 7030 }, { "epoch": 0.8, "grad_norm": 0.0015021538129076362, "learning_rate": 0.0001, "loss": 3.4801, "step": 7040 }, { "epoch": 0.8, "grad_norm": 0.0017875488847494125, "learning_rate": 0.0001, "loss": 3.4179, "step": 7050 }, { "epoch": 0.8, "grad_norm": 0.0017138911643996835, "learning_rate": 0.0001, "loss": 3.4651, "step": 7060 }, { "epoch": 0.8, "grad_norm": 0.0019092384027317166, "learning_rate": 0.0001, "loss": 3.5147, "step": 7070 }, { "epoch": 0.8, "grad_norm": 0.001827580388635397, "learning_rate": 0.0001, "loss": 3.504, "step": 7080 }, { "epoch": 0.8, "grad_norm": 0.0034550947602838278, "learning_rate": 0.0001, "loss": 3.5478, "step": 7090 }, { "epoch": 0.8, "grad_norm": 0.0013913381844758987, "learning_rate": 0.0001, "loss": 3.3851, "step": 7100 }, { "epoch": 0.81, "grad_norm": 0.0030989309307187796, "learning_rate": 0.0001, "loss": 3.5113, "step": 7110 }, { "epoch": 0.81, "grad_norm": 0.002065989887341857, "learning_rate": 0.0001, "loss": 3.6111, "step": 7120 }, { "epoch": 0.81, "grad_norm": 0.0018296150956302881, "learning_rate": 0.0001, "loss": 3.4966, "step": 7130 }, { "epoch": 0.81, "grad_norm": 0.001597485737875104, "learning_rate": 0.0001, "loss": 3.387, "step": 7140 }, { "epoch": 0.81, "grad_norm": 0.0024349491577595472, "learning_rate": 0.0001, "loss": 3.4924, "step": 7150 }, { "epoch": 0.81, "grad_norm": 0.0015689561842009425, "learning_rate": 0.0001, "loss": 
3.5074, "step": 7160 }, { "epoch": 0.81, "grad_norm": 0.0016878427704796195, "learning_rate": 0.0001, "loss": 3.5758, "step": 7170 }, { "epoch": 0.81, "grad_norm": 0.001728406292386353, "learning_rate": 0.0001, "loss": 3.578, "step": 7180 }, { "epoch": 0.81, "grad_norm": 0.0024347121361643076, "learning_rate": 0.0001, "loss": 3.4941, "step": 7190 }, { "epoch": 0.82, "grad_norm": 0.0015046331100165844, "learning_rate": 0.0001, "loss": 3.5673, "step": 7200 }, { "epoch": 0.82, "grad_norm": 0.0021255433093756437, "learning_rate": 0.0001, "loss": 3.4735, "step": 7210 }, { "epoch": 0.82, "grad_norm": 0.0013471359852701426, "learning_rate": 0.0001, "loss": 3.3372, "step": 7220 }, { "epoch": 0.82, "grad_norm": 0.002698899246752262, "learning_rate": 0.0001, "loss": 3.4613, "step": 7230 }, { "epoch": 0.82, "grad_norm": 0.0028237944934517145, "learning_rate": 0.0001, "loss": 3.4952, "step": 7240 }, { "epoch": 0.82, "grad_norm": 0.0020659673027694225, "learning_rate": 0.0001, "loss": 3.4296, "step": 7250 }, { "epoch": 0.82, "grad_norm": 0.0019609539303928614, "learning_rate": 0.0001, "loss": 3.5447, "step": 7260 }, { "epoch": 0.82, "grad_norm": 0.0029372270219027996, "learning_rate": 0.0001, "loss": 3.5498, "step": 7270 }, { "epoch": 0.82, "grad_norm": 0.0023613173980265856, "learning_rate": 0.0001, "loss": 3.5279, "step": 7280 }, { "epoch": 0.83, "grad_norm": 0.0019921702332794666, "learning_rate": 0.0001, "loss": 3.515, "step": 7290 }, { "epoch": 0.83, "grad_norm": 0.0018286476843059063, "learning_rate": 0.0001, "loss": 3.5901, "step": 7300 }, { "epoch": 0.83, "grad_norm": 0.0019646466244012117, "learning_rate": 0.0001, "loss": 3.4548, "step": 7310 }, { "epoch": 0.83, "grad_norm": 0.001541351666674018, "learning_rate": 0.0001, "loss": 3.5205, "step": 7320 }, { "epoch": 0.83, "grad_norm": 0.0014243964105844498, "learning_rate": 0.0001, "loss": 3.4736, "step": 7330 }, { "epoch": 0.83, "grad_norm": 0.00150727026630193, "learning_rate": 0.0001, "loss": 3.4336, "step": 7340 }, { "epoch": 0.83, "grad_norm": 0.003093665698543191, "learning_rate": 0.0001, "loss": 3.5161, "step": 7350 }, { "epoch": 0.83, "grad_norm": 0.002368559828028083, "learning_rate": 0.0001, "loss": 3.468, "step": 7360 }, { "epoch": 0.83, "grad_norm": 0.001803843304514885, "learning_rate": 0.0001, "loss": 3.5316, "step": 7370 }, { "epoch": 0.84, "grad_norm": 0.005914970301091671, "learning_rate": 0.0001, "loss": 3.5009, "step": 7380 }, { "epoch": 0.84, "grad_norm": 0.003101928625255823, "learning_rate": 0.0001, "loss": 3.5174, "step": 7390 }, { "epoch": 0.84, "grad_norm": 0.00524236261844635, "learning_rate": 0.0001, "loss": 3.568, "step": 7400 }, { "epoch": 0.84, "grad_norm": 0.0018921655137091875, "learning_rate": 0.0001, "loss": 3.4927, "step": 7410 }, { "epoch": 0.84, "grad_norm": 0.001932639628648758, "learning_rate": 0.0001, "loss": 3.4672, "step": 7420 }, { "epoch": 0.84, "grad_norm": 0.0015908328350633383, "learning_rate": 0.0001, "loss": 3.4384, "step": 7430 }, { "epoch": 0.84, "grad_norm": 0.0015303731197491288, "learning_rate": 0.0001, "loss": 3.5326, "step": 7440 }, { "epoch": 0.84, "grad_norm": 0.001463241525925696, "learning_rate": 0.0001, "loss": 3.4452, "step": 7450 }, { "epoch": 0.85, "grad_norm": 0.0023358971811830997, "learning_rate": 0.0001, "loss": 3.4901, "step": 7460 }, { "epoch": 0.85, "grad_norm": 0.0012427878100425005, "learning_rate": 0.0001, "loss": 3.5108, "step": 7470 }, { "epoch": 0.85, "grad_norm": 0.0029764787759631872, "learning_rate": 0.0001, "loss": 3.4142, "step": 7480 }, { "epoch": 0.85, 
"grad_norm": 0.00797741673886776, "learning_rate": 0.0001, "loss": 3.5709, "step": 7490 }, { "epoch": 0.85, "grad_norm": 0.002695192815735936, "learning_rate": 0.0001, "loss": 3.6059, "step": 7500 }, { "epoch": 0.85, "grad_norm": 0.0044453623704612255, "learning_rate": 0.0001, "loss": 3.5277, "step": 7510 }, { "epoch": 0.85, "grad_norm": 0.045631904155015945, "learning_rate": 0.0001, "loss": 3.5964, "step": 7520 }, { "epoch": 0.85, "grad_norm": 0.03496149182319641, "learning_rate": 0.0001, "loss": 3.501, "step": 7530 }, { "epoch": 0.85, "grad_norm": 0.05206778272986412, "learning_rate": 0.0001, "loss": 3.4756, "step": 7540 }, { "epoch": 0.86, "grad_norm": 0.027322160080075264, "learning_rate": 0.0001, "loss": 3.4262, "step": 7550 }, { "epoch": 0.86, "grad_norm": 0.014653191901743412, "learning_rate": 0.0001, "loss": 3.5232, "step": 7560 }, { "epoch": 0.86, "grad_norm": 0.01232716254889965, "learning_rate": 0.0001, "loss": 3.5063, "step": 7570 }, { "epoch": 0.86, "grad_norm": 0.005905072204768658, "learning_rate": 0.0001, "loss": 3.6057, "step": 7580 }, { "epoch": 0.86, "grad_norm": 0.0038961265236139297, "learning_rate": 0.0001, "loss": 3.5686, "step": 7590 }, { "epoch": 0.86, "grad_norm": 0.002840026980265975, "learning_rate": 0.0001, "loss": 3.4314, "step": 7600 }, { "epoch": 0.86, "grad_norm": 0.002874474273994565, "learning_rate": 0.0001, "loss": 3.5452, "step": 7610 }, { "epoch": 0.86, "grad_norm": 0.0023985644802451134, "learning_rate": 0.0001, "loss": 3.5312, "step": 7620 }, { "epoch": 0.86, "grad_norm": 0.00290778954513371, "learning_rate": 0.0001, "loss": 3.4743, "step": 7630 }, { "epoch": 0.87, "grad_norm": 0.0027740399818867445, "learning_rate": 0.0001, "loss": 3.4529, "step": 7640 }, { "epoch": 0.87, "grad_norm": 0.0018519266741350293, "learning_rate": 0.0001, "loss": 3.5161, "step": 7650 }, { "epoch": 0.87, "grad_norm": 0.0053995088674128056, "learning_rate": 0.0001, "loss": 3.4832, "step": 7660 }, { "epoch": 0.87, "grad_norm": 0.0021069904323667288, "learning_rate": 0.0001, "loss": 3.5989, "step": 7670 }, { "epoch": 0.87, "grad_norm": 0.0022181272506713867, "learning_rate": 0.0001, "loss": 3.5235, "step": 7680 }, { "epoch": 0.87, "grad_norm": 0.0020184614695608616, "learning_rate": 0.0001, "loss": 3.5765, "step": 7690 }, { "epoch": 0.87, "grad_norm": 0.016120506450533867, "learning_rate": 0.0001, "loss": 3.4951, "step": 7700 }, { "epoch": 0.87, "grad_norm": 0.0031799643766134977, "learning_rate": 0.0001, "loss": 3.5935, "step": 7710 }, { "epoch": 0.87, "grad_norm": 0.004949579946696758, "learning_rate": 0.0001, "loss": 3.5075, "step": 7720 }, { "epoch": 0.88, "grad_norm": 0.002587946131825447, "learning_rate": 0.0001, "loss": 3.4781, "step": 7730 }, { "epoch": 0.88, "grad_norm": 0.0028098858892917633, "learning_rate": 0.0001, "loss": 3.5373, "step": 7740 }, { "epoch": 0.88, "grad_norm": 0.0017217363929376006, "learning_rate": 0.0001, "loss": 3.5993, "step": 7750 }, { "epoch": 0.88, "grad_norm": 0.00233808858320117, "learning_rate": 0.0001, "loss": 3.5068, "step": 7760 }, { "epoch": 0.88, "grad_norm": 0.0022718068212270737, "learning_rate": 0.0001, "loss": 3.5228, "step": 7770 }, { "epoch": 0.88, "grad_norm": 0.00200471724383533, "learning_rate": 0.0001, "loss": 3.5087, "step": 7780 }, { "epoch": 0.88, "grad_norm": 0.001667297095991671, "learning_rate": 0.0001, "loss": 3.4176, "step": 7790 }, { "epoch": 0.88, "grad_norm": 0.0024583961348980665, "learning_rate": 0.0001, "loss": 3.5167, "step": 7800 }, { "epoch": 0.88, "grad_norm": 0.0020187138579785824, "learning_rate": 0.0001, 
"loss": 3.462, "step": 7810 }, { "epoch": 0.89, "grad_norm": 0.0011740680783987045, "learning_rate": 0.0001, "loss": 3.5415, "step": 7820 }, { "epoch": 0.89, "grad_norm": 0.0030410962644964457, "learning_rate": 0.0001, "loss": 3.5107, "step": 7830 }, { "epoch": 0.89, "grad_norm": 0.005019365809857845, "learning_rate": 0.0001, "loss": 3.5137, "step": 7840 }, { "epoch": 0.89, "grad_norm": 0.0014164326712489128, "learning_rate": 0.0001, "loss": 3.4566, "step": 7850 }, { "epoch": 0.89, "grad_norm": 0.0011655398411676288, "learning_rate": 0.0001, "loss": 3.4812, "step": 7860 }, { "epoch": 0.89, "grad_norm": 0.0014012375613674521, "learning_rate": 0.0001, "loss": 3.4385, "step": 7870 }, { "epoch": 0.89, "grad_norm": 0.0019577438943088055, "learning_rate": 0.0001, "loss": 3.5095, "step": 7880 }, { "epoch": 0.89, "grad_norm": 0.0015591892879456282, "learning_rate": 0.0001, "loss": 3.4578, "step": 7890 }, { "epoch": 0.89, "grad_norm": 0.015045158565044403, "learning_rate": 0.0001, "loss": 3.4875, "step": 7900 }, { "epoch": 0.9, "grad_norm": 0.0035269451327621937, "learning_rate": 0.0001, "loss": 3.4442, "step": 7910 }, { "epoch": 0.9, "grad_norm": 0.0021643659565597773, "learning_rate": 0.0001, "loss": 3.5124, "step": 7920 }, { "epoch": 0.9, "grad_norm": 0.0012154363794252276, "learning_rate": 0.0001, "loss": 3.5871, "step": 7930 }, { "epoch": 0.9, "grad_norm": 0.0013683476718142629, "learning_rate": 0.0001, "loss": 3.5871, "step": 7940 }, { "epoch": 0.9, "grad_norm": 0.00120898662135005, "learning_rate": 0.0001, "loss": 3.5069, "step": 7950 }, { "epoch": 0.9, "grad_norm": 0.0015796332154422998, "learning_rate": 0.0001, "loss": 3.4103, "step": 7960 }, { "epoch": 0.9, "grad_norm": 0.0012266021221876144, "learning_rate": 0.0001, "loss": 3.4803, "step": 7970 }, { "epoch": 0.9, "grad_norm": 0.008350533433258533, "learning_rate": 0.0001, "loss": 3.5769, "step": 7980 }, { "epoch": 0.91, "grad_norm": 0.00542466202750802, "learning_rate": 0.0001, "loss": 3.5892, "step": 7990 }, { "epoch": 0.91, "grad_norm": 0.0026403493247926235, "learning_rate": 0.0001, "loss": 3.5482, "step": 8000 }, { "epoch": 0.91, "grad_norm": 0.0018821638077497482, "learning_rate": 0.0001, "loss": 3.4679, "step": 8010 }, { "epoch": 0.91, "grad_norm": 0.0020988560281693935, "learning_rate": 0.0001, "loss": 3.6059, "step": 8020 }, { "epoch": 0.91, "grad_norm": 0.0018965898780152202, "learning_rate": 0.0001, "loss": 3.5853, "step": 8030 }, { "epoch": 0.91, "grad_norm": 0.0017398440977558494, "learning_rate": 0.0001, "loss": 3.4603, "step": 8040 }, { "epoch": 0.91, "grad_norm": 0.001547396183013916, "learning_rate": 0.0001, "loss": 3.5475, "step": 8050 }, { "epoch": 0.91, "grad_norm": 0.0020740798208862543, "learning_rate": 0.0001, "loss": 3.5418, "step": 8060 }, { "epoch": 0.91, "grad_norm": 0.0017837915802374482, "learning_rate": 0.0001, "loss": 3.4464, "step": 8070 }, { "epoch": 0.92, "grad_norm": 0.001405497663654387, "learning_rate": 0.0001, "loss": 3.4422, "step": 8080 }, { "epoch": 0.92, "grad_norm": 0.0017704376950860023, "learning_rate": 0.0001, "loss": 3.4188, "step": 8090 }, { "epoch": 0.92, "grad_norm": 0.0017055340576916933, "learning_rate": 0.0001, "loss": 3.4999, "step": 8100 }, { "epoch": 0.92, "grad_norm": 0.0016433538403362036, "learning_rate": 0.0001, "loss": 3.5667, "step": 8110 }, { "epoch": 0.92, "grad_norm": 0.0024393522180616856, "learning_rate": 0.0001, "loss": 3.5607, "step": 8120 }, { "epoch": 0.92, "grad_norm": 0.0018067143391817808, "learning_rate": 0.0001, "loss": 3.5078, "step": 8130 }, { "epoch": 0.92, 
"grad_norm": 0.0026220311410725117, "learning_rate": 0.0001, "loss": 3.5176, "step": 8140 }, { "epoch": 0.92, "grad_norm": 0.0016884409124031663, "learning_rate": 0.0001, "loss": 3.4673, "step": 8150 }, { "epoch": 0.92, "grad_norm": 0.001484960550442338, "learning_rate": 0.0001, "loss": 3.5377, "step": 8160 }, { "epoch": 0.93, "grad_norm": 0.0013857349986210465, "learning_rate": 0.0001, "loss": 3.5313, "step": 8170 }, { "epoch": 0.93, "grad_norm": 0.0013743892777711153, "learning_rate": 0.0001, "loss": 3.6478, "step": 8180 }, { "epoch": 0.93, "grad_norm": 0.0014420408988371491, "learning_rate": 0.0001, "loss": 3.4701, "step": 8190 }, { "epoch": 0.93, "grad_norm": 0.0012977155856788158, "learning_rate": 0.0001, "loss": 3.5372, "step": 8200 }, { "epoch": 0.93, "grad_norm": 0.0015329618472605944, "learning_rate": 0.0001, "loss": 3.539, "step": 8210 }, { "epoch": 0.93, "grad_norm": 0.002373595256358385, "learning_rate": 0.0001, "loss": 3.5279, "step": 8220 }, { "epoch": 0.93, "grad_norm": 0.0018490661168470979, "learning_rate": 0.0001, "loss": 3.4894, "step": 8230 }, { "epoch": 0.93, "grad_norm": 0.001816432224586606, "learning_rate": 0.0001, "loss": 3.3795, "step": 8240 }, { "epoch": 0.93, "grad_norm": 0.0031627179123461246, "learning_rate": 0.0001, "loss": 3.4793, "step": 8250 }, { "epoch": 0.94, "grad_norm": 0.0018617124296724796, "learning_rate": 0.0001, "loss": 3.4797, "step": 8260 }, { "epoch": 0.94, "grad_norm": 0.001693052239716053, "learning_rate": 0.0001, "loss": 3.4799, "step": 8270 }, { "epoch": 0.94, "grad_norm": 0.0013699023984372616, "learning_rate": 0.0001, "loss": 3.4739, "step": 8280 }, { "epoch": 0.94, "grad_norm": 0.00122186285443604, "learning_rate": 0.0001, "loss": 3.5235, "step": 8290 }, { "epoch": 0.94, "grad_norm": 0.0012969328090548515, "learning_rate": 0.0001, "loss": 3.4885, "step": 8300 }, { "epoch": 0.94, "grad_norm": 0.0019971418660134077, "learning_rate": 0.0001, "loss": 3.5553, "step": 8310 }, { "epoch": 0.94, "grad_norm": 0.0014962393324822187, "learning_rate": 0.0001, "loss": 3.6409, "step": 8320 }, { "epoch": 0.94, "grad_norm": 0.0012482035672292113, "learning_rate": 0.0001, "loss": 3.4702, "step": 8330 }, { "epoch": 0.94, "grad_norm": 0.0015909472713246942, "learning_rate": 0.0001, "loss": 3.5255, "step": 8340 }, { "epoch": 0.95, "grad_norm": 0.001999499276280403, "learning_rate": 0.0001, "loss": 3.6009, "step": 8350 }, { "epoch": 0.95, "grad_norm": 0.0015827047172933817, "learning_rate": 0.0001, "loss": 3.5525, "step": 8360 }, { "epoch": 0.95, "grad_norm": 0.003549994667991996, "learning_rate": 0.0001, "loss": 3.4509, "step": 8370 }, { "epoch": 0.95, "grad_norm": 0.0014238920994102955, "learning_rate": 0.0001, "loss": 3.432, "step": 8380 }, { "epoch": 0.95, "grad_norm": 0.0012334833154454827, "learning_rate": 0.0001, "loss": 3.4496, "step": 8390 }, { "epoch": 0.95, "grad_norm": 0.0017806489486247301, "learning_rate": 0.0001, "loss": 3.4943, "step": 8400 }, { "epoch": 0.95, "grad_norm": 0.0013871816918253899, "learning_rate": 0.0001, "loss": 3.5126, "step": 8410 }, { "epoch": 0.95, "grad_norm": 0.0018306664424017072, "learning_rate": 0.0001, "loss": 3.5599, "step": 8420 }, { "epoch": 0.95, "grad_norm": 0.001294511603191495, "learning_rate": 0.0001, "loss": 3.4735, "step": 8430 }, { "epoch": 0.96, "grad_norm": 0.0014242121251299977, "learning_rate": 0.0001, "loss": 3.5125, "step": 8440 }, { "epoch": 0.96, "grad_norm": 0.0026348403189331293, "learning_rate": 0.0001, "loss": 3.5332, "step": 8450 }, { "epoch": 0.96, "grad_norm": 0.0013578971847891808, 
"learning_rate": 0.0001, "loss": 3.5633, "step": 8460 }, { "epoch": 0.96, "grad_norm": 0.0012382150162011385, "learning_rate": 0.0001, "loss": 3.4439, "step": 8470 }, { "epoch": 0.96, "grad_norm": 0.0012294066837057471, "learning_rate": 0.0001, "loss": 3.3983, "step": 8480 }, { "epoch": 0.96, "grad_norm": 0.0017983483849093318, "learning_rate": 0.0001, "loss": 3.451, "step": 8490 }, { "epoch": 0.96, "grad_norm": 0.0015307184075936675, "learning_rate": 0.0001, "loss": 3.4538, "step": 8500 }, { "epoch": 0.96, "grad_norm": 0.0020233129616826773, "learning_rate": 0.0001, "loss": 3.3824, "step": 8510 }, { "epoch": 0.97, "grad_norm": 0.0011595963733270764, "learning_rate": 0.0001, "loss": 3.4434, "step": 8520 }, { "epoch": 0.97, "grad_norm": 0.0012451488291844726, "learning_rate": 0.0001, "loss": 3.4467, "step": 8530 }, { "epoch": 0.97, "grad_norm": 0.0029814129229635, "learning_rate": 0.0001, "loss": 3.5629, "step": 8540 }, { "epoch": 0.97, "grad_norm": 0.0025485665537416935, "learning_rate": 0.0001, "loss": 3.4909, "step": 8550 }, { "epoch": 0.97, "grad_norm": 0.0017052841139957309, "learning_rate": 0.0001, "loss": 3.4524, "step": 8560 }, { "epoch": 0.97, "grad_norm": 0.0011431340826675296, "learning_rate": 0.0001, "loss": 3.5121, "step": 8570 }, { "epoch": 0.97, "grad_norm": 0.0013961438089609146, "learning_rate": 0.0001, "loss": 3.536, "step": 8580 }, { "epoch": 0.97, "grad_norm": 0.001101571717299521, "learning_rate": 0.0001, "loss": 3.5659, "step": 8590 }, { "epoch": 0.97, "grad_norm": 0.001677903812378645, "learning_rate": 0.0001, "loss": 3.3924, "step": 8600 }, { "epoch": 0.98, "grad_norm": 0.00424036243930459, "learning_rate": 0.0001, "loss": 3.5545, "step": 8610 }, { "epoch": 0.98, "grad_norm": 0.002840942470356822, "learning_rate": 0.0001, "loss": 3.4977, "step": 8620 }, { "epoch": 0.98, "grad_norm": 0.001998503692448139, "learning_rate": 0.0001, "loss": 3.5613, "step": 8630 }, { "epoch": 0.98, "grad_norm": 0.002215249463915825, "learning_rate": 0.0001, "loss": 3.4366, "step": 8640 }, { "epoch": 0.98, "grad_norm": 0.0023502458352595568, "learning_rate": 0.0001, "loss": 3.5164, "step": 8650 }, { "epoch": 0.98, "grad_norm": 0.0033554863184690475, "learning_rate": 0.0001, "loss": 3.5432, "step": 8660 }, { "epoch": 0.98, "grad_norm": 0.006383153609931469, "learning_rate": 0.0001, "loss": 3.5177, "step": 8670 }, { "epoch": 0.98, "grad_norm": 0.004156162030994892, "learning_rate": 0.0001, "loss": 3.5303, "step": 8680 }, { "epoch": 0.98, "grad_norm": 0.0023237932473421097, "learning_rate": 0.0001, "loss": 3.4269, "step": 8690 }, { "epoch": 0.99, "grad_norm": 0.0018931159283965826, "learning_rate": 0.0001, "loss": 3.4463, "step": 8700 }, { "epoch": 0.99, "grad_norm": 0.0019322831649333239, "learning_rate": 0.0001, "loss": 3.5048, "step": 8710 }, { "epoch": 0.99, "grad_norm": 0.0014460397651419044, "learning_rate": 0.0001, "loss": 3.4326, "step": 8720 }, { "epoch": 0.99, "grad_norm": 0.0011596990516409278, "learning_rate": 0.0001, "loss": 3.3883, "step": 8730 }, { "epoch": 0.99, "grad_norm": 0.007338897790759802, "learning_rate": 0.0001, "loss": 3.4742, "step": 8740 }, { "epoch": 0.99, "grad_norm": 0.0024052837397903204, "learning_rate": 0.0001, "loss": 3.5001, "step": 8750 } ], "logging_steps": 10, "max_steps": 8827, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 250, "total_flos": 4.5626607796224e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }