{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.998681608437706, "eval_steps": 500, "global_step": 7583, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0065919578114700065, "grad_norm": 11.249381065368652, "learning_rate": 5.263157894736842e-06, "loss": 1.324, "step": 10 }, { "epoch": 0.013183915622940013, "grad_norm": 8.759961128234863, "learning_rate": 1.0526315789473684e-05, "loss": 1.1761, "step": 20 }, { "epoch": 0.01977587343441002, "grad_norm": 2.233778953552246, "learning_rate": 1.5789473684210526e-05, "loss": 0.6049, "step": 30 }, { "epoch": 0.026367831245880026, "grad_norm": 3.505425453186035, "learning_rate": 2.105263157894737e-05, "loss": 0.3419, "step": 40 }, { "epoch": 0.03295978905735003, "grad_norm": 1.7327282428741455, "learning_rate": 2.6315789473684212e-05, "loss": 0.2418, "step": 50 }, { "epoch": 0.03955174686882004, "grad_norm": 1.4897282123565674, "learning_rate": 3.157894736842105e-05, "loss": 0.2017, "step": 60 }, { "epoch": 0.04614370468029005, "grad_norm": 0.8921314477920532, "learning_rate": 3.6842105263157895e-05, "loss": 0.1668, "step": 70 }, { "epoch": 0.05273566249176005, "grad_norm": 1.3826392889022827, "learning_rate": 4.210526315789474e-05, "loss": 0.1532, "step": 80 }, { "epoch": 0.05932762030323006, "grad_norm": 1.489062786102295, "learning_rate": 4.736842105263158e-05, "loss": 0.1084, "step": 90 }, { "epoch": 0.06591957811470006, "grad_norm": 1.280565619468689, "learning_rate": 5.2631578947368424e-05, "loss": 0.1128, "step": 100 }, { "epoch": 0.07251153592617007, "grad_norm": 1.2948462963104248, "learning_rate": 5.789473684210527e-05, "loss": 0.1044, "step": 110 }, { "epoch": 0.07910349373764008, "grad_norm": 1.5762895345687866, "learning_rate": 6.31578947368421e-05, "loss": 0.1034, "step": 120 }, { "epoch": 0.08569545154911008, "grad_norm": 1.0561785697937012, "learning_rate": 6.842105263157895e-05, "loss": 0.0798, "step": 130 }, { "epoch": 0.0922874093605801, "grad_norm": 0.9102309346199036, "learning_rate": 7.368421052631579e-05, "loss": 0.0752, "step": 140 }, { "epoch": 0.09887936717205009, "grad_norm": 1.4243663549423218, "learning_rate": 7.894736842105263e-05, "loss": 0.0863, "step": 150 }, { "epoch": 0.1054713249835201, "grad_norm": 0.7150789499282837, "learning_rate": 8.421052631578948e-05, "loss": 0.0778, "step": 160 }, { "epoch": 0.11206328279499012, "grad_norm": 0.9231832027435303, "learning_rate": 8.947368421052632e-05, "loss": 0.0796, "step": 170 }, { "epoch": 0.11865524060646011, "grad_norm": 0.5305670499801636, "learning_rate": 9.473684210526316e-05, "loss": 0.0733, "step": 180 }, { "epoch": 0.12524719841793014, "grad_norm": 1.0431275367736816, "learning_rate": 0.0001, "loss": 0.0713, "step": 190 }, { "epoch": 0.13183915622940012, "grad_norm": 1.0667047500610352, "learning_rate": 0.00010526315789473685, "loss": 0.0738, "step": 200 }, { "epoch": 0.13843111404087013, "grad_norm": 0.9431530833244324, "learning_rate": 0.0001105263157894737, "loss": 0.0695, "step": 210 }, { "epoch": 0.14502307185234015, "grad_norm": 1.231911063194275, "learning_rate": 0.00011578947368421053, "loss": 0.0707, "step": 220 }, { "epoch": 0.15161502966381016, "grad_norm": 0.5772905945777893, "learning_rate": 0.00012105263157894738, "loss": 0.0642, "step": 230 }, { "epoch": 0.15820698747528017, "grad_norm": 0.6241514086723328, "learning_rate": 0.0001263157894736842, "loss": 0.0621, "step": 240 }, { "epoch": 0.16479894528675015, "grad_norm": 0.7449037432670593, "learning_rate": 0.00013157894736842108, "loss": 0.0639, "step": 250 }, { "epoch": 0.17139090309822017, "grad_norm": 0.9040747880935669, "learning_rate": 0.0001368421052631579, "loss": 0.0595, "step": 260 }, { "epoch": 0.17798286090969018, "grad_norm": 0.6246598958969116, "learning_rate": 0.00014210526315789474, "loss": 0.0612, "step": 270 }, { "epoch": 0.1845748187211602, "grad_norm": 0.6300843358039856, "learning_rate": 0.00014736842105263158, "loss": 0.0574, "step": 280 }, { "epoch": 0.1911667765326302, "grad_norm": 0.7051455974578857, "learning_rate": 0.00015263157894736845, "loss": 0.0489, "step": 290 }, { "epoch": 0.19775873434410018, "grad_norm": 0.8903814554214478, "learning_rate": 0.00015789473684210527, "loss": 0.0588, "step": 300 }, { "epoch": 0.2043506921555702, "grad_norm": 0.8815051317214966, "learning_rate": 0.0001631578947368421, "loss": 0.0605, "step": 310 }, { "epoch": 0.2109426499670402, "grad_norm": 0.7266796231269836, "learning_rate": 0.00016842105263157895, "loss": 0.0555, "step": 320 }, { "epoch": 0.21753460777851022, "grad_norm": 1.033163070678711, "learning_rate": 0.0001736842105263158, "loss": 0.056, "step": 330 }, { "epoch": 0.22412656558998023, "grad_norm": 1.339528203010559, "learning_rate": 0.00017894736842105264, "loss": 0.0513, "step": 340 }, { "epoch": 0.23071852340145024, "grad_norm": 1.1713142395019531, "learning_rate": 0.00018421052631578948, "loss": 0.0604, "step": 350 }, { "epoch": 0.23731048121292023, "grad_norm": 0.7305978536605835, "learning_rate": 0.00018947368421052632, "loss": 0.061, "step": 360 }, { "epoch": 0.24390243902439024, "grad_norm": 0.6867638826370239, "learning_rate": 0.00019473684210526317, "loss": 0.0446, "step": 370 }, { "epoch": 0.2504943968358603, "grad_norm": 0.480622798204422, "learning_rate": 0.0002, "loss": 0.0507, "step": 380 }, { "epoch": 0.25708635464733026, "grad_norm": 0.6892393827438354, "learning_rate": 0.00019999904886484996, "loss": 0.0562, "step": 390 }, { "epoch": 0.26367831245880025, "grad_norm": 0.8014799952507019, "learning_rate": 0.00019999619547749294, "loss": 0.0407, "step": 400 }, { "epoch": 0.2702702702702703, "grad_norm": 0.8931164741516113, "learning_rate": 0.0001999914398922081, "loss": 0.0488, "step": 410 }, { "epoch": 0.27686222808174027, "grad_norm": 0.5557290315628052, "learning_rate": 0.00019998478219945958, "loss": 0.0533, "step": 420 }, { "epoch": 0.2834541858932103, "grad_norm": 0.9810464978218079, "learning_rate": 0.00019997622252589464, "loss": 0.052, "step": 430 }, { "epoch": 0.2900461437046803, "grad_norm": 0.6797704696655273, "learning_rate": 0.00019996576103434137, "loss": 0.0514, "step": 440 }, { "epoch": 0.2966381015161503, "grad_norm": 1.141650915145874, "learning_rate": 0.0001999533979238057, "loss": 0.0489, "step": 450 }, { "epoch": 0.3032300593276203, "grad_norm": 0.6689559817314148, "learning_rate": 0.00019993913342946734, "loss": 0.0441, "step": 460 }, { "epoch": 0.3098220171390903, "grad_norm": 0.524917721748352, "learning_rate": 0.0001999229678226756, "loss": 0.0457, "step": 470 }, { "epoch": 0.31641397495056034, "grad_norm": 0.7408258318901062, "learning_rate": 0.00019990490141094392, "loss": 0.0428, "step": 480 }, { "epoch": 0.3230059327620303, "grad_norm": 0.5927634835243225, "learning_rate": 0.0001998849345379444, "loss": 0.0431, "step": 490 }, { "epoch": 0.3295978905735003, "grad_norm": 0.4574936628341675, "learning_rate": 0.00019986306758350083, "loss": 0.038, "step": 500 }, { "epoch": 0.33618984838497035, "grad_norm": 0.6031479835510254, "learning_rate": 0.00019983930096358188, "loss": 0.0442, "step": 510 }, { "epoch": 0.34278180619644033, "grad_norm": 0.4019775688648224, "learning_rate": 0.00019981363513029283, "loss": 0.0336, "step": 520 }, { "epoch": 0.34937376400791037, "grad_norm": 0.6691102981567383, "learning_rate": 0.00019978607057186725, "loss": 0.0387, "step": 530 }, { "epoch": 0.35596572181938035, "grad_norm": 0.39324843883514404, "learning_rate": 0.00019975660781265753, "loss": 0.0449, "step": 540 }, { "epoch": 0.36255767963085034, "grad_norm": 0.5069633722305298, "learning_rate": 0.00019972524741312497, "loss": 0.0319, "step": 550 }, { "epoch": 0.3691496374423204, "grad_norm": 0.5699636936187744, "learning_rate": 0.00019969198996982917, "loss": 0.0402, "step": 560 }, { "epoch": 0.37574159525379036, "grad_norm": 1.0686895847320557, "learning_rate": 0.00019965683611541655, "loss": 0.0542, "step": 570 }, { "epoch": 0.3823335530652604, "grad_norm": 0.4853604733943939, "learning_rate": 0.00019961978651860854, "loss": 0.0476, "step": 580 }, { "epoch": 0.3889255108767304, "grad_norm": 0.8250619173049927, "learning_rate": 0.0001995808418841885, "loss": 0.034, "step": 590 }, { "epoch": 0.39551746868820037, "grad_norm": 0.6085853576660156, "learning_rate": 0.00019954000295298871, "loss": 0.0389, "step": 600 }, { "epoch": 0.4021094264996704, "grad_norm": 3.688549041748047, "learning_rate": 0.000199497270501876, "loss": 0.0511, "step": 610 }, { "epoch": 0.4087013843111404, "grad_norm": 1.5635132789611816, "learning_rate": 0.00019945264534373714, "loss": 0.1116, "step": 620 }, { "epoch": 0.41529334212261043, "grad_norm": 0.7884135246276855, "learning_rate": 0.00019940612832746322, "loss": 0.0737, "step": 630 }, { "epoch": 0.4218852999340804, "grad_norm": 0.9017935395240784, "learning_rate": 0.0001993577203379336, "loss": 0.0789, "step": 640 }, { "epoch": 0.42847725774555045, "grad_norm": 0.8649272918701172, "learning_rate": 0.00019930742229599914, "loss": 0.0728, "step": 650 }, { "epoch": 0.43506921555702044, "grad_norm": 0.772191047668457, "learning_rate": 0.00019925523515846455, "loss": 0.0697, "step": 660 }, { "epoch": 0.4416611733684904, "grad_norm": 0.5265079140663147, "learning_rate": 0.00019920115991807022, "loss": 0.0622, "step": 670 }, { "epoch": 0.44825313117996046, "grad_norm": 0.8318515419960022, "learning_rate": 0.0001991451976034734, "loss": 0.0786, "step": 680 }, { "epoch": 0.45484508899143045, "grad_norm": 0.7197186946868896, "learning_rate": 0.0001990873492792286, "loss": 0.059, "step": 690 }, { "epoch": 0.4614370468029005, "grad_norm": 0.9418641328811646, "learning_rate": 0.00019902761604576725, "loss": 0.078, "step": 700 }, { "epoch": 0.46802900461437047, "grad_norm": 0.7985256314277649, "learning_rate": 0.00019896599903937697, "loss": 0.0834, "step": 710 }, { "epoch": 0.47462096242584045, "grad_norm": 0.6049144268035889, "learning_rate": 0.00019890249943217976, "loss": 0.0656, "step": 720 }, { "epoch": 0.4812129202373105, "grad_norm": 0.6395105719566345, "learning_rate": 0.0001988371184321098, "loss": 0.0764, "step": 730 }, { "epoch": 0.4878048780487805, "grad_norm": 0.58722984790802, "learning_rate": 0.00019876985728289038, "loss": 0.0588, "step": 740 }, { "epoch": 0.4943968358602505, "grad_norm": 0.4679464101791382, "learning_rate": 0.00019870071726401043, "loss": 0.0638, "step": 750 }, { "epoch": 0.5009887936717206, "grad_norm": 0.509775698184967, "learning_rate": 0.00019862969969069996, "loss": 0.0602, "step": 760 }, { "epoch": 0.5075807514831905, "grad_norm": 0.8126184344291687, "learning_rate": 0.00019855680591390518, "loss": 0.069, "step": 770 }, { "epoch": 0.5141727092946605, "grad_norm": 0.7676377892494202, "learning_rate": 0.00019848203732026275, "loss": 0.0704, "step": 780 }, { "epoch": 0.5207646671061306, "grad_norm": 1.0301965475082397, "learning_rate": 0.00019840539533207344, "loss": 0.0666, "step": 790 }, { "epoch": 0.5273566249176005, "grad_norm": 0.6810826063156128, "learning_rate": 0.000198326881407275, "loss": 0.0698, "step": 800 }, { "epoch": 0.5339485827290705, "grad_norm": 0.4939572513103485, "learning_rate": 0.00019824649703941455, "loss": 0.0548, "step": 810 }, { "epoch": 0.5405405405405406, "grad_norm": 0.6614457964897156, "learning_rate": 0.00019816424375762001, "loss": 0.0748, "step": 820 }, { "epoch": 0.5471324983520105, "grad_norm": 0.7715848088264465, "learning_rate": 0.00019808012312657114, "loss": 0.0653, "step": 830 }, { "epoch": 0.5537244561634805, "grad_norm": 0.5254570245742798, "learning_rate": 0.00019799413674646973, "loss": 0.0537, "step": 840 }, { "epoch": 0.5603164139749506, "grad_norm": 0.7626491785049438, "learning_rate": 0.0001979062862530091, "loss": 0.0599, "step": 850 }, { "epoch": 0.5669083717864206, "grad_norm": 0.6767850518226624, "learning_rate": 0.00019781657331734316, "loss": 0.0644, "step": 860 }, { "epoch": 0.5735003295978905, "grad_norm": 0.4016531705856323, "learning_rate": 0.0001977249996460544, "loss": 0.0543, "step": 870 }, { "epoch": 0.5800922874093606, "grad_norm": 1.0104889869689941, "learning_rate": 0.0001976315669811216, "loss": 0.0681, "step": 880 }, { "epoch": 0.5866842452208306, "grad_norm": 0.7674484252929688, "learning_rate": 0.00019753627709988658, "loss": 0.0562, "step": 890 }, { "epoch": 0.5932762030323006, "grad_norm": 1.2781016826629639, "learning_rate": 0.00019743913181502048, "loss": 0.0602, "step": 900 }, { "epoch": 0.5998681608437706, "grad_norm": 0.5540818572044373, "learning_rate": 0.00019734013297448914, "loss": 0.0631, "step": 910 }, { "epoch": 0.6064601186552406, "grad_norm": 0.7823266386985779, "learning_rate": 0.00019723928246151814, "loss": 0.0637, "step": 920 }, { "epoch": 0.6130520764667106, "grad_norm": 0.6756680607795715, "learning_rate": 0.00019713658219455685, "loss": 0.0684, "step": 930 }, { "epoch": 0.6196440342781806, "grad_norm": 0.8224459290504456, "learning_rate": 0.0001970320341272419, "loss": 0.0512, "step": 940 }, { "epoch": 0.6262359920896506, "grad_norm": 0.8429596424102783, "learning_rate": 0.00019692564024836016, "loss": 0.0516, "step": 950 }, { "epoch": 0.6328279499011207, "grad_norm": 0.7025866508483887, "learning_rate": 0.0001968174025818108, "loss": 0.0667, "step": 960 }, { "epoch": 0.6394199077125906, "grad_norm": 0.624162495136261, "learning_rate": 0.00019670732318656677, "loss": 0.0575, "step": 970 }, { "epoch": 0.6460118655240606, "grad_norm": 0.5887486338615417, "learning_rate": 0.00019659540415663571, "loss": 0.0488, "step": 980 }, { "epoch": 0.6526038233355307, "grad_norm": 0.45346468687057495, "learning_rate": 0.00019648164762102013, "loss": 0.0483, "step": 990 }, { "epoch": 0.6591957811470006, "grad_norm": 0.6038155555725098, "learning_rate": 0.0001963660557436768, "loss": 0.054, "step": 1000 }, { "epoch": 0.6657877389584707, "grad_norm": 0.5043258666992188, "learning_rate": 0.00019624863072347564, "loss": 0.0631, "step": 1010 }, { "epoch": 0.6723796967699407, "grad_norm": 0.6452742218971252, "learning_rate": 0.000196129374794158, "loss": 0.0551, "step": 1020 }, { "epoch": 0.6789716545814107, "grad_norm": 0.6438404321670532, "learning_rate": 0.0001960082902242939, "loss": 0.0501, "step": 1030 }, { "epoch": 0.6855636123928807, "grad_norm": 0.8768063187599182, "learning_rate": 0.00019588537931723927, "loss": 0.0516, "step": 1040 }, { "epoch": 0.6921555702043507, "grad_norm": 0.767848014831543, "learning_rate": 0.00019576064441109172, "loss": 0.0501, "step": 1050 }, { "epoch": 0.6987475280158207, "grad_norm": 0.6131387948989868, "learning_rate": 0.00019563408787864634, "loss": 0.0595, "step": 1060 }, { "epoch": 0.7053394858272907, "grad_norm": 0.4806978404521942, "learning_rate": 0.00019550571212735048, "loss": 0.0475, "step": 1070 }, { "epoch": 0.7119314436387607, "grad_norm": 0.4950248897075653, "learning_rate": 0.00019537551959925787, "loss": 0.048, "step": 1080 }, { "epoch": 0.7185234014502307, "grad_norm": 0.5537814497947693, "learning_rate": 0.0001952435127709824, "loss": 0.046, "step": 1090 }, { "epoch": 0.7251153592617007, "grad_norm": 0.4151875078678131, "learning_rate": 0.00019510969415365063, "loss": 0.0429, "step": 1100 }, { "epoch": 0.7317073170731707, "grad_norm": 0.42159780859947205, "learning_rate": 0.0001949740662928545, "loss": 0.0434, "step": 1110 }, { "epoch": 0.7382992748846408, "grad_norm": 0.454226016998291, "learning_rate": 0.00019483663176860248, "loss": 0.0421, "step": 1120 }, { "epoch": 0.7448912326961108, "grad_norm": 0.37481585144996643, "learning_rate": 0.00019469739319527064, "loss": 0.043, "step": 1130 }, { "epoch": 0.7514831905075807, "grad_norm": 0.6487095952033997, "learning_rate": 0.00019455635322155313, "loss": 0.0433, "step": 1140 }, { "epoch": 0.7580751483190508, "grad_norm": 0.44085580110549927, "learning_rate": 0.00019441351453041138, "loss": 0.0492, "step": 1150 }, { "epoch": 0.7646671061305208, "grad_norm": 0.49984055757522583, "learning_rate": 0.00019426887983902343, "loss": 0.0431, "step": 1160 }, { "epoch": 0.7712590639419907, "grad_norm": 0.5114363431930542, "learning_rate": 0.00019412245189873203, "loss": 0.0448, "step": 1170 }, { "epoch": 0.7778510217534608, "grad_norm": 0.5482351779937744, "learning_rate": 0.00019397423349499246, "loss": 0.0481, "step": 1180 }, { "epoch": 0.7844429795649308, "grad_norm": 0.7064313888549805, "learning_rate": 0.00019382422744731933, "loss": 0.0476, "step": 1190 }, { "epoch": 0.7910349373764007, "grad_norm": 0.5201088190078735, "learning_rate": 0.0001936724366092332, "loss": 0.0596, "step": 1200 }, { "epoch": 0.7976268951878708, "grad_norm": 0.794978678226471, "learning_rate": 0.000193518863868206, "loss": 0.0484, "step": 1210 }, { "epoch": 0.8042188529993408, "grad_norm": 0.5086749196052551, "learning_rate": 0.00019336351214560647, "loss": 0.0482, "step": 1220 }, { "epoch": 0.8108108108108109, "grad_norm": 0.5501623749732971, "learning_rate": 0.00019320638439664426, "loss": 0.0417, "step": 1230 }, { "epoch": 0.8174027686222808, "grad_norm": 0.4340960383415222, "learning_rate": 0.0001930474836103138, "loss": 0.0406, "step": 1240 }, { "epoch": 0.8239947264337508, "grad_norm": 0.5098422169685364, "learning_rate": 0.00019288681280933768, "loss": 0.0485, "step": 1250 }, { "epoch": 0.8305866842452209, "grad_norm": 0.4968768358230591, "learning_rate": 0.00019272437505010877, "loss": 0.0412, "step": 1260 }, { "epoch": 0.8371786420566908, "grad_norm": 0.46997663378715515, "learning_rate": 0.00019256017342263228, "loss": 0.0388, "step": 1270 }, { "epoch": 0.8437705998681608, "grad_norm": 0.5510318279266357, "learning_rate": 0.00019239421105046706, "loss": 0.056, "step": 1280 }, { "epoch": 0.8503625576796309, "grad_norm": 0.47607627511024475, "learning_rate": 0.000192226491090666, "loss": 0.0462, "step": 1290 }, { "epoch": 0.8569545154911009, "grad_norm": 0.4591579735279083, "learning_rate": 0.00019205701673371606, "loss": 0.0456, "step": 1300 }, { "epoch": 0.8635464733025708, "grad_norm": 0.45051664113998413, "learning_rate": 0.00019188579120347766, "loss": 0.0402, "step": 1310 }, { "epoch": 0.8701384311140409, "grad_norm": 0.3680923283100128, "learning_rate": 0.00019171281775712316, "loss": 0.0378, "step": 1320 }, { "epoch": 0.8767303889255109, "grad_norm": 0.4515272080898285, "learning_rate": 0.00019153809968507505, "loss": 0.0439, "step": 1330 }, { "epoch": 0.8833223467369808, "grad_norm": 0.5114394426345825, "learning_rate": 0.00019136164031094337, "loss": 0.0522, "step": 1340 }, { "epoch": 0.8899143045484509, "grad_norm": 0.6060967445373535, "learning_rate": 0.00019118344299146235, "loss": 0.04, "step": 1350 }, { "epoch": 0.8965062623599209, "grad_norm": 0.7507016658782959, "learning_rate": 0.00019100351111642666, "loss": 0.0557, "step": 1360 }, { "epoch": 0.9030982201713909, "grad_norm": 0.4493657648563385, "learning_rate": 0.00019082184810862698, "loss": 0.0424, "step": 1370 }, { "epoch": 0.9096901779828609, "grad_norm": 0.5429974794387817, "learning_rate": 0.00019063845742378467, "loss": 0.0441, "step": 1380 }, { "epoch": 0.9162821357943309, "grad_norm": 0.43085166811943054, "learning_rate": 0.00019045334255048634, "loss": 0.046, "step": 1390 }, { "epoch": 0.922874093605801, "grad_norm": 0.41755935549736023, "learning_rate": 0.0001902665070101172, "loss": 0.0461, "step": 1400 }, { "epoch": 0.9294660514172709, "grad_norm": 0.44052428007125854, "learning_rate": 0.00019007795435679428, "loss": 0.052, "step": 1410 }, { "epoch": 0.9360580092287409, "grad_norm": 0.4310389757156372, "learning_rate": 0.00018988768817729864, "loss": 0.0442, "step": 1420 }, { "epoch": 0.942649967040211, "grad_norm": 0.3892590403556824, "learning_rate": 0.0001896957120910074, "loss": 0.0416, "step": 1430 }, { "epoch": 0.9492419248516809, "grad_norm": 0.7788804769515991, "learning_rate": 0.00018950202974982454, "loss": 0.0339, "step": 1440 }, { "epoch": 0.955833882663151, "grad_norm": 0.5524693727493286, "learning_rate": 0.00018930664483811173, "loss": 0.045, "step": 1450 }, { "epoch": 0.962425840474621, "grad_norm": 0.41249391436576843, "learning_rate": 0.00018910956107261816, "loss": 0.0381, "step": 1460 }, { "epoch": 0.9690177982860909, "grad_norm": 0.3245869576931, "learning_rate": 0.00018891078220240973, "loss": 0.0277, "step": 1470 }, { "epoch": 0.975609756097561, "grad_norm": 0.28615134954452515, "learning_rate": 0.0001887103120087979, "loss": 0.0365, "step": 1480 }, { "epoch": 0.982201713909031, "grad_norm": 0.32258233428001404, "learning_rate": 0.00018850815430526758, "loss": 0.0339, "step": 1490 }, { "epoch": 0.988793671720501, "grad_norm": 0.4749410152435303, "learning_rate": 0.00018830431293740473, "loss": 0.0414, "step": 1500 }, { "epoch": 0.995385629531971, "grad_norm": 0.44143855571746826, "learning_rate": 0.00018809879178282313, "loss": 0.0288, "step": 1510 }, { "epoch": 1.0019775873434411, "grad_norm": 0.4565713107585907, "learning_rate": 0.00018789159475109067, "loss": 0.0343, "step": 1520 }, { "epoch": 1.008569545154911, "grad_norm": 0.5609179735183716, "learning_rate": 0.000187682725783655, "loss": 0.0423, "step": 1530 }, { "epoch": 1.015161502966381, "grad_norm": 0.4169975221157074, "learning_rate": 0.00018747218885376842, "loss": 0.0341, "step": 1540 }, { "epoch": 1.0217534607778511, "grad_norm": 0.44291096925735474, "learning_rate": 0.0001872599879664124, "loss": 0.0435, "step": 1550 }, { "epoch": 1.028345418589321, "grad_norm": 0.31878435611724854, "learning_rate": 0.00018704612715822144, "loss": 0.0402, "step": 1560 }, { "epoch": 1.034937376400791, "grad_norm": 0.4876072406768799, "learning_rate": 0.0001868306104974061, "loss": 0.0298, "step": 1570 }, { "epoch": 1.0415293342122611, "grad_norm": 0.4452480375766754, "learning_rate": 0.0001866134420836759, "loss": 0.042, "step": 1580 }, { "epoch": 1.048121292023731, "grad_norm": 0.5295068025588989, "learning_rate": 0.00018639462604816103, "loss": 0.0408, "step": 1590 }, { "epoch": 1.054713249835201, "grad_norm": 0.349461168050766, "learning_rate": 0.00018617416655333395, "loss": 0.037, "step": 1600 }, { "epoch": 1.0613052076466711, "grad_norm": 0.39832666516304016, "learning_rate": 0.00018595206779293015, "loss": 0.0406, "step": 1610 }, { "epoch": 1.067897165458141, "grad_norm": 0.5740079283714294, "learning_rate": 0.00018572833399186836, "loss": 0.0411, "step": 1620 }, { "epoch": 1.074489123269611, "grad_norm": 0.20162849128246307, "learning_rate": 0.00018550296940617034, "loss": 0.0333, "step": 1630 }, { "epoch": 1.0810810810810811, "grad_norm": 0.40781688690185547, "learning_rate": 0.00018527597832287954, "loss": 0.036, "step": 1640 }, { "epoch": 1.087673038892551, "grad_norm": 0.2796386182308197, "learning_rate": 0.00018504736505997997, "loss": 0.0313, "step": 1650 }, { "epoch": 1.094264996704021, "grad_norm": 0.6502156853675842, "learning_rate": 0.00018481713396631383, "loss": 0.0428, "step": 1660 }, { "epoch": 1.1008569545154911, "grad_norm": 0.3565762937068939, "learning_rate": 0.00018458528942149886, "loss": 0.0363, "step": 1670 }, { "epoch": 1.107448912326961, "grad_norm": 0.2560652792453766, "learning_rate": 0.00018435183583584498, "loss": 0.0404, "step": 1680 }, { "epoch": 1.1140408701384312, "grad_norm": 0.4972442388534546, "learning_rate": 0.00018411677765027036, "loss": 0.053, "step": 1690 }, { "epoch": 1.1206328279499012, "grad_norm": 0.36633139848709106, "learning_rate": 0.0001838801193362171, "loss": 0.0363, "step": 1700 }, { "epoch": 1.127224785761371, "grad_norm": 0.4480843245983124, "learning_rate": 0.000183641865395566, "loss": 0.031, "step": 1710 }, { "epoch": 1.133816743572841, "grad_norm": 0.42788198590278625, "learning_rate": 0.00018340202036055102, "loss": 0.0408, "step": 1720 }, { "epoch": 1.1404087013843112, "grad_norm": 0.3363877534866333, "learning_rate": 0.00018316058879367303, "loss": 0.0431, "step": 1730 }, { "epoch": 1.147000659195781, "grad_norm": 0.48484691977500916, "learning_rate": 0.000182917575287613, "loss": 0.0497, "step": 1740 }, { "epoch": 1.1535926170072512, "grad_norm": 0.4944576025009155, "learning_rate": 0.00018267298446514473, "loss": 0.0381, "step": 1750 }, { "epoch": 1.1601845748187212, "grad_norm": 0.31334227323532104, "learning_rate": 0.00018242682097904673, "loss": 0.0374, "step": 1760 }, { "epoch": 1.166776532630191, "grad_norm": 0.4245593845844269, "learning_rate": 0.00018217908951201394, "loss": 0.0384, "step": 1770 }, { "epoch": 1.1733684904416612, "grad_norm": 0.3156047463417053, "learning_rate": 0.00018192979477656845, "loss": 0.0375, "step": 1780 }, { "epoch": 1.1799604482531312, "grad_norm": 0.38936617970466614, "learning_rate": 0.00018167894151497, "loss": 0.0383, "step": 1790 }, { "epoch": 1.186552406064601, "grad_norm": 0.39287203550338745, "learning_rate": 0.00018142653449912564, "loss": 0.0384, "step": 1800 }, { "epoch": 1.1931443638760713, "grad_norm": 0.4132576882839203, "learning_rate": 0.0001811725785304991, "loss": 0.0333, "step": 1810 }, { "epoch": 1.1997363216875412, "grad_norm": 0.42320823669433594, "learning_rate": 0.00018091707844001935, "loss": 0.0282, "step": 1820 }, { "epoch": 1.2063282794990111, "grad_norm": 0.4071812927722931, "learning_rate": 0.00018066003908798873, "loss": 0.0315, "step": 1830 }, { "epoch": 1.2129202373104813, "grad_norm": 0.40392544865608215, "learning_rate": 0.0001804014653639904, "loss": 0.0331, "step": 1840 }, { "epoch": 1.2195121951219512, "grad_norm": 0.4608232080936432, "learning_rate": 0.00018014136218679567, "loss": 0.0327, "step": 1850 }, { "epoch": 1.2261041529334213, "grad_norm": 0.5048249959945679, "learning_rate": 0.00017987973450426994, "loss": 0.0334, "step": 1860 }, { "epoch": 1.2326961107448913, "grad_norm": 0.5134670734405518, "learning_rate": 0.0001796165872932789, "loss": 0.0361, "step": 1870 }, { "epoch": 1.2392880685563612, "grad_norm": 0.339224249124527, "learning_rate": 0.00017935192555959385, "loss": 0.0336, "step": 1880 }, { "epoch": 1.2458800263678311, "grad_norm": 0.5917630195617676, "learning_rate": 0.0001790857543377963, "loss": 0.0447, "step": 1890 }, { "epoch": 1.2524719841793013, "grad_norm": 0.641945481300354, "learning_rate": 0.00017881807869118234, "loss": 0.0546, "step": 1900 }, { "epoch": 1.2590639419907712, "grad_norm": 0.4399726986885071, "learning_rate": 0.00017854890371166637, "loss": 0.0358, "step": 1910 }, { "epoch": 1.2656558998022414, "grad_norm": 0.32603511214256287, "learning_rate": 0.00017827823451968398, "loss": 0.0342, "step": 1920 }, { "epoch": 1.2722478576137113, "grad_norm": 0.659220814704895, "learning_rate": 0.0001780060762640949, "loss": 0.039, "step": 1930 }, { "epoch": 1.2788398154251812, "grad_norm": 0.4240771234035492, "learning_rate": 0.00017773243412208474, "loss": 0.035, "step": 1940 }, { "epoch": 1.2854317732366514, "grad_norm": 0.4172196090221405, "learning_rate": 0.0001774573132990667, "loss": 0.0379, "step": 1950 }, { "epoch": 1.2920237310481213, "grad_norm": 0.42398178577423096, "learning_rate": 0.00017718071902858256, "loss": 0.0373, "step": 1960 }, { "epoch": 1.2986156888595912, "grad_norm": 0.5154095888137817, "learning_rate": 0.00017690265657220288, "loss": 0.0403, "step": 1970 }, { "epoch": 1.3052076466710614, "grad_norm": 0.396801233291626, "learning_rate": 0.00017662313121942727, "loss": 0.0391, "step": 1980 }, { "epoch": 1.3117996044825313, "grad_norm": 0.4826532006263733, "learning_rate": 0.00017634214828758342, "loss": 0.0297, "step": 1990 }, { "epoch": 1.3183915622940012, "grad_norm": 0.508990466594696, "learning_rate": 0.00017605971312172622, "loss": 0.0378, "step": 2000 }, { "epoch": 1.3249835201054714, "grad_norm": 0.3308925926685333, "learning_rate": 0.000175775831094536, "loss": 0.0379, "step": 2010 }, { "epoch": 1.3315754779169413, "grad_norm": 0.4720020294189453, "learning_rate": 0.00017549050760621614, "loss": 0.0392, "step": 2020 }, { "epoch": 1.3381674357284115, "grad_norm": 0.6246912479400635, "learning_rate": 0.00017520374808439076, "loss": 0.0363, "step": 2030 }, { "epoch": 1.3447593935398814, "grad_norm": 0.33079174160957336, "learning_rate": 0.00017491555798400095, "loss": 0.0316, "step": 2040 }, { "epoch": 1.3513513513513513, "grad_norm": 0.2520120143890381, "learning_rate": 0.00017462594278720145, "loss": 0.0325, "step": 2050 }, { "epoch": 1.3579433091628212, "grad_norm": 0.23862145841121674, "learning_rate": 0.00017433490800325614, "loss": 0.0351, "step": 2060 }, { "epoch": 1.3645352669742914, "grad_norm": 0.3477911353111267, "learning_rate": 0.00017404245916843324, "loss": 0.0389, "step": 2070 }, { "epoch": 1.3711272247857613, "grad_norm": 0.5003520846366882, "learning_rate": 0.00017374860184590015, "loss": 0.0368, "step": 2080 }, { "epoch": 1.3777191825972315, "grad_norm": 0.3755623698234558, "learning_rate": 0.00017345334162561734, "loss": 0.0341, "step": 2090 }, { "epoch": 1.3843111404087014, "grad_norm": 0.5258712768554688, "learning_rate": 0.00017315668412423238, "loss": 0.0334, "step": 2100 }, { "epoch": 1.3909030982201713, "grad_norm": 0.567348062992096, "learning_rate": 0.0001728586349849728, "loss": 0.0366, "step": 2110 }, { "epoch": 1.3974950560316415, "grad_norm": 0.4541948139667511, "learning_rate": 0.00017255919987753878, "loss": 0.0503, "step": 2120 }, { "epoch": 1.4040870138431114, "grad_norm": 0.44722017645835876, "learning_rate": 0.0001722583844979955, "loss": 0.0433, "step": 2130 }, { "epoch": 1.4106789716545813, "grad_norm": 0.25077545642852783, "learning_rate": 0.0001719561945686646, "loss": 0.0345, "step": 2140 }, { "epoch": 1.4172709294660515, "grad_norm": 0.3619667887687683, "learning_rate": 0.00017165263583801535, "loss": 0.0325, "step": 2150 }, { "epoch": 1.4238628872775214, "grad_norm": 0.6268120408058167, "learning_rate": 0.0001713477140805553, "loss": 0.0364, "step": 2160 }, { "epoch": 1.4304548450889913, "grad_norm": 0.5806043148040771, "learning_rate": 0.0001710414350967204, "loss": 0.037, "step": 2170 }, { "epoch": 1.4370468029004615, "grad_norm": 0.3783499002456665, "learning_rate": 0.00017073380471276496, "loss": 0.0318, "step": 2180 }, { "epoch": 1.4436387607119314, "grad_norm": 0.45143669843673706, "learning_rate": 0.0001704248287806503, "loss": 0.0344, "step": 2190 }, { "epoch": 1.4502307185234016, "grad_norm": 0.3384231626987457, "learning_rate": 0.00017011451317793384, "loss": 0.0306, "step": 2200 }, { "epoch": 1.4568226763348715, "grad_norm": 0.45972728729248047, "learning_rate": 0.00016980286380765714, "loss": 0.0394, "step": 2210 }, { "epoch": 1.4634146341463414, "grad_norm": 0.31935372948646545, "learning_rate": 0.0001694898865982336, "loss": 0.0327, "step": 2220 }, { "epoch": 1.4700065919578114, "grad_norm": 0.3758127689361572, "learning_rate": 0.0001691755875033357, "loss": 0.0376, "step": 2230 }, { "epoch": 1.4765985497692815, "grad_norm": 0.7778825759887695, "learning_rate": 0.00016885997250178184, "loss": 0.0346, "step": 2240 }, { "epoch": 1.4831905075807514, "grad_norm": 0.7735721468925476, "learning_rate": 0.00016854304759742237, "loss": 0.038, "step": 2250 }, { "epoch": 1.4897824653922216, "grad_norm": 0.6678999662399292, "learning_rate": 0.00016822481881902568, "loss": 0.0488, "step": 2260 }, { "epoch": 1.4963744232036915, "grad_norm": 0.5145410895347595, "learning_rate": 0.00016790529222016328, "loss": 0.0423, "step": 2270 }, { "epoch": 1.5029663810151614, "grad_norm": 1.2216230630874634, "learning_rate": 0.00016758447387909474, "loss": 0.0435, "step": 2280 }, { "epoch": 1.5095583388266314, "grad_norm": 0.46562644839286804, "learning_rate": 0.00016726236989865213, "loss": 0.0329, "step": 2290 }, { "epoch": 1.5161502966381015, "grad_norm": 0.552429735660553, "learning_rate": 0.00016693898640612382, "loss": 0.041, "step": 2300 }, { "epoch": 1.5227422544495717, "grad_norm": 0.4718281328678131, "learning_rate": 0.00016661432955313789, "loss": 0.0317, "step": 2310 }, { "epoch": 1.5293342122610416, "grad_norm": 0.5447438955307007, "learning_rate": 0.00016628840551554522, "loss": 0.0365, "step": 2320 }, { "epoch": 1.5359261700725115, "grad_norm": 0.5384830236434937, "learning_rate": 0.00016596122049330206, "loss": 0.0365, "step": 2330 }, { "epoch": 1.5425181278839815, "grad_norm": 0.48313167691230774, "learning_rate": 0.0001656327807103518, "loss": 0.0381, "step": 2340 }, { "epoch": 1.5491100856954514, "grad_norm": 0.4898654520511627, "learning_rate": 0.000165303092414507, "loss": 0.0343, "step": 2350 }, { "epoch": 1.5557020435069215, "grad_norm": 0.47862598299980164, "learning_rate": 0.00016497216187733016, "loss": 0.0333, "step": 2360 }, { "epoch": 1.5622940013183917, "grad_norm": 0.4709709584712982, "learning_rate": 0.00016463999539401454, "loss": 0.0351, "step": 2370 }, { "epoch": 1.5688859591298616, "grad_norm": 0.5032598972320557, "learning_rate": 0.00016430659928326458, "loss": 0.0306, "step": 2380 }, { "epoch": 1.5754779169413315, "grad_norm": 0.9953115582466125, "learning_rate": 0.00016397197988717542, "loss": 0.0388, "step": 2390 }, { "epoch": 1.5820698747528015, "grad_norm": 0.5729079246520996, "learning_rate": 0.00016363614357111245, "loss": 0.0336, "step": 2400 }, { "epoch": 1.5886618325642716, "grad_norm": 0.8332236409187317, "learning_rate": 0.0001632990967235902, "loss": 0.0414, "step": 2410 }, { "epoch": 1.5952537903757416, "grad_norm": 1.0546754598617554, "learning_rate": 0.00016296084575615077, "loss": 0.0383, "step": 2420 }, { "epoch": 1.6018457481872117, "grad_norm": 0.546684205532074, "learning_rate": 0.0001626213971032418, "loss": 0.0382, "step": 2430 }, { "epoch": 1.6084377059986816, "grad_norm": 0.6224532723426819, "learning_rate": 0.00016228075722209422, "loss": 0.0379, "step": 2440 }, { "epoch": 1.6150296638101516, "grad_norm": 0.39089900255203247, "learning_rate": 0.00016193893259259934, "loss": 0.0364, "step": 2450 }, { "epoch": 1.6216216216216215, "grad_norm": 0.5209794044494629, "learning_rate": 0.00016159592971718548, "loss": 0.0329, "step": 2460 }, { "epoch": 1.6282135794330916, "grad_norm": 0.45939525961875916, "learning_rate": 0.0001612517551206946, "loss": 0.0316, "step": 2470 }, { "epoch": 1.6348055372445618, "grad_norm": 0.4331035614013672, "learning_rate": 0.00016090641535025774, "loss": 0.0424, "step": 2480 }, { "epoch": 1.6413974950560317, "grad_norm": 0.447710782289505, "learning_rate": 0.0001605599169751708, "loss": 0.0387, "step": 2490 }, { "epoch": 1.6479894528675016, "grad_norm": 0.4073365330696106, "learning_rate": 0.00016021226658676947, "loss": 0.0404, "step": 2500 }, { "epoch": 1.6545814106789716, "grad_norm": 0.36032500863075256, "learning_rate": 0.00015986347079830382, "loss": 0.0311, "step": 2510 }, { "epoch": 1.6611733684904415, "grad_norm": 0.23349802196025848, "learning_rate": 0.00015951353624481257, "loss": 0.0248, "step": 2520 }, { "epoch": 1.6677653263019117, "grad_norm": 0.3381997048854828, "learning_rate": 0.0001591624695829968, "loss": 0.0316, "step": 2530 }, { "epoch": 1.6743572841133818, "grad_norm": 0.39666473865509033, "learning_rate": 0.0001588102774910933, "loss": 0.0399, "step": 2540 }, { "epoch": 1.6809492419248517, "grad_norm": 0.38981807231903076, "learning_rate": 0.00015845696666874772, "loss": 0.0325, "step": 2550 }, { "epoch": 1.6875411997363217, "grad_norm": 0.614475667476654, "learning_rate": 0.00015810254383688682, "loss": 0.0386, "step": 2560 }, { "epoch": 1.6941331575477916, "grad_norm": 0.6012241244316101, "learning_rate": 0.0001577470157375909, "loss": 0.0426, "step": 2570 }, { "epoch": 1.7007251153592617, "grad_norm": 0.8984513878822327, "learning_rate": 0.00015739038913396546, "loss": 0.0385, "step": 2580 }, { "epoch": 1.7073170731707317, "grad_norm": 0.5758917331695557, "learning_rate": 0.00015703267081001237, "loss": 0.0327, "step": 2590 }, { "epoch": 1.7139090309822018, "grad_norm": 0.39728182554244995, "learning_rate": 0.00015667386757050106, "loss": 0.0359, "step": 2600 }, { "epoch": 1.7205009887936717, "grad_norm": 0.44694146513938904, "learning_rate": 0.00015631398624083907, "loss": 0.032, "step": 2610 }, { "epoch": 1.7270929466051417, "grad_norm": 0.5872260332107544, "learning_rate": 0.000155953033666942, "loss": 0.0307, "step": 2620 }, { "epoch": 1.7336849044166116, "grad_norm": 0.5661513209342957, "learning_rate": 0.00015559101671510349, "loss": 0.0326, "step": 2630 }, { "epoch": 1.7402768622280818, "grad_norm": 0.3842809796333313, "learning_rate": 0.00015522794227186443, "loss": 0.0326, "step": 2640 }, { "epoch": 1.746868820039552, "grad_norm": 0.24816927313804626, "learning_rate": 0.00015486381724388222, "loss": 0.0251, "step": 2650 }, { "epoch": 1.7534607778510218, "grad_norm": 0.2353767305612564, "learning_rate": 0.00015449864855779903, "loss": 0.0272, "step": 2660 }, { "epoch": 1.7600527356624918, "grad_norm": 0.25328564643859863, "learning_rate": 0.00015413244316011038, "loss": 0.0338, "step": 2670 }, { "epoch": 1.7666446934739617, "grad_norm": 0.37852951884269714, "learning_rate": 0.0001537652080170328, "loss": 0.0308, "step": 2680 }, { "epoch": 1.7732366512854316, "grad_norm": 0.294085294008255, "learning_rate": 0.00015339695011437127, "loss": 0.0236, "step": 2690 }, { "epoch": 1.7798286090969018, "grad_norm": 0.3499051034450531, "learning_rate": 0.00015302767645738655, "loss": 0.0305, "step": 2700 }, { "epoch": 1.786420566908372, "grad_norm": 0.4269741177558899, "learning_rate": 0.00015265739407066176, "loss": 0.0279, "step": 2710 }, { "epoch": 1.7930125247198418, "grad_norm": 0.3368455767631531, "learning_rate": 0.00015228610999796875, "loss": 0.0306, "step": 2720 }, { "epoch": 1.7996044825313118, "grad_norm": 0.36064472794532776, "learning_rate": 0.00015191383130213417, "loss": 0.0281, "step": 2730 }, { "epoch": 1.8061964403427817, "grad_norm": 0.42101433873176575, "learning_rate": 0.00015154056506490505, "loss": 0.0299, "step": 2740 }, { "epoch": 1.8127883981542519, "grad_norm": 0.3719172179698944, "learning_rate": 0.0001511663183868142, "loss": 0.0323, "step": 2750 }, { "epoch": 1.8193803559657218, "grad_norm": 0.3902226984500885, "learning_rate": 0.00015079109838704504, "loss": 0.0327, "step": 2760 }, { "epoch": 1.825972313777192, "grad_norm": 0.36405107378959656, "learning_rate": 0.00015041491220329616, "loss": 0.0278, "step": 2770 }, { "epoch": 1.8325642715886619, "grad_norm": 0.31391507387161255, "learning_rate": 0.0001500377669916456, "loss": 0.0325, "step": 2780 }, { "epoch": 1.8391562294001318, "grad_norm": 0.4089469611644745, "learning_rate": 0.0001496596699264147, "loss": 0.0253, "step": 2790 }, { "epoch": 1.8457481872116017, "grad_norm": 0.5822712779045105, "learning_rate": 0.00014928062820003166, "loss": 0.0337, "step": 2800 }, { "epoch": 1.8523401450230719, "grad_norm": 0.5532752275466919, "learning_rate": 0.00014890064902289466, "loss": 0.0316, "step": 2810 }, { "epoch": 1.858932102834542, "grad_norm": 0.39222195744514465, "learning_rate": 0.0001485197396232348, "loss": 0.0304, "step": 2820 }, { "epoch": 1.865524060646012, "grad_norm": 0.3746655285358429, "learning_rate": 0.00014813790724697832, "loss": 0.0361, "step": 2830 }, { "epoch": 1.8721160184574819, "grad_norm": 0.5020349621772766, "learning_rate": 0.0001477551591576092, "loss": 0.0351, "step": 2840 }, { "epoch": 1.8787079762689518, "grad_norm": 0.40259358286857605, "learning_rate": 0.00014737150263603063, "loss": 0.027, "step": 2850 }, { "epoch": 1.8852999340804217, "grad_norm": 0.6693785190582275, "learning_rate": 0.00014698694498042675, "loss": 0.0345, "step": 2860 }, { "epoch": 1.8918918918918919, "grad_norm": 0.6384851932525635, "learning_rate": 0.00014660149350612353, "loss": 0.0315, "step": 2870 }, { "epoch": 1.898483849703362, "grad_norm": 0.5224544405937195, "learning_rate": 0.00014621515554544997, "loss": 0.0259, "step": 2880 }, { "epoch": 1.905075807514832, "grad_norm": 0.5825631022453308, "learning_rate": 0.0001458279384475983, "loss": 0.0415, "step": 2890 }, { "epoch": 1.911667765326302, "grad_norm": 0.36511966586112976, "learning_rate": 0.0001454398495784844, "loss": 0.033, "step": 2900 }, { "epoch": 1.9182597231377718, "grad_norm": 0.4093778431415558, "learning_rate": 0.00014505089632060753, "loss": 0.0309, "step": 2910 }, { "epoch": 1.924851680949242, "grad_norm": 0.4290638566017151, "learning_rate": 0.00014466108607291003, "loss": 0.0309, "step": 2920 }, { "epoch": 1.931443638760712, "grad_norm": 0.6213640570640564, "learning_rate": 0.00014427042625063646, "loss": 0.0358, "step": 2930 }, { "epoch": 1.938035596572182, "grad_norm": 0.6244672536849976, "learning_rate": 0.00014387892428519258, "loss": 0.0387, "step": 2940 }, { "epoch": 1.944627554383652, "grad_norm": 0.380691796541214, "learning_rate": 0.000143486587624004, "loss": 0.0464, "step": 2950 }, { "epoch": 1.951219512195122, "grad_norm": 0.4133692979812622, "learning_rate": 0.00014309342373037455, "loss": 0.0329, "step": 2960 }, { "epoch": 1.9578114700065918, "grad_norm": 0.4502374529838562, "learning_rate": 0.00014269944008334418, "loss": 0.0334, "step": 2970 }, { "epoch": 1.964403427818062, "grad_norm": 0.5235921740531921, "learning_rate": 0.00014230464417754675, "loss": 0.033, "step": 2980 }, { "epoch": 1.9709953856295321, "grad_norm": 0.5345565676689148, "learning_rate": 0.00014190904352306757, "loss": 0.0371, "step": 2990 }, { "epoch": 1.977587343441002, "grad_norm": 0.34067875146865845, "learning_rate": 0.0001415126456453004, "loss": 0.0408, "step": 3000 }, { "epoch": 1.984179301252472, "grad_norm": 0.36922353506088257, "learning_rate": 0.00014111545808480434, "loss": 0.0315, "step": 3010 }, { "epoch": 1.990771259063942, "grad_norm": 0.36315643787384033, "learning_rate": 0.0001407174883971604, "loss": 0.0311, "step": 3020 }, { "epoch": 1.9973632168754119, "grad_norm": 0.35053545236587524, "learning_rate": 0.0001403187441528277, "loss": 0.0367, "step": 3030 }, { "epoch": 2.0039551746868822, "grad_norm": 0.5017916560173035, "learning_rate": 0.00013991923293699956, "loss": 0.0353, "step": 3040 }, { "epoch": 2.010547132498352, "grad_norm": 0.3657391667366028, "learning_rate": 0.00013951896234945925, "loss": 0.0404, "step": 3050 }, { "epoch": 2.017139090309822, "grad_norm": 0.5382429957389832, "learning_rate": 0.00013911794000443528, "loss": 0.0346, "step": 3060 }, { "epoch": 2.023731048121292, "grad_norm": 0.5115209221839905, "learning_rate": 0.0001387161735304566, "loss": 0.0288, "step": 3070 }, { "epoch": 2.030323005932762, "grad_norm": 0.5078955888748169, "learning_rate": 0.00013831367057020748, "loss": 0.0323, "step": 3080 }, { "epoch": 2.036914963744232, "grad_norm": 0.4034331440925598, "learning_rate": 0.00013791043878038224, "loss": 0.0397, "step": 3090 }, { "epoch": 2.0435069215557022, "grad_norm": 0.23669302463531494, "learning_rate": 0.0001375064858315394, "loss": 0.0314, "step": 3100 }, { "epoch": 2.050098879367172, "grad_norm": 0.3059588074684143, "learning_rate": 0.000137101819407956, "loss": 0.0276, "step": 3110 }, { "epoch": 2.056690837178642, "grad_norm": 0.5819403529167175, "learning_rate": 0.00013669644720748118, "loss": 0.0285, "step": 3120 }, { "epoch": 2.063282794990112, "grad_norm": 0.6815973520278931, "learning_rate": 0.00013629037694138995, "loss": 0.0329, "step": 3130 }, { "epoch": 2.069874752801582, "grad_norm": 0.28361934423446655, "learning_rate": 0.0001358836163342364, "loss": 0.0271, "step": 3140 }, { "epoch": 2.076466710613052, "grad_norm": 0.2907734513282776, "learning_rate": 0.00013547617312370663, "loss": 0.0309, "step": 3150 }, { "epoch": 2.0830586684245223, "grad_norm": 0.5272607207298279, "learning_rate": 0.00013506805506047198, "loss": 0.0308, "step": 3160 }, { "epoch": 2.089650626235992, "grad_norm": 0.23821255564689636, "learning_rate": 0.00013465926990804107, "loss": 0.0341, "step": 3170 }, { "epoch": 2.096242584047462, "grad_norm": 0.5370649099349976, "learning_rate": 0.00013424982544261248, "loss": 0.0316, "step": 3180 }, { "epoch": 2.102834541858932, "grad_norm": 0.3361760675907135, "learning_rate": 0.00013383972945292665, "loss": 0.0248, "step": 3190 }, { "epoch": 2.109426499670402, "grad_norm": 0.48819541931152344, "learning_rate": 0.00013342898974011774, "loss": 0.0347, "step": 3200 }, { "epoch": 2.1160184574818723, "grad_norm": 0.24430608749389648, "learning_rate": 0.00013301761411756543, "loss": 0.0269, "step": 3210 }, { "epoch": 2.1226104152933423, "grad_norm": 0.4588664770126343, "learning_rate": 0.00013260561041074598, "loss": 0.0276, "step": 3220 }, { "epoch": 2.129202373104812, "grad_norm": 0.5559895634651184, "learning_rate": 0.0001321929864570835, "loss": 0.0257, "step": 3230 }, { "epoch": 2.135794330916282, "grad_norm": 0.547458827495575, "learning_rate": 0.00013177975010580085, "loss": 0.0223, "step": 3240 }, { "epoch": 2.142386288727752, "grad_norm": 0.3017808198928833, "learning_rate": 0.00013136590921777053, "loss": 0.031, "step": 3250 }, { "epoch": 2.148978246539222, "grad_norm": 0.44043952226638794, "learning_rate": 0.00013095147166536486, "loss": 0.0276, "step": 3260 }, { "epoch": 2.1555702043506924, "grad_norm": 0.4227822422981262, "learning_rate": 0.0001305364453323062, "loss": 0.0296, "step": 3270 }, { "epoch": 2.1621621621621623, "grad_norm": 0.4026118516921997, "learning_rate": 0.0001301208381135173, "loss": 0.0301, "step": 3280 }, { "epoch": 2.168754119973632, "grad_norm": 0.5354869961738586, "learning_rate": 0.0001297046579149708, "loss": 0.0286, "step": 3290 }, { "epoch": 2.175346077785102, "grad_norm": 0.42211246490478516, "learning_rate": 0.00012928791265353902, "loss": 0.0336, "step": 3300 }, { "epoch": 2.181938035596572, "grad_norm": 0.3645992577075958, "learning_rate": 0.00012887061025684333, "loss": 0.0242, "step": 3310 }, { "epoch": 2.188529993408042, "grad_norm": 0.2105298638343811, "learning_rate": 0.00012845275866310324, "loss": 0.0228, "step": 3320 }, { "epoch": 2.1951219512195124, "grad_norm": 0.25215044617652893, "learning_rate": 0.00012803436582098558, "loss": 0.0243, "step": 3330 }, { "epoch": 2.2017139090309823, "grad_norm": 0.4196263253688812, "learning_rate": 0.00012761543968945306, "loss": 0.0282, "step": 3340 }, { "epoch": 2.2083058668424522, "grad_norm": 0.1937485933303833, "learning_rate": 0.00012719598823761308, "loss": 0.0278, "step": 3350 }, { "epoch": 2.214897824653922, "grad_norm": 0.5221042037010193, "learning_rate": 0.00012677601944456604, "loss": 0.0311, "step": 3360 }, { "epoch": 2.221489782465392, "grad_norm": 0.2941031754016876, "learning_rate": 0.0001263555412992535, "loss": 0.0303, "step": 3370 }, { "epoch": 2.2280817402768625, "grad_norm": 0.31689217686653137, "learning_rate": 0.00012593456180030646, "loss": 0.0252, "step": 3380 }, { "epoch": 2.2346736980883324, "grad_norm": 0.42106205224990845, "learning_rate": 0.0001255130889558928, "loss": 0.0249, "step": 3390 }, { "epoch": 2.2412656558998023, "grad_norm": 0.576701283454895, "learning_rate": 0.0001250911307835653, "loss": 0.0303, "step": 3400 }, { "epoch": 2.2478576137112722, "grad_norm": 0.49954476952552795, "learning_rate": 0.00012466869531010895, "loss": 0.0323, "step": 3410 }, { "epoch": 2.254449571522742, "grad_norm": 0.4963241517543793, "learning_rate": 0.0001242457905713883, "loss": 0.0316, "step": 3420 }, { "epoch": 2.261041529334212, "grad_norm": 0.23066122829914093, "learning_rate": 0.00012382242461219452, "loss": 0.0226, "step": 3430 }, { "epoch": 2.267633487145682, "grad_norm": 0.540354311466217, "learning_rate": 0.00012339860548609262, "loss": 0.0365, "step": 3440 }, { "epoch": 2.2742254449571524, "grad_norm": 0.48116335272789, "learning_rate": 0.0001229743412552679, "loss": 0.0268, "step": 3450 }, { "epoch": 2.2808174027686223, "grad_norm": 0.4430583417415619, "learning_rate": 0.00012254963999037285, "loss": 0.0263, "step": 3460 }, { "epoch": 2.2874093605800923, "grad_norm": 0.42470598220825195, "learning_rate": 0.0001221245097703735, "loss": 0.0354, "step": 3470 }, { "epoch": 2.294001318391562, "grad_norm": 0.31455087661743164, "learning_rate": 0.00012169895868239574, "loss": 0.0241, "step": 3480 }, { "epoch": 2.300593276203032, "grad_norm": 0.3215204179286957, "learning_rate": 0.00012127299482157149, "loss": 0.0332, "step": 3490 }, { "epoch": 2.3071852340145025, "grad_norm": 0.3963293135166168, "learning_rate": 0.00012084662629088481, "loss": 0.025, "step": 3500 }, { "epoch": 2.3137771918259724, "grad_norm": 0.4304813742637634, "learning_rate": 0.00012041986120101764, "loss": 0.0354, "step": 3510 }, { "epoch": 2.3203691496374423, "grad_norm": 0.3873739242553711, "learning_rate": 0.00011999270767019553, "loss": 0.0277, "step": 3520 }, { "epoch": 2.3269611074489123, "grad_norm": 0.4315703809261322, "learning_rate": 0.00011956517382403321, "loss": 0.0301, "step": 3530 }, { "epoch": 2.333553065260382, "grad_norm": 0.4416598081588745, "learning_rate": 0.00011913726779538008, "loss": 0.0283, "step": 3540 }, { "epoch": 2.3401450230718526, "grad_norm": 0.3677782416343689, "learning_rate": 0.0001187089977241654, "loss": 0.0355, "step": 3550 }, { "epoch": 2.3467369808833225, "grad_norm": 0.4988672733306885, "learning_rate": 0.00011828037175724356, "loss": 0.0314, "step": 3560 }, { "epoch": 2.3533289386947924, "grad_norm": 0.4604177474975586, "learning_rate": 0.00011785139804823906, "loss": 0.0337, "step": 3570 }, { "epoch": 2.3599208965062624, "grad_norm": 0.3596359193325043, "learning_rate": 0.00011742208475739133, "loss": 0.0295, "step": 3580 }, { "epoch": 2.3665128543177323, "grad_norm": 0.16485251486301422, "learning_rate": 0.0001169924400513996, "loss": 0.0275, "step": 3590 }, { "epoch": 2.373104812129202, "grad_norm": 0.3272377550601959, "learning_rate": 0.00011656247210326748, "loss": 0.0305, "step": 3600 }, { "epoch": 2.379696769940672, "grad_norm": 0.32883545756340027, "learning_rate": 0.0001161321890921476, "loss": 0.0314, "step": 3610 }, { "epoch": 2.3862887277521425, "grad_norm": 0.49502697587013245, "learning_rate": 0.00011570159920318584, "loss": 0.0323, "step": 3620 }, { "epoch": 2.3928806855636124, "grad_norm": 0.3317064344882965, "learning_rate": 0.00011527071062736583, "loss": 0.0284, "step": 3630 }, { "epoch": 2.3994726433750824, "grad_norm": 0.29318150877952576, "learning_rate": 0.00011483953156135292, "loss": 0.0226, "step": 3640 }, { "epoch": 2.4060646011865523, "grad_norm": 0.48932701349258423, "learning_rate": 0.00011440807020733843, "loss": 0.0287, "step": 3650 }, { "epoch": 2.4126565589980222, "grad_norm": 0.358005166053772, "learning_rate": 0.00011397633477288359, "loss": 0.0235, "step": 3660 }, { "epoch": 2.4192485168094926, "grad_norm": 0.3554854691028595, "learning_rate": 0.00011354433347076331, "loss": 0.0269, "step": 3670 }, { "epoch": 2.4258404746209625, "grad_norm": 0.3954286277294159, "learning_rate": 0.00011311207451881008, "loss": 0.0264, "step": 3680 }, { "epoch": 2.4324324324324325, "grad_norm": 0.3300182819366455, "learning_rate": 0.00011267956613975752, "loss": 0.0291, "step": 3690 }, { "epoch": 2.4390243902439024, "grad_norm": 0.22343868017196655, "learning_rate": 0.00011224681656108411, "loss": 0.0251, "step": 3700 }, { "epoch": 2.4456163480553723, "grad_norm": 0.3663915991783142, "learning_rate": 0.00011181383401485656, "loss": 0.0295, "step": 3710 }, { "epoch": 2.4522083058668427, "grad_norm": 0.39715585112571716, "learning_rate": 0.00011138062673757325, "loss": 0.0299, "step": 3720 }, { "epoch": 2.4588002636783126, "grad_norm": 0.3747979402542114, "learning_rate": 0.00011094720297000753, "loss": 0.0295, "step": 3730 }, { "epoch": 2.4653922214897825, "grad_norm": 0.2834596037864685, "learning_rate": 0.00011051357095705101, "loss": 0.0284, "step": 3740 }, { "epoch": 2.4719841793012525, "grad_norm": 0.3044513165950775, "learning_rate": 0.0001100797389475567, "loss": 0.0272, "step": 3750 }, { "epoch": 2.4785761371127224, "grad_norm": 0.39235764741897583, "learning_rate": 0.00010964571519418207, "loss": 0.024, "step": 3760 }, { "epoch": 2.4851680949241923, "grad_norm": 0.31392836570739746, "learning_rate": 0.00010921150795323207, "loss": 0.0229, "step": 3770 }, { "epoch": 2.4917600527356623, "grad_norm": 0.3227923512458801, "learning_rate": 0.00010877712548450207, "loss": 0.0235, "step": 3780 }, { "epoch": 2.4983520105471326, "grad_norm": 0.35434576869010925, "learning_rate": 0.00010834257605112079, "loss": 0.0265, "step": 3790 }, { "epoch": 2.5049439683586026, "grad_norm": 0.3610621988773346, "learning_rate": 0.00010790786791939301, "loss": 0.0286, "step": 3800 }, { "epoch": 2.5115359261700725, "grad_norm": 0.26061367988586426, "learning_rate": 0.00010747300935864243, "loss": 0.0302, "step": 3810 }, { "epoch": 2.5181278839815424, "grad_norm": 0.3455495536327362, "learning_rate": 0.00010703800864105429, "loss": 0.0283, "step": 3820 }, { "epoch": 2.5247198417930123, "grad_norm": 0.5354321002960205, "learning_rate": 0.00010660287404151807, "loss": 0.0279, "step": 3830 }, { "epoch": 2.5313117996044827, "grad_norm": 0.23394666612148285, "learning_rate": 0.00010616761383747, "loss": 0.0318, "step": 3840 }, { "epoch": 2.5379037574159526, "grad_norm": 0.3995780348777771, "learning_rate": 0.00010573223630873565, "loss": 0.0265, "step": 3850 }, { "epoch": 2.5444957152274226, "grad_norm": 0.4800235331058502, "learning_rate": 0.00010529674973737252, "loss": 0.0281, "step": 3860 }, { "epoch": 2.5510876730388925, "grad_norm": 0.2611030042171478, "learning_rate": 0.00010486116240751223, "loss": 0.0297, "step": 3870 }, { "epoch": 2.5576796308503624, "grad_norm": 0.3945279121398926, "learning_rate": 0.0001044254826052032, "loss": 0.025, "step": 3880 }, { "epoch": 2.564271588661833, "grad_norm": 0.5326240658760071, "learning_rate": 0.00010398971861825297, "loss": 0.0264, "step": 3890 }, { "epoch": 2.5708635464733027, "grad_norm": 0.3610016703605652, "learning_rate": 0.00010355387873607036, "loss": 0.0259, "step": 3900 }, { "epoch": 2.5774555042847727, "grad_norm": 0.3786564767360687, "learning_rate": 0.0001031179712495081, "loss": 0.0253, "step": 3910 }, { "epoch": 2.5840474620962426, "grad_norm": 0.5698022246360779, "learning_rate": 0.0001026820044507048, "loss": 0.021, "step": 3920 }, { "epoch": 2.5906394199077125, "grad_norm": 0.4795434772968292, "learning_rate": 0.00010224598663292737, "loss": 0.0267, "step": 3930 }, { "epoch": 2.5972313777191824, "grad_norm": 0.4011961817741394, "learning_rate": 0.00010180992609041325, "loss": 0.035, "step": 3940 }, { "epoch": 2.6038233355306524, "grad_norm": 0.5173267126083374, "learning_rate": 0.00010137383111821266, "loss": 0.0298, "step": 3950 }, { "epoch": 2.6104152933421227, "grad_norm": 0.47045668959617615, "learning_rate": 0.00010093771001203076, "loss": 0.0296, "step": 3960 }, { "epoch": 2.6170072511535927, "grad_norm": 0.5313148498535156, "learning_rate": 0.0001005015710680698, "loss": 0.026, "step": 3970 }, { "epoch": 2.6235992089650626, "grad_norm": 0.40992313623428345, "learning_rate": 0.00010006542258287139, "loss": 0.0213, "step": 3980 }, { "epoch": 2.6301911667765325, "grad_norm": 0.2713076174259186, "learning_rate": 9.96292728531586e-05, "loss": 0.0238, "step": 3990 }, { "epoch": 2.6367831245880025, "grad_norm": 0.41798898577690125, "learning_rate": 9.919313017567822e-05, "loss": 0.0269, "step": 4000 }, { "epoch": 2.643375082399473, "grad_norm": 0.26005855202674866, "learning_rate": 9.875700284704286e-05, "loss": 0.0262, "step": 4010 }, { "epoch": 2.6499670402109428, "grad_norm": 0.24366049468517303, "learning_rate": 9.83208991635732e-05, "loss": 0.0234, "step": 4020 }, { "epoch": 2.6565589980224127, "grad_norm": 0.424334317445755, "learning_rate": 9.788482742114003e-05, "loss": 0.0296, "step": 4030 }, { "epoch": 2.6631509558338826, "grad_norm": 0.3093094229698181, "learning_rate": 9.744879591500662e-05, "loss": 0.0282, "step": 4040 }, { "epoch": 2.6697429136453525, "grad_norm": 0.42985987663269043, "learning_rate": 9.701281293967083e-05, "loss": 0.031, "step": 4050 }, { "epoch": 2.676334871456823, "grad_norm": 0.3328607380390167, "learning_rate": 9.657688678870728e-05, "loss": 0.0318, "step": 4060 }, { "epoch": 2.682926829268293, "grad_norm": 0.35078462958335876, "learning_rate": 9.614102575460973e-05, "loss": 0.0268, "step": 4070 }, { "epoch": 2.6895187870797628, "grad_norm": 0.4191462993621826, "learning_rate": 9.57052381286331e-05, "loss": 0.03, "step": 4080 }, { "epoch": 2.6961107448912327, "grad_norm": 0.4283992648124695, "learning_rate": 9.526953220063603e-05, "loss": 0.0235, "step": 4090 }, { "epoch": 2.7027027027027026, "grad_norm": 0.35658934712409973, "learning_rate": 9.483391625892293e-05, "loss": 0.0243, "step": 4100 }, { "epoch": 2.7092946605141726, "grad_norm": 0.2613814175128937, "learning_rate": 9.439839859008653e-05, "loss": 0.0232, "step": 4110 }, { "epoch": 2.7158866183256425, "grad_norm": 0.24698810279369354, "learning_rate": 9.396298747885013e-05, "loss": 0.0232, "step": 4120 }, { "epoch": 2.722478576137113, "grad_norm": 0.25733861327171326, "learning_rate": 9.352769120790988e-05, "loss": 0.0231, "step": 4130 }, { "epoch": 2.729070533948583, "grad_norm": 0.288001149892807, "learning_rate": 9.309251805777754e-05, "loss": 0.0247, "step": 4140 }, { "epoch": 2.7356624917600527, "grad_norm": 0.47979527711868286, "learning_rate": 9.265747630662265e-05, "loss": 0.0315, "step": 4150 }, { "epoch": 2.7422544495715226, "grad_norm": 0.5932050943374634, "learning_rate": 9.22225742301153e-05, "loss": 0.0252, "step": 4160 }, { "epoch": 2.7488464073829926, "grad_norm": 0.3525910973548889, "learning_rate": 9.178782010126844e-05, "loss": 0.0249, "step": 4170 }, { "epoch": 2.755438365194463, "grad_norm": 0.27204054594039917, "learning_rate": 9.135322219028079e-05, "loss": 0.025, "step": 4180 }, { "epoch": 2.762030323005933, "grad_norm": 0.3478144407272339, "learning_rate": 9.091878876437933e-05, "loss": 0.0216, "step": 4190 }, { "epoch": 2.768622280817403, "grad_norm": 0.29393240809440613, "learning_rate": 9.04845280876621e-05, "loss": 0.0214, "step": 4200 }, { "epoch": 2.7752142386288727, "grad_norm": 0.21876759827136993, "learning_rate": 9.005044842094101e-05, "loss": 0.0245, "step": 4210 }, { "epoch": 2.7818061964403427, "grad_norm": 0.423742413520813, "learning_rate": 8.961655802158456e-05, "loss": 0.0241, "step": 4220 }, { "epoch": 2.788398154251813, "grad_norm": 0.38848140835762024, "learning_rate": 8.918286514336099e-05, "loss": 0.0238, "step": 4230 }, { "epoch": 2.794990112063283, "grad_norm": 0.28686466813087463, "learning_rate": 8.874937803628115e-05, "loss": 0.022, "step": 4240 }, { "epoch": 2.801582069874753, "grad_norm": 0.3457236588001251, "learning_rate": 8.831610494644148e-05, "loss": 0.0345, "step": 4250 }, { "epoch": 2.808174027686223, "grad_norm": 0.339136004447937, "learning_rate": 8.788305411586736e-05, "loss": 0.0194, "step": 4260 }, { "epoch": 2.8147659854976927, "grad_norm": 0.3297877907752991, "learning_rate": 8.745023378235602e-05, "loss": 0.0199, "step": 4270 }, { "epoch": 2.8213579433091627, "grad_norm": 0.39552271366119385, "learning_rate": 8.701765217932022e-05, "loss": 0.0266, "step": 4280 }, { "epoch": 2.8279499011206326, "grad_norm": 0.40580829977989197, "learning_rate": 8.658531753563122e-05, "loss": 0.0367, "step": 4290 }, { "epoch": 2.834541858932103, "grad_norm": 0.3342481553554535, "learning_rate": 8.615323807546258e-05, "loss": 0.0223, "step": 4300 }, { "epoch": 2.841133816743573, "grad_norm": 0.25729164481163025, "learning_rate": 8.572142201813363e-05, "loss": 0.023, "step": 4310 }, { "epoch": 2.847725774555043, "grad_norm": 0.3168254792690277, "learning_rate": 8.528987757795286e-05, "loss": 0.0237, "step": 4320 }, { "epoch": 2.8543177323665128, "grad_norm": 0.4179421365261078, "learning_rate": 8.485861296406207e-05, "loss": 0.0268, "step": 4330 }, { "epoch": 2.8609096901779827, "grad_norm": 0.46458080410957336, "learning_rate": 8.442763638027985e-05, "loss": 0.0216, "step": 4340 }, { "epoch": 2.867501647989453, "grad_norm": 0.35828524827957153, "learning_rate": 8.399695602494581e-05, "loss": 0.0204, "step": 4350 }, { "epoch": 2.874093605800923, "grad_norm": 0.34387773275375366, "learning_rate": 8.356658009076441e-05, "loss": 0.0239, "step": 4360 }, { "epoch": 2.880685563612393, "grad_norm": 0.3083021342754364, "learning_rate": 8.313651676464923e-05, "loss": 0.0228, "step": 4370 }, { "epoch": 2.887277521423863, "grad_norm": 0.2175825834274292, "learning_rate": 8.270677422756725e-05, "loss": 0.0201, "step": 4380 }, { "epoch": 2.8938694792353328, "grad_norm": 0.2774793803691864, "learning_rate": 8.227736065438302e-05, "loss": 0.0234, "step": 4390 }, { "epoch": 2.900461437046803, "grad_norm": 0.2598700523376465, "learning_rate": 8.184828421370348e-05, "loss": 0.0241, "step": 4400 }, { "epoch": 2.9070533948582726, "grad_norm": 0.3586549460887909, "learning_rate": 8.141955306772229e-05, "loss": 0.0162, "step": 4410 }, { "epoch": 2.913645352669743, "grad_norm": 0.26286324858665466, "learning_rate": 8.099117537206477e-05, "loss": 0.0212, "step": 4420 }, { "epoch": 2.920237310481213, "grad_norm": 0.4125373661518097, "learning_rate": 8.05631592756325e-05, "loss": 0.0202, "step": 4430 }, { "epoch": 2.926829268292683, "grad_norm": 0.29703447222709656, "learning_rate": 8.013551292044859e-05, "loss": 0.0213, "step": 4440 }, { "epoch": 2.933421226104153, "grad_norm": 0.3580416738986969, "learning_rate": 7.97082444415027e-05, "loss": 0.0226, "step": 4450 }, { "epoch": 2.9400131839156227, "grad_norm": 0.4119264781475067, "learning_rate": 7.928136196659614e-05, "loss": 0.0242, "step": 4460 }, { "epoch": 2.946605141727093, "grad_norm": 0.5699878931045532, "learning_rate": 7.885487361618754e-05, "loss": 0.0262, "step": 4470 }, { "epoch": 2.953197099538563, "grad_norm": 0.4126439094543457, "learning_rate": 7.842878750323801e-05, "loss": 0.021, "step": 4480 }, { "epoch": 2.959789057350033, "grad_norm": 0.42604967951774597, "learning_rate": 7.800311173305718e-05, "loss": 0.0219, "step": 4490 }, { "epoch": 2.966381015161503, "grad_norm": 0.19208472967147827, "learning_rate": 7.757785440314882e-05, "loss": 0.0284, "step": 4500 }, { "epoch": 2.972972972972973, "grad_norm": 0.43162015080451965, "learning_rate": 7.715302360305678e-05, "loss": 0.0192, "step": 4510 }, { "epoch": 2.979564930784443, "grad_norm": 0.7263951301574707, "learning_rate": 7.672862741421126e-05, "loss": 0.0299, "step": 4520 }, { "epoch": 2.986156888595913, "grad_norm": 0.3890402615070343, "learning_rate": 7.63046739097748e-05, "loss": 0.0222, "step": 4530 }, { "epoch": 2.992748846407383, "grad_norm": 0.25311848521232605, "learning_rate": 7.588117115448911e-05, "loss": 0.0208, "step": 4540 }, { "epoch": 2.999340804218853, "grad_norm": 0.33752700686454773, "learning_rate": 7.545812720452127e-05, "loss": 0.0263, "step": 4550 }, { "epoch": 3.005932762030323, "grad_norm": 0.2610788345336914, "learning_rate": 7.50355501073107e-05, "loss": 0.0246, "step": 4560 }, { "epoch": 3.012524719841793, "grad_norm": 0.32036837935447693, "learning_rate": 7.461344790141607e-05, "loss": 0.0283, "step": 4570 }, { "epoch": 3.019116677653263, "grad_norm": 0.4340413212776184, "learning_rate": 7.419182861636218e-05, "loss": 0.0293, "step": 4580 }, { "epoch": 3.025708635464733, "grad_norm": 0.39858514070510864, "learning_rate": 7.377070027248756e-05, "loss": 0.0186, "step": 4590 }, { "epoch": 3.032300593276203, "grad_norm": 0.26919031143188477, "learning_rate": 7.335007088079156e-05, "loss": 0.0208, "step": 4600 }, { "epoch": 3.038892551087673, "grad_norm": 0.4067997634410858, "learning_rate": 7.292994844278223e-05, "loss": 0.0261, "step": 4610 }, { "epoch": 3.045484508899143, "grad_norm": 0.4950489103794098, "learning_rate": 7.251034095032388e-05, "loss": 0.0292, "step": 4620 }, { "epoch": 3.052076466710613, "grad_norm": 0.2269221693277359, "learning_rate": 7.20912563854852e-05, "loss": 0.0175, "step": 4630 }, { "epoch": 3.058668424522083, "grad_norm": 0.32157209515571594, "learning_rate": 7.167270272038747e-05, "loss": 0.0187, "step": 4640 }, { "epoch": 3.065260382333553, "grad_norm": 0.2660551369190216, "learning_rate": 7.12546879170527e-05, "loss": 0.023, "step": 4650 }, { "epoch": 3.071852340145023, "grad_norm": 0.29758307337760925, "learning_rate": 7.08372199272524e-05, "loss": 0.0291, "step": 4660 }, { "epoch": 3.078444297956493, "grad_norm": 0.32291552424430847, "learning_rate": 7.042030669235606e-05, "loss": 0.0334, "step": 4670 }, { "epoch": 3.085036255767963, "grad_norm": 0.481623113155365, "learning_rate": 7.000395614318038e-05, "loss": 0.0192, "step": 4680 }, { "epoch": 3.0916282135794333, "grad_norm": 0.36292940378189087, "learning_rate": 6.958817619983822e-05, "loss": 0.0279, "step": 4690 }, { "epoch": 3.098220171390903, "grad_norm": 0.34903573989868164, "learning_rate": 6.917297477158792e-05, "loss": 0.0219, "step": 4700 }, { "epoch": 3.104812129202373, "grad_norm": 0.290768563747406, "learning_rate": 6.875835975668298e-05, "loss": 0.0245, "step": 4710 }, { "epoch": 3.111404087013843, "grad_norm": 0.4250969886779785, "learning_rate": 6.834433904222162e-05, "loss": 0.0239, "step": 4720 }, { "epoch": 3.117996044825313, "grad_norm": 0.31465357542037964, "learning_rate": 6.793092050399698e-05, "loss": 0.0227, "step": 4730 }, { "epoch": 3.124588002636783, "grad_norm": 0.46385765075683594, "learning_rate": 6.75181120063471e-05, "loss": 0.0271, "step": 4740 }, { "epoch": 3.1311799604482533, "grad_norm": 0.37862929701805115, "learning_rate": 6.710592140200542e-05, "loss": 0.0227, "step": 4750 }, { "epoch": 3.1377719182597232, "grad_norm": 0.49200916290283203, "learning_rate": 6.669435653195146e-05, "loss": 0.0201, "step": 4760 }, { "epoch": 3.144363876071193, "grad_norm": 0.4198756217956543, "learning_rate": 6.628342522526143e-05, "loss": 0.0216, "step": 4770 }, { "epoch": 3.150955833882663, "grad_norm": 0.5533847212791443, "learning_rate": 6.587313529895957e-05, "loss": 0.034, "step": 4780 }, { "epoch": 3.157547791694133, "grad_norm": 0.37719669938087463, "learning_rate": 6.546349455786926e-05, "loss": 0.0282, "step": 4790 }, { "epoch": 3.164139749505603, "grad_norm": 0.6606992483139038, "learning_rate": 6.505451079446467e-05, "loss": 0.0217, "step": 4800 }, { "epoch": 3.1707317073170733, "grad_norm": 0.20845943689346313, "learning_rate": 6.464619178872247e-05, "loss": 0.023, "step": 4810 }, { "epoch": 3.1773236651285433, "grad_norm": 0.23495689034461975, "learning_rate": 6.42385453079738e-05, "loss": 0.0256, "step": 4820 }, { "epoch": 3.183915622940013, "grad_norm": 0.1919371336698532, "learning_rate": 6.38315791067567e-05, "loss": 0.019, "step": 4830 }, { "epoch": 3.190507580751483, "grad_norm": 0.3485127091407776, "learning_rate": 6.342530092666821e-05, "loss": 0.0205, "step": 4840 }, { "epoch": 3.197099538562953, "grad_norm": 0.2419605702161789, "learning_rate": 6.301971849621757e-05, "loss": 0.0197, "step": 4850 }, { "epoch": 3.2036914963744234, "grad_norm": 0.23359638452529907, "learning_rate": 6.261483953067886e-05, "loss": 0.0215, "step": 4860 }, { "epoch": 3.2102834541858933, "grad_norm": 0.4236893355846405, "learning_rate": 6.221067173194442e-05, "loss": 0.0259, "step": 4870 }, { "epoch": 3.2168754119973633, "grad_norm": 0.35271692276000977, "learning_rate": 6.180722278837825e-05, "loss": 0.0229, "step": 4880 }, { "epoch": 3.223467369808833, "grad_norm": 0.5368591547012329, "learning_rate": 6.140450037466974e-05, "loss": 0.0227, "step": 4890 }, { "epoch": 3.230059327620303, "grad_norm": 0.3813161849975586, "learning_rate": 6.1002512151687796e-05, "loss": 0.0175, "step": 4900 }, { "epoch": 3.236651285431773, "grad_norm": 0.40781912207603455, "learning_rate": 6.060126576633497e-05, "loss": 0.0278, "step": 4910 }, { "epoch": 3.2432432432432434, "grad_norm": 0.3028331398963928, "learning_rate": 6.0200768851402133e-05, "loss": 0.0212, "step": 4920 }, { "epoch": 3.2498352010547134, "grad_norm": 0.20801442861557007, "learning_rate": 5.980102902542306e-05, "loss": 0.0244, "step": 4930 }, { "epoch": 3.2564271588661833, "grad_norm": 0.3236633241176605, "learning_rate": 5.9402053892529794e-05, "loss": 0.023, "step": 4940 }, { "epoch": 3.263019116677653, "grad_norm": 0.3075791895389557, "learning_rate": 5.9003851042307804e-05, "loss": 0.0193, "step": 4950 }, { "epoch": 3.269611074489123, "grad_norm": 0.33486539125442505, "learning_rate": 5.86064280496516e-05, "loss": 0.0212, "step": 4960 }, { "epoch": 3.276203032300593, "grad_norm": 0.4018231928348541, "learning_rate": 5.8209792474620815e-05, "loss": 0.0215, "step": 4970 }, { "epoch": 3.2827949901120634, "grad_norm": 0.35829004645347595, "learning_rate": 5.78139518622961e-05, "loss": 0.0228, "step": 4980 }, { "epoch": 3.2893869479235334, "grad_norm": 0.2682739496231079, "learning_rate": 5.741891374263593e-05, "loss": 0.0255, "step": 4990 }, { "epoch": 3.2959789057350033, "grad_norm": 0.3929627537727356, "learning_rate": 5.702468563033306e-05, "loss": 0.0228, "step": 5000 }, { "epoch": 3.3025708635464732, "grad_norm": 0.2807949483394623, "learning_rate": 5.663127502467184e-05, "loss": 0.0207, "step": 5010 }, { "epoch": 3.309162821357943, "grad_norm": 0.33235079050064087, "learning_rate": 5.6238689409385346e-05, "loss": 0.0243, "step": 5020 }, { "epoch": 3.3157547791694135, "grad_norm": 0.28995218873023987, "learning_rate": 5.5846936252513174e-05, "loss": 0.017, "step": 5030 }, { "epoch": 3.3223467369808835, "grad_norm": 0.2601809799671173, "learning_rate": 5.54560230062593e-05, "loss": 0.0166, "step": 5040 }, { "epoch": 3.3289386947923534, "grad_norm": 0.3650406301021576, "learning_rate": 5.5065957106850204e-05, "loss": 0.021, "step": 5050 }, { "epoch": 3.3355306526038233, "grad_norm": 0.48497456312179565, "learning_rate": 5.4676745974393764e-05, "loss": 0.0173, "step": 5060 }, { "epoch": 3.3421226104152932, "grad_norm": 0.3954178988933563, "learning_rate": 5.4288397012737646e-05, "loss": 0.02, "step": 5070 }, { "epoch": 3.348714568226763, "grad_norm": 0.21555176377296448, "learning_rate": 5.390091760932887e-05, "loss": 0.0208, "step": 5080 }, { "epoch": 3.3553065260382335, "grad_norm": 0.4477789103984833, "learning_rate": 5.3514315135073076e-05, "loss": 0.023, "step": 5090 }, { "epoch": 3.3618984838497035, "grad_norm": 0.4595910906791687, "learning_rate": 5.3128596944194234e-05, "loss": 0.027, "step": 5100 }, { "epoch": 3.3684904416611734, "grad_norm": 0.3426424264907837, "learning_rate": 5.274377037409497e-05, "loss": 0.0224, "step": 5110 }, { "epoch": 3.3750823994726433, "grad_norm": 0.2647363841533661, "learning_rate": 5.235984274521684e-05, "loss": 0.0238, "step": 5120 }, { "epoch": 3.3816743572841133, "grad_norm": 0.21992464363574982, "learning_rate": 5.197682136090107e-05, "loss": 0.0163, "step": 5130 }, { "epoch": 3.388266315095583, "grad_norm": 0.6907774209976196, "learning_rate": 5.159471350724978e-05, "loss": 0.0223, "step": 5140 }, { "epoch": 3.3948582729070536, "grad_norm": 0.44378501176834106, "learning_rate": 5.121352645298708e-05, "loss": 0.0245, "step": 5150 }, { "epoch": 3.4014502307185235, "grad_norm": 0.25844740867614746, "learning_rate": 5.083326744932117e-05, "loss": 0.0211, "step": 5160 }, { "epoch": 3.4080421885299934, "grad_norm": 0.3211382031440735, "learning_rate": 5.0453943729806094e-05, "loss": 0.0207, "step": 5170 }, { "epoch": 3.4146341463414633, "grad_norm": 0.25202128291130066, "learning_rate": 5.007556251020434e-05, "loss": 0.0215, "step": 5180 }, { "epoch": 3.4212261041529333, "grad_norm": 0.3003428876399994, "learning_rate": 4.9698130988349424e-05, "loss": 0.0207, "step": 5190 }, { "epoch": 3.4278180619644036, "grad_norm": 0.32026761770248413, "learning_rate": 4.9321656344009115e-05, "loss": 0.0196, "step": 5200 }, { "epoch": 3.4344100197758736, "grad_norm": 0.26623809337615967, "learning_rate": 4.894614573874877e-05, "loss": 0.0219, "step": 5210 }, { "epoch": 3.4410019775873435, "grad_norm": 0.35238540172576904, "learning_rate": 4.857160631579509e-05, "loss": 0.0152, "step": 5220 }, { "epoch": 3.4475939353988134, "grad_norm": 0.3443749248981476, "learning_rate": 4.819804519990033e-05, "loss": 0.0232, "step": 5230 }, { "epoch": 3.4541858932102834, "grad_norm": 0.35800328850746155, "learning_rate": 4.782546949720658e-05, "loss": 0.0217, "step": 5240 }, { "epoch": 3.4607778510217533, "grad_norm": 0.37850216031074524, "learning_rate": 4.745388629511084e-05, "loss": 0.0167, "step": 5250 }, { "epoch": 3.4673698088332237, "grad_norm": 0.24581514298915863, "learning_rate": 4.708330266212993e-05, "loss": 0.0179, "step": 5260 }, { "epoch": 3.4739617666446936, "grad_norm": 0.16642197966575623, "learning_rate": 4.671372564776629e-05, "loss": 0.0169, "step": 5270 }, { "epoch": 3.4805537244561635, "grad_norm": 0.32910865545272827, "learning_rate": 4.634516228237372e-05, "loss": 0.019, "step": 5280 }, { "epoch": 3.4871456822676334, "grad_norm": 0.21662920713424683, "learning_rate": 4.59776195770236e-05, "loss": 0.0162, "step": 5290 }, { "epoch": 3.4937376400791034, "grad_norm": 0.3485572934150696, "learning_rate": 4.561110452337171e-05, "loss": 0.0217, "step": 5300 }, { "epoch": 3.5003295978905733, "grad_norm": 0.20581798255443573, "learning_rate": 4.5245624093525e-05, "loss": 0.0296, "step": 5310 }, { "epoch": 3.5069215557020437, "grad_norm": 0.35009968280792236, "learning_rate": 4.488118523990915e-05, "loss": 0.0208, "step": 5320 }, { "epoch": 3.5135135135135136, "grad_norm": 0.39382439851760864, "learning_rate": 4.451779489513628e-05, "loss": 0.0217, "step": 5330 }, { "epoch": 3.5201054713249835, "grad_norm": 0.348563551902771, "learning_rate": 4.415545997187296e-05, "loss": 0.0165, "step": 5340 }, { "epoch": 3.5266974291364535, "grad_norm": 0.494354784488678, "learning_rate": 4.379418736270886e-05, "loss": 0.0232, "step": 5350 }, { "epoch": 3.5332893869479234, "grad_norm": 0.1578008085489273, "learning_rate": 4.343398394002547e-05, "loss": 0.0226, "step": 5360 }, { "epoch": 3.5398813447593938, "grad_norm": 0.3410768210887909, "learning_rate": 4.307485655586557e-05, "loss": 0.0219, "step": 5370 }, { "epoch": 3.5464733025708637, "grad_norm": 0.20960773527622223, "learning_rate": 4.271681204180268e-05, "loss": 0.0209, "step": 5380 }, { "epoch": 3.5530652603823336, "grad_norm": 0.22281195223331451, "learning_rate": 4.2359857208811284e-05, "loss": 0.0233, "step": 5390 }, { "epoch": 3.5596572181938035, "grad_norm": 0.3393511474132538, "learning_rate": 4.2003998847137174e-05, "loss": 0.0209, "step": 5400 }, { "epoch": 3.5662491760052735, "grad_norm": 0.6712432503700256, "learning_rate": 4.164924372616821e-05, "loss": 0.0249, "step": 5410 }, { "epoch": 3.572841133816744, "grad_norm": 0.18807201087474823, "learning_rate": 4.129559859430573e-05, "loss": 0.024, "step": 5420 }, { "epoch": 3.5794330916282133, "grad_norm": 0.4251366853713989, "learning_rate": 4.094307017883606e-05, "loss": 0.0174, "step": 5430 }, { "epoch": 3.5860250494396837, "grad_norm": 0.2247576266527176, "learning_rate": 4.0591665185802576e-05, "loss": 0.0214, "step": 5440 }, { "epoch": 3.5926170072511536, "grad_norm": 0.643822968006134, "learning_rate": 4.0241390299878e-05, "loss": 0.0222, "step": 5450 }, { "epoch": 3.5992089650626236, "grad_norm": 0.37506723403930664, "learning_rate": 3.989225218423753e-05, "loss": 0.0147, "step": 5460 }, { "epoch": 3.6058009228740935, "grad_norm": 0.3052820861339569, "learning_rate": 3.954425748043186e-05, "loss": 0.0191, "step": 5470 }, { "epoch": 3.6123928806855634, "grad_norm": 0.3424012362957001, "learning_rate": 3.9197412808260805e-05, "loss": 0.0214, "step": 5480 }, { "epoch": 3.618984838497034, "grad_norm": 0.24967588484287262, "learning_rate": 3.885172476564765e-05, "loss": 0.0157, "step": 5490 }, { "epoch": 3.6255767963085037, "grad_norm": 0.2771139442920685, "learning_rate": 3.850719992851326e-05, "loss": 0.0198, "step": 5500 }, { "epoch": 3.6321687541199736, "grad_norm": 0.3275032043457031, "learning_rate": 3.8163844850651346e-05, "loss": 0.0204, "step": 5510 }, { "epoch": 3.6387607119314436, "grad_norm": 0.3696538507938385, "learning_rate": 3.7821666063603566e-05, "loss": 0.0172, "step": 5520 }, { "epoch": 3.6453526697429135, "grad_norm": 0.43786558508872986, "learning_rate": 3.748067007653536e-05, "loss": 0.0199, "step": 5530 }, { "epoch": 3.651944627554384, "grad_norm": 0.15298739075660706, "learning_rate": 3.714086337611217e-05, "loss": 0.0118, "step": 5540 }, { "epoch": 3.658536585365854, "grad_norm": 0.2643417716026306, "learning_rate": 3.680225242637583e-05, "loss": 0.0217, "step": 5550 }, { "epoch": 3.6651285431773237, "grad_norm": 0.29987242817878723, "learning_rate": 3.646484366862197e-05, "loss": 0.0218, "step": 5560 }, { "epoch": 3.6717205009887937, "grad_norm": 0.2553282678127289, "learning_rate": 3.6128643521277096e-05, "loss": 0.0192, "step": 5570 }, { "epoch": 3.6783124588002636, "grad_norm": 0.24411100149154663, "learning_rate": 3.57936583797768e-05, "loss": 0.0156, "step": 5580 }, { "epoch": 3.684904416611734, "grad_norm": 0.2638270854949951, "learning_rate": 3.5459894616443954e-05, "loss": 0.0188, "step": 5590 }, { "epoch": 3.6914963744232034, "grad_norm": 0.19742664694786072, "learning_rate": 3.5127358580367463e-05, "loss": 0.021, "step": 5600 }, { "epoch": 3.698088332234674, "grad_norm": 0.3131982386112213, "learning_rate": 3.479605659728159e-05, "loss": 0.0176, "step": 5610 }, { "epoch": 3.7046802900461437, "grad_norm": 0.24199941754341125, "learning_rate": 3.446599496944557e-05, "loss": 0.0178, "step": 5620 }, { "epoch": 3.7112722478576137, "grad_norm": 0.18790839612483978, "learning_rate": 3.413717997552376e-05, "loss": 0.012, "step": 5630 }, { "epoch": 3.7178642056690836, "grad_norm": 0.4031229317188263, "learning_rate": 3.380961787046605e-05, "loss": 0.022, "step": 5640 }, { "epoch": 3.7244561634805535, "grad_norm": 0.3094145357608795, "learning_rate": 3.348331488538913e-05, "loss": 0.0207, "step": 5650 }, { "epoch": 3.731048121292024, "grad_norm": 0.31893035769462585, "learning_rate": 3.315827722745779e-05, "loss": 0.0195, "step": 5660 }, { "epoch": 3.737640079103494, "grad_norm": 0.2687014639377594, "learning_rate": 3.28345110797668e-05, "loss": 0.0152, "step": 5670 }, { "epoch": 3.7442320369149638, "grad_norm": 0.3952026963233948, "learning_rate": 3.2512022601223515e-05, "loss": 0.0247, "step": 5680 }, { "epoch": 3.7508239947264337, "grad_norm": 0.25332149863243103, "learning_rate": 3.21908179264304e-05, "loss": 0.0142, "step": 5690 }, { "epoch": 3.7574159525379036, "grad_norm": 0.4335060119628906, "learning_rate": 3.187090316556861e-05, "loss": 0.0202, "step": 5700 }, { "epoch": 3.764007910349374, "grad_norm": 0.25930336117744446, "learning_rate": 3.155228440428164e-05, "loss": 0.0208, "step": 5710 }, { "epoch": 3.770599868160844, "grad_norm": 0.6695492267608643, "learning_rate": 3.123496770355956e-05, "loss": 0.0153, "step": 5720 }, { "epoch": 3.777191825972314, "grad_norm": 0.3357510566711426, "learning_rate": 3.091895909962375e-05, "loss": 0.021, "step": 5730 }, { "epoch": 3.7837837837837838, "grad_norm": 0.4220266342163086, "learning_rate": 3.060426460381195e-05, "loss": 0.0155, "step": 5740 }, { "epoch": 3.7903757415952537, "grad_norm": 0.2396579086780548, "learning_rate": 3.0290890202464182e-05, "loss": 0.017, "step": 5750 }, { "epoch": 3.796967699406724, "grad_norm": 0.4336076080799103, "learning_rate": 2.9978841856808525e-05, "loss": 0.0193, "step": 5760 }, { "epoch": 3.8035596572181936, "grad_norm": 0.4535181224346161, "learning_rate": 2.966812550284803e-05, "loss": 0.0151, "step": 5770 }, { "epoch": 3.810151615029664, "grad_norm": 0.2847338020801544, "learning_rate": 2.9358747051247637e-05, "loss": 0.0164, "step": 5780 }, { "epoch": 3.816743572841134, "grad_norm": 0.33757925033569336, "learning_rate": 2.905071238722169e-05, "loss": 0.0173, "step": 5790 }, { "epoch": 3.823335530652604, "grad_norm": 0.21222251653671265, "learning_rate": 2.8744027370422167e-05, "loss": 0.0186, "step": 5800 }, { "epoch": 3.8299274884640737, "grad_norm": 0.8053876757621765, "learning_rate": 2.843869783482701e-05, "loss": 0.0189, "step": 5810 }, { "epoch": 3.8365194462755436, "grad_norm": 0.2711152732372284, "learning_rate": 2.8134729588629303e-05, "loss": 0.0281, "step": 5820 }, { "epoch": 3.843111404087014, "grad_norm": 0.24810029566287994, "learning_rate": 2.7832128414126735e-05, "loss": 0.0169, "step": 5830 }, { "epoch": 3.849703361898484, "grad_norm": 0.3628500998020172, "learning_rate": 2.7530900067611577e-05, "loss": 0.0138, "step": 5840 }, { "epoch": 3.856295319709954, "grad_norm": 0.1820344775915146, "learning_rate": 2.7231050279261217e-05, "loss": 0.0201, "step": 5850 }, { "epoch": 3.862887277521424, "grad_norm": 0.5230331420898438, "learning_rate": 2.6932584753029068e-05, "loss": 0.0162, "step": 5860 }, { "epoch": 3.8694792353328937, "grad_norm": 0.27183738350868225, "learning_rate": 2.6635509166536243e-05, "loss": 0.0173, "step": 5870 }, { "epoch": 3.876071193144364, "grad_norm": 0.19195932149887085, "learning_rate": 2.633982917096335e-05, "loss": 0.0207, "step": 5880 }, { "epoch": 3.882663150955834, "grad_norm": 0.42282554507255554, "learning_rate": 2.6045550390943185e-05, "loss": 0.0159, "step": 5890 }, { "epoch": 3.889255108767304, "grad_norm": 0.2981650233268738, "learning_rate": 2.5752678424453514e-05, "loss": 0.0173, "step": 5900 }, { "epoch": 3.895847066578774, "grad_norm": 0.32203352451324463, "learning_rate": 2.5461218842710798e-05, "loss": 0.021, "step": 5910 }, { "epoch": 3.902439024390244, "grad_norm": 0.2388588786125183, "learning_rate": 2.517117719006411e-05, "loss": 0.0219, "step": 5920 }, { "epoch": 3.9090309822017137, "grad_norm": 0.40328285098075867, "learning_rate": 2.488255898388966e-05, "loss": 0.0169, "step": 5930 }, { "epoch": 3.9156229400131837, "grad_norm": 0.14190708100795746, "learning_rate": 2.4595369714485895e-05, "loss": 0.0167, "step": 5940 }, { "epoch": 3.922214897824654, "grad_norm": 0.418643593788147, "learning_rate": 2.430961484496893e-05, "loss": 0.0187, "step": 5950 }, { "epoch": 3.928806855636124, "grad_norm": 0.2280479073524475, "learning_rate": 2.4025299811168843e-05, "loss": 0.0151, "step": 5960 }, { "epoch": 3.935398813447594, "grad_norm": 0.5002431869506836, "learning_rate": 2.3742430021526018e-05, "loss": 0.019, "step": 5970 }, { "epoch": 3.941990771259064, "grad_norm": 0.22551734745502472, "learning_rate": 2.3461010856988473e-05, "loss": 0.013, "step": 5980 }, { "epoch": 3.9485827290705338, "grad_norm": 0.3069497048854828, "learning_rate": 2.318104767090944e-05, "loss": 0.018, "step": 5990 }, { "epoch": 3.955174686882004, "grad_norm": 0.36286690831184387, "learning_rate": 2.2902545788945396e-05, "loss": 0.024, "step": 6000 }, { "epoch": 3.961766644693474, "grad_norm": 0.2421414703130722, "learning_rate": 2.2625510508954952e-05, "loss": 0.0212, "step": 6010 }, { "epoch": 3.968358602504944, "grad_norm": 0.23019398748874664, "learning_rate": 2.234994710089795e-05, "loss": 0.0188, "step": 6020 }, { "epoch": 3.974950560316414, "grad_norm": 0.2802564203739166, "learning_rate": 2.207586080673528e-05, "loss": 0.0192, "step": 6030 }, { "epoch": 3.981542518127884, "grad_norm": 0.2667250633239746, "learning_rate": 2.1803256840329134e-05, "loss": 0.0213, "step": 6040 }, { "epoch": 3.988134475939354, "grad_norm": 0.4056625962257385, "learning_rate": 2.1532140387343735e-05, "loss": 0.0169, "step": 6050 }, { "epoch": 3.994726433750824, "grad_norm": 0.1790419965982437, "learning_rate": 2.126251660514691e-05, "loss": 0.0185, "step": 6060 }, { "epoch": 4.001318391562294, "grad_norm": 0.2861385941505432, "learning_rate": 2.0994390622711734e-05, "loss": 0.0191, "step": 6070 }, { "epoch": 4.0079103493737644, "grad_norm": 0.20970335602760315, "learning_rate": 2.0727767540519193e-05, "loss": 0.0171, "step": 6080 }, { "epoch": 4.014502307185234, "grad_norm": 0.2126467227935791, "learning_rate": 2.046265243046094e-05, "loss": 0.0175, "step": 6090 }, { "epoch": 4.021094264996704, "grad_norm": 0.4862785339355469, "learning_rate": 2.0199050335743007e-05, "loss": 0.0212, "step": 6100 }, { "epoch": 4.027686222808174, "grad_norm": 0.36454570293426514, "learning_rate": 1.9936966270789738e-05, "loss": 0.0159, "step": 6110 }, { "epoch": 4.034278180619644, "grad_norm": 0.1897134780883789, "learning_rate": 1.9676405221148475e-05, "loss": 0.0172, "step": 6120 }, { "epoch": 4.040870138431114, "grad_norm": 0.2542422115802765, "learning_rate": 1.9417372143394697e-05, "loss": 0.0251, "step": 6130 }, { "epoch": 4.047462096242584, "grad_norm": 0.20512335002422333, "learning_rate": 1.9159871965037657e-05, "loss": 0.0172, "step": 6140 }, { "epoch": 4.054054054054054, "grad_norm": 0.21565409004688263, "learning_rate": 1.8903909584426826e-05, "loss": 0.018, "step": 6150 }, { "epoch": 4.060646011865524, "grad_norm": 0.3546988368034363, "learning_rate": 1.86494898706585e-05, "loss": 0.0169, "step": 6160 }, { "epoch": 4.067237969676994, "grad_norm": 0.5294975638389587, "learning_rate": 1.8396617663483363e-05, "loss": 0.0159, "step": 6170 }, { "epoch": 4.073829927488464, "grad_norm": 0.2470693439245224, "learning_rate": 1.814529777321432e-05, "loss": 0.0211, "step": 6180 }, { "epoch": 4.080421885299934, "grad_norm": 0.4331272542476654, "learning_rate": 1.7895534980634954e-05, "loss": 0.0176, "step": 6190 }, { "epoch": 4.0870138431114045, "grad_norm": 0.3057391941547394, "learning_rate": 1.764733403690875e-05, "loss": 0.0203, "step": 6200 }, { "epoch": 4.093605800922874, "grad_norm": 0.11541125923395157, "learning_rate": 1.740069966348846e-05, "loss": 0.0193, "step": 6210 }, { "epoch": 4.100197758734344, "grad_norm": 0.28473731875419617, "learning_rate": 1.71556365520266e-05, "loss": 0.0196, "step": 6220 }, { "epoch": 4.106789716545814, "grad_norm": 0.14990141987800598, "learning_rate": 1.6912149364285958e-05, "loss": 0.0147, "step": 6230 }, { "epoch": 4.113381674357284, "grad_norm": 0.33358579874038696, "learning_rate": 1.667024273205092e-05, "loss": 0.02, "step": 6240 }, { "epoch": 4.119973632168755, "grad_norm": 0.2164691537618637, "learning_rate": 1.6429921257039592e-05, "loss": 0.0171, "step": 6250 }, { "epoch": 4.126565589980224, "grad_norm": 0.29503509402275085, "learning_rate": 1.619118951081594e-05, "loss": 0.0156, "step": 6260 }, { "epoch": 4.133157547791694, "grad_norm": 0.29893797636032104, "learning_rate": 1.5954052034703125e-05, "loss": 0.016, "step": 6270 }, { "epoch": 4.139749505603164, "grad_norm": 0.3970952033996582, "learning_rate": 1.5718513339696883e-05, "loss": 0.0191, "step": 6280 }, { "epoch": 4.146341463414634, "grad_norm": 0.2718060612678528, "learning_rate": 1.548457790637987e-05, "loss": 0.014, "step": 6290 }, { "epoch": 4.152933421226104, "grad_norm": 0.3720945119857788, "learning_rate": 1.525225018483638e-05, "loss": 0.0168, "step": 6300 }, { "epoch": 4.159525379037574, "grad_norm": 0.21513940393924713, "learning_rate": 1.5021534594567621e-05, "loss": 0.0159, "step": 6310 }, { "epoch": 4.1661173368490445, "grad_norm": 0.30618909001350403, "learning_rate": 1.4792435524407755e-05, "loss": 0.0151, "step": 6320 }, { "epoch": 4.172709294660514, "grad_norm": 0.409757524728775, "learning_rate": 1.4564957332440365e-05, "loss": 0.0177, "step": 6330 }, { "epoch": 4.179301252471984, "grad_norm": 0.2687203884124756, "learning_rate": 1.4339104345915554e-05, "loss": 0.0202, "step": 6340 }, { "epoch": 4.185893210283454, "grad_norm": 0.25398269295692444, "learning_rate": 1.4114880861167557e-05, "loss": 0.0189, "step": 6350 }, { "epoch": 4.192485168094924, "grad_norm": 0.2254013866186142, "learning_rate": 1.3892291143533154e-05, "loss": 0.0144, "step": 6360 }, { "epoch": 4.199077125906395, "grad_norm": 0.32205384969711304, "learning_rate": 1.3671339427270458e-05, "loss": 0.0161, "step": 6370 }, { "epoch": 4.205669083717864, "grad_norm": 0.3406763970851898, "learning_rate": 1.3452029915478304e-05, "loss": 0.02, "step": 6380 }, { "epoch": 4.2122610415293344, "grad_norm": 0.31815874576568604, "learning_rate": 1.3234366780016438e-05, "loss": 0.0185, "step": 6390 }, { "epoch": 4.218852999340804, "grad_norm": 0.1224733293056488, "learning_rate": 1.3018354161425994e-05, "loss": 0.0181, "step": 6400 }, { "epoch": 4.225444957152274, "grad_norm": 0.42326441407203674, "learning_rate": 1.2803996168850896e-05, "loss": 0.016, "step": 6410 }, { "epoch": 4.232036914963745, "grad_norm": 0.2917204797267914, "learning_rate": 1.2591296879959557e-05, "loss": 0.0146, "step": 6420 }, { "epoch": 4.238628872775214, "grad_norm": 0.27973493933677673, "learning_rate": 1.238026034086739e-05, "loss": 0.0167, "step": 6430 }, { "epoch": 4.2452208305866845, "grad_norm": 0.13871712982654572, "learning_rate": 1.2170890566059811e-05, "loss": 0.0161, "step": 6440 }, { "epoch": 4.251812788398154, "grad_norm": 0.2724437713623047, "learning_rate": 1.1963191538315833e-05, "loss": 0.0188, "step": 6450 }, { "epoch": 4.258404746209624, "grad_norm": 0.24582289159297943, "learning_rate": 1.1757167208632414e-05, "loss": 0.0142, "step": 6460 }, { "epoch": 4.264996704021094, "grad_norm": 0.6128583550453186, "learning_rate": 1.1552821496149135e-05, "loss": 0.015, "step": 6470 }, { "epoch": 4.271588661832564, "grad_norm": 0.38243502378463745, "learning_rate": 1.135015828807382e-05, "loss": 0.0135, "step": 6480 }, { "epoch": 4.278180619644035, "grad_norm": 0.22540901601314545, "learning_rate": 1.1149181439608514e-05, "loss": 0.0156, "step": 6490 }, { "epoch": 4.284772577455504, "grad_norm": 0.4100974500179291, "learning_rate": 1.0949894773876079e-05, "loss": 0.0156, "step": 6500 }, { "epoch": 4.2913645352669745, "grad_norm": 0.1929452121257782, "learning_rate": 1.0752302081847565e-05, "loss": 0.0184, "step": 6510 }, { "epoch": 4.297956493078444, "grad_norm": 0.27612316608428955, "learning_rate": 1.0556407122270096e-05, "loss": 0.0192, "step": 6520 }, { "epoch": 4.304548450889914, "grad_norm": 0.20837433636188507, "learning_rate": 1.0362213621595307e-05, "loss": 0.0135, "step": 6530 }, { "epoch": 4.311140408701385, "grad_norm": 0.38383790850639343, "learning_rate": 1.016972527390846e-05, "loss": 0.0186, "step": 6540 }, { "epoch": 4.317732366512854, "grad_norm": 0.3808279037475586, "learning_rate": 9.978945740858226e-06, "loss": 0.0172, "step": 6550 }, { "epoch": 4.324324324324325, "grad_norm": 0.12612776458263397, "learning_rate": 9.789878651587036e-06, "loss": 0.0131, "step": 6560 }, { "epoch": 4.330916282135794, "grad_norm": 0.47806084156036377, "learning_rate": 9.602527602661949e-06, "loss": 0.0175, "step": 6570 }, { "epoch": 4.337508239947264, "grad_norm": 0.5602189302444458, "learning_rate": 9.416896158006328e-06, "loss": 0.0161, "step": 6580 }, { "epoch": 4.344100197758735, "grad_norm": 0.5258492231369019, "learning_rate": 9.232987848832009e-06, "loss": 0.0151, "step": 6590 }, { "epoch": 4.350692155570204, "grad_norm": 0.18115440011024475, "learning_rate": 9.050806173572134e-06, "loss": 0.0115, "step": 6600 }, { "epoch": 4.357284113381675, "grad_norm": 0.2673959732055664, "learning_rate": 8.870354597814622e-06, "loss": 0.013, "step": 6610 }, { "epoch": 4.363876071193144, "grad_norm": 0.4614759385585785, "learning_rate": 8.691636554236182e-06, "loss": 0.0179, "step": 6620 }, { "epoch": 4.3704680290046145, "grad_norm": 0.31257471442222595, "learning_rate": 8.514655442537122e-06, "loss": 0.0152, "step": 6630 }, { "epoch": 4.377059986816084, "grad_norm": 0.1402910202741623, "learning_rate": 8.339414629376507e-06, "loss": 0.0155, "step": 6640 }, { "epoch": 4.383651944627554, "grad_norm": 0.19149114191532135, "learning_rate": 8.165917448308324e-06, "loss": 0.0132, "step": 6650 }, { "epoch": 4.390243902439025, "grad_norm": 0.31132665276527405, "learning_rate": 7.994167199717894e-06, "loss": 0.0159, "step": 6660 }, { "epoch": 4.396835860250494, "grad_norm": 0.30715203285217285, "learning_rate": 7.824167150759188e-06, "loss": 0.022, "step": 6670 }, { "epoch": 4.403427818061965, "grad_norm": 0.23801127076148987, "learning_rate": 7.655920535292682e-06, "loss": 0.0123, "step": 6680 }, { "epoch": 4.410019775873434, "grad_norm": 0.3437555730342865, "learning_rate": 7.4894305538237285e-06, "loss": 0.0154, "step": 6690 }, { "epoch": 4.4166117336849045, "grad_norm": 0.23300838470458984, "learning_rate": 7.324700373441828e-06, "loss": 0.0188, "step": 6700 }, { "epoch": 4.423203691496375, "grad_norm": 0.2827889621257782, "learning_rate": 7.161733127760228e-06, "loss": 0.0151, "step": 6710 }, { "epoch": 4.429795649307844, "grad_norm": 0.2165522575378418, "learning_rate": 7.000531916856512e-06, "loss": 0.0145, "step": 6720 }, { "epoch": 4.436387607119315, "grad_norm": 0.3993603587150574, "learning_rate": 6.841099807213392e-06, "loss": 0.024, "step": 6730 }, { "epoch": 4.442979564930784, "grad_norm": 0.21347716450691223, "learning_rate": 6.683439831660554e-06, "loss": 0.0254, "step": 6740 }, { "epoch": 4.4495715227422545, "grad_norm": 0.4783138036727905, "learning_rate": 6.527554989316897e-06, "loss": 0.0141, "step": 6750 }, { "epoch": 4.456163480553725, "grad_norm": 0.2551850378513336, "learning_rate": 6.373448245533464e-06, "loss": 0.0203, "step": 6760 }, { "epoch": 4.462755438365194, "grad_norm": 0.22933778166770935, "learning_rate": 6.221122531837076e-06, "loss": 0.0193, "step": 6770 }, { "epoch": 4.469347396176665, "grad_norm": 0.1832355260848999, "learning_rate": 6.070580745874544e-06, "loss": 0.0134, "step": 6780 }, { "epoch": 4.475939353988134, "grad_norm": 0.3792283535003662, "learning_rate": 5.921825751357557e-06, "loss": 0.0159, "step": 6790 }, { "epoch": 4.482531311799605, "grad_norm": 0.18225885927677155, "learning_rate": 5.7748603780081735e-06, "loss": 0.0217, "step": 6800 }, { "epoch": 4.489123269611074, "grad_norm": 0.49436914920806885, "learning_rate": 5.62968742150507e-06, "loss": 0.0158, "step": 6810 }, { "epoch": 4.4957152274225445, "grad_norm": 0.2793099582195282, "learning_rate": 5.4863096434302655e-06, "loss": 0.016, "step": 6820 }, { "epoch": 4.502307185234015, "grad_norm": 0.2998494505882263, "learning_rate": 5.344729771216661e-06, "loss": 0.0174, "step": 6830 }, { "epoch": 4.508899143045484, "grad_norm": 0.45131003856658936, "learning_rate": 5.204950498096117e-06, "loss": 0.0196, "step": 6840 }, { "epoch": 4.515491100856955, "grad_norm": 0.37397655844688416, "learning_rate": 5.066974483048215e-06, "loss": 0.0158, "step": 6850 }, { "epoch": 4.522083058668424, "grad_norm": 0.5381725430488586, "learning_rate": 4.930804350749729e-06, "loss": 0.016, "step": 6860 }, { "epoch": 4.528675016479895, "grad_norm": 0.2811379134654999, "learning_rate": 4.796442691524638e-06, "loss": 0.013, "step": 6870 }, { "epoch": 4.535266974291364, "grad_norm": 0.205452561378479, "learning_rate": 4.663892061294872e-06, "loss": 0.0165, "step": 6880 }, { "epoch": 4.541858932102834, "grad_norm": 0.2746995687484741, "learning_rate": 4.5331549815317174e-06, "loss": 0.0227, "step": 6890 }, { "epoch": 4.548450889914305, "grad_norm": 0.30904215574264526, "learning_rate": 4.404233939207791e-06, "loss": 0.0153, "step": 6900 }, { "epoch": 4.555042847725774, "grad_norm": 0.42725998163223267, "learning_rate": 4.2771313867498e-06, "loss": 0.0192, "step": 6910 }, { "epoch": 4.561634805537245, "grad_norm": 0.18472789227962494, "learning_rate": 4.151849741991864e-06, "loss": 0.025, "step": 6920 }, { "epoch": 4.568226763348715, "grad_norm": 0.3807401955127716, "learning_rate": 4.0283913881294935e-06, "loss": 0.0181, "step": 6930 }, { "epoch": 4.5748187211601845, "grad_norm": 0.17289142310619354, "learning_rate": 3.906758673674293e-06, "loss": 0.0148, "step": 6940 }, { "epoch": 4.581410678971655, "grad_norm": 0.32773271203041077, "learning_rate": 3.7869539124092525e-06, "loss": 0.0173, "step": 6950 }, { "epoch": 4.588002636783124, "grad_norm": 0.2213710993528366, "learning_rate": 3.6689793833447837e-06, "loss": 0.0137, "step": 6960 }, { "epoch": 4.594594594594595, "grad_norm": 0.17836393415927887, "learning_rate": 3.552837330675296e-06, "loss": 0.0184, "step": 6970 }, { "epoch": 4.601186552406064, "grad_norm": 0.2593984603881836, "learning_rate": 3.43852996373657e-06, "loss": 0.0138, "step": 6980 }, { "epoch": 4.607778510217535, "grad_norm": 0.2913285195827484, "learning_rate": 3.3260594569636928e-06, "loss": 0.0212, "step": 6990 }, { "epoch": 4.614370468029005, "grad_norm": 0.18963216245174408, "learning_rate": 3.215427949849714e-06, "loss": 0.0155, "step": 7000 }, { "epoch": 4.6209624258404745, "grad_norm": 0.30186694860458374, "learning_rate": 3.1066375469049337e-06, "loss": 0.0185, "step": 7010 }, { "epoch": 4.627554383651945, "grad_norm": 0.3594430685043335, "learning_rate": 2.9996903176168765e-06, "loss": 0.0157, "step": 7020 }, { "epoch": 4.634146341463414, "grad_norm": 0.407387912273407, "learning_rate": 2.8945882964109496e-06, "loss": 0.0155, "step": 7030 }, { "epoch": 4.640738299274885, "grad_norm": 0.1670001596212387, "learning_rate": 2.7913334826116357e-06, "loss": 0.0156, "step": 7040 }, { "epoch": 4.647330257086354, "grad_norm": 0.3461068272590637, "learning_rate": 2.689927840404638e-06, "loss": 0.0155, "step": 7050 }, { "epoch": 4.6539222148978245, "grad_norm": 0.1870720386505127, "learning_rate": 2.590373298799342e-06, "loss": 0.0137, "step": 7060 }, { "epoch": 4.660514172709295, "grad_norm": 0.5297737717628479, "learning_rate": 2.492671751592235e-06, "loss": 0.021, "step": 7070 }, { "epoch": 4.667106130520764, "grad_norm": 0.41437268257141113, "learning_rate": 2.3968250573308424e-06, "loss": 0.0166, "step": 7080 }, { "epoch": 4.673698088332235, "grad_norm": 0.2162405252456665, "learning_rate": 2.302835039278339e-06, "loss": 0.0163, "step": 7090 }, { "epoch": 4.680290046143705, "grad_norm": 0.3162844479084015, "learning_rate": 2.2107034853789288e-06, "loss": 0.0184, "step": 7100 }, { "epoch": 4.686882003955175, "grad_norm": 0.23974072933197021, "learning_rate": 2.1204321482238164e-06, "loss": 0.0187, "step": 7110 }, { "epoch": 4.693473961766645, "grad_norm": 0.24216875433921814, "learning_rate": 2.0320227450178254e-06, "loss": 0.0145, "step": 7120 }, { "epoch": 4.7000659195781145, "grad_norm": 0.3286508023738861, "learning_rate": 1.945476957546788e-06, "loss": 0.0189, "step": 7130 }, { "epoch": 4.706657877389585, "grad_norm": 0.22018277645111084, "learning_rate": 1.860796432145495e-06, "loss": 0.0164, "step": 7140 }, { "epoch": 4.713249835201054, "grad_norm": 0.18138107657432556, "learning_rate": 1.7779827796664538e-06, "loss": 0.0173, "step": 7150 }, { "epoch": 4.719841793012525, "grad_norm": 0.3609893321990967, "learning_rate": 1.6970375754491562e-06, "loss": 0.0291, "step": 7160 }, { "epoch": 4.726433750823995, "grad_norm": 0.31565043330192566, "learning_rate": 1.6179623592901926e-06, "loss": 0.014, "step": 7170 }, { "epoch": 4.733025708635465, "grad_norm": 0.27240124344825745, "learning_rate": 1.5407586354139193e-06, "loss": 0.0167, "step": 7180 }, { "epoch": 4.739617666446935, "grad_norm": 0.3199063837528229, "learning_rate": 1.4654278724438364e-06, "loss": 0.0164, "step": 7190 }, { "epoch": 4.746209624258404, "grad_norm": 0.23247933387756348, "learning_rate": 1.3919715033746893e-06, "loss": 0.0195, "step": 7200 }, { "epoch": 4.752801582069875, "grad_norm": 0.26770317554473877, "learning_rate": 1.3203909255451452e-06, "loss": 0.0125, "step": 7210 }, { "epoch": 4.759393539881344, "grad_norm": 0.2076646387577057, "learning_rate": 1.2506875006113027e-06, "loss": 0.0162, "step": 7220 }, { "epoch": 4.765985497692815, "grad_norm": 0.1567927598953247, "learning_rate": 1.1828625545207027e-06, "loss": 0.0142, "step": 7230 }, { "epoch": 4.772577455504285, "grad_norm": 0.3224427402019501, "learning_rate": 1.1169173774871478e-06, "loss": 0.0161, "step": 7240 }, { "epoch": 4.7791694133157545, "grad_norm": 0.5948562622070312, "learning_rate": 1.0528532239661547e-06, "loss": 0.0164, "step": 7250 }, { "epoch": 4.785761371127225, "grad_norm": 0.30895039439201355, "learning_rate": 9.906713126310974e-07, "loss": 0.0171, "step": 7260 }, { "epoch": 4.792353328938695, "grad_norm": 0.14259961247444153, "learning_rate": 9.303728263500011e-07, "loss": 0.0194, "step": 7270 }, { "epoch": 4.798945286750165, "grad_norm": 0.15019071102142334, "learning_rate": 8.719589121630622e-07, "loss": 0.0163, "step": 7280 }, { "epoch": 4.805537244561635, "grad_norm": 0.2892571687698364, "learning_rate": 8.154306812608315e-07, "loss": 0.0173, "step": 7290 }, { "epoch": 4.812129202373105, "grad_norm": 0.2563762962818146, "learning_rate": 7.607892089630308e-07, "loss": 0.0168, "step": 7300 }, { "epoch": 4.818721160184575, "grad_norm": 0.2222357541322708, "learning_rate": 7.080355346981815e-07, "loss": 0.014, "step": 7310 }, { "epoch": 4.8253131179960445, "grad_norm": 0.22898784279823303, "learning_rate": 6.571706619837526e-07, "loss": 0.0135, "step": 7320 }, { "epoch": 4.831905075807515, "grad_norm": 0.23187340795993805, "learning_rate": 6.081955584071097e-07, "loss": 0.0142, "step": 7330 }, { "epoch": 4.838497033618985, "grad_norm": 0.3049458861351013, "learning_rate": 5.61111155607108e-07, "loss": 0.0199, "step": 7340 }, { "epoch": 4.845088991430455, "grad_norm": 0.17564386129379272, "learning_rate": 5.159183492563613e-07, "loss": 0.0151, "step": 7350 }, { "epoch": 4.851680949241925, "grad_norm": 0.3510572016239166, "learning_rate": 4.7261799904420035e-07, "loss": 0.0164, "step": 7360 }, { "epoch": 4.8582729070533945, "grad_norm": 0.31466346979141235, "learning_rate": 4.3121092866031945e-07, "loss": 0.0176, "step": 7370 }, { "epoch": 4.864864864864865, "grad_norm": 0.2005147635936737, "learning_rate": 3.91697925779122e-07, "loss": 0.0168, "step": 7380 }, { "epoch": 4.871456822676334, "grad_norm": 0.1678527295589447, "learning_rate": 3.5407974204473284e-07, "loss": 0.0175, "step": 7390 }, { "epoch": 4.878048780487805, "grad_norm": 0.21754373610019684, "learning_rate": 3.1835709305668703e-07, "loss": 0.0127, "step": 7400 }, { "epoch": 4.884640738299275, "grad_norm": 0.21587257087230682, "learning_rate": 2.84530658356319e-07, "loss": 0.017, "step": 7410 }, { "epoch": 4.891232696110745, "grad_norm": 0.31447526812553406, "learning_rate": 2.526010814138613e-07, "loss": 0.0217, "step": 7420 }, { "epoch": 4.897824653922215, "grad_norm": 0.30843478441238403, "learning_rate": 2.2256896961616592e-07, "loss": 0.0181, "step": 7430 }, { "epoch": 4.904416611733685, "grad_norm": 0.29951369762420654, "learning_rate": 1.9443489425517992e-07, "loss": 0.0152, "step": 7440 }, { "epoch": 4.911008569545155, "grad_norm": 0.4117021858692169, "learning_rate": 1.6819939051706535e-07, "loss": 0.0127, "step": 7450 }, { "epoch": 4.917600527356625, "grad_norm": 0.11666778475046158, "learning_rate": 1.438629574720074e-07, "loss": 0.0144, "step": 7460 }, { "epoch": 4.924192485168095, "grad_norm": 0.3991844356060028, "learning_rate": 1.2142605806474417e-07, "loss": 0.0162, "step": 7470 }, { "epoch": 4.930784442979565, "grad_norm": 0.09675031900405884, "learning_rate": 1.0088911910576259e-07, "loss": 0.0223, "step": 7480 }, { "epoch": 4.937376400791035, "grad_norm": 0.3356577157974243, "learning_rate": 8.225253126314947e-08, "loss": 0.0168, "step": 7490 }, { "epoch": 4.943968358602505, "grad_norm": 0.27056625485420227, "learning_rate": 6.551664905517508e-08, "loss": 0.0166, "step": 7500 }, { "epoch": 4.950560316413975, "grad_norm": 0.24081185460090637, "learning_rate": 5.068179084355418e-08, "loss": 0.0164, "step": 7510 }, { "epoch": 4.957152274225445, "grad_norm": 0.3618698716163635, "learning_rate": 3.774823882738421e-08, "loss": 0.0176, "step": 7520 }, { "epoch": 4.963744232036915, "grad_norm": 0.20548762381076813, "learning_rate": 2.6716239037805068e-08, "loss": 0.0183, "step": 7530 }, { "epoch": 4.970336189848385, "grad_norm": 0.24806766211986542, "learning_rate": 1.7586001333258495e-08, "loss": 0.0156, "step": 7540 }, { "epoch": 4.976928147659855, "grad_norm": 0.3018137216567993, "learning_rate": 1.0357699395535658e-08, "loss": 0.0196, "step": 7550 }, { "epoch": 4.9835201054713245, "grad_norm": 0.24933604896068573, "learning_rate": 5.031470726490906e-09, "loss": 0.015, "step": 7560 }, { "epoch": 4.990112063282795, "grad_norm": 0.23485144972801208, "learning_rate": 1.6074166453883266e-09, "loss": 0.0103, "step": 7570 }, { "epoch": 4.996704021094265, "grad_norm": 0.4469901919364929, "learning_rate": 8.560228699217021e-11, "loss": 0.0147, "step": 7580 }, { "epoch": 4.998681608437706, "step": 7583, "total_flos": 2.658519488376864e+17, "train_loss": 0.03622536294503214, "train_runtime": 3445.8975, "train_samples_per_second": 35.209, "train_steps_per_second": 2.201 } ], "logging_steps": 10, "max_steps": 7583, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.658519488376864e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }