{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03893325477467703, "eval_steps": 500, "global_step": 15000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00012977751591559012, "grad_norm": 29.529769897460938, "learning_rate": 2.1196980511755674e-09, "loss": 2.2098, "step": 50 }, { "epoch": 0.00025955503183118023, "grad_norm": 30.7341251373291, "learning_rate": 4.282655246252677e-09, "loss": 2.2367, "step": 100 }, { "epoch": 0.00038933254774677035, "grad_norm": 29.822181701660156, "learning_rate": 6.4456124413297865e-09, "loss": 2.198, "step": 150 }, { "epoch": 0.0005191100636623605, "grad_norm": 28.564294815063477, "learning_rate": 8.608569636406895e-09, "loss": 2.2167, "step": 200 }, { "epoch": 0.0006488875795779505, "grad_norm": 28.514026641845703, "learning_rate": 1.0771526831484006e-08, "loss": 2.2001, "step": 250 }, { "epoch": 0.0007786650954935407, "grad_norm": 29.033863067626953, "learning_rate": 1.2934484026561114e-08, "loss": 2.1908, "step": 300 }, { "epoch": 0.0009084426114091308, "grad_norm": 4.49394416809082, "learning_rate": 1.5097441221638225e-08, "loss": 2.1841, "step": 350 }, { "epoch": 0.001038220127324721, "grad_norm": 2.1610796451568604, "learning_rate": 1.7260398416715337e-08, "loss": 2.1702, "step": 400 }, { "epoch": 0.0011679976432403109, "grad_norm": 1.9879209995269775, "learning_rate": 1.9423355611792444e-08, "loss": 2.1613, "step": 450 }, { "epoch": 0.001297775159155901, "grad_norm": 1.880299687385559, "learning_rate": 2.1586312806869556e-08, "loss": 2.1529, "step": 500 }, { "epoch": 0.0014275526750714912, "grad_norm": 1.7597101926803589, "learning_rate": 2.3749270001946664e-08, "loss": 2.1641, "step": 550 }, { "epoch": 0.0015573301909870814, "grad_norm": 1.5660640001296997, "learning_rate": 2.591222719702377e-08, "loss": 2.1501, "step": 600 }, { "epoch": 0.0016871077069026714, "grad_norm": 1.5481892824172974, "learning_rate": 2.8075184392100883e-08, "loss": 2.1583, "step": 650 }, { "epoch": 0.0018168852228182615, "grad_norm": 1.525515079498291, "learning_rate": 3.0238141587177994e-08, "loss": 2.1547, "step": 700 }, { "epoch": 0.0019466627387338517, "grad_norm": 1.4705111980438232, "learning_rate": 3.24010987822551e-08, "loss": 2.1365, "step": 750 }, { "epoch": 0.002076440254649442, "grad_norm": 1.3471802473068237, "learning_rate": 3.4564055977332216e-08, "loss": 2.1777, "step": 800 }, { "epoch": 0.002206217770565032, "grad_norm": 1.3963252305984497, "learning_rate": 3.6727013172409324e-08, "loss": 2.1356, "step": 850 }, { "epoch": 0.0023359952864806218, "grad_norm": 1.2665836811065674, "learning_rate": 3.888997036748643e-08, "loss": 2.1565, "step": 900 }, { "epoch": 0.002465772802396212, "grad_norm": 1.4085878133773804, "learning_rate": 4.105292756256354e-08, "loss": 2.1497, "step": 950 }, { "epoch": 0.002595550318311802, "grad_norm": 1.4020620584487915, "learning_rate": 4.3215884757640654e-08, "loss": 2.1549, "step": 1000 }, { "epoch": 0.002725327834227392, "grad_norm": 1.5391197204589844, "learning_rate": 4.5378841952717755e-08, "loss": 2.1519, "step": 1050 }, { "epoch": 0.0028551053501429825, "grad_norm": 1.2532999515533447, "learning_rate": 4.754179914779487e-08, "loss": 2.1411, "step": 1100 }, { "epoch": 0.0029848828660585724, "grad_norm": 1.2143096923828125, "learning_rate": 4.970475634287198e-08, "loss": 2.1435, "step": 1150 }, { "epoch": 0.003114660381974163, "grad_norm": 1.4227972030639648, "learning_rate": 5.1867713537949086e-08, "loss": 2.142, "step": 1200 }, { "epoch": 0.0032444378978897528, "grad_norm": 1.0749537944793701, "learning_rate": 5.40306707330262e-08, "loss": 2.1134, "step": 1250 }, { "epoch": 0.0033742154138053427, "grad_norm": 1.169149398803711, "learning_rate": 5.619362792810331e-08, "loss": 2.1215, "step": 1300 }, { "epoch": 0.003503992929720933, "grad_norm": 1.3452564477920532, "learning_rate": 5.8356585123180416e-08, "loss": 2.1524, "step": 1350 }, { "epoch": 0.003633770445636523, "grad_norm": 1.3610825538635254, "learning_rate": 6.051954231825752e-08, "loss": 2.1389, "step": 1400 }, { "epoch": 0.003763547961552113, "grad_norm": 1.2172240018844604, "learning_rate": 6.268249951333464e-08, "loss": 2.1428, "step": 1450 }, { "epoch": 0.0038933254774677034, "grad_norm": 1.1681610345840454, "learning_rate": 6.484545670841174e-08, "loss": 2.1253, "step": 1500 }, { "epoch": 0.004023102993383294, "grad_norm": 1.0709235668182373, "learning_rate": 6.700841390348886e-08, "loss": 2.1533, "step": 1550 }, { "epoch": 0.004152880509298884, "grad_norm": 1.2001404762268066, "learning_rate": 6.917137109856597e-08, "loss": 2.1371, "step": 1600 }, { "epoch": 0.004282658025214474, "grad_norm": 1.2642266750335693, "learning_rate": 7.133432829364308e-08, "loss": 2.129, "step": 1650 }, { "epoch": 0.004412435541130064, "grad_norm": 1.030280590057373, "learning_rate": 7.349728548872018e-08, "loss": 2.128, "step": 1700 }, { "epoch": 0.004542213057045654, "grad_norm": 1.4037690162658691, "learning_rate": 7.566024268379729e-08, "loss": 2.1383, "step": 1750 }, { "epoch": 0.0046719905729612436, "grad_norm": 1.1600065231323242, "learning_rate": 7.78231998788744e-08, "loss": 2.1193, "step": 1800 }, { "epoch": 0.004801768088876834, "grad_norm": 1.2075531482696533, "learning_rate": 7.998615707395151e-08, "loss": 2.1126, "step": 1850 }, { "epoch": 0.004931545604792424, "grad_norm": 1.1088111400604248, "learning_rate": 8.214911426902863e-08, "loss": 2.1394, "step": 1900 }, { "epoch": 0.005061323120708014, "grad_norm": 1.195092797279358, "learning_rate": 8.431207146410574e-08, "loss": 2.1359, "step": 1950 }, { "epoch": 0.005191100636623604, "grad_norm": 1.1595982313156128, "learning_rate": 8.647502865918283e-08, "loss": 2.1366, "step": 2000 }, { "epoch": 0.005320878152539194, "grad_norm": 1.1727768182754517, "learning_rate": 8.863798585425995e-08, "loss": 2.1106, "step": 2050 }, { "epoch": 0.005450655668454784, "grad_norm": 1.2424023151397705, "learning_rate": 9.080094304933706e-08, "loss": 2.1339, "step": 2100 }, { "epoch": 0.005580433184370375, "grad_norm": 1.2954424619674683, "learning_rate": 9.296390024441417e-08, "loss": 2.1107, "step": 2150 }, { "epoch": 0.005710210700285965, "grad_norm": 1.0388058423995972, "learning_rate": 9.512685743949129e-08, "loss": 2.118, "step": 2200 }, { "epoch": 0.005839988216201555, "grad_norm": 4.136488914489746, "learning_rate": 9.728981463456838e-08, "loss": 2.1159, "step": 2250 }, { "epoch": 0.005969765732117145, "grad_norm": 1.0597412586212158, "learning_rate": 9.945277182964549e-08, "loss": 2.1303, "step": 2300 }, { "epoch": 0.006099543248032735, "grad_norm": 1.0780799388885498, "learning_rate": 1.0161572902472261e-07, "loss": 2.1095, "step": 2350 }, { "epoch": 0.006229320763948326, "grad_norm": 1.2739732265472412, "learning_rate": 1.0377868621979972e-07, "loss": 2.1264, "step": 2400 }, { "epoch": 0.0063590982798639156, "grad_norm": 3.3836748600006104, "learning_rate": 1.0594164341487683e-07, "loss": 2.1183, "step": 2450 }, { "epoch": 0.0064888757957795055, "grad_norm": 1.269440770149231, "learning_rate": 1.0810460060995395e-07, "loss": 2.1243, "step": 2500 }, { "epoch": 0.0066186533116950955, "grad_norm": 1.1834193468093872, "learning_rate": 1.1026755780503104e-07, "loss": 2.1207, "step": 2550 }, { "epoch": 0.006748430827610685, "grad_norm": 1.2411448955535889, "learning_rate": 1.1243051500010815e-07, "loss": 2.1201, "step": 2600 }, { "epoch": 0.006878208343526275, "grad_norm": 1.1330574750900269, "learning_rate": 1.1459347219518527e-07, "loss": 2.1125, "step": 2650 }, { "epoch": 0.007007985859441866, "grad_norm": 1.199055790901184, "learning_rate": 1.1675642939026238e-07, "loss": 2.13, "step": 2700 }, { "epoch": 0.007137763375357456, "grad_norm": 1.2272251844406128, "learning_rate": 1.1891938658533949e-07, "loss": 2.1239, "step": 2750 }, { "epoch": 0.007267540891273046, "grad_norm": 1.1257809400558472, "learning_rate": 1.2108234378041658e-07, "loss": 2.12, "step": 2800 }, { "epoch": 0.007397318407188636, "grad_norm": 1.1028845310211182, "learning_rate": 1.2324530097549372e-07, "loss": 2.1227, "step": 2850 }, { "epoch": 0.007527095923104226, "grad_norm": 1.2196507453918457, "learning_rate": 1.254082581705708e-07, "loss": 2.1241, "step": 2900 }, { "epoch": 0.007656873439019816, "grad_norm": 1.0342847108840942, "learning_rate": 1.275712153656479e-07, "loss": 2.1148, "step": 2950 }, { "epoch": 0.007786650954935407, "grad_norm": 1.1587927341461182, "learning_rate": 1.2973417256072504e-07, "loss": 2.1293, "step": 3000 }, { "epoch": 0.007916428470850997, "grad_norm": 1.1751174926757812, "learning_rate": 1.3189712975580215e-07, "loss": 2.1084, "step": 3050 }, { "epoch": 0.008046205986766588, "grad_norm": 12.056378364562988, "learning_rate": 1.3406008695087926e-07, "loss": 2.103, "step": 3100 }, { "epoch": 0.008175983502682177, "grad_norm": 1.0078307390213013, "learning_rate": 1.3622304414595637e-07, "loss": 2.1345, "step": 3150 }, { "epoch": 0.008305761018597767, "grad_norm": 1.0840246677398682, "learning_rate": 1.3838600134103347e-07, "loss": 2.1151, "step": 3200 }, { "epoch": 0.008435538534513357, "grad_norm": 1.213310956954956, "learning_rate": 1.4054895853611058e-07, "loss": 2.1275, "step": 3250 }, { "epoch": 0.008565316050428947, "grad_norm": 1.2903615236282349, "learning_rate": 1.427119157311877e-07, "loss": 2.1105, "step": 3300 }, { "epoch": 0.008695093566344536, "grad_norm": 1.1746351718902588, "learning_rate": 1.448748729262648e-07, "loss": 2.1344, "step": 3350 }, { "epoch": 0.008824871082260127, "grad_norm": 1.1928184032440186, "learning_rate": 1.470378301213419e-07, "loss": 2.1278, "step": 3400 }, { "epoch": 0.008954648598175718, "grad_norm": 1.1132676601409912, "learning_rate": 1.4920078731641904e-07, "loss": 2.1134, "step": 3450 }, { "epoch": 0.009084426114091307, "grad_norm": 1.4539573192596436, "learning_rate": 1.5136374451149612e-07, "loss": 2.0943, "step": 3500 }, { "epoch": 0.009214203630006898, "grad_norm": 1.176128625869751, "learning_rate": 1.5352670170657323e-07, "loss": 2.1025, "step": 3550 }, { "epoch": 0.009343981145922487, "grad_norm": 1.0798020362854004, "learning_rate": 1.5568965890165036e-07, "loss": 2.0946, "step": 3600 }, { "epoch": 0.009473758661838078, "grad_norm": 1.1570450067520142, "learning_rate": 1.5785261609672747e-07, "loss": 2.1119, "step": 3650 }, { "epoch": 0.009603536177753669, "grad_norm": 0.9933484792709351, "learning_rate": 1.6001557329180458e-07, "loss": 2.1171, "step": 3700 }, { "epoch": 0.009733313693669258, "grad_norm": 1.1642405986785889, "learning_rate": 1.6217853048688166e-07, "loss": 2.1047, "step": 3750 }, { "epoch": 0.009863091209584849, "grad_norm": 1.266423225402832, "learning_rate": 1.643414876819588e-07, "loss": 2.0984, "step": 3800 }, { "epoch": 0.009992868725500438, "grad_norm": 1.0179153680801392, "learning_rate": 1.665044448770359e-07, "loss": 2.0977, "step": 3850 }, { "epoch": 0.010122646241416029, "grad_norm": 1.0938276052474976, "learning_rate": 1.68667402072113e-07, "loss": 2.1249, "step": 3900 }, { "epoch": 0.01025242375733162, "grad_norm": 1.1981333494186401, "learning_rate": 1.7083035926719012e-07, "loss": 2.1186, "step": 3950 }, { "epoch": 0.010382201273247208, "grad_norm": 1.238035798072815, "learning_rate": 1.7299331646226725e-07, "loss": 2.1254, "step": 4000 }, { "epoch": 0.0105119787891628, "grad_norm": 1.0942214727401733, "learning_rate": 1.7515627365734433e-07, "loss": 2.0972, "step": 4050 }, { "epoch": 0.010641756305078388, "grad_norm": 1.1418002843856812, "learning_rate": 1.7731923085242144e-07, "loss": 2.1286, "step": 4100 }, { "epoch": 0.01077153382099398, "grad_norm": 2.164984703063965, "learning_rate": 1.7948218804749858e-07, "loss": 2.1195, "step": 4150 }, { "epoch": 0.010901311336909568, "grad_norm": 1.119779109954834, "learning_rate": 1.8164514524257566e-07, "loss": 2.1242, "step": 4200 }, { "epoch": 0.011031088852825159, "grad_norm": 1.0193780660629272, "learning_rate": 1.8380810243765277e-07, "loss": 2.0951, "step": 4250 }, { "epoch": 0.01116086636874075, "grad_norm": 1.1878997087478638, "learning_rate": 1.859710596327299e-07, "loss": 2.1067, "step": 4300 }, { "epoch": 0.011290643884656339, "grad_norm": 1.048115849494934, "learning_rate": 1.8813401682780698e-07, "loss": 2.0861, "step": 4350 }, { "epoch": 0.01142042140057193, "grad_norm": 1.1059479713439941, "learning_rate": 1.9029697402288412e-07, "loss": 2.1204, "step": 4400 }, { "epoch": 0.011550198916487519, "grad_norm": 0.942563533782959, "learning_rate": 1.9245993121796122e-07, "loss": 2.1256, "step": 4450 }, { "epoch": 0.01167997643240311, "grad_norm": 3.292470932006836, "learning_rate": 1.946228884130383e-07, "loss": 2.0964, "step": 4500 }, { "epoch": 0.0118097539483187, "grad_norm": 1.131181001663208, "learning_rate": 1.9678584560811544e-07, "loss": 2.1079, "step": 4550 }, { "epoch": 0.01193953146423429, "grad_norm": 1.1684244871139526, "learning_rate": 1.9894880280319255e-07, "loss": 2.0918, "step": 4600 }, { "epoch": 0.01206930898014988, "grad_norm": 1.094470500946045, "learning_rate": 2.0111175999826965e-07, "loss": 2.1129, "step": 4650 }, { "epoch": 0.01219908649606547, "grad_norm": 1.1580520868301392, "learning_rate": 2.0327471719334676e-07, "loss": 2.1175, "step": 4700 }, { "epoch": 0.01232886401198106, "grad_norm": 1.110461711883545, "learning_rate": 2.054376743884239e-07, "loss": 2.1188, "step": 4750 }, { "epoch": 0.012458641527896651, "grad_norm": 1.1533987522125244, "learning_rate": 2.0760063158350098e-07, "loss": 2.1072, "step": 4800 }, { "epoch": 0.01258841904381224, "grad_norm": 1.3358995914459229, "learning_rate": 2.0976358877857809e-07, "loss": 2.1031, "step": 4850 }, { "epoch": 0.012718196559727831, "grad_norm": 1.100576639175415, "learning_rate": 2.119265459736552e-07, "loss": 2.0903, "step": 4900 }, { "epoch": 0.01284797407564342, "grad_norm": 1.1505376100540161, "learning_rate": 2.140895031687323e-07, "loss": 2.123, "step": 4950 }, { "epoch": 0.012977751591559011, "grad_norm": 1.063899278640747, "learning_rate": 2.1625246036380944e-07, "loss": 2.0802, "step": 5000 }, { "epoch": 0.0131075291074746, "grad_norm": 0.9469916224479675, "learning_rate": 2.1841541755888652e-07, "loss": 2.0917, "step": 5050 }, { "epoch": 0.013237306623390191, "grad_norm": 1.1475921869277954, "learning_rate": 2.2057837475396363e-07, "loss": 2.116, "step": 5100 }, { "epoch": 0.013367084139305782, "grad_norm": 1.5712428092956543, "learning_rate": 2.2274133194904076e-07, "loss": 2.1019, "step": 5150 }, { "epoch": 0.01349686165522137, "grad_norm": 1.0301058292388916, "learning_rate": 2.2490428914411784e-07, "loss": 2.0985, "step": 5200 }, { "epoch": 0.013626639171136962, "grad_norm": 1.1751989126205444, "learning_rate": 2.2706724633919495e-07, "loss": 2.1023, "step": 5250 }, { "epoch": 0.01375641668705255, "grad_norm": 2.097857713699341, "learning_rate": 2.2923020353427208e-07, "loss": 2.1226, "step": 5300 }, { "epoch": 0.013886194202968142, "grad_norm": 1.0331202745437622, "learning_rate": 2.3139316072934916e-07, "loss": 2.1205, "step": 5350 }, { "epoch": 0.014015971718883732, "grad_norm": 1.0770936012268066, "learning_rate": 2.335561179244263e-07, "loss": 2.1037, "step": 5400 }, { "epoch": 0.014145749234799321, "grad_norm": 1.1082065105438232, "learning_rate": 2.357190751195034e-07, "loss": 2.1027, "step": 5450 }, { "epoch": 0.014275526750714912, "grad_norm": 1.2101250886917114, "learning_rate": 2.378820323145805e-07, "loss": 2.1103, "step": 5500 }, { "epoch": 0.014405304266630501, "grad_norm": 1.163560152053833, "learning_rate": 2.400449895096576e-07, "loss": 2.1035, "step": 5550 }, { "epoch": 0.014535081782546092, "grad_norm": 1.1837356090545654, "learning_rate": 2.4220794670473476e-07, "loss": 2.0891, "step": 5600 }, { "epoch": 0.014664859298461683, "grad_norm": 1.2113701105117798, "learning_rate": 2.4437090389981184e-07, "loss": 2.1096, "step": 5650 }, { "epoch": 0.014794636814377272, "grad_norm": 1.1284654140472412, "learning_rate": 2.4653386109488897e-07, "loss": 2.0948, "step": 5700 }, { "epoch": 0.014924414330292863, "grad_norm": 1.4470899105072021, "learning_rate": 2.4869681828996605e-07, "loss": 2.0954, "step": 5750 }, { "epoch": 0.015054191846208452, "grad_norm": 1.0791606903076172, "learning_rate": 2.5085977548504314e-07, "loss": 2.1022, "step": 5800 }, { "epoch": 0.015183969362124043, "grad_norm": 1.1813191175460815, "learning_rate": 2.5302273268012027e-07, "loss": 2.1102, "step": 5850 }, { "epoch": 0.015313746878039632, "grad_norm": 1.1993714570999146, "learning_rate": 2.551856898751974e-07, "loss": 2.0967, "step": 5900 }, { "epoch": 0.015443524393955223, "grad_norm": 1.1809765100479126, "learning_rate": 2.573486470702745e-07, "loss": 2.1054, "step": 5950 }, { "epoch": 0.015573301909870814, "grad_norm": 1.0799180269241333, "learning_rate": 2.595116042653516e-07, "loss": 2.1144, "step": 6000 }, { "epoch": 0.015703079425786404, "grad_norm": 1.0349640846252441, "learning_rate": 2.6167456146042875e-07, "loss": 2.0978, "step": 6050 }, { "epoch": 0.015832856941701993, "grad_norm": 0.9997969269752502, "learning_rate": 2.6383751865550584e-07, "loss": 2.1274, "step": 6100 }, { "epoch": 0.015962634457617583, "grad_norm": 1.3014293909072876, "learning_rate": 2.660004758505829e-07, "loss": 2.0857, "step": 6150 }, { "epoch": 0.016092411973533175, "grad_norm": 1.1863785982131958, "learning_rate": 2.6816343304566005e-07, "loss": 2.1051, "step": 6200 }, { "epoch": 0.016222189489448764, "grad_norm": 1.6769137382507324, "learning_rate": 2.7032639024073713e-07, "loss": 2.0934, "step": 6250 }, { "epoch": 0.016351967005364353, "grad_norm": 1.0514180660247803, "learning_rate": 2.7248934743581427e-07, "loss": 2.0948, "step": 6300 }, { "epoch": 0.016481744521279942, "grad_norm": 1.0475189685821533, "learning_rate": 2.746523046308914e-07, "loss": 2.088, "step": 6350 }, { "epoch": 0.016611522037195535, "grad_norm": 2.3959105014801025, "learning_rate": 2.768152618259685e-07, "loss": 2.1053, "step": 6400 }, { "epoch": 0.016741299553111124, "grad_norm": 1.291269302368164, "learning_rate": 2.789782190210456e-07, "loss": 2.0576, "step": 6450 }, { "epoch": 0.016871077069026713, "grad_norm": 3.1083991527557373, "learning_rate": 2.811411762161227e-07, "loss": 2.103, "step": 6500 }, { "epoch": 0.017000854584942306, "grad_norm": 1.2403531074523926, "learning_rate": 2.833041334111998e-07, "loss": 2.1099, "step": 6550 }, { "epoch": 0.017130632100857895, "grad_norm": 1.0552589893341064, "learning_rate": 2.854670906062769e-07, "loss": 2.1089, "step": 6600 }, { "epoch": 0.017260409616773484, "grad_norm": 1.15003502368927, "learning_rate": 2.8763004780135405e-07, "loss": 2.1039, "step": 6650 }, { "epoch": 0.017390187132689073, "grad_norm": 1.0832091569900513, "learning_rate": 2.8979300499643113e-07, "loss": 2.1084, "step": 6700 }, { "epoch": 0.017519964648604665, "grad_norm": 1.3141324520111084, "learning_rate": 2.9195596219150826e-07, "loss": 2.1135, "step": 6750 }, { "epoch": 0.017649742164520255, "grad_norm": 1.1030374765396118, "learning_rate": 2.941189193865854e-07, "loss": 2.1015, "step": 6800 }, { "epoch": 0.017779519680435844, "grad_norm": 1.1264283657073975, "learning_rate": 2.962818765816625e-07, "loss": 2.1046, "step": 6850 }, { "epoch": 0.017909297196351436, "grad_norm": 1.0352332592010498, "learning_rate": 2.9844483377673956e-07, "loss": 2.1123, "step": 6900 }, { "epoch": 0.018039074712267025, "grad_norm": 1.6451269388198853, "learning_rate": 3.006077909718167e-07, "loss": 2.0875, "step": 6950 }, { "epoch": 0.018168852228182614, "grad_norm": 1.0955110788345337, "learning_rate": 3.027707481668938e-07, "loss": 2.0838, "step": 7000 }, { "epoch": 0.018298629744098207, "grad_norm": 1.128531813621521, "learning_rate": 3.049337053619709e-07, "loss": 2.1029, "step": 7050 }, { "epoch": 0.018428407260013796, "grad_norm": 1.0489044189453125, "learning_rate": 3.0709666255704805e-07, "loss": 2.0908, "step": 7100 }, { "epoch": 0.018558184775929385, "grad_norm": 1.0876027345657349, "learning_rate": 3.0925961975212513e-07, "loss": 2.0817, "step": 7150 }, { "epoch": 0.018687962291844974, "grad_norm": 1.025060772895813, "learning_rate": 3.1142257694720226e-07, "loss": 2.0924, "step": 7200 }, { "epoch": 0.018817739807760567, "grad_norm": 1.3133209943771362, "learning_rate": 3.1358553414227934e-07, "loss": 2.0848, "step": 7250 }, { "epoch": 0.018947517323676156, "grad_norm": 1.159995436668396, "learning_rate": 3.157484913373565e-07, "loss": 2.0938, "step": 7300 }, { "epoch": 0.019077294839591745, "grad_norm": 1.151329755783081, "learning_rate": 3.1791144853243356e-07, "loss": 2.1002, "step": 7350 }, { "epoch": 0.019207072355507337, "grad_norm": 1.123695731163025, "learning_rate": 3.200744057275107e-07, "loss": 2.0951, "step": 7400 }, { "epoch": 0.019336849871422927, "grad_norm": 1.143547534942627, "learning_rate": 3.222373629225878e-07, "loss": 2.1034, "step": 7450 }, { "epoch": 0.019466627387338516, "grad_norm": 1.093329906463623, "learning_rate": 3.2440032011766486e-07, "loss": 2.0921, "step": 7500 }, { "epoch": 0.019596404903254105, "grad_norm": 1.3572251796722412, "learning_rate": 3.2656327731274204e-07, "loss": 2.0902, "step": 7550 }, { "epoch": 0.019726182419169697, "grad_norm": 1.146531343460083, "learning_rate": 3.287262345078191e-07, "loss": 2.0899, "step": 7600 }, { "epoch": 0.019855959935085286, "grad_norm": 1.0585743188858032, "learning_rate": 3.308891917028962e-07, "loss": 2.108, "step": 7650 }, { "epoch": 0.019985737451000875, "grad_norm": 1.1923290491104126, "learning_rate": 3.3305214889797334e-07, "loss": 2.098, "step": 7700 }, { "epoch": 0.020115514966916468, "grad_norm": 1.1357568502426147, "learning_rate": 3.352151060930504e-07, "loss": 2.1021, "step": 7750 }, { "epoch": 0.020245292482832057, "grad_norm": 1.2182716131210327, "learning_rate": 3.373780632881275e-07, "loss": 2.0816, "step": 7800 }, { "epoch": 0.020375069998747646, "grad_norm": 1.1091363430023193, "learning_rate": 3.395410204832047e-07, "loss": 2.0832, "step": 7850 }, { "epoch": 0.02050484751466324, "grad_norm": 1.1325336694717407, "learning_rate": 3.4170397767828177e-07, "loss": 2.0872, "step": 7900 }, { "epoch": 0.020634625030578828, "grad_norm": 1.020922064781189, "learning_rate": 3.4386693487335885e-07, "loss": 2.0798, "step": 7950 }, { "epoch": 0.020764402546494417, "grad_norm": 1.1414934396743774, "learning_rate": 3.46029892068436e-07, "loss": 2.0745, "step": 8000 }, { "epoch": 0.020894180062410006, "grad_norm": 1.1155861616134644, "learning_rate": 3.481928492635131e-07, "loss": 2.0975, "step": 8050 }, { "epoch": 0.0210239575783256, "grad_norm": 0.9747071266174316, "learning_rate": 3.503558064585902e-07, "loss": 2.0796, "step": 8100 }, { "epoch": 0.021153735094241188, "grad_norm": 1.2895739078521729, "learning_rate": 3.5251876365366734e-07, "loss": 2.0951, "step": 8150 }, { "epoch": 0.021283512610156777, "grad_norm": 1.147414207458496, "learning_rate": 3.546817208487444e-07, "loss": 2.0955, "step": 8200 }, { "epoch": 0.02141329012607237, "grad_norm": 1.1811184883117676, "learning_rate": 3.568446780438215e-07, "loss": 2.0895, "step": 8250 }, { "epoch": 0.02154306764198796, "grad_norm": 1.2630963325500488, "learning_rate": 3.590076352388987e-07, "loss": 2.0668, "step": 8300 }, { "epoch": 0.021672845157903547, "grad_norm": 1.0883618593215942, "learning_rate": 3.6117059243397577e-07, "loss": 2.0981, "step": 8350 }, { "epoch": 0.021802622673819137, "grad_norm": 1.2500261068344116, "learning_rate": 3.6333354962905285e-07, "loss": 2.0911, "step": 8400 }, { "epoch": 0.02193240018973473, "grad_norm": 1.133091926574707, "learning_rate": 3.6549650682413e-07, "loss": 2.082, "step": 8450 }, { "epoch": 0.022062177705650318, "grad_norm": 1.1500440835952759, "learning_rate": 3.6765946401920707e-07, "loss": 2.091, "step": 8500 }, { "epoch": 0.022191955221565907, "grad_norm": 1.1573790311813354, "learning_rate": 3.6982242121428415e-07, "loss": 2.091, "step": 8550 }, { "epoch": 0.0223217327374815, "grad_norm": 1.0022162199020386, "learning_rate": 3.7198537840936134e-07, "loss": 2.0734, "step": 8600 }, { "epoch": 0.02245151025339709, "grad_norm": 1.2101961374282837, "learning_rate": 3.741483356044384e-07, "loss": 2.0976, "step": 8650 }, { "epoch": 0.022581287769312678, "grad_norm": 1.1183929443359375, "learning_rate": 3.763112927995155e-07, "loss": 2.0742, "step": 8700 }, { "epoch": 0.02271106528522827, "grad_norm": 1.1698428392410278, "learning_rate": 3.784742499945927e-07, "loss": 2.1189, "step": 8750 }, { "epoch": 0.02284084280114386, "grad_norm": 1.238348126411438, "learning_rate": 3.8063720718966977e-07, "loss": 2.0864, "step": 8800 }, { "epoch": 0.02297062031705945, "grad_norm": 1.0891568660736084, "learning_rate": 3.8280016438474685e-07, "loss": 2.071, "step": 8850 }, { "epoch": 0.023100397832975038, "grad_norm": 1.0950003862380981, "learning_rate": 3.84963121579824e-07, "loss": 2.0944, "step": 8900 }, { "epoch": 0.02323017534889063, "grad_norm": 1.0031663179397583, "learning_rate": 3.8712607877490106e-07, "loss": 2.0688, "step": 8950 }, { "epoch": 0.02335995286480622, "grad_norm": 1.1025946140289307, "learning_rate": 3.8928903596997815e-07, "loss": 2.0853, "step": 9000 }, { "epoch": 0.02348973038072181, "grad_norm": 1.0795261859893799, "learning_rate": 3.9145199316505533e-07, "loss": 2.0813, "step": 9050 }, { "epoch": 0.0236195078966374, "grad_norm": 1.1669412851333618, "learning_rate": 3.936149503601324e-07, "loss": 2.0802, "step": 9100 }, { "epoch": 0.02374928541255299, "grad_norm": 1.186626672744751, "learning_rate": 3.957779075552095e-07, "loss": 2.1015, "step": 9150 }, { "epoch": 0.02387906292846858, "grad_norm": 1.053902506828308, "learning_rate": 3.9794086475028663e-07, "loss": 2.1003, "step": 9200 }, { "epoch": 0.02400884044438417, "grad_norm": 1.1948777437210083, "learning_rate": 4.001038219453637e-07, "loss": 2.0914, "step": 9250 }, { "epoch": 0.02413861796029976, "grad_norm": 1.0830193758010864, "learning_rate": 4.0226677914044085e-07, "loss": 2.0892, "step": 9300 }, { "epoch": 0.02426839547621535, "grad_norm": 1.0737528800964355, "learning_rate": 4.04429736335518e-07, "loss": 2.0949, "step": 9350 }, { "epoch": 0.02439817299213094, "grad_norm": 1.2443790435791016, "learning_rate": 4.0659269353059506e-07, "loss": 2.0725, "step": 9400 }, { "epoch": 0.02452795050804653, "grad_norm": 0.9910159111022949, "learning_rate": 4.0875565072567214e-07, "loss": 2.092, "step": 9450 }, { "epoch": 0.02465772802396212, "grad_norm": 1.1105308532714844, "learning_rate": 4.1091860792074933e-07, "loss": 2.0894, "step": 9500 }, { "epoch": 0.02478750553987771, "grad_norm": 1.3401215076446533, "learning_rate": 4.130815651158264e-07, "loss": 2.091, "step": 9550 }, { "epoch": 0.024917283055793302, "grad_norm": 1.1136138439178467, "learning_rate": 4.152445223109035e-07, "loss": 2.1022, "step": 9600 }, { "epoch": 0.02504706057170889, "grad_norm": 1.1129764318466187, "learning_rate": 4.1740747950598063e-07, "loss": 2.0841, "step": 9650 }, { "epoch": 0.02517683808762448, "grad_norm": 1.1361297369003296, "learning_rate": 4.195704367010577e-07, "loss": 2.0987, "step": 9700 }, { "epoch": 0.02530661560354007, "grad_norm": 1.2290136814117432, "learning_rate": 4.217333938961348e-07, "loss": 2.0967, "step": 9750 }, { "epoch": 0.025436393119455662, "grad_norm": 1.1932119131088257, "learning_rate": 4.23896351091212e-07, "loss": 2.1018, "step": 9800 }, { "epoch": 0.02556617063537125, "grad_norm": 1.1398112773895264, "learning_rate": 4.2605930828628906e-07, "loss": 2.076, "step": 9850 }, { "epoch": 0.02569594815128684, "grad_norm": 1.255175232887268, "learning_rate": 4.2822226548136614e-07, "loss": 2.0979, "step": 9900 }, { "epoch": 0.025825725667202433, "grad_norm": 1.063835620880127, "learning_rate": 4.303852226764433e-07, "loss": 2.0982, "step": 9950 }, { "epoch": 0.025955503183118022, "grad_norm": 1.0199131965637207, "learning_rate": 4.3254817987152036e-07, "loss": 2.077, "step": 10000 }, { "epoch": 0.02608528069903361, "grad_norm": 1.2938398122787476, "learning_rate": 4.347111370665975e-07, "loss": 2.0966, "step": 10050 }, { "epoch": 0.0262150582149492, "grad_norm": 1.2516087293624878, "learning_rate": 4.368740942616746e-07, "loss": 2.0996, "step": 10100 }, { "epoch": 0.026344835730864793, "grad_norm": 1.140458345413208, "learning_rate": 4.390370514567517e-07, "loss": 2.0862, "step": 10150 }, { "epoch": 0.026474613246780382, "grad_norm": 1.245771884918213, "learning_rate": 4.412000086518288e-07, "loss": 2.095, "step": 10200 }, { "epoch": 0.02660439076269597, "grad_norm": 1.1775400638580322, "learning_rate": 4.43362965846906e-07, "loss": 2.0818, "step": 10250 }, { "epoch": 0.026734168278611564, "grad_norm": 1.043639898300171, "learning_rate": 4.4552592304198306e-07, "loss": 2.075, "step": 10300 }, { "epoch": 0.026863945794527153, "grad_norm": 1.0813723802566528, "learning_rate": 4.4768888023706014e-07, "loss": 2.0525, "step": 10350 }, { "epoch": 0.02699372331044274, "grad_norm": 1.0008471012115479, "learning_rate": 4.4985183743213727e-07, "loss": 2.0835, "step": 10400 }, { "epoch": 0.027123500826358334, "grad_norm": 1.2055691480636597, "learning_rate": 4.5201479462721435e-07, "loss": 2.0877, "step": 10450 }, { "epoch": 0.027253278342273923, "grad_norm": 1.2838592529296875, "learning_rate": 4.5417775182229143e-07, "loss": 2.0844, "step": 10500 }, { "epoch": 0.027383055858189512, "grad_norm": 1.2694274187088013, "learning_rate": 4.563407090173686e-07, "loss": 2.0791, "step": 10550 }, { "epoch": 0.0275128333741051, "grad_norm": 1.1208597421646118, "learning_rate": 4.585036662124457e-07, "loss": 2.0846, "step": 10600 }, { "epoch": 0.027642610890020694, "grad_norm": 1.0968207120895386, "learning_rate": 4.606666234075228e-07, "loss": 2.0834, "step": 10650 }, { "epoch": 0.027772388405936283, "grad_norm": 0.9584913849830627, "learning_rate": 4.628295806025999e-07, "loss": 2.0792, "step": 10700 }, { "epoch": 0.027902165921851872, "grad_norm": 1.19157874584198, "learning_rate": 4.6499253779767705e-07, "loss": 2.0895, "step": 10750 }, { "epoch": 0.028031943437767465, "grad_norm": 1.1074448823928833, "learning_rate": 4.6715549499275413e-07, "loss": 2.0725, "step": 10800 }, { "epoch": 0.028161720953683054, "grad_norm": 1.2112274169921875, "learning_rate": 4.6931845218783127e-07, "loss": 2.097, "step": 10850 }, { "epoch": 0.028291498469598643, "grad_norm": 1.1916898488998413, "learning_rate": 4.7148140938290835e-07, "loss": 2.0703, "step": 10900 }, { "epoch": 0.028421275985514232, "grad_norm": 1.0195530652999878, "learning_rate": 4.7364436657798543e-07, "loss": 2.1164, "step": 10950 }, { "epoch": 0.028551053501429825, "grad_norm": 1.1741176843643188, "learning_rate": 4.758073237730626e-07, "loss": 2.0762, "step": 11000 }, { "epoch": 0.028680831017345414, "grad_norm": 1.100441336631775, "learning_rate": 4.779702809681397e-07, "loss": 2.0843, "step": 11050 }, { "epoch": 0.028810608533261003, "grad_norm": 1.2614903450012207, "learning_rate": 4.801332381632167e-07, "loss": 2.0912, "step": 11100 }, { "epoch": 0.028940386049176595, "grad_norm": 1.2177016735076904, "learning_rate": 4.82296195358294e-07, "loss": 2.0872, "step": 11150 }, { "epoch": 0.029070163565092184, "grad_norm": 1.0506807565689087, "learning_rate": 4.84459152553371e-07, "loss": 2.0469, "step": 11200 }, { "epoch": 0.029199941081007773, "grad_norm": 1.0414236783981323, "learning_rate": 4.866221097484481e-07, "loss": 2.0683, "step": 11250 }, { "epoch": 0.029329718596923366, "grad_norm": 1.0935665369033813, "learning_rate": 4.887850669435253e-07, "loss": 2.0809, "step": 11300 }, { "epoch": 0.029459496112838955, "grad_norm": 1.0495206117630005, "learning_rate": 4.909480241386023e-07, "loss": 2.0788, "step": 11350 }, { "epoch": 0.029589273628754544, "grad_norm": 1.3227081298828125, "learning_rate": 4.931109813336794e-07, "loss": 2.0858, "step": 11400 }, { "epoch": 0.029719051144670133, "grad_norm": 1.1815470457077026, "learning_rate": 4.952739385287566e-07, "loss": 2.0832, "step": 11450 }, { "epoch": 0.029848828660585726, "grad_norm": 1.1781071424484253, "learning_rate": 4.974368957238337e-07, "loss": 2.0796, "step": 11500 }, { "epoch": 0.029978606176501315, "grad_norm": 1.2186251878738403, "learning_rate": 4.995998529189107e-07, "loss": 2.0742, "step": 11550 }, { "epoch": 0.030108383692416904, "grad_norm": 1.1277467012405396, "learning_rate": 5.01762810113988e-07, "loss": 2.0745, "step": 11600 }, { "epoch": 0.030238161208332497, "grad_norm": 1.1841695308685303, "learning_rate": 5.03925767309065e-07, "loss": 2.0911, "step": 11650 }, { "epoch": 0.030367938724248086, "grad_norm": 1.2147952318191528, "learning_rate": 5.060887245041421e-07, "loss": 2.0719, "step": 11700 }, { "epoch": 0.030497716240163675, "grad_norm": 1.1648039817810059, "learning_rate": 5.082516816992193e-07, "loss": 2.0884, "step": 11750 }, { "epoch": 0.030627493756079264, "grad_norm": 1.2530500888824463, "learning_rate": 5.104146388942963e-07, "loss": 2.0804, "step": 11800 }, { "epoch": 0.030757271271994856, "grad_norm": 1.3491883277893066, "learning_rate": 5.125775960893734e-07, "loss": 2.0872, "step": 11850 }, { "epoch": 0.030887048787910446, "grad_norm": 1.2144231796264648, "learning_rate": 5.147405532844506e-07, "loss": 2.0628, "step": 11900 }, { "epoch": 0.031016826303826035, "grad_norm": 1.2478595972061157, "learning_rate": 5.169035104795277e-07, "loss": 2.0756, "step": 11950 }, { "epoch": 0.031146603819741627, "grad_norm": 1.0973178148269653, "learning_rate": 5.190664676746047e-07, "loss": 2.0966, "step": 12000 }, { "epoch": 0.031276381335657216, "grad_norm": 1.1409740447998047, "learning_rate": 5.212294248696819e-07, "loss": 2.0812, "step": 12050 }, { "epoch": 0.03140615885157281, "grad_norm": 1.206933617591858, "learning_rate": 5.23392382064759e-07, "loss": 2.1006, "step": 12100 }, { "epoch": 0.031535936367488394, "grad_norm": 1.085492491722107, "learning_rate": 5.255553392598361e-07, "loss": 2.1002, "step": 12150 }, { "epoch": 0.03166571388340399, "grad_norm": 1.0168064832687378, "learning_rate": 5.277182964549132e-07, "loss": 2.066, "step": 12200 }, { "epoch": 0.03179549139931958, "grad_norm": 1.0635970830917358, "learning_rate": 5.298812536499903e-07, "loss": 2.0697, "step": 12250 }, { "epoch": 0.031925268915235165, "grad_norm": 1.1196577548980713, "learning_rate": 5.320442108450674e-07, "loss": 2.0862, "step": 12300 }, { "epoch": 0.03205504643115076, "grad_norm": 1.4039437770843506, "learning_rate": 5.342071680401445e-07, "loss": 2.0736, "step": 12350 }, { "epoch": 0.03218482394706635, "grad_norm": 1.1052871942520142, "learning_rate": 5.363701252352217e-07, "loss": 2.0877, "step": 12400 }, { "epoch": 0.032314601462981936, "grad_norm": 0.966698408126831, "learning_rate": 5.385330824302987e-07, "loss": 2.0866, "step": 12450 }, { "epoch": 0.03244437897889753, "grad_norm": 0.9846018552780151, "learning_rate": 5.406960396253759e-07, "loss": 2.0717, "step": 12500 }, { "epoch": 0.032574156494813114, "grad_norm": 1.1824718713760376, "learning_rate": 5.42858996820453e-07, "loss": 2.0651, "step": 12550 }, { "epoch": 0.03270393401072871, "grad_norm": 1.1409893035888672, "learning_rate": 5.4502195401553e-07, "loss": 2.0904, "step": 12600 }, { "epoch": 0.0328337115266443, "grad_norm": 1.079150676727295, "learning_rate": 5.471849112106072e-07, "loss": 2.0842, "step": 12650 }, { "epoch": 0.032963489042559885, "grad_norm": 1.0430059432983398, "learning_rate": 5.493478684056843e-07, "loss": 2.0869, "step": 12700 }, { "epoch": 0.03309326655847548, "grad_norm": 1.085353136062622, "learning_rate": 5.515108256007614e-07, "loss": 2.0703, "step": 12750 }, { "epoch": 0.03322304407439107, "grad_norm": 1.1028053760528564, "learning_rate": 5.536737827958384e-07, "loss": 2.0804, "step": 12800 }, { "epoch": 0.033352821590306655, "grad_norm": 1.4301245212554932, "learning_rate": 5.558367399909157e-07, "loss": 2.0934, "step": 12850 }, { "epoch": 0.03348259910622225, "grad_norm": 1.1223058700561523, "learning_rate": 5.579996971859927e-07, "loss": 2.0927, "step": 12900 }, { "epoch": 0.03361237662213784, "grad_norm": 1.0447497367858887, "learning_rate": 5.601626543810699e-07, "loss": 2.0792, "step": 12950 }, { "epoch": 0.033742154138053426, "grad_norm": 1.1141220331192017, "learning_rate": 5.62325611576147e-07, "loss": 2.0661, "step": 13000 }, { "epoch": 0.03387193165396902, "grad_norm": 1.146681547164917, "learning_rate": 5.64488568771224e-07, "loss": 2.0906, "step": 13050 }, { "epoch": 0.03400170916988461, "grad_norm": 1.0003130435943604, "learning_rate": 5.666515259663012e-07, "loss": 2.0913, "step": 13100 }, { "epoch": 0.0341314866858002, "grad_norm": 1.2144840955734253, "learning_rate": 5.688144831613783e-07, "loss": 2.0802, "step": 13150 }, { "epoch": 0.03426126420171579, "grad_norm": 1.11890709400177, "learning_rate": 5.709774403564554e-07, "loss": 2.0709, "step": 13200 }, { "epoch": 0.03439104171763138, "grad_norm": 1.1874761581420898, "learning_rate": 5.731403975515324e-07, "loss": 2.0885, "step": 13250 }, { "epoch": 0.03452081923354697, "grad_norm": 1.155839443206787, "learning_rate": 5.753033547466096e-07, "loss": 2.0676, "step": 13300 }, { "epoch": 0.03465059674946256, "grad_norm": 1.1086000204086304, "learning_rate": 5.774663119416867e-07, "loss": 2.0721, "step": 13350 }, { "epoch": 0.034780374265378146, "grad_norm": 1.3329235315322876, "learning_rate": 5.796292691367639e-07, "loss": 2.0794, "step": 13400 }, { "epoch": 0.03491015178129374, "grad_norm": 1.0079444646835327, "learning_rate": 5.81792226331841e-07, "loss": 2.0946, "step": 13450 }, { "epoch": 0.03503992929720933, "grad_norm": 1.1033422946929932, "learning_rate": 5.83955183526918e-07, "loss": 2.0648, "step": 13500 }, { "epoch": 0.03516970681312492, "grad_norm": 1.1726226806640625, "learning_rate": 5.861181407219951e-07, "loss": 2.0772, "step": 13550 }, { "epoch": 0.03529948432904051, "grad_norm": 1.2747085094451904, "learning_rate": 5.882810979170723e-07, "loss": 2.0655, "step": 13600 }, { "epoch": 0.0354292618449561, "grad_norm": 1.3106458187103271, "learning_rate": 5.904440551121494e-07, "loss": 2.0703, "step": 13650 }, { "epoch": 0.03555903936087169, "grad_norm": 1.2663166522979736, "learning_rate": 5.926070123072264e-07, "loss": 2.0841, "step": 13700 }, { "epoch": 0.03568881687678728, "grad_norm": 1.0045231580734253, "learning_rate": 5.947699695023036e-07, "loss": 2.0815, "step": 13750 }, { "epoch": 0.03581859439270287, "grad_norm": 1.0900031328201294, "learning_rate": 5.969329266973807e-07, "loss": 2.0818, "step": 13800 }, { "epoch": 0.03594837190861846, "grad_norm": 1.065185308456421, "learning_rate": 5.990958838924577e-07, "loss": 2.0734, "step": 13850 }, { "epoch": 0.03607814942453405, "grad_norm": 1.117867112159729, "learning_rate": 6.01258841087535e-07, "loss": 2.065, "step": 13900 }, { "epoch": 0.03620792694044964, "grad_norm": 1.092624306678772, "learning_rate": 6.03421798282612e-07, "loss": 2.0847, "step": 13950 }, { "epoch": 0.03633770445636523, "grad_norm": 1.2159847021102905, "learning_rate": 6.055847554776891e-07, "loss": 2.0553, "step": 14000 }, { "epoch": 0.03646748197228082, "grad_norm": 1.0683104991912842, "learning_rate": 6.077477126727663e-07, "loss": 2.0834, "step": 14050 }, { "epoch": 0.036597259488196414, "grad_norm": 1.0531790256500244, "learning_rate": 6.099106698678434e-07, "loss": 2.0854, "step": 14100 }, { "epoch": 0.036727037004112, "grad_norm": 1.2325146198272705, "learning_rate": 6.120736270629204e-07, "loss": 2.0813, "step": 14150 }, { "epoch": 0.03685681452002759, "grad_norm": 1.091143012046814, "learning_rate": 6.142365842579976e-07, "loss": 2.0767, "step": 14200 }, { "epoch": 0.03698659203594318, "grad_norm": 1.235277533531189, "learning_rate": 6.163995414530747e-07, "loss": 2.1023, "step": 14250 }, { "epoch": 0.03711636955185877, "grad_norm": 1.065708041191101, "learning_rate": 6.185624986481517e-07, "loss": 2.0525, "step": 14300 }, { "epoch": 0.03724614706777436, "grad_norm": 1.121060848236084, "learning_rate": 6.20725455843229e-07, "loss": 2.086, "step": 14350 }, { "epoch": 0.03737592458368995, "grad_norm": 1.087768316268921, "learning_rate": 6.22888413038306e-07, "loss": 2.0788, "step": 14400 }, { "epoch": 0.03750570209960554, "grad_norm": 1.1760495901107788, "learning_rate": 6.250513702333831e-07, "loss": 2.0599, "step": 14450 }, { "epoch": 0.037635479615521134, "grad_norm": 1.1613560914993286, "learning_rate": 6.272143274284602e-07, "loss": 2.0784, "step": 14500 }, { "epoch": 0.03776525713143672, "grad_norm": 1.0611546039581299, "learning_rate": 6.293772846235373e-07, "loss": 2.0622, "step": 14550 }, { "epoch": 0.03789503464735231, "grad_norm": 1.23395836353302, "learning_rate": 6.315402418186145e-07, "loss": 2.0801, "step": 14600 }, { "epoch": 0.038024812163267904, "grad_norm": 1.3746421337127686, "learning_rate": 6.337031990136916e-07, "loss": 2.0865, "step": 14650 }, { "epoch": 0.03815458967918349, "grad_norm": 1.2074872255325317, "learning_rate": 6.358661562087687e-07, "loss": 2.0814, "step": 14700 }, { "epoch": 0.03828436719509908, "grad_norm": 1.3351000547409058, "learning_rate": 6.380291134038457e-07, "loss": 2.072, "step": 14750 }, { "epoch": 0.038414144711014675, "grad_norm": 1.2587641477584839, "learning_rate": 6.401920705989229e-07, "loss": 2.0674, "step": 14800 }, { "epoch": 0.03854392222693026, "grad_norm": 1.3082313537597656, "learning_rate": 6.423550277939999e-07, "loss": 2.0591, "step": 14850 }, { "epoch": 0.03867369974284585, "grad_norm": 1.0227408409118652, "learning_rate": 6.445179849890771e-07, "loss": 2.0622, "step": 14900 }, { "epoch": 0.038803477258761446, "grad_norm": 1.067315697669983, "learning_rate": 6.466809421841543e-07, "loss": 2.0742, "step": 14950 }, { "epoch": 0.03893325477467703, "grad_norm": 1.0379241704940796, "learning_rate": 6.488438993792313e-07, "loss": 2.0816, "step": 15000 } ], "logging_steps": 50, "max_steps": 1155822, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.232716578534195e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }