{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.988452655889146, "eval_steps": 500, "global_step": 4325, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.023094688221709007, "grad_norm": 10.050519943237305, "learning_rate": 9.216589861751153e-06, "loss": 1.0509, "step": 10 }, { "epoch": 0.046189376443418015, "grad_norm": 2.694617986679077, "learning_rate": 1.8433179723502307e-05, "loss": 0.6955, "step": 20 }, { "epoch": 0.06928406466512702, "grad_norm": 2.627452850341797, "learning_rate": 2.764976958525346e-05, "loss": 0.3561, "step": 30 }, { "epoch": 0.09237875288683603, "grad_norm": 1.1433345079421997, "learning_rate": 3.6866359447004614e-05, "loss": 0.2924, "step": 40 }, { "epoch": 0.11547344110854503, "grad_norm": 1.5595552921295166, "learning_rate": 4.608294930875576e-05, "loss": 0.2146, "step": 50 }, { "epoch": 0.13856812933025403, "grad_norm": 2.5837883949279785, "learning_rate": 5.529953917050692e-05, "loss": 0.1891, "step": 60 }, { "epoch": 0.16166281755196305, "grad_norm": 0.9749034643173218, "learning_rate": 6.451612903225807e-05, "loss": 0.1584, "step": 70 }, { "epoch": 0.18475750577367206, "grad_norm": 0.9678087830543518, "learning_rate": 7.373271889400923e-05, "loss": 0.1337, "step": 80 }, { "epoch": 0.20785219399538107, "grad_norm": 1.4949584007263184, "learning_rate": 8.294930875576037e-05, "loss": 0.1235, "step": 90 }, { "epoch": 0.23094688221709006, "grad_norm": 0.9887685179710388, "learning_rate": 9.216589861751152e-05, "loss": 0.1061, "step": 100 }, { "epoch": 0.2540415704387991, "grad_norm": 0.6556084752082825, "learning_rate": 0.00010138248847926268, "loss": 0.1055, "step": 110 }, { "epoch": 0.27713625866050806, "grad_norm": 0.6040177345275879, "learning_rate": 0.00011059907834101384, "loss": 0.0893, "step": 120 }, { "epoch": 0.3002309468822171, "grad_norm": 0.5599756240844727, "learning_rate": 0.00011981566820276497, "loss": 0.0784, "step": 130 }, { "epoch": 0.3233256351039261, "grad_norm": 0.5788015723228455, "learning_rate": 0.00012903225806451613, "loss": 0.0857, "step": 140 }, { "epoch": 0.3464203233256351, "grad_norm": 0.49766597151756287, "learning_rate": 0.00013824884792626728, "loss": 0.0811, "step": 150 }, { "epoch": 0.3695150115473441, "grad_norm": 1.4221667051315308, "learning_rate": 0.00014746543778801845, "loss": 0.0786, "step": 160 }, { "epoch": 0.39260969976905313, "grad_norm": 0.906166136264801, "learning_rate": 0.0001566820276497696, "loss": 0.0734, "step": 170 }, { "epoch": 0.41570438799076215, "grad_norm": 0.7869032025337219, "learning_rate": 0.00016589861751152075, "loss": 0.0666, "step": 180 }, { "epoch": 0.4387990762124711, "grad_norm": 0.7141720652580261, "learning_rate": 0.0001751152073732719, "loss": 0.0667, "step": 190 }, { "epoch": 0.4618937644341801, "grad_norm": 0.538352370262146, "learning_rate": 0.00018433179723502304, "loss": 0.0659, "step": 200 }, { "epoch": 0.48498845265588914, "grad_norm": 0.5275980234146118, "learning_rate": 0.00019354838709677422, "loss": 0.0636, "step": 210 }, { "epoch": 0.5080831408775982, "grad_norm": 0.6203627586364746, "learning_rate": 0.00019999973682102037, "loss": 0.0619, "step": 220 }, { "epoch": 0.5311778290993071, "grad_norm": 0.48453882336616516, "learning_rate": 0.00019999505812214085, "loss": 0.062, "step": 230 }, { "epoch": 0.5542725173210161, "grad_norm": 0.46422210335731506, "learning_rate": 0.00019998453131645004, "loss": 0.0623, "step": 240 }, { "epoch": 0.5773672055427251, "grad_norm": 0.8341119289398193, "learning_rate": 0.00019996815701959716, "loss": 0.0545, "step": 250 }, { "epoch": 0.6004618937644342, "grad_norm": 0.6169431805610657, "learning_rate": 0.00019994593618921592, "loss": 0.0594, "step": 260 }, { "epoch": 0.6235565819861432, "grad_norm": 0.6459381580352783, "learning_rate": 0.00019991787012486843, "loss": 0.0562, "step": 270 }, { "epoch": 0.6466512702078522, "grad_norm": 0.5904170274734497, "learning_rate": 0.0001998839604679692, "loss": 0.0529, "step": 280 }, { "epoch": 0.6697459584295612, "grad_norm": 0.5025797486305237, "learning_rate": 0.00019984420920168928, "loss": 0.0501, "step": 290 }, { "epoch": 0.6928406466512702, "grad_norm": 0.6631583571434021, "learning_rate": 0.00019979861865083995, "loss": 0.0583, "step": 300 }, { "epoch": 0.7159353348729792, "grad_norm": 0.5108867883682251, "learning_rate": 0.00019974719148173716, "loss": 0.0551, "step": 310 }, { "epoch": 0.7390300230946882, "grad_norm": 0.30322328209877014, "learning_rate": 0.00019968993070204534, "loss": 0.0444, "step": 320 }, { "epoch": 0.7621247113163973, "grad_norm": 0.5695562958717346, "learning_rate": 0.0001996268396606015, "loss": 0.0503, "step": 330 }, { "epoch": 0.7852193995381063, "grad_norm": 0.5680237412452698, "learning_rate": 0.0001995579220472195, "loss": 0.0607, "step": 340 }, { "epoch": 0.8083140877598153, "grad_norm": 0.6361756920814514, "learning_rate": 0.00019948318189247415, "loss": 0.0534, "step": 350 }, { "epoch": 0.8314087759815243, "grad_norm": 0.30916863679885864, "learning_rate": 0.00019940262356746554, "loss": 0.0545, "step": 360 }, { "epoch": 0.8545034642032333, "grad_norm": 0.6371238827705383, "learning_rate": 0.00019931625178356343, "loss": 0.0536, "step": 370 }, { "epoch": 0.8775981524249422, "grad_norm": 0.42440804839134216, "learning_rate": 0.0001992240715921316, "loss": 0.0476, "step": 380 }, { "epoch": 0.9006928406466512, "grad_norm": 0.42693522572517395, "learning_rate": 0.0001991260883842325, "loss": 0.049, "step": 390 }, { "epoch": 0.9237875288683602, "grad_norm": 0.36199280619621277, "learning_rate": 0.00019902230789031207, "loss": 0.0479, "step": 400 }, { "epoch": 0.9468822170900693, "grad_norm": 0.3671620786190033, "learning_rate": 0.0001989127361798643, "loss": 0.0496, "step": 410 }, { "epoch": 0.9699769053117783, "grad_norm": 0.524524450302124, "learning_rate": 0.00019879737966107654, "loss": 0.0493, "step": 420 }, { "epoch": 0.9930715935334873, "grad_norm": 0.5563727021217346, "learning_rate": 0.0001986762450804547, "loss": 0.0501, "step": 430 }, { "epoch": 1.0161662817551964, "grad_norm": 0.4217710793018341, "learning_rate": 0.00019854933952242844, "loss": 0.0432, "step": 440 }, { "epoch": 1.0392609699769053, "grad_norm": 0.3949870765209198, "learning_rate": 0.00019841667040893722, "loss": 0.0467, "step": 450 }, { "epoch": 1.0623556581986142, "grad_norm": 0.4789281189441681, "learning_rate": 0.0001982782454989959, "loss": 0.0441, "step": 460 }, { "epoch": 1.0854503464203233, "grad_norm": 0.42184802889823914, "learning_rate": 0.00019813407288824113, "loss": 0.043, "step": 470 }, { "epoch": 1.1085450346420322, "grad_norm": 0.40358513593673706, "learning_rate": 0.0001979841610084579, "loss": 0.0449, "step": 480 }, { "epoch": 1.1316397228637414, "grad_norm": 0.353373259305954, "learning_rate": 0.00019782851862708634, "loss": 0.0454, "step": 490 }, { "epoch": 1.1547344110854503, "grad_norm": 0.35266897082328796, "learning_rate": 0.00019766715484670894, "loss": 0.0496, "step": 500 }, { "epoch": 1.1778290993071594, "grad_norm": 0.4049600660800934, "learning_rate": 0.00019750007910451838, "loss": 0.0422, "step": 510 }, { "epoch": 1.2009237875288683, "grad_norm": 0.383989155292511, "learning_rate": 0.00019732730117176533, "loss": 0.0459, "step": 520 }, { "epoch": 1.2240184757505774, "grad_norm": 0.28790971636772156, "learning_rate": 0.0001971488311531873, "loss": 0.0473, "step": 530 }, { "epoch": 1.2471131639722863, "grad_norm": 0.42718982696533203, "learning_rate": 0.00019696467948641732, "loss": 0.0427, "step": 540 }, { "epoch": 1.2702078521939955, "grad_norm": 0.4704771041870117, "learning_rate": 0.00019677485694137388, "loss": 0.0395, "step": 550 }, { "epoch": 1.2933025404157044, "grad_norm": 0.3732239007949829, "learning_rate": 0.00019657937461963073, "loss": 0.0437, "step": 560 }, { "epoch": 1.3163972286374133, "grad_norm": 0.41071411967277527, "learning_rate": 0.0001963782439537678, "loss": 0.0495, "step": 570 }, { "epoch": 1.3394919168591224, "grad_norm": 0.4559623897075653, "learning_rate": 0.00019617147670670268, "loss": 0.0432, "step": 580 }, { "epoch": 1.3625866050808315, "grad_norm": 0.48421376943588257, "learning_rate": 0.00019595908497100235, "loss": 0.0435, "step": 590 }, { "epoch": 1.3856812933025404, "grad_norm": 0.5113271474838257, "learning_rate": 0.00019574108116817622, "loss": 0.0412, "step": 600 }, { "epoch": 1.4087759815242493, "grad_norm": 0.38500213623046875, "learning_rate": 0.00019551747804794967, "loss": 0.0428, "step": 610 }, { "epoch": 1.4318706697459584, "grad_norm": 0.27223697304725647, "learning_rate": 0.00019528828868751818, "loss": 0.0434, "step": 620 }, { "epoch": 1.4549653579676676, "grad_norm": 0.3776547312736511, "learning_rate": 0.00019505352649078282, "loss": 0.043, "step": 630 }, { "epoch": 1.4780600461893765, "grad_norm": 0.359902024269104, "learning_rate": 0.00019481320518756608, "loss": 0.043, "step": 640 }, { "epoch": 1.5011547344110854, "grad_norm": 0.413602352142334, "learning_rate": 0.0001945673388328091, "loss": 0.046, "step": 650 }, { "epoch": 1.5242494226327945, "grad_norm": 0.3547694981098175, "learning_rate": 0.00019431594180574944, "loss": 0.0437, "step": 660 }, { "epoch": 1.5473441108545036, "grad_norm": 0.3197380602359772, "learning_rate": 0.0001940590288090804, "loss": 0.0434, "step": 670 }, { "epoch": 1.5704387990762125, "grad_norm": 0.296234130859375, "learning_rate": 0.00019379661486809094, "loss": 0.0383, "step": 680 }, { "epoch": 1.5935334872979214, "grad_norm": 0.38352492451667786, "learning_rate": 0.00019352871532978712, "loss": 0.037, "step": 690 }, { "epoch": 1.6166281755196303, "grad_norm": 0.3082719147205353, "learning_rate": 0.00019325534586199424, "loss": 0.0368, "step": 700 }, { "epoch": 1.6397228637413395, "grad_norm": 0.3575062155723572, "learning_rate": 0.000192976522452441, "loss": 0.0414, "step": 710 }, { "epoch": 1.6628175519630486, "grad_norm": 0.2746741771697998, "learning_rate": 0.00019269226140782402, "loss": 0.04, "step": 720 }, { "epoch": 1.6859122401847575, "grad_norm": 0.35034051537513733, "learning_rate": 0.00019240257935285434, "loss": 0.0411, "step": 730 }, { "epoch": 1.7090069284064664, "grad_norm": 0.36941060423851013, "learning_rate": 0.0001921074932292852, "loss": 0.0417, "step": 740 }, { "epoch": 1.7321016166281755, "grad_norm": 0.4880543351173401, "learning_rate": 0.00019180702029492118, "loss": 0.0381, "step": 750 }, { "epoch": 1.7551963048498846, "grad_norm": 0.5262073278427124, "learning_rate": 0.00019150117812260882, "loss": 0.045, "step": 760 }, { "epoch": 1.7782909930715936, "grad_norm": 0.3127468526363373, "learning_rate": 0.00019118998459920902, "loss": 0.0437, "step": 770 }, { "epoch": 1.8013856812933025, "grad_norm": 0.21983791887760162, "learning_rate": 0.0001908734579245508, "loss": 0.0403, "step": 780 }, { "epoch": 1.8244803695150116, "grad_norm": 0.3254013955593109, "learning_rate": 0.000190551616610367, "loss": 0.0351, "step": 790 }, { "epoch": 1.8475750577367207, "grad_norm": 0.4744585156440735, "learning_rate": 0.00019022447947921167, "loss": 0.04, "step": 800 }, { "epoch": 1.8706697459584296, "grad_norm": 0.42813506722450256, "learning_rate": 0.00018989206566335907, "loss": 0.0371, "step": 810 }, { "epoch": 1.8937644341801385, "grad_norm": 0.3789767622947693, "learning_rate": 0.00018955439460368502, "loss": 0.0383, "step": 820 }, { "epoch": 1.9168591224018474, "grad_norm": 0.35561323165893555, "learning_rate": 0.00018921148604852971, "loss": 0.0381, "step": 830 }, { "epoch": 1.9399538106235565, "grad_norm": 0.3015558123588562, "learning_rate": 0.00018886336005254274, "loss": 0.0393, "step": 840 }, { "epoch": 1.9630484988452657, "grad_norm": 0.27170488238334656, "learning_rate": 0.00018851003697551038, "loss": 0.0391, "step": 850 }, { "epoch": 1.9861431870669746, "grad_norm": 0.4331890940666199, "learning_rate": 0.0001881515374811647, "loss": 0.0401, "step": 860 }, { "epoch": 2.0092378752886835, "grad_norm": 0.5359442234039307, "learning_rate": 0.00018778788253597523, "loss": 0.039, "step": 870 }, { "epoch": 2.032332563510393, "grad_norm": 0.4329531192779541, "learning_rate": 0.00018741909340792262, "loss": 0.0394, "step": 880 }, { "epoch": 2.0554272517321017, "grad_norm": 0.29946601390838623, "learning_rate": 0.00018704519166525482, "loss": 0.0394, "step": 890 }, { "epoch": 2.0785219399538106, "grad_norm": 0.45591849088668823, "learning_rate": 0.00018666619917522584, "loss": 0.0424, "step": 900 }, { "epoch": 2.1016166281755195, "grad_norm": 0.27476781606674194, "learning_rate": 0.00018628213810281658, "loss": 0.0365, "step": 910 }, { "epoch": 2.1247113163972284, "grad_norm": 0.3706781566143036, "learning_rate": 0.00018589303090943887, "loss": 0.035, "step": 920 }, { "epoch": 2.147806004618938, "grad_norm": 0.38271817564964294, "learning_rate": 0.00018549890035162155, "loss": 0.0372, "step": 930 }, { "epoch": 2.1709006928406467, "grad_norm": 0.4124270975589752, "learning_rate": 0.00018509976947967975, "loss": 0.0406, "step": 940 }, { "epoch": 2.1939953810623556, "grad_norm": 0.3650427460670471, "learning_rate": 0.00018469566163636673, "loss": 0.038, "step": 950 }, { "epoch": 2.2170900692840645, "grad_norm": 0.2632494568824768, "learning_rate": 0.00018428660045550875, "loss": 0.0358, "step": 960 }, { "epoch": 2.240184757505774, "grad_norm": 0.2518555819988251, "learning_rate": 0.0001838726098606229, "loss": 0.0384, "step": 970 }, { "epoch": 2.2632794457274827, "grad_norm": 0.35301706194877625, "learning_rate": 0.00018345371406351782, "loss": 0.0368, "step": 980 }, { "epoch": 2.2863741339491916, "grad_norm": 0.2957130968570709, "learning_rate": 0.00018302993756287795, "loss": 0.0375, "step": 990 }, { "epoch": 2.3094688221709005, "grad_norm": 0.33476200699806213, "learning_rate": 0.00018260130514283036, "loss": 0.0377, "step": 1000 }, { "epoch": 2.3325635103926095, "grad_norm": 0.3259429931640625, "learning_rate": 0.0001821678418714957, "loss": 0.0333, "step": 1010 }, { "epoch": 2.355658198614319, "grad_norm": 0.4147096872329712, "learning_rate": 0.00018172957309952188, "loss": 0.0375, "step": 1020 }, { "epoch": 2.3787528868360277, "grad_norm": 0.29218655824661255, "learning_rate": 0.00018128652445860145, "loss": 0.0345, "step": 1030 }, { "epoch": 2.4018475750577366, "grad_norm": 0.32885679602622986, "learning_rate": 0.00018083872185997274, "loss": 0.0368, "step": 1040 }, { "epoch": 2.424942263279446, "grad_norm": 0.37419071793556213, "learning_rate": 0.00018038619149290415, "loss": 0.03, "step": 1050 }, { "epoch": 2.448036951501155, "grad_norm": 0.3435191810131073, "learning_rate": 0.0001799289598231629, "loss": 0.0343, "step": 1060 }, { "epoch": 2.4711316397228638, "grad_norm": 0.46311062574386597, "learning_rate": 0.00017946705359146684, "loss": 0.0334, "step": 1070 }, { "epoch": 2.4942263279445727, "grad_norm": 0.4619911015033722, "learning_rate": 0.00017900049981192081, "loss": 0.0382, "step": 1080 }, { "epoch": 2.5173210161662816, "grad_norm": 0.30530160665512085, "learning_rate": 0.00017852932577043658, "loss": 0.0352, "step": 1090 }, { "epoch": 2.540415704387991, "grad_norm": 0.23941746354103088, "learning_rate": 0.0001780535590231372, "loss": 0.0349, "step": 1100 }, { "epoch": 2.5635103926097, "grad_norm": 0.2063404619693756, "learning_rate": 0.00017757322739474525, "loss": 0.0304, "step": 1110 }, { "epoch": 2.5866050808314087, "grad_norm": 0.33000999689102173, "learning_rate": 0.0001770883589769557, "loss": 0.0329, "step": 1120 }, { "epoch": 2.6096997690531176, "grad_norm": 0.34663257002830505, "learning_rate": 0.0001765989821267928, "loss": 0.0336, "step": 1130 }, { "epoch": 2.6327944572748265, "grad_norm": 0.39627474546432495, "learning_rate": 0.00017610512546495195, "loss": 0.034, "step": 1140 }, { "epoch": 2.655889145496536, "grad_norm": 0.28975868225097656, "learning_rate": 0.00017560681787412552, "loss": 0.0331, "step": 1150 }, { "epoch": 2.678983833718245, "grad_norm": 0.3867139518260956, "learning_rate": 0.00017510408849731392, "loss": 0.0349, "step": 1160 }, { "epoch": 2.7020785219399537, "grad_norm": 0.23290959000587463, "learning_rate": 0.00017459696673612104, "loss": 0.0382, "step": 1170 }, { "epoch": 2.725173210161663, "grad_norm": 0.4149172902107239, "learning_rate": 0.00017408548224903488, "loss": 0.0375, "step": 1180 }, { "epoch": 2.748267898383372, "grad_norm": 0.23780949413776398, "learning_rate": 0.00017356966494969277, "loss": 0.0297, "step": 1190 }, { "epoch": 2.771362586605081, "grad_norm": 0.27576741576194763, "learning_rate": 0.00017304954500513224, "loss": 0.0365, "step": 1200 }, { "epoch": 2.7944572748267897, "grad_norm": 0.2264527529478073, "learning_rate": 0.00017252515283402636, "loss": 0.0307, "step": 1210 }, { "epoch": 2.8175519630484986, "grad_norm": 0.35455092787742615, "learning_rate": 0.0001719965191049051, "loss": 0.0315, "step": 1220 }, { "epoch": 2.840646651270208, "grad_norm": 0.2472825050354004, "learning_rate": 0.00017146367473436138, "loss": 0.0354, "step": 1230 }, { "epoch": 2.863741339491917, "grad_norm": 0.2657906115055084, "learning_rate": 0.00017092665088524315, "loss": 0.0361, "step": 1240 }, { "epoch": 2.886836027713626, "grad_norm": 0.36936143040657043, "learning_rate": 0.00017038547896483083, "loss": 0.0319, "step": 1250 }, { "epoch": 2.909930715935335, "grad_norm": 0.34996992349624634, "learning_rate": 0.0001698401906230005, "loss": 0.0349, "step": 1260 }, { "epoch": 2.9330254041570436, "grad_norm": 0.2003779411315918, "learning_rate": 0.00016929081775037276, "loss": 0.0301, "step": 1270 }, { "epoch": 2.956120092378753, "grad_norm": 0.2517768442630768, "learning_rate": 0.00016873739247644785, "loss": 0.0342, "step": 1280 }, { "epoch": 2.979214780600462, "grad_norm": 0.26178500056266785, "learning_rate": 0.00016817994716772635, "loss": 0.0339, "step": 1290 }, { "epoch": 3.0023094688221708, "grad_norm": 0.30486026406288147, "learning_rate": 0.0001676185144258165, "loss": 0.0352, "step": 1300 }, { "epoch": 3.02540415704388, "grad_norm": 0.26309579610824585, "learning_rate": 0.0001670531270855274, "loss": 0.0317, "step": 1310 }, { "epoch": 3.048498845265589, "grad_norm": 0.23204182088375092, "learning_rate": 0.00016648381821294863, "loss": 0.0298, "step": 1320 }, { "epoch": 3.071593533487298, "grad_norm": 0.23810264468193054, "learning_rate": 0.00016591062110351662, "loss": 0.0351, "step": 1330 }, { "epoch": 3.094688221709007, "grad_norm": 0.36545702815055847, "learning_rate": 0.00016533356928006722, "loss": 0.0347, "step": 1340 }, { "epoch": 3.1177829099307157, "grad_norm": 0.2644146978855133, "learning_rate": 0.0001647526964908752, "loss": 0.0319, "step": 1350 }, { "epoch": 3.140877598152425, "grad_norm": 0.29174649715423584, "learning_rate": 0.00016416803670768056, "loss": 0.0329, "step": 1360 }, { "epoch": 3.163972286374134, "grad_norm": 0.25455746054649353, "learning_rate": 0.0001635796241237017, "loss": 0.0327, "step": 1370 }, { "epoch": 3.187066974595843, "grad_norm": 0.36102402210235596, "learning_rate": 0.00016298749315163567, "loss": 0.0291, "step": 1380 }, { "epoch": 3.2101616628175518, "grad_norm": 0.4295611083507538, "learning_rate": 0.00016239167842164548, "loss": 0.0305, "step": 1390 }, { "epoch": 3.233256351039261, "grad_norm": 0.2627660632133484, "learning_rate": 0.0001617922147793351, "loss": 0.0294, "step": 1400 }, { "epoch": 3.25635103926097, "grad_norm": 0.2932153046131134, "learning_rate": 0.00016118913728371107, "loss": 0.0287, "step": 1410 }, { "epoch": 3.279445727482679, "grad_norm": 0.32482001185417175, "learning_rate": 0.0001605824812051326, "loss": 0.0298, "step": 1420 }, { "epoch": 3.302540415704388, "grad_norm": 0.3234105408191681, "learning_rate": 0.0001599722820232484, "loss": 0.0335, "step": 1430 }, { "epoch": 3.325635103926097, "grad_norm": 0.28580203652381897, "learning_rate": 0.0001593585754249221, "loss": 0.0301, "step": 1440 }, { "epoch": 3.348729792147806, "grad_norm": 0.44926854968070984, "learning_rate": 0.00015874139730214477, "loss": 0.0317, "step": 1450 }, { "epoch": 3.371824480369515, "grad_norm": 0.3214055895805359, "learning_rate": 0.00015812078374993603, "loss": 0.0338, "step": 1460 }, { "epoch": 3.394919168591224, "grad_norm": 0.2814524471759796, "learning_rate": 0.000157496771064233, "loss": 0.0314, "step": 1470 }, { "epoch": 3.418013856812933, "grad_norm": 0.22763226926326752, "learning_rate": 0.0001568693957397675, "loss": 0.035, "step": 1480 }, { "epoch": 3.441108545034642, "grad_norm": 0.46513620018959045, "learning_rate": 0.000156238694467932, "loss": 0.0354, "step": 1490 }, { "epoch": 3.464203233256351, "grad_norm": 0.3795129060745239, "learning_rate": 0.0001556047041346333, "loss": 0.0285, "step": 1500 }, { "epoch": 3.48729792147806, "grad_norm": 0.22874844074249268, "learning_rate": 0.00015496746181813565, "loss": 0.0329, "step": 1510 }, { "epoch": 3.5103926096997693, "grad_norm": 0.43440544605255127, "learning_rate": 0.0001543270047868921, "loss": 0.0292, "step": 1520 }, { "epoch": 3.533487297921478, "grad_norm": 0.328204482793808, "learning_rate": 0.00015368337049736502, "loss": 0.0298, "step": 1530 }, { "epoch": 3.556581986143187, "grad_norm": 0.2836969196796417, "learning_rate": 0.00015303659659183534, "loss": 0.0326, "step": 1540 }, { "epoch": 3.579676674364896, "grad_norm": 0.2993479073047638, "learning_rate": 0.0001523867208962012, "loss": 0.0314, "step": 1550 }, { "epoch": 3.602771362586605, "grad_norm": 0.24445673823356628, "learning_rate": 0.00015173378141776568, "loss": 0.034, "step": 1560 }, { "epoch": 3.6258660508083143, "grad_norm": 0.2466069757938385, "learning_rate": 0.00015107781634301408, "loss": 0.0332, "step": 1570 }, { "epoch": 3.648960739030023, "grad_norm": 0.23315438628196716, "learning_rate": 0.0001504188640353804, "loss": 0.0279, "step": 1580 }, { "epoch": 3.672055427251732, "grad_norm": 0.2599908709526062, "learning_rate": 0.00014975696303300398, "loss": 0.0292, "step": 1590 }, { "epoch": 3.695150115473441, "grad_norm": 0.25828924775123596, "learning_rate": 0.00014909215204647548, "loss": 0.0312, "step": 1600 }, { "epoch": 3.71824480369515, "grad_norm": 0.32230719923973083, "learning_rate": 0.0001484244699565729, "loss": 0.0297, "step": 1610 }, { "epoch": 3.741339491916859, "grad_norm": 0.29874908924102783, "learning_rate": 0.00014775395581198778, "loss": 0.0298, "step": 1620 }, { "epoch": 3.764434180138568, "grad_norm": 0.2458525151014328, "learning_rate": 0.00014708064882704134, "loss": 0.0283, "step": 1630 }, { "epoch": 3.787528868360277, "grad_norm": 0.23176351189613342, "learning_rate": 0.00014640458837939129, "loss": 0.0272, "step": 1640 }, { "epoch": 3.8106235565819864, "grad_norm": 0.3253209590911865, "learning_rate": 0.00014572581400772863, "loss": 0.0275, "step": 1650 }, { "epoch": 3.8337182448036953, "grad_norm": 0.3006962537765503, "learning_rate": 0.00014504436540946548, "loss": 0.0264, "step": 1660 }, { "epoch": 3.856812933025404, "grad_norm": 0.3920678496360779, "learning_rate": 0.00014436028243841316, "loss": 0.0293, "step": 1670 }, { "epoch": 3.879907621247113, "grad_norm": 0.3614073097705841, "learning_rate": 0.0001436736051024517, "loss": 0.029, "step": 1680 }, { "epoch": 3.903002309468822, "grad_norm": 0.36879438161849976, "learning_rate": 0.00014298437356118982, "loss": 0.0333, "step": 1690 }, { "epoch": 3.9260969976905313, "grad_norm": 0.2536484897136688, "learning_rate": 0.00014229262812361622, "loss": 0.0294, "step": 1700 }, { "epoch": 3.9491916859122402, "grad_norm": 0.28662723302841187, "learning_rate": 0.00014159840924574222, "loss": 0.0297, "step": 1710 }, { "epoch": 3.972286374133949, "grad_norm": 0.2127179205417633, "learning_rate": 0.00014090175752823572, "loss": 0.0288, "step": 1720 }, { "epoch": 3.995381062355658, "grad_norm": 0.25283893942832947, "learning_rate": 0.00014020271371404682, "loss": 0.035, "step": 1730 }, { "epoch": 4.018475750577367, "grad_norm": 0.18742461502552032, "learning_rate": 0.0001395013186860247, "loss": 0.0262, "step": 1740 }, { "epoch": 4.041570438799076, "grad_norm": 0.36856940388679504, "learning_rate": 0.00013879761346452703, "loss": 0.0326, "step": 1750 }, { "epoch": 4.064665127020786, "grad_norm": 0.281253457069397, "learning_rate": 0.0001380916392050206, "loss": 0.0285, "step": 1760 }, { "epoch": 4.087759815242494, "grad_norm": 0.24782918393611908, "learning_rate": 0.00013738343719567464, "loss": 0.0272, "step": 1770 }, { "epoch": 4.1108545034642034, "grad_norm": 0.25319260358810425, "learning_rate": 0.00013667304885494588, "loss": 0.0315, "step": 1780 }, { "epoch": 4.133949191685912, "grad_norm": 0.288912296295166, "learning_rate": 0.0001359605157291565, "loss": 0.0308, "step": 1790 }, { "epoch": 4.157043879907621, "grad_norm": 0.2902250587940216, "learning_rate": 0.000135245879490064, "loss": 0.0279, "step": 1800 }, { "epoch": 4.180138568129331, "grad_norm": 0.2891808748245239, "learning_rate": 0.00013452918193242457, "loss": 0.0263, "step": 1810 }, { "epoch": 4.203233256351039, "grad_norm": 0.3770907521247864, "learning_rate": 0.00013381046497154816, "loss": 0.0308, "step": 1820 }, { "epoch": 4.226327944572748, "grad_norm": 0.31093716621398926, "learning_rate": 0.00013308977064084763, "loss": 0.0298, "step": 1830 }, { "epoch": 4.249422632794457, "grad_norm": 0.31624069809913635, "learning_rate": 0.00013236714108938013, "loss": 0.0296, "step": 1840 }, { "epoch": 4.272517321016166, "grad_norm": 0.376964807510376, "learning_rate": 0.00013164261857938228, "loss": 0.0339, "step": 1850 }, { "epoch": 4.295612009237876, "grad_norm": 0.48355865478515625, "learning_rate": 0.0001309162454837983, "loss": 0.0328, "step": 1860 }, { "epoch": 4.318706697459584, "grad_norm": 0.2842826545238495, "learning_rate": 0.00013018806428380207, "loss": 0.0264, "step": 1870 }, { "epoch": 4.341801385681293, "grad_norm": 0.39021459221839905, "learning_rate": 0.00012945811756631255, "loss": 0.029, "step": 1880 }, { "epoch": 4.364896073903003, "grad_norm": 0.20523010194301605, "learning_rate": 0.0001287264480215031, "loss": 0.0312, "step": 1890 }, { "epoch": 4.387990762124711, "grad_norm": 0.2571987509727478, "learning_rate": 0.00012799309844030497, "loss": 0.0282, "step": 1900 }, { "epoch": 4.4110854503464205, "grad_norm": 0.18886256217956543, "learning_rate": 0.00012725811171190437, "loss": 0.0271, "step": 1910 }, { "epoch": 4.434180138568129, "grad_norm": 0.27965831756591797, "learning_rate": 0.00012652153082123456, "loss": 0.0284, "step": 1920 }, { "epoch": 4.457274826789838, "grad_norm": 0.26982808113098145, "learning_rate": 0.00012578339884646166, "loss": 0.0274, "step": 1930 }, { "epoch": 4.480369515011548, "grad_norm": 0.17887461185455322, "learning_rate": 0.00012504375895646533, "loss": 0.0318, "step": 1940 }, { "epoch": 4.503464203233256, "grad_norm": 0.24898460507392883, "learning_rate": 0.00012430265440831396, "loss": 0.0284, "step": 1950 }, { "epoch": 4.5265588914549655, "grad_norm": 0.20980703830718994, "learning_rate": 0.0001235601285447352, "loss": 0.0276, "step": 1960 }, { "epoch": 4.549653579676674, "grad_norm": 0.21943718194961548, "learning_rate": 0.00012281622479158062, "loss": 0.0257, "step": 1970 }, { "epoch": 4.572748267898383, "grad_norm": 0.19166800379753113, "learning_rate": 0.00012207098665528636, "loss": 0.0264, "step": 1980 }, { "epoch": 4.595842956120093, "grad_norm": 0.23728342354297638, "learning_rate": 0.00012132445772032843, "loss": 0.0273, "step": 1990 }, { "epoch": 4.618937644341801, "grad_norm": 0.20198193192481995, "learning_rate": 0.00012057668164667406, "loss": 0.0257, "step": 2000 }, { "epoch": 4.64203233256351, "grad_norm": 0.21189096570014954, "learning_rate": 0.00011982770216722789, "loss": 0.0256, "step": 2010 }, { "epoch": 4.665127020785219, "grad_norm": 0.25154975056648254, "learning_rate": 0.0001190775630852746, "loss": 0.0294, "step": 2020 }, { "epoch": 4.688221709006928, "grad_norm": 0.24083566665649414, "learning_rate": 0.00011832630827191705, "loss": 0.0276, "step": 2030 }, { "epoch": 4.711316397228638, "grad_norm": 0.18492062389850616, "learning_rate": 0.00011757398166351038, "loss": 0.0285, "step": 2040 }, { "epoch": 4.734411085450346, "grad_norm": 0.2945506274700165, "learning_rate": 0.00011682062725909258, "loss": 0.0268, "step": 2050 }, { "epoch": 4.757505773672055, "grad_norm": 0.3341239392757416, "learning_rate": 0.00011606628911781123, "loss": 0.029, "step": 2060 }, { "epoch": 4.780600461893765, "grad_norm": 0.20814575254917145, "learning_rate": 0.0001153110113563468, "loss": 0.027, "step": 2070 }, { "epoch": 4.803695150115473, "grad_norm": 0.26448166370391846, "learning_rate": 0.00011455483814633238, "loss": 0.0281, "step": 2080 }, { "epoch": 4.826789838337183, "grad_norm": 0.24523232877254486, "learning_rate": 0.0001137978137117705, "loss": 0.0299, "step": 2090 }, { "epoch": 4.849884526558892, "grad_norm": 0.2897242605686188, "learning_rate": 0.00011303998232644657, "loss": 0.0308, "step": 2100 }, { "epoch": 4.8729792147806, "grad_norm": 0.23827065527439117, "learning_rate": 0.00011228138831133978, "loss": 0.0238, "step": 2110 }, { "epoch": 4.89607390300231, "grad_norm": 0.2748923897743225, "learning_rate": 0.00011152207603203088, "loss": 0.0255, "step": 2120 }, { "epoch": 4.919168591224018, "grad_norm": 0.2012598067522049, "learning_rate": 0.00011076208989610761, "loss": 0.0297, "step": 2130 }, { "epoch": 4.9422632794457275, "grad_norm": 0.21421535313129425, "learning_rate": 0.00011000147435056742, "loss": 0.0231, "step": 2140 }, { "epoch": 4.965357967667437, "grad_norm": 0.22895927727222443, "learning_rate": 0.00010924027387921818, "loss": 0.0265, "step": 2150 }, { "epoch": 4.988452655889145, "grad_norm": 0.2699049413204193, "learning_rate": 0.00010847853300007653, "loss": 0.0251, "step": 2160 }, { "epoch": 5.011547344110855, "grad_norm": 0.3293992877006531, "learning_rate": 0.00010771629626276428, "loss": 0.0229, "step": 2170 }, { "epoch": 5.034642032332563, "grad_norm": 0.2511574327945709, "learning_rate": 0.00010695360824590303, "loss": 0.0234, "step": 2180 }, { "epoch": 5.0577367205542725, "grad_norm": 0.3633870780467987, "learning_rate": 0.00010619051355450696, "loss": 0.0237, "step": 2190 }, { "epoch": 5.080831408775982, "grad_norm": 0.27033141255378723, "learning_rate": 0.00010542705681737422, "loss": 0.0278, "step": 2200 }, { "epoch": 5.10392609699769, "grad_norm": 0.33227813243865967, "learning_rate": 0.00010466328268447674, "loss": 0.028, "step": 2210 }, { "epoch": 5.1270207852194, "grad_norm": 0.19936224818229675, "learning_rate": 0.00010389923582434913, "loss": 0.0266, "step": 2220 }, { "epoch": 5.150115473441108, "grad_norm": 0.22531768679618835, "learning_rate": 0.0001031349609214761, "loss": 0.0279, "step": 2230 }, { "epoch": 5.173210161662817, "grad_norm": 0.2059541642665863, "learning_rate": 0.00010237050267367921, "loss": 0.0243, "step": 2240 }, { "epoch": 5.196304849884527, "grad_norm": 0.3016744554042816, "learning_rate": 0.00010160590578950273, "loss": 0.0287, "step": 2250 }, { "epoch": 5.219399538106235, "grad_norm": 0.3229542672634125, "learning_rate": 0.00010084121498559902, "loss": 0.028, "step": 2260 }, { "epoch": 5.242494226327945, "grad_norm": 0.28354501724243164, "learning_rate": 0.00010007647498411313, "loss": 0.0239, "step": 2270 }, { "epoch": 5.265588914549654, "grad_norm": 0.24493178725242615, "learning_rate": 9.931173051006746e-05, "loss": 0.0255, "step": 2280 }, { "epoch": 5.288683602771362, "grad_norm": 0.21446578204631805, "learning_rate": 9.854702628874596e-05, "loss": 0.0227, "step": 2290 }, { "epoch": 5.311778290993072, "grad_norm": 0.21182934939861298, "learning_rate": 9.778240704307844e-05, "loss": 0.0261, "step": 2300 }, { "epoch": 5.33487297921478, "grad_norm": 0.19283224642276764, "learning_rate": 9.701791749102495e-05, "loss": 0.0257, "step": 2310 }, { "epoch": 5.35796766743649, "grad_norm": 0.31905102729797363, "learning_rate": 9.625360234296065e-05, "loss": 0.0254, "step": 2320 }, { "epoch": 5.381062355658199, "grad_norm": 0.17523027956485748, "learning_rate": 9.548950629906077e-05, "loss": 0.0284, "step": 2330 }, { "epoch": 5.404157043879907, "grad_norm": 0.3119193911552429, "learning_rate": 9.47256740466865e-05, "loss": 0.0274, "step": 2340 }, { "epoch": 5.427251732101617, "grad_norm": 0.18288664519786835, "learning_rate": 9.396215025777139e-05, "loss": 0.0243, "step": 2350 }, { "epoch": 5.450346420323326, "grad_norm": 0.3371618092060089, "learning_rate": 9.3198979586209e-05, "loss": 0.0263, "step": 2360 }, { "epoch": 5.4734411085450345, "grad_norm": 0.1650622934103012, "learning_rate": 9.243620666524099e-05, "loss": 0.0238, "step": 2370 }, { "epoch": 5.496535796766744, "grad_norm": 0.2566249370574951, "learning_rate": 9.167387610484712e-05, "loss": 0.0233, "step": 2380 }, { "epoch": 5.519630484988452, "grad_norm": 0.312350869178772, "learning_rate": 9.091203248913607e-05, "loss": 0.0265, "step": 2390 }, { "epoch": 5.542725173210162, "grad_norm": 0.14863193035125732, "learning_rate": 9.015072037373816e-05, "loss": 0.0235, "step": 2400 }, { "epoch": 5.565819861431871, "grad_norm": 0.2509799003601074, "learning_rate": 8.938998428319937e-05, "loss": 0.0278, "step": 2410 }, { "epoch": 5.5889145496535795, "grad_norm": 0.1843155473470688, "learning_rate": 8.862986870837753e-05, "loss": 0.0284, "step": 2420 }, { "epoch": 5.612009237875289, "grad_norm": 0.3865255117416382, "learning_rate": 8.787041810384019e-05, "loss": 0.0256, "step": 2430 }, { "epoch": 5.635103926096997, "grad_norm": 0.2363845258951187, "learning_rate": 8.711167688526493e-05, "loss": 0.0271, "step": 2440 }, { "epoch": 5.658198614318707, "grad_norm": 0.209372416138649, "learning_rate": 8.635368942684153e-05, "loss": 0.0248, "step": 2450 }, { "epoch": 5.681293302540416, "grad_norm": 0.22078511118888855, "learning_rate": 8.559650005867684e-05, "loss": 0.0252, "step": 2460 }, { "epoch": 5.704387990762124, "grad_norm": 0.20760229229927063, "learning_rate": 8.484015306420242e-05, "loss": 0.0235, "step": 2470 }, { "epoch": 5.727482678983834, "grad_norm": 0.18812189996242523, "learning_rate": 8.408469267758432e-05, "loss": 0.0238, "step": 2480 }, { "epoch": 5.750577367205542, "grad_norm": 0.1914282590150833, "learning_rate": 8.333016308113629e-05, "loss": 0.0255, "step": 2490 }, { "epoch": 5.773672055427252, "grad_norm": 0.288441926240921, "learning_rate": 8.257660840273579e-05, "loss": 0.0271, "step": 2500 }, { "epoch": 5.796766743648961, "grad_norm": 0.23392659425735474, "learning_rate": 8.182407271324326e-05, "loss": 0.0279, "step": 2510 }, { "epoch": 5.819861431870669, "grad_norm": 0.25330981612205505, "learning_rate": 8.107260002392457e-05, "loss": 0.0248, "step": 2520 }, { "epoch": 5.842956120092379, "grad_norm": 0.22893041372299194, "learning_rate": 8.032223428387719e-05, "loss": 0.0233, "step": 2530 }, { "epoch": 5.866050808314088, "grad_norm": 0.18465740978717804, "learning_rate": 7.95730193774597e-05, "loss": 0.0227, "step": 2540 }, { "epoch": 5.8891454965357966, "grad_norm": 0.17142651975154877, "learning_rate": 7.882499912172557e-05, "loss": 0.024, "step": 2550 }, { "epoch": 5.912240184757506, "grad_norm": 0.19113211333751678, "learning_rate": 7.807821726386022e-05, "loss": 0.0231, "step": 2560 }, { "epoch": 5.935334872979215, "grad_norm": 0.20570556819438934, "learning_rate": 7.733271747862265e-05, "loss": 0.0225, "step": 2570 }, { "epoch": 5.958429561200924, "grad_norm": 0.21998938918113708, "learning_rate": 7.65885433657913e-05, "loss": 0.0226, "step": 2580 }, { "epoch": 5.981524249422633, "grad_norm": 0.35705238580703735, "learning_rate": 7.584573844761393e-05, "loss": 0.0197, "step": 2590 }, { "epoch": 6.0046189376443415, "grad_norm": 0.27228960394859314, "learning_rate": 7.510434616626243e-05, "loss": 0.0252, "step": 2600 }, { "epoch": 6.027713625866051, "grad_norm": 0.19615407288074493, "learning_rate": 7.4364409881292e-05, "loss": 0.026, "step": 2610 }, { "epoch": 6.05080831408776, "grad_norm": 0.27591854333877563, "learning_rate": 7.362597286710562e-05, "loss": 0.0262, "step": 2620 }, { "epoch": 6.073903002309469, "grad_norm": 0.26121532917022705, "learning_rate": 7.288907831042279e-05, "loss": 0.0206, "step": 2630 }, { "epoch": 6.096997690531178, "grad_norm": 0.365448921918869, "learning_rate": 7.215376930775403e-05, "loss": 0.0264, "step": 2640 }, { "epoch": 6.1200923787528865, "grad_norm": 0.24886159598827362, "learning_rate": 7.14200888628804e-05, "loss": 0.0225, "step": 2650 }, { "epoch": 6.143187066974596, "grad_norm": 0.23590990900993347, "learning_rate": 7.06880798843385e-05, "loss": 0.0214, "step": 2660 }, { "epoch": 6.166281755196305, "grad_norm": 0.34595030546188354, "learning_rate": 6.995778518291089e-05, "loss": 0.024, "step": 2670 }, { "epoch": 6.189376443418014, "grad_norm": 0.2593909502029419, "learning_rate": 6.922924746912245e-05, "loss": 0.0227, "step": 2680 }, { "epoch": 6.212471131639723, "grad_norm": 0.30012357234954834, "learning_rate": 6.850250935074243e-05, "loss": 0.0237, "step": 2690 }, { "epoch": 6.235565819861431, "grad_norm": 0.219698965549469, "learning_rate": 6.777761333029275e-05, "loss": 0.0211, "step": 2700 }, { "epoch": 6.258660508083141, "grad_norm": 0.20809656381607056, "learning_rate": 6.705460180256199e-05, "loss": 0.0216, "step": 2710 }, { "epoch": 6.28175519630485, "grad_norm": 0.19182954728603363, "learning_rate": 6.633351705212617e-05, "loss": 0.0228, "step": 2720 }, { "epoch": 6.304849884526559, "grad_norm": 0.171848326921463, "learning_rate": 6.561440125087587e-05, "loss": 0.0232, "step": 2730 }, { "epoch": 6.327944572748268, "grad_norm": 0.2251206785440445, "learning_rate": 6.489729645554959e-05, "loss": 0.023, "step": 2740 }, { "epoch": 6.351039260969977, "grad_norm": 0.20676329731941223, "learning_rate": 6.418224460527428e-05, "loss": 0.0216, "step": 2750 }, { "epoch": 6.374133949191686, "grad_norm": 0.1491789072751999, "learning_rate": 6.346928751911255e-05, "loss": 0.0196, "step": 2760 }, { "epoch": 6.397228637413395, "grad_norm": 0.1880139261484146, "learning_rate": 6.275846689361693e-05, "loss": 0.0251, "step": 2770 }, { "epoch": 6.4203233256351036, "grad_norm": 0.24984711408615112, "learning_rate": 6.204982430039124e-05, "loss": 0.0218, "step": 2780 }, { "epoch": 6.443418013856813, "grad_norm": 0.1841384768486023, "learning_rate": 6.134340118365936e-05, "loss": 0.0224, "step": 2790 }, { "epoch": 6.466512702078522, "grad_norm": 0.2632406949996948, "learning_rate": 6.063923885784138e-05, "loss": 0.021, "step": 2800 }, { "epoch": 6.489607390300231, "grad_norm": 0.21054299175739288, "learning_rate": 5.9937378505137476e-05, "loss": 0.0232, "step": 2810 }, { "epoch": 6.51270207852194, "grad_norm": 0.18742221593856812, "learning_rate": 5.9237861173119234e-05, "loss": 0.0218, "step": 2820 }, { "epoch": 6.535796766743649, "grad_norm": 0.16454491019248962, "learning_rate": 5.854072777232914e-05, "loss": 0.02, "step": 2830 }, { "epoch": 6.558891454965358, "grad_norm": 0.12906990945339203, "learning_rate": 5.7846019073887904e-05, "loss": 0.0233, "step": 2840 }, { "epoch": 6.581986143187067, "grad_norm": 0.1799432933330536, "learning_rate": 5.715377570711019e-05, "loss": 0.0225, "step": 2850 }, { "epoch": 6.605080831408776, "grad_norm": 0.25904062390327454, "learning_rate": 5.6464038157128106e-05, "loss": 0.0241, "step": 2860 }, { "epoch": 6.628175519630485, "grad_norm": 0.14367002248764038, "learning_rate": 5.577684676252384e-05, "loss": 0.022, "step": 2870 }, { "epoch": 6.651270207852194, "grad_norm": 0.20157106220722198, "learning_rate": 5.509224171297027e-05, "loss": 0.0215, "step": 2880 }, { "epoch": 6.674364896073903, "grad_norm": 0.18459396064281464, "learning_rate": 5.4410263046880524e-05, "loss": 0.0252, "step": 2890 }, { "epoch": 6.697459584295612, "grad_norm": 0.17748424410820007, "learning_rate": 5.373095064906657e-05, "loss": 0.024, "step": 2900 }, { "epoch": 6.720554272517321, "grad_norm": 0.19444628059864044, "learning_rate": 5.305434424840623e-05, "loss": 0.0202, "step": 2910 }, { "epoch": 6.74364896073903, "grad_norm": 0.21914024651050568, "learning_rate": 5.2380483415520196e-05, "loss": 0.0229, "step": 2920 }, { "epoch": 6.766743648960739, "grad_norm": 0.22515784204006195, "learning_rate": 5.170940756045725e-05, "loss": 0.0241, "step": 2930 }, { "epoch": 6.789838337182448, "grad_norm": 0.2511735260486603, "learning_rate": 5.104115593038976e-05, "loss": 0.0202, "step": 2940 }, { "epoch": 6.812933025404157, "grad_norm": 0.22903645038604736, "learning_rate": 5.0375767607318106e-05, "loss": 0.0229, "step": 2950 }, { "epoch": 6.836027713625866, "grad_norm": 0.24204668402671814, "learning_rate": 4.97132815057854e-05, "loss": 0.0209, "step": 2960 }, { "epoch": 6.859122401847575, "grad_norm": 0.3427051901817322, "learning_rate": 4.905373637060108e-05, "loss": 0.0226, "step": 2970 }, { "epoch": 6.882217090069284, "grad_norm": 0.189266636967659, "learning_rate": 4.8397170774575394e-05, "loss": 0.0183, "step": 2980 }, { "epoch": 6.905311778290993, "grad_norm": 0.17002785205841064, "learning_rate": 4.7743623116263245e-05, "loss": 0.0187, "step": 2990 }, { "epoch": 6.928406466512702, "grad_norm": 0.21065160632133484, "learning_rate": 4.709313161771867e-05, "loss": 0.0237, "step": 3000 }, { "epoch": 6.951501154734411, "grad_norm": 0.14337095618247986, "learning_rate": 4.644573432225939e-05, "loss": 0.0205, "step": 3010 }, { "epoch": 6.97459584295612, "grad_norm": 0.14885887503623962, "learning_rate": 4.580146909224173e-05, "loss": 0.0204, "step": 3020 }, { "epoch": 6.997690531177829, "grad_norm": 0.1806829869747162, "learning_rate": 4.516037360684673e-05, "loss": 0.0186, "step": 3030 }, { "epoch": 7.020785219399538, "grad_norm": 0.1977592259645462, "learning_rate": 4.452248535987588e-05, "loss": 0.0203, "step": 3040 }, { "epoch": 7.043879907621247, "grad_norm": 0.17835497856140137, "learning_rate": 4.388784165755894e-05, "loss": 0.0208, "step": 3050 }, { "epoch": 7.066974595842956, "grad_norm": 0.29322361946105957, "learning_rate": 4.3256479616371636e-05, "loss": 0.0184, "step": 3060 }, { "epoch": 7.090069284064665, "grad_norm": 0.17030254006385803, "learning_rate": 4.262843616086534e-05, "loss": 0.02, "step": 3070 }, { "epoch": 7.113163972286374, "grad_norm": 0.18691743910312653, "learning_rate": 4.2003748021507336e-05, "loss": 0.019, "step": 3080 }, { "epoch": 7.1362586605080836, "grad_norm": 0.23188500106334686, "learning_rate": 4.1382451732532665e-05, "loss": 0.0189, "step": 3090 }, { "epoch": 7.159353348729792, "grad_norm": 0.20676599442958832, "learning_rate": 4.076458362980764e-05, "loss": 0.0209, "step": 3100 }, { "epoch": 7.182448036951501, "grad_norm": 0.22161860764026642, "learning_rate": 4.0150179848704614e-05, "loss": 0.0203, "step": 3110 }, { "epoch": 7.20554272517321, "grad_norm": 0.17719246447086334, "learning_rate": 3.9539276321988764e-05, "loss": 0.0201, "step": 3120 }, { "epoch": 7.228637413394919, "grad_norm": 0.2837867736816406, "learning_rate": 3.89319087777164e-05, "loss": 0.0226, "step": 3130 }, { "epoch": 7.2517321016166285, "grad_norm": 0.2676970362663269, "learning_rate": 3.832811273714569e-05, "loss": 0.0219, "step": 3140 }, { "epoch": 7.274826789838337, "grad_norm": 0.14641159772872925, "learning_rate": 3.772792351265917e-05, "loss": 0.0202, "step": 3150 }, { "epoch": 7.297921478060046, "grad_norm": 0.13401910662651062, "learning_rate": 3.713137620569833e-05, "loss": 0.0186, "step": 3160 }, { "epoch": 7.321016166281755, "grad_norm": 0.20314963161945343, "learning_rate": 3.653850570471106e-05, "loss": 0.0195, "step": 3170 }, { "epoch": 7.344110854503464, "grad_norm": 0.2034531831741333, "learning_rate": 3.594934668311101e-05, "loss": 0.0174, "step": 3180 }, { "epoch": 7.3672055427251735, "grad_norm": 0.22085122764110565, "learning_rate": 3.536393359724989e-05, "loss": 0.0202, "step": 3190 }, { "epoch": 7.390300230946882, "grad_norm": 0.20668219029903412, "learning_rate": 3.4782300684402134e-05, "loss": 0.0215, "step": 3200 }, { "epoch": 7.413394919168591, "grad_norm": 0.20111581683158875, "learning_rate": 3.420448196076282e-05, "loss": 0.0194, "step": 3210 }, { "epoch": 7.436489607390301, "grad_norm": 0.2311002016067505, "learning_rate": 3.363051121945809e-05, "loss": 0.0182, "step": 3220 }, { "epoch": 7.459584295612009, "grad_norm": 0.2030162215232849, "learning_rate": 3.30604220285689e-05, "loss": 0.0166, "step": 3230 }, { "epoch": 7.482678983833718, "grad_norm": 0.1821678727865219, "learning_rate": 3.249424772916765e-05, "loss": 0.02, "step": 3240 }, { "epoch": 7.505773672055427, "grad_norm": 0.38144606351852417, "learning_rate": 3.19320214333685e-05, "loss": 0.0194, "step": 3250 }, { "epoch": 7.528868360277136, "grad_norm": 0.26288631558418274, "learning_rate": 3.1373776022390735e-05, "loss": 0.0201, "step": 3260 }, { "epoch": 7.551963048498846, "grad_norm": 0.26908478140830994, "learning_rate": 3.081954414463564e-05, "loss": 0.0203, "step": 3270 }, { "epoch": 7.575057736720554, "grad_norm": 0.1838100701570511, "learning_rate": 3.0269358213777276e-05, "loss": 0.02, "step": 3280 }, { "epoch": 7.598152424942263, "grad_norm": 0.19634702801704407, "learning_rate": 2.97232504068667e-05, "loss": 0.024, "step": 3290 }, { "epoch": 7.621247113163973, "grad_norm": 0.17007845640182495, "learning_rate": 2.9181252662450153e-05, "loss": 0.0209, "step": 3300 }, { "epoch": 7.644341801385681, "grad_norm": 0.1375173032283783, "learning_rate": 2.8643396678701073e-05, "loss": 0.0195, "step": 3310 }, { "epoch": 7.6674364896073905, "grad_norm": 0.1866098791360855, "learning_rate": 2.8109713911566428e-05, "loss": 0.0185, "step": 3320 }, { "epoch": 7.690531177829099, "grad_norm": 0.18592612445354462, "learning_rate": 2.758023557292695e-05, "loss": 0.0178, "step": 3330 }, { "epoch": 7.713625866050808, "grad_norm": 0.15712429583072662, "learning_rate": 2.7054992628771668e-05, "loss": 0.0196, "step": 3340 }, { "epoch": 7.736720554272518, "grad_norm": 0.1279047578573227, "learning_rate": 2.65340157973871e-05, "loss": 0.0197, "step": 3350 }, { "epoch": 7.759815242494226, "grad_norm": 0.22441239655017853, "learning_rate": 2.6017335547560452e-05, "loss": 0.0191, "step": 3360 }, { "epoch": 7.7829099307159355, "grad_norm": 0.20676462352275848, "learning_rate": 2.5504982096798025e-05, "loss": 0.0199, "step": 3370 }, { "epoch": 7.806004618937644, "grad_norm": 0.2775067985057831, "learning_rate": 2.4996985409557595e-05, "loss": 0.0185, "step": 3380 }, { "epoch": 7.829099307159353, "grad_norm": 0.2614492177963257, "learning_rate": 2.4493375195496292e-05, "loss": 0.0169, "step": 3390 }, { "epoch": 7.852193995381063, "grad_norm": 0.2875981330871582, "learning_rate": 2.3994180907732856e-05, "loss": 0.018, "step": 3400 }, { "epoch": 7.875288683602771, "grad_norm": 0.18182726204395294, "learning_rate": 2.349943174112521e-05, "loss": 0.0194, "step": 3410 }, { "epoch": 7.8983833718244805, "grad_norm": 0.18898028135299683, "learning_rate": 2.300915663056302e-05, "loss": 0.0157, "step": 3420 }, { "epoch": 7.921478060046189, "grad_norm": 0.5363194346427917, "learning_rate": 2.2523384249275347e-05, "loss": 0.0178, "step": 3430 }, { "epoch": 7.944572748267898, "grad_norm": 0.1563793420791626, "learning_rate": 2.2042143007153994e-05, "loss": 0.0183, "step": 3440 }, { "epoch": 7.967667436489608, "grad_norm": 0.1951935738325119, "learning_rate": 2.156546104909163e-05, "loss": 0.0216, "step": 3450 }, { "epoch": 7.990762124711316, "grad_norm": 0.19452138245105743, "learning_rate": 2.1093366253336066e-05, "loss": 0.0191, "step": 3460 }, { "epoch": 8.013856812933025, "grad_norm": 0.20190148055553436, "learning_rate": 2.0625886229859582e-05, "loss": 0.0188, "step": 3470 }, { "epoch": 8.036951501154734, "grad_norm": 0.2394581139087677, "learning_rate": 2.0163048318744493e-05, "loss": 0.0181, "step": 3480 }, { "epoch": 8.060046189376443, "grad_norm": 0.17284128069877625, "learning_rate": 1.9704879588583836e-05, "loss": 0.0172, "step": 3490 }, { "epoch": 8.083140877598153, "grad_norm": 0.1908102035522461, "learning_rate": 1.9251406834898556e-05, "loss": 0.0175, "step": 3500 }, { "epoch": 8.106235565819862, "grad_norm": 0.25007396936416626, "learning_rate": 1.8802656578570244e-05, "loss": 0.016, "step": 3510 }, { "epoch": 8.129330254041571, "grad_norm": 0.36770346760749817, "learning_rate": 1.835865506429021e-05, "loss": 0.0175, "step": 3520 }, { "epoch": 8.152424942263279, "grad_norm": 0.20147453248500824, "learning_rate": 1.7919428259024518e-05, "loss": 0.0213, "step": 3530 }, { "epoch": 8.175519630484988, "grad_norm": 0.1591079831123352, "learning_rate": 1.748500185049532e-05, "loss": 0.0166, "step": 3540 }, { "epoch": 8.198614318706698, "grad_norm": 0.2231104075908661, "learning_rate": 1.705540124567858e-05, "loss": 0.0173, "step": 3550 }, { "epoch": 8.221709006928407, "grad_norm": 0.20360170304775238, "learning_rate": 1.663065156931819e-05, "loss": 0.0181, "step": 3560 }, { "epoch": 8.244803695150116, "grad_norm": 0.20085148513317108, "learning_rate": 1.621077766245652e-05, "loss": 0.0173, "step": 3570 }, { "epoch": 8.267898383371824, "grad_norm": 0.1734970211982727, "learning_rate": 1.57958040809816e-05, "loss": 0.0196, "step": 3580 }, { "epoch": 8.290993071593533, "grad_norm": 0.17935489118099213, "learning_rate": 1.5385755094191102e-05, "loss": 0.0177, "step": 3590 }, { "epoch": 8.314087759815243, "grad_norm": 0.19008755683898926, "learning_rate": 1.4980654683372886e-05, "loss": 0.0193, "step": 3600 }, { "epoch": 8.337182448036952, "grad_norm": 0.17935799062252045, "learning_rate": 1.4580526540402461e-05, "loss": 0.0165, "step": 3610 }, { "epoch": 8.360277136258661, "grad_norm": 0.2387223094701767, "learning_rate": 1.4185394066357483e-05, "loss": 0.018, "step": 3620 }, { "epoch": 8.383371824480369, "grad_norm": 0.1772770881652832, "learning_rate": 1.37952803701491e-05, "loss": 0.0173, "step": 3630 }, { "epoch": 8.406466512702078, "grad_norm": 0.30018919706344604, "learning_rate": 1.3410208267170466e-05, "loss": 0.0186, "step": 3640 }, { "epoch": 8.429561200923787, "grad_norm": 0.3067221939563751, "learning_rate": 1.3030200277962369e-05, "loss": 0.0175, "step": 3650 }, { "epoch": 8.452655889145497, "grad_norm": 0.3162385821342468, "learning_rate": 1.2655278626896227e-05, "loss": 0.0181, "step": 3660 }, { "epoch": 8.475750577367206, "grad_norm": 0.16854941844940186, "learning_rate": 1.2285465240874283e-05, "loss": 0.0183, "step": 3670 }, { "epoch": 8.498845265588914, "grad_norm": 0.1159670352935791, "learning_rate": 1.192078174804715e-05, "loss": 0.016, "step": 3680 }, { "epoch": 8.521939953810623, "grad_norm": 0.12184049934148788, "learning_rate": 1.1561249476549052e-05, "loss": 0.0157, "step": 3690 }, { "epoch": 8.545034642032332, "grad_norm": 0.15742027759552002, "learning_rate": 1.1206889453250313e-05, "loss": 0.0184, "step": 3700 }, { "epoch": 8.568129330254042, "grad_norm": 0.2727457582950592, "learning_rate": 1.0857722402527847e-05, "loss": 0.016, "step": 3710 }, { "epoch": 8.591224018475751, "grad_norm": 0.17201822996139526, "learning_rate": 1.0513768745052843e-05, "loss": 0.0174, "step": 3720 }, { "epoch": 8.61431870669746, "grad_norm": 0.17287689447402954, "learning_rate": 1.0175048596596682e-05, "loss": 0.0167, "step": 3730 }, { "epoch": 8.637413394919168, "grad_norm": 0.1511421501636505, "learning_rate": 9.841581766854401e-06, "loss": 0.0184, "step": 3740 }, { "epoch": 8.660508083140877, "grad_norm": 0.1608920693397522, "learning_rate": 9.513387758286196e-06, "loss": 0.0179, "step": 3750 }, { "epoch": 8.683602771362587, "grad_norm": 0.20927520096302032, "learning_rate": 9.190485764976753e-06, "loss": 0.0159, "step": 3760 }, { "epoch": 8.706697459584296, "grad_norm": 0.21599853038787842, "learning_rate": 8.872894671512799e-06, "loss": 0.0199, "step": 3770 }, { "epoch": 8.729792147806005, "grad_norm": 0.19188404083251953, "learning_rate": 8.560633051878631e-06, "loss": 0.0158, "step": 3780 }, { "epoch": 8.752886836027713, "grad_norm": 0.1599506288766861, "learning_rate": 8.253719168369768e-06, "loss": 0.0161, "step": 3790 }, { "epoch": 8.775981524249422, "grad_norm": 0.1853979527950287, "learning_rate": 7.952170970524985e-06, "loss": 0.0191, "step": 3800 }, { "epoch": 8.799076212471132, "grad_norm": 0.2847438156604767, "learning_rate": 7.656006094076506e-06, "loss": 0.0178, "step": 3810 }, { "epoch": 8.822170900692841, "grad_norm": 0.13585425913333893, "learning_rate": 7.365241859918659e-06, "loss": 0.0166, "step": 3820 }, { "epoch": 8.84526558891455, "grad_norm": 0.19831091165542603, "learning_rate": 7.07989527309475e-06, "loss": 0.019, "step": 3830 }, { "epoch": 8.868360277136258, "grad_norm": 0.3466614782810211, "learning_rate": 6.799983021802692e-06, "loss": 0.0164, "step": 3840 }, { "epoch": 8.891454965357967, "grad_norm": 0.344882607460022, "learning_rate": 6.525521476418939e-06, "loss": 0.0162, "step": 3850 }, { "epoch": 8.914549653579677, "grad_norm": 0.2650250792503357, "learning_rate": 6.256526688541043e-06, "loss": 0.0158, "step": 3860 }, { "epoch": 8.937644341801386, "grad_norm": 0.15144085884094238, "learning_rate": 5.99301439004899e-06, "loss": 0.0176, "step": 3870 }, { "epoch": 8.960739030023095, "grad_norm": 0.11926089227199554, "learning_rate": 5.734999992185008e-06, "loss": 0.0143, "step": 3880 }, { "epoch": 8.983833718244803, "grad_norm": 0.43381285667419434, "learning_rate": 5.482498584652462e-06, "loss": 0.0163, "step": 3890 }, { "epoch": 9.006928406466512, "grad_norm": 0.19410696625709534, "learning_rate": 5.235524934733094e-06, "loss": 0.0182, "step": 3900 }, { "epoch": 9.030023094688222, "grad_norm": 0.2376604527235031, "learning_rate": 4.99409348642359e-06, "loss": 0.0164, "step": 3910 }, { "epoch": 9.053117782909931, "grad_norm": 0.18821699917316437, "learning_rate": 4.7582183595906715e-06, "loss": 0.0214, "step": 3920 }, { "epoch": 9.07621247113164, "grad_norm": 0.25581300258636475, "learning_rate": 4.527913349145441e-06, "loss": 0.0166, "step": 3930 }, { "epoch": 9.099307159353348, "grad_norm": 0.24828465282917023, "learning_rate": 4.303191924236538e-06, "loss": 0.0169, "step": 3940 }, { "epoch": 9.122401847575057, "grad_norm": 0.19727648794651031, "learning_rate": 4.0840672274623955e-06, "loss": 0.0161, "step": 3950 }, { "epoch": 9.145496535796767, "grad_norm": 0.1969439536333084, "learning_rate": 3.870552074102662e-06, "loss": 0.0164, "step": 3960 }, { "epoch": 9.168591224018476, "grad_norm": 0.28042295575141907, "learning_rate": 3.6626589513686473e-06, "loss": 0.0169, "step": 3970 }, { "epoch": 9.191685912240185, "grad_norm": 0.2694367468357086, "learning_rate": 3.4604000176731044e-06, "loss": 0.0177, "step": 3980 }, { "epoch": 9.214780600461895, "grad_norm": 0.27931103110313416, "learning_rate": 3.263787101919036e-06, "loss": 0.0204, "step": 3990 }, { "epoch": 9.237875288683602, "grad_norm": 0.29098454117774963, "learning_rate": 3.0728317028080657e-06, "loss": 0.02, "step": 4000 }, { "epoch": 9.260969976905312, "grad_norm": 0.1651170551776886, "learning_rate": 2.8875449881677676e-06, "loss": 0.0155, "step": 4010 }, { "epoch": 9.28406466512702, "grad_norm": 0.213905468583107, "learning_rate": 2.7079377942986427e-06, "loss": 0.0179, "step": 4020 }, { "epoch": 9.30715935334873, "grad_norm": 0.1109078899025917, "learning_rate": 2.5340206253403276e-06, "loss": 0.0132, "step": 4030 }, { "epoch": 9.33025404157044, "grad_norm": 0.19865043461322784, "learning_rate": 2.3658036526572726e-06, "loss": 0.0175, "step": 4040 }, { "epoch": 9.353348729792147, "grad_norm": 0.15456566214561462, "learning_rate": 2.2032967142439297e-06, "loss": 0.0149, "step": 4050 }, { "epoch": 9.376443418013857, "grad_norm": 0.11047529429197311, "learning_rate": 2.0465093141492696e-06, "loss": 0.0152, "step": 4060 }, { "epoch": 9.399538106235566, "grad_norm": 0.13611409068107605, "learning_rate": 1.895450621921091e-06, "loss": 0.0145, "step": 4070 }, { "epoch": 9.422632794457275, "grad_norm": 0.2234010398387909, "learning_rate": 1.7501294720696504e-06, "loss": 0.0189, "step": 4080 }, { "epoch": 9.445727482678985, "grad_norm": 0.26444539427757263, "learning_rate": 1.6105543635510202e-06, "loss": 0.0157, "step": 4090 }, { "epoch": 9.468822170900692, "grad_norm": 0.1748080998659134, "learning_rate": 1.4767334592700188e-06, "loss": 0.0173, "step": 4100 }, { "epoch": 9.491916859122401, "grad_norm": 0.16052615642547607, "learning_rate": 1.3486745856028381e-06, "loss": 0.0161, "step": 4110 }, { "epoch": 9.51501154734411, "grad_norm": 0.1584330052137375, "learning_rate": 1.2263852319393309e-06, "loss": 0.0169, "step": 4120 }, { "epoch": 9.53810623556582, "grad_norm": 0.14805783331394196, "learning_rate": 1.109872550244917e-06, "loss": 0.0149, "step": 4130 }, { "epoch": 9.56120092378753, "grad_norm": 0.228169247508049, "learning_rate": 9.991433546424512e-07, "loss": 0.0203, "step": 4140 }, { "epoch": 9.584295612009237, "grad_norm": 0.19104620814323425, "learning_rate": 8.942041210135755e-07, "loss": 0.0164, "step": 4150 }, { "epoch": 9.607390300230946, "grad_norm": 0.163773313164711, "learning_rate": 7.950609866200665e-07, "loss": 0.0165, "step": 4160 }, { "epoch": 9.630484988452656, "grad_norm": 0.1500762552022934, "learning_rate": 7.017197497448557e-07, "loss": 0.0174, "step": 4170 }, { "epoch": 9.653579676674365, "grad_norm": 0.2414599508047104, "learning_rate": 6.141858693529457e-07, "loss": 0.0155, "step": 4180 }, { "epoch": 9.676674364896074, "grad_norm": 0.1968170553445816, "learning_rate": 5.324644647721755e-07, "loss": 0.013, "step": 4190 }, { "epoch": 9.699769053117784, "grad_norm": 0.18126633763313293, "learning_rate": 4.565603153937281e-07, "loss": 0.0167, "step": 4200 }, { "epoch": 9.722863741339491, "grad_norm": 0.18440669775009155, "learning_rate": 3.8647786039273103e-07, "loss": 0.0145, "step": 4210 }, { "epoch": 9.7459584295612, "grad_norm": 0.2203577756881714, "learning_rate": 3.222211984685641e-07, "loss": 0.0148, "step": 4220 }, { "epoch": 9.76905311778291, "grad_norm": 0.15016986429691315, "learning_rate": 2.6379408760516254e-07, "loss": 0.0148, "step": 4230 }, { "epoch": 9.79214780600462, "grad_norm": 0.23910097777843475, "learning_rate": 2.1119994485123695e-07, "loss": 0.0146, "step": 4240 }, { "epoch": 9.815242494226329, "grad_norm": 0.3367011249065399, "learning_rate": 1.6444184612044444e-07, "loss": 0.0207, "step": 4250 }, { "epoch": 9.838337182448036, "grad_norm": 0.20982211828231812, "learning_rate": 1.2352252601147697e-07, "loss": 0.0143, "step": 4260 }, { "epoch": 9.861431870669746, "grad_norm": 0.19607298076152802, "learning_rate": 8.844437764815583e-08, "loss": 0.0143, "step": 4270 }, { "epoch": 9.884526558891455, "grad_norm": 0.21889039874076843, "learning_rate": 5.9209452539432664e-08, "loss": 0.017, "step": 4280 }, { "epoch": 9.907621247113164, "grad_norm": 0.21648484468460083, "learning_rate": 3.581946045947415e-08, "loss": 0.0163, "step": 4290 }, { "epoch": 9.930715935334874, "grad_norm": 0.343557745218277, "learning_rate": 1.8275769347575467e-08, "loss": 0.0156, "step": 4300 }, { "epoch": 9.953810623556581, "grad_norm": 0.18986669182777405, "learning_rate": 6.579405228257507e-09, "loss": 0.0181, "step": 4310 }, { "epoch": 9.97690531177829, "grad_norm": 0.17259903252124786, "learning_rate": 7.310521511705304e-10, "loss": 0.0163, "step": 4320 }, { "epoch": 9.988452655889146, "step": 4325, "total_flos": 5.8174995935902e+17, "train_loss": 0.036179359453606466, "train_runtime": 4740.7562, "train_samples_per_second": 58.387, "train_steps_per_second": 0.912 } ], "logging_steps": 10, "max_steps": 4325, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.8174995935902e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }