{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 1370, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.072992700729927, "grad_norm": 5.749124526977539, "learning_rate": 1.4492753623188407e-05, "loss": 0.9214, "step": 10 }, { "epoch": 0.145985401459854, "grad_norm": 2.569692611694336, "learning_rate": 2.8985507246376814e-05, "loss": 0.506, "step": 20 }, { "epoch": 0.21897810218978103, "grad_norm": 1.676221489906311, "learning_rate": 4.347826086956522e-05, "loss": 0.3337, "step": 30 }, { "epoch": 0.291970802919708, "grad_norm": 1.1421884298324585, "learning_rate": 5.797101449275363e-05, "loss": 0.2416, "step": 40 }, { "epoch": 0.36496350364963503, "grad_norm": 0.7032585144042969, "learning_rate": 7.246376811594203e-05, "loss": 0.1709, "step": 50 }, { "epoch": 0.43795620437956206, "grad_norm": 0.8790648579597473, "learning_rate": 8.695652173913044e-05, "loss": 0.1541, "step": 60 }, { "epoch": 0.5109489051094891, "grad_norm": 0.9456900358200073, "learning_rate": 9.999985422436231e-05, "loss": 0.1368, "step": 70 }, { "epoch": 0.583941605839416, "grad_norm": 0.8083729147911072, "learning_rate": 9.998236217634196e-05, "loss": 0.1253, "step": 80 }, { "epoch": 0.656934306569343, "grad_norm": 0.5535995960235596, "learning_rate": 9.993572668745786e-05, "loss": 0.1117, "step": 90 }, { "epoch": 0.7299270072992701, "grad_norm": 0.7210536599159241, "learning_rate": 9.985997494967441e-05, "loss": 0.0905, "step": 100 }, { "epoch": 0.8029197080291971, "grad_norm": 0.5236138105392456, "learning_rate": 9.975515113189827e-05, "loss": 0.0917, "step": 110 }, { "epoch": 0.8759124087591241, "grad_norm": 0.4034636318683624, "learning_rate": 9.962131635422462e-05, "loss": 0.0802, "step": 120 }, { "epoch": 0.948905109489051, "grad_norm": 0.6598319411277771, "learning_rate": 9.945854865229965e-05, "loss": 0.0744, "step": 130 }, { "epoch": 1.0218978102189782, "grad_norm": 0.6162580847740173, "learning_rate": 9.926694293181986e-05, "loss": 0.0808, "step": 140 }, { "epoch": 1.094890510948905, "grad_norm": 0.5971336960792542, "learning_rate": 9.904661091319503e-05, "loss": 0.0765, "step": 150 }, { "epoch": 1.167883211678832, "grad_norm": 0.7087730765342712, "learning_rate": 9.879768106640687e-05, "loss": 0.0713, "step": 160 }, { "epoch": 1.2408759124087592, "grad_norm": 0.6969349980354309, "learning_rate": 9.852029853610148e-05, "loss": 0.0665, "step": 170 }, { "epoch": 1.313868613138686, "grad_norm": 0.6811894178390503, "learning_rate": 9.821462505695917e-05, "loss": 0.0631, "step": 180 }, { "epoch": 1.3868613138686132, "grad_norm": 0.34767240285873413, "learning_rate": 9.788083885939116e-05, "loss": 0.064, "step": 190 }, { "epoch": 1.4598540145985401, "grad_norm": 0.3470080494880676, "learning_rate": 9.75191345656179e-05, "loss": 0.0557, "step": 200 }, { "epoch": 1.5328467153284673, "grad_norm": 0.39414817094802856, "learning_rate": 9.712972307618981e-05, "loss": 0.0627, "step": 210 }, { "epoch": 1.6058394160583942, "grad_norm": 0.5173900723457336, "learning_rate": 9.671283144701663e-05, "loss": 0.0573, "step": 220 }, { "epoch": 1.6788321167883211, "grad_norm": 0.34683912992477417, "learning_rate": 9.626870275697682e-05, "loss": 0.0603, "step": 230 }, { "epoch": 1.7518248175182483, "grad_norm": 0.5500655174255371, "learning_rate": 9.579759596618454e-05, "loss": 0.062, "step": 240 }, { "epoch": 1.8248175182481752, "grad_norm": 0.3647141456604004, "learning_rate": 9.529978576499652e-05, "loss": 0.0504, "step": 250 }, { "epoch": 1.897810218978102, "grad_norm": 0.3856804072856903, "learning_rate": 9.477556241384724e-05, "loss": 0.0543, "step": 260 }, { "epoch": 1.9708029197080292, "grad_norm": 0.35589075088500977, "learning_rate": 9.422523157400533e-05, "loss": 0.0496, "step": 270 }, { "epoch": 2.0437956204379564, "grad_norm": 0.37966451048851013, "learning_rate": 9.36491141293504e-05, "loss": 0.0475, "step": 280 }, { "epoch": 2.116788321167883, "grad_norm": 0.4046926200389862, "learning_rate": 9.304754599927377e-05, "loss": 0.0466, "step": 290 }, { "epoch": 2.18978102189781, "grad_norm": 0.5988451242446899, "learning_rate": 9.242087794281243e-05, "loss": 0.0529, "step": 300 }, { "epoch": 2.2627737226277373, "grad_norm": 0.3989510238170624, "learning_rate": 9.176947535413046e-05, "loss": 0.0509, "step": 310 }, { "epoch": 2.335766423357664, "grad_norm": 0.4888112246990204, "learning_rate": 9.10937180494669e-05, "loss": 0.0405, "step": 320 }, { "epoch": 2.408759124087591, "grad_norm": 0.5207938551902771, "learning_rate": 9.039400004567469e-05, "loss": 0.0472, "step": 330 }, { "epoch": 2.4817518248175183, "grad_norm": 0.41739702224731445, "learning_rate": 8.967072933047945e-05, "loss": 0.0523, "step": 340 }, { "epoch": 2.554744525547445, "grad_norm": 0.5469692349433899, "learning_rate": 8.892432762459221e-05, "loss": 0.0466, "step": 350 }, { "epoch": 2.627737226277372, "grad_norm": 0.44405660033226013, "learning_rate": 8.815523013581488e-05, "loss": 0.0475, "step": 360 }, { "epoch": 2.7007299270072993, "grad_norm": 0.42182251811027527, "learning_rate": 8.736388530528162e-05, "loss": 0.0423, "step": 370 }, { "epoch": 2.7737226277372264, "grad_norm": 0.323812335729599, "learning_rate": 8.655075454598426e-05, "loss": 0.0421, "step": 380 }, { "epoch": 2.846715328467153, "grad_norm": 0.44694793224334717, "learning_rate": 8.571631197373422e-05, "loss": 0.0447, "step": 390 }, { "epoch": 2.9197080291970803, "grad_norm": 0.2432471662759781, "learning_rate": 8.486104413071755e-05, "loss": 0.0438, "step": 400 }, { "epoch": 2.9927007299270074, "grad_norm": 0.42758315801620483, "learning_rate": 8.398544970180469e-05, "loss": 0.0466, "step": 410 }, { "epoch": 3.065693430656934, "grad_norm": 0.3887377679347992, "learning_rate": 8.309003922377996e-05, "loss": 0.04, "step": 420 }, { "epoch": 3.1386861313868613, "grad_norm": 0.33644479513168335, "learning_rate": 8.217533478766068e-05, "loss": 0.0455, "step": 430 }, { "epoch": 3.2116788321167884, "grad_norm": 0.29482313990592957, "learning_rate": 8.124186973427911e-05, "loss": 0.0398, "step": 440 }, { "epoch": 3.2846715328467155, "grad_norm": 0.46486881375312805, "learning_rate": 8.029018834330506e-05, "loss": 0.0432, "step": 450 }, { "epoch": 3.3576642335766422, "grad_norm": 0.4706019163131714, "learning_rate": 7.932084551589027e-05, "loss": 0.036, "step": 460 }, { "epoch": 3.4306569343065694, "grad_norm": 0.502895176410675, "learning_rate": 7.833440645111975e-05, "loss": 0.0372, "step": 470 }, { "epoch": 3.5036496350364965, "grad_norm": 0.37358635663986206, "learning_rate": 7.733144631645852e-05, "loss": 0.0415, "step": 480 }, { "epoch": 3.576642335766423, "grad_norm": 0.38203373551368713, "learning_rate": 7.631254991238621e-05, "loss": 0.0362, "step": 490 }, { "epoch": 3.6496350364963503, "grad_norm": 0.38717857003211975, "learning_rate": 7.527831133141476e-05, "loss": 0.0416, "step": 500 }, { "epoch": 3.7226277372262775, "grad_norm": 0.32451218366622925, "learning_rate": 7.422933361168825e-05, "loss": 0.0378, "step": 510 }, { "epoch": 3.795620437956204, "grad_norm": 0.3965657651424408, "learning_rate": 7.316622838536673e-05, "loss": 0.0374, "step": 520 }, { "epoch": 3.8686131386861313, "grad_norm": 0.5710194110870361, "learning_rate": 7.208961552199913e-05, "loss": 0.0372, "step": 530 }, { "epoch": 3.9416058394160585, "grad_norm": 0.3017335832118988, "learning_rate": 7.100012276709302e-05, "loss": 0.0411, "step": 540 }, { "epoch": 4.014598540145985, "grad_norm": 0.42834600806236267, "learning_rate": 6.98983853760924e-05, "loss": 0.0324, "step": 550 }, { "epoch": 4.087591240875913, "grad_norm": 0.2624529302120209, "learning_rate": 6.878504574397626e-05, "loss": 0.0362, "step": 560 }, { "epoch": 4.160583941605839, "grad_norm": 0.3475762605667114, "learning_rate": 6.766075303069459e-05, "loss": 0.0398, "step": 570 }, { "epoch": 4.233576642335766, "grad_norm": 0.5206601023674011, "learning_rate": 6.65261627826597e-05, "loss": 0.0367, "step": 580 }, { "epoch": 4.306569343065694, "grad_norm": 0.4349361062049866, "learning_rate": 6.538193655051381e-05, "loss": 0.0365, "step": 590 }, { "epoch": 4.37956204379562, "grad_norm": 0.5204529166221619, "learning_rate": 6.422874150339579e-05, "loss": 0.0358, "step": 600 }, { "epoch": 4.452554744525547, "grad_norm": 0.27471956610679626, "learning_rate": 6.30672500399318e-05, "loss": 0.0336, "step": 610 }, { "epoch": 4.525547445255475, "grad_norm": 0.3909718990325928, "learning_rate": 6.189813939617682e-05, "loss": 0.0363, "step": 620 }, { "epoch": 4.598540145985401, "grad_norm": 0.24090121686458588, "learning_rate": 6.072209125073561e-05, "loss": 0.0363, "step": 630 }, { "epoch": 4.671532846715328, "grad_norm": 0.3554544448852539, "learning_rate": 5.95397913272932e-05, "loss": 0.0359, "step": 640 }, { "epoch": 4.744525547445256, "grad_norm": 0.31269019842147827, "learning_rate": 5.8351928994787006e-05, "loss": 0.0405, "step": 650 }, { "epoch": 4.817518248175182, "grad_norm": 0.40918299555778503, "learning_rate": 5.7159196865453294e-05, "loss": 0.0361, "step": 660 }, { "epoch": 4.89051094890511, "grad_norm": 0.522287130355835, "learning_rate": 5.596229039098271e-05, "loss": 0.0336, "step": 670 }, { "epoch": 4.963503649635037, "grad_norm": 0.48848339915275574, "learning_rate": 5.4761907457020077e-05, "loss": 0.0317, "step": 680 }, { "epoch": 5.036496350364963, "grad_norm": 0.4906352162361145, "learning_rate": 5.355874797624515e-05, "loss": 0.0295, "step": 690 }, { "epoch": 5.109489051094891, "grad_norm": 0.24832546710968018, "learning_rate": 5.235351348027129e-05, "loss": 0.0299, "step": 700 }, { "epoch": 5.182481751824818, "grad_norm": 0.3912707567214966, "learning_rate": 5.1146906710600306e-05, "loss": 0.0306, "step": 710 }, { "epoch": 5.255474452554744, "grad_norm": 0.23428483307361603, "learning_rate": 4.993963120887183e-05, "loss": 0.0312, "step": 720 }, { "epoch": 5.328467153284672, "grad_norm": 0.39608365297317505, "learning_rate": 4.8732390906646097e-05, "loss": 0.0323, "step": 730 }, { "epoch": 5.401459854014599, "grad_norm": 0.24625132977962494, "learning_rate": 4.75258897149594e-05, "loss": 0.0346, "step": 740 }, { "epoch": 5.474452554744525, "grad_norm": 0.4109422564506531, "learning_rate": 4.632083111389153e-05, "loss": 0.0305, "step": 750 }, { "epoch": 5.547445255474453, "grad_norm": 0.4575289487838745, "learning_rate": 4.5117917742384456e-05, "loss": 0.0335, "step": 760 }, { "epoch": 5.62043795620438, "grad_norm": 0.3231428265571594, "learning_rate": 4.391785098855156e-05, "loss": 0.032, "step": 770 }, { "epoch": 5.693430656934306, "grad_norm": 0.22695614397525787, "learning_rate": 4.272133058071595e-05, "loss": 0.0341, "step": 780 }, { "epoch": 5.766423357664234, "grad_norm": 0.3635699450969696, "learning_rate": 4.1529054179416875e-05, "loss": 0.0303, "step": 790 }, { "epoch": 5.839416058394161, "grad_norm": 0.44931599497795105, "learning_rate": 4.034171697062157e-05, "loss": 0.0312, "step": 800 }, { "epoch": 5.912408759124087, "grad_norm": 0.38062986731529236, "learning_rate": 3.916001126038008e-05, "loss": 0.0298, "step": 810 }, { "epoch": 5.985401459854015, "grad_norm": 0.44949817657470703, "learning_rate": 3.7984626071159224e-05, "loss": 0.0301, "step": 820 }, { "epoch": 6.0583941605839415, "grad_norm": 0.2569969892501831, "learning_rate": 3.681624674009121e-05, "loss": 0.03, "step": 830 }, { "epoch": 6.131386861313868, "grad_norm": 0.3157390356063843, "learning_rate": 3.5655554519370956e-05, "loss": 0.0303, "step": 840 }, { "epoch": 6.204379562043796, "grad_norm": 0.27934950590133667, "learning_rate": 3.450322617903543e-05, "loss": 0.0281, "step": 850 }, { "epoch": 6.2773722627737225, "grad_norm": 0.2728220224380493, "learning_rate": 3.3359933612356156e-05, "loss": 0.0287, "step": 860 }, { "epoch": 6.350364963503649, "grad_norm": 0.27907413244247437, "learning_rate": 3.2226343444075465e-05, "loss": 0.0268, "step": 870 }, { "epoch": 6.423357664233577, "grad_norm": 0.2804538309574127, "learning_rate": 3.110311664171458e-05, "loss": 0.03, "step": 880 }, { "epoch": 6.4963503649635035, "grad_norm": 0.5569368600845337, "learning_rate": 2.999090813018035e-05, "loss": 0.028, "step": 890 }, { "epoch": 6.569343065693431, "grad_norm": 0.7178159952163696, "learning_rate": 2.8890366409895148e-05, "loss": 0.0325, "step": 900 }, { "epoch": 6.642335766423358, "grad_norm": 0.19943037629127502, "learning_rate": 2.780213317867292e-05, "loss": 0.025, "step": 910 }, { "epoch": 6.7153284671532845, "grad_norm": 0.1796846240758896, "learning_rate": 2.672684295756147e-05, "loss": 0.0257, "step": 920 }, { "epoch": 6.788321167883212, "grad_norm": 0.23872807621955872, "learning_rate": 2.566512272086945e-05, "loss": 0.0304, "step": 930 }, { "epoch": 6.861313868613139, "grad_norm": 0.2793905735015869, "learning_rate": 2.4617591530593613e-05, "loss": 0.0245, "step": 940 }, { "epoch": 6.934306569343065, "grad_norm": 0.2569703161716461, "learning_rate": 2.3584860175459584e-05, "loss": 0.0269, "step": 950 }, { "epoch": 7.007299270072993, "grad_norm": 0.27794763445854187, "learning_rate": 2.2567530814786463e-05, "loss": 0.0287, "step": 960 }, { "epoch": 7.08029197080292, "grad_norm": 0.237870454788208, "learning_rate": 2.156619662738319e-05, "loss": 0.0261, "step": 970 }, { "epoch": 7.153284671532846, "grad_norm": 0.3608488142490387, "learning_rate": 2.0581441465680986e-05, "loss": 0.0259, "step": 980 }, { "epoch": 7.226277372262774, "grad_norm": 0.3045063316822052, "learning_rate": 1.961383951530394e-05, "loss": 0.0286, "step": 990 }, { "epoch": 7.299270072992701, "grad_norm": 0.2131444364786148, "learning_rate": 1.866395496027602e-05, "loss": 0.0254, "step": 1000 }, { "epoch": 7.372262773722627, "grad_norm": 0.25216737389564514, "learning_rate": 1.7732341654059785e-05, "loss": 0.0222, "step": 1010 }, { "epoch": 7.445255474452555, "grad_norm": 0.2963661849498749, "learning_rate": 1.6819542796618487e-05, "loss": 0.0292, "step": 1020 }, { "epoch": 7.518248175182482, "grad_norm": 0.4280974864959717, "learning_rate": 1.592609061769004e-05, "loss": 0.0263, "step": 1030 }, { "epoch": 7.591240875912408, "grad_norm": 0.23306907713413239, "learning_rate": 1.5052506066457461e-05, "loss": 0.0211, "step": 1040 }, { "epoch": 7.664233576642336, "grad_norm": 0.2898852825164795, "learning_rate": 1.4199298507796698e-05, "loss": 0.0209, "step": 1050 }, { "epoch": 7.737226277372263, "grad_norm": 0.22745266556739807, "learning_rate": 1.3366965425278899e-05, "loss": 0.0218, "step": 1060 }, { "epoch": 7.81021897810219, "grad_norm": 0.244761660695076, "learning_rate": 1.2555992131100457e-05, "loss": 0.0213, "step": 1070 }, { "epoch": 7.883211678832117, "grad_norm": 0.3487051725387573, "learning_rate": 1.1766851483109858e-05, "loss": 0.0251, "step": 1080 }, { "epoch": 7.956204379562044, "grad_norm": 0.38351985812187195, "learning_rate": 1.1000003609096337e-05, "loss": 0.0233, "step": 1090 }, { "epoch": 8.02919708029197, "grad_norm": 0.2925453782081604, "learning_rate": 1.0255895638501045e-05, "loss": 0.0258, "step": 1100 }, { "epoch": 8.102189781021897, "grad_norm": 0.2964305281639099, "learning_rate": 9.534961441707307e-06, "loss": 0.0193, "step": 1110 }, { "epoch": 8.175182481751825, "grad_norm": 0.4270758628845215, "learning_rate": 8.837621377061877e-06, "loss": 0.0223, "step": 1120 }, { "epoch": 8.248175182481752, "grad_norm": 0.22104066610336304, "learning_rate": 8.16428204577468e-06, "loss": 0.0217, "step": 1130 }, { "epoch": 8.321167883211679, "grad_norm": 0.24666981399059296, "learning_rate": 7.515336054840022e-06, "loss": 0.0254, "step": 1140 }, { "epoch": 8.394160583941606, "grad_norm": 0.16614589095115662, "learning_rate": 6.8911617881174725e-06, "loss": 0.02, "step": 1150 }, { "epoch": 8.467153284671532, "grad_norm": 0.2146356999874115, "learning_rate": 6.292123185705867e-06, "loss": 0.024, "step": 1160 }, { "epoch": 8.540145985401459, "grad_norm": 0.1602909117937088, "learning_rate": 5.718569531739154e-06, "loss": 0.0267, "step": 1170 }, { "epoch": 8.613138686131387, "grad_norm": 0.4400380849838257, "learning_rate": 5.170835250727663e-06, "loss": 0.0231, "step": 1180 }, { "epoch": 8.686131386861314, "grad_norm": 0.18121108412742615, "learning_rate": 4.6492397125637525e-06, "loss": 0.0192, "step": 1190 }, { "epoch": 8.75912408759124, "grad_norm": 0.26767614483833313, "learning_rate": 4.154087046305322e-06, "loss": 0.0249, "step": 1200 }, { "epoch": 8.832116788321168, "grad_norm": 0.2408941090106964, "learning_rate": 3.6856659628459912e-06, "loss": 0.0214, "step": 1210 }, { "epoch": 8.905109489051094, "grad_norm": 0.2535434663295746, "learning_rate": 3.244249586575038e-06, "loss": 0.0203, "step": 1220 }, { "epoch": 8.978102189781023, "grad_norm": 0.6523481011390686, "learning_rate": 2.830095296125612e-06, "loss": 0.0222, "step": 1230 }, { "epoch": 9.05109489051095, "grad_norm": 0.16764762997627258, "learning_rate": 2.4434445743037713e-06, "loss": 0.0225, "step": 1240 }, { "epoch": 9.124087591240876, "grad_norm": 0.35821276903152466, "learning_rate": 2.0845228672860538e-06, "loss": 0.0206, "step": 1250 }, { "epoch": 9.197080291970803, "grad_norm": 0.38857728242874146, "learning_rate": 1.7535394531675187e-06, "loss": 0.0222, "step": 1260 }, { "epoch": 9.27007299270073, "grad_norm": 0.2351900190114975, "learning_rate": 1.4506873199370497e-06, "loss": 0.021, "step": 1270 }, { "epoch": 9.343065693430656, "grad_norm": 0.33225876092910767, "learning_rate": 1.1761430529509899e-06, "loss": 0.0258, "step": 1280 }, { "epoch": 9.416058394160585, "grad_norm": 0.2640361785888672, "learning_rate": 9.300667319706857e-07, "loss": 0.0202, "step": 1290 }, { "epoch": 9.489051094890511, "grad_norm": 0.15353083610534668, "learning_rate": 7.126018378241062e-07, "loss": 0.0215, "step": 1300 }, { "epoch": 9.562043795620438, "grad_norm": 0.33280253410339355, "learning_rate": 5.238751687458021e-07, "loss": 0.0217, "step": 1310 }, { "epoch": 9.635036496350365, "grad_norm": 0.18690501153469086, "learning_rate": 3.639967664440802e-07, "loss": 0.0194, "step": 1320 }, { "epoch": 9.708029197080291, "grad_norm": 0.6559237837791443, "learning_rate": 2.3305985193852742e-07, "loss": 0.0242, "step": 1330 }, { "epoch": 9.78102189781022, "grad_norm": 0.3035587966442108, "learning_rate": 1.3114077120517376e-07, "loss": 0.0271, "step": 1340 }, { "epoch": 9.854014598540147, "grad_norm": 0.30467328429222107, "learning_rate": 5.8298950661112017e-08, "loss": 0.0187, "step": 1350 }, { "epoch": 9.927007299270073, "grad_norm": 0.22451713681221008, "learning_rate": 1.4576862514487089e-08, "loss": 0.022, "step": 1360 }, { "epoch": 10.0, "grad_norm": 0.33209139108657837, "learning_rate": 0.0, "loss": 0.0227, "step": 1370 }, { "epoch": 10.0, "step": 1370, "total_flos": 1.9634851190645568e+17, "train_loss": 0.05370959477485531, "train_runtime": 1800.9203, "train_samples_per_second": 48.631, "train_steps_per_second": 0.761 } ], "logging_steps": 10, "max_steps": 1370, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.9634851190645568e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }