{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 1000, "global_step": 8503, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005880277549100318, "grad_norm": 387.39825439453125, "learning_rate": 5.875440658049355e-07, "loss": 5.0951, "step": 50 }, { "epoch": 0.011760555098200636, "grad_norm": 66.9789047241211, "learning_rate": 1.175088131609871e-06, "loss": 0.8587, "step": 100 }, { "epoch": 0.01764083264730095, "grad_norm": 11.23346996307373, "learning_rate": 1.762632197414806e-06, "loss": 0.084, "step": 150 }, { "epoch": 0.02352111019640127, "grad_norm": 11.451313018798828, "learning_rate": 2.350176263219742e-06, "loss": 0.051, "step": 200 }, { "epoch": 0.02940138774550159, "grad_norm": 12.508331298828125, "learning_rate": 2.937720329024677e-06, "loss": 0.0592, "step": 250 }, { "epoch": 0.0352816652946019, "grad_norm": 42.7663688659668, "learning_rate": 3.525264394829612e-06, "loss": 0.0519, "step": 300 }, { "epoch": 0.041161942843702226, "grad_norm": 9.149981498718262, "learning_rate": 4.112808460634548e-06, "loss": 0.0418, "step": 350 }, { "epoch": 0.04704222039280254, "grad_norm": 16.077157974243164, "learning_rate": 4.700352526439484e-06, "loss": 0.0665, "step": 400 }, { "epoch": 0.05292249794190286, "grad_norm": 8.89875316619873, "learning_rate": 5.287896592244419e-06, "loss": 0.052, "step": 450 }, { "epoch": 0.05880277549100318, "grad_norm": 10.934527397155762, "learning_rate": 5.875440658049354e-06, "loss": 0.0405, "step": 500 }, { "epoch": 0.0646830530401035, "grad_norm": 5.134288787841797, "learning_rate": 6.46298472385429e-06, "loss": 0.0499, "step": 550 }, { "epoch": 0.0705633305892038, "grad_norm": 2.750032663345337, "learning_rate": 7.050528789659224e-06, "loss": 0.0444, "step": 600 }, { "epoch": 0.07644360813830413, "grad_norm": 3.613896131515503, "learning_rate": 7.63807285546416e-06, "loss": 0.0417, "step": 650 }, { "epoch": 0.08232388568740445, "grad_norm": 5.510785102844238, "learning_rate": 8.225616921269097e-06, "loss": 0.0337, "step": 700 }, { "epoch": 0.08820416323650476, "grad_norm": 21.227962493896484, "learning_rate": 8.81316098707403e-06, "loss": 0.0314, "step": 750 }, { "epoch": 0.09408444078560509, "grad_norm": 11.67798137664795, "learning_rate": 9.400705052878968e-06, "loss": 0.0386, "step": 800 }, { "epoch": 0.0999647183347054, "grad_norm": 0.9770076870918274, "learning_rate": 9.988249118683903e-06, "loss": 0.0322, "step": 850 }, { "epoch": 0.10584499588380572, "grad_norm": 2.259112596511841, "learning_rate": 9.998988263671598e-06, "loss": 0.037, "step": 900 }, { "epoch": 0.11172527343290603, "grad_norm": 5.629507064819336, "learning_rate": 9.995870471854679e-06, "loss": 0.0328, "step": 950 }, { "epoch": 0.11760555098200635, "grad_norm": 2.3545737266540527, "learning_rate": 9.990647516930925e-06, "loss": 0.0331, "step": 1000 }, { "epoch": 0.11760555098200635, "eval_loss": 0.03788302466273308, "eval_runtime": 619.4182, "eval_samples_per_second": 47.064, "eval_steps_per_second": 11.766, "step": 1000 }, { "epoch": 0.12348582853110666, "grad_norm": 1.4001481533050537, "learning_rate": 9.983321599752438e-06, "loss": 0.0359, "step": 1050 }, { "epoch": 0.129366106080207, "grad_norm": 10.03685188293457, "learning_rate": 9.9738958073189e-06, "loss": 0.0306, "step": 1100 }, { "epoch": 0.1352463836293073, "grad_norm": 8.22728443145752, "learning_rate": 9.962374111476778e-06, "loss": 0.031, "step": 1150 }, { "epoch": 0.1411266611784076, "grad_norm": 4.021151065826416, "learning_rate": 9.948761367245665e-06, "loss": 0.0333, "step": 1200 }, { "epoch": 0.14700693872750795, "grad_norm": 4.213313102722168, "learning_rate": 9.933063310772463e-06, "loss": 0.0347, "step": 1250 }, { "epoch": 0.15288721627660826, "grad_norm": 1.7054405212402344, "learning_rate": 9.915286556914286e-06, "loss": 0.0326, "step": 1300 }, { "epoch": 0.15876749382570857, "grad_norm": 3.276946783065796, "learning_rate": 9.89543859645109e-06, "loss": 0.0323, "step": 1350 }, { "epoch": 0.1646477713748089, "grad_norm": 5.019853591918945, "learning_rate": 9.873527792929196e-06, "loss": 0.0274, "step": 1400 }, { "epoch": 0.17052804892390921, "grad_norm": 6.062413692474365, "learning_rate": 9.84956337913706e-06, "loss": 0.0294, "step": 1450 }, { "epoch": 0.17640832647300952, "grad_norm": 4.063215255737305, "learning_rate": 9.82355545321475e-06, "loss": 0.0336, "step": 1500 }, { "epoch": 0.18228860402210983, "grad_norm": 3.2010011672973633, "learning_rate": 9.795514974398789e-06, "loss": 0.0295, "step": 1550 }, { "epoch": 0.18816888157121017, "grad_norm": 4.517515182495117, "learning_rate": 9.765453758404144e-06, "loss": 0.0301, "step": 1600 }, { "epoch": 0.19404915912031048, "grad_norm": 4.797945022583008, "learning_rate": 9.733384472445308e-06, "loss": 0.0309, "step": 1650 }, { "epoch": 0.1999294366694108, "grad_norm": 2.705932378768921, "learning_rate": 9.699320629898589e-06, "loss": 0.026, "step": 1700 }, { "epoch": 0.2058097142185111, "grad_norm": 20.035690307617188, "learning_rate": 9.663276584607831e-06, "loss": 0.0264, "step": 1750 }, { "epoch": 0.21168999176761144, "grad_norm": 1.2530241012573242, "learning_rate": 9.625267524835974e-06, "loss": 0.0277, "step": 1800 }, { "epoch": 0.21757026931671175, "grad_norm": 3.373883008956909, "learning_rate": 9.585309466865029e-06, "loss": 0.0276, "step": 1850 }, { "epoch": 0.22345054686581206, "grad_norm": 5.847758769989014, "learning_rate": 9.54341924824712e-06, "loss": 0.0289, "step": 1900 }, { "epoch": 0.22933082441491237, "grad_norm": 1.0913113355636597, "learning_rate": 9.499614520709457e-06, "loss": 0.0256, "step": 1950 }, { "epoch": 0.2352111019640127, "grad_norm": 4.371528148651123, "learning_rate": 9.453913742716256e-06, "loss": 0.0299, "step": 2000 }, { "epoch": 0.2352111019640127, "eval_loss": 0.026240274310112, "eval_runtime": 619.3143, "eval_samples_per_second": 47.071, "eval_steps_per_second": 11.768, "step": 2000 }, { "epoch": 0.24109137951311302, "grad_norm": 4.8858208656311035, "learning_rate": 9.40633617169069e-06, "loss": 0.0273, "step": 2050 }, { "epoch": 0.24697165706221333, "grad_norm": 1.7557135820388794, "learning_rate": 9.35690185590018e-06, "loss": 0.0265, "step": 2100 }, { "epoch": 0.25285193461131367, "grad_norm": 2.1983985900878906, "learning_rate": 9.305631626008454e-06, "loss": 0.0249, "step": 2150 }, { "epoch": 0.258732212160414, "grad_norm": 4.288529872894287, "learning_rate": 9.252547086297895e-06, "loss": 0.0243, "step": 2200 }, { "epoch": 0.2646124897095143, "grad_norm": 1.228149652481079, "learning_rate": 9.197670605565932e-06, "loss": 0.0271, "step": 2250 }, { "epoch": 0.2704927672586146, "grad_norm": 4.099296569824219, "learning_rate": 9.141025307699246e-06, "loss": 0.0265, "step": 2300 }, { "epoch": 0.2763730448077149, "grad_norm": 3.512291431427002, "learning_rate": 9.082635061929817e-06, "loss": 0.0286, "step": 2350 }, { "epoch": 0.2822533223568152, "grad_norm": 1.2961852550506592, "learning_rate": 9.022524472776897e-06, "loss": 0.0288, "step": 2400 }, { "epoch": 0.2881335999059156, "grad_norm": 4.3626322746276855, "learning_rate": 8.960718869679132e-06, "loss": 0.0252, "step": 2450 }, { "epoch": 0.2940138774550159, "grad_norm": 1.3140321969985962, "learning_rate": 8.89724429632124e-06, "loss": 0.0243, "step": 2500 }, { "epoch": 0.2998941550041162, "grad_norm": 4.2938432693481445, "learning_rate": 8.832127499659687e-06, "loss": 0.0216, "step": 2550 }, { "epoch": 0.3057744325532165, "grad_norm": 2.0306389331817627, "learning_rate": 8.765395918652062e-06, "loss": 0.0245, "step": 2600 }, { "epoch": 0.3116547101023168, "grad_norm": 3.160212993621826, "learning_rate": 8.697077672694809e-06, "loss": 0.0292, "step": 2650 }, { "epoch": 0.31753498765141713, "grad_norm": 8.705582618713379, "learning_rate": 8.627201549774273e-06, "loss": 0.0243, "step": 2700 }, { "epoch": 0.32341526520051744, "grad_norm": 5.516971588134766, "learning_rate": 8.55579699433599e-06, "loss": 0.0272, "step": 2750 }, { "epoch": 0.3292955427496178, "grad_norm": 3.2388885021209717, "learning_rate": 8.482894094877372e-06, "loss": 0.0268, "step": 2800 }, { "epoch": 0.3351758202987181, "grad_norm": 1.1581066846847534, "learning_rate": 8.408523571269e-06, "loss": 0.0251, "step": 2850 }, { "epoch": 0.34105609784781843, "grad_norm": 1.0647870302200317, "learning_rate": 8.332716761809857e-06, "loss": 0.0304, "step": 2900 }, { "epoch": 0.34693637539691874, "grad_norm": 1.6037498712539673, "learning_rate": 8.255505610021981e-06, "loss": 0.0241, "step": 2950 }, { "epoch": 0.35281665294601905, "grad_norm": 3.1624505519866943, "learning_rate": 8.176922651190085e-06, "loss": 0.0251, "step": 3000 }, { "epoch": 0.35281665294601905, "eval_loss": 0.028408875688910484, "eval_runtime": 619.7012, "eval_samples_per_second": 47.042, "eval_steps_per_second": 11.761, "step": 3000 }, { "epoch": 0.35869693049511936, "grad_norm": 4.156674861907959, "learning_rate": 8.097000998651812e-06, "loss": 0.0316, "step": 3050 }, { "epoch": 0.36457720804421967, "grad_norm": 1.2711337804794312, "learning_rate": 8.015774329844417e-06, "loss": 0.0239, "step": 3100 }, { "epoch": 0.37045748559332, "grad_norm": 1.4552150964736938, "learning_rate": 7.933276872113754e-06, "loss": 0.0226, "step": 3150 }, { "epoch": 0.37633776314242035, "grad_norm": 6.432965278625488, "learning_rate": 7.849543388291524e-06, "loss": 0.0256, "step": 3200 }, { "epoch": 0.38221804069152066, "grad_norm": 3.6660749912261963, "learning_rate": 7.764609162046894e-06, "loss": 0.0265, "step": 3250 }, { "epoch": 0.38809831824062097, "grad_norm": 3.6166951656341553, "learning_rate": 7.678509983018656e-06, "loss": 0.0256, "step": 3300 }, { "epoch": 0.3939785957897213, "grad_norm": 8.923437118530273, "learning_rate": 7.591282131734139e-06, "loss": 0.024, "step": 3350 }, { "epoch": 0.3998588733388216, "grad_norm": 3.535114049911499, "learning_rate": 7.50296236432132e-06, "loss": 0.0208, "step": 3400 }, { "epoch": 0.4057391508879219, "grad_norm": 3.5117030143737793, "learning_rate": 7.413587897020496e-06, "loss": 0.0243, "step": 3450 }, { "epoch": 0.4116194284370222, "grad_norm": 1.3753975629806519, "learning_rate": 7.323196390502074e-06, "loss": 0.0213, "step": 3500 }, { "epoch": 0.41749970598612257, "grad_norm": 1.682301640510559, "learning_rate": 7.231825933997105e-06, "loss": 0.0261, "step": 3550 }, { "epoch": 0.4233799835352229, "grad_norm": 4.877748489379883, "learning_rate": 7.139515029247213e-06, "loss": 0.0286, "step": 3600 }, { "epoch": 0.4292602610843232, "grad_norm": 1.995045781135559, "learning_rate": 7.046302574280703e-06, "loss": 0.0244, "step": 3650 }, { "epoch": 0.4351405386334235, "grad_norm": 2.8061089515686035, "learning_rate": 6.952227847021697e-06, "loss": 0.023, "step": 3700 }, { "epoch": 0.4410208161825238, "grad_norm": 1.9934951066970825, "learning_rate": 6.857330488739159e-06, "loss": 0.0224, "step": 3750 }, { "epoch": 0.4469010937316241, "grad_norm": 4.9554290771484375, "learning_rate": 6.76165048734285e-06, "loss": 0.0235, "step": 3800 }, { "epoch": 0.45278137128072443, "grad_norm": 2.459325075149536, "learning_rate": 6.665228160533186e-06, "loss": 0.0251, "step": 3850 }, { "epoch": 0.45866164882982474, "grad_norm": 3.0527613162994385, "learning_rate": 6.568104138812141e-06, "loss": 0.0208, "step": 3900 }, { "epoch": 0.4645419263789251, "grad_norm": 2.7639405727386475, "learning_rate": 6.470319348362344e-06, "loss": 0.0242, "step": 3950 }, { "epoch": 0.4704222039280254, "grad_norm": 1.0516586303710938, "learning_rate": 6.371914993801573e-06, "loss": 0.0213, "step": 4000 }, { "epoch": 0.4704222039280254, "eval_loss": 0.02518468163907528, "eval_runtime": 612.2227, "eval_samples_per_second": 47.617, "eval_steps_per_second": 11.904, "step": 4000 }, { "epoch": 0.4763024814771257, "grad_norm": 2.6117072105407715, "learning_rate": 6.272932540819929e-06, "loss": 0.0201, "step": 4050 }, { "epoch": 0.48218275902622604, "grad_norm": 3.2772061824798584, "learning_rate": 6.173413698706999e-06, "loss": 0.0245, "step": 4100 }, { "epoch": 0.48806303657532635, "grad_norm": 9.428531646728516, "learning_rate": 6.073400402776364e-06, "loss": 0.0229, "step": 4150 }, { "epoch": 0.49394331412442666, "grad_norm": 3.652819871902466, "learning_rate": 5.972934796694871e-06, "loss": 0.0203, "step": 4200 }, { "epoch": 0.49982359167352697, "grad_norm": 3.571290969848633, "learning_rate": 5.872059214724112e-06, "loss": 0.0224, "step": 4250 }, { "epoch": 0.5057038692226273, "grad_norm": 7.393253326416016, "learning_rate": 5.770816163881581e-06, "loss": 0.0233, "step": 4300 }, { "epoch": 0.5115841467717276, "grad_norm": 3.7029826641082764, "learning_rate": 5.669248306029042e-06, "loss": 0.0211, "step": 4350 }, { "epoch": 0.517464424320828, "grad_norm": 1.5849096775054932, "learning_rate": 5.567398439895643e-06, "loss": 0.0228, "step": 4400 }, { "epoch": 0.5233447018699282, "grad_norm": 4.666280269622803, "learning_rate": 5.465309483043364e-06, "loss": 0.019, "step": 4450 }, { "epoch": 0.5292249794190286, "grad_norm": 1.970779299736023, "learning_rate": 5.363024453782388e-06, "loss": 0.0262, "step": 4500 }, { "epoch": 0.5351052569681289, "grad_norm": 3.114126682281494, "learning_rate": 5.260586453044011e-06, "loss": 0.0208, "step": 4550 }, { "epoch": 0.5409855345172292, "grad_norm": 2.33022141456604, "learning_rate": 5.158038646218749e-06, "loss": 0.0248, "step": 4600 }, { "epoch": 0.5468658120663296, "grad_norm": 1.2761564254760742, "learning_rate": 5.055424244967284e-06, "loss": 0.0206, "step": 4650 }, { "epoch": 0.5527460896154298, "grad_norm": 1.679206132888794, "learning_rate": 4.95278648901189e-06, "loss": 0.0202, "step": 4700 }, { "epoch": 0.5586263671645302, "grad_norm": 3.895859479904175, "learning_rate": 4.850168627916068e-06, "loss": 0.023, "step": 4750 }, { "epoch": 0.5645066447136304, "grad_norm": 1.8703380823135376, "learning_rate": 4.7476139028600085e-06, "loss": 0.0243, "step": 4800 }, { "epoch": 0.5703869222627308, "grad_norm": 1.986175298690796, "learning_rate": 4.645165528419598e-06, "loss": 0.0199, "step": 4850 }, { "epoch": 0.5762671998118312, "grad_norm": 1.6648274660110474, "learning_rate": 4.542866674356627e-06, "loss": 0.0224, "step": 4900 }, { "epoch": 0.5821474773609314, "grad_norm": 1.7521028518676758, "learning_rate": 4.440760447427899e-06, "loss": 0.0206, "step": 4950 }, { "epoch": 0.5880277549100318, "grad_norm": 3.9067554473876953, "learning_rate": 4.338889873220875e-06, "loss": 0.0264, "step": 5000 }, { "epoch": 0.5880277549100318, "eval_loss": 0.022188851609826088, "eval_runtime": 615.8241, "eval_samples_per_second": 47.338, "eval_steps_per_second": 11.835, "step": 5000 }, { "epoch": 0.593908032459132, "grad_norm": 2.8073413372039795, "learning_rate": 4.237297878023512e-06, "loss": 0.0238, "step": 5050 }, { "epoch": 0.5997883100082324, "grad_norm": 2.105537176132202, "learning_rate": 4.136027270735971e-06, "loss": 0.0207, "step": 5100 }, { "epoch": 0.6056685875573327, "grad_norm": 3.5026049613952637, "learning_rate": 4.035120724831766e-06, "loss": 0.0208, "step": 5150 }, { "epoch": 0.611548865106433, "grad_norm": 3.762202262878418, "learning_rate": 3.9346207603759966e-06, "loss": 0.0214, "step": 5200 }, { "epoch": 0.6174291426555334, "grad_norm": 2.426508903503418, "learning_rate": 3.834569726108201e-06, "loss": 0.0195, "step": 5250 }, { "epoch": 0.6233094202046336, "grad_norm": 2.668912410736084, "learning_rate": 3.7350097815974395e-06, "loss": 0.0229, "step": 5300 }, { "epoch": 0.629189697753734, "grad_norm": 3.8856732845306396, "learning_rate": 3.6359828794770467e-06, "loss": 0.0253, "step": 5350 }, { "epoch": 0.6350699753028343, "grad_norm": 0.85421222448349, "learning_rate": 3.5375307477666134e-06, "loss": 0.0197, "step": 5400 }, { "epoch": 0.6409502528519346, "grad_norm": 6.5461225509643555, "learning_rate": 3.4396948722886065e-06, "loss": 0.018, "step": 5450 }, { "epoch": 0.6468305304010349, "grad_norm": 4.219273567199707, "learning_rate": 3.342516479187047e-06, "loss": 0.019, "step": 5500 }, { "epoch": 0.6527108079501353, "grad_norm": 2.8508059978485107, "learning_rate": 3.246036517555611e-06, "loss": 0.0218, "step": 5550 }, { "epoch": 0.6585910854992356, "grad_norm": 0.25915786623954773, "learning_rate": 3.1502956421824714e-06, "loss": 0.0172, "step": 5600 }, { "epoch": 0.6644713630483359, "grad_norm": 3.543858766555786, "learning_rate": 3.0553341964191587e-06, "loss": 0.0213, "step": 5650 }, { "epoch": 0.6703516405974362, "grad_norm": 4.268918514251709, "learning_rate": 2.961192195180657e-06, "loss": 0.0175, "step": 5700 }, { "epoch": 0.6762319181465365, "grad_norm": 3.376847982406616, "learning_rate": 2.867909308083885e-06, "loss": 0.0167, "step": 5750 }, { "epoch": 0.6821121956956369, "grad_norm": 3.2170565128326416, "learning_rate": 2.7755248427316976e-06, "loss": 0.0192, "step": 5800 }, { "epoch": 0.6879924732447371, "grad_norm": 4.281452178955078, "learning_rate": 2.68407772814942e-06, "loss": 0.0176, "step": 5850 }, { "epoch": 0.6938727507938375, "grad_norm": 2.3371615409851074, "learning_rate": 2.5936064983808994e-06, "loss": 0.0212, "step": 5900 }, { "epoch": 0.6997530283429377, "grad_norm": 3.5211522579193115, "learning_rate": 2.5041492762510245e-06, "loss": 0.0206, "step": 5950 }, { "epoch": 0.7056333058920381, "grad_norm": 3.2257282733917236, "learning_rate": 2.415743757301486e-06, "loss": 0.0183, "step": 6000 }, { "epoch": 0.7056333058920381, "eval_loss": 0.019069144502282143, "eval_runtime": 622.7936, "eval_samples_per_second": 46.808, "eval_steps_per_second": 11.702, "step": 6000 }, { "epoch": 0.7115135834411385, "grad_norm": 1.6855459213256836, "learning_rate": 2.3284271939066127e-06, "loss": 0.0175, "step": 6050 }, { "epoch": 0.7173938609902387, "grad_norm": 1.7634873390197754, "learning_rate": 2.2422363795759534e-06, "loss": 0.0212, "step": 6100 }, { "epoch": 0.7232741385393391, "grad_norm": 4.85552978515625, "learning_rate": 2.157207633450183e-06, "loss": 0.0192, "step": 6150 }, { "epoch": 0.7291544160884393, "grad_norm": 1.8883658647537231, "learning_rate": 2.073376784996931e-06, "loss": 0.0205, "step": 6200 }, { "epoch": 0.7350346936375397, "grad_norm": 2.118055820465088, "learning_rate": 1.990779158912943e-06, "loss": 0.0179, "step": 6250 }, { "epoch": 0.74091497118664, "grad_norm": 0.919160783290863, "learning_rate": 1.9094495602389235e-06, "loss": 0.0172, "step": 6300 }, { "epoch": 0.7467952487357403, "grad_norm": 1.7510900497436523, "learning_rate": 1.829422259693377e-06, "loss": 0.0186, "step": 6350 }, { "epoch": 0.7526755262848407, "grad_norm": 3.255988597869873, "learning_rate": 1.750730979231588e-06, "loss": 0.0209, "step": 6400 }, { "epoch": 0.758555803833941, "grad_norm": 4.849563121795654, "learning_rate": 1.6734088778358371e-06, "loss": 0.0217, "step": 6450 }, { "epoch": 0.7644360813830413, "grad_norm": 2.414062261581421, "learning_rate": 1.5974885375428494e-06, "loss": 0.0169, "step": 6500 }, { "epoch": 0.7703163589321416, "grad_norm": 2.1367931365966797, "learning_rate": 1.5230019497143633e-06, "loss": 0.0207, "step": 6550 }, { "epoch": 0.7761966364812419, "grad_norm": 5.911067008972168, "learning_rate": 1.4499805015565754e-06, "loss": 0.0158, "step": 6600 }, { "epoch": 0.7820769140303422, "grad_norm": 2.751194953918457, "learning_rate": 1.378454962894193e-06, "loss": 0.0189, "step": 6650 }, { "epoch": 0.7879571915794426, "grad_norm": 2.061358690261841, "learning_rate": 1.308455473204619e-06, "loss": 0.0171, "step": 6700 }, { "epoch": 0.7938374691285429, "grad_norm": 2.2449493408203125, "learning_rate": 1.240011528917756e-06, "loss": 0.019, "step": 6750 }, { "epoch": 0.7997177466776432, "grad_norm": 4.053592205047607, "learning_rate": 1.1731519709867933e-06, "loss": 0.0167, "step": 6800 }, { "epoch": 0.8055980242267435, "grad_norm": 0.6525304317474365, "learning_rate": 1.1079049727351726e-06, "loss": 0.0184, "step": 6850 }, { "epoch": 0.8114783017758438, "grad_norm": 1.0814751386642456, "learning_rate": 1.0442980279849086e-06, "loss": 0.0193, "step": 6900 }, { "epoch": 0.8173585793249442, "grad_norm": 4.694520473480225, "learning_rate": 9.823579394712175e-07, "loss": 0.0195, "step": 6950 }, { "epoch": 0.8232388568740444, "grad_norm": 3.2695419788360596, "learning_rate": 9.221108075483615e-07, "loss": 0.0171, "step": 7000 }, { "epoch": 0.8232388568740444, "eval_loss": 0.017895469442009926, "eval_runtime": 618.1531, "eval_samples_per_second": 47.16, "eval_steps_per_second": 11.79, "step": 7000 }, { "epoch": 0.8291191344231448, "grad_norm": 2.8549704551696777, "learning_rate": 8.63582019191469e-07, "loss": 0.0191, "step": 7050 }, { "epoch": 0.8349994119722451, "grad_norm": 1.8346871137619019, "learning_rate": 8.067962372989563e-07, "loss": 0.0168, "step": 7100 }, { "epoch": 0.8408796895213454, "grad_norm": 1.3449316024780273, "learning_rate": 7.517773903000519e-07, "loss": 0.0199, "step": 7150 }, { "epoch": 0.8467599670704458, "grad_norm": 1.1152535676956177, "learning_rate": 6.98548662071828e-07, "loss": 0.0166, "step": 7200 }, { "epoch": 0.852640244619546, "grad_norm": 2.3504021167755127, "learning_rate": 6.471324821699603e-07, "loss": 0.0156, "step": 7250 }, { "epoch": 0.8585205221686464, "grad_norm": 0.3554767072200775, "learning_rate": 5.975505163773437e-07, "loss": 0.0215, "step": 7300 }, { "epoch": 0.8644007997177466, "grad_norm": 0.13760164380073547, "learning_rate": 5.498236575745564e-07, "loss": 0.0196, "step": 7350 }, { "epoch": 0.870281077266847, "grad_norm": 1.4082413911819458, "learning_rate": 5.039720169360013e-07, "loss": 0.0159, "step": 7400 }, { "epoch": 0.8761613548159473, "grad_norm": 3.8390660285949707, "learning_rate": 4.600149154554501e-07, "loss": 0.0214, "step": 7450 }, { "epoch": 0.8820416323650476, "grad_norm": 3.1669695377349854, "learning_rate": 4.179708758045431e-07, "loss": 0.0181, "step": 7500 }, { "epoch": 0.887921909914148, "grad_norm": 6.687104225158691, "learning_rate": 3.7785761452770295e-07, "loss": 0.02, "step": 7550 }, { "epoch": 0.8938021874632482, "grad_norm": 4.694364070892334, "learning_rate": 3.396920345767185e-07, "loss": 0.0203, "step": 7600 }, { "epoch": 0.8996824650123486, "grad_norm": 2.0906476974487305, "learning_rate": 3.0349021818817326e-07, "loss": 0.0176, "step": 7650 }, { "epoch": 0.9055627425614489, "grad_norm": 3.2009048461914062, "learning_rate": 2.692674201066975e-07, "loss": 0.0189, "step": 7700 }, { "epoch": 0.9114430201105492, "grad_norm": 4.380706310272217, "learning_rate": 2.3703806115691951e-07, "loss": 0.0171, "step": 7750 }, { "epoch": 0.9173232976596495, "grad_norm": 0.9385294318199158, "learning_rate": 2.068157221668049e-07, "loss": 0.0161, "step": 7800 }, { "epoch": 0.9232035752087498, "grad_norm": 2.8777430057525635, "learning_rate": 1.786131382449602e-07, "loss": 0.0232, "step": 7850 }, { "epoch": 0.9290838527578502, "grad_norm": 0.5679008960723877, "learning_rate": 1.5244219341430443e-07, "loss": 0.0197, "step": 7900 }, { "epoch": 0.9349641303069505, "grad_norm": 2.9544992446899414, "learning_rate": 1.2831391560437278e-07, "loss": 0.0187, "step": 7950 }, { "epoch": 0.9408444078560508, "grad_norm": 2.460313081741333, "learning_rate": 1.0623847200435966e-07, "loss": 0.0185, "step": 8000 }, { "epoch": 0.9408444078560508, "eval_loss": 0.01769772544503212, "eval_runtime": 613.4256, "eval_samples_per_second": 47.523, "eval_steps_per_second": 11.881, "step": 8000 }, { "epoch": 0.9467246854051511, "grad_norm": 3.39328670501709, "learning_rate": 8.62251647788609e-08, "loss": 0.019, "step": 8050 }, { "epoch": 0.9526049629542515, "grad_norm": 4.006555080413818, "learning_rate": 6.828242714812527e-08, "loss": 0.0183, "step": 8100 }, { "epoch": 0.9584852405033517, "grad_norm": 2.6548595428466797, "learning_rate": 5.2417819834454374e-08, "loss": 0.0156, "step": 8150 }, { "epoch": 0.9643655180524521, "grad_norm": 1.8154182434082031, "learning_rate": 3.863802787626325e-08, "loss": 0.0175, "step": 8200 }, { "epoch": 0.9702457956015524, "grad_norm": 4.321004867553711, "learning_rate": 2.694885781113432e-08, "loss": 0.0195, "step": 8250 }, { "epoch": 0.9761260731506527, "grad_norm": 2.2515978813171387, "learning_rate": 1.735523522905347e-08, "loss": 0.0182, "step": 8300 }, { "epoch": 0.9820063506997531, "grad_norm": 2.5015530586242676, "learning_rate": 9.861202696864191e-09, "loss": 0.0197, "step": 8350 }, { "epoch": 0.9878866282488533, "grad_norm": 2.121727705001831, "learning_rate": 4.469918054806344e-09, "loss": 0.0156, "step": 8400 }, { "epoch": 0.9937669057979537, "grad_norm": 3.144368886947632, "learning_rate": 1.1836530858633234e-09, "loss": 0.0197, "step": 8450 }, { "epoch": 0.9996471833470539, "grad_norm": 2.387354850769043, "learning_rate": 3.792558477266894e-12, "loss": 0.0191, "step": 8500 }, { "epoch": 1.0, "step": 8503, "total_flos": 1.8987963054248755e+17, "train_loss": 0.0597800020030789, "train_runtime": 12067.3267, "train_samples_per_second": 5.637, "train_steps_per_second": 0.705 } ], "logging_steps": 50, "max_steps": 8503, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8987963054248755e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }