{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 1000,
  "global_step": 8503,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005880277549100318,
      "grad_norm": 387.39825439453125,
      "learning_rate": 5.875440658049355e-07,
      "loss": 5.0951,
      "step": 50
    },
    {
      "epoch": 0.011760555098200636,
      "grad_norm": 66.9789047241211,
      "learning_rate": 1.175088131609871e-06,
      "loss": 0.8587,
      "step": 100
    },
    {
      "epoch": 0.01764083264730095,
      "grad_norm": 11.23346996307373,
      "learning_rate": 1.762632197414806e-06,
      "loss": 0.084,
      "step": 150
    },
    {
      "epoch": 0.02352111019640127,
      "grad_norm": 11.451313018798828,
      "learning_rate": 2.350176263219742e-06,
      "loss": 0.051,
      "step": 200
    },
    {
      "epoch": 0.02940138774550159,
      "grad_norm": 12.508331298828125,
      "learning_rate": 2.937720329024677e-06,
      "loss": 0.0592,
      "step": 250
    },
    {
      "epoch": 0.0352816652946019,
      "grad_norm": 42.7663688659668,
      "learning_rate": 3.525264394829612e-06,
      "loss": 0.0519,
      "step": 300
    },
    {
      "epoch": 0.041161942843702226,
      "grad_norm": 9.149981498718262,
      "learning_rate": 4.112808460634548e-06,
      "loss": 0.0418,
      "step": 350
    },
    {
      "epoch": 0.04704222039280254,
      "grad_norm": 16.077157974243164,
      "learning_rate": 4.700352526439484e-06,
      "loss": 0.0665,
      "step": 400
    },
    {
      "epoch": 0.05292249794190286,
      "grad_norm": 8.89875316619873,
      "learning_rate": 5.287896592244419e-06,
      "loss": 0.052,
      "step": 450
    },
    {
      "epoch": 0.05880277549100318,
      "grad_norm": 10.934527397155762,
      "learning_rate": 5.875440658049354e-06,
      "loss": 0.0405,
      "step": 500
    },
    {
      "epoch": 0.0646830530401035,
      "grad_norm": 5.134288787841797,
      "learning_rate": 6.46298472385429e-06,
      "loss": 0.0499,
      "step": 550
    },
    {
      "epoch": 0.0705633305892038,
      "grad_norm": 2.750032663345337,
      "learning_rate": 7.050528789659224e-06,
      "loss": 0.0444,
      "step": 600
    },
    {
      "epoch": 0.07644360813830413,
      "grad_norm": 3.613896131515503,
      "learning_rate": 7.63807285546416e-06,
      "loss": 0.0417,
      "step": 650
    },
    {
      "epoch": 0.08232388568740445,
      "grad_norm": 5.510785102844238,
      "learning_rate": 8.225616921269097e-06,
      "loss": 0.0337,
      "step": 700
    },
    {
      "epoch": 0.08820416323650476,
      "grad_norm": 21.227962493896484,
      "learning_rate": 8.81316098707403e-06,
      "loss": 0.0314,
      "step": 750
    },
    {
      "epoch": 0.09408444078560509,
      "grad_norm": 11.67798137664795,
      "learning_rate": 9.400705052878968e-06,
      "loss": 0.0386,
      "step": 800
    },
    {
      "epoch": 0.0999647183347054,
      "grad_norm": 0.9770076870918274,
      "learning_rate": 9.988249118683903e-06,
      "loss": 0.0322,
      "step": 850
    },
    {
      "epoch": 0.10584499588380572,
      "grad_norm": 2.259112596511841,
      "learning_rate": 9.998988263671598e-06,
      "loss": 0.037,
      "step": 900
    },
    {
      "epoch": 0.11172527343290603,
      "grad_norm": 5.629507064819336,
      "learning_rate": 9.995870471854679e-06,
      "loss": 0.0328,
      "step": 950
    },
    {
      "epoch": 0.11760555098200635,
      "grad_norm": 2.3545737266540527,
      "learning_rate": 9.990647516930925e-06,
      "loss": 0.0331,
      "step": 1000
    },
    {
      "epoch": 0.11760555098200635,
      "eval_loss": 0.03788302466273308,
      "eval_runtime": 619.4182,
      "eval_samples_per_second": 47.064,
      "eval_steps_per_second": 11.766,
      "step": 1000
    },
    {
      "epoch": 0.12348582853110666,
      "grad_norm": 1.4001481533050537,
      "learning_rate": 9.983321599752438e-06,
      "loss": 0.0359,
      "step": 1050
    },
    {
      "epoch": 0.129366106080207,
      "grad_norm": 10.03685188293457,
      "learning_rate": 9.9738958073189e-06,
      "loss": 0.0306,
      "step": 1100
    },
    {
      "epoch": 0.1352463836293073,
      "grad_norm": 8.22728443145752,
      "learning_rate": 9.962374111476778e-06,
      "loss": 0.031,
      "step": 1150
    },
    {
      "epoch": 0.1411266611784076,
      "grad_norm": 4.021151065826416,
      "learning_rate": 9.948761367245665e-06,
      "loss": 0.0333,
      "step": 1200
    },
    {
      "epoch": 0.14700693872750795,
      "grad_norm": 4.213313102722168,
      "learning_rate": 9.933063310772463e-06,
      "loss": 0.0347,
      "step": 1250
    },
    {
      "epoch": 0.15288721627660826,
      "grad_norm": 1.7054405212402344,
      "learning_rate": 9.915286556914286e-06,
      "loss": 0.0326,
      "step": 1300
    },
    {
      "epoch": 0.15876749382570857,
      "grad_norm": 3.276946783065796,
      "learning_rate": 9.89543859645109e-06,
      "loss": 0.0323,
      "step": 1350
    },
    {
      "epoch": 0.1646477713748089,
      "grad_norm": 5.019853591918945,
      "learning_rate": 9.873527792929196e-06,
      "loss": 0.0274,
      "step": 1400
    },
    {
      "epoch": 0.17052804892390921,
      "grad_norm": 6.062413692474365,
      "learning_rate": 9.84956337913706e-06,
      "loss": 0.0294,
      "step": 1450
    },
    {
      "epoch": 0.17640832647300952,
      "grad_norm": 4.063215255737305,
      "learning_rate": 9.82355545321475e-06,
      "loss": 0.0336,
      "step": 1500
    },
    {
      "epoch": 0.18228860402210983,
      "grad_norm": 3.2010011672973633,
      "learning_rate": 9.795514974398789e-06,
      "loss": 0.0295,
      "step": 1550
    },
    {
      "epoch": 0.18816888157121017,
      "grad_norm": 4.517515182495117,
      "learning_rate": 9.765453758404144e-06,
      "loss": 0.0301,
      "step": 1600
    },
    {
      "epoch": 0.19404915912031048,
      "grad_norm": 4.797945022583008,
      "learning_rate": 9.733384472445308e-06,
      "loss": 0.0309,
      "step": 1650
    },
    {
      "epoch": 0.1999294366694108,
      "grad_norm": 2.705932378768921,
      "learning_rate": 9.699320629898589e-06,
      "loss": 0.026,
      "step": 1700
    },
    {
      "epoch": 0.2058097142185111,
      "grad_norm": 20.035690307617188,
      "learning_rate": 9.663276584607831e-06,
      "loss": 0.0264,
      "step": 1750
    },
    {
      "epoch": 0.21168999176761144,
      "grad_norm": 1.2530241012573242,
      "learning_rate": 9.625267524835974e-06,
      "loss": 0.0277,
      "step": 1800
    },
    {
      "epoch": 0.21757026931671175,
      "grad_norm": 3.373883008956909,
      "learning_rate": 9.585309466865029e-06,
      "loss": 0.0276,
      "step": 1850
    },
    {
      "epoch": 0.22345054686581206,
      "grad_norm": 5.847758769989014,
      "learning_rate": 9.54341924824712e-06,
      "loss": 0.0289,
      "step": 1900
    },
    {
      "epoch": 0.22933082441491237,
      "grad_norm": 1.0913113355636597,
      "learning_rate": 9.499614520709457e-06,
      "loss": 0.0256,
      "step": 1950
    },
    {
      "epoch": 0.2352111019640127,
      "grad_norm": 4.371528148651123,
      "learning_rate": 9.453913742716256e-06,
      "loss": 0.0299,
      "step": 2000
    },
    {
      "epoch": 0.2352111019640127,
      "eval_loss": 0.026240274310112,
      "eval_runtime": 619.3143,
      "eval_samples_per_second": 47.071,
      "eval_steps_per_second": 11.768,
      "step": 2000
    },
    {
      "epoch": 0.24109137951311302,
      "grad_norm": 4.8858208656311035,
      "learning_rate": 9.40633617169069e-06,
      "loss": 0.0273,
      "step": 2050
    },
    {
      "epoch": 0.24697165706221333,
      "grad_norm": 1.7557135820388794,
      "learning_rate": 9.35690185590018e-06,
      "loss": 0.0265,
      "step": 2100
    },
    {
      "epoch": 0.25285193461131367,
      "grad_norm": 2.1983985900878906,
      "learning_rate": 9.305631626008454e-06,
      "loss": 0.0249,
      "step": 2150
    },
    {
      "epoch": 0.258732212160414,
      "grad_norm": 4.288529872894287,
      "learning_rate": 9.252547086297895e-06,
      "loss": 0.0243,
      "step": 2200
    },
    {
      "epoch": 0.2646124897095143,
      "grad_norm": 1.228149652481079,
      "learning_rate": 9.197670605565932e-06,
      "loss": 0.0271,
      "step": 2250
    },
    {
      "epoch": 0.2704927672586146,
      "grad_norm": 4.099296569824219,
      "learning_rate": 9.141025307699246e-06,
      "loss": 0.0265,
      "step": 2300
    },
    {
      "epoch": 0.2763730448077149,
      "grad_norm": 3.512291431427002,
      "learning_rate": 9.082635061929817e-06,
      "loss": 0.0286,
      "step": 2350
    },
    {
      "epoch": 0.2822533223568152,
      "grad_norm": 1.2961852550506592,
      "learning_rate": 9.022524472776897e-06,
      "loss": 0.0288,
      "step": 2400
    },
    {
      "epoch": 0.2881335999059156,
      "grad_norm": 4.3626322746276855,
      "learning_rate": 8.960718869679132e-06,
      "loss": 0.0252,
      "step": 2450
    },
    {
      "epoch": 0.2940138774550159,
      "grad_norm": 1.3140321969985962,
      "learning_rate": 8.89724429632124e-06,
      "loss": 0.0243,
      "step": 2500
    },
    {
      "epoch": 0.2998941550041162,
      "grad_norm": 4.2938432693481445,
      "learning_rate": 8.832127499659687e-06,
      "loss": 0.0216,
      "step": 2550
    },
    {
      "epoch": 0.3057744325532165,
      "grad_norm": 2.0306389331817627,
      "learning_rate": 8.765395918652062e-06,
      "loss": 0.0245,
      "step": 2600
    },
    {
      "epoch": 0.3116547101023168,
      "grad_norm": 3.160212993621826,
      "learning_rate": 8.697077672694809e-06,
      "loss": 0.0292,
      "step": 2650
    },
    {
      "epoch": 0.31753498765141713,
      "grad_norm": 8.705582618713379,
      "learning_rate": 8.627201549774273e-06,
      "loss": 0.0243,
      "step": 2700
    },
    {
      "epoch": 0.32341526520051744,
      "grad_norm": 5.516971588134766,
      "learning_rate": 8.55579699433599e-06,
      "loss": 0.0272,
      "step": 2750
    },
    {
      "epoch": 0.3292955427496178,
      "grad_norm": 3.2388885021209717,
      "learning_rate": 8.482894094877372e-06,
      "loss": 0.0268,
      "step": 2800
    },
    {
      "epoch": 0.3351758202987181,
      "grad_norm": 1.1581066846847534,
      "learning_rate": 8.408523571269e-06,
      "loss": 0.0251,
      "step": 2850
    },
    {
      "epoch": 0.34105609784781843,
      "grad_norm": 1.0647870302200317,
      "learning_rate": 8.332716761809857e-06,
      "loss": 0.0304,
      "step": 2900
    },
    {
      "epoch": 0.34693637539691874,
      "grad_norm": 1.6037498712539673,
      "learning_rate": 8.255505610021981e-06,
      "loss": 0.0241,
      "step": 2950
    },
    {
      "epoch": 0.35281665294601905,
      "grad_norm": 3.1624505519866943,
      "learning_rate": 8.176922651190085e-06,
      "loss": 0.0251,
      "step": 3000
    },
    {
      "epoch": 0.35281665294601905,
      "eval_loss": 0.028408875688910484,
      "eval_runtime": 619.7012,
      "eval_samples_per_second": 47.042,
      "eval_steps_per_second": 11.761,
      "step": 3000
    },
    {
      "epoch": 0.35869693049511936,
      "grad_norm": 4.156674861907959,
      "learning_rate": 8.097000998651812e-06,
      "loss": 0.0316,
      "step": 3050
    },
    {
      "epoch": 0.36457720804421967,
      "grad_norm": 1.2711337804794312,
      "learning_rate": 8.015774329844417e-06,
      "loss": 0.0239,
      "step": 3100
    },
    {
      "epoch": 0.37045748559332,
      "grad_norm": 1.4552150964736938,
      "learning_rate": 7.933276872113754e-06,
      "loss": 0.0226,
      "step": 3150
    },
    {
      "epoch": 0.37633776314242035,
      "grad_norm": 6.432965278625488,
      "learning_rate": 7.849543388291524e-06,
      "loss": 0.0256,
      "step": 3200
    },
    {
      "epoch": 0.38221804069152066,
      "grad_norm": 3.6660749912261963,
      "learning_rate": 7.764609162046894e-06,
      "loss": 0.0265,
      "step": 3250
    },
    {
      "epoch": 0.38809831824062097,
      "grad_norm": 3.6166951656341553,
      "learning_rate": 7.678509983018656e-06,
      "loss": 0.0256,
      "step": 3300
    },
    {
      "epoch": 0.3939785957897213,
      "grad_norm": 8.923437118530273,
      "learning_rate": 7.591282131734139e-06,
      "loss": 0.024,
      "step": 3350
    },
    {
      "epoch": 0.3998588733388216,
      "grad_norm": 3.535114049911499,
      "learning_rate": 7.50296236432132e-06,
      "loss": 0.0208,
      "step": 3400
    },
    {
      "epoch": 0.4057391508879219,
      "grad_norm": 3.5117030143737793,
      "learning_rate": 7.413587897020496e-06,
      "loss": 0.0243,
      "step": 3450
    },
    {
      "epoch": 0.4116194284370222,
      "grad_norm": 1.3753975629806519,
      "learning_rate": 7.323196390502074e-06,
      "loss": 0.0213,
      "step": 3500
    },
    {
      "epoch": 0.41749970598612257,
      "grad_norm": 1.682301640510559,
      "learning_rate": 7.231825933997105e-06,
      "loss": 0.0261,
      "step": 3550
    },
    {
      "epoch": 0.4233799835352229,
      "grad_norm": 4.877748489379883,
      "learning_rate": 7.139515029247213e-06,
      "loss": 0.0286,
      "step": 3600
    },
    {
      "epoch": 0.4292602610843232,
      "grad_norm": 1.995045781135559,
      "learning_rate": 7.046302574280703e-06,
      "loss": 0.0244,
      "step": 3650
    },
    {
      "epoch": 0.4351405386334235,
      "grad_norm": 2.8061089515686035,
      "learning_rate": 6.952227847021697e-06,
      "loss": 0.023,
      "step": 3700
    },
    {
      "epoch": 0.4410208161825238,
      "grad_norm": 1.9934951066970825,
      "learning_rate": 6.857330488739159e-06,
      "loss": 0.0224,
      "step": 3750
    },
    {
      "epoch": 0.4469010937316241,
      "grad_norm": 4.9554290771484375,
      "learning_rate": 6.76165048734285e-06,
      "loss": 0.0235,
      "step": 3800
    },
    {
      "epoch": 0.45278137128072443,
      "grad_norm": 2.459325075149536,
      "learning_rate": 6.665228160533186e-06,
      "loss": 0.0251,
      "step": 3850
    },
    {
      "epoch": 0.45866164882982474,
      "grad_norm": 3.0527613162994385,
      "learning_rate": 6.568104138812141e-06,
      "loss": 0.0208,
      "step": 3900
    },
    {
      "epoch": 0.4645419263789251,
      "grad_norm": 2.7639405727386475,
      "learning_rate": 6.470319348362344e-06,
      "loss": 0.0242,
      "step": 3950
    },
    {
      "epoch": 0.4704222039280254,
      "grad_norm": 1.0516586303710938,
      "learning_rate": 6.371914993801573e-06,
      "loss": 0.0213,
      "step": 4000
    },
    {
      "epoch": 0.4704222039280254,
      "eval_loss": 0.02518468163907528,
      "eval_runtime": 612.2227,
      "eval_samples_per_second": 47.617,
      "eval_steps_per_second": 11.904,
      "step": 4000
    },
    {
      "epoch": 0.4763024814771257,
      "grad_norm": 2.6117072105407715,
      "learning_rate": 6.272932540819929e-06,
      "loss": 0.0201,
      "step": 4050
    },
    {
      "epoch": 0.48218275902622604,
      "grad_norm": 3.2772061824798584,
      "learning_rate": 6.173413698706999e-06,
      "loss": 0.0245,
      "step": 4100
    },
    {
      "epoch": 0.48806303657532635,
      "grad_norm": 9.428531646728516,
      "learning_rate": 6.073400402776364e-06,
      "loss": 0.0229,
      "step": 4150
    },
    {
      "epoch": 0.49394331412442666,
      "grad_norm": 3.652819871902466,
      "learning_rate": 5.972934796694871e-06,
      "loss": 0.0203,
      "step": 4200
    },
    {
      "epoch": 0.49982359167352697,
      "grad_norm": 3.571290969848633,
      "learning_rate": 5.872059214724112e-06,
      "loss": 0.0224,
      "step": 4250
    },
    {
      "epoch": 0.5057038692226273,
      "grad_norm": 7.393253326416016,
      "learning_rate": 5.770816163881581e-06,
      "loss": 0.0233,
      "step": 4300
    },
    {
      "epoch": 0.5115841467717276,
      "grad_norm": 3.7029826641082764,
      "learning_rate": 5.669248306029042e-06,
      "loss": 0.0211,
      "step": 4350
    },
    {
      "epoch": 0.517464424320828,
      "grad_norm": 1.5849096775054932,
      "learning_rate": 5.567398439895643e-06,
      "loss": 0.0228,
      "step": 4400
    },
    {
      "epoch": 0.5233447018699282,
      "grad_norm": 4.666280269622803,
      "learning_rate": 5.465309483043364e-06,
      "loss": 0.019,
      "step": 4450
    },
    {
      "epoch": 0.5292249794190286,
      "grad_norm": 1.970779299736023,
      "learning_rate": 5.363024453782388e-06,
      "loss": 0.0262,
      "step": 4500
    },
    {
      "epoch": 0.5351052569681289,
      "grad_norm": 3.114126682281494,
      "learning_rate": 5.260586453044011e-06,
      "loss": 0.0208,
      "step": 4550
    },
    {
      "epoch": 0.5409855345172292,
      "grad_norm": 2.33022141456604,
      "learning_rate": 5.158038646218749e-06,
      "loss": 0.0248,
      "step": 4600
    },
    {
      "epoch": 0.5468658120663296,
      "grad_norm": 1.2761564254760742,
      "learning_rate": 5.055424244967284e-06,
      "loss": 0.0206,
      "step": 4650
    },
    {
      "epoch": 0.5527460896154298,
      "grad_norm": 1.679206132888794,
      "learning_rate": 4.95278648901189e-06,
      "loss": 0.0202,
      "step": 4700
    },
    {
      "epoch": 0.5586263671645302,
      "grad_norm": 3.895859479904175,
      "learning_rate": 4.850168627916068e-06,
      "loss": 0.023,
      "step": 4750
    },
    {
      "epoch": 0.5645066447136304,
      "grad_norm": 1.8703380823135376,
      "learning_rate": 4.7476139028600085e-06,
      "loss": 0.0243,
      "step": 4800
    },
    {
      "epoch": 0.5703869222627308,
      "grad_norm": 1.986175298690796,
      "learning_rate": 4.645165528419598e-06,
      "loss": 0.0199,
      "step": 4850
    },
    {
      "epoch": 0.5762671998118312,
      "grad_norm": 1.6648274660110474,
      "learning_rate": 4.542866674356627e-06,
      "loss": 0.0224,
      "step": 4900
    },
    {
      "epoch": 0.5821474773609314,
      "grad_norm": 1.7521028518676758,
      "learning_rate": 4.440760447427899e-06,
      "loss": 0.0206,
      "step": 4950
    },
    {
      "epoch": 0.5880277549100318,
      "grad_norm": 3.9067554473876953,
      "learning_rate": 4.338889873220875e-06,
      "loss": 0.0264,
      "step": 5000
    },
    {
      "epoch": 0.5880277549100318,
      "eval_loss": 0.022188851609826088,
      "eval_runtime": 615.8241,
      "eval_samples_per_second": 47.338,
      "eval_steps_per_second": 11.835,
      "step": 5000
    },
    {
      "epoch": 0.593908032459132,
      "grad_norm": 2.8073413372039795,
      "learning_rate": 4.237297878023512e-06,
      "loss": 0.0238,
      "step": 5050
    },
    {
      "epoch": 0.5997883100082324,
      "grad_norm": 2.105537176132202,
      "learning_rate": 4.136027270735971e-06,
      "loss": 0.0207,
      "step": 5100
    },
    {
      "epoch": 0.6056685875573327,
      "grad_norm": 3.5026049613952637,
      "learning_rate": 4.035120724831766e-06,
      "loss": 0.0208,
      "step": 5150
    },
    {
      "epoch": 0.611548865106433,
      "grad_norm": 3.762202262878418,
      "learning_rate": 3.9346207603759966e-06,
      "loss": 0.0214,
      "step": 5200
    },
    {
      "epoch": 0.6174291426555334,
      "grad_norm": 2.426508903503418,
      "learning_rate": 3.834569726108201e-06,
      "loss": 0.0195,
      "step": 5250
    },
    {
      "epoch": 0.6233094202046336,
      "grad_norm": 2.668912410736084,
      "learning_rate": 3.7350097815974395e-06,
      "loss": 0.0229,
      "step": 5300
    },
    {
      "epoch": 0.629189697753734,
      "grad_norm": 3.8856732845306396,
      "learning_rate": 3.6359828794770467e-06,
      "loss": 0.0253,
      "step": 5350
    },
    {
      "epoch": 0.6350699753028343,
      "grad_norm": 0.85421222448349,
      "learning_rate": 3.5375307477666134e-06,
      "loss": 0.0197,
      "step": 5400
    },
    {
      "epoch": 0.6409502528519346,
      "grad_norm": 6.5461225509643555,
      "learning_rate": 3.4396948722886065e-06,
      "loss": 0.018,
      "step": 5450
    },
    {
      "epoch": 0.6468305304010349,
      "grad_norm": 4.219273567199707,
      "learning_rate": 3.342516479187047e-06,
      "loss": 0.019,
      "step": 5500
    },
    {
      "epoch": 0.6527108079501353,
      "grad_norm": 2.8508059978485107,
      "learning_rate": 3.246036517555611e-06,
      "loss": 0.0218,
      "step": 5550
    },
    {
      "epoch": 0.6585910854992356,
      "grad_norm": 0.25915786623954773,
      "learning_rate": 3.1502956421824714e-06,
      "loss": 0.0172,
      "step": 5600
    },
    {
      "epoch": 0.6644713630483359,
      "grad_norm": 3.543858766555786,
      "learning_rate": 3.0553341964191587e-06,
      "loss": 0.0213,
      "step": 5650
    },
    {
      "epoch": 0.6703516405974362,
      "grad_norm": 4.268918514251709,
      "learning_rate": 2.961192195180657e-06,
      "loss": 0.0175,
      "step": 5700
    },
    {
      "epoch": 0.6762319181465365,
      "grad_norm": 3.376847982406616,
      "learning_rate": 2.867909308083885e-06,
      "loss": 0.0167,
      "step": 5750
    },
    {
      "epoch": 0.6821121956956369,
      "grad_norm": 3.2170565128326416,
      "learning_rate": 2.7755248427316976e-06,
      "loss": 0.0192,
      "step": 5800
    },
    {
      "epoch": 0.6879924732447371,
      "grad_norm": 4.281452178955078,
      "learning_rate": 2.68407772814942e-06,
      "loss": 0.0176,
      "step": 5850
    },
    {
      "epoch": 0.6938727507938375,
      "grad_norm": 2.3371615409851074,
      "learning_rate": 2.5936064983808994e-06,
      "loss": 0.0212,
      "step": 5900
    },
    {
      "epoch": 0.6997530283429377,
      "grad_norm": 3.5211522579193115,
      "learning_rate": 2.5041492762510245e-06,
      "loss": 0.0206,
      "step": 5950
    },
    {
      "epoch": 0.7056333058920381,
      "grad_norm": 3.2257282733917236,
      "learning_rate": 2.415743757301486e-06,
      "loss": 0.0183,
      "step": 6000
    },
    {
      "epoch": 0.7056333058920381,
      "eval_loss": 0.019069144502282143,
      "eval_runtime": 622.7936,
      "eval_samples_per_second": 46.808,
      "eval_steps_per_second": 11.702,
      "step": 6000
    },
    {
      "epoch": 0.7115135834411385,
      "grad_norm": 1.6855459213256836,
      "learning_rate": 2.3284271939066127e-06,
      "loss": 0.0175,
      "step": 6050
    },
    {
      "epoch": 0.7173938609902387,
      "grad_norm": 1.7634873390197754,
      "learning_rate": 2.2422363795759534e-06,
      "loss": 0.0212,
      "step": 6100
    },
    {
      "epoch": 0.7232741385393391,
      "grad_norm": 4.85552978515625,
      "learning_rate": 2.157207633450183e-06,
      "loss": 0.0192,
      "step": 6150
    },
    {
      "epoch": 0.7291544160884393,
      "grad_norm": 1.8883658647537231,
      "learning_rate": 2.073376784996931e-06,
      "loss": 0.0205,
      "step": 6200
    },
    {
      "epoch": 0.7350346936375397,
      "grad_norm": 2.118055820465088,
      "learning_rate": 1.990779158912943e-06,
      "loss": 0.0179,
      "step": 6250
    },
    {
      "epoch": 0.74091497118664,
      "grad_norm": 0.919160783290863,
      "learning_rate": 1.9094495602389235e-06,
      "loss": 0.0172,
      "step": 6300
    },
    {
      "epoch": 0.7467952487357403,
      "grad_norm": 1.7510900497436523,
      "learning_rate": 1.829422259693377e-06,
      "loss": 0.0186,
      "step": 6350
    },
    {
      "epoch": 0.7526755262848407,
      "grad_norm": 3.255988597869873,
      "learning_rate": 1.750730979231588e-06,
      "loss": 0.0209,
      "step": 6400
    },
    {
      "epoch": 0.758555803833941,
      "grad_norm": 4.849563121795654,
      "learning_rate": 1.6734088778358371e-06,
      "loss": 0.0217,
      "step": 6450
    },
    {
      "epoch": 0.7644360813830413,
      "grad_norm": 2.414062261581421,
      "learning_rate": 1.5974885375428494e-06,
      "loss": 0.0169,
      "step": 6500
    },
    {
      "epoch": 0.7703163589321416,
      "grad_norm": 2.1367931365966797,
      "learning_rate": 1.5230019497143633e-06,
      "loss": 0.0207,
      "step": 6550
    },
    {
      "epoch": 0.7761966364812419,
      "grad_norm": 5.911067008972168,
      "learning_rate": 1.4499805015565754e-06,
      "loss": 0.0158,
      "step": 6600
    },
    {
      "epoch": 0.7820769140303422,
      "grad_norm": 2.751194953918457,
      "learning_rate": 1.378454962894193e-06,
      "loss": 0.0189,
      "step": 6650
    },
    {
      "epoch": 0.7879571915794426,
      "grad_norm": 2.061358690261841,
      "learning_rate": 1.308455473204619e-06,
      "loss": 0.0171,
      "step": 6700
    },
    {
      "epoch": 0.7938374691285429,
      "grad_norm": 2.2449493408203125,
      "learning_rate": 1.240011528917756e-06,
      "loss": 0.019,
      "step": 6750
    },
    {
      "epoch": 0.7997177466776432,
      "grad_norm": 4.053592205047607,
      "learning_rate": 1.1731519709867933e-06,
      "loss": 0.0167,
      "step": 6800
    },
    {
      "epoch": 0.8055980242267435,
      "grad_norm": 0.6525304317474365,
      "learning_rate": 1.1079049727351726e-06,
      "loss": 0.0184,
      "step": 6850
    },
    {
      "epoch": 0.8114783017758438,
      "grad_norm": 1.0814751386642456,
      "learning_rate": 1.0442980279849086e-06,
      "loss": 0.0193,
      "step": 6900
    },
    {
      "epoch": 0.8173585793249442,
      "grad_norm": 4.694520473480225,
      "learning_rate": 9.823579394712175e-07,
      "loss": 0.0195,
      "step": 6950
    },
    {
      "epoch": 0.8232388568740444,
      "grad_norm": 3.2695419788360596,
      "learning_rate": 9.221108075483615e-07,
      "loss": 0.0171,
      "step": 7000
    },
    {
      "epoch": 0.8232388568740444,
      "eval_loss": 0.017895469442009926,
      "eval_runtime": 618.1531,
      "eval_samples_per_second": 47.16,
      "eval_steps_per_second": 11.79,
      "step": 7000
    },
    {
      "epoch": 0.8291191344231448,
      "grad_norm": 2.8549704551696777,
      "learning_rate": 8.63582019191469e-07,
      "loss": 0.0191,
      "step": 7050
    },
    {
      "epoch": 0.8349994119722451,
      "grad_norm": 1.8346871137619019,
      "learning_rate": 8.067962372989563e-07,
      "loss": 0.0168,
      "step": 7100
    },
    {
      "epoch": 0.8408796895213454,
      "grad_norm": 1.3449316024780273,
      "learning_rate": 7.517773903000519e-07,
      "loss": 0.0199,
      "step": 7150
    },
    {
      "epoch": 0.8467599670704458,
      "grad_norm": 1.1152535676956177,
      "learning_rate": 6.98548662071828e-07,
      "loss": 0.0166,
      "step": 7200
    },
    {
      "epoch": 0.852640244619546,
      "grad_norm": 2.3504021167755127,
      "learning_rate": 6.471324821699603e-07,
      "loss": 0.0156,
      "step": 7250
    },
    {
      "epoch": 0.8585205221686464,
      "grad_norm": 0.3554767072200775,
      "learning_rate": 5.975505163773437e-07,
      "loss": 0.0215,
      "step": 7300
    },
    {
      "epoch": 0.8644007997177466,
      "grad_norm": 0.13760164380073547,
      "learning_rate": 5.498236575745564e-07,
      "loss": 0.0196,
      "step": 7350
    },
    {
      "epoch": 0.870281077266847,
      "grad_norm": 1.4082413911819458,
      "learning_rate": 5.039720169360013e-07,
      "loss": 0.0159,
      "step": 7400
    },
    {
      "epoch": 0.8761613548159473,
      "grad_norm": 3.8390660285949707,
      "learning_rate": 4.600149154554501e-07,
      "loss": 0.0214,
      "step": 7450
    },
    {
      "epoch": 0.8820416323650476,
      "grad_norm": 3.1669695377349854,
      "learning_rate": 4.179708758045431e-07,
      "loss": 0.0181,
      "step": 7500
    },
    {
      "epoch": 0.887921909914148,
      "grad_norm": 6.687104225158691,
      "learning_rate": 3.7785761452770295e-07,
      "loss": 0.02,
      "step": 7550
    },
    {
      "epoch": 0.8938021874632482,
      "grad_norm": 4.694364070892334,
      "learning_rate": 3.396920345767185e-07,
      "loss": 0.0203,
      "step": 7600
    },
    {
      "epoch": 0.8996824650123486,
      "grad_norm": 2.0906476974487305,
      "learning_rate": 3.0349021818817326e-07,
      "loss": 0.0176,
      "step": 7650
    },
    {
      "epoch": 0.9055627425614489,
      "grad_norm": 3.2009048461914062,
      "learning_rate": 2.692674201066975e-07,
      "loss": 0.0189,
      "step": 7700
    },
    {
      "epoch": 0.9114430201105492,
      "grad_norm": 4.380706310272217,
      "learning_rate": 2.3703806115691951e-07,
      "loss": 0.0171,
      "step": 7750
    },
    {
      "epoch": 0.9173232976596495,
      "grad_norm": 0.9385294318199158,
      "learning_rate": 2.068157221668049e-07,
      "loss": 0.0161,
      "step": 7800
    },
    {
      "epoch": 0.9232035752087498,
      "grad_norm": 2.8777430057525635,
      "learning_rate": 1.786131382449602e-07,
      "loss": 0.0232,
      "step": 7850
    },
    {
      "epoch": 0.9290838527578502,
      "grad_norm": 0.5679008960723877,
      "learning_rate": 1.5244219341430443e-07,
      "loss": 0.0197,
      "step": 7900
    },
    {
      "epoch": 0.9349641303069505,
      "grad_norm": 2.9544992446899414,
      "learning_rate": 1.2831391560437278e-07,
      "loss": 0.0187,
      "step": 7950
    },
    {
      "epoch": 0.9408444078560508,
      "grad_norm": 2.460313081741333,
      "learning_rate": 1.0623847200435966e-07,
      "loss": 0.0185,
      "step": 8000
    },
    {
      "epoch": 0.9408444078560508,
      "eval_loss": 0.01769772544503212,
      "eval_runtime": 613.4256,
      "eval_samples_per_second": 47.523,
      "eval_steps_per_second": 11.881,
      "step": 8000
    },
    {
      "epoch": 0.9467246854051511,
      "grad_norm": 3.39328670501709,
      "learning_rate": 8.62251647788609e-08,
      "loss": 0.019,
      "step": 8050
    },
    {
      "epoch": 0.9526049629542515,
      "grad_norm": 4.006555080413818,
      "learning_rate": 6.828242714812527e-08,
      "loss": 0.0183,
      "step": 8100
    },
    {
      "epoch": 0.9584852405033517,
      "grad_norm": 2.6548595428466797,
      "learning_rate": 5.2417819834454374e-08,
      "loss": 0.0156,
      "step": 8150
    },
    {
      "epoch": 0.9643655180524521,
      "grad_norm": 1.8154182434082031,
      "learning_rate": 3.863802787626325e-08,
      "loss": 0.0175,
      "step": 8200
    },
    {
      "epoch": 0.9702457956015524,
      "grad_norm": 4.321004867553711,
      "learning_rate": 2.694885781113432e-08,
      "loss": 0.0195,
      "step": 8250
    },
    {
      "epoch": 0.9761260731506527,
      "grad_norm": 2.2515978813171387,
      "learning_rate": 1.735523522905347e-08,
      "loss": 0.0182,
      "step": 8300
    },
    {
      "epoch": 0.9820063506997531,
      "grad_norm": 2.5015530586242676,
      "learning_rate": 9.861202696864191e-09,
      "loss": 0.0197,
      "step": 8350
    },
    {
      "epoch": 0.9878866282488533,
      "grad_norm": 2.121727705001831,
      "learning_rate": 4.469918054806344e-09,
      "loss": 0.0156,
      "step": 8400
    },
    {
      "epoch": 0.9937669057979537,
      "grad_norm": 3.144368886947632,
      "learning_rate": 1.1836530858633234e-09,
      "loss": 0.0197,
      "step": 8450
    },
    {
      "epoch": 0.9996471833470539,
      "grad_norm": 2.387354850769043,
      "learning_rate": 3.792558477266894e-12,
      "loss": 0.0191,
      "step": 8500
    },
    {
      "epoch": 1.0,
      "step": 8503,
      "total_flos": 1.8987963054248755e+17,
      "train_loss": 0.0597800020030789,
      "train_runtime": 12067.3267,
      "train_samples_per_second": 5.637,
      "train_steps_per_second": 0.705
    }
  ],
  "logging_steps": 50,
  "max_steps": 8503,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.8987963054248755e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}