|
{ |
|
"best_metric": 3.1832265853881836, |
|
"best_model_checkpoint": "./models/lora-finetuning/LLaMmlein_120M/checkpoint-118000", |
|
"epoch": 2.3397676944931898, |
|
"eval_steps": 1000, |
|
"global_step": 140000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.008356313194618534, |
|
"grad_norm": 40.111167907714844, |
|
"learning_rate": 4.96e-05, |
|
"loss": 4.8904, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.016712626389237067, |
|
"grad_norm": 22.074819564819336, |
|
"learning_rate": 4.986173570570655e-05, |
|
"loss": 4.4642, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.016712626389237067, |
|
"eval_loss": 4.3232102394104, |
|
"eval_runtime": 27.5175, |
|
"eval_samples_per_second": 162.152, |
|
"eval_steps_per_second": 20.278, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.025068939583855605, |
|
"grad_norm": 19.863744735717773, |
|
"learning_rate": 4.972207480237983e-05, |
|
"loss": 4.3112, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.033425252778474135, |
|
"grad_norm": 19.56609344482422, |
|
"learning_rate": 4.958241389905311e-05, |
|
"loss": 4.2377, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.033425252778474135, |
|
"eval_loss": 4.127670764923096, |
|
"eval_runtime": 27.7514, |
|
"eval_samples_per_second": 160.785, |
|
"eval_steps_per_second": 20.107, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.04178156597309267, |
|
"grad_norm": 19.212339401245117, |
|
"learning_rate": 4.944275299572638e-05, |
|
"loss": 4.1323, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.05013787916771121, |
|
"grad_norm": 16.833269119262695, |
|
"learning_rate": 4.930309209239966e-05, |
|
"loss": 4.0461, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.05013787916771121, |
|
"eval_loss": 4.040640354156494, |
|
"eval_runtime": 27.5887, |
|
"eval_samples_per_second": 161.733, |
|
"eval_steps_per_second": 20.226, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.05849419236232974, |
|
"grad_norm": 12.117938995361328, |
|
"learning_rate": 4.9163431189072935e-05, |
|
"loss": 4.0848, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.06685050555694827, |
|
"grad_norm": 16.871612548828125, |
|
"learning_rate": 4.9023770285746213e-05, |
|
"loss": 4.0421, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.06685050555694827, |
|
"eval_loss": 3.966156005859375, |
|
"eval_runtime": 27.5488, |
|
"eval_samples_per_second": 161.967, |
|
"eval_steps_per_second": 20.255, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.0752068187515668, |
|
"grad_norm": 23.424396514892578, |
|
"learning_rate": 4.8884109382419485e-05, |
|
"loss": 3.9967, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.08356313194618534, |
|
"grad_norm": 17.837812423706055, |
|
"learning_rate": 4.8744448479092763e-05, |
|
"loss": 3.9343, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.08356313194618534, |
|
"eval_loss": 3.9122862815856934, |
|
"eval_runtime": 27.6483, |
|
"eval_samples_per_second": 161.384, |
|
"eval_steps_per_second": 20.182, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.09191944514080387, |
|
"grad_norm": 19.62445640563965, |
|
"learning_rate": 4.860478757576604e-05, |
|
"loss": 3.9297, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.10027575833542242, |
|
"grad_norm": 16.499317169189453, |
|
"learning_rate": 4.846512667243932e-05, |
|
"loss": 3.8862, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.10027575833542242, |
|
"eval_loss": 3.8697621822357178, |
|
"eval_runtime": 30.2568, |
|
"eval_samples_per_second": 147.471, |
|
"eval_steps_per_second": 18.442, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.10863207153004095, |
|
"grad_norm": 22.481382369995117, |
|
"learning_rate": 4.83254657691126e-05, |
|
"loss": 3.8479, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.11698838472465949, |
|
"grad_norm": 15.390802383422852, |
|
"learning_rate": 4.818580486578587e-05, |
|
"loss": 3.8934, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.11698838472465949, |
|
"eval_loss": 3.8397133350372314, |
|
"eval_runtime": 27.593, |
|
"eval_samples_per_second": 161.708, |
|
"eval_steps_per_second": 20.223, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.12534469791927802, |
|
"grad_norm": 16.615388870239258, |
|
"learning_rate": 4.804670260607246e-05, |
|
"loss": 3.8718, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.13370101111389654, |
|
"grad_norm": 16.377056121826172, |
|
"learning_rate": 4.7907041702745736e-05, |
|
"loss": 3.813, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.13370101111389654, |
|
"eval_loss": 3.8209779262542725, |
|
"eval_runtime": 27.534, |
|
"eval_samples_per_second": 162.054, |
|
"eval_steps_per_second": 20.266, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.1420573243085151, |
|
"grad_norm": 20.620891571044922, |
|
"learning_rate": 4.7767380799419014e-05, |
|
"loss": 3.8194, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.1504136375031336, |
|
"grad_norm": 14.788801193237305, |
|
"learning_rate": 4.7627719896092286e-05, |
|
"loss": 3.8246, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.1504136375031336, |
|
"eval_loss": 3.7902626991271973, |
|
"eval_runtime": 27.5751, |
|
"eval_samples_per_second": 161.812, |
|
"eval_steps_per_second": 20.236, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.15876995069775215, |
|
"grad_norm": 12.923628807067871, |
|
"learning_rate": 4.748833831457222e-05, |
|
"loss": 3.7739, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.16712626389237067, |
|
"grad_norm": 15.342278480529785, |
|
"learning_rate": 4.7348677411245494e-05, |
|
"loss": 3.771, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.16712626389237067, |
|
"eval_loss": 3.7538962364196777, |
|
"eval_runtime": 27.5894, |
|
"eval_samples_per_second": 161.729, |
|
"eval_steps_per_second": 20.225, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.17548257708698922, |
|
"grad_norm": 14.153196334838867, |
|
"learning_rate": 4.720901650791877e-05, |
|
"loss": 3.7321, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.18383889028160774, |
|
"grad_norm": 29.715482711791992, |
|
"learning_rate": 4.706935560459205e-05, |
|
"loss": 3.7219, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.18383889028160774, |
|
"eval_loss": 3.736612558364868, |
|
"eval_runtime": 27.6054, |
|
"eval_samples_per_second": 161.635, |
|
"eval_steps_per_second": 20.213, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.1921952034762263, |
|
"grad_norm": 22.01296043395996, |
|
"learning_rate": 4.692969470126533e-05, |
|
"loss": 3.7397, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.20055151667084484, |
|
"grad_norm": 12.3303804397583, |
|
"learning_rate": 4.679003379793861e-05, |
|
"loss": 3.731, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.20055151667084484, |
|
"eval_loss": 3.7091643810272217, |
|
"eval_runtime": 27.5167, |
|
"eval_samples_per_second": 162.156, |
|
"eval_steps_per_second": 20.279, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.20890782986546336, |
|
"grad_norm": 17.74376106262207, |
|
"learning_rate": 4.6650372894611886e-05, |
|
"loss": 3.7524, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.2172641430600819, |
|
"grad_norm": 11.327990531921387, |
|
"learning_rate": 4.6510711991285164e-05, |
|
"loss": 3.6842, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.2172641430600819, |
|
"eval_loss": 3.7014827728271484, |
|
"eval_runtime": 27.5638, |
|
"eval_samples_per_second": 161.879, |
|
"eval_steps_per_second": 20.244, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.22562045625470042, |
|
"grad_norm": 10.342738151550293, |
|
"learning_rate": 4.637133040976509e-05, |
|
"loss": 3.672, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.23397676944931897, |
|
"grad_norm": 14.019166946411133, |
|
"learning_rate": 4.623166950643837e-05, |
|
"loss": 3.6743, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.23397676944931897, |
|
"eval_loss": 3.680652618408203, |
|
"eval_runtime": 27.7236, |
|
"eval_samples_per_second": 160.946, |
|
"eval_steps_per_second": 20.127, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.2423330826439375, |
|
"grad_norm": 12.447026252746582, |
|
"learning_rate": 4.609200860311165e-05, |
|
"loss": 3.68, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.25068939583855604, |
|
"grad_norm": 16.875900268554688, |
|
"learning_rate": 4.595234769978493e-05, |
|
"loss": 3.6755, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.25068939583855604, |
|
"eval_loss": 3.656320095062256, |
|
"eval_runtime": 27.5417, |
|
"eval_samples_per_second": 162.009, |
|
"eval_steps_per_second": 20.26, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.2590457090331746, |
|
"grad_norm": 14.400089263916016, |
|
"learning_rate": 4.581268679645821e-05, |
|
"loss": 3.668, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.2674020222277931, |
|
"grad_norm": 17.918594360351562, |
|
"learning_rate": 4.567302589313148e-05, |
|
"loss": 3.6105, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.2674020222277931, |
|
"eval_loss": 3.6530709266662598, |
|
"eval_runtime": 27.5371, |
|
"eval_samples_per_second": 162.036, |
|
"eval_steps_per_second": 20.264, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.2757583354224116, |
|
"grad_norm": 25.155494689941406, |
|
"learning_rate": 4.5533644311611415e-05, |
|
"loss": 3.6537, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.2841146486170302, |
|
"grad_norm": 19.406333923339844, |
|
"learning_rate": 4.5393983408284686e-05, |
|
"loss": 3.6321, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.2841146486170302, |
|
"eval_loss": 3.625420331954956, |
|
"eval_runtime": 27.6545, |
|
"eval_samples_per_second": 161.348, |
|
"eval_steps_per_second": 20.178, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.2924709618116487, |
|
"grad_norm": 16.296859741210938, |
|
"learning_rate": 4.5254322504957965e-05, |
|
"loss": 3.5798, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.3008272750062672, |
|
"grad_norm": 14.436009407043457, |
|
"learning_rate": 4.511466160163124e-05, |
|
"loss": 3.5772, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.3008272750062672, |
|
"eval_loss": 3.609403371810913, |
|
"eval_runtime": 27.6116, |
|
"eval_samples_per_second": 161.599, |
|
"eval_steps_per_second": 20.209, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.30918358820088576, |
|
"grad_norm": 14.975789070129395, |
|
"learning_rate": 4.497500069830452e-05, |
|
"loss": 3.5887, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.3175399013955043, |
|
"grad_norm": 13.919900894165039, |
|
"learning_rate": 4.48353397949778e-05, |
|
"loss": 3.5855, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.3175399013955043, |
|
"eval_loss": 3.593752384185791, |
|
"eval_runtime": 27.5247, |
|
"eval_samples_per_second": 162.109, |
|
"eval_steps_per_second": 20.273, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.32589621459012286, |
|
"grad_norm": 13.113483428955078, |
|
"learning_rate": 4.469567889165107e-05, |
|
"loss": 3.5958, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.33425252778474135, |
|
"grad_norm": 16.182727813720703, |
|
"learning_rate": 4.455601798832435e-05, |
|
"loss": 3.5544, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.33425252778474135, |
|
"eval_loss": 3.5966029167175293, |
|
"eval_runtime": 27.5874, |
|
"eval_samples_per_second": 161.741, |
|
"eval_steps_per_second": 20.227, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.3426088409793599, |
|
"grad_norm": 11.520635604858398, |
|
"learning_rate": 4.441635708499763e-05, |
|
"loss": 3.5985, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.35096515417397844, |
|
"grad_norm": 9.132452011108398, |
|
"learning_rate": 4.4276696181670906e-05, |
|
"loss": 3.5385, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.35096515417397844, |
|
"eval_loss": 3.5922670364379883, |
|
"eval_runtime": 27.5592, |
|
"eval_samples_per_second": 161.906, |
|
"eval_steps_per_second": 20.247, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.359321467368597, |
|
"grad_norm": 11.077112197875977, |
|
"learning_rate": 4.413703527834418e-05, |
|
"loss": 3.5502, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.3676777805632155, |
|
"grad_norm": 10.871437072753906, |
|
"learning_rate": 4.3997653696824114e-05, |
|
"loss": 3.5683, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.3676777805632155, |
|
"eval_loss": 3.570159673690796, |
|
"eval_runtime": 27.5952, |
|
"eval_samples_per_second": 161.695, |
|
"eval_steps_per_second": 20.221, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.37603409375783403, |
|
"grad_norm": 14.359475135803223, |
|
"learning_rate": 4.3858272115304044e-05, |
|
"loss": 3.6252, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.3843904069524526, |
|
"grad_norm": 14.215445518493652, |
|
"learning_rate": 4.371861121197732e-05, |
|
"loss": 3.5357, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.3843904069524526, |
|
"eval_loss": 3.5666186809539795, |
|
"eval_runtime": 27.4876, |
|
"eval_samples_per_second": 162.328, |
|
"eval_steps_per_second": 20.3, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.3927467201470711, |
|
"grad_norm": 12.356164932250977, |
|
"learning_rate": 4.3578950308650594e-05, |
|
"loss": 3.5561, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.4011030333416897, |
|
"grad_norm": 13.857044219970703, |
|
"learning_rate": 4.343956872713053e-05, |
|
"loss": 3.5108, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.4011030333416897, |
|
"eval_loss": 3.553119659423828, |
|
"eval_runtime": 27.5959, |
|
"eval_samples_per_second": 161.691, |
|
"eval_steps_per_second": 20.22, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.40945934653630817, |
|
"grad_norm": 18.891963958740234, |
|
"learning_rate": 4.329990782380381e-05, |
|
"loss": 3.5144, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.4178156597309267, |
|
"grad_norm": 11.144295692443848, |
|
"learning_rate": 4.316024692047708e-05, |
|
"loss": 3.4772, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.4178156597309267, |
|
"eval_loss": 3.539018154144287, |
|
"eval_runtime": 27.6102, |
|
"eval_samples_per_second": 161.607, |
|
"eval_steps_per_second": 20.21, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.42617197292554526, |
|
"grad_norm": 13.266519546508789, |
|
"learning_rate": 4.302058601715036e-05, |
|
"loss": 3.4983, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.4345282861201638, |
|
"grad_norm": 13.068937301635742, |
|
"learning_rate": 4.288092511382364e-05, |
|
"loss": 3.5372, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.4345282861201638, |
|
"eval_loss": 3.5329203605651855, |
|
"eval_runtime": 27.5669, |
|
"eval_samples_per_second": 161.861, |
|
"eval_steps_per_second": 20.242, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.4428845993147823, |
|
"grad_norm": 10.656412124633789, |
|
"learning_rate": 4.2741543532303566e-05, |
|
"loss": 3.4773, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.45124091250940085, |
|
"grad_norm": 13.304460525512695, |
|
"learning_rate": 4.2601882628976845e-05, |
|
"loss": 3.5119, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.45124091250940085, |
|
"eval_loss": 3.5271201133728027, |
|
"eval_runtime": 27.5897, |
|
"eval_samples_per_second": 161.727, |
|
"eval_steps_per_second": 20.225, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.4595972257040194, |
|
"grad_norm": 9.64598560333252, |
|
"learning_rate": 4.246222172565012e-05, |
|
"loss": 3.4904, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.46795353889863794, |
|
"grad_norm": 18.846969604492188, |
|
"learning_rate": 4.23225608223234e-05, |
|
"loss": 3.4627, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.46795353889863794, |
|
"eval_loss": 3.52040433883667, |
|
"eval_runtime": 27.5839, |
|
"eval_samples_per_second": 161.761, |
|
"eval_steps_per_second": 20.229, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.47630985209325644, |
|
"grad_norm": 11.906749725341797, |
|
"learning_rate": 4.218289991899668e-05, |
|
"loss": 3.4874, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.484666165287875, |
|
"grad_norm": 11.39976692199707, |
|
"learning_rate": 4.204323901566996e-05, |
|
"loss": 3.5089, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.484666165287875, |
|
"eval_loss": 3.5191400051116943, |
|
"eval_runtime": 27.5537, |
|
"eval_samples_per_second": 161.938, |
|
"eval_steps_per_second": 20.251, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.49302247848249353, |
|
"grad_norm": 16.590219497680664, |
|
"learning_rate": 4.190385743414989e-05, |
|
"loss": 3.4669, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.5013787916771121, |
|
"grad_norm": 12.839526176452637, |
|
"learning_rate": 4.1764196530823166e-05, |
|
"loss": 3.5124, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.5013787916771121, |
|
"eval_loss": 3.5098681449890137, |
|
"eval_runtime": 27.52, |
|
"eval_samples_per_second": 162.137, |
|
"eval_steps_per_second": 20.276, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.5097351048717306, |
|
"grad_norm": 15.809226989746094, |
|
"learning_rate": 4.1624535627496444e-05, |
|
"loss": 3.4721, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.5180914180663492, |
|
"grad_norm": 14.224934577941895, |
|
"learning_rate": 4.148487472416972e-05, |
|
"loss": 3.4529, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.5180914180663492, |
|
"eval_loss": 3.4936439990997314, |
|
"eval_runtime": 27.5971, |
|
"eval_samples_per_second": 161.684, |
|
"eval_steps_per_second": 20.22, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.5264477312609677, |
|
"grad_norm": 14.624770164489746, |
|
"learning_rate": 4.1345213820842994e-05, |
|
"loss": 3.4888, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.5348040444555862, |
|
"grad_norm": 11.033506393432617, |
|
"learning_rate": 4.120555291751627e-05, |
|
"loss": 3.4595, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.5348040444555862, |
|
"eval_loss": 3.4841041564941406, |
|
"eval_runtime": 27.5814, |
|
"eval_samples_per_second": 161.776, |
|
"eval_steps_per_second": 20.231, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.5431603576502048, |
|
"grad_norm": 11.615704536437988, |
|
"learning_rate": 4.106589201418955e-05, |
|
"loss": 3.4677, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.5515166708448233, |
|
"grad_norm": 13.639912605285645, |
|
"learning_rate": 4.092651043266948e-05, |
|
"loss": 3.4418, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.5515166708448233, |
|
"eval_loss": 3.478278398513794, |
|
"eval_runtime": 27.6018, |
|
"eval_samples_per_second": 161.656, |
|
"eval_steps_per_second": 20.216, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.5598729840394417, |
|
"grad_norm": 17.30266761779785, |
|
"learning_rate": 4.078684952934276e-05, |
|
"loss": 3.4707, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.5682292972340603, |
|
"grad_norm": 10.405800819396973, |
|
"learning_rate": 4.064718862601604e-05, |
|
"loss": 3.4683, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.5682292972340603, |
|
"eval_loss": 3.474083662033081, |
|
"eval_runtime": 27.598, |
|
"eval_samples_per_second": 161.678, |
|
"eval_steps_per_second": 20.219, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.5765856104286788, |
|
"grad_norm": 12.891817092895508, |
|
"learning_rate": 4.0507527722689315e-05, |
|
"loss": 3.4564, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.5849419236232974, |
|
"grad_norm": 8.831328392028809, |
|
"learning_rate": 4.036786681936259e-05, |
|
"loss": 3.3926, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.5849419236232974, |
|
"eval_loss": 3.4699606895446777, |
|
"eval_runtime": 27.6266, |
|
"eval_samples_per_second": 161.511, |
|
"eval_steps_per_second": 20.198, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.5932982368179159, |
|
"grad_norm": 16.12383460998535, |
|
"learning_rate": 4.0228205916035865e-05, |
|
"loss": 3.4639, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.6016545500125344, |
|
"grad_norm": 12.538627624511719, |
|
"learning_rate": 4.0088545012709144e-05, |
|
"loss": 3.4289, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.6016545500125344, |
|
"eval_loss": 3.4501824378967285, |
|
"eval_runtime": 27.6054, |
|
"eval_samples_per_second": 161.635, |
|
"eval_steps_per_second": 20.213, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.610010863207153, |
|
"grad_norm": 13.25362777709961, |
|
"learning_rate": 3.994888410938242e-05, |
|
"loss": 3.4552, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.6183671764017715, |
|
"grad_norm": 14.144664764404297, |
|
"learning_rate": 3.980950252786235e-05, |
|
"loss": 3.432, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.6183671764017715, |
|
"eval_loss": 3.4386274814605713, |
|
"eval_runtime": 27.5191, |
|
"eval_samples_per_second": 162.142, |
|
"eval_steps_per_second": 20.277, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.6267234895963901, |
|
"grad_norm": 11.082966804504395, |
|
"learning_rate": 3.966984162453563e-05, |
|
"loss": 3.446, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.6350798027910086, |
|
"grad_norm": 12.105545997619629, |
|
"learning_rate": 3.953018072120891e-05, |
|
"loss": 3.4498, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.6350798027910086, |
|
"eval_loss": 3.4284586906433105, |
|
"eval_runtime": 27.6228, |
|
"eval_samples_per_second": 161.533, |
|
"eval_steps_per_second": 20.201, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.6434361159856271, |
|
"grad_norm": 11.420595169067383, |
|
"learning_rate": 3.939051981788218e-05, |
|
"loss": 3.3634, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.6517924291802457, |
|
"grad_norm": 13.421010971069336, |
|
"learning_rate": 3.9251138236362116e-05, |
|
"loss": 3.4011, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.6517924291802457, |
|
"eval_loss": 3.4333345890045166, |
|
"eval_runtime": 27.6141, |
|
"eval_samples_per_second": 161.584, |
|
"eval_steps_per_second": 20.207, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.6601487423748642, |
|
"grad_norm": 13.694308280944824, |
|
"learning_rate": 3.911147733303539e-05, |
|
"loss": 3.3544, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 0.6685050555694827, |
|
"grad_norm": 13.354585647583008, |
|
"learning_rate": 3.8971816429708666e-05, |
|
"loss": 3.4279, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.6685050555694827, |
|
"eval_loss": 3.428675651550293, |
|
"eval_runtime": 29.3177, |
|
"eval_samples_per_second": 152.194, |
|
"eval_steps_per_second": 19.033, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.6768613687641013, |
|
"grad_norm": 13.542250633239746, |
|
"learning_rate": 3.8832155526381945e-05, |
|
"loss": 3.3683, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 0.6852176819587198, |
|
"grad_norm": 15.259937286376953, |
|
"learning_rate": 3.869249462305522e-05, |
|
"loss": 3.3964, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.6852176819587198, |
|
"eval_loss": 3.4252407550811768, |
|
"eval_runtime": 32.0858, |
|
"eval_samples_per_second": 139.065, |
|
"eval_steps_per_second": 17.391, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.6935739951533384, |
|
"grad_norm": 8.693069458007812, |
|
"learning_rate": 3.85528337197285e-05, |
|
"loss": 3.3419, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 0.7019303083479569, |
|
"grad_norm": 10.193922996520996, |
|
"learning_rate": 3.841317281640178e-05, |
|
"loss": 3.361, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.7019303083479569, |
|
"eval_loss": 3.423677444458008, |
|
"eval_runtime": 27.5904, |
|
"eval_samples_per_second": 161.723, |
|
"eval_steps_per_second": 20.224, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.7102866215425754, |
|
"grad_norm": 13.626117706298828, |
|
"learning_rate": 3.827351191307506e-05, |
|
"loss": 3.3456, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 0.718642934737194, |
|
"grad_norm": 15.671127319335938, |
|
"learning_rate": 3.813413033155499e-05, |
|
"loss": 3.39, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.718642934737194, |
|
"eval_loss": 3.410151243209839, |
|
"eval_runtime": 27.586, |
|
"eval_samples_per_second": 161.749, |
|
"eval_steps_per_second": 20.228, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.7269992479318125, |
|
"grad_norm": 13.179340362548828, |
|
"learning_rate": 3.7994469428228266e-05, |
|
"loss": 3.3407, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 0.735355561126431, |
|
"grad_norm": 11.219006538391113, |
|
"learning_rate": 3.7854808524901544e-05, |
|
"loss": 3.3509, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.735355561126431, |
|
"eval_loss": 3.4000790119171143, |
|
"eval_runtime": 27.5216, |
|
"eval_samples_per_second": 162.127, |
|
"eval_steps_per_second": 20.275, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.7437118743210496, |
|
"grad_norm": 19.535215377807617, |
|
"learning_rate": 3.771514762157482e-05, |
|
"loss": 3.3784, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 0.7520681875156681, |
|
"grad_norm": 11.256051063537598, |
|
"learning_rate": 3.7575486718248094e-05, |
|
"loss": 3.3584, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 0.7520681875156681, |
|
"eval_loss": 3.3937857151031494, |
|
"eval_runtime": 27.6288, |
|
"eval_samples_per_second": 161.498, |
|
"eval_steps_per_second": 20.196, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 0.7604245007102867, |
|
"grad_norm": 14.835156440734863, |
|
"learning_rate": 3.743610513672803e-05, |
|
"loss": 3.3964, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 0.7687808139049052, |
|
"grad_norm": 13.36843204498291, |
|
"learning_rate": 3.72964442334013e-05, |
|
"loss": 3.3612, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 0.7687808139049052, |
|
"eval_loss": 3.4002346992492676, |
|
"eval_runtime": 27.6951, |
|
"eval_samples_per_second": 161.112, |
|
"eval_steps_per_second": 20.148, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 0.7771371270995237, |
|
"grad_norm": 15.822681427001953, |
|
"learning_rate": 3.715706265188124e-05, |
|
"loss": 3.3235, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 0.7854934402941423, |
|
"grad_norm": 11.626577377319336, |
|
"learning_rate": 3.701740174855451e-05, |
|
"loss": 3.3167, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 0.7854934402941423, |
|
"eval_loss": 3.3830487728118896, |
|
"eval_runtime": 27.7333, |
|
"eval_samples_per_second": 160.89, |
|
"eval_steps_per_second": 20.12, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 0.7938497534887607, |
|
"grad_norm": 18.387489318847656, |
|
"learning_rate": 3.687774084522779e-05, |
|
"loss": 3.3468, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 0.8022060666833793, |
|
"grad_norm": 10.468737602233887, |
|
"learning_rate": 3.673807994190107e-05, |
|
"loss": 3.3765, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 0.8022060666833793, |
|
"eval_loss": 3.3805253505706787, |
|
"eval_runtime": 27.5567, |
|
"eval_samples_per_second": 161.921, |
|
"eval_steps_per_second": 20.249, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 0.8105623798779978, |
|
"grad_norm": 12.263431549072266, |
|
"learning_rate": 3.6598419038574345e-05, |
|
"loss": 3.3353, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 0.8189186930726163, |
|
"grad_norm": 10.66336441040039, |
|
"learning_rate": 3.6458758135247623e-05, |
|
"loss": 3.2779, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 0.8189186930726163, |
|
"eval_loss": 3.3863792419433594, |
|
"eval_runtime": 27.5626, |
|
"eval_samples_per_second": 161.886, |
|
"eval_steps_per_second": 20.245, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 0.8272750062672349, |
|
"grad_norm": 13.781414031982422, |
|
"learning_rate": 3.6319097231920895e-05, |
|
"loss": 3.3591, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 0.8356313194618534, |
|
"grad_norm": 10.030421257019043, |
|
"learning_rate": 3.617943632859417e-05, |
|
"loss": 3.3354, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 0.8356313194618534, |
|
"eval_loss": 3.367990493774414, |
|
"eval_runtime": 27.5956, |
|
"eval_samples_per_second": 161.693, |
|
"eval_steps_per_second": 20.221, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 0.8439876326564719, |
|
"grad_norm": 14.890128135681152, |
|
"learning_rate": 3.603977542526745e-05, |
|
"loss": 3.3061, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 0.8523439458510905, |
|
"grad_norm": 16.298643112182617, |
|
"learning_rate": 3.590011452194073e-05, |
|
"loss": 3.3032, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 0.8523439458510905, |
|
"eval_loss": 3.3711585998535156, |
|
"eval_runtime": 27.5103, |
|
"eval_samples_per_second": 162.194, |
|
"eval_steps_per_second": 20.283, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 0.860700259045709, |
|
"grad_norm": 11.454365730285645, |
|
"learning_rate": 3.576045361861401e-05, |
|
"loss": 3.2919, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 0.8690565722403276, |
|
"grad_norm": 11.1732177734375, |
|
"learning_rate": 3.562107203709394e-05, |
|
"loss": 3.2936, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 0.8690565722403276, |
|
"eval_loss": 3.358672857284546, |
|
"eval_runtime": 27.5316, |
|
"eval_samples_per_second": 162.068, |
|
"eval_steps_per_second": 20.268, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 0.8774128854349461, |
|
"grad_norm": 12.489287376403809, |
|
"learning_rate": 3.5481411133767216e-05, |
|
"loss": 3.2694, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 0.8857691986295646, |
|
"grad_norm": 12.570661544799805, |
|
"learning_rate": 3.534175023044049e-05, |
|
"loss": 3.322, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 0.8857691986295646, |
|
"eval_loss": 3.3604071140289307, |
|
"eval_runtime": 27.5745, |
|
"eval_samples_per_second": 161.816, |
|
"eval_steps_per_second": 20.236, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 0.8941255118241832, |
|
"grad_norm": 17.960376739501953, |
|
"learning_rate": 3.5202368648920424e-05, |
|
"loss": 3.2955, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 0.9024818250188017, |
|
"grad_norm": 13.333609580993652, |
|
"learning_rate": 3.5062707745593696e-05, |
|
"loss": 3.3394, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 0.9024818250188017, |
|
"eval_loss": 3.3498120307922363, |
|
"eval_runtime": 27.561, |
|
"eval_samples_per_second": 161.896, |
|
"eval_steps_per_second": 20.246, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 0.9108381382134202, |
|
"grad_norm": 16.366514205932617, |
|
"learning_rate": 3.4923046842266974e-05, |
|
"loss": 3.3223, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 0.9191944514080388, |
|
"grad_norm": 10.783904075622559, |
|
"learning_rate": 3.478338593894025e-05, |
|
"loss": 3.2717, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 0.9191944514080388, |
|
"eval_loss": 3.3506462574005127, |
|
"eval_runtime": 27.5889, |
|
"eval_samples_per_second": 161.732, |
|
"eval_steps_per_second": 20.226, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 0.9275507646026573, |
|
"grad_norm": 12.693829536437988, |
|
"learning_rate": 3.464372503561353e-05, |
|
"loss": 3.2696, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 0.9359070777972759, |
|
"grad_norm": 20.29674530029297, |
|
"learning_rate": 3.450406413228681e-05, |
|
"loss": 3.3342, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 0.9359070777972759, |
|
"eval_loss": 3.333944797515869, |
|
"eval_runtime": 27.579, |
|
"eval_samples_per_second": 161.79, |
|
"eval_steps_per_second": 20.233, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 0.9442633909918944, |
|
"grad_norm": 14.309937477111816, |
|
"learning_rate": 3.436440322896009e-05, |
|
"loss": 3.3321, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 0.9526197041865129, |
|
"grad_norm": 8.43278980255127, |
|
"learning_rate": 3.4224742325633366e-05, |
|
"loss": 3.2396, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 0.9526197041865129, |
|
"eval_loss": 3.3314433097839355, |
|
"eval_runtime": 27.6061, |
|
"eval_samples_per_second": 161.631, |
|
"eval_steps_per_second": 20.213, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 0.9609760173811315, |
|
"grad_norm": 10.31540584564209, |
|
"learning_rate": 3.4085081422306644e-05, |
|
"loss": 3.2436, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 0.96933233057575, |
|
"grad_norm": 10.261503219604492, |
|
"learning_rate": 3.3945699840786574e-05, |
|
"loss": 3.2845, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 0.96933233057575, |
|
"eval_loss": 3.3237485885620117, |
|
"eval_runtime": 27.545, |
|
"eval_samples_per_second": 161.99, |
|
"eval_steps_per_second": 20.258, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 0.9776886437703685, |
|
"grad_norm": 10.157827377319336, |
|
"learning_rate": 3.380603893745985e-05, |
|
"loss": 3.2976, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 0.9860449569649871, |
|
"grad_norm": 12.794463157653809, |
|
"learning_rate": 3.366637803413313e-05, |
|
"loss": 3.2621, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 0.9860449569649871, |
|
"eval_loss": 3.325364351272583, |
|
"eval_runtime": 27.6749, |
|
"eval_samples_per_second": 161.229, |
|
"eval_steps_per_second": 20.163, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 0.9944012701596056, |
|
"grad_norm": 17.333826065063477, |
|
"learning_rate": 3.35267171308064e-05, |
|
"loss": 3.2696, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 1.0027575833542242, |
|
"grad_norm": 12.315380096435547, |
|
"learning_rate": 3.338705622747968e-05, |
|
"loss": 3.2115, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 1.0027575833542242, |
|
"eval_loss": 3.336367607116699, |
|
"eval_runtime": 27.5799, |
|
"eval_samples_per_second": 161.785, |
|
"eval_steps_per_second": 20.232, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 1.0111138965488426, |
|
"grad_norm": 23.34908676147461, |
|
"learning_rate": 3.324739532415296e-05, |
|
"loss": 2.9589, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 1.0194702097434611, |
|
"grad_norm": 10.89417839050293, |
|
"learning_rate": 3.310773442082624e-05, |
|
"loss": 3.0302, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 1.0194702097434611, |
|
"eval_loss": 3.325634479522705, |
|
"eval_runtime": 27.573, |
|
"eval_samples_per_second": 161.825, |
|
"eval_steps_per_second": 20.237, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 1.0278265229380796, |
|
"grad_norm": 16.59639549255371, |
|
"learning_rate": 3.296835283930617e-05, |
|
"loss": 2.9884, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 1.0361828361326983, |
|
"grad_norm": 13.3978910446167, |
|
"learning_rate": 3.2828691935979445e-05, |
|
"loss": 2.9762, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 1.0361828361326983, |
|
"eval_loss": 3.334028482437134, |
|
"eval_runtime": 27.5551, |
|
"eval_samples_per_second": 161.93, |
|
"eval_steps_per_second": 20.25, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 1.0445391493273168, |
|
"grad_norm": 10.937264442443848, |
|
"learning_rate": 3.2689031032652723e-05, |
|
"loss": 2.9597, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 1.0528954625219353, |
|
"grad_norm": 14.150084495544434, |
|
"learning_rate": 3.2549370129325995e-05, |
|
"loss": 2.997, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 1.0528954625219353, |
|
"eval_loss": 3.320002794265747, |
|
"eval_runtime": 27.5918, |
|
"eval_samples_per_second": 161.715, |
|
"eval_steps_per_second": 20.223, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 1.0612517757165538, |
|
"grad_norm": 10.700261116027832, |
|
"learning_rate": 3.240970922599927e-05, |
|
"loss": 2.9857, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 1.0696080889111723, |
|
"grad_norm": 11.004881858825684, |
|
"learning_rate": 3.22703276444792e-05, |
|
"loss": 2.9591, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 1.0696080889111723, |
|
"eval_loss": 3.333744525909424, |
|
"eval_runtime": 27.6282, |
|
"eval_samples_per_second": 161.502, |
|
"eval_steps_per_second": 20.197, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 1.077964402105791, |
|
"grad_norm": 10.794275283813477, |
|
"learning_rate": 3.213066674115248e-05, |
|
"loss": 2.9817, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 1.0863207153004095, |
|
"grad_norm": 15.817968368530273, |
|
"learning_rate": 3.199100583782576e-05, |
|
"loss": 2.9543, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 1.0863207153004095, |
|
"eval_loss": 3.3309056758880615, |
|
"eval_runtime": 27.5459, |
|
"eval_samples_per_second": 161.984, |
|
"eval_steps_per_second": 20.257, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 1.094677028495028, |
|
"grad_norm": 14.550418853759766, |
|
"learning_rate": 3.185134493449904e-05, |
|
"loss": 2.9485, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 1.1030333416896465, |
|
"grad_norm": 11.362966537475586, |
|
"learning_rate": 3.1711684031172316e-05, |
|
"loss": 2.9787, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 1.1030333416896465, |
|
"eval_loss": 3.332648992538452, |
|
"eval_runtime": 27.619, |
|
"eval_samples_per_second": 161.555, |
|
"eval_steps_per_second": 20.203, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 1.111389654884265, |
|
"grad_norm": 14.36471176147461, |
|
"learning_rate": 3.157202312784559e-05, |
|
"loss": 2.9943, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 1.1197459680788837, |
|
"grad_norm": 17.348573684692383, |
|
"learning_rate": 3.1432641546325524e-05, |
|
"loss": 3.033, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 1.1197459680788837, |
|
"eval_loss": 3.311136245727539, |
|
"eval_runtime": 27.6024, |
|
"eval_samples_per_second": 161.653, |
|
"eval_steps_per_second": 20.216, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 1.1281022812735022, |
|
"grad_norm": 13.361127853393555, |
|
"learning_rate": 3.1292980642998796e-05, |
|
"loss": 2.995, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 1.1364585944681207, |
|
"grad_norm": 12.931785583496094, |
|
"learning_rate": 3.1153319739672074e-05, |
|
"loss": 2.9679, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 1.1364585944681207, |
|
"eval_loss": 3.308124542236328, |
|
"eval_runtime": 27.5871, |
|
"eval_samples_per_second": 161.742, |
|
"eval_steps_per_second": 20.227, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 1.1448149076627392, |
|
"grad_norm": 15.317282676696777, |
|
"learning_rate": 3.101365883634535e-05, |
|
"loss": 3.0068, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 1.1531712208573577, |
|
"grad_norm": 16.179967880249023, |
|
"learning_rate": 3.087399793301863e-05, |
|
"loss": 2.9658, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 1.1531712208573577, |
|
"eval_loss": 3.3181824684143066, |
|
"eval_runtime": 27.6733, |
|
"eval_samples_per_second": 161.238, |
|
"eval_steps_per_second": 20.164, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 1.1615275340519762, |
|
"grad_norm": 15.436213493347168, |
|
"learning_rate": 3.073433702969191e-05, |
|
"loss": 3.0074, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 1.1698838472465949, |
|
"grad_norm": 27.164413452148438, |
|
"learning_rate": 3.059467612636519e-05, |
|
"loss": 2.9649, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 1.1698838472465949, |
|
"eval_loss": 3.3080978393554688, |
|
"eval_runtime": 27.6434, |
|
"eval_samples_per_second": 161.413, |
|
"eval_steps_per_second": 20.186, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 1.1782401604412134, |
|
"grad_norm": 11.414698600769043, |
|
"learning_rate": 3.045529454484512e-05, |
|
"loss": 3.0125, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 1.1865964736358319, |
|
"grad_norm": 15.268623352050781, |
|
"learning_rate": 3.0315633641518392e-05, |
|
"loss": 2.9853, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 1.1865964736358319, |
|
"eval_loss": 3.298069477081299, |
|
"eval_runtime": 27.61, |
|
"eval_samples_per_second": 161.608, |
|
"eval_steps_per_second": 20.21, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 1.1949527868304504, |
|
"grad_norm": 23.319032669067383, |
|
"learning_rate": 3.017625205999833e-05, |
|
"loss": 2.9738, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 1.2033091000250689, |
|
"grad_norm": 11.64974308013916, |
|
"learning_rate": 3.00365911566716e-05, |
|
"loss": 2.9607, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 1.2033091000250689, |
|
"eval_loss": 3.3039419651031494, |
|
"eval_runtime": 29.6202, |
|
"eval_samples_per_second": 150.64, |
|
"eval_steps_per_second": 18.838, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 1.2116654132196876, |
|
"grad_norm": 11.017394065856934, |
|
"learning_rate": 2.989693025334488e-05, |
|
"loss": 2.9694, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 1.220021726414306, |
|
"grad_norm": 11.3243989944458, |
|
"learning_rate": 2.9757269350018157e-05, |
|
"loss": 2.9665, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 1.220021726414306, |
|
"eval_loss": 3.302910804748535, |
|
"eval_runtime": 27.6122, |
|
"eval_samples_per_second": 161.595, |
|
"eval_steps_per_second": 20.208, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 1.2283780396089246, |
|
"grad_norm": 14.27160358428955, |
|
"learning_rate": 2.9617608446691435e-05, |
|
"loss": 2.9554, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 1.236734352803543, |
|
"grad_norm": 9.526435852050781, |
|
"learning_rate": 2.9478226865171365e-05, |
|
"loss": 3.0167, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 1.236734352803543, |
|
"eval_loss": 3.3012630939483643, |
|
"eval_runtime": 27.614, |
|
"eval_samples_per_second": 161.584, |
|
"eval_steps_per_second": 20.207, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 1.2450906659981615, |
|
"grad_norm": 14.875115394592285, |
|
"learning_rate": 2.9338565961844643e-05, |
|
"loss": 3.0263, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 1.25344697919278, |
|
"grad_norm": 16.816545486450195, |
|
"learning_rate": 2.919890505851792e-05, |
|
"loss": 2.9977, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 1.25344697919278, |
|
"eval_loss": 3.3035476207733154, |
|
"eval_runtime": 27.574, |
|
"eval_samples_per_second": 161.819, |
|
"eval_steps_per_second": 20.236, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 1.2618032923873987, |
|
"grad_norm": 16.662649154663086, |
|
"learning_rate": 2.9059244155191196e-05, |
|
"loss": 2.9594, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 1.2701596055820172, |
|
"grad_norm": 14.543773651123047, |
|
"learning_rate": 2.8919583251864475e-05, |
|
"loss": 2.9845, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 1.2701596055820172, |
|
"eval_loss": 3.302872896194458, |
|
"eval_runtime": 27.5299, |
|
"eval_samples_per_second": 162.078, |
|
"eval_steps_per_second": 20.269, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 1.2785159187766357, |
|
"grad_norm": 15.129777908325195, |
|
"learning_rate": 2.8779922348537753e-05, |
|
"loss": 2.9826, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 1.2868722319712542, |
|
"grad_norm": 13.58123779296875, |
|
"learning_rate": 2.864026144521103e-05, |
|
"loss": 2.9302, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 1.2868722319712542, |
|
"eval_loss": 3.287860155105591, |
|
"eval_runtime": 27.5656, |
|
"eval_samples_per_second": 161.868, |
|
"eval_steps_per_second": 20.243, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 1.2952285451658727, |
|
"grad_norm": 13.634276390075684, |
|
"learning_rate": 2.8500600541884303e-05, |
|
"loss": 2.9802, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 1.3035848583604914, |
|
"grad_norm": 12.221925735473633, |
|
"learning_rate": 2.836093963855758e-05, |
|
"loss": 3.0119, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 1.3035848583604914, |
|
"eval_loss": 3.27937650680542, |
|
"eval_runtime": 27.6023, |
|
"eval_samples_per_second": 161.653, |
|
"eval_steps_per_second": 20.216, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 1.31194117155511, |
|
"grad_norm": 9.44093132019043, |
|
"learning_rate": 2.822127873523086e-05, |
|
"loss": 2.9562, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 1.3202974847497284, |
|
"grad_norm": 13.62260627746582, |
|
"learning_rate": 2.8082176475517447e-05, |
|
"loss": 2.982, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 1.3202974847497284, |
|
"eval_loss": 3.2890851497650146, |
|
"eval_runtime": 27.5678, |
|
"eval_samples_per_second": 161.856, |
|
"eval_steps_per_second": 20.241, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 1.328653797944347, |
|
"grad_norm": 12.078137397766113, |
|
"learning_rate": 2.7942515572190725e-05, |
|
"loss": 2.9453, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 1.3370101111389654, |
|
"grad_norm": 11.467178344726562, |
|
"learning_rate": 2.7802854668863997e-05, |
|
"loss": 3.0008, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 1.3370101111389654, |
|
"eval_loss": 3.2852883338928223, |
|
"eval_runtime": 27.5861, |
|
"eval_samples_per_second": 161.748, |
|
"eval_steps_per_second": 20.228, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 1.345366424333584, |
|
"grad_norm": 14.292551040649414, |
|
"learning_rate": 2.7663473087343933e-05, |
|
"loss": 2.9664, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 1.3537227375282026, |
|
"grad_norm": 13.714376449584961, |
|
"learning_rate": 2.7523812184017205e-05, |
|
"loss": 2.9396, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 1.3537227375282026, |
|
"eval_loss": 3.2859437465667725, |
|
"eval_runtime": 27.6096, |
|
"eval_samples_per_second": 161.61, |
|
"eval_steps_per_second": 20.21, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 1.362079050722821, |
|
"grad_norm": 12.142716407775879, |
|
"learning_rate": 2.7384151280690483e-05, |
|
"loss": 2.9775, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 1.3704353639174396, |
|
"grad_norm": 11.3803071975708, |
|
"learning_rate": 2.724449037736376e-05, |
|
"loss": 2.9458, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 1.3704353639174396, |
|
"eval_loss": 3.278106689453125, |
|
"eval_runtime": 27.5893, |
|
"eval_samples_per_second": 161.73, |
|
"eval_steps_per_second": 20.225, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 1.378791677112058, |
|
"grad_norm": 16.39805030822754, |
|
"learning_rate": 2.710482947403704e-05, |
|
"loss": 3.0504, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 1.3871479903066768, |
|
"grad_norm": 13.994576454162598, |
|
"learning_rate": 2.6965168570710315e-05, |
|
"loss": 2.9656, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 1.3871479903066768, |
|
"eval_loss": 3.278665781021118, |
|
"eval_runtime": 27.5347, |
|
"eval_samples_per_second": 162.05, |
|
"eval_steps_per_second": 20.265, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 1.3955043035012953, |
|
"grad_norm": 11.802352905273438, |
|
"learning_rate": 2.6825507667383593e-05, |
|
"loss": 2.9786, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 1.4038606166959138, |
|
"grad_norm": 13.618844985961914, |
|
"learning_rate": 2.668584676405687e-05, |
|
"loss": 3.0007, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 1.4038606166959138, |
|
"eval_loss": 3.2725257873535156, |
|
"eval_runtime": 27.5621, |
|
"eval_samples_per_second": 161.889, |
|
"eval_steps_per_second": 20.245, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 1.4122169298905323, |
|
"grad_norm": 9.817100524902344, |
|
"learning_rate": 2.654618586073015e-05, |
|
"loss": 2.9268, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 1.4205732430851508, |
|
"grad_norm": 16.49465560913086, |
|
"learning_rate": 2.640652495740343e-05, |
|
"loss": 2.984, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 1.4205732430851508, |
|
"eval_loss": 3.278170108795166, |
|
"eval_runtime": 27.5927, |
|
"eval_samples_per_second": 161.71, |
|
"eval_steps_per_second": 20.223, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 1.4289295562797695, |
|
"grad_norm": 17.29984474182129, |
|
"learning_rate": 2.62668640540767e-05, |
|
"loss": 2.9955, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 1.437285869474388, |
|
"grad_norm": 12.310997009277344, |
|
"learning_rate": 2.612720315074998e-05, |
|
"loss": 2.9769, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 1.437285869474388, |
|
"eval_loss": 3.2687652111053467, |
|
"eval_runtime": 27.5431, |
|
"eval_samples_per_second": 162.0, |
|
"eval_steps_per_second": 20.259, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 1.4456421826690065, |
|
"grad_norm": 14.744447708129883, |
|
"learning_rate": 2.5987542247423257e-05, |
|
"loss": 2.966, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 1.453998495863625, |
|
"grad_norm": 10.83408260345459, |
|
"learning_rate": 2.5847881344096535e-05, |
|
"loss": 2.9281, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 1.453998495863625, |
|
"eval_loss": 3.260927677154541, |
|
"eval_runtime": 27.5537, |
|
"eval_samples_per_second": 161.938, |
|
"eval_steps_per_second": 20.251, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 1.4623548090582434, |
|
"grad_norm": 14.912446975708008, |
|
"learning_rate": 2.5708220440769813e-05, |
|
"loss": 2.964, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 1.4707111222528622, |
|
"grad_norm": 16.433135986328125, |
|
"learning_rate": 2.5568838859249743e-05, |
|
"loss": 2.9903, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 1.4707111222528622, |
|
"eval_loss": 3.2638683319091797, |
|
"eval_runtime": 27.5854, |
|
"eval_samples_per_second": 161.752, |
|
"eval_steps_per_second": 20.228, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 1.4790674354474806, |
|
"grad_norm": 10.865525245666504, |
|
"learning_rate": 2.542917795592302e-05, |
|
"loss": 2.9782, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 1.4874237486420991, |
|
"grad_norm": 18.059494018554688, |
|
"learning_rate": 2.5289517052596296e-05, |
|
"loss": 2.9746, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 1.4874237486420991, |
|
"eval_loss": 3.2576780319213867, |
|
"eval_runtime": 27.6301, |
|
"eval_samples_per_second": 161.491, |
|
"eval_steps_per_second": 20.195, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 1.4957800618367176, |
|
"grad_norm": 14.338726997375488, |
|
"learning_rate": 2.5149856149269575e-05, |
|
"loss": 2.9746, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 1.5041363750313361, |
|
"grad_norm": 16.35688018798828, |
|
"learning_rate": 2.5010195245942853e-05, |
|
"loss": 2.9235, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 1.5041363750313361, |
|
"eval_loss": 3.2603578567504883, |
|
"eval_runtime": 27.5275, |
|
"eval_samples_per_second": 162.093, |
|
"eval_steps_per_second": 20.271, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 1.5124926882259548, |
|
"grad_norm": 19.649658203125, |
|
"learning_rate": 2.4870534342616128e-05, |
|
"loss": 2.892, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 1.520849001420573, |
|
"grad_norm": 22.463607788085938, |
|
"learning_rate": 2.4730873439289406e-05, |
|
"loss": 2.9464, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 1.520849001420573, |
|
"eval_loss": 3.255012273788452, |
|
"eval_runtime": 27.5843, |
|
"eval_samples_per_second": 161.759, |
|
"eval_steps_per_second": 20.229, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 1.5292053146151918, |
|
"grad_norm": 11.892714500427246, |
|
"learning_rate": 2.4591212535962685e-05, |
|
"loss": 2.9404, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 1.5375616278098103, |
|
"grad_norm": 13.547897338867188, |
|
"learning_rate": 2.445155163263596e-05, |
|
"loss": 2.9935, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 1.5375616278098103, |
|
"eval_loss": 3.2467143535614014, |
|
"eval_runtime": 27.5751, |
|
"eval_samples_per_second": 161.813, |
|
"eval_steps_per_second": 20.236, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 1.5459179410044288, |
|
"grad_norm": 15.99018383026123, |
|
"learning_rate": 2.4311890729309238e-05, |
|
"loss": 2.9983, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 1.5542742541990475, |
|
"grad_norm": 10.513391494750977, |
|
"learning_rate": 2.4172229825982516e-05, |
|
"loss": 2.979, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 1.5542742541990475, |
|
"eval_loss": 3.2534940242767334, |
|
"eval_runtime": 27.6008, |
|
"eval_samples_per_second": 161.662, |
|
"eval_steps_per_second": 20.217, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 1.5626305673936658, |
|
"grad_norm": 14.598124504089355, |
|
"learning_rate": 2.4032568922655795e-05, |
|
"loss": 2.9401, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 1.5709868805882845, |
|
"grad_norm": 11.219178199768066, |
|
"learning_rate": 2.3893187341135724e-05, |
|
"loss": 2.9333, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 1.5709868805882845, |
|
"eval_loss": 3.2531471252441406, |
|
"eval_runtime": 27.5659, |
|
"eval_samples_per_second": 161.867, |
|
"eval_steps_per_second": 20.242, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 1.579343193782903, |
|
"grad_norm": 13.708407402038574, |
|
"learning_rate": 2.3753805759615654e-05, |
|
"loss": 2.9284, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 1.5876995069775215, |
|
"grad_norm": 15.64401912689209, |
|
"learning_rate": 2.3614144856288932e-05, |
|
"loss": 2.9355, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 1.5876995069775215, |
|
"eval_loss": 3.247119665145874, |
|
"eval_runtime": 27.5812, |
|
"eval_samples_per_second": 161.777, |
|
"eval_steps_per_second": 20.231, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 1.5960558201721402, |
|
"grad_norm": 12.710307121276855, |
|
"learning_rate": 2.347448395296221e-05, |
|
"loss": 2.9451, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 1.6044121333667585, |
|
"grad_norm": 12.77171516418457, |
|
"learning_rate": 2.333482304963549e-05, |
|
"loss": 2.904, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 1.6044121333667585, |
|
"eval_loss": 3.2489845752716064, |
|
"eval_runtime": 27.6157, |
|
"eval_samples_per_second": 161.575, |
|
"eval_steps_per_second": 20.206, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 1.6127684465613772, |
|
"grad_norm": 12.342710494995117, |
|
"learning_rate": 2.3195162146308764e-05, |
|
"loss": 2.9116, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 1.6211247597559957, |
|
"grad_norm": 12.343132019042969, |
|
"learning_rate": 2.3055501242982042e-05, |
|
"loss": 2.9464, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 1.6211247597559957, |
|
"eval_loss": 3.2335522174835205, |
|
"eval_runtime": 27.5638, |
|
"eval_samples_per_second": 161.879, |
|
"eval_steps_per_second": 20.244, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 1.6294810729506142, |
|
"grad_norm": 14.988670349121094, |
|
"learning_rate": 2.291611966146197e-05, |
|
"loss": 2.9173, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 1.6378373861452329, |
|
"grad_norm": 12.14406967163086, |
|
"learning_rate": 2.277645875813525e-05, |
|
"loss": 2.917, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 1.6378373861452329, |
|
"eval_loss": 3.240186929702759, |
|
"eval_runtime": 27.7378, |
|
"eval_samples_per_second": 160.864, |
|
"eval_steps_per_second": 20.117, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 1.6461936993398512, |
|
"grad_norm": 13.880926132202148, |
|
"learning_rate": 2.2636797854808525e-05, |
|
"loss": 2.9146, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 1.6545500125344699, |
|
"grad_norm": 7.802238941192627, |
|
"learning_rate": 2.2497136951481803e-05, |
|
"loss": 2.9218, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 1.6545500125344699, |
|
"eval_loss": 3.2389557361602783, |
|
"eval_runtime": 27.6304, |
|
"eval_samples_per_second": 161.489, |
|
"eval_steps_per_second": 20.195, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 1.6629063257290884, |
|
"grad_norm": 18.44457244873047, |
|
"learning_rate": 2.235747604815508e-05, |
|
"loss": 2.9043, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 1.6712626389237069, |
|
"grad_norm": 10.393033027648926, |
|
"learning_rate": 2.2217815144828357e-05, |
|
"loss": 2.9677, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 1.6712626389237069, |
|
"eval_loss": 3.2266006469726562, |
|
"eval_runtime": 27.5938, |
|
"eval_samples_per_second": 161.703, |
|
"eval_steps_per_second": 20.222, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 1.6796189521183253, |
|
"grad_norm": 7.137568473815918, |
|
"learning_rate": 2.2078154241501635e-05, |
|
"loss": 2.9498, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 1.6879752653129438, |
|
"grad_norm": 9.725958824157715, |
|
"learning_rate": 2.1938772659981565e-05, |
|
"loss": 2.8844, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 1.6879752653129438, |
|
"eval_loss": 3.223768949508667, |
|
"eval_runtime": 27.5732, |
|
"eval_samples_per_second": 161.824, |
|
"eval_steps_per_second": 20.237, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 1.6963315785075626, |
|
"grad_norm": 15.254230499267578, |
|
"learning_rate": 2.1799111756654843e-05, |
|
"loss": 2.8841, |
|
"step": 101500 |
|
}, |
|
{ |
|
"epoch": 1.704687891702181, |
|
"grad_norm": 20.192659378051758, |
|
"learning_rate": 2.165945085332812e-05, |
|
"loss": 2.9283, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 1.704687891702181, |
|
"eval_loss": 3.2226974964141846, |
|
"eval_runtime": 27.58, |
|
"eval_samples_per_second": 161.784, |
|
"eval_steps_per_second": 20.232, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 1.7130442048967995, |
|
"grad_norm": 14.292362213134766, |
|
"learning_rate": 2.15197899500014e-05, |
|
"loss": 2.9358, |
|
"step": 102500 |
|
}, |
|
{ |
|
"epoch": 1.721400518091418, |
|
"grad_norm": 9.396713256835938, |
|
"learning_rate": 2.1380129046674675e-05, |
|
"loss": 2.9472, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 1.721400518091418, |
|
"eval_loss": 3.224209785461426, |
|
"eval_runtime": 27.6671, |
|
"eval_samples_per_second": 161.274, |
|
"eval_steps_per_second": 20.168, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 1.7297568312860365, |
|
"grad_norm": 10.828228950500488, |
|
"learning_rate": 2.1240468143347953e-05, |
|
"loss": 2.9152, |
|
"step": 103500 |
|
}, |
|
{ |
|
"epoch": 1.7381131444806552, |
|
"grad_norm": 13.493616104125977, |
|
"learning_rate": 2.1100807240021228e-05, |
|
"loss": 2.9518, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 1.7381131444806552, |
|
"eval_loss": 3.2257561683654785, |
|
"eval_runtime": 27.5631, |
|
"eval_samples_per_second": 161.883, |
|
"eval_steps_per_second": 20.244, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 1.7464694576752735, |
|
"grad_norm": 11.142574310302734, |
|
"learning_rate": 2.0961146336694506e-05, |
|
"loss": 2.9459, |
|
"step": 104500 |
|
}, |
|
{ |
|
"epoch": 1.7548257708698922, |
|
"grad_norm": 10.669454574584961, |
|
"learning_rate": 2.082176475517444e-05, |
|
"loss": 2.9545, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 1.7548257708698922, |
|
"eval_loss": 3.2120673656463623, |
|
"eval_runtime": 27.5868, |
|
"eval_samples_per_second": 161.744, |
|
"eval_steps_per_second": 20.227, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 1.7631820840645107, |
|
"grad_norm": 10.605733871459961, |
|
"learning_rate": 2.0682103851847714e-05, |
|
"loss": 2.9228, |
|
"step": 105500 |
|
}, |
|
{ |
|
"epoch": 1.7715383972591292, |
|
"grad_norm": 13.702558517456055, |
|
"learning_rate": 2.0542442948520993e-05, |
|
"loss": 2.9137, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 1.7715383972591292, |
|
"eval_loss": 3.218060255050659, |
|
"eval_runtime": 27.5869, |
|
"eval_samples_per_second": 161.744, |
|
"eval_steps_per_second": 20.227, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 1.779894710453748, |
|
"grad_norm": 18.355859756469727, |
|
"learning_rate": 2.0402782045194268e-05, |
|
"loss": 2.885, |
|
"step": 106500 |
|
}, |
|
{ |
|
"epoch": 1.7882510236483662, |
|
"grad_norm": 12.07524299621582, |
|
"learning_rate": 2.0263679785480855e-05, |
|
"loss": 2.9016, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 1.7882510236483662, |
|
"eval_loss": 3.2088520526885986, |
|
"eval_runtime": 27.6024, |
|
"eval_samples_per_second": 161.653, |
|
"eval_steps_per_second": 20.216, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 1.796607336842985, |
|
"grad_norm": 11.443526268005371, |
|
"learning_rate": 2.0124298203960785e-05, |
|
"loss": 2.942, |
|
"step": 107500 |
|
}, |
|
{ |
|
"epoch": 1.8049636500376034, |
|
"grad_norm": 8.035077095031738, |
|
"learning_rate": 1.9984637300634063e-05, |
|
"loss": 2.9247, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 1.8049636500376034, |
|
"eval_loss": 3.211854934692383, |
|
"eval_runtime": 27.5517, |
|
"eval_samples_per_second": 161.95, |
|
"eval_steps_per_second": 20.253, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 1.8133199632322219, |
|
"grad_norm": 12.948112487792969, |
|
"learning_rate": 1.9844976397307338e-05, |
|
"loss": 2.9112, |
|
"step": 108500 |
|
}, |
|
{ |
|
"epoch": 1.8216762764268406, |
|
"grad_norm": 15.308154106140137, |
|
"learning_rate": 1.9705315493980616e-05, |
|
"loss": 2.9185, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 1.8216762764268406, |
|
"eval_loss": 3.1997740268707275, |
|
"eval_runtime": 27.6334, |
|
"eval_samples_per_second": 161.471, |
|
"eval_steps_per_second": 20.193, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 1.8300325896214589, |
|
"grad_norm": 10.738883018493652, |
|
"learning_rate": 1.956565459065389e-05, |
|
"loss": 2.8789, |
|
"step": 109500 |
|
}, |
|
{ |
|
"epoch": 1.8383889028160776, |
|
"grad_norm": 13.379829406738281, |
|
"learning_rate": 1.942599368732717e-05, |
|
"loss": 2.9005, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 1.8383889028160776, |
|
"eval_loss": 3.202975273132324, |
|
"eval_runtime": 27.6255, |
|
"eval_samples_per_second": 161.517, |
|
"eval_steps_per_second": 20.199, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 1.846745216010696, |
|
"grad_norm": 9.87146282196045, |
|
"learning_rate": 1.9286332784000448e-05, |
|
"loss": 2.8856, |
|
"step": 110500 |
|
}, |
|
{ |
|
"epoch": 1.8551015292053146, |
|
"grad_norm": 12.577339172363281, |
|
"learning_rate": 1.9146671880673726e-05, |
|
"loss": 2.9502, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 1.8551015292053146, |
|
"eval_loss": 3.1968445777893066, |
|
"eval_runtime": 27.5448, |
|
"eval_samples_per_second": 161.991, |
|
"eval_steps_per_second": 20.258, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 1.8634578423999333, |
|
"grad_norm": 12.57132625579834, |
|
"learning_rate": 1.9007010977347005e-05, |
|
"loss": 2.8951, |
|
"step": 111500 |
|
}, |
|
{ |
|
"epoch": 1.8718141555945516, |
|
"grad_norm": 14.708492279052734, |
|
"learning_rate": 1.886735007402028e-05, |
|
"loss": 2.9093, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 1.8718141555945516, |
|
"eval_loss": 3.1939940452575684, |
|
"eval_runtime": 27.5701, |
|
"eval_samples_per_second": 161.842, |
|
"eval_steps_per_second": 20.239, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 1.8801704687891703, |
|
"grad_norm": 12.688665390014648, |
|
"learning_rate": 1.8727689170693558e-05, |
|
"loss": 2.885, |
|
"step": 112500 |
|
}, |
|
{ |
|
"epoch": 1.8885267819837888, |
|
"grad_norm": 11.511554718017578, |
|
"learning_rate": 1.8588028267366833e-05, |
|
"loss": 2.8351, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 1.8885267819837888, |
|
"eval_loss": 3.1979987621307373, |
|
"eval_runtime": 27.6058, |
|
"eval_samples_per_second": 161.633, |
|
"eval_steps_per_second": 20.213, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 1.8968830951784073, |
|
"grad_norm": 7.7706708908081055, |
|
"learning_rate": 1.844892600765342e-05, |
|
"loss": 2.904, |
|
"step": 113500 |
|
}, |
|
{ |
|
"epoch": 1.905239408373026, |
|
"grad_norm": 12.276754379272461, |
|
"learning_rate": 1.8309265104326695e-05, |
|
"loss": 2.8785, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 1.905239408373026, |
|
"eval_loss": 3.20162296295166, |
|
"eval_runtime": 27.5895, |
|
"eval_samples_per_second": 161.728, |
|
"eval_steps_per_second": 20.225, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 1.9135957215676442, |
|
"grad_norm": 11.900626182556152, |
|
"learning_rate": 1.8169604200999974e-05, |
|
"loss": 2.8922, |
|
"step": 114500 |
|
}, |
|
{ |
|
"epoch": 1.921952034762263, |
|
"grad_norm": 16.15927505493164, |
|
"learning_rate": 1.802994329767325e-05, |
|
"loss": 2.8341, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 1.921952034762263, |
|
"eval_loss": 3.192532539367676, |
|
"eval_runtime": 27.5532, |
|
"eval_samples_per_second": 161.941, |
|
"eval_steps_per_second": 20.252, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 1.9303083479568814, |
|
"grad_norm": 7.972958087921143, |
|
"learning_rate": 1.7890282394346527e-05, |
|
"loss": 2.8838, |
|
"step": 115500 |
|
}, |
|
{ |
|
"epoch": 1.9386646611515, |
|
"grad_norm": 10.199915885925293, |
|
"learning_rate": 1.775118013463311e-05, |
|
"loss": 2.8599, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 1.9386646611515, |
|
"eval_loss": 3.185673475265503, |
|
"eval_runtime": 27.8833, |
|
"eval_samples_per_second": 160.024, |
|
"eval_steps_per_second": 20.012, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 1.9470209743461186, |
|
"grad_norm": 12.25405216217041, |
|
"learning_rate": 1.761151923130639e-05, |
|
"loss": 2.8582, |
|
"step": 116500 |
|
}, |
|
{ |
|
"epoch": 1.955377287540737, |
|
"grad_norm": 11.32104778289795, |
|
"learning_rate": 1.7471858327979664e-05, |
|
"loss": 2.9085, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 1.955377287540737, |
|
"eval_loss": 3.1832330226898193, |
|
"eval_runtime": 27.611, |
|
"eval_samples_per_second": 161.602, |
|
"eval_steps_per_second": 20.209, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 1.9637336007353556, |
|
"grad_norm": 12.988907814025879, |
|
"learning_rate": 1.7332197424652943e-05, |
|
"loss": 2.8994, |
|
"step": 117500 |
|
}, |
|
{ |
|
"epoch": 1.9720899139299741, |
|
"grad_norm": 13.372990608215332, |
|
"learning_rate": 1.719253652132622e-05, |
|
"loss": 2.8882, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 1.9720899139299741, |
|
"eval_loss": 3.1832265853881836, |
|
"eval_runtime": 27.5792, |
|
"eval_samples_per_second": 161.789, |
|
"eval_steps_per_second": 20.233, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 1.9804462271245926, |
|
"grad_norm": 13.150843620300293, |
|
"learning_rate": 1.7052875617999496e-05, |
|
"loss": 2.8809, |
|
"step": 118500 |
|
}, |
|
{ |
|
"epoch": 1.988802540319211, |
|
"grad_norm": 11.465882301330566, |
|
"learning_rate": 1.6913214714672774e-05, |
|
"loss": 2.8083, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 1.988802540319211, |
|
"eval_loss": 3.1877200603485107, |
|
"eval_runtime": 27.5585, |
|
"eval_samples_per_second": 161.91, |
|
"eval_steps_per_second": 20.248, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 1.9971588535138296, |
|
"grad_norm": 15.985895156860352, |
|
"learning_rate": 1.6773553811346053e-05, |
|
"loss": 2.8512, |
|
"step": 119500 |
|
}, |
|
{ |
|
"epoch": 2.0055151667084483, |
|
"grad_norm": 20.66619110107422, |
|
"learning_rate": 1.663389290801933e-05, |
|
"loss": 2.6213, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 2.0055151667084483, |
|
"eval_loss": 3.257246494293213, |
|
"eval_runtime": 27.6181, |
|
"eval_samples_per_second": 161.561, |
|
"eval_steps_per_second": 20.204, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 2.0138714799030666, |
|
"grad_norm": 19.570964813232422, |
|
"learning_rate": 1.649451132649926e-05, |
|
"loss": 2.451, |
|
"step": 120500 |
|
}, |
|
{ |
|
"epoch": 2.0222277930976853, |
|
"grad_norm": 17.980260848999023, |
|
"learning_rate": 1.635485042317254e-05, |
|
"loss": 2.4327, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 2.0222277930976853, |
|
"eval_loss": 3.2763614654541016, |
|
"eval_runtime": 27.6142, |
|
"eval_samples_per_second": 161.584, |
|
"eval_steps_per_second": 20.207, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 2.030584106292304, |
|
"grad_norm": 12.097052574157715, |
|
"learning_rate": 1.621546884165247e-05, |
|
"loss": 2.4341, |
|
"step": 121500 |
|
}, |
|
{ |
|
"epoch": 2.0389404194869223, |
|
"grad_norm": 13.760506629943848, |
|
"learning_rate": 1.6075807938325747e-05, |
|
"loss": 2.4713, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 2.0389404194869223, |
|
"eval_loss": 3.28802490234375, |
|
"eval_runtime": 27.5322, |
|
"eval_samples_per_second": 162.065, |
|
"eval_steps_per_second": 20.267, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 2.047296732681541, |
|
"grad_norm": 13.664862632751465, |
|
"learning_rate": 1.5936147034999025e-05, |
|
"loss": 2.4953, |
|
"step": 122500 |
|
}, |
|
{ |
|
"epoch": 2.0556530458761593, |
|
"grad_norm": 14.178253173828125, |
|
"learning_rate": 1.57964861316723e-05, |
|
"loss": 2.4641, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 2.0556530458761593, |
|
"eval_loss": 3.2855935096740723, |
|
"eval_runtime": 27.617, |
|
"eval_samples_per_second": 161.567, |
|
"eval_steps_per_second": 20.205, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 2.064009359070778, |
|
"grad_norm": 19.99590492248535, |
|
"learning_rate": 1.565682522834558e-05, |
|
"loss": 2.485, |
|
"step": 123500 |
|
}, |
|
{ |
|
"epoch": 2.0723656722653967, |
|
"grad_norm": 17.964866638183594, |
|
"learning_rate": 1.5517164325018854e-05, |
|
"loss": 2.4679, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 2.0723656722653967, |
|
"eval_loss": 3.2831804752349854, |
|
"eval_runtime": 27.6195, |
|
"eval_samples_per_second": 161.553, |
|
"eval_steps_per_second": 20.203, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 2.080721985460015, |
|
"grad_norm": 16.556684494018555, |
|
"learning_rate": 1.5377503421692132e-05, |
|
"loss": 2.4853, |
|
"step": 124500 |
|
}, |
|
{ |
|
"epoch": 2.0890782986546337, |
|
"grad_norm": 17.376474380493164, |
|
"learning_rate": 1.5237842518365409e-05, |
|
"loss": 2.4614, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 2.0890782986546337, |
|
"eval_loss": 3.2957663536071777, |
|
"eval_runtime": 27.5859, |
|
"eval_samples_per_second": 161.749, |
|
"eval_steps_per_second": 20.228, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 2.097434611849252, |
|
"grad_norm": 16.47422218322754, |
|
"learning_rate": 1.5098181615038687e-05, |
|
"loss": 2.5273, |
|
"step": 125500 |
|
}, |
|
{ |
|
"epoch": 2.1057909250438707, |
|
"grad_norm": 20.20784568786621, |
|
"learning_rate": 1.4958800033518617e-05, |
|
"loss": 2.4934, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 2.1057909250438707, |
|
"eval_loss": 3.2978439331054688, |
|
"eval_runtime": 27.5756, |
|
"eval_samples_per_second": 161.81, |
|
"eval_steps_per_second": 20.235, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 2.1141472382384894, |
|
"grad_norm": 22.081459045410156, |
|
"learning_rate": 1.4819418451998548e-05, |
|
"loss": 2.4496, |
|
"step": 126500 |
|
}, |
|
{ |
|
"epoch": 2.1225035514331076, |
|
"grad_norm": 14.315736770629883, |
|
"learning_rate": 1.4679757548671824e-05, |
|
"loss": 2.4892, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 2.1225035514331076, |
|
"eval_loss": 3.2925477027893066, |
|
"eval_runtime": 27.6072, |
|
"eval_samples_per_second": 161.624, |
|
"eval_steps_per_second": 20.212, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 2.1308598646277264, |
|
"grad_norm": 19.269027709960938, |
|
"learning_rate": 1.4540096645345103e-05, |
|
"loss": 2.4366, |
|
"step": 127500 |
|
}, |
|
{ |
|
"epoch": 2.1392161778223446, |
|
"grad_norm": 13.193103790283203, |
|
"learning_rate": 1.4400435742018381e-05, |
|
"loss": 2.4774, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 2.1392161778223446, |
|
"eval_loss": 3.2914838790893555, |
|
"eval_runtime": 27.5768, |
|
"eval_samples_per_second": 161.803, |
|
"eval_steps_per_second": 20.234, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 2.1475724910169633, |
|
"grad_norm": 20.42424201965332, |
|
"learning_rate": 1.4260774838691656e-05, |
|
"loss": 2.4849, |
|
"step": 128500 |
|
}, |
|
{ |
|
"epoch": 2.155928804211582, |
|
"grad_norm": 14.249687194824219, |
|
"learning_rate": 1.4121113935364934e-05, |
|
"loss": 2.4462, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 2.155928804211582, |
|
"eval_loss": 3.2914915084838867, |
|
"eval_runtime": 27.5308, |
|
"eval_samples_per_second": 162.073, |
|
"eval_steps_per_second": 20.268, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 2.1642851174062003, |
|
"grad_norm": 28.348491668701172, |
|
"learning_rate": 1.3981453032038211e-05, |
|
"loss": 2.49, |
|
"step": 129500 |
|
}, |
|
{ |
|
"epoch": 2.172641430600819, |
|
"grad_norm": 17.39087677001953, |
|
"learning_rate": 1.384179212871149e-05, |
|
"loss": 2.4542, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 2.172641430600819, |
|
"eval_loss": 3.2924540042877197, |
|
"eval_runtime": 27.5959, |
|
"eval_samples_per_second": 161.691, |
|
"eval_steps_per_second": 20.22, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 2.1809977437954373, |
|
"grad_norm": 16.53498649597168, |
|
"learning_rate": 1.3702131225384768e-05, |
|
"loss": 2.4621, |
|
"step": 130500 |
|
}, |
|
{ |
|
"epoch": 2.189354056990056, |
|
"grad_norm": 16.412384033203125, |
|
"learning_rate": 1.3562749643864697e-05, |
|
"loss": 2.4613, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 2.189354056990056, |
|
"eval_loss": 3.295562744140625, |
|
"eval_runtime": 27.578, |
|
"eval_samples_per_second": 161.796, |
|
"eval_steps_per_second": 20.234, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 2.1977103701846747, |
|
"grad_norm": 15.538711547851562, |
|
"learning_rate": 1.3423368062344627e-05, |
|
"loss": 2.4549, |
|
"step": 131500 |
|
}, |
|
{ |
|
"epoch": 2.206066683379293, |
|
"grad_norm": 26.194726943969727, |
|
"learning_rate": 1.328398648082456e-05, |
|
"loss": 2.4887, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 2.206066683379293, |
|
"eval_loss": 3.28711199760437, |
|
"eval_runtime": 27.5596, |
|
"eval_samples_per_second": 161.904, |
|
"eval_steps_per_second": 20.247, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 2.2144229965739117, |
|
"grad_norm": 17.261341094970703, |
|
"learning_rate": 1.3144325577497835e-05, |
|
"loss": 2.4712, |
|
"step": 132500 |
|
}, |
|
{ |
|
"epoch": 2.22277930976853, |
|
"grad_norm": 13.89609146118164, |
|
"learning_rate": 1.3004664674171113e-05, |
|
"loss": 2.505, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 2.22277930976853, |
|
"eval_loss": 3.281574249267578, |
|
"eval_runtime": 27.554, |
|
"eval_samples_per_second": 161.936, |
|
"eval_steps_per_second": 20.251, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 2.2311356229631487, |
|
"grad_norm": 20.80926513671875, |
|
"learning_rate": 1.2865003770844391e-05, |
|
"loss": 2.4657, |
|
"step": 133500 |
|
}, |
|
{ |
|
"epoch": 2.2394919361577674, |
|
"grad_norm": 11.168779373168945, |
|
"learning_rate": 1.2725622189324321e-05, |
|
"loss": 2.4894, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 2.2394919361577674, |
|
"eval_loss": 3.277146100997925, |
|
"eval_runtime": 27.5992, |
|
"eval_samples_per_second": 161.671, |
|
"eval_steps_per_second": 20.218, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 2.2478482493523857, |
|
"grad_norm": 16.13945198059082, |
|
"learning_rate": 1.25859612859976e-05, |
|
"loss": 2.473, |
|
"step": 134500 |
|
}, |
|
{ |
|
"epoch": 2.2562045625470044, |
|
"grad_norm": 15.301444053649902, |
|
"learning_rate": 1.2446300382670876e-05, |
|
"loss": 2.4731, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 2.2562045625470044, |
|
"eval_loss": 3.277038812637329, |
|
"eval_runtime": 27.7191, |
|
"eval_samples_per_second": 160.972, |
|
"eval_steps_per_second": 20.131, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 2.2645608757416227, |
|
"grad_norm": 14.259910583496094, |
|
"learning_rate": 1.2306639479344154e-05, |
|
"loss": 2.4209, |
|
"step": 135500 |
|
}, |
|
{ |
|
"epoch": 2.2729171889362414, |
|
"grad_norm": 11.308433532714844, |
|
"learning_rate": 1.2166978576017431e-05, |
|
"loss": 2.469, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 2.2729171889362414, |
|
"eval_loss": 3.272756814956665, |
|
"eval_runtime": 27.5588, |
|
"eval_samples_per_second": 161.908, |
|
"eval_steps_per_second": 20.248, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 2.28127350213086, |
|
"grad_norm": 11.12240219116211, |
|
"learning_rate": 1.2027317672690708e-05, |
|
"loss": 2.4856, |
|
"step": 136500 |
|
}, |
|
{ |
|
"epoch": 2.2896298153254784, |
|
"grad_norm": 20.829790115356445, |
|
"learning_rate": 1.1887656769363984e-05, |
|
"loss": 2.4522, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 2.2896298153254784, |
|
"eval_loss": 3.2834246158599854, |
|
"eval_runtime": 27.6412, |
|
"eval_samples_per_second": 161.426, |
|
"eval_steps_per_second": 20.187, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 2.297986128520097, |
|
"grad_norm": 16.781526565551758, |
|
"learning_rate": 1.1747995866037263e-05, |
|
"loss": 2.4827, |
|
"step": 137500 |
|
}, |
|
{ |
|
"epoch": 2.3063424417147154, |
|
"grad_norm": 14.312143325805664, |
|
"learning_rate": 1.160833496271054e-05, |
|
"loss": 2.4823, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 2.3063424417147154, |
|
"eval_loss": 3.2799878120422363, |
|
"eval_runtime": 27.6358, |
|
"eval_samples_per_second": 161.457, |
|
"eval_steps_per_second": 20.191, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 2.314698754909334, |
|
"grad_norm": 16.167057037353516, |
|
"learning_rate": 1.1468674059383816e-05, |
|
"loss": 2.4814, |
|
"step": 138500 |
|
}, |
|
{ |
|
"epoch": 2.3230550681039523, |
|
"grad_norm": 13.810924530029297, |
|
"learning_rate": 1.1329013156057094e-05, |
|
"loss": 2.4825, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 2.3230550681039523, |
|
"eval_loss": 3.278149127960205, |
|
"eval_runtime": 27.6409, |
|
"eval_samples_per_second": 161.428, |
|
"eval_steps_per_second": 20.187, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 2.331411381298571, |
|
"grad_norm": 15.337100982666016, |
|
"learning_rate": 1.1189352252730371e-05, |
|
"loss": 2.5083, |
|
"step": 139500 |
|
}, |
|
{ |
|
"epoch": 2.3397676944931898, |
|
"grad_norm": 19.78384017944336, |
|
"learning_rate": 1.1049691349403648e-05, |
|
"loss": 2.4427, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 2.3397676944931898, |
|
"eval_loss": 3.276196002960205, |
|
"eval_runtime": 27.5881, |
|
"eval_samples_per_second": 161.737, |
|
"eval_steps_per_second": 20.226, |
|
"step": 140000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 179505, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 2000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.977262699675648e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|