{
"best_metric": 3.1832265853881836,
"best_model_checkpoint": "./models/lora-finetuning/LLaMmlein_120M/checkpoint-118000",
"epoch": 2.3397676944931898,
"eval_steps": 1000,
"global_step": 140000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008356313194618534,
"grad_norm": 40.111167907714844,
"learning_rate": 4.96e-05,
"loss": 4.8904,
"step": 500
},
{
"epoch": 0.016712626389237067,
"grad_norm": 22.074819564819336,
"learning_rate": 4.986173570570655e-05,
"loss": 4.4642,
"step": 1000
},
{
"epoch": 0.016712626389237067,
"eval_loss": 4.3232102394104,
"eval_runtime": 27.5175,
"eval_samples_per_second": 162.152,
"eval_steps_per_second": 20.278,
"step": 1000
},
{
"epoch": 0.025068939583855605,
"grad_norm": 19.863744735717773,
"learning_rate": 4.972207480237983e-05,
"loss": 4.3112,
"step": 1500
},
{
"epoch": 0.033425252778474135,
"grad_norm": 19.56609344482422,
"learning_rate": 4.958241389905311e-05,
"loss": 4.2377,
"step": 2000
},
{
"epoch": 0.033425252778474135,
"eval_loss": 4.127670764923096,
"eval_runtime": 27.7514,
"eval_samples_per_second": 160.785,
"eval_steps_per_second": 20.107,
"step": 2000
},
{
"epoch": 0.04178156597309267,
"grad_norm": 19.212339401245117,
"learning_rate": 4.944275299572638e-05,
"loss": 4.1323,
"step": 2500
},
{
"epoch": 0.05013787916771121,
"grad_norm": 16.833269119262695,
"learning_rate": 4.930309209239966e-05,
"loss": 4.0461,
"step": 3000
},
{
"epoch": 0.05013787916771121,
"eval_loss": 4.040640354156494,
"eval_runtime": 27.5887,
"eval_samples_per_second": 161.733,
"eval_steps_per_second": 20.226,
"step": 3000
},
{
"epoch": 0.05849419236232974,
"grad_norm": 12.117938995361328,
"learning_rate": 4.9163431189072935e-05,
"loss": 4.0848,
"step": 3500
},
{
"epoch": 0.06685050555694827,
"grad_norm": 16.871612548828125,
"learning_rate": 4.9023770285746213e-05,
"loss": 4.0421,
"step": 4000
},
{
"epoch": 0.06685050555694827,
"eval_loss": 3.966156005859375,
"eval_runtime": 27.5488,
"eval_samples_per_second": 161.967,
"eval_steps_per_second": 20.255,
"step": 4000
},
{
"epoch": 0.0752068187515668,
"grad_norm": 23.424396514892578,
"learning_rate": 4.8884109382419485e-05,
"loss": 3.9967,
"step": 4500
},
{
"epoch": 0.08356313194618534,
"grad_norm": 17.837812423706055,
"learning_rate": 4.8744448479092763e-05,
"loss": 3.9343,
"step": 5000
},
{
"epoch": 0.08356313194618534,
"eval_loss": 3.9122862815856934,
"eval_runtime": 27.6483,
"eval_samples_per_second": 161.384,
"eval_steps_per_second": 20.182,
"step": 5000
},
{
"epoch": 0.09191944514080387,
"grad_norm": 19.62445640563965,
"learning_rate": 4.860478757576604e-05,
"loss": 3.9297,
"step": 5500
},
{
"epoch": 0.10027575833542242,
"grad_norm": 16.499317169189453,
"learning_rate": 4.846512667243932e-05,
"loss": 3.8862,
"step": 6000
},
{
"epoch": 0.10027575833542242,
"eval_loss": 3.8697621822357178,
"eval_runtime": 30.2568,
"eval_samples_per_second": 147.471,
"eval_steps_per_second": 18.442,
"step": 6000
},
{
"epoch": 0.10863207153004095,
"grad_norm": 22.481382369995117,
"learning_rate": 4.83254657691126e-05,
"loss": 3.8479,
"step": 6500
},
{
"epoch": 0.11698838472465949,
"grad_norm": 15.390802383422852,
"learning_rate": 4.818580486578587e-05,
"loss": 3.8934,
"step": 7000
},
{
"epoch": 0.11698838472465949,
"eval_loss": 3.8397133350372314,
"eval_runtime": 27.593,
"eval_samples_per_second": 161.708,
"eval_steps_per_second": 20.223,
"step": 7000
},
{
"epoch": 0.12534469791927802,
"grad_norm": 16.615388870239258,
"learning_rate": 4.804670260607246e-05,
"loss": 3.8718,
"step": 7500
},
{
"epoch": 0.13370101111389654,
"grad_norm": 16.377056121826172,
"learning_rate": 4.7907041702745736e-05,
"loss": 3.813,
"step": 8000
},
{
"epoch": 0.13370101111389654,
"eval_loss": 3.8209779262542725,
"eval_runtime": 27.534,
"eval_samples_per_second": 162.054,
"eval_steps_per_second": 20.266,
"step": 8000
},
{
"epoch": 0.1420573243085151,
"grad_norm": 20.620891571044922,
"learning_rate": 4.7767380799419014e-05,
"loss": 3.8194,
"step": 8500
},
{
"epoch": 0.1504136375031336,
"grad_norm": 14.788801193237305,
"learning_rate": 4.7627719896092286e-05,
"loss": 3.8246,
"step": 9000
},
{
"epoch": 0.1504136375031336,
"eval_loss": 3.7902626991271973,
"eval_runtime": 27.5751,
"eval_samples_per_second": 161.812,
"eval_steps_per_second": 20.236,
"step": 9000
},
{
"epoch": 0.15876995069775215,
"grad_norm": 12.923628807067871,
"learning_rate": 4.748833831457222e-05,
"loss": 3.7739,
"step": 9500
},
{
"epoch": 0.16712626389237067,
"grad_norm": 15.342278480529785,
"learning_rate": 4.7348677411245494e-05,
"loss": 3.771,
"step": 10000
},
{
"epoch": 0.16712626389237067,
"eval_loss": 3.7538962364196777,
"eval_runtime": 27.5894,
"eval_samples_per_second": 161.729,
"eval_steps_per_second": 20.225,
"step": 10000
},
{
"epoch": 0.17548257708698922,
"grad_norm": 14.153196334838867,
"learning_rate": 4.720901650791877e-05,
"loss": 3.7321,
"step": 10500
},
{
"epoch": 0.18383889028160774,
"grad_norm": 29.715482711791992,
"learning_rate": 4.706935560459205e-05,
"loss": 3.7219,
"step": 11000
},
{
"epoch": 0.18383889028160774,
"eval_loss": 3.736612558364868,
"eval_runtime": 27.6054,
"eval_samples_per_second": 161.635,
"eval_steps_per_second": 20.213,
"step": 11000
},
{
"epoch": 0.1921952034762263,
"grad_norm": 22.01296043395996,
"learning_rate": 4.692969470126533e-05,
"loss": 3.7397,
"step": 11500
},
{
"epoch": 0.20055151667084484,
"grad_norm": 12.3303804397583,
"learning_rate": 4.679003379793861e-05,
"loss": 3.731,
"step": 12000
},
{
"epoch": 0.20055151667084484,
"eval_loss": 3.7091643810272217,
"eval_runtime": 27.5167,
"eval_samples_per_second": 162.156,
"eval_steps_per_second": 20.279,
"step": 12000
},
{
"epoch": 0.20890782986546336,
"grad_norm": 17.74376106262207,
"learning_rate": 4.6650372894611886e-05,
"loss": 3.7524,
"step": 12500
},
{
"epoch": 0.2172641430600819,
"grad_norm": 11.327990531921387,
"learning_rate": 4.6510711991285164e-05,
"loss": 3.6842,
"step": 13000
},
{
"epoch": 0.2172641430600819,
"eval_loss": 3.7014827728271484,
"eval_runtime": 27.5638,
"eval_samples_per_second": 161.879,
"eval_steps_per_second": 20.244,
"step": 13000
},
{
"epoch": 0.22562045625470042,
"grad_norm": 10.342738151550293,
"learning_rate": 4.637133040976509e-05,
"loss": 3.672,
"step": 13500
},
{
"epoch": 0.23397676944931897,
"grad_norm": 14.019166946411133,
"learning_rate": 4.623166950643837e-05,
"loss": 3.6743,
"step": 14000
},
{
"epoch": 0.23397676944931897,
"eval_loss": 3.680652618408203,
"eval_runtime": 27.7236,
"eval_samples_per_second": 160.946,
"eval_steps_per_second": 20.127,
"step": 14000
},
{
"epoch": 0.2423330826439375,
"grad_norm": 12.447026252746582,
"learning_rate": 4.609200860311165e-05,
"loss": 3.68,
"step": 14500
},
{
"epoch": 0.25068939583855604,
"grad_norm": 16.875900268554688,
"learning_rate": 4.595234769978493e-05,
"loss": 3.6755,
"step": 15000
},
{
"epoch": 0.25068939583855604,
"eval_loss": 3.656320095062256,
"eval_runtime": 27.5417,
"eval_samples_per_second": 162.009,
"eval_steps_per_second": 20.26,
"step": 15000
},
{
"epoch": 0.2590457090331746,
"grad_norm": 14.400089263916016,
"learning_rate": 4.581268679645821e-05,
"loss": 3.668,
"step": 15500
},
{
"epoch": 0.2674020222277931,
"grad_norm": 17.918594360351562,
"learning_rate": 4.567302589313148e-05,
"loss": 3.6105,
"step": 16000
},
{
"epoch": 0.2674020222277931,
"eval_loss": 3.6530709266662598,
"eval_runtime": 27.5371,
"eval_samples_per_second": 162.036,
"eval_steps_per_second": 20.264,
"step": 16000
},
{
"epoch": 0.2757583354224116,
"grad_norm": 25.155494689941406,
"learning_rate": 4.5533644311611415e-05,
"loss": 3.6537,
"step": 16500
},
{
"epoch": 0.2841146486170302,
"grad_norm": 19.406333923339844,
"learning_rate": 4.5393983408284686e-05,
"loss": 3.6321,
"step": 17000
},
{
"epoch": 0.2841146486170302,
"eval_loss": 3.625420331954956,
"eval_runtime": 27.6545,
"eval_samples_per_second": 161.348,
"eval_steps_per_second": 20.178,
"step": 17000
},
{
"epoch": 0.2924709618116487,
"grad_norm": 16.296859741210938,
"learning_rate": 4.5254322504957965e-05,
"loss": 3.5798,
"step": 17500
},
{
"epoch": 0.3008272750062672,
"grad_norm": 14.436009407043457,
"learning_rate": 4.511466160163124e-05,
"loss": 3.5772,
"step": 18000
},
{
"epoch": 0.3008272750062672,
"eval_loss": 3.609403371810913,
"eval_runtime": 27.6116,
"eval_samples_per_second": 161.599,
"eval_steps_per_second": 20.209,
"step": 18000
},
{
"epoch": 0.30918358820088576,
"grad_norm": 14.975789070129395,
"learning_rate": 4.497500069830452e-05,
"loss": 3.5887,
"step": 18500
},
{
"epoch": 0.3175399013955043,
"grad_norm": 13.919900894165039,
"learning_rate": 4.48353397949778e-05,
"loss": 3.5855,
"step": 19000
},
{
"epoch": 0.3175399013955043,
"eval_loss": 3.593752384185791,
"eval_runtime": 27.5247,
"eval_samples_per_second": 162.109,
"eval_steps_per_second": 20.273,
"step": 19000
},
{
"epoch": 0.32589621459012286,
"grad_norm": 13.113483428955078,
"learning_rate": 4.469567889165107e-05,
"loss": 3.5958,
"step": 19500
},
{
"epoch": 0.33425252778474135,
"grad_norm": 16.182727813720703,
"learning_rate": 4.455601798832435e-05,
"loss": 3.5544,
"step": 20000
},
{
"epoch": 0.33425252778474135,
"eval_loss": 3.5966029167175293,
"eval_runtime": 27.5874,
"eval_samples_per_second": 161.741,
"eval_steps_per_second": 20.227,
"step": 20000
},
{
"epoch": 0.3426088409793599,
"grad_norm": 11.520635604858398,
"learning_rate": 4.441635708499763e-05,
"loss": 3.5985,
"step": 20500
},
{
"epoch": 0.35096515417397844,
"grad_norm": 9.132452011108398,
"learning_rate": 4.4276696181670906e-05,
"loss": 3.5385,
"step": 21000
},
{
"epoch": 0.35096515417397844,
"eval_loss": 3.5922670364379883,
"eval_runtime": 27.5592,
"eval_samples_per_second": 161.906,
"eval_steps_per_second": 20.247,
"step": 21000
},
{
"epoch": 0.359321467368597,
"grad_norm": 11.077112197875977,
"learning_rate": 4.413703527834418e-05,
"loss": 3.5502,
"step": 21500
},
{
"epoch": 0.3676777805632155,
"grad_norm": 10.871437072753906,
"learning_rate": 4.3997653696824114e-05,
"loss": 3.5683,
"step": 22000
},
{
"epoch": 0.3676777805632155,
"eval_loss": 3.570159673690796,
"eval_runtime": 27.5952,
"eval_samples_per_second": 161.695,
"eval_steps_per_second": 20.221,
"step": 22000
},
{
"epoch": 0.37603409375783403,
"grad_norm": 14.359475135803223,
"learning_rate": 4.3858272115304044e-05,
"loss": 3.6252,
"step": 22500
},
{
"epoch": 0.3843904069524526,
"grad_norm": 14.215445518493652,
"learning_rate": 4.371861121197732e-05,
"loss": 3.5357,
"step": 23000
},
{
"epoch": 0.3843904069524526,
"eval_loss": 3.5666186809539795,
"eval_runtime": 27.4876,
"eval_samples_per_second": 162.328,
"eval_steps_per_second": 20.3,
"step": 23000
},
{
"epoch": 0.3927467201470711,
"grad_norm": 12.356164932250977,
"learning_rate": 4.3578950308650594e-05,
"loss": 3.5561,
"step": 23500
},
{
"epoch": 0.4011030333416897,
"grad_norm": 13.857044219970703,
"learning_rate": 4.343956872713053e-05,
"loss": 3.5108,
"step": 24000
},
{
"epoch": 0.4011030333416897,
"eval_loss": 3.553119659423828,
"eval_runtime": 27.5959,
"eval_samples_per_second": 161.691,
"eval_steps_per_second": 20.22,
"step": 24000
},
{
"epoch": 0.40945934653630817,
"grad_norm": 18.891963958740234,
"learning_rate": 4.329990782380381e-05,
"loss": 3.5144,
"step": 24500
},
{
"epoch": 0.4178156597309267,
"grad_norm": 11.144295692443848,
"learning_rate": 4.316024692047708e-05,
"loss": 3.4772,
"step": 25000
},
{
"epoch": 0.4178156597309267,
"eval_loss": 3.539018154144287,
"eval_runtime": 27.6102,
"eval_samples_per_second": 161.607,
"eval_steps_per_second": 20.21,
"step": 25000
},
{
"epoch": 0.42617197292554526,
"grad_norm": 13.266519546508789,
"learning_rate": 4.302058601715036e-05,
"loss": 3.4983,
"step": 25500
},
{
"epoch": 0.4345282861201638,
"grad_norm": 13.068937301635742,
"learning_rate": 4.288092511382364e-05,
"loss": 3.5372,
"step": 26000
},
{
"epoch": 0.4345282861201638,
"eval_loss": 3.5329203605651855,
"eval_runtime": 27.5669,
"eval_samples_per_second": 161.861,
"eval_steps_per_second": 20.242,
"step": 26000
},
{
"epoch": 0.4428845993147823,
"grad_norm": 10.656412124633789,
"learning_rate": 4.2741543532303566e-05,
"loss": 3.4773,
"step": 26500
},
{
"epoch": 0.45124091250940085,
"grad_norm": 13.304460525512695,
"learning_rate": 4.2601882628976845e-05,
"loss": 3.5119,
"step": 27000
},
{
"epoch": 0.45124091250940085,
"eval_loss": 3.5271201133728027,
"eval_runtime": 27.5897,
"eval_samples_per_second": 161.727,
"eval_steps_per_second": 20.225,
"step": 27000
},
{
"epoch": 0.4595972257040194,
"grad_norm": 9.64598560333252,
"learning_rate": 4.246222172565012e-05,
"loss": 3.4904,
"step": 27500
},
{
"epoch": 0.46795353889863794,
"grad_norm": 18.846969604492188,
"learning_rate": 4.23225608223234e-05,
"loss": 3.4627,
"step": 28000
},
{
"epoch": 0.46795353889863794,
"eval_loss": 3.52040433883667,
"eval_runtime": 27.5839,
"eval_samples_per_second": 161.761,
"eval_steps_per_second": 20.229,
"step": 28000
},
{
"epoch": 0.47630985209325644,
"grad_norm": 11.906749725341797,
"learning_rate": 4.218289991899668e-05,
"loss": 3.4874,
"step": 28500
},
{
"epoch": 0.484666165287875,
"grad_norm": 11.39976692199707,
"learning_rate": 4.204323901566996e-05,
"loss": 3.5089,
"step": 29000
},
{
"epoch": 0.484666165287875,
"eval_loss": 3.5191400051116943,
"eval_runtime": 27.5537,
"eval_samples_per_second": 161.938,
"eval_steps_per_second": 20.251,
"step": 29000
},
{
"epoch": 0.49302247848249353,
"grad_norm": 16.590219497680664,
"learning_rate": 4.190385743414989e-05,
"loss": 3.4669,
"step": 29500
},
{
"epoch": 0.5013787916771121,
"grad_norm": 12.839526176452637,
"learning_rate": 4.1764196530823166e-05,
"loss": 3.5124,
"step": 30000
},
{
"epoch": 0.5013787916771121,
"eval_loss": 3.5098681449890137,
"eval_runtime": 27.52,
"eval_samples_per_second": 162.137,
"eval_steps_per_second": 20.276,
"step": 30000
},
{
"epoch": 0.5097351048717306,
"grad_norm": 15.809226989746094,
"learning_rate": 4.1624535627496444e-05,
"loss": 3.4721,
"step": 30500
},
{
"epoch": 0.5180914180663492,
"grad_norm": 14.224934577941895,
"learning_rate": 4.148487472416972e-05,
"loss": 3.4529,
"step": 31000
},
{
"epoch": 0.5180914180663492,
"eval_loss": 3.4936439990997314,
"eval_runtime": 27.5971,
"eval_samples_per_second": 161.684,
"eval_steps_per_second": 20.22,
"step": 31000
},
{
"epoch": 0.5264477312609677,
"grad_norm": 14.624770164489746,
"learning_rate": 4.1345213820842994e-05,
"loss": 3.4888,
"step": 31500
},
{
"epoch": 0.5348040444555862,
"grad_norm": 11.033506393432617,
"learning_rate": 4.120555291751627e-05,
"loss": 3.4595,
"step": 32000
},
{
"epoch": 0.5348040444555862,
"eval_loss": 3.4841041564941406,
"eval_runtime": 27.5814,
"eval_samples_per_second": 161.776,
"eval_steps_per_second": 20.231,
"step": 32000
},
{
"epoch": 0.5431603576502048,
"grad_norm": 11.615704536437988,
"learning_rate": 4.106589201418955e-05,
"loss": 3.4677,
"step": 32500
},
{
"epoch": 0.5515166708448233,
"grad_norm": 13.639912605285645,
"learning_rate": 4.092651043266948e-05,
"loss": 3.4418,
"step": 33000
},
{
"epoch": 0.5515166708448233,
"eval_loss": 3.478278398513794,
"eval_runtime": 27.6018,
"eval_samples_per_second": 161.656,
"eval_steps_per_second": 20.216,
"step": 33000
},
{
"epoch": 0.5598729840394417,
"grad_norm": 17.30266761779785,
"learning_rate": 4.078684952934276e-05,
"loss": 3.4707,
"step": 33500
},
{
"epoch": 0.5682292972340603,
"grad_norm": 10.405800819396973,
"learning_rate": 4.064718862601604e-05,
"loss": 3.4683,
"step": 34000
},
{
"epoch": 0.5682292972340603,
"eval_loss": 3.474083662033081,
"eval_runtime": 27.598,
"eval_samples_per_second": 161.678,
"eval_steps_per_second": 20.219,
"step": 34000
},
{
"epoch": 0.5765856104286788,
"grad_norm": 12.891817092895508,
"learning_rate": 4.0507527722689315e-05,
"loss": 3.4564,
"step": 34500
},
{
"epoch": 0.5849419236232974,
"grad_norm": 8.831328392028809,
"learning_rate": 4.036786681936259e-05,
"loss": 3.3926,
"step": 35000
},
{
"epoch": 0.5849419236232974,
"eval_loss": 3.4699606895446777,
"eval_runtime": 27.6266,
"eval_samples_per_second": 161.511,
"eval_steps_per_second": 20.198,
"step": 35000
},
{
"epoch": 0.5932982368179159,
"grad_norm": 16.12383460998535,
"learning_rate": 4.0228205916035865e-05,
"loss": 3.4639,
"step": 35500
},
{
"epoch": 0.6016545500125344,
"grad_norm": 12.538627624511719,
"learning_rate": 4.0088545012709144e-05,
"loss": 3.4289,
"step": 36000
},
{
"epoch": 0.6016545500125344,
"eval_loss": 3.4501824378967285,
"eval_runtime": 27.6054,
"eval_samples_per_second": 161.635,
"eval_steps_per_second": 20.213,
"step": 36000
},
{
"epoch": 0.610010863207153,
"grad_norm": 13.25362777709961,
"learning_rate": 3.994888410938242e-05,
"loss": 3.4552,
"step": 36500
},
{
"epoch": 0.6183671764017715,
"grad_norm": 14.144664764404297,
"learning_rate": 3.980950252786235e-05,
"loss": 3.432,
"step": 37000
},
{
"epoch": 0.6183671764017715,
"eval_loss": 3.4386274814605713,
"eval_runtime": 27.5191,
"eval_samples_per_second": 162.142,
"eval_steps_per_second": 20.277,
"step": 37000
},
{
"epoch": 0.6267234895963901,
"grad_norm": 11.082966804504395,
"learning_rate": 3.966984162453563e-05,
"loss": 3.446,
"step": 37500
},
{
"epoch": 0.6350798027910086,
"grad_norm": 12.105545997619629,
"learning_rate": 3.953018072120891e-05,
"loss": 3.4498,
"step": 38000
},
{
"epoch": 0.6350798027910086,
"eval_loss": 3.4284586906433105,
"eval_runtime": 27.6228,
"eval_samples_per_second": 161.533,
"eval_steps_per_second": 20.201,
"step": 38000
},
{
"epoch": 0.6434361159856271,
"grad_norm": 11.420595169067383,
"learning_rate": 3.939051981788218e-05,
"loss": 3.3634,
"step": 38500
},
{
"epoch": 0.6517924291802457,
"grad_norm": 13.421010971069336,
"learning_rate": 3.9251138236362116e-05,
"loss": 3.4011,
"step": 39000
},
{
"epoch": 0.6517924291802457,
"eval_loss": 3.4333345890045166,
"eval_runtime": 27.6141,
"eval_samples_per_second": 161.584,
"eval_steps_per_second": 20.207,
"step": 39000
},
{
"epoch": 0.6601487423748642,
"grad_norm": 13.694308280944824,
"learning_rate": 3.911147733303539e-05,
"loss": 3.3544,
"step": 39500
},
{
"epoch": 0.6685050555694827,
"grad_norm": 13.354585647583008,
"learning_rate": 3.8971816429708666e-05,
"loss": 3.4279,
"step": 40000
},
{
"epoch": 0.6685050555694827,
"eval_loss": 3.428675651550293,
"eval_runtime": 29.3177,
"eval_samples_per_second": 152.194,
"eval_steps_per_second": 19.033,
"step": 40000
},
{
"epoch": 0.6768613687641013,
"grad_norm": 13.542250633239746,
"learning_rate": 3.8832155526381945e-05,
"loss": 3.3683,
"step": 40500
},
{
"epoch": 0.6852176819587198,
"grad_norm": 15.259937286376953,
"learning_rate": 3.869249462305522e-05,
"loss": 3.3964,
"step": 41000
},
{
"epoch": 0.6852176819587198,
"eval_loss": 3.4252407550811768,
"eval_runtime": 32.0858,
"eval_samples_per_second": 139.065,
"eval_steps_per_second": 17.391,
"step": 41000
},
{
"epoch": 0.6935739951533384,
"grad_norm": 8.693069458007812,
"learning_rate": 3.85528337197285e-05,
"loss": 3.3419,
"step": 41500
},
{
"epoch": 0.7019303083479569,
"grad_norm": 10.193922996520996,
"learning_rate": 3.841317281640178e-05,
"loss": 3.361,
"step": 42000
},
{
"epoch": 0.7019303083479569,
"eval_loss": 3.423677444458008,
"eval_runtime": 27.5904,
"eval_samples_per_second": 161.723,
"eval_steps_per_second": 20.224,
"step": 42000
},
{
"epoch": 0.7102866215425754,
"grad_norm": 13.626117706298828,
"learning_rate": 3.827351191307506e-05,
"loss": 3.3456,
"step": 42500
},
{
"epoch": 0.718642934737194,
"grad_norm": 15.671127319335938,
"learning_rate": 3.813413033155499e-05,
"loss": 3.39,
"step": 43000
},
{
"epoch": 0.718642934737194,
"eval_loss": 3.410151243209839,
"eval_runtime": 27.586,
"eval_samples_per_second": 161.749,
"eval_steps_per_second": 20.228,
"step": 43000
},
{
"epoch": 0.7269992479318125,
"grad_norm": 13.179340362548828,
"learning_rate": 3.7994469428228266e-05,
"loss": 3.3407,
"step": 43500
},
{
"epoch": 0.735355561126431,
"grad_norm": 11.219006538391113,
"learning_rate": 3.7854808524901544e-05,
"loss": 3.3509,
"step": 44000
},
{
"epoch": 0.735355561126431,
"eval_loss": 3.4000790119171143,
"eval_runtime": 27.5216,
"eval_samples_per_second": 162.127,
"eval_steps_per_second": 20.275,
"step": 44000
},
{
"epoch": 0.7437118743210496,
"grad_norm": 19.535215377807617,
"learning_rate": 3.771514762157482e-05,
"loss": 3.3784,
"step": 44500
},
{
"epoch": 0.7520681875156681,
"grad_norm": 11.256051063537598,
"learning_rate": 3.7575486718248094e-05,
"loss": 3.3584,
"step": 45000
},
{
"epoch": 0.7520681875156681,
"eval_loss": 3.3937857151031494,
"eval_runtime": 27.6288,
"eval_samples_per_second": 161.498,
"eval_steps_per_second": 20.196,
"step": 45000
},
{
"epoch": 0.7604245007102867,
"grad_norm": 14.835156440734863,
"learning_rate": 3.743610513672803e-05,
"loss": 3.3964,
"step": 45500
},
{
"epoch": 0.7687808139049052,
"grad_norm": 13.36843204498291,
"learning_rate": 3.72964442334013e-05,
"loss": 3.3612,
"step": 46000
},
{
"epoch": 0.7687808139049052,
"eval_loss": 3.4002346992492676,
"eval_runtime": 27.6951,
"eval_samples_per_second": 161.112,
"eval_steps_per_second": 20.148,
"step": 46000
},
{
"epoch": 0.7771371270995237,
"grad_norm": 15.822681427001953,
"learning_rate": 3.715706265188124e-05,
"loss": 3.3235,
"step": 46500
},
{
"epoch": 0.7854934402941423,
"grad_norm": 11.626577377319336,
"learning_rate": 3.701740174855451e-05,
"loss": 3.3167,
"step": 47000
},
{
"epoch": 0.7854934402941423,
"eval_loss": 3.3830487728118896,
"eval_runtime": 27.7333,
"eval_samples_per_second": 160.89,
"eval_steps_per_second": 20.12,
"step": 47000
},
{
"epoch": 0.7938497534887607,
"grad_norm": 18.387489318847656,
"learning_rate": 3.687774084522779e-05,
"loss": 3.3468,
"step": 47500
},
{
"epoch": 0.8022060666833793,
"grad_norm": 10.468737602233887,
"learning_rate": 3.673807994190107e-05,
"loss": 3.3765,
"step": 48000
},
{
"epoch": 0.8022060666833793,
"eval_loss": 3.3805253505706787,
"eval_runtime": 27.5567,
"eval_samples_per_second": 161.921,
"eval_steps_per_second": 20.249,
"step": 48000
},
{
"epoch": 0.8105623798779978,
"grad_norm": 12.263431549072266,
"learning_rate": 3.6598419038574345e-05,
"loss": 3.3353,
"step": 48500
},
{
"epoch": 0.8189186930726163,
"grad_norm": 10.66336441040039,
"learning_rate": 3.6458758135247623e-05,
"loss": 3.2779,
"step": 49000
},
{
"epoch": 0.8189186930726163,
"eval_loss": 3.3863792419433594,
"eval_runtime": 27.5626,
"eval_samples_per_second": 161.886,
"eval_steps_per_second": 20.245,
"step": 49000
},
{
"epoch": 0.8272750062672349,
"grad_norm": 13.781414031982422,
"learning_rate": 3.6319097231920895e-05,
"loss": 3.3591,
"step": 49500
},
{
"epoch": 0.8356313194618534,
"grad_norm": 10.030421257019043,
"learning_rate": 3.617943632859417e-05,
"loss": 3.3354,
"step": 50000
},
{
"epoch": 0.8356313194618534,
"eval_loss": 3.367990493774414,
"eval_runtime": 27.5956,
"eval_samples_per_second": 161.693,
"eval_steps_per_second": 20.221,
"step": 50000
},
{
"epoch": 0.8439876326564719,
"grad_norm": 14.890128135681152,
"learning_rate": 3.603977542526745e-05,
"loss": 3.3061,
"step": 50500
},
{
"epoch": 0.8523439458510905,
"grad_norm": 16.298643112182617,
"learning_rate": 3.590011452194073e-05,
"loss": 3.3032,
"step": 51000
},
{
"epoch": 0.8523439458510905,
"eval_loss": 3.3711585998535156,
"eval_runtime": 27.5103,
"eval_samples_per_second": 162.194,
"eval_steps_per_second": 20.283,
"step": 51000
},
{
"epoch": 0.860700259045709,
"grad_norm": 11.454365730285645,
"learning_rate": 3.576045361861401e-05,
"loss": 3.2919,
"step": 51500
},
{
"epoch": 0.8690565722403276,
"grad_norm": 11.1732177734375,
"learning_rate": 3.562107203709394e-05,
"loss": 3.2936,
"step": 52000
},
{
"epoch": 0.8690565722403276,
"eval_loss": 3.358672857284546,
"eval_runtime": 27.5316,
"eval_samples_per_second": 162.068,
"eval_steps_per_second": 20.268,
"step": 52000
},
{
"epoch": 0.8774128854349461,
"grad_norm": 12.489287376403809,
"learning_rate": 3.5481411133767216e-05,
"loss": 3.2694,
"step": 52500
},
{
"epoch": 0.8857691986295646,
"grad_norm": 12.570661544799805,
"learning_rate": 3.534175023044049e-05,
"loss": 3.322,
"step": 53000
},
{
"epoch": 0.8857691986295646,
"eval_loss": 3.3604071140289307,
"eval_runtime": 27.5745,
"eval_samples_per_second": 161.816,
"eval_steps_per_second": 20.236,
"step": 53000
},
{
"epoch": 0.8941255118241832,
"grad_norm": 17.960376739501953,
"learning_rate": 3.5202368648920424e-05,
"loss": 3.2955,
"step": 53500
},
{
"epoch": 0.9024818250188017,
"grad_norm": 13.333609580993652,
"learning_rate": 3.5062707745593696e-05,
"loss": 3.3394,
"step": 54000
},
{
"epoch": 0.9024818250188017,
"eval_loss": 3.3498120307922363,
"eval_runtime": 27.561,
"eval_samples_per_second": 161.896,
"eval_steps_per_second": 20.246,
"step": 54000
},
{
"epoch": 0.9108381382134202,
"grad_norm": 16.366514205932617,
"learning_rate": 3.4923046842266974e-05,
"loss": 3.3223,
"step": 54500
},
{
"epoch": 0.9191944514080388,
"grad_norm": 10.783904075622559,
"learning_rate": 3.478338593894025e-05,
"loss": 3.2717,
"step": 55000
},
{
"epoch": 0.9191944514080388,
"eval_loss": 3.3506462574005127,
"eval_runtime": 27.5889,
"eval_samples_per_second": 161.732,
"eval_steps_per_second": 20.226,
"step": 55000
},
{
"epoch": 0.9275507646026573,
"grad_norm": 12.693829536437988,
"learning_rate": 3.464372503561353e-05,
"loss": 3.2696,
"step": 55500
},
{
"epoch": 0.9359070777972759,
"grad_norm": 20.29674530029297,
"learning_rate": 3.450406413228681e-05,
"loss": 3.3342,
"step": 56000
},
{
"epoch": 0.9359070777972759,
"eval_loss": 3.333944797515869,
"eval_runtime": 27.579,
"eval_samples_per_second": 161.79,
"eval_steps_per_second": 20.233,
"step": 56000
},
{
"epoch": 0.9442633909918944,
"grad_norm": 14.309937477111816,
"learning_rate": 3.436440322896009e-05,
"loss": 3.3321,
"step": 56500
},
{
"epoch": 0.9526197041865129,
"grad_norm": 8.43278980255127,
"learning_rate": 3.4224742325633366e-05,
"loss": 3.2396,
"step": 57000
},
{
"epoch": 0.9526197041865129,
"eval_loss": 3.3314433097839355,
"eval_runtime": 27.6061,
"eval_samples_per_second": 161.631,
"eval_steps_per_second": 20.213,
"step": 57000
},
{
"epoch": 0.9609760173811315,
"grad_norm": 10.31540584564209,
"learning_rate": 3.4085081422306644e-05,
"loss": 3.2436,
"step": 57500
},
{
"epoch": 0.96933233057575,
"grad_norm": 10.261503219604492,
"learning_rate": 3.3945699840786574e-05,
"loss": 3.2845,
"step": 58000
},
{
"epoch": 0.96933233057575,
"eval_loss": 3.3237485885620117,
"eval_runtime": 27.545,
"eval_samples_per_second": 161.99,
"eval_steps_per_second": 20.258,
"step": 58000
},
{
"epoch": 0.9776886437703685,
"grad_norm": 10.157827377319336,
"learning_rate": 3.380603893745985e-05,
"loss": 3.2976,
"step": 58500
},
{
"epoch": 0.9860449569649871,
"grad_norm": 12.794463157653809,
"learning_rate": 3.366637803413313e-05,
"loss": 3.2621,
"step": 59000
},
{
"epoch": 0.9860449569649871,
"eval_loss": 3.325364351272583,
"eval_runtime": 27.6749,
"eval_samples_per_second": 161.229,
"eval_steps_per_second": 20.163,
"step": 59000
},
{
"epoch": 0.9944012701596056,
"grad_norm": 17.333826065063477,
"learning_rate": 3.35267171308064e-05,
"loss": 3.2696,
"step": 59500
},
{
"epoch": 1.0027575833542242,
"grad_norm": 12.315380096435547,
"learning_rate": 3.338705622747968e-05,
"loss": 3.2115,
"step": 60000
},
{
"epoch": 1.0027575833542242,
"eval_loss": 3.336367607116699,
"eval_runtime": 27.5799,
"eval_samples_per_second": 161.785,
"eval_steps_per_second": 20.232,
"step": 60000
},
{
"epoch": 1.0111138965488426,
"grad_norm": 23.34908676147461,
"learning_rate": 3.324739532415296e-05,
"loss": 2.9589,
"step": 60500
},
{
"epoch": 1.0194702097434611,
"grad_norm": 10.89417839050293,
"learning_rate": 3.310773442082624e-05,
"loss": 3.0302,
"step": 61000
},
{
"epoch": 1.0194702097434611,
"eval_loss": 3.325634479522705,
"eval_runtime": 27.573,
"eval_samples_per_second": 161.825,
"eval_steps_per_second": 20.237,
"step": 61000
},
{
"epoch": 1.0278265229380796,
"grad_norm": 16.59639549255371,
"learning_rate": 3.296835283930617e-05,
"loss": 2.9884,
"step": 61500
},
{
"epoch": 1.0361828361326983,
"grad_norm": 13.3978910446167,
"learning_rate": 3.2828691935979445e-05,
"loss": 2.9762,
"step": 62000
},
{
"epoch": 1.0361828361326983,
"eval_loss": 3.334028482437134,
"eval_runtime": 27.5551,
"eval_samples_per_second": 161.93,
"eval_steps_per_second": 20.25,
"step": 62000
},
{
"epoch": 1.0445391493273168,
"grad_norm": 10.937264442443848,
"learning_rate": 3.2689031032652723e-05,
"loss": 2.9597,
"step": 62500
},
{
"epoch": 1.0528954625219353,
"grad_norm": 14.150084495544434,
"learning_rate": 3.2549370129325995e-05,
"loss": 2.997,
"step": 63000
},
{
"epoch": 1.0528954625219353,
"eval_loss": 3.320002794265747,
"eval_runtime": 27.5918,
"eval_samples_per_second": 161.715,
"eval_steps_per_second": 20.223,
"step": 63000
},
{
"epoch": 1.0612517757165538,
"grad_norm": 10.700261116027832,
"learning_rate": 3.240970922599927e-05,
"loss": 2.9857,
"step": 63500
},
{
"epoch": 1.0696080889111723,
"grad_norm": 11.004881858825684,
"learning_rate": 3.22703276444792e-05,
"loss": 2.9591,
"step": 64000
},
{
"epoch": 1.0696080889111723,
"eval_loss": 3.333744525909424,
"eval_runtime": 27.6282,
"eval_samples_per_second": 161.502,
"eval_steps_per_second": 20.197,
"step": 64000
},
{
"epoch": 1.077964402105791,
"grad_norm": 10.794275283813477,
"learning_rate": 3.213066674115248e-05,
"loss": 2.9817,
"step": 64500
},
{
"epoch": 1.0863207153004095,
"grad_norm": 15.817968368530273,
"learning_rate": 3.199100583782576e-05,
"loss": 2.9543,
"step": 65000
},
{
"epoch": 1.0863207153004095,
"eval_loss": 3.3309056758880615,
"eval_runtime": 27.5459,
"eval_samples_per_second": 161.984,
"eval_steps_per_second": 20.257,
"step": 65000
},
{
"epoch": 1.094677028495028,
"grad_norm": 14.550418853759766,
"learning_rate": 3.185134493449904e-05,
"loss": 2.9485,
"step": 65500
},
{
"epoch": 1.1030333416896465,
"grad_norm": 11.362966537475586,
"learning_rate": 3.1711684031172316e-05,
"loss": 2.9787,
"step": 66000
},
{
"epoch": 1.1030333416896465,
"eval_loss": 3.332648992538452,
"eval_runtime": 27.619,
"eval_samples_per_second": 161.555,
"eval_steps_per_second": 20.203,
"step": 66000
},
{
"epoch": 1.111389654884265,
"grad_norm": 14.36471176147461,
"learning_rate": 3.157202312784559e-05,
"loss": 2.9943,
"step": 66500
},
{
"epoch": 1.1197459680788837,
"grad_norm": 17.348573684692383,
"learning_rate": 3.1432641546325524e-05,
"loss": 3.033,
"step": 67000
},
{
"epoch": 1.1197459680788837,
"eval_loss": 3.311136245727539,
"eval_runtime": 27.6024,
"eval_samples_per_second": 161.653,
"eval_steps_per_second": 20.216,
"step": 67000
},
{
"epoch": 1.1281022812735022,
"grad_norm": 13.361127853393555,
"learning_rate": 3.1292980642998796e-05,
"loss": 2.995,
"step": 67500
},
{
"epoch": 1.1364585944681207,
"grad_norm": 12.931785583496094,
"learning_rate": 3.1153319739672074e-05,
"loss": 2.9679,
"step": 68000
},
{
"epoch": 1.1364585944681207,
"eval_loss": 3.308124542236328,
"eval_runtime": 27.5871,
"eval_samples_per_second": 161.742,
"eval_steps_per_second": 20.227,
"step": 68000
},
{
"epoch": 1.1448149076627392,
"grad_norm": 15.317282676696777,
"learning_rate": 3.101365883634535e-05,
"loss": 3.0068,
"step": 68500
},
{
"epoch": 1.1531712208573577,
"grad_norm": 16.179967880249023,
"learning_rate": 3.087399793301863e-05,
"loss": 2.9658,
"step": 69000
},
{
"epoch": 1.1531712208573577,
"eval_loss": 3.3181824684143066,
"eval_runtime": 27.6733,
"eval_samples_per_second": 161.238,
"eval_steps_per_second": 20.164,
"step": 69000
},
{
"epoch": 1.1615275340519762,
"grad_norm": 15.436213493347168,
"learning_rate": 3.073433702969191e-05,
"loss": 3.0074,
"step": 69500
},
{
"epoch": 1.1698838472465949,
"grad_norm": 27.164413452148438,
"learning_rate": 3.059467612636519e-05,
"loss": 2.9649,
"step": 70000
},
{
"epoch": 1.1698838472465949,
"eval_loss": 3.3080978393554688,
"eval_runtime": 27.6434,
"eval_samples_per_second": 161.413,
"eval_steps_per_second": 20.186,
"step": 70000
},
{
"epoch": 1.1782401604412134,
"grad_norm": 11.414698600769043,
"learning_rate": 3.045529454484512e-05,
"loss": 3.0125,
"step": 70500
},
{
"epoch": 1.1865964736358319,
"grad_norm": 15.268623352050781,
"learning_rate": 3.0315633641518392e-05,
"loss": 2.9853,
"step": 71000
},
{
"epoch": 1.1865964736358319,
"eval_loss": 3.298069477081299,
"eval_runtime": 27.61,
"eval_samples_per_second": 161.608,
"eval_steps_per_second": 20.21,
"step": 71000
},
{
"epoch": 1.1949527868304504,
"grad_norm": 23.319032669067383,
"learning_rate": 3.017625205999833e-05,
"loss": 2.9738,
"step": 71500
},
{
"epoch": 1.2033091000250689,
"grad_norm": 11.64974308013916,
"learning_rate": 3.00365911566716e-05,
"loss": 2.9607,
"step": 72000
},
{
"epoch": 1.2033091000250689,
"eval_loss": 3.3039419651031494,
"eval_runtime": 29.6202,
"eval_samples_per_second": 150.64,
"eval_steps_per_second": 18.838,
"step": 72000
},
{
"epoch": 1.2116654132196876,
"grad_norm": 11.017394065856934,
"learning_rate": 2.989693025334488e-05,
"loss": 2.9694,
"step": 72500
},
{
"epoch": 1.220021726414306,
"grad_norm": 11.3243989944458,
"learning_rate": 2.9757269350018157e-05,
"loss": 2.9665,
"step": 73000
},
{
"epoch": 1.220021726414306,
"eval_loss": 3.302910804748535,
"eval_runtime": 27.6122,
"eval_samples_per_second": 161.595,
"eval_steps_per_second": 20.208,
"step": 73000
},
{
"epoch": 1.2283780396089246,
"grad_norm": 14.27160358428955,
"learning_rate": 2.9617608446691435e-05,
"loss": 2.9554,
"step": 73500
},
{
"epoch": 1.236734352803543,
"grad_norm": 9.526435852050781,
"learning_rate": 2.9478226865171365e-05,
"loss": 3.0167,
"step": 74000
},
{
"epoch": 1.236734352803543,
"eval_loss": 3.3012630939483643,
"eval_runtime": 27.614,
"eval_samples_per_second": 161.584,
"eval_steps_per_second": 20.207,
"step": 74000
},
{
"epoch": 1.2450906659981615,
"grad_norm": 14.875115394592285,
"learning_rate": 2.9338565961844643e-05,
"loss": 3.0263,
"step": 74500
},
{
"epoch": 1.25344697919278,
"grad_norm": 16.816545486450195,
"learning_rate": 2.919890505851792e-05,
"loss": 2.9977,
"step": 75000
},
{
"epoch": 1.25344697919278,
"eval_loss": 3.3035476207733154,
"eval_runtime": 27.574,
"eval_samples_per_second": 161.819,
"eval_steps_per_second": 20.236,
"step": 75000
},
{
"epoch": 1.2618032923873987,
"grad_norm": 16.662649154663086,
"learning_rate": 2.9059244155191196e-05,
"loss": 2.9594,
"step": 75500
},
{
"epoch": 1.2701596055820172,
"grad_norm": 14.543773651123047,
"learning_rate": 2.8919583251864475e-05,
"loss": 2.9845,
"step": 76000
},
{
"epoch": 1.2701596055820172,
"eval_loss": 3.302872896194458,
"eval_runtime": 27.5299,
"eval_samples_per_second": 162.078,
"eval_steps_per_second": 20.269,
"step": 76000
},
{
"epoch": 1.2785159187766357,
"grad_norm": 15.129777908325195,
"learning_rate": 2.8779922348537753e-05,
"loss": 2.9826,
"step": 76500
},
{
"epoch": 1.2868722319712542,
"grad_norm": 13.58123779296875,
"learning_rate": 2.864026144521103e-05,
"loss": 2.9302,
"step": 77000
},
{
"epoch": 1.2868722319712542,
"eval_loss": 3.287860155105591,
"eval_runtime": 27.5656,
"eval_samples_per_second": 161.868,
"eval_steps_per_second": 20.243,
"step": 77000
},
{
"epoch": 1.2952285451658727,
"grad_norm": 13.634276390075684,
"learning_rate": 2.8500600541884303e-05,
"loss": 2.9802,
"step": 77500
},
{
"epoch": 1.3035848583604914,
"grad_norm": 12.221925735473633,
"learning_rate": 2.836093963855758e-05,
"loss": 3.0119,
"step": 78000
},
{
"epoch": 1.3035848583604914,
"eval_loss": 3.27937650680542,
"eval_runtime": 27.6023,
"eval_samples_per_second": 161.653,
"eval_steps_per_second": 20.216,
"step": 78000
},
{
"epoch": 1.31194117155511,
"grad_norm": 9.44093132019043,
"learning_rate": 2.822127873523086e-05,
"loss": 2.9562,
"step": 78500
},
{
"epoch": 1.3202974847497284,
"grad_norm": 13.62260627746582,
"learning_rate": 2.8082176475517447e-05,
"loss": 2.982,
"step": 79000
},
{
"epoch": 1.3202974847497284,
"eval_loss": 3.2890851497650146,
"eval_runtime": 27.5678,
"eval_samples_per_second": 161.856,
"eval_steps_per_second": 20.241,
"step": 79000
},
{
"epoch": 1.328653797944347,
"grad_norm": 12.078137397766113,
"learning_rate": 2.7942515572190725e-05,
"loss": 2.9453,
"step": 79500
},
{
"epoch": 1.3370101111389654,
"grad_norm": 11.467178344726562,
"learning_rate": 2.7802854668863997e-05,
"loss": 3.0008,
"step": 80000
},
{
"epoch": 1.3370101111389654,
"eval_loss": 3.2852883338928223,
"eval_runtime": 27.5861,
"eval_samples_per_second": 161.748,
"eval_steps_per_second": 20.228,
"step": 80000
},
{
"epoch": 1.345366424333584,
"grad_norm": 14.292551040649414,
"learning_rate": 2.7663473087343933e-05,
"loss": 2.9664,
"step": 80500
},
{
"epoch": 1.3537227375282026,
"grad_norm": 13.714376449584961,
"learning_rate": 2.7523812184017205e-05,
"loss": 2.9396,
"step": 81000
},
{
"epoch": 1.3537227375282026,
"eval_loss": 3.2859437465667725,
"eval_runtime": 27.6096,
"eval_samples_per_second": 161.61,
"eval_steps_per_second": 20.21,
"step": 81000
},
{
"epoch": 1.362079050722821,
"grad_norm": 12.142716407775879,
"learning_rate": 2.7384151280690483e-05,
"loss": 2.9775,
"step": 81500
},
{
"epoch": 1.3704353639174396,
"grad_norm": 11.3803071975708,
"learning_rate": 2.724449037736376e-05,
"loss": 2.9458,
"step": 82000
},
{
"epoch": 1.3704353639174396,
"eval_loss": 3.278106689453125,
"eval_runtime": 27.5893,
"eval_samples_per_second": 161.73,
"eval_steps_per_second": 20.225,
"step": 82000
},
{
"epoch": 1.378791677112058,
"grad_norm": 16.39805030822754,
"learning_rate": 2.710482947403704e-05,
"loss": 3.0504,
"step": 82500
},
{
"epoch": 1.3871479903066768,
"grad_norm": 13.994576454162598,
"learning_rate": 2.6965168570710315e-05,
"loss": 2.9656,
"step": 83000
},
{
"epoch": 1.3871479903066768,
"eval_loss": 3.278665781021118,
"eval_runtime": 27.5347,
"eval_samples_per_second": 162.05,
"eval_steps_per_second": 20.265,
"step": 83000
},
{
"epoch": 1.3955043035012953,
"grad_norm": 11.802352905273438,
"learning_rate": 2.6825507667383593e-05,
"loss": 2.9786,
"step": 83500
},
{
"epoch": 1.4038606166959138,
"grad_norm": 13.618844985961914,
"learning_rate": 2.668584676405687e-05,
"loss": 3.0007,
"step": 84000
},
{
"epoch": 1.4038606166959138,
"eval_loss": 3.2725257873535156,
"eval_runtime": 27.5621,
"eval_samples_per_second": 161.889,
"eval_steps_per_second": 20.245,
"step": 84000
},
{
"epoch": 1.4122169298905323,
"grad_norm": 9.817100524902344,
"learning_rate": 2.654618586073015e-05,
"loss": 2.9268,
"step": 84500
},
{
"epoch": 1.4205732430851508,
"grad_norm": 16.49465560913086,
"learning_rate": 2.640652495740343e-05,
"loss": 2.984,
"step": 85000
},
{
"epoch": 1.4205732430851508,
"eval_loss": 3.278170108795166,
"eval_runtime": 27.5927,
"eval_samples_per_second": 161.71,
"eval_steps_per_second": 20.223,
"step": 85000
},
{
"epoch": 1.4289295562797695,
"grad_norm": 17.29984474182129,
"learning_rate": 2.62668640540767e-05,
"loss": 2.9955,
"step": 85500
},
{
"epoch": 1.437285869474388,
"grad_norm": 12.310997009277344,
"learning_rate": 2.612720315074998e-05,
"loss": 2.9769,
"step": 86000
},
{
"epoch": 1.437285869474388,
"eval_loss": 3.2687652111053467,
"eval_runtime": 27.5431,
"eval_samples_per_second": 162.0,
"eval_steps_per_second": 20.259,
"step": 86000
},
{
"epoch": 1.4456421826690065,
"grad_norm": 14.744447708129883,
"learning_rate": 2.5987542247423257e-05,
"loss": 2.966,
"step": 86500
},
{
"epoch": 1.453998495863625,
"grad_norm": 10.83408260345459,
"learning_rate": 2.5847881344096535e-05,
"loss": 2.9281,
"step": 87000
},
{
"epoch": 1.453998495863625,
"eval_loss": 3.260927677154541,
"eval_runtime": 27.5537,
"eval_samples_per_second": 161.938,
"eval_steps_per_second": 20.251,
"step": 87000
},
{
"epoch": 1.4623548090582434,
"grad_norm": 14.912446975708008,
"learning_rate": 2.5708220440769813e-05,
"loss": 2.964,
"step": 87500
},
{
"epoch": 1.4707111222528622,
"grad_norm": 16.433135986328125,
"learning_rate": 2.5568838859249743e-05,
"loss": 2.9903,
"step": 88000
},
{
"epoch": 1.4707111222528622,
"eval_loss": 3.2638683319091797,
"eval_runtime": 27.5854,
"eval_samples_per_second": 161.752,
"eval_steps_per_second": 20.228,
"step": 88000
},
{
"epoch": 1.4790674354474806,
"grad_norm": 10.865525245666504,
"learning_rate": 2.542917795592302e-05,
"loss": 2.9782,
"step": 88500
},
{
"epoch": 1.4874237486420991,
"grad_norm": 18.059494018554688,
"learning_rate": 2.5289517052596296e-05,
"loss": 2.9746,
"step": 89000
},
{
"epoch": 1.4874237486420991,
"eval_loss": 3.2576780319213867,
"eval_runtime": 27.6301,
"eval_samples_per_second": 161.491,
"eval_steps_per_second": 20.195,
"step": 89000
},
{
"epoch": 1.4957800618367176,
"grad_norm": 14.338726997375488,
"learning_rate": 2.5149856149269575e-05,
"loss": 2.9746,
"step": 89500
},
{
"epoch": 1.5041363750313361,
"grad_norm": 16.35688018798828,
"learning_rate": 2.5010195245942853e-05,
"loss": 2.9235,
"step": 90000
},
{
"epoch": 1.5041363750313361,
"eval_loss": 3.2603578567504883,
"eval_runtime": 27.5275,
"eval_samples_per_second": 162.093,
"eval_steps_per_second": 20.271,
"step": 90000
},
{
"epoch": 1.5124926882259548,
"grad_norm": 19.649658203125,
"learning_rate": 2.4870534342616128e-05,
"loss": 2.892,
"step": 90500
},
{
"epoch": 1.520849001420573,
"grad_norm": 22.463607788085938,
"learning_rate": 2.4730873439289406e-05,
"loss": 2.9464,
"step": 91000
},
{
"epoch": 1.520849001420573,
"eval_loss": 3.255012273788452,
"eval_runtime": 27.5843,
"eval_samples_per_second": 161.759,
"eval_steps_per_second": 20.229,
"step": 91000
},
{
"epoch": 1.5292053146151918,
"grad_norm": 11.892714500427246,
"learning_rate": 2.4591212535962685e-05,
"loss": 2.9404,
"step": 91500
},
{
"epoch": 1.5375616278098103,
"grad_norm": 13.547897338867188,
"learning_rate": 2.445155163263596e-05,
"loss": 2.9935,
"step": 92000
},
{
"epoch": 1.5375616278098103,
"eval_loss": 3.2467143535614014,
"eval_runtime": 27.5751,
"eval_samples_per_second": 161.813,
"eval_steps_per_second": 20.236,
"step": 92000
},
{
"epoch": 1.5459179410044288,
"grad_norm": 15.99018383026123,
"learning_rate": 2.4311890729309238e-05,
"loss": 2.9983,
"step": 92500
},
{
"epoch": 1.5542742541990475,
"grad_norm": 10.513391494750977,
"learning_rate": 2.4172229825982516e-05,
"loss": 2.979,
"step": 93000
},
{
"epoch": 1.5542742541990475,
"eval_loss": 3.2534940242767334,
"eval_runtime": 27.6008,
"eval_samples_per_second": 161.662,
"eval_steps_per_second": 20.217,
"step": 93000
},
{
"epoch": 1.5626305673936658,
"grad_norm": 14.598124504089355,
"learning_rate": 2.4032568922655795e-05,
"loss": 2.9401,
"step": 93500
},
{
"epoch": 1.5709868805882845,
"grad_norm": 11.219178199768066,
"learning_rate": 2.3893187341135724e-05,
"loss": 2.9333,
"step": 94000
},
{
"epoch": 1.5709868805882845,
"eval_loss": 3.2531471252441406,
"eval_runtime": 27.5659,
"eval_samples_per_second": 161.867,
"eval_steps_per_second": 20.242,
"step": 94000
},
{
"epoch": 1.579343193782903,
"grad_norm": 13.708407402038574,
"learning_rate": 2.3753805759615654e-05,
"loss": 2.9284,
"step": 94500
},
{
"epoch": 1.5876995069775215,
"grad_norm": 15.64401912689209,
"learning_rate": 2.3614144856288932e-05,
"loss": 2.9355,
"step": 95000
},
{
"epoch": 1.5876995069775215,
"eval_loss": 3.247119665145874,
"eval_runtime": 27.5812,
"eval_samples_per_second": 161.777,
"eval_steps_per_second": 20.231,
"step": 95000
},
{
"epoch": 1.5960558201721402,
"grad_norm": 12.710307121276855,
"learning_rate": 2.347448395296221e-05,
"loss": 2.9451,
"step": 95500
},
{
"epoch": 1.6044121333667585,
"grad_norm": 12.77171516418457,
"learning_rate": 2.333482304963549e-05,
"loss": 2.904,
"step": 96000
},
{
"epoch": 1.6044121333667585,
"eval_loss": 3.2489845752716064,
"eval_runtime": 27.6157,
"eval_samples_per_second": 161.575,
"eval_steps_per_second": 20.206,
"step": 96000
},
{
"epoch": 1.6127684465613772,
"grad_norm": 12.342710494995117,
"learning_rate": 2.3195162146308764e-05,
"loss": 2.9116,
"step": 96500
},
{
"epoch": 1.6211247597559957,
"grad_norm": 12.343132019042969,
"learning_rate": 2.3055501242982042e-05,
"loss": 2.9464,
"step": 97000
},
{
"epoch": 1.6211247597559957,
"eval_loss": 3.2335522174835205,
"eval_runtime": 27.5638,
"eval_samples_per_second": 161.879,
"eval_steps_per_second": 20.244,
"step": 97000
},
{
"epoch": 1.6294810729506142,
"grad_norm": 14.988670349121094,
"learning_rate": 2.291611966146197e-05,
"loss": 2.9173,
"step": 97500
},
{
"epoch": 1.6378373861452329,
"grad_norm": 12.14406967163086,
"learning_rate": 2.277645875813525e-05,
"loss": 2.917,
"step": 98000
},
{
"epoch": 1.6378373861452329,
"eval_loss": 3.240186929702759,
"eval_runtime": 27.7378,
"eval_samples_per_second": 160.864,
"eval_steps_per_second": 20.117,
"step": 98000
},
{
"epoch": 1.6461936993398512,
"grad_norm": 13.880926132202148,
"learning_rate": 2.2636797854808525e-05,
"loss": 2.9146,
"step": 98500
},
{
"epoch": 1.6545500125344699,
"grad_norm": 7.802238941192627,
"learning_rate": 2.2497136951481803e-05,
"loss": 2.9218,
"step": 99000
},
{
"epoch": 1.6545500125344699,
"eval_loss": 3.2389557361602783,
"eval_runtime": 27.6304,
"eval_samples_per_second": 161.489,
"eval_steps_per_second": 20.195,
"step": 99000
},
{
"epoch": 1.6629063257290884,
"grad_norm": 18.44457244873047,
"learning_rate": 2.235747604815508e-05,
"loss": 2.9043,
"step": 99500
},
{
"epoch": 1.6712626389237069,
"grad_norm": 10.393033027648926,
"learning_rate": 2.2217815144828357e-05,
"loss": 2.9677,
"step": 100000
},
{
"epoch": 1.6712626389237069,
"eval_loss": 3.2266006469726562,
"eval_runtime": 27.5938,
"eval_samples_per_second": 161.703,
"eval_steps_per_second": 20.222,
"step": 100000
},
{
"epoch": 1.6796189521183253,
"grad_norm": 7.137568473815918,
"learning_rate": 2.2078154241501635e-05,
"loss": 2.9498,
"step": 100500
},
{
"epoch": 1.6879752653129438,
"grad_norm": 9.725958824157715,
"learning_rate": 2.1938772659981565e-05,
"loss": 2.8844,
"step": 101000
},
{
"epoch": 1.6879752653129438,
"eval_loss": 3.223768949508667,
"eval_runtime": 27.5732,
"eval_samples_per_second": 161.824,
"eval_steps_per_second": 20.237,
"step": 101000
},
{
"epoch": 1.6963315785075626,
"grad_norm": 15.254230499267578,
"learning_rate": 2.1799111756654843e-05,
"loss": 2.8841,
"step": 101500
},
{
"epoch": 1.704687891702181,
"grad_norm": 20.192659378051758,
"learning_rate": 2.165945085332812e-05,
"loss": 2.9283,
"step": 102000
},
{
"epoch": 1.704687891702181,
"eval_loss": 3.2226974964141846,
"eval_runtime": 27.58,
"eval_samples_per_second": 161.784,
"eval_steps_per_second": 20.232,
"step": 102000
},
{
"epoch": 1.7130442048967995,
"grad_norm": 14.292362213134766,
"learning_rate": 2.15197899500014e-05,
"loss": 2.9358,
"step": 102500
},
{
"epoch": 1.721400518091418,
"grad_norm": 9.396713256835938,
"learning_rate": 2.1380129046674675e-05,
"loss": 2.9472,
"step": 103000
},
{
"epoch": 1.721400518091418,
"eval_loss": 3.224209785461426,
"eval_runtime": 27.6671,
"eval_samples_per_second": 161.274,
"eval_steps_per_second": 20.168,
"step": 103000
},
{
"epoch": 1.7297568312860365,
"grad_norm": 10.828228950500488,
"learning_rate": 2.1240468143347953e-05,
"loss": 2.9152,
"step": 103500
},
{
"epoch": 1.7381131444806552,
"grad_norm": 13.493616104125977,
"learning_rate": 2.1100807240021228e-05,
"loss": 2.9518,
"step": 104000
},
{
"epoch": 1.7381131444806552,
"eval_loss": 3.2257561683654785,
"eval_runtime": 27.5631,
"eval_samples_per_second": 161.883,
"eval_steps_per_second": 20.244,
"step": 104000
},
{
"epoch": 1.7464694576752735,
"grad_norm": 11.142574310302734,
"learning_rate": 2.0961146336694506e-05,
"loss": 2.9459,
"step": 104500
},
{
"epoch": 1.7548257708698922,
"grad_norm": 10.669454574584961,
"learning_rate": 2.082176475517444e-05,
"loss": 2.9545,
"step": 105000
},
{
"epoch": 1.7548257708698922,
"eval_loss": 3.2120673656463623,
"eval_runtime": 27.5868,
"eval_samples_per_second": 161.744,
"eval_steps_per_second": 20.227,
"step": 105000
},
{
"epoch": 1.7631820840645107,
"grad_norm": 10.605733871459961,
"learning_rate": 2.0682103851847714e-05,
"loss": 2.9228,
"step": 105500
},
{
"epoch": 1.7715383972591292,
"grad_norm": 13.702558517456055,
"learning_rate": 2.0542442948520993e-05,
"loss": 2.9137,
"step": 106000
},
{
"epoch": 1.7715383972591292,
"eval_loss": 3.218060255050659,
"eval_runtime": 27.5869,
"eval_samples_per_second": 161.744,
"eval_steps_per_second": 20.227,
"step": 106000
},
{
"epoch": 1.779894710453748,
"grad_norm": 18.355859756469727,
"learning_rate": 2.0402782045194268e-05,
"loss": 2.885,
"step": 106500
},
{
"epoch": 1.7882510236483662,
"grad_norm": 12.07524299621582,
"learning_rate": 2.0263679785480855e-05,
"loss": 2.9016,
"step": 107000
},
{
"epoch": 1.7882510236483662,
"eval_loss": 3.2088520526885986,
"eval_runtime": 27.6024,
"eval_samples_per_second": 161.653,
"eval_steps_per_second": 20.216,
"step": 107000
},
{
"epoch": 1.796607336842985,
"grad_norm": 11.443526268005371,
"learning_rate": 2.0124298203960785e-05,
"loss": 2.942,
"step": 107500
},
{
"epoch": 1.8049636500376034,
"grad_norm": 8.035077095031738,
"learning_rate": 1.9984637300634063e-05,
"loss": 2.9247,
"step": 108000
},
{
"epoch": 1.8049636500376034,
"eval_loss": 3.211854934692383,
"eval_runtime": 27.5517,
"eval_samples_per_second": 161.95,
"eval_steps_per_second": 20.253,
"step": 108000
},
{
"epoch": 1.8133199632322219,
"grad_norm": 12.948112487792969,
"learning_rate": 1.9844976397307338e-05,
"loss": 2.9112,
"step": 108500
},
{
"epoch": 1.8216762764268406,
"grad_norm": 15.308154106140137,
"learning_rate": 1.9705315493980616e-05,
"loss": 2.9185,
"step": 109000
},
{
"epoch": 1.8216762764268406,
"eval_loss": 3.1997740268707275,
"eval_runtime": 27.6334,
"eval_samples_per_second": 161.471,
"eval_steps_per_second": 20.193,
"step": 109000
},
{
"epoch": 1.8300325896214589,
"grad_norm": 10.738883018493652,
"learning_rate": 1.956565459065389e-05,
"loss": 2.8789,
"step": 109500
},
{
"epoch": 1.8383889028160776,
"grad_norm": 13.379829406738281,
"learning_rate": 1.942599368732717e-05,
"loss": 2.9005,
"step": 110000
},
{
"epoch": 1.8383889028160776,
"eval_loss": 3.202975273132324,
"eval_runtime": 27.6255,
"eval_samples_per_second": 161.517,
"eval_steps_per_second": 20.199,
"step": 110000
},
{
"epoch": 1.846745216010696,
"grad_norm": 9.87146282196045,
"learning_rate": 1.9286332784000448e-05,
"loss": 2.8856,
"step": 110500
},
{
"epoch": 1.8551015292053146,
"grad_norm": 12.577339172363281,
"learning_rate": 1.9146671880673726e-05,
"loss": 2.9502,
"step": 111000
},
{
"epoch": 1.8551015292053146,
"eval_loss": 3.1968445777893066,
"eval_runtime": 27.5448,
"eval_samples_per_second": 161.991,
"eval_steps_per_second": 20.258,
"step": 111000
},
{
"epoch": 1.8634578423999333,
"grad_norm": 12.57132625579834,
"learning_rate": 1.9007010977347005e-05,
"loss": 2.8951,
"step": 111500
},
{
"epoch": 1.8718141555945516,
"grad_norm": 14.708492279052734,
"learning_rate": 1.886735007402028e-05,
"loss": 2.9093,
"step": 112000
},
{
"epoch": 1.8718141555945516,
"eval_loss": 3.1939940452575684,
"eval_runtime": 27.5701,
"eval_samples_per_second": 161.842,
"eval_steps_per_second": 20.239,
"step": 112000
},
{
"epoch": 1.8801704687891703,
"grad_norm": 12.688665390014648,
"learning_rate": 1.8727689170693558e-05,
"loss": 2.885,
"step": 112500
},
{
"epoch": 1.8885267819837888,
"grad_norm": 11.511554718017578,
"learning_rate": 1.8588028267366833e-05,
"loss": 2.8351,
"step": 113000
},
{
"epoch": 1.8885267819837888,
"eval_loss": 3.1979987621307373,
"eval_runtime": 27.6058,
"eval_samples_per_second": 161.633,
"eval_steps_per_second": 20.213,
"step": 113000
},
{
"epoch": 1.8968830951784073,
"grad_norm": 7.7706708908081055,
"learning_rate": 1.844892600765342e-05,
"loss": 2.904,
"step": 113500
},
{
"epoch": 1.905239408373026,
"grad_norm": 12.276754379272461,
"learning_rate": 1.8309265104326695e-05,
"loss": 2.8785,
"step": 114000
},
{
"epoch": 1.905239408373026,
"eval_loss": 3.20162296295166,
"eval_runtime": 27.5895,
"eval_samples_per_second": 161.728,
"eval_steps_per_second": 20.225,
"step": 114000
},
{
"epoch": 1.9135957215676442,
"grad_norm": 11.900626182556152,
"learning_rate": 1.8169604200999974e-05,
"loss": 2.8922,
"step": 114500
},
{
"epoch": 1.921952034762263,
"grad_norm": 16.15927505493164,
"learning_rate": 1.802994329767325e-05,
"loss": 2.8341,
"step": 115000
},
{
"epoch": 1.921952034762263,
"eval_loss": 3.192532539367676,
"eval_runtime": 27.5532,
"eval_samples_per_second": 161.941,
"eval_steps_per_second": 20.252,
"step": 115000
},
{
"epoch": 1.9303083479568814,
"grad_norm": 7.972958087921143,
"learning_rate": 1.7890282394346527e-05,
"loss": 2.8838,
"step": 115500
},
{
"epoch": 1.9386646611515,
"grad_norm": 10.199915885925293,
"learning_rate": 1.775118013463311e-05,
"loss": 2.8599,
"step": 116000
},
{
"epoch": 1.9386646611515,
"eval_loss": 3.185673475265503,
"eval_runtime": 27.8833,
"eval_samples_per_second": 160.024,
"eval_steps_per_second": 20.012,
"step": 116000
},
{
"epoch": 1.9470209743461186,
"grad_norm": 12.25405216217041,
"learning_rate": 1.761151923130639e-05,
"loss": 2.8582,
"step": 116500
},
{
"epoch": 1.955377287540737,
"grad_norm": 11.32104778289795,
"learning_rate": 1.7471858327979664e-05,
"loss": 2.9085,
"step": 117000
},
{
"epoch": 1.955377287540737,
"eval_loss": 3.1832330226898193,
"eval_runtime": 27.611,
"eval_samples_per_second": 161.602,
"eval_steps_per_second": 20.209,
"step": 117000
},
{
"epoch": 1.9637336007353556,
"grad_norm": 12.988907814025879,
"learning_rate": 1.7332197424652943e-05,
"loss": 2.8994,
"step": 117500
},
{
"epoch": 1.9720899139299741,
"grad_norm": 13.372990608215332,
"learning_rate": 1.719253652132622e-05,
"loss": 2.8882,
"step": 118000
},
{
"epoch": 1.9720899139299741,
"eval_loss": 3.1832265853881836,
"eval_runtime": 27.5792,
"eval_samples_per_second": 161.789,
"eval_steps_per_second": 20.233,
"step": 118000
},
{
"epoch": 1.9804462271245926,
"grad_norm": 13.150843620300293,
"learning_rate": 1.7052875617999496e-05,
"loss": 2.8809,
"step": 118500
},
{
"epoch": 1.988802540319211,
"grad_norm": 11.465882301330566,
"learning_rate": 1.6913214714672774e-05,
"loss": 2.8083,
"step": 119000
},
{
"epoch": 1.988802540319211,
"eval_loss": 3.1877200603485107,
"eval_runtime": 27.5585,
"eval_samples_per_second": 161.91,
"eval_steps_per_second": 20.248,
"step": 119000
},
{
"epoch": 1.9971588535138296,
"grad_norm": 15.985895156860352,
"learning_rate": 1.6773553811346053e-05,
"loss": 2.8512,
"step": 119500
},
{
"epoch": 2.0055151667084483,
"grad_norm": 20.66619110107422,
"learning_rate": 1.663389290801933e-05,
"loss": 2.6213,
"step": 120000
},
{
"epoch": 2.0055151667084483,
"eval_loss": 3.257246494293213,
"eval_runtime": 27.6181,
"eval_samples_per_second": 161.561,
"eval_steps_per_second": 20.204,
"step": 120000
},
{
"epoch": 2.0138714799030666,
"grad_norm": 19.570964813232422,
"learning_rate": 1.649451132649926e-05,
"loss": 2.451,
"step": 120500
},
{
"epoch": 2.0222277930976853,
"grad_norm": 17.980260848999023,
"learning_rate": 1.635485042317254e-05,
"loss": 2.4327,
"step": 121000
},
{
"epoch": 2.0222277930976853,
"eval_loss": 3.2763614654541016,
"eval_runtime": 27.6142,
"eval_samples_per_second": 161.584,
"eval_steps_per_second": 20.207,
"step": 121000
},
{
"epoch": 2.030584106292304,
"grad_norm": 12.097052574157715,
"learning_rate": 1.621546884165247e-05,
"loss": 2.4341,
"step": 121500
},
{
"epoch": 2.0389404194869223,
"grad_norm": 13.760506629943848,
"learning_rate": 1.6075807938325747e-05,
"loss": 2.4713,
"step": 122000
},
{
"epoch": 2.0389404194869223,
"eval_loss": 3.28802490234375,
"eval_runtime": 27.5322,
"eval_samples_per_second": 162.065,
"eval_steps_per_second": 20.267,
"step": 122000
},
{
"epoch": 2.047296732681541,
"grad_norm": 13.664862632751465,
"learning_rate": 1.5936147034999025e-05,
"loss": 2.4953,
"step": 122500
},
{
"epoch": 2.0556530458761593,
"grad_norm": 14.178253173828125,
"learning_rate": 1.57964861316723e-05,
"loss": 2.4641,
"step": 123000
},
{
"epoch": 2.0556530458761593,
"eval_loss": 3.2855935096740723,
"eval_runtime": 27.617,
"eval_samples_per_second": 161.567,
"eval_steps_per_second": 20.205,
"step": 123000
},
{
"epoch": 2.064009359070778,
"grad_norm": 19.99590492248535,
"learning_rate": 1.565682522834558e-05,
"loss": 2.485,
"step": 123500
},
{
"epoch": 2.0723656722653967,
"grad_norm": 17.964866638183594,
"learning_rate": 1.5517164325018854e-05,
"loss": 2.4679,
"step": 124000
},
{
"epoch": 2.0723656722653967,
"eval_loss": 3.2831804752349854,
"eval_runtime": 27.6195,
"eval_samples_per_second": 161.553,
"eval_steps_per_second": 20.203,
"step": 124000
},
{
"epoch": 2.080721985460015,
"grad_norm": 16.556684494018555,
"learning_rate": 1.5377503421692132e-05,
"loss": 2.4853,
"step": 124500
},
{
"epoch": 2.0890782986546337,
"grad_norm": 17.376474380493164,
"learning_rate": 1.5237842518365409e-05,
"loss": 2.4614,
"step": 125000
},
{
"epoch": 2.0890782986546337,
"eval_loss": 3.2957663536071777,
"eval_runtime": 27.5859,
"eval_samples_per_second": 161.749,
"eval_steps_per_second": 20.228,
"step": 125000
},
{
"epoch": 2.097434611849252,
"grad_norm": 16.47422218322754,
"learning_rate": 1.5098181615038687e-05,
"loss": 2.5273,
"step": 125500
},
{
"epoch": 2.1057909250438707,
"grad_norm": 20.20784568786621,
"learning_rate": 1.4958800033518617e-05,
"loss": 2.4934,
"step": 126000
},
{
"epoch": 2.1057909250438707,
"eval_loss": 3.2978439331054688,
"eval_runtime": 27.5756,
"eval_samples_per_second": 161.81,
"eval_steps_per_second": 20.235,
"step": 126000
},
{
"epoch": 2.1141472382384894,
"grad_norm": 22.081459045410156,
"learning_rate": 1.4819418451998548e-05,
"loss": 2.4496,
"step": 126500
},
{
"epoch": 2.1225035514331076,
"grad_norm": 14.315736770629883,
"learning_rate": 1.4679757548671824e-05,
"loss": 2.4892,
"step": 127000
},
{
"epoch": 2.1225035514331076,
"eval_loss": 3.2925477027893066,
"eval_runtime": 27.6072,
"eval_samples_per_second": 161.624,
"eval_steps_per_second": 20.212,
"step": 127000
},
{
"epoch": 2.1308598646277264,
"grad_norm": 19.269027709960938,
"learning_rate": 1.4540096645345103e-05,
"loss": 2.4366,
"step": 127500
},
{
"epoch": 2.1392161778223446,
"grad_norm": 13.193103790283203,
"learning_rate": 1.4400435742018381e-05,
"loss": 2.4774,
"step": 128000
},
{
"epoch": 2.1392161778223446,
"eval_loss": 3.2914838790893555,
"eval_runtime": 27.5768,
"eval_samples_per_second": 161.803,
"eval_steps_per_second": 20.234,
"step": 128000
},
{
"epoch": 2.1475724910169633,
"grad_norm": 20.42424201965332,
"learning_rate": 1.4260774838691656e-05,
"loss": 2.4849,
"step": 128500
},
{
"epoch": 2.155928804211582,
"grad_norm": 14.249687194824219,
"learning_rate": 1.4121113935364934e-05,
"loss": 2.4462,
"step": 129000
},
{
"epoch": 2.155928804211582,
"eval_loss": 3.2914915084838867,
"eval_runtime": 27.5308,
"eval_samples_per_second": 162.073,
"eval_steps_per_second": 20.268,
"step": 129000
},
{
"epoch": 2.1642851174062003,
"grad_norm": 28.348491668701172,
"learning_rate": 1.3981453032038211e-05,
"loss": 2.49,
"step": 129500
},
{
"epoch": 2.172641430600819,
"grad_norm": 17.39087677001953,
"learning_rate": 1.384179212871149e-05,
"loss": 2.4542,
"step": 130000
},
{
"epoch": 2.172641430600819,
"eval_loss": 3.2924540042877197,
"eval_runtime": 27.5959,
"eval_samples_per_second": 161.691,
"eval_steps_per_second": 20.22,
"step": 130000
},
{
"epoch": 2.1809977437954373,
"grad_norm": 16.53498649597168,
"learning_rate": 1.3702131225384768e-05,
"loss": 2.4621,
"step": 130500
},
{
"epoch": 2.189354056990056,
"grad_norm": 16.412384033203125,
"learning_rate": 1.3562749643864697e-05,
"loss": 2.4613,
"step": 131000
},
{
"epoch": 2.189354056990056,
"eval_loss": 3.295562744140625,
"eval_runtime": 27.578,
"eval_samples_per_second": 161.796,
"eval_steps_per_second": 20.234,
"step": 131000
},
{
"epoch": 2.1977103701846747,
"grad_norm": 15.538711547851562,
"learning_rate": 1.3423368062344627e-05,
"loss": 2.4549,
"step": 131500
},
{
"epoch": 2.206066683379293,
"grad_norm": 26.194726943969727,
"learning_rate": 1.328398648082456e-05,
"loss": 2.4887,
"step": 132000
},
{
"epoch": 2.206066683379293,
"eval_loss": 3.28711199760437,
"eval_runtime": 27.5596,
"eval_samples_per_second": 161.904,
"eval_steps_per_second": 20.247,
"step": 132000
},
{
"epoch": 2.2144229965739117,
"grad_norm": 17.261341094970703,
"learning_rate": 1.3144325577497835e-05,
"loss": 2.4712,
"step": 132500
},
{
"epoch": 2.22277930976853,
"grad_norm": 13.89609146118164,
"learning_rate": 1.3004664674171113e-05,
"loss": 2.505,
"step": 133000
},
{
"epoch": 2.22277930976853,
"eval_loss": 3.281574249267578,
"eval_runtime": 27.554,
"eval_samples_per_second": 161.936,
"eval_steps_per_second": 20.251,
"step": 133000
},
{
"epoch": 2.2311356229631487,
"grad_norm": 20.80926513671875,
"learning_rate": 1.2865003770844391e-05,
"loss": 2.4657,
"step": 133500
},
{
"epoch": 2.2394919361577674,
"grad_norm": 11.168779373168945,
"learning_rate": 1.2725622189324321e-05,
"loss": 2.4894,
"step": 134000
},
{
"epoch": 2.2394919361577674,
"eval_loss": 3.277146100997925,
"eval_runtime": 27.5992,
"eval_samples_per_second": 161.671,
"eval_steps_per_second": 20.218,
"step": 134000
},
{
"epoch": 2.2478482493523857,
"grad_norm": 16.13945198059082,
"learning_rate": 1.25859612859976e-05,
"loss": 2.473,
"step": 134500
},
{
"epoch": 2.2562045625470044,
"grad_norm": 15.301444053649902,
"learning_rate": 1.2446300382670876e-05,
"loss": 2.4731,
"step": 135000
},
{
"epoch": 2.2562045625470044,
"eval_loss": 3.277038812637329,
"eval_runtime": 27.7191,
"eval_samples_per_second": 160.972,
"eval_steps_per_second": 20.131,
"step": 135000
},
{
"epoch": 2.2645608757416227,
"grad_norm": 14.259910583496094,
"learning_rate": 1.2306639479344154e-05,
"loss": 2.4209,
"step": 135500
},
{
"epoch": 2.2729171889362414,
"grad_norm": 11.308433532714844,
"learning_rate": 1.2166978576017431e-05,
"loss": 2.469,
"step": 136000
},
{
"epoch": 2.2729171889362414,
"eval_loss": 3.272756814956665,
"eval_runtime": 27.5588,
"eval_samples_per_second": 161.908,
"eval_steps_per_second": 20.248,
"step": 136000
},
{
"epoch": 2.28127350213086,
"grad_norm": 11.12240219116211,
"learning_rate": 1.2027317672690708e-05,
"loss": 2.4856,
"step": 136500
},
{
"epoch": 2.2896298153254784,
"grad_norm": 20.829790115356445,
"learning_rate": 1.1887656769363984e-05,
"loss": 2.4522,
"step": 137000
},
{
"epoch": 2.2896298153254784,
"eval_loss": 3.2834246158599854,
"eval_runtime": 27.6412,
"eval_samples_per_second": 161.426,
"eval_steps_per_second": 20.187,
"step": 137000
},
{
"epoch": 2.297986128520097,
"grad_norm": 16.781526565551758,
"learning_rate": 1.1747995866037263e-05,
"loss": 2.4827,
"step": 137500
},
{
"epoch": 2.3063424417147154,
"grad_norm": 14.312143325805664,
"learning_rate": 1.160833496271054e-05,
"loss": 2.4823,
"step": 138000
},
{
"epoch": 2.3063424417147154,
"eval_loss": 3.2799878120422363,
"eval_runtime": 27.6358,
"eval_samples_per_second": 161.457,
"eval_steps_per_second": 20.191,
"step": 138000
},
{
"epoch": 2.314698754909334,
"grad_norm": 16.167057037353516,
"learning_rate": 1.1468674059383816e-05,
"loss": 2.4814,
"step": 138500
},
{
"epoch": 2.3230550681039523,
"grad_norm": 13.810924530029297,
"learning_rate": 1.1329013156057094e-05,
"loss": 2.4825,
"step": 139000
},
{
"epoch": 2.3230550681039523,
"eval_loss": 3.278149127960205,
"eval_runtime": 27.6409,
"eval_samples_per_second": 161.428,
"eval_steps_per_second": 20.187,
"step": 139000
},
{
"epoch": 2.331411381298571,
"grad_norm": 15.337100982666016,
"learning_rate": 1.1189352252730371e-05,
"loss": 2.5083,
"step": 139500
},
{
"epoch": 2.3397676944931898,
"grad_norm": 19.78384017944336,
"learning_rate": 1.1049691349403648e-05,
"loss": 2.4427,
"step": 140000
},
{
"epoch": 2.3397676944931898,
"eval_loss": 3.276196002960205,
"eval_runtime": 27.5881,
"eval_samples_per_second": 161.737,
"eval_steps_per_second": 20.226,
"step": 140000
}
],
"logging_steps": 500,
"max_steps": 179505,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.977262699675648e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}