{
  "best_metric": 0.3866276741027832,
  "best_model_checkpoint": "./output/checkpoint-3000",
  "epoch": 3.8537549407114624,
  "eval_steps": 150,
  "global_step": 3900,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.009881422924901186, "grad_norm": 2.4057295322418213, "learning_rate": 9.999999999999999e-06, "loss": 0.527, "step": 10 },
    { "epoch": 0.019762845849802372, "grad_norm": 2.7988038063049316, "learning_rate": 1.9999999999999998e-05, "loss": 0.4789, "step": 20 },
    { "epoch": 0.029644268774703556, "grad_norm": 3.1874520778656006, "learning_rate": 2.999999999999999e-05, "loss": 0.5495, "step": 30 },
    { "epoch": 0.039525691699604744, "grad_norm": 3.206881046295166, "learning_rate": 3.9999999999999996e-05, "loss": 0.5418, "step": 40 },
    { "epoch": 0.04940711462450593, "grad_norm": 2.863534927368164, "learning_rate": 4.999999999999999e-05, "loss": 0.5065, "step": 50 },
    { "epoch": 0.05928853754940711, "grad_norm": 2.8917012214660645, "learning_rate": 5.999999999999998e-05, "loss": 0.5262, "step": 60 },
    { "epoch": 0.0691699604743083, "grad_norm": 2.8896090984344482, "learning_rate": 6.999999999999998e-05, "loss": 0.5294, "step": 70 },
    { "epoch": 0.07905138339920949, "grad_norm": 3.138671875, "learning_rate": 7.999999999999999e-05, "loss": 0.4787, "step": 80 },
    { "epoch": 0.08893280632411067, "grad_norm": 2.513195753097534, "learning_rate": 8.999999999999998e-05, "loss": 0.4873, "step": 90 },
    { "epoch": 0.09881422924901186, "grad_norm": 2.762770414352417, "learning_rate": 9.999999999999998e-05, "loss": 0.5069, "step": 100 },
    { "epoch": 0.10869565217391304, "grad_norm": 2.255554437637329, "learning_rate": 9.999897234791827e-05, "loss": 0.4713, "step": 110 },
    { "epoch": 0.11857707509881422, "grad_norm": 2.8342161178588867, "learning_rate": 9.999588943391594e-05, "loss": 0.5146, "step": 120 },
    { "epoch": 0.12845849802371542, "grad_norm": 2.1564338207244873, "learning_rate": 9.999075138471948e-05, "loss": 0.4976, "step": 130 },
    { "epoch": 0.1383399209486166, "grad_norm": 2.606574773788452, "learning_rate": 9.998355841153397e-05, "loss": 0.5166, "step": 140 },
    { "epoch": 0.1482213438735178, "grad_norm": 2.285940647125244, "learning_rate": 9.997431081003437e-05, "loss": 0.5137, "step": 150 },
    { "epoch": 0.1482213438735178, "eval_loss": 0.5139818787574768, "eval_runtime": 35.3761, "eval_samples_per_second": 14.134, "eval_steps_per_second": 14.134, "step": 150 },
    { "epoch": 0.15810276679841898, "grad_norm": 2.5471882820129395, "learning_rate": 9.996300896035337e-05, "loss": 0.5392, "step": 160 },
    { "epoch": 0.16798418972332016, "grad_norm": 3.4715616703033447, "learning_rate": 9.994965332706571e-05, "loss": 0.517, "step": 170 },
    { "epoch": 0.17786561264822134, "grad_norm": 2.321061134338379, "learning_rate": 9.99342444591692e-05, "loss": 0.4852, "step": 180 },
    { "epoch": 0.18774703557312253, "grad_norm": 2.590848922729492, "learning_rate": 9.991678299006203e-05, "loss": 0.5101, "step": 190 },
    { "epoch": 0.1976284584980237, "grad_norm": 2.412264823913574, "learning_rate": 9.989726963751679e-05, "loss": 0.5099, "step": 200 },
    { "epoch": 0.2075098814229249, "grad_norm": 2.491581678390503, "learning_rate": 9.987570520365101e-05, "loss": 0.5057, "step": 210 },
    { "epoch": 0.21739130434782608, "grad_norm": 2.79880690574646, "learning_rate": 9.985209057489407e-05, "loss": 0.4955, "step": 220 },
    { "epoch": 0.22727272727272727, "grad_norm": 3.0157032012939453, "learning_rate": 9.98264267219509e-05, "loss": 0.4766, "step": 230 },
    { "epoch": 0.23715415019762845, "grad_norm": 2.266268491744995, "learning_rate": 9.979871469976193e-05, "loss": 0.4975, "step": 240 },
    { "epoch": 0.24703557312252963, "grad_norm": 2.4801745414733887, "learning_rate": 9.976895564745989e-05, "loss": 0.5068, "step": 250 },
    { "epoch": 0.25691699604743085, "grad_norm": 2.9422342777252197, "learning_rate": 9.973715078832285e-05, "loss": 0.4792, "step": 260 },
    { "epoch": 0.26679841897233203, "grad_norm": 2.364716053009033, "learning_rate": 9.970330142972399e-05, "loss": 0.475, "step": 270 },
    { "epoch": 0.2766798418972332, "grad_norm": 2.0679702758789062, "learning_rate": 9.966740896307789e-05, "loss": 0.4818, "step": 280 },
    { "epoch": 0.2865612648221344, "grad_norm": 2.8333821296691895, "learning_rate": 9.962947486378323e-05, "loss": 0.489, "step": 290 },
    { "epoch": 0.2964426877470356, "grad_norm": 2.5552380084991455, "learning_rate": 9.958950069116228e-05, "loss": 0.4921, "step": 300 },
    { "epoch": 0.2964426877470356, "eval_loss": 0.4803454577922821, "eval_runtime": 35.2266, "eval_samples_per_second": 14.194, "eval_steps_per_second": 14.194, "step": 300 },
    { "epoch": 0.30632411067193677, "grad_norm": 2.617976188659668, "learning_rate": 9.954748808839671e-05, "loss": 0.5139, "step": 310 },
    { "epoch": 0.31620553359683795, "grad_norm": 2.61501407623291, "learning_rate": 9.950343878246007e-05, "loss": 0.4642, "step": 320 },
    { "epoch": 0.32608695652173914, "grad_norm": 2.3816354274749756, "learning_rate": 9.945735458404678e-05, "loss": 0.522, "step": 330 },
    { "epoch": 0.3359683794466403, "grad_norm": 2.5774052143096924, "learning_rate": 9.940923738749776e-05, "loss": 0.505, "step": 340 },
    { "epoch": 0.3458498023715415, "grad_norm": 2.245945692062378, "learning_rate": 9.935908917072249e-05, "loss": 0.4801, "step": 350 },
    { "epoch": 0.3557312252964427, "grad_norm": 1.9169942140579224, "learning_rate": 9.930691199511772e-05, "loss": 0.4414, "step": 360 },
    { "epoch": 0.36561264822134387, "grad_norm": 2.1389174461364746, "learning_rate": 9.925270800548282e-05, "loss": 0.4659, "step": 370 },
    { "epoch": 0.37549407114624506, "grad_norm": 2.4838740825653076, "learning_rate": 9.919647942993145e-05, "loss": 0.4594, "step": 380 },
    { "epoch": 0.38537549407114624, "grad_norm": 2.629211187362671, "learning_rate": 9.913822857980017e-05, "loss": 0.492, "step": 390 },
    { "epoch": 0.3952569169960474, "grad_norm": 2.387241840362549, "learning_rate": 9.907795784955324e-05, "loss": 0.4556, "step": 400 },
    { "epoch": 0.4051383399209486, "grad_norm": 2.450533390045166, "learning_rate": 9.901566971668434e-05, "loss": 0.4653, "step": 410 },
    { "epoch": 0.4150197628458498, "grad_norm": 2.464107036590576, "learning_rate": 9.895136674161462e-05, "loss": 0.4955, "step": 420 },
    { "epoch": 0.424901185770751, "grad_norm": 2.31066632270813, "learning_rate": 9.888505156758756e-05, "loss": 0.4623, "step": 430 },
    { "epoch": 0.43478260869565216, "grad_norm": 2.158092975616455, "learning_rate": 9.881672692056019e-05, "loss": 0.4692, "step": 440 },
    { "epoch": 0.44466403162055335, "grad_norm": 1.851882815361023, "learning_rate": 9.874639560909115e-05, "loss": 0.487, "step": 450 },
    { "epoch": 0.44466403162055335, "eval_loss": 0.46247246861457825, "eval_runtime": 34.6043, "eval_samples_per_second": 14.449, "eval_steps_per_second": 14.449, "step": 450 },
    { "epoch": 0.45454545454545453, "grad_norm": 2.608232021331787, "learning_rate": 9.867406052422521e-05, "loss": 0.4742, "step": 460 },
    { "epoch": 0.4644268774703557, "grad_norm": 2.2287890911102295, "learning_rate": 9.859972463937438e-05, "loss": 0.4611, "step": 470 },
    { "epoch": 0.4743083003952569, "grad_norm": 2.5854885578155518, "learning_rate": 9.852339101019572e-05, "loss": 0.4576, "step": 480 },
    { "epoch": 0.4841897233201581, "grad_norm": 2.134244203567505, "learning_rate": 9.844506277446574e-05, "loss": 0.4824, "step": 490 },
    { "epoch": 0.49407114624505927, "grad_norm": 2.4558589458465576, "learning_rate": 9.836474315195144e-05, "loss": 0.4497, "step": 500 },
    { "epoch": 0.5039525691699605, "grad_norm": 2.9862284660339355, "learning_rate": 9.828243544427793e-05, "loss": 0.4632, "step": 510 },
    { "epoch": 0.5138339920948617, "grad_norm": 2.5558395385742188, "learning_rate": 9.819814303479264e-05, "loss": 0.4486, "step": 520 },
    { "epoch": 0.5237154150197628, "grad_norm": 1.770102620124817, "learning_rate": 9.811186938842643e-05, "loss": 0.4476, "step": 530 },
    { "epoch": 0.5335968379446641, "grad_norm": 3.525803565979004, "learning_rate": 9.802361805155094e-05, "loss": 0.4643, "step": 540 },
    { "epoch": 0.5434782608695652, "grad_norm": 2.2628469467163086, "learning_rate": 9.7933392651833e-05, "loss": 0.4731, "step": 550 },
    { "epoch": 0.5533596837944664, "grad_norm": 3.0324666500091553, "learning_rate": 9.784119689808542e-05, "loss": 0.4478, "step": 560 },
    { "epoch": 0.5632411067193676, "grad_norm": 2.263258218765259, "learning_rate": 9.77470345801145e-05, "loss": 0.4497, "step": 570 },
    { "epoch": 0.5731225296442688, "grad_norm": 2.645112991333008, "learning_rate": 9.765090956856434e-05, "loss": 0.4666, "step": 580 },
    { "epoch": 0.5830039525691699, "grad_norm": 2.0175938606262207, "learning_rate": 9.755282581475766e-05, "loss": 0.47, "step": 590 },
    { "epoch": 0.5928853754940712, "grad_norm": 2.175175666809082, "learning_rate": 9.74527873505334e-05, "loss": 0.4592, "step": 600 },
    { "epoch": 0.5928853754940712, "eval_loss": 0.44894906878471375, "eval_runtime": 34.7044, "eval_samples_per_second": 14.407, "eval_steps_per_second": 14.407, "step": 600 },
    { "epoch": 0.6027667984189723, "grad_norm": 2.4829459190368652, "learning_rate": 9.735079828808105e-05, "loss": 0.4453, "step": 610 },
    { "epoch": 0.6126482213438735, "grad_norm": 2.2683026790618896, "learning_rate": 9.724686281977144e-05, "loss": 0.4701, "step": 620 },
    { "epoch": 0.6225296442687747, "grad_norm": 2.6205978393554688, "learning_rate": 9.714098521798462e-05, "loss": 0.4581, "step": 630 },
    { "epoch": 0.6324110671936759, "grad_norm": 2.5154802799224854, "learning_rate": 9.703316983493411e-05, "loss": 0.4372, "step": 640 },
    { "epoch": 0.642292490118577, "grad_norm": 2.478700637817383, "learning_rate": 9.6923421102488e-05, "loss": 0.4554, "step": 650 },
    { "epoch": 0.6521739130434783, "grad_norm": 2.286890745162964, "learning_rate": 9.681174353198684e-05, "loss": 0.474, "step": 660 },
    { "epoch": 0.6620553359683794, "grad_norm": 2.0446231365203857, "learning_rate": 9.669814171405813e-05, "loss": 0.4347, "step": 670 },
    { "epoch": 0.6719367588932806, "grad_norm": 2.571877956390381, "learning_rate": 9.658262031842768e-05, "loss": 0.4355, "step": 680 },
    { "epoch": 0.6818181818181818, "grad_norm": 2.160186290740967, "learning_rate": 9.646518409372757e-05, "loss": 0.4354, "step": 690 },
    { "epoch": 0.691699604743083, "grad_norm": 2.0123462677001953, "learning_rate": 9.634583786730107e-05, "loss": 0.4619, "step": 700 },
    { "epoch": 0.7015810276679841, "grad_norm": 2.4318065643310547, "learning_rate": 9.622458654500406e-05, "loss": 0.4795, "step": 710 },
    { "epoch": 0.7114624505928854, "grad_norm": 2.1146297454833984, "learning_rate": 9.610143511100351e-05, "loss": 0.442, "step": 720 },
    { "epoch": 0.7213438735177866, "grad_norm": 2.312072992324829, "learning_rate": 9.597638862757252e-05, "loss": 0.4339, "step": 730 },
    { "epoch": 0.7312252964426877, "grad_norm": 2.0367119312286377, "learning_rate": 9.584945223488224e-05, "loss": 0.4519, "step": 740 },
    { "epoch": 0.741106719367589, "grad_norm": 2.8992955684661865, "learning_rate": 9.57206311507906e-05, "loss": 0.4431, "step": 750 },
    { "epoch": 0.741106719367589, "eval_loss": 0.4378005266189575, "eval_runtime": 34.0956, "eval_samples_per_second": 14.665, "eval_steps_per_second": 14.665, "step": 750 },
    { "epoch": 0.7509881422924901, "grad_norm": 2.3162925243377686, "learning_rate": 9.558993067062783e-05, "loss": 0.4426, "step": 760 },
    { "epoch": 0.7608695652173914, "grad_norm": 1.9839439392089844, "learning_rate": 9.545735616697873e-05, "loss": 0.46, "step": 770 },
    { "epoch": 0.7707509881422925, "grad_norm": 2.18251633644104, "learning_rate": 9.532291308946188e-05, "loss": 0.4254, "step": 780 },
    { "epoch": 0.7806324110671937, "grad_norm": 2.241259813308716, "learning_rate": 9.518660696450565e-05, "loss": 0.4246, "step": 790 },
    { "epoch": 0.7905138339920948, "grad_norm": 2.357609272003174, "learning_rate": 9.504844339512093e-05, "loss": 0.4497, "step": 800 },
    { "epoch": 0.8003952569169961, "grad_norm": 2.2541675567626953, "learning_rate": 9.490842806067093e-05, "loss": 0.4605, "step": 810 },
    { "epoch": 0.8102766798418972, "grad_norm": 2.1015920639038086, "learning_rate": 9.476656671663764e-05, "loss": 0.4227, "step": 820 },
    { "epoch": 0.8201581027667985, "grad_norm": 2.2886059284210205, "learning_rate": 9.462286519438528e-05, "loss": 0.4385, "step": 830 },
    { "epoch": 0.8300395256916996, "grad_norm": 2.2543296813964844, "learning_rate": 9.447732940092057e-05, "loss": 0.433, "step": 840 },
    { "epoch": 0.8399209486166008, "grad_norm": 2.2577757835388184, "learning_rate": 9.432996531864999e-05, "loss": 0.4634, "step": 850 },
    { "epoch": 0.849802371541502, "grad_norm": 1.795832872390747, "learning_rate": 9.418077900513374e-05, "loss": 0.4068, "step": 860 },
    { "epoch": 0.8596837944664032, "grad_norm": 1.9893933534622192, "learning_rate": 9.402977659283688e-05, "loss": 0.4527, "step": 870 },
    { "epoch": 0.8695652173913043, "grad_norm": 2.5203518867492676, "learning_rate": 9.387696428887713e-05, "loss": 0.4591, "step": 880 },
    { "epoch": 0.8794466403162056, "grad_norm": 2.42069673538208, "learning_rate": 9.372234837476975e-05, "loss": 0.4597, "step": 890 },
    { "epoch": 0.8893280632411067, "grad_norm": 2.087778329849243, "learning_rate": 9.356593520616945e-05, "loss": 0.4226, "step": 900 },
    { "epoch": 0.8893280632411067, "eval_loss": 0.42734819650650024, "eval_runtime": 34.3301, "eval_samples_per_second": 14.564, "eval_steps_per_second": 14.564, "step": 900 },
    { "epoch": 0.8992094861660079, "grad_norm": 2.4652795791625977, "learning_rate": 9.34077312126089e-05, "loss": 0.4261, "step": 910 },
    { "epoch": 0.9090909090909091, "grad_norm": 2.0327532291412354, "learning_rate": 9.324774289723465e-05, "loss": 0.4674, "step": 920 },
    { "epoch": 0.9189723320158103, "grad_norm": 2.3021750450134277, "learning_rate": 9.308597683653974e-05, "loss": 0.4521, "step": 930 },
    { "epoch": 0.9288537549407114, "grad_norm": 6.520279884338379, "learning_rate": 9.292243968009328e-05, "loss": 0.4443, "step": 940 },
    { "epoch": 0.9387351778656127, "grad_norm": 2.0640597343444824, "learning_rate": 9.27571381502673e-05, "loss": 0.416, "step": 950 },
    { "epoch": 0.9486166007905138, "grad_norm": 2.280644416809082, "learning_rate": 9.25900790419602e-05, "loss": 0.4331, "step": 960 },
    { "epoch": 0.958498023715415, "grad_norm": 2.3445639610290527, "learning_rate": 9.24212692223176e-05, "loss": 0.4512, "step": 970 },
    { "epoch": 0.9683794466403162, "grad_norm": 2.072683334350586, "learning_rate": 9.225071563045005e-05, "loss": 0.3967, "step": 980 },
    { "epoch": 0.9782608695652174, "grad_norm": 1.9060055017471313, "learning_rate": 9.207842527714764e-05, "loss": 0.4102, "step": 990 },
    { "epoch": 0.9881422924901185, "grad_norm": 2.248657464981079, "learning_rate": 9.1904405244592e-05, "loss": 0.4505, "step": 1000 },
    { "epoch": 0.9980237154150198, "grad_norm": 2.048110008239746, "learning_rate": 9.172866268606511e-05, "loss": 0.4102, "step": 1010 },
    { "epoch": 1.007905138339921, "grad_norm": 1.9891077280044556, "learning_rate": 9.155120482565518e-05, "loss": 0.3866, "step": 1020 },
    { "epoch": 1.017786561264822, "grad_norm": 2.499363422393799, "learning_rate": 9.13720389579598e-05, "loss": 0.3584, "step": 1030 },
    { "epoch": 1.0276679841897234, "grad_norm": 2.4077465534210205, "learning_rate": 9.119117244778605e-05, "loss": 0.3736, "step": 1040 },
    { "epoch": 1.0375494071146245, "grad_norm": 2.0941267013549805, "learning_rate": 9.100861272984777e-05, "loss": 0.3769, "step": 1050 },
    { "epoch": 1.0375494071146245, "eval_loss": 0.4222135841846466, "eval_runtime": 34.1626, "eval_samples_per_second": 14.636, "eval_steps_per_second": 14.636, "step": 1050 },
    { "epoch": 1.0474308300395256, "grad_norm": 2.29099702835083, "learning_rate": 9.082436730845992e-05, "loss": 0.3545, "step": 1060 },
    { "epoch": 1.0573122529644268, "grad_norm": 2.5847902297973633, "learning_rate": 9.063844375723012e-05, "loss": 0.3658, "step": 1070 },
    { "epoch": 1.0671936758893281, "grad_norm": 1.9889037609100342, "learning_rate": 9.045084971874735e-05, "loss": 0.3662, "step": 1080 },
    { "epoch": 1.0770750988142292, "grad_norm": 2.0356063842773438, "learning_rate": 9.026159290426779e-05, "loss": 0.3952, "step": 1090 },
    { "epoch": 1.0869565217391304, "grad_norm": 1.95900559425354, "learning_rate": 9.007068109339781e-05, "loss": 0.3624, "step": 1100 },
    { "epoch": 1.0968379446640317, "grad_norm": 2.0315041542053223, "learning_rate": 8.987812213377421e-05, "loss": 0.355, "step": 1110 },
    { "epoch": 1.1067193675889329, "grad_norm": 1.9098906517028809, "learning_rate": 8.968392394074161e-05, "loss": 0.3396, "step": 1120 },
    { "epoch": 1.116600790513834, "grad_norm": 2.3436784744262695, "learning_rate": 8.94880944970271e-05, "loss": 0.3433, "step": 1130 },
    { "epoch": 1.1264822134387351, "grad_norm": 2.013385534286499, "learning_rate": 8.92906418524121e-05, "loss": 0.3815, "step": 1140 },
    { "epoch": 1.1363636363636362, "grad_norm": 2.3570964336395264, "learning_rate": 8.909157412340148e-05, "loss": 0.3825, "step": 1150 },
    { "epoch": 1.1462450592885376, "grad_norm": 2.0097525119781494, "learning_rate": 8.889089949288984e-05, "loss": 0.3788, "step": 1160 },
    { "epoch": 1.1561264822134387, "grad_norm": 1.8614075183868408, "learning_rate": 8.868862620982532e-05, "loss": 0.3434, "step": 1170 },
    { "epoch": 1.1660079051383399, "grad_norm": 2.3193359375, "learning_rate": 8.848476258887028e-05, "loss": 0.3652, "step": 1180 },
    { "epoch": 1.1758893280632412, "grad_norm": 2.1564888954162598, "learning_rate": 8.827931701005971e-05, "loss": 0.3604, "step": 1190 },
    { "epoch": 1.1857707509881423, "grad_norm": 2.278334856033325, "learning_rate": 8.80722979184567e-05, "loss": 0.351, "step": 1200 },
    { "epoch": 1.1857707509881423, "eval_loss": 0.4201338589191437, "eval_runtime": 34.2095, "eval_samples_per_second": 14.616, "eval_steps_per_second": 14.616, "step": 1200 },
    { "epoch": 1.1956521739130435, "grad_norm": 2.3817718029022217, "learning_rate": 8.786371382380525e-05, "loss": 0.3681, "step": 1210 },
    { "epoch": 1.2055335968379446, "grad_norm": 2.221449613571167, "learning_rate": 8.765357330018053e-05, "loss": 0.396, "step": 1220 },
    { "epoch": 1.215415019762846, "grad_norm": 1.9129923582077026, "learning_rate": 8.744188498563639e-05, "loss": 0.3861, "step": 1230 },
    { "epoch": 1.225296442687747, "grad_norm": 2.0991668701171875, "learning_rate": 8.722865758185034e-05, "loss": 0.373, "step": 1240 },
    { "epoch": 1.2351778656126482, "grad_norm": 1.9412460327148438, "learning_rate": 8.701389985376575e-05, "loss": 0.3592, "step": 1250 },
    { "epoch": 1.2450592885375493, "grad_norm": 2.6546976566314697, "learning_rate": 8.679762062923174e-05, "loss": 0.3871, "step": 1260 },
    { "epoch": 1.2549407114624507, "grad_norm": 2.3372902870178223, "learning_rate": 8.657982879864005e-05, "loss": 0.3776, "step": 1270 },
    { "epoch": 1.2648221343873518, "grad_norm": 1.9796963930130005, "learning_rate": 8.636053331455984e-05, "loss": 0.377, "step": 1280 },
    { "epoch": 1.274703557312253, "grad_norm": 2.1785104274749756, "learning_rate": 8.613974319136955e-05, "loss": 0.3942, "step": 1290 },
    { "epoch": 1.2845849802371543, "grad_norm": 1.8092831373214722, "learning_rate": 8.591746750488636e-05, "loss": 0.367, "step": 1300 },
    { "epoch": 1.2944664031620554, "grad_norm": 2.4670629501342773, "learning_rate": 8.569371539199313e-05, "loss": 0.3744, "step": 1310 },
    { "epoch": 1.3043478260869565, "grad_norm": 2.104426383972168, "learning_rate": 8.546849605026287e-05, "loss": 0.3801, "step": 1320 },
    { "epoch": 1.3142292490118577, "grad_norm": 2.5602879524230957, "learning_rate": 8.524181873758057e-05, "loss": 0.352, "step": 1330 },
    { "epoch": 1.3241106719367588, "grad_norm": 2.211514949798584, "learning_rate": 8.501369277176273e-05, "loss": 0.3643, "step": 1340 },
    { "epoch": 1.3339920948616601, "grad_norm": 2.312812566757202, "learning_rate": 8.478412753017431e-05, "loss": 0.3681, "step": 1350 },
    { "epoch": 1.3339920948616601, "eval_loss": 0.417085200548172, "eval_runtime": 34.1379, "eval_samples_per_second": 14.646, "eval_steps_per_second": 14.646, "step": 1350 },
    { "epoch": 1.3438735177865613, "grad_norm": 2.7497165203094482, "learning_rate": 8.455313244934322e-05, "loss": 0.3739, "step": 1360 },
    { "epoch": 1.3537549407114624, "grad_norm": 2.3112716674804688, "learning_rate": 8.432071702457251e-05, "loss": 0.367, "step": 1370 },
    { "epoch": 1.3636363636363638, "grad_norm": 2.03934383392334, "learning_rate": 8.408689080954995e-05, "loss": 0.3506, "step": 1380 },
    { "epoch": 1.3735177865612649, "grad_norm": 1.942353367805481, "learning_rate": 8.385166341595547e-05, "loss": 0.3651, "step": 1390 },
    { "epoch": 1.383399209486166, "grad_norm": 1.985518455505371, "learning_rate": 8.361504451306582e-05, "loss": 0.3593, "step": 1400 },
    { "epoch": 1.3932806324110671, "grad_norm": 2.244945526123047, "learning_rate": 8.337704382735738e-05, "loss": 0.3616, "step": 1410 },
    { "epoch": 1.4031620553359683, "grad_norm": 2.084362268447876, "learning_rate": 8.313767114210614e-05, "loss": 0.3725, "step": 1420 },
    { "epoch": 1.4130434782608696, "grad_norm": 2.0909502506256104, "learning_rate": 8.289693629698562e-05, "loss": 0.3708, "step": 1430 },
    { "epoch": 1.4229249011857708, "grad_norm": 2.526142120361328, "learning_rate": 8.265484918766241e-05, "loss": 0.3688, "step": 1440 },
    { "epoch": 1.4328063241106719, "grad_norm": 1.7779805660247803, "learning_rate": 8.241141976538941e-05, "loss": 0.3628, "step": 1450 },
    { "epoch": 1.4426877470355732, "grad_norm": 2.2417075634002686, "learning_rate": 8.216665803659669e-05, "loss": 0.3539, "step": 1460 },
    { "epoch": 1.4525691699604744, "grad_norm": 2.3137755393981934, "learning_rate": 8.192057406248027e-05, "loss": 0.3526, "step": 1470 },
    { "epoch": 1.4624505928853755, "grad_norm": 2.308361768722534, "learning_rate": 8.167317795858849e-05, "loss": 0.364, "step": 1480 },
    { "epoch": 1.4723320158102766, "grad_norm": 2.1137235164642334, "learning_rate": 8.142447989440615e-05, "loss": 0.3725, "step": 1490 },
    { "epoch": 1.4822134387351777, "grad_norm": 2.206882953643799, "learning_rate": 8.117449009293666e-05, "loss": 0.38, "step": 1500 },
    { "epoch": 1.4822134387351777, "eval_loss": 0.41128015518188477, "eval_runtime": 34.0412, "eval_samples_per_second": 14.688, "eval_steps_per_second": 14.688, "step": 1500 },
    { "epoch": 1.492094861660079, "grad_norm": 2.540431499481201, "learning_rate": 8.092321883028156e-05, "loss": 0.3703, "step": 1510 },
    { "epoch": 1.5019762845849802, "grad_norm": 2.1057121753692627, "learning_rate": 8.067067643521832e-05, "loss": 0.3797, "step": 1520 },
    { "epoch": 1.5118577075098814, "grad_norm": 2.375397205352783, "learning_rate": 8.041687328877564e-05, "loss": 0.3657, "step": 1530 },
    { "epoch": 1.5217391304347827, "grad_norm": 2.403914451599121, "learning_rate": 8.016181982380679e-05, "loss": 0.3807, "step": 1540 },
    { "epoch": 1.5316205533596838, "grad_norm": 2.3958826065063477, "learning_rate": 7.990552652456078e-05, "loss": 0.3622, "step": 1550 },
    { "epoch": 1.541501976284585, "grad_norm": 2.351919651031494, "learning_rate": 7.964800392625127e-05, "loss": 0.3762, "step": 1560 },
    { "epoch": 1.5513833992094863, "grad_norm": 2.015793800354004, "learning_rate": 7.938926261462365e-05, "loss": 0.3536, "step": 1570 },
    { "epoch": 1.5612648221343872, "grad_norm": 2.037121295928955, "learning_rate": 7.912931322551979e-05, "loss": 0.3718, "step": 1580 },
    { "epoch": 1.5711462450592886, "grad_norm": 2.1762428283691406, "learning_rate": 7.886816644444096e-05, "loss": 0.3504, "step": 1590 },
    { "epoch": 1.5810276679841897, "grad_norm": 1.8388617038726807, "learning_rate": 7.860583300610847e-05, "loss": 0.3431, "step": 1600 },
    { "epoch": 1.5909090909090908, "grad_norm": 1.9121774435043335, "learning_rate": 7.834232369402248e-05, "loss": 0.3769, "step": 1610 },
    { "epoch": 1.6007905138339922, "grad_norm": 2.1485304832458496, "learning_rate": 7.807764934001872e-05, "loss": 0.3361, "step": 1620 },
    { "epoch": 1.6106719367588933, "grad_norm": 2.162116289138794, "learning_rate": 7.781182082382322e-05, "loss": 0.3747, "step": 1630 },
    { "epoch": 1.6205533596837944, "grad_norm": 2.514573335647583, "learning_rate": 7.754484907260511e-05, "loss": 0.3857, "step": 1640 },
    { "epoch": 1.6304347826086958, "grad_norm": 2.3473386764526367, "learning_rate": 7.727674506052742e-05, "loss": 0.3269, "step": 1650 },
    { "epoch": 1.6304347826086958, "eval_loss": 0.4064118564128876, "eval_runtime": 34.0417, "eval_samples_per_second": 14.688, "eval_steps_per_second": 14.688, "step": 1650 },
    { "epoch": 1.6403162055335967, "grad_norm": 2.0117132663726807, "learning_rate": 7.700751980829599e-05, "loss": 0.3662, "step": 1660 },
    { "epoch": 1.650197628458498, "grad_norm": 2.5356202125549316, "learning_rate": 7.673718438270646e-05, "loss": 0.3671, "step": 1670 },
    { "epoch": 1.6600790513833992, "grad_norm": 2.1220240592956543, "learning_rate": 7.646574989618936e-05, "loss": 0.3655, "step": 1680 },
    { "epoch": 1.6699604743083003, "grad_norm": 2.049267292022705, "learning_rate": 7.619322750635325e-05, "loss": 0.3916, "step": 1690 },
    { "epoch": 1.6798418972332017, "grad_norm": 2.0539910793304443, "learning_rate": 7.591962841552624e-05, "loss": 0.4168, "step": 1700 },
    { "epoch": 1.6897233201581028, "grad_norm": 2.229034662246704, "learning_rate": 7.56449638702953e-05, "loss": 0.369, "step": 1710 },
    { "epoch": 1.699604743083004, "grad_norm": 2.280418634414673, "learning_rate": 7.536924516104408e-05, "loss": 0.375, "step": 1720 },
    { "epoch": 1.7094861660079053, "grad_norm": 1.9317281246185303, "learning_rate": 7.509248362148886e-05, "loss": 0.3602, "step": 1730 },
    { "epoch": 1.7193675889328062, "grad_norm": 2.0074923038482666, "learning_rate": 7.481469062821249e-05, "loss": 0.3763, "step": 1740 },
    { "epoch": 1.7292490118577075, "grad_norm": 2.6529626846313477, "learning_rate": 7.453587760019688e-05, "loss": 0.3867, "step": 1750 },
    { "epoch": 1.7391304347826086, "grad_norm": 2.64829421043396, "learning_rate": 7.425605599835358e-05, "loss": 0.3459, "step": 1760 },
    { "epoch": 1.7490118577075098, "grad_norm": 2.139469861984253, "learning_rate": 7.397523732505269e-05, "loss": 0.3763, "step": 1770 },
    { "epoch": 1.7588932806324111, "grad_norm": 2.043088674545288, "learning_rate": 7.369343312364992e-05, "loss": 0.3313, "step": 1780 },
    { "epoch": 1.7687747035573123, "grad_norm": 2.4256412982940674, "learning_rate": 7.341065497801227e-05, "loss": 0.3607, "step": 1790 },
    { "epoch": 1.7786561264822134, "grad_norm": 2.4966022968292236, "learning_rate": 7.312691451204175e-05, "loss": 0.3413, "step": 1800 },
    { "epoch": 1.7786561264822134, "eval_loss": 0.39886847138404846, "eval_runtime": 34.0089, "eval_samples_per_second": 14.702, "eval_steps_per_second": 14.702, "step": 1800 },
    { "epoch": 1.7885375494071147, "grad_norm": 2.126098394393921, "learning_rate": 7.284222338919757e-05, "loss": 0.3505, "step": 1810 },
    { "epoch": 1.7984189723320159, "grad_norm": 2.32716965675354, "learning_rate": 7.25565933120167e-05, "loss": 0.3706, "step": 1820 },
    { "epoch": 1.808300395256917, "grad_norm": 2.4196839332580566, "learning_rate": 7.227003602163294e-05, "loss": 0.3672, "step": 1830 },
    { "epoch": 1.8181818181818183, "grad_norm": 2.1417181491851807, "learning_rate": 7.19825632972941e-05, "loss": 0.3467, "step": 1840 },
    { "epoch": 1.8280632411067192, "grad_norm": 1.9946470260620117, "learning_rate": 7.169418695587788e-05, "loss": 0.3639, "step": 1850 },
    { "epoch": 1.8379446640316206, "grad_norm": 2.3900909423828125, "learning_rate": 7.140491885140627e-05, "loss": 0.354, "step": 1860 },
    { "epoch": 1.8478260869565217, "grad_norm": 2.3250668048858643, "learning_rate": 7.111477087455798e-05, "loss": 0.3829, "step": 1870 },
    { "epoch": 1.8577075098814229, "grad_norm": 2.3011209964752197, "learning_rate": 7.082375495217994e-05, "loss": 0.3567, "step": 1880 },
    { "epoch": 1.8675889328063242, "grad_norm": 2.4620919227600098, "learning_rate": 7.053188304679689e-05, "loss": 0.3729, "step": 1890 },
    { "epoch": 1.8774703557312253, "grad_norm": 1.9825767278671265, "learning_rate": 7.023916715611966e-05, "loss": 0.367, "step": 1900 },
    { "epoch": 1.8873517786561265, "grad_norm": 2.1703319549560547, "learning_rate": 6.994561931255207e-05, "loss": 0.3818, "step": 1910 },
    { "epoch": 1.8972332015810278, "grad_norm": 1.79076087474823, "learning_rate": 6.965125158269616e-05, "loss": 0.3553, "step": 1920 },
    { "epoch": 1.9071146245059287, "grad_norm": 2.1293234825134277, "learning_rate": 6.935607606685639e-05, "loss": 0.3665, "step": 1930 },
    { "epoch": 1.91699604743083, "grad_norm": 2.227125883102417, "learning_rate": 6.906010489854208e-05, "loss": 0.3753, "step": 1940 },
    { "epoch": 1.9268774703557312, "grad_norm": 1.7864975929260254, "learning_rate": 6.87633502439687e-05, "loss": 0.3534, "step": 1950 },
    { "epoch": 1.9268774703557312, "eval_loss": 0.3949296772480011, "eval_runtime": 34.1382, "eval_samples_per_second": 14.646, "eval_steps_per_second": 14.646, "step": 1950 },
    { "epoch": 1.9367588932806323, "grad_norm": 1.8437656164169312, "learning_rate": 6.84658243015578e-05, "loss": 0.3605, "step": 1960 },
    { "epoch": 1.9466403162055337, "grad_norm": 2.3836982250213623, "learning_rate": 6.816753930143555e-05, "loss": 0.3686, "step": 1970 },
    { "epoch": 1.9565217391304348, "grad_norm": 2.2831881046295166, "learning_rate": 6.786850750493004e-05, "loss": 0.3655, "step": 1980 },
    { "epoch": 1.966403162055336, "grad_norm": 2.3932294845581055, "learning_rate": 6.756874120406713e-05, "loss": 0.3802, "step": 1990 },
    { "epoch": 1.9762845849802373, "grad_norm": 1.7772880792617798, "learning_rate": 6.726825272106537e-05, "loss": 0.3454, "step": 2000 },
    { "epoch": 1.9861660079051382, "grad_norm": 1.9510533809661865, "learning_rate": 6.696705440782937e-05, "loss": 0.3789, "step": 2010 },
    { "epoch": 1.9960474308300395, "grad_norm": 2.113067150115967, "learning_rate": 6.666515864544208e-05, "loss": 0.3718, "step": 2020 },
    { "epoch": 2.005928853754941, "grad_norm": 2.007193088531494, "learning_rate": 6.636257784365583e-05, "loss": 0.2785, "step": 2030 },
    { "epoch": 2.015810276679842, "grad_norm": 1.9907631874084473, "learning_rate": 6.605932444038227e-05, "loss": 0.284, "step": 2040 },
    { "epoch": 2.025691699604743, "grad_norm": 2.3482143878936768, "learning_rate": 6.575541090118102e-05, "loss": 0.2744, "step": 2050 },
    { "epoch": 2.035573122529644, "grad_norm": 2.2984330654144287, "learning_rate": 6.545084971874736e-05, "loss": 0.2763, "step": 2060 },
    { "epoch": 2.0454545454545454, "grad_norm": 2.089308261871338, "learning_rate": 6.51456534123986e-05, "loss": 0.2993, "step": 2070 },
    { "epoch": 2.0553359683794468, "grad_norm": 1.9980093240737915, "learning_rate": 6.483983452755952e-05, "loss": 0.295, "step": 2080 },
    { "epoch": 2.0652173913043477, "grad_norm": 2.138206958770752, "learning_rate": 6.453340563524668e-05, "loss": 0.302, "step": 2090 },
    { "epoch": 2.075098814229249, "grad_norm": 2.199354887008667, "learning_rate": 6.422637933155161e-05, "loss": 0.2791, "step": 2100 },
    { "epoch": 2.075098814229249, "eval_loss": 0.4022028148174286, "eval_runtime": 34.24, "eval_samples_per_second": 14.603, "eval_steps_per_second": 14.603, "step": 2100 },
    { "epoch": 2.0849802371541504, "grad_norm": 1.7849321365356445, "learning_rate": 6.391876823712316e-05, "loss": 0.2882, "step": 2110 },
    { "epoch": 2.0948616600790513, "grad_norm": 2.316427230834961, "learning_rate": 6.361058499664854e-05, "loss": 0.2893, "step": 2120 },
    { "epoch": 2.1047430830039526, "grad_norm": 2.092482328414917, "learning_rate": 6.330184227833374e-05, "loss": 0.2851, "step": 2130 },
    { "epoch": 2.1146245059288535, "grad_norm": 2.0520172119140625, "learning_rate": 6.299255277338263e-05, "loss": 0.2893, "step": 2140 },
    { "epoch": 2.124505928853755, "grad_norm": 2.0501887798309326, "learning_rate": 6.268272919547534e-05, "loss": 0.2877, "step": 2150 },
    { "epoch": 2.1343873517786562, "grad_norm": 1.967375636100769, "learning_rate": 6.23723842802457e-05, "loss": 0.2838, "step": 2160 },
    { "epoch": 2.144268774703557, "grad_norm": 2.331676959991455, "learning_rate": 6.20615307847576e-05, "loss": 0.3169, "step": 2170 },
    { "epoch": 2.1541501976284585, "grad_norm": 2.251298666000366, "learning_rate": 6.175018148698074e-05, "loss": 0.2882, "step": 2180 },
    { "epoch": 2.16403162055336, "grad_norm": 2.0839710235595703, "learning_rate": 6.143834918526526e-05, "loss": 0.2862, "step": 2190 },
    { "epoch": 2.1739130434782608, "grad_norm": 2.633404493331909, "learning_rate": 6.112604669781571e-05, "loss": 0.2815, "step": 2200 },
    { "epoch": 2.183794466403162, "grad_norm": 2.089813709259033, "learning_rate": 6.081328686216416e-05, "loss": 0.3046, "step": 2210 },
    { "epoch": 2.1936758893280635, "grad_norm": 2.1596994400024414, "learning_rate": 6.050008253464245e-05, "loss": 0.2834, "step": 2220 },
    { "epoch": 2.2035573122529644, "grad_norm": 1.879882574081421, "learning_rate": 6.018644658985377e-05, "loss": 0.2797, "step": 2230 },
    { "epoch": 2.2134387351778657, "grad_norm": 2.6231043338775635, "learning_rate": 5.987239192014334e-05, "loss": 0.2761, "step": 2240 },
    { "epoch": 2.2233201581027666, "grad_norm": 2.381791114807129, "learning_rate": 5.9557931435068606e-05, "loss": 0.3281, "step": 2250 },
    { "epoch": 2.2233201581027666, "eval_loss": 0.40134918689727783, "eval_runtime": 34.1432, "eval_samples_per_second": 14.644, "eval_steps_per_second": 14.644, "step": 2250 },
    { "epoch": 2.233201581027668, "grad_norm": 2.037327527999878, "learning_rate": 5.9243078060868426e-05, "loss": 0.2772, "step": 2260 },
    { "epoch": 2.2430830039525693, "grad_norm": 2.2997586727142334, "learning_rate": 5.892784473993182e-05, "loss": 0.2905, "step": 2270 },
    { "epoch": 2.2529644268774702, "grad_norm": 1.9744611978530884, "learning_rate": 5.861224443026593e-05, "loss": 0.2868, "step": 2280 },
    { "epoch": 2.2628458498023716, "grad_norm": 2.116672992706299, "learning_rate": 5.8296290104963387e-05, "loss": 0.2858, "step": 2290 },
    { "epoch": 2.2727272727272725, "grad_norm": 2.145845890045166, "learning_rate": 5.797999475166895e-05, "loss": 0.2752, "step": 2300 },
    { "epoch": 2.282608695652174, "grad_norm": 2.2736833095550537, "learning_rate": 5.766337137204578e-05, "loss": 0.2984, "step": 2310 },
    { "epoch": 2.292490118577075, "grad_norm": 2.218451499938965, "learning_rate": 5.734643298124089e-05, "loss": 0.2912, "step": 2320 },
    { "epoch": 2.302371541501976, "grad_norm": 1.9524636268615723, "learning_rate": 5.702919260735013e-05, "loss": 0.2919, "step": 2330 },
    { "epoch": 2.3122529644268774, "grad_norm": 2.35251784324646, "learning_rate": 5.671166329088276e-05, "loss": 0.3256, "step": 2340 },
    { "epoch": 2.322134387351779, "grad_norm": 2.4238321781158447, "learning_rate": 5.639385808422529e-05, "loss": 0.3056, "step": 2350 },
    { "epoch": 2.3320158102766797, "grad_norm": 2.408384084701538, "learning_rate": 5.607579005110501e-05, "loss": 0.2833, "step": 2360 },
    { "epoch": 2.341897233201581, "grad_norm": 2.345621109008789, "learning_rate": 5.575747226605297e-05, "loss": 0.2961, "step": 2370 },
    { "epoch": 2.3517786561264824, "grad_norm": 2.226508140563965, "learning_rate": 5.543891781386654e-05, "loss": 0.3138, "step": 2380 },
    { "epoch": 2.3616600790513833, "grad_norm": 2.230583429336548, "learning_rate": 5.5120139789071554e-05, "loss": 0.2837, "step": 2390 },
    { "epoch": 2.3715415019762847, "grad_norm": 2.449136972427368, "learning_rate": 5.480115129538408e-05, "loss": 0.2763, "step": 2400 },
    { "epoch": 2.3715415019762847, "eval_loss": 0.399911105632782, "eval_runtime": 34.0722, "eval_samples_per_second": 14.675, "eval_steps_per_second": 14.675, "step": 2400 },
    { "epoch": 2.3814229249011856, "grad_norm": 2.04034161567688, "learning_rate": 5.4481965445171666e-05, "loss": 0.2889, "step": 2410 },
    { "epoch": 2.391304347826087, "grad_norm": 2.589332103729248, "learning_rate": 5.416259535891445e-05, "loss": 0.2882, "step": 2420 },
    { "epoch": 2.4011857707509883, "grad_norm": 1.8965773582458496, "learning_rate": 5.384305416466583e-05, "loss": 0.2664, "step": 2430 },
    { "epoch": 2.411067193675889, "grad_norm": 1.9925569295883179, "learning_rate": 5.3523354997512684e-05, "loss": 0.3079, "step": 2440 },
    { "epoch": 2.4209486166007905, "grad_norm": 1.982309341430664, "learning_rate": 5.320351099903564e-05, "loss": 0.2893, "step": 2450 },
    { "epoch": 2.430830039525692, "grad_norm": 2.4800615310668945, "learning_rate": 5.288353531676871e-05, "loss": 0.2576, "step": 2460 },
    { "epoch": 2.440711462450593, "grad_norm": 2.2494561672210693, "learning_rate": 5.256344110365895e-05, "loss": 0.292, "step": 2470 },
    { "epoch": 2.450592885375494, "grad_norm": 1.8020946979522705, "learning_rate": 5.224324151752574e-05, "loss": 0.2863, "step": 2480 },
    { "epoch": 2.4604743083003955, "grad_norm": 2.187232494354248, "learning_rate": 5.192294972051991e-05, "loss": 0.2753, "step": 2490 },
    { "epoch": 2.4703557312252964, "grad_norm": 2.2467732429504395, "learning_rate": 5.160257887858276e-05, "loss": 0.3188, "step": 2500 },
    { "epoch": 2.4802371541501977, "grad_norm": 1.9890021085739136, "learning_rate": 5.128214216090477e-05, "loss": 0.2838, "step": 2510 },
    { "epoch": 2.4901185770750986, "grad_norm": 1.962117075920105, "learning_rate": 5.096165273938434e-05, "loss": 0.2858, "step": 2520 },
    { "epoch": 2.5, "grad_norm": 1.8473106622695923, "learning_rate": 5.064112378808635e-05, "loss": 0.2692, "step": 2530 },
    { "epoch": 2.5098814229249014, "grad_norm": 1.9031504392623901, "learning_rate": 5.032056848270054e-05, "loss": 0.2993, "step": 2540 },
    { "epoch": 2.5197628458498023, "grad_norm": 2.2502200603485107, "learning_rate": 4.999999999999999e-05, "loss": 0.2696, "step": 2550 },
    { "epoch": 2.5197628458498023, "eval_loss": 0.39579418301582336, "eval_runtime": 34.1604, "eval_samples_per_second": 14.637, "eval_steps_per_second": 14.637, "step": 2550 },
    { "epoch": 2.5296442687747036, "grad_norm": 2.3031277656555176, "learning_rate": 4.9679431517299435e-05, "loss": 0.3062, "step": 2560 },
    { "epoch": 2.5395256916996045, "grad_norm": 2.183401107788086, "learning_rate": 4.9358876211913624e-05, "loss": 0.325, "step": 2570 },
    { "epoch": 2.549407114624506, "grad_norm": 1.9418132305145264, "learning_rate": 4.9038347260615636e-05, "loss": 0.2874, "step": 2580 },
    { "epoch": 2.559288537549407, "grad_norm": 2.340853214263916, "learning_rate": 4.871785783909522e-05, "loss": 0.2914, "step": 2590 },
    { "epoch": 2.5691699604743086, "grad_norm": 1.8216912746429443, "learning_rate": 4.839742112141723e-05, "loss": 0.2935, "step": 2600 },
    { "epoch": 2.5790513833992095, "grad_norm": 2.1227974891662598, "learning_rate": 4.807705027948006e-05, "loss": 0.2903, "step": 2610 },
    { "epoch": 2.588932806324111, "grad_norm": 2.1689720153808594, "learning_rate": 4.775675848247426e-05, "loss": 0.2919, "step": 2620 },
    { "epoch": 2.5988142292490117, "grad_norm": 2.3520572185516357, "learning_rate": 4.7436558896341037e-05, "loss": 0.2947, "step": 2630 },
    { "epoch": 2.608695652173913, "grad_norm": 2.0316853523254395, "learning_rate": 4.711646468323127e-05, "loss": 0.2921, "step": 2640 },
    { "epoch": 2.6185770750988144, "grad_norm": 2.334075450897217, "learning_rate": 4.6796489000964345e-05, "loss": 0.3109, "step": 2650 },
    { "epoch": 2.6284584980237153, "grad_norm": 2.4072225093841553, "learning_rate": 4.6476645002487286e-05, "loss": 0.2886, "step": 2660 },
    { "epoch": 2.6383399209486167, "grad_norm": 2.3423006534576416, "learning_rate": 4.615694583533417e-05, "loss": 0.3002, "step": 2670 },
    { "epoch": 2.6482213438735176, "grad_norm": 2.290945291519165, "learning_rate": 4.5837404641085526e-05, "loss": 0.3013, "step": 2680 },
    { "epoch": 2.658102766798419, "grad_norm": 2.3166189193725586, "learning_rate": 4.551803455482832e-05, "loss": 0.2855, "step": 2690 },
    { "epoch": 2.6679841897233203, "grad_norm": 2.2672386169433594, "learning_rate": 4.51988487046159e-05, "loss": 0.2732, "step": 2700 },
    { "epoch": 2.6679841897233203, "eval_loss": 0.3915008008480072, "eval_runtime": 34.1864, "eval_samples_per_second": 14.626, "eval_steps_per_second": 14.626, "step": 2700 },
    { "epoch": 2.677865612648221, "grad_norm": 2.359931468963623, "learning_rate": 4.487986021092842e-05, "loss": 0.3326, "step": 2710 },
    { "epoch": 2.6877470355731226, "grad_norm": 1.9921296834945679, "learning_rate": 4.456108218613345e-05, "loss": 0.28, "step": 2720 },
    { "epoch": 2.6976284584980235, "grad_norm": 2.083142042160034, "learning_rate": 4.4242527733947024e-05, "loss": 0.2936, "step": 2730 },
    { "epoch": 2.707509881422925, "grad_norm": 2.4226627349853516, "learning_rate": 4.3924209948894975e-05, "loss": 0.3011, "step": 2740 },
    { "epoch": 2.717391304347826, "grad_norm": 2.4604616165161133, "learning_rate": 4.360614191577469e-05, "loss": 0.2702, "step": 2750 },
    { "epoch": 2.7272727272727275, "grad_norm": 1.890394926071167, "learning_rate": 4.3288336709117236e-05, "loss": 0.2793, "step": 2760 },
    { "epoch": 2.7371541501976284, "grad_norm": 2.458721160888672, "learning_rate": 4.297080739264986e-05, "loss": 0.2882, "step": 2770 },
    { "epoch": 2.7470355731225298, "grad_norm": 2.1499905586242676, "learning_rate": 4.2653567018759094e-05, "loss": 0.2955, "step": 2780 },
    { "epoch": 2.7569169960474307, "grad_norm": 2.4274818897247314, "learning_rate": 4.233662862795419e-05, "loss": 0.302, "step": 2790 },
    { "epoch": 2.766798418972332, "grad_norm": 2.062570571899414, "learning_rate": 4.202000524833104e-05, "loss": 0.2715, "step": 2800 },
    { "epoch": 2.7766798418972334, "grad_norm": 2.0354299545288086, "learning_rate": 4.170370989503661e-05, "loss": 0.3038, "step": 2810 },
    { "epoch": 2.7865612648221343, "grad_norm": 2.3393170833587646, "learning_rate": 4.1387755569734046e-05, "loss": 0.2905, "step": 2820 },
    { "epoch": 2.7964426877470356, "grad_norm": 2.224705934524536, "learning_rate": 4.1072155260068164e-05, "loss": 0.2989, "step": 2830 },
    { "epoch": 2.8063241106719365, "grad_norm": 1.9835401773452759, "learning_rate": 4.075692193913155e-05, "loss": 0.2879, "step": 2840 },
    { "epoch": 2.816205533596838, "grad_norm": 2.378260850906372, "learning_rate": 4.0442068564931385e-05, "loss": 0.3009, "step": 2850 },
    { "epoch": 2.816205533596838, "eval_loss": 0.38761380314826965, "eval_runtime": 34.3575, "eval_samples_per_second": 14.553, "eval_steps_per_second": 14.553, "step": 2850 },
    { "epoch": 2.8260869565217392, "grad_norm": 2.5100574493408203, "learning_rate": 4.012760807985664e-05, "loss": 0.2982, "step": 2860 },
    { "epoch": 2.83596837944664, "grad_norm": 1.639036774635315, "learning_rate": 3.9813553410146214e-05, "loss": 0.3087, "step": 2870 },
    { "epoch": 2.8458498023715415, "grad_norm": 2.490206241607666, "learning_rate": 3.949991746535752e-05, "loss": 0.2989, "step": 2880 },
    { "epoch": 2.8557312252964424, "grad_norm": 2.3250222206115723, "learning_rate": 3.918671313783582e-05, "loss": 0.3034, "step": 2890 },
    { "epoch": 2.8656126482213438, "grad_norm": 2.325777292251587, "learning_rate": 3.8873953302184275e-05, "loss": 0.2747, "step": 2900 },
    { "epoch": 2.875494071146245, "grad_norm": 2.2305283546447754, "learning_rate": 3.856165081473473e-05, "loss": 0.2856, "step": 2910 },
    { "epoch": 2.8853754940711465, "grad_norm": 2.2349300384521484, "learning_rate": 3.824981851301923e-05, "loss": 0.2716, "step": 2920 },
    { "epoch": 2.8952569169960474, "grad_norm": 1.9070847034454346, "learning_rate": 3.793846921524236e-05, "loss": 0.3053, "step": 2930 },
    { "epoch": 2.9051383399209487, "grad_norm": 2.1934654712677, "learning_rate": 3.7627615719754287e-05, "loss": 0.3143, "step": 2940 },
    { "epoch": 2.9150197628458496, "grad_norm": 1.9646544456481934, "learning_rate": 3.7317270804524626e-05, "loss": 0.2865, "step": 2950 },
    { "epoch": 2.924901185770751, "grad_norm": 2.232283353805542, "learning_rate": 3.700744722661735e-05, "loss": 0.2958, "step": 2960 },
    { "epoch": 2.9347826086956523, "grad_norm": 2.3013436794281006, "learning_rate": 3.669815772166624e-05, "loss": 0.2834, "step": 2970 },
    { "epoch": 2.9446640316205532, "grad_norm": 2.4497499465942383, "learning_rate": 3.6389415003351434e-05, "loss": 0.2978, "step": 2980 },
    { "epoch": 2.9545454545454546, "grad_norm": 1.9342460632324219, "learning_rate": 3.608123176287684e-05, "loss": 0.2826, "step": 2990 },
    { "epoch": 2.9644268774703555, "grad_norm": 2.4852263927459717, "learning_rate": 3.577362066844837e-05, "loss": 0.2766, "step": 3000 },
    { "epoch": 2.9644268774703555, "eval_loss": 0.3866276741027832, "eval_runtime": 34.3485, "eval_samples_per_second": 14.557, "eval_steps_per_second": 14.557, "step": 3000 },
    { "epoch": 2.974308300395257, "grad_norm": 1.9710100889205933, "learning_rate": 3.546659436475331e-05, "loss": 0.282, "step": 3010 },
    { "epoch": 2.984189723320158, "grad_norm": 2.269618511199951, "learning_rate": 3.516016547244046e-05, "loss": 0.2974, "step": 3020 },
    { "epoch": 2.9940711462450595, "grad_norm": 2.038463830947876, "learning_rate": 3.485434658760139e-05, "loss": 0.2931, "step": 3030 },
    { "epoch": 3.0039525691699605, "grad_norm": 1.9927372932434082, "learning_rate": 3.454915028125262e-05, "loss": 0.2778, "step": 3040 },
    { "epoch": 3.013833992094862, "grad_norm": 2.1115493774414062, "learning_rate": 3.424458909881896e-05, "loss": 0.228, "step": 3050 },
    { "epoch": 3.0237154150197627, "grad_norm": 2.1515655517578125, "learning_rate": 3.394067555961772e-05, "loss": 0.2172, "step": 3060 },
    { "epoch": 3.033596837944664, "grad_norm": 2.6619913578033447, "learning_rate": 3.3637422156344146e-05, "loss": 0.2158, "step": 3070 },
    { "epoch": 3.0434782608695654, "grad_norm": 2.441153049468994, "learning_rate": 3.333484135455791e-05, "loss": 0.2305, "step": 3080 },
    { "epoch": 3.0533596837944663, "grad_norm": 1.8996953964233398, "learning_rate": 3.3032945592170616e-05, "loss": 0.2436, "step": 3090 },
    { "epoch": 3.0632411067193677, "grad_norm": 2.410764217376709, "learning_rate": 3.2731747278934616e-05, "loss": 0.2198, "step": 3100 },
    { "epoch": 3.0731225296442686, "grad_norm": 1.9897364377975464, "learning_rate": 3.243125879593285e-05, "loss": 0.2309, "step": 3110 },
    { "epoch": 3.08300395256917, "grad_norm": 2.573441982269287, "learning_rate": 3.213149249506996e-05, "loss": 0.213, "step": 3120 },
    { "epoch": 3.0928853754940713, "grad_norm": 2.5531418323516846, "learning_rate": 3.1832460698564424e-05, "loss": 0.234, "step": 3130 },
    { "epoch": 3.102766798418972, "grad_norm": 2.101729154586792, "learning_rate": 3.1534175698442184e-05, "loss": 0.239, "step": 3140 },
    { "epoch": 3.1126482213438735, "grad_norm": 1.9485621452331543, "learning_rate": 3.123664975603129e-05, "loss": 0.2145, "step": 3150 },
    { "epoch": 3.1126482213438735, "eval_loss": 0.40044403076171875, "eval_runtime": 34.1191, "eval_samples_per_second": 14.655, "eval_steps_per_second": 14.655, "step": 3150 },
    { "epoch": 3.122529644268775, "grad_norm": 2.287511110305786, "learning_rate": 3.093989510145791e-05, "loss": 0.2225, "step": 3160 },
    { "epoch": 3.132411067193676, "grad_norm": 2.047910451889038, "learning_rate": 3.064392393314359e-05, "loss": 0.2178, "step": 3170 },
    { "epoch": 3.142292490118577, "grad_norm": 1.6208367347717285, "learning_rate": 3.0348748417303817e-05, "loss": 0.222, "step": 3180 },
    { "epoch": 3.1521739130434785, "grad_norm": 2.4111440181732178, "learning_rate": 3.005438068744791e-05, "loss": 0.2177, "step": 3190 },
    { "epoch": 3.1620553359683794, "grad_norm": 2.368447780609131, "learning_rate": 2.9760832843880303e-05, "loss": 0.2012, "step": 3200 },
    { "epoch": 3.1719367588932808, "grad_norm": 2.473605155944824, "learning_rate": 2.94681169532031e-05, "loss": 0.2277, "step": 3210 },
    { "epoch": 3.1818181818181817, "grad_norm": 2.02504301071167, "learning_rate": 2.9176245047820055e-05, "loss": 0.2193, "step": 3220 },
    { "epoch": 3.191699604743083, "grad_norm": 2.4214389324188232, "learning_rate": 2.8885229125442014e-05, "loss": 0.2196, "step": 3230 },
    { "epoch": 3.2015810276679844, "grad_norm": 2.1690824031829834, "learning_rate": 2.859508114859373e-05, "loss": 0.2267, "step": 3240 },
    { "epoch": 3.2114624505928853, "grad_norm": 1.960707426071167, "learning_rate": 2.830581304412209e-05, "loss": 0.2409, "step": 3250 },
    { "epoch": 3.2213438735177866, "grad_norm": 2.320850133895874, "learning_rate": 2.8017436702705894e-05, "loss": 0.2245, "step": 3260 },
    { "epoch": 3.2312252964426875, "grad_norm": 2.268925428390503, "learning_rate": 2.7729963978367035e-05, "loss": 0.2373, "step": 3270 },
    { "epoch": 3.241106719367589, "grad_norm": 2.374448776245117, "learning_rate": 2.7443406687983255e-05, "loss": 0.2188, "step": 3280 },
    { "epoch": 3.2509881422924902, "grad_norm": 1.9990499019622803, "learning_rate": 2.7157776610802408e-05, "loss": 0.253, "step": 3290 },
    { "epoch": 3.260869565217391, "grad_norm": 2.6509578227996826, "learning_rate": 2.6873085487958243e-05, "loss": 0.2471, "step": 3300 },
    { "epoch": 3.260869565217391, "eval_loss": 0.4012451171875, "eval_runtime": 34.3577, "eval_samples_per_second": 14.553, "eval_steps_per_second": 14.553, "step": 3300 },
    { "epoch": 3.2707509881422925, "grad_norm": 2.4065935611724854, "learning_rate": 2.6589345021987714e-05, "loss": 0.2455, "step": 3310 },
    { "epoch": 3.280632411067194, "grad_norm": 2.315992593765259, "learning_rate": 2.6306566876350062e-05, "loss": 0.2137, "step": 3320 },
    { "epoch": 3.2905138339920947, "grad_norm": 2.162156820297241, "learning_rate": 2.6024762674947306e-05, "loss": 0.2075, "step": 3330 },
    { "epoch": 3.300395256916996, "grad_norm": 2.3874671459198, "learning_rate": 2.5743944001646384e-05, "loss": 0.2452, "step": 3340 },
    { "epoch": 3.3102766798418974, "grad_norm": 2.3413357734680176, "learning_rate": 2.5464122399803118e-05, "loss": 0.2491, "step": 3350 },
    { "epoch": 3.3201581027667983, "grad_norm": 2.0517518520355225, "learning_rate": 2.5185309371787506e-05, "loss": 0.2304, "step": 3360 },
    { "epoch": 3.3300395256916997, "grad_norm": 2.5444791316986084, "learning_rate": 2.490751637851113e-05, "loss": 0.2252, "step": 3370 },
    { "epoch": 3.3399209486166006, "grad_norm": 3.271428108215332, "learning_rate": 2.4630754838955894e-05, "loss": 0.2221, "step": 3380 },
    { "epoch": 3.349802371541502, "grad_norm": 2.159346103668213, "learning_rate": 2.4355036129704693e-05, "loss": 0.2355, "step": 3390 },
    { "epoch": 3.3596837944664033, "grad_norm": 1.997672438621521, "learning_rate": 2.408037158447374e-05, "loss": 0.2128, "step": 3400 },
    { "epoch": 3.369565217391304, "grad_norm": 1.8913171291351318, "learning_rate": 2.3806772493646716e-05, "loss": 0.2133, "step": 3410 },
    { "epoch": 3.3794466403162056, "grad_norm": 2.4527885913848877,
|
"learning_rate": 2.3534250103810622e-05, |
|
"loss": 0.2171, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 3.3893280632411065, |
|
"grad_norm": 2.4104301929473877, |
|
"learning_rate": 2.326281561729351e-05, |
|
"loss": 0.2268, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 3.399209486166008, |
|
"grad_norm": 1.9531340599060059, |
|
"learning_rate": 2.2992480191703996e-05, |
|
"loss": 0.2158, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 3.409090909090909, |
|
"grad_norm": 1.6948319673538208, |
|
"learning_rate": 2.2723254939472564e-05, |
|
"loss": 0.2223, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 3.409090909090909, |
|
"eval_loss": 0.40440815687179565, |
|
"eval_runtime": 34.4399, |
|
"eval_samples_per_second": 14.518, |
|
"eval_steps_per_second": 14.518, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 3.4189723320158105, |
|
"grad_norm": 2.2849342823028564, |
|
"learning_rate": 2.2455150927394874e-05, |
|
"loss": 0.2354, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 3.4288537549407114, |
|
"grad_norm": 2.416132688522339, |
|
"learning_rate": 2.218817917617676e-05, |
|
"loss": 0.2322, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 3.438735177865613, |
|
"grad_norm": 2.2576396465301514, |
|
"learning_rate": 2.1922350659981254e-05, |
|
"loss": 0.2366, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 3.4486166007905137, |
|
"grad_norm": 2.38684344291687, |
|
"learning_rate": 2.1657676305977515e-05, |
|
"loss": 0.2322, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 3.458498023715415, |
|
"grad_norm": 2.6015143394470215, |
|
"learning_rate": 2.1394166993891523e-05, |
|
"loss": 0.2412, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 3.4683794466403164, |
|
"grad_norm": 2.134091854095459, |
|
"learning_rate": 2.1131833555559034e-05, |
|
"loss": 0.2176, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 3.4782608695652173, |
|
"grad_norm": 2.0862109661102295, |
|
"learning_rate": 2.0870686774480193e-05, |
|
"loss": 0.212, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 3.4881422924901186, |
|
"grad_norm": 2.139421224594116, |
|
"learning_rate": 2.0610737385376345e-05, |
|
"loss": 0.2398, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 3.4980237154150196, |
|
"grad_norm": 2.5852270126342773, |
|
"learning_rate": 2.035199607374871e-05, |
|
"loss": 0.2417, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 3.507905138339921, |
|
"grad_norm": 2.2502431869506836, |
|
"learning_rate": 2.0094473475439195e-05, |
|
"loss": 0.2277, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 3.5177865612648223, |
|
"grad_norm": 1.9478706121444702, |
|
"learning_rate": 1.983818017619317e-05, |
|
"loss": 0.247, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 3.527667984189723, |
|
"grad_norm": 2.4664876461029053, |
|
"learning_rate": 1.9583126711224336e-05, |
|
"loss": 0.2398, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 3.5375494071146245, |
|
"grad_norm": 2.0479726791381836, |
|
"learning_rate": 1.9329323564781675e-05, |
|
"loss": 0.2291, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 3.5474308300395254, |
|
"grad_norm": 1.9972285032272339, |
|
"learning_rate": 1.907678116971842e-05, |
|
"loss": 0.2287, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 3.5573122529644268, |
|
"grad_norm": 2.7339351177215576, |
|
"learning_rate": 1.882550990706332e-05, |
|
"loss": 0.2329, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 3.5573122529644268, |
|
"eval_loss": 0.39887359738349915, |
|
"eval_runtime": 34.1894, |
|
"eval_samples_per_second": 14.624, |
|
"eval_steps_per_second": 14.624, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 3.567193675889328, |
|
"grad_norm": 2.279087781906128, |
|
"learning_rate": 1.8575520105593814e-05, |
|
"loss": 0.2165, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 3.5770750988142295, |
|
"grad_norm": 2.1736888885498047, |
|
"learning_rate": 1.8326822041411518e-05, |
|
"loss": 0.2408, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 3.5869565217391304, |
|
"grad_norm": 2.6511847972869873, |
|
"learning_rate": 1.8079425937519722e-05, |
|
"loss": 0.2075, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 3.5968379446640317, |
|
"grad_norm": 2.688720226287842, |
|
"learning_rate": 1.7833341963403307e-05, |
|
"loss": 0.2257, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 3.6067193675889326, |
|
"grad_norm": 2.398272752761841, |
|
"learning_rate": 1.7588580234610588e-05, |
|
"loss": 0.2248, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 3.616600790513834, |
|
"grad_norm": 2.060293197631836, |
|
"learning_rate": 1.7345150812337557e-05, |
|
"loss": 0.2491, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 3.6264822134387353, |
|
"grad_norm": 2.143730401992798, |
|
"learning_rate": 1.7103063703014366e-05, |
|
"loss": 0.2194, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 3.6363636363636362, |
|
"grad_norm": 2.20381760597229, |
|
"learning_rate": 1.686232885789385e-05, |
|
"loss": 0.2299, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 3.6462450592885376, |
|
"grad_norm": 2.0883138179779053, |
|
"learning_rate": 1.6622956172642597e-05, |
|
"loss": 0.2377, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 3.6561264822134385, |
|
"grad_norm": 2.045193672180176, |
|
"learning_rate": 1.6384955486934152e-05, |
|
"loss": 0.2105, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 3.66600790513834, |
|
"grad_norm": 1.9251530170440674, |
|
"learning_rate": 1.6148336584044533e-05, |
|
"loss": 0.2149, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 3.675889328063241, |
|
"grad_norm": 2.162121534347534, |
|
"learning_rate": 1.591310919045003e-05, |
|
"loss": 0.2359, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 3.6857707509881426, |
|
"grad_norm": 2.423417329788208, |
|
"learning_rate": 1.5679282975427484e-05, |
|
"loss": 0.2305, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 3.6956521739130435, |
|
"grad_norm": 2.2101898193359375, |
|
"learning_rate": 1.5446867550656765e-05, |
|
"loss": 0.2174, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 3.705533596837945, |
|
"grad_norm": 2.565885066986084, |
|
"learning_rate": 1.5215872469825677e-05, |
|
"loss": 0.2184, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 3.705533596837945, |
|
"eval_loss": 0.39941611886024475, |
|
"eval_runtime": 34.6521, |
|
"eval_samples_per_second": 14.429, |
|
"eval_steps_per_second": 14.429, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 3.7154150197628457, |
|
"grad_norm": 2.2909016609191895, |
|
"learning_rate": 1.4986307228237263e-05, |
|
"loss": 0.2328, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 3.725296442687747, |
|
"grad_norm": 2.3509652614593506, |
|
"learning_rate": 1.475818126241942e-05, |
|
"loss": 0.2126, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 3.7351778656126484, |
|
"grad_norm": 2.112107515335083, |
|
"learning_rate": 1.4531503949737103e-05, |
|
"loss": 0.2093, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 3.7450592885375493, |
|
"grad_norm": 2.65423846244812, |
|
"learning_rate": 1.4306284608006833e-05, |
|
"loss": 0.2398, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 3.7549407114624507, |
|
"grad_norm": 2.521843194961548, |
|
"learning_rate": 1.4082532495113623e-05, |
|
"loss": 0.2215, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 3.7648221343873516, |
|
"grad_norm": 1.9202762842178345, |
|
"learning_rate": 1.3860256808630425e-05, |
|
"loss": 0.2201, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 3.774703557312253, |
|
"grad_norm": 2.7013628482818604, |
|
"learning_rate": 1.3639466685440129e-05, |
|
"loss": 0.2446, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 3.7845849802371543, |
|
"grad_norm": 1.9625245332717896, |
|
"learning_rate": 1.3420171201359928e-05, |
|
"loss": 0.2197, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 3.794466403162055, |
|
"grad_norm": 2.7485172748565674, |
|
"learning_rate": 1.3202379370768249e-05, |
|
"loss": 0.222, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 3.8043478260869565, |
|
"grad_norm": 2.577033281326294, |
|
"learning_rate": 1.2986100146234227e-05, |
|
"loss": 0.2217, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 3.8142292490118574, |
|
"grad_norm": 2.4234845638275146, |
|
"learning_rate": 1.2771342418149653e-05, |
|
"loss": 0.2205, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 3.824110671936759, |
|
"grad_norm": 2.127350330352783, |
|
"learning_rate": 1.2558115014363589e-05, |
|
"loss": 0.2076, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 3.83399209486166, |
|
"grad_norm": 2.254338502883911, |
|
"learning_rate": 1.2346426699819455e-05, |
|
"loss": 0.2408, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 3.8438735177865615, |
|
"grad_norm": 2.1729793548583984, |
|
"learning_rate": 1.2136286176194741e-05, |
|
"loss": 0.2253, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 3.8537549407114624, |
|
"grad_norm": 2.07226824760437, |
|
"learning_rate": 1.1927702081543275e-05, |
|
"loss": 0.2332, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 3.8537549407114624, |
|
"eval_loss": 0.39704427123069763, |
|
"eval_runtime": 34.4753, |
|
"eval_samples_per_second": 14.503, |
|
"eval_steps_per_second": 14.503, |
|
"step": 3900 |
|
} |
|
  ],
  "logging_steps": 10,
  "max_steps": 5000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 150,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.436859427356672e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}