{ "best_metric": 0.3866276741027832, "best_model_checkpoint": "./output/checkpoint-3000", "epoch": 3.8537549407114624, "eval_steps": 150, "global_step": 3900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009881422924901186, "grad_norm": 2.4057295322418213, "learning_rate": 9.999999999999999e-06, "loss": 0.527, "step": 10 }, { "epoch": 0.019762845849802372, "grad_norm": 2.7988038063049316, "learning_rate": 1.9999999999999998e-05, "loss": 0.4789, "step": 20 }, { "epoch": 0.029644268774703556, "grad_norm": 3.1874520778656006, "learning_rate": 2.999999999999999e-05, "loss": 0.5495, "step": 30 }, { "epoch": 0.039525691699604744, "grad_norm": 3.206881046295166, "learning_rate": 3.9999999999999996e-05, "loss": 0.5418, "step": 40 }, { "epoch": 0.04940711462450593, "grad_norm": 2.863534927368164, "learning_rate": 4.999999999999999e-05, "loss": 0.5065, "step": 50 }, { "epoch": 0.05928853754940711, "grad_norm": 2.8917012214660645, "learning_rate": 5.999999999999998e-05, "loss": 0.5262, "step": 60 }, { "epoch": 0.0691699604743083, "grad_norm": 2.8896090984344482, "learning_rate": 6.999999999999998e-05, "loss": 0.5294, "step": 70 }, { "epoch": 0.07905138339920949, "grad_norm": 3.138671875, "learning_rate": 7.999999999999999e-05, "loss": 0.4787, "step": 80 }, { "epoch": 0.08893280632411067, "grad_norm": 2.513195753097534, "learning_rate": 8.999999999999998e-05, "loss": 0.4873, "step": 90 }, { "epoch": 0.09881422924901186, "grad_norm": 2.762770414352417, "learning_rate": 9.999999999999998e-05, "loss": 0.5069, "step": 100 }, { "epoch": 0.10869565217391304, "grad_norm": 2.255554437637329, "learning_rate": 9.999897234791827e-05, "loss": 0.4713, "step": 110 }, { "epoch": 0.11857707509881422, "grad_norm": 2.8342161178588867, "learning_rate": 9.999588943391594e-05, "loss": 0.5146, "step": 120 }, { "epoch": 0.12845849802371542, "grad_norm": 2.1564338207244873, "learning_rate": 9.999075138471948e-05, "loss": 0.4976, "step": 130 }, { "epoch": 0.1383399209486166, "grad_norm": 2.606574773788452, "learning_rate": 9.998355841153397e-05, "loss": 0.5166, "step": 140 }, { "epoch": 0.1482213438735178, "grad_norm": 2.285940647125244, "learning_rate": 9.997431081003437e-05, "loss": 0.5137, "step": 150 }, { "epoch": 0.1482213438735178, "eval_loss": 0.5139818787574768, "eval_runtime": 35.3761, "eval_samples_per_second": 14.134, "eval_steps_per_second": 14.134, "step": 150 }, { "epoch": 0.15810276679841898, "grad_norm": 2.5471882820129395, "learning_rate": 9.996300896035337e-05, "loss": 0.5392, "step": 160 }, { "epoch": 0.16798418972332016, "grad_norm": 3.4715616703033447, "learning_rate": 9.994965332706571e-05, "loss": 0.517, "step": 170 }, { "epoch": 0.17786561264822134, "grad_norm": 2.321061134338379, "learning_rate": 9.99342444591692e-05, "loss": 0.4852, "step": 180 }, { "epoch": 0.18774703557312253, "grad_norm": 2.590848922729492, "learning_rate": 9.991678299006203e-05, "loss": 0.5101, "step": 190 }, { "epoch": 0.1976284584980237, "grad_norm": 2.412264823913574, "learning_rate": 9.989726963751679e-05, "loss": 0.5099, "step": 200 }, { "epoch": 0.2075098814229249, "grad_norm": 2.491581678390503, "learning_rate": 9.987570520365101e-05, "loss": 0.5057, "step": 210 }, { "epoch": 0.21739130434782608, "grad_norm": 2.79880690574646, "learning_rate": 9.985209057489407e-05, "loss": 0.4955, "step": 220 }, { "epoch": 0.22727272727272727, "grad_norm": 3.0157032012939453, "learning_rate": 9.98264267219509e-05, "loss": 0.4766, "step": 230 }, { "epoch": 0.23715415019762845, "grad_norm": 2.266268491744995, "learning_rate": 9.979871469976193e-05, "loss": 0.4975, "step": 240 }, { "epoch": 0.24703557312252963, "grad_norm": 2.4801745414733887, "learning_rate": 9.976895564745989e-05, "loss": 0.5068, "step": 250 }, { "epoch": 0.25691699604743085, "grad_norm": 2.9422342777252197, "learning_rate": 9.973715078832285e-05, "loss": 0.4792, "step": 260 }, { "epoch": 0.26679841897233203, "grad_norm": 2.364716053009033, "learning_rate": 9.970330142972399e-05, "loss": 0.475, "step": 270 }, { "epoch": 0.2766798418972332, "grad_norm": 2.0679702758789062, "learning_rate": 9.966740896307789e-05, "loss": 0.4818, "step": 280 }, { "epoch": 0.2865612648221344, "grad_norm": 2.8333821296691895, "learning_rate": 9.962947486378323e-05, "loss": 0.489, "step": 290 }, { "epoch": 0.2964426877470356, "grad_norm": 2.5552380084991455, "learning_rate": 9.958950069116228e-05, "loss": 0.4921, "step": 300 }, { "epoch": 0.2964426877470356, "eval_loss": 0.4803454577922821, "eval_runtime": 35.2266, "eval_samples_per_second": 14.194, "eval_steps_per_second": 14.194, "step": 300 }, { "epoch": 0.30632411067193677, "grad_norm": 2.617976188659668, "learning_rate": 9.954748808839671e-05, "loss": 0.5139, "step": 310 }, { "epoch": 0.31620553359683795, "grad_norm": 2.61501407623291, "learning_rate": 9.950343878246007e-05, "loss": 0.4642, "step": 320 }, { "epoch": 0.32608695652173914, "grad_norm": 2.3816354274749756, "learning_rate": 9.945735458404678e-05, "loss": 0.522, "step": 330 }, { "epoch": 0.3359683794466403, "grad_norm": 2.5774052143096924, "learning_rate": 9.940923738749776e-05, "loss": 0.505, "step": 340 }, { "epoch": 0.3458498023715415, "grad_norm": 2.245945692062378, "learning_rate": 9.935908917072249e-05, "loss": 0.4801, "step": 350 }, { "epoch": 0.3557312252964427, "grad_norm": 1.9169942140579224, "learning_rate": 9.930691199511772e-05, "loss": 0.4414, "step": 360 }, { "epoch": 0.36561264822134387, "grad_norm": 2.1389174461364746, "learning_rate": 9.925270800548282e-05, "loss": 0.4659, "step": 370 }, { "epoch": 0.37549407114624506, "grad_norm": 2.4838740825653076, "learning_rate": 9.919647942993145e-05, "loss": 0.4594, "step": 380 }, { "epoch": 0.38537549407114624, "grad_norm": 2.629211187362671, "learning_rate": 9.913822857980017e-05, "loss": 0.492, "step": 390 }, { "epoch": 0.3952569169960474, "grad_norm": 2.387241840362549, "learning_rate": 9.907795784955324e-05, "loss": 0.4556, "step": 400 }, { "epoch": 0.4051383399209486, "grad_norm": 2.450533390045166, "learning_rate": 9.901566971668434e-05, "loss": 0.4653, "step": 410 }, { "epoch": 0.4150197628458498, "grad_norm": 2.464107036590576, "learning_rate": 9.895136674161462e-05, "loss": 0.4955, "step": 420 }, { "epoch": 0.424901185770751, "grad_norm": 2.31066632270813, "learning_rate": 9.888505156758756e-05, "loss": 0.4623, "step": 430 }, { "epoch": 0.43478260869565216, "grad_norm": 2.158092975616455, "learning_rate": 9.881672692056019e-05, "loss": 0.4692, "step": 440 }, { "epoch": 0.44466403162055335, "grad_norm": 1.851882815361023, "learning_rate": 9.874639560909115e-05, "loss": 0.487, "step": 450 }, { "epoch": 0.44466403162055335, "eval_loss": 0.46247246861457825, "eval_runtime": 34.6043, "eval_samples_per_second": 14.449, "eval_steps_per_second": 14.449, "step": 450 }, { "epoch": 0.45454545454545453, "grad_norm": 2.608232021331787, "learning_rate": 9.867406052422521e-05, "loss": 0.4742, "step": 460 }, { "epoch": 0.4644268774703557, "grad_norm": 2.2287890911102295, "learning_rate": 9.859972463937438e-05, "loss": 0.4611, "step": 470 }, { "epoch": 0.4743083003952569, "grad_norm": 2.5854885578155518, "learning_rate": 9.852339101019572e-05, "loss": 0.4576, "step": 480 }, { "epoch": 0.4841897233201581, "grad_norm": 2.134244203567505, "learning_rate": 9.844506277446574e-05, "loss": 0.4824, "step": 490 }, { "epoch": 0.49407114624505927, "grad_norm": 2.4558589458465576, "learning_rate": 9.836474315195144e-05, "loss": 0.4497, "step": 500 }, { "epoch": 0.5039525691699605, "grad_norm": 2.9862284660339355, "learning_rate": 9.828243544427793e-05, "loss": 0.4632, "step": 510 }, { "epoch": 0.5138339920948617, "grad_norm": 2.5558395385742188, "learning_rate": 9.819814303479264e-05, "loss": 0.4486, "step": 520 }, { "epoch": 0.5237154150197628, "grad_norm": 1.770102620124817, "learning_rate": 9.811186938842643e-05, "loss": 0.4476, "step": 530 }, { "epoch": 0.5335968379446641, "grad_norm": 3.525803565979004, "learning_rate": 9.802361805155094e-05, "loss": 0.4643, "step": 540 }, { "epoch": 0.5434782608695652, "grad_norm": 2.2628469467163086, "learning_rate": 9.7933392651833e-05, "loss": 0.4731, "step": 550 }, { "epoch": 0.5533596837944664, "grad_norm": 3.0324666500091553, "learning_rate": 9.784119689808542e-05, "loss": 0.4478, "step": 560 }, { "epoch": 0.5632411067193676, "grad_norm": 2.263258218765259, "learning_rate": 9.77470345801145e-05, "loss": 0.4497, "step": 570 }, { "epoch": 0.5731225296442688, "grad_norm": 2.645112991333008, "learning_rate": 9.765090956856434e-05, "loss": 0.4666, "step": 580 }, { "epoch": 0.5830039525691699, "grad_norm": 2.0175938606262207, "learning_rate": 9.755282581475766e-05, "loss": 0.47, "step": 590 }, { "epoch": 0.5928853754940712, "grad_norm": 2.175175666809082, "learning_rate": 9.74527873505334e-05, "loss": 0.4592, "step": 600 }, { "epoch": 0.5928853754940712, "eval_loss": 0.44894906878471375, "eval_runtime": 34.7044, "eval_samples_per_second": 14.407, "eval_steps_per_second": 14.407, "step": 600 }, { "epoch": 0.6027667984189723, "grad_norm": 2.4829459190368652, "learning_rate": 9.735079828808105e-05, "loss": 0.4453, "step": 610 }, { "epoch": 0.6126482213438735, "grad_norm": 2.2683026790618896, "learning_rate": 9.724686281977144e-05, "loss": 0.4701, "step": 620 }, { "epoch": 0.6225296442687747, "grad_norm": 2.6205978393554688, "learning_rate": 9.714098521798462e-05, "loss": 0.4581, "step": 630 }, { "epoch": 0.6324110671936759, "grad_norm": 2.5154802799224854, "learning_rate": 9.703316983493411e-05, "loss": 0.4372, "step": 640 }, { "epoch": 0.642292490118577, "grad_norm": 2.478700637817383, "learning_rate": 9.6923421102488e-05, "loss": 0.4554, "step": 650 }, { "epoch": 0.6521739130434783, "grad_norm": 2.286890745162964, "learning_rate": 9.681174353198684e-05, "loss": 0.474, "step": 660 }, { "epoch": 0.6620553359683794, "grad_norm": 2.0446231365203857, "learning_rate": 9.669814171405813e-05, "loss": 0.4347, "step": 670 }, { "epoch": 0.6719367588932806, "grad_norm": 2.571877956390381, "learning_rate": 9.658262031842768e-05, "loss": 0.4355, "step": 680 }, { "epoch": 0.6818181818181818, "grad_norm": 2.160186290740967, "learning_rate": 9.646518409372757e-05, "loss": 0.4354, "step": 690 }, { "epoch": 0.691699604743083, "grad_norm": 2.0123462677001953, "learning_rate": 9.634583786730107e-05, "loss": 0.4619, "step": 700 }, { "epoch": 0.7015810276679841, "grad_norm": 2.4318065643310547, "learning_rate": 9.622458654500406e-05, "loss": 0.4795, "step": 710 }, { "epoch": 0.7114624505928854, "grad_norm": 2.1146297454833984, "learning_rate": 9.610143511100351e-05, "loss": 0.442, "step": 720 }, { "epoch": 0.7213438735177866, "grad_norm": 2.312072992324829, "learning_rate": 9.597638862757252e-05, "loss": 0.4339, "step": 730 }, { "epoch": 0.7312252964426877, "grad_norm": 2.0367119312286377, "learning_rate": 9.584945223488224e-05, "loss": 0.4519, "step": 740 }, { "epoch": 0.741106719367589, "grad_norm": 2.8992955684661865, "learning_rate": 9.57206311507906e-05, "loss": 0.4431, "step": 750 }, { "epoch": 0.741106719367589, "eval_loss": 0.4378005266189575, "eval_runtime": 34.0956, "eval_samples_per_second": 14.665, "eval_steps_per_second": 14.665, "step": 750 }, { "epoch": 0.7509881422924901, "grad_norm": 2.3162925243377686, "learning_rate": 9.558993067062783e-05, "loss": 0.4426, "step": 760 }, { "epoch": 0.7608695652173914, "grad_norm": 1.9839439392089844, "learning_rate": 9.545735616697873e-05, "loss": 0.46, "step": 770 }, { "epoch": 0.7707509881422925, "grad_norm": 2.18251633644104, "learning_rate": 9.532291308946188e-05, "loss": 0.4254, "step": 780 }, { "epoch": 0.7806324110671937, "grad_norm": 2.241259813308716, "learning_rate": 9.518660696450565e-05, "loss": 0.4246, "step": 790 }, { "epoch": 0.7905138339920948, "grad_norm": 2.357609272003174, "learning_rate": 9.504844339512093e-05, "loss": 0.4497, "step": 800 }, { "epoch": 0.8003952569169961, "grad_norm": 2.2541675567626953, "learning_rate": 9.490842806067093e-05, "loss": 0.4605, "step": 810 }, { "epoch": 0.8102766798418972, "grad_norm": 2.1015920639038086, "learning_rate": 9.476656671663764e-05, "loss": 0.4227, "step": 820 }, { "epoch": 0.8201581027667985, "grad_norm": 2.2886059284210205, "learning_rate": 9.462286519438528e-05, "loss": 0.4385, "step": 830 }, { "epoch": 0.8300395256916996, "grad_norm": 2.2543296813964844, "learning_rate": 9.447732940092057e-05, "loss": 0.433, "step": 840 }, { "epoch": 0.8399209486166008, "grad_norm": 2.2577757835388184, "learning_rate": 9.432996531864999e-05, "loss": 0.4634, "step": 850 }, { "epoch": 0.849802371541502, "grad_norm": 1.795832872390747, "learning_rate": 9.418077900513374e-05, "loss": 0.4068, "step": 860 }, { "epoch": 0.8596837944664032, "grad_norm": 1.9893933534622192, "learning_rate": 9.402977659283688e-05, "loss": 0.4527, "step": 870 }, { "epoch": 0.8695652173913043, "grad_norm": 2.5203518867492676, "learning_rate": 9.387696428887713e-05, "loss": 0.4591, "step": 880 }, { "epoch": 0.8794466403162056, "grad_norm": 2.42069673538208, "learning_rate": 9.372234837476975e-05, "loss": 0.4597, "step": 890 }, { "epoch": 0.8893280632411067, "grad_norm": 2.087778329849243, "learning_rate": 9.356593520616945e-05, "loss": 0.4226, "step": 900 }, { "epoch": 0.8893280632411067, "eval_loss": 0.42734819650650024, "eval_runtime": 34.3301, "eval_samples_per_second": 14.564, "eval_steps_per_second": 14.564, "step": 900 }, { "epoch": 0.8992094861660079, "grad_norm": 2.4652795791625977, "learning_rate": 9.34077312126089e-05, "loss": 0.4261, "step": 910 }, { "epoch": 0.9090909090909091, "grad_norm": 2.0327532291412354, "learning_rate": 9.324774289723465e-05, "loss": 0.4674, "step": 920 }, { "epoch": 0.9189723320158103, "grad_norm": 2.3021750450134277, "learning_rate": 9.308597683653974e-05, "loss": 0.4521, "step": 930 }, { "epoch": 0.9288537549407114, "grad_norm": 6.520279884338379, "learning_rate": 9.292243968009328e-05, "loss": 0.4443, "step": 940 }, { "epoch": 0.9387351778656127, "grad_norm": 2.0640597343444824, "learning_rate": 9.27571381502673e-05, "loss": 0.416, "step": 950 }, { "epoch": 0.9486166007905138, "grad_norm": 2.280644416809082, "learning_rate": 9.25900790419602e-05, "loss": 0.4331, "step": 960 }, { "epoch": 0.958498023715415, "grad_norm": 2.3445639610290527, "learning_rate": 9.24212692223176e-05, "loss": 0.4512, "step": 970 }, { "epoch": 0.9683794466403162, "grad_norm": 2.072683334350586, "learning_rate": 9.225071563045005e-05, "loss": 0.3967, "step": 980 }, { "epoch": 0.9782608695652174, "grad_norm": 1.9060055017471313, "learning_rate": 9.207842527714764e-05, "loss": 0.4102, "step": 990 }, { "epoch": 0.9881422924901185, "grad_norm": 2.248657464981079, "learning_rate": 9.1904405244592e-05, "loss": 0.4505, "step": 1000 }, { "epoch": 0.9980237154150198, "grad_norm": 2.048110008239746, "learning_rate": 9.172866268606511e-05, "loss": 0.4102, "step": 1010 }, { "epoch": 1.007905138339921, "grad_norm": 1.9891077280044556, "learning_rate": 9.155120482565518e-05, "loss": 0.3866, "step": 1020 }, { "epoch": 1.017786561264822, "grad_norm": 2.499363422393799, "learning_rate": 9.13720389579598e-05, "loss": 0.3584, "step": 1030 }, { "epoch": 1.0276679841897234, "grad_norm": 2.4077465534210205, "learning_rate": 9.119117244778605e-05, "loss": 0.3736, "step": 1040 }, { "epoch": 1.0375494071146245, "grad_norm": 2.0941267013549805, "learning_rate": 9.100861272984777e-05, "loss": 0.3769, "step": 1050 }, { "epoch": 1.0375494071146245, "eval_loss": 0.4222135841846466, "eval_runtime": 34.1626, "eval_samples_per_second": 14.636, "eval_steps_per_second": 14.636, "step": 1050 }, { "epoch": 1.0474308300395256, "grad_norm": 2.29099702835083, "learning_rate": 9.082436730845992e-05, "loss": 0.3545, "step": 1060 }, { "epoch": 1.0573122529644268, "grad_norm": 2.5847902297973633, "learning_rate": 9.063844375723012e-05, "loss": 0.3658, "step": 1070 }, { "epoch": 1.0671936758893281, "grad_norm": 1.9889037609100342, "learning_rate": 9.045084971874735e-05, "loss": 0.3662, "step": 1080 }, { "epoch": 1.0770750988142292, "grad_norm": 2.0356063842773438, "learning_rate": 9.026159290426779e-05, "loss": 0.3952, "step": 1090 }, { "epoch": 1.0869565217391304, "grad_norm": 1.95900559425354, "learning_rate": 9.007068109339781e-05, "loss": 0.3624, "step": 1100 }, { "epoch": 1.0968379446640317, "grad_norm": 2.0315041542053223, "learning_rate": 8.987812213377421e-05, "loss": 0.355, "step": 1110 }, { "epoch": 1.1067193675889329, "grad_norm": 1.9098906517028809, "learning_rate": 8.968392394074161e-05, "loss": 0.3396, "step": 1120 }, { "epoch": 1.116600790513834, "grad_norm": 2.3436784744262695, "learning_rate": 8.94880944970271e-05, "loss": 0.3433, "step": 1130 }, { "epoch": 1.1264822134387351, "grad_norm": 2.013385534286499, "learning_rate": 8.92906418524121e-05, "loss": 0.3815, "step": 1140 }, { "epoch": 1.1363636363636362, "grad_norm": 2.3570964336395264, "learning_rate": 8.909157412340148e-05, "loss": 0.3825, "step": 1150 }, { "epoch": 1.1462450592885376, "grad_norm": 2.0097525119781494, "learning_rate": 8.889089949288984e-05, "loss": 0.3788, "step": 1160 }, { "epoch": 1.1561264822134387, "grad_norm": 1.8614075183868408, "learning_rate": 8.868862620982532e-05, "loss": 0.3434, "step": 1170 }, { "epoch": 1.1660079051383399, "grad_norm": 2.3193359375, "learning_rate": 8.848476258887028e-05, "loss": 0.3652, "step": 1180 }, { "epoch": 1.1758893280632412, "grad_norm": 2.1564888954162598, "learning_rate": 8.827931701005971e-05, "loss": 0.3604, "step": 1190 }, { "epoch": 1.1857707509881423, "grad_norm": 2.278334856033325, "learning_rate": 8.80722979184567e-05, "loss": 0.351, "step": 1200 }, { "epoch": 1.1857707509881423, "eval_loss": 0.4201338589191437, "eval_runtime": 34.2095, "eval_samples_per_second": 14.616, "eval_steps_per_second": 14.616, "step": 1200 }, { "epoch": 1.1956521739130435, "grad_norm": 2.3817718029022217, "learning_rate": 8.786371382380525e-05, "loss": 0.3681, "step": 1210 }, { "epoch": 1.2055335968379446, "grad_norm": 2.221449613571167, "learning_rate": 8.765357330018053e-05, "loss": 0.396, "step": 1220 }, { "epoch": 1.215415019762846, "grad_norm": 1.9129923582077026, "learning_rate": 8.744188498563639e-05, "loss": 0.3861, "step": 1230 }, { "epoch": 1.225296442687747, "grad_norm": 2.0991668701171875, "learning_rate": 8.722865758185034e-05, "loss": 0.373, "step": 1240 }, { "epoch": 1.2351778656126482, "grad_norm": 1.9412460327148438, "learning_rate": 8.701389985376575e-05, "loss": 0.3592, "step": 1250 }, { "epoch": 1.2450592885375493, "grad_norm": 2.6546976566314697, "learning_rate": 8.679762062923174e-05, "loss": 0.3871, "step": 1260 }, { "epoch": 1.2549407114624507, "grad_norm": 2.3372902870178223, "learning_rate": 8.657982879864005e-05, "loss": 0.3776, "step": 1270 }, { "epoch": 1.2648221343873518, "grad_norm": 1.9796963930130005, "learning_rate": 8.636053331455984e-05, "loss": 0.377, "step": 1280 }, { "epoch": 1.274703557312253, "grad_norm": 2.1785104274749756, "learning_rate": 8.613974319136955e-05, "loss": 0.3942, "step": 1290 }, { "epoch": 1.2845849802371543, "grad_norm": 1.8092831373214722, "learning_rate": 8.591746750488636e-05, "loss": 0.367, "step": 1300 }, { "epoch": 1.2944664031620554, "grad_norm": 2.4670629501342773, "learning_rate": 8.569371539199313e-05, "loss": 0.3744, "step": 1310 }, { "epoch": 1.3043478260869565, "grad_norm": 2.104426383972168, "learning_rate": 8.546849605026287e-05, "loss": 0.3801, "step": 1320 }, { "epoch": 1.3142292490118577, "grad_norm": 2.5602879524230957, "learning_rate": 8.524181873758057e-05, "loss": 0.352, "step": 1330 }, { "epoch": 1.3241106719367588, "grad_norm": 2.211514949798584, "learning_rate": 8.501369277176273e-05, "loss": 0.3643, "step": 1340 }, { "epoch": 1.3339920948616601, "grad_norm": 2.312812566757202, "learning_rate": 8.478412753017431e-05, "loss": 0.3681, "step": 1350 }, { "epoch": 1.3339920948616601, "eval_loss": 0.417085200548172, "eval_runtime": 34.1379, "eval_samples_per_second": 14.646, "eval_steps_per_second": 14.646, "step": 1350 }, { "epoch": 1.3438735177865613, "grad_norm": 2.7497165203094482, "learning_rate": 8.455313244934322e-05, "loss": 0.3739, "step": 1360 }, { "epoch": 1.3537549407114624, "grad_norm": 2.3112716674804688, "learning_rate": 8.432071702457251e-05, "loss": 0.367, "step": 1370 }, { "epoch": 1.3636363636363638, "grad_norm": 2.03934383392334, "learning_rate": 8.408689080954995e-05, "loss": 0.3506, "step": 1380 }, { "epoch": 1.3735177865612649, "grad_norm": 1.942353367805481, "learning_rate": 8.385166341595547e-05, "loss": 0.3651, "step": 1390 }, { "epoch": 1.383399209486166, "grad_norm": 1.985518455505371, "learning_rate": 8.361504451306582e-05, "loss": 0.3593, "step": 1400 }, { "epoch": 1.3932806324110671, "grad_norm": 2.244945526123047, "learning_rate": 8.337704382735738e-05, "loss": 0.3616, "step": 1410 }, { "epoch": 1.4031620553359683, "grad_norm": 2.084362268447876, "learning_rate": 8.313767114210614e-05, "loss": 0.3725, "step": 1420 }, { "epoch": 1.4130434782608696, "grad_norm": 2.0909502506256104, "learning_rate": 8.289693629698562e-05, "loss": 0.3708, "step": 1430 }, { "epoch": 1.4229249011857708, "grad_norm": 2.526142120361328, "learning_rate": 8.265484918766241e-05, "loss": 0.3688, "step": 1440 }, { "epoch": 1.4328063241106719, "grad_norm": 1.7779805660247803, "learning_rate": 8.241141976538941e-05, "loss": 0.3628, "step": 1450 }, { "epoch": 1.4426877470355732, "grad_norm": 2.2417075634002686, "learning_rate": 8.216665803659669e-05, "loss": 0.3539, "step": 1460 }, { "epoch": 1.4525691699604744, "grad_norm": 2.3137755393981934, "learning_rate": 8.192057406248027e-05, "loss": 0.3526, "step": 1470 }, { "epoch": 1.4624505928853755, "grad_norm": 2.308361768722534, "learning_rate": 8.167317795858849e-05, "loss": 0.364, "step": 1480 }, { "epoch": 1.4723320158102766, "grad_norm": 2.1137235164642334, "learning_rate": 8.142447989440615e-05, "loss": 0.3725, "step": 1490 }, { "epoch": 1.4822134387351777, "grad_norm": 2.206882953643799, "learning_rate": 8.117449009293666e-05, "loss": 0.38, "step": 1500 }, { "epoch": 1.4822134387351777, "eval_loss": 0.41128015518188477, "eval_runtime": 34.0412, "eval_samples_per_second": 14.688, "eval_steps_per_second": 14.688, "step": 1500 }, { "epoch": 1.492094861660079, "grad_norm": 2.540431499481201, "learning_rate": 8.092321883028156e-05, "loss": 0.3703, "step": 1510 }, { "epoch": 1.5019762845849802, "grad_norm": 2.1057121753692627, "learning_rate": 8.067067643521832e-05, "loss": 0.3797, "step": 1520 }, { "epoch": 1.5118577075098814, "grad_norm": 2.375397205352783, "learning_rate": 8.041687328877564e-05, "loss": 0.3657, "step": 1530 }, { "epoch": 1.5217391304347827, "grad_norm": 2.403914451599121, "learning_rate": 8.016181982380679e-05, "loss": 0.3807, "step": 1540 }, { "epoch": 1.5316205533596838, "grad_norm": 2.3958826065063477, "learning_rate": 7.990552652456078e-05, "loss": 0.3622, "step": 1550 }, { "epoch": 1.541501976284585, "grad_norm": 2.351919651031494, "learning_rate": 7.964800392625127e-05, "loss": 0.3762, "step": 1560 }, { "epoch": 1.5513833992094863, "grad_norm": 2.015793800354004, "learning_rate": 7.938926261462365e-05, "loss": 0.3536, "step": 1570 }, { "epoch": 1.5612648221343872, "grad_norm": 2.037121295928955, "learning_rate": 7.912931322551979e-05, "loss": 0.3718, "step": 1580 }, { "epoch": 1.5711462450592886, "grad_norm": 2.1762428283691406, "learning_rate": 7.886816644444096e-05, "loss": 0.3504, "step": 1590 }, { "epoch": 1.5810276679841897, "grad_norm": 1.8388617038726807, "learning_rate": 7.860583300610847e-05, "loss": 0.3431, "step": 1600 }, { "epoch": 1.5909090909090908, "grad_norm": 1.9121774435043335, "learning_rate": 7.834232369402248e-05, "loss": 0.3769, "step": 1610 }, { "epoch": 1.6007905138339922, "grad_norm": 2.1485304832458496, "learning_rate": 7.807764934001872e-05, "loss": 0.3361, "step": 1620 }, { "epoch": 1.6106719367588933, "grad_norm": 2.162116289138794, "learning_rate": 7.781182082382322e-05, "loss": 0.3747, "step": 1630 }, { "epoch": 1.6205533596837944, "grad_norm": 2.514573335647583, "learning_rate": 7.754484907260511e-05, "loss": 0.3857, "step": 1640 }, { "epoch": 1.6304347826086958, "grad_norm": 2.3473386764526367, "learning_rate": 7.727674506052742e-05, "loss": 0.3269, "step": 1650 }, { "epoch": 1.6304347826086958, "eval_loss": 0.4064118564128876, "eval_runtime": 34.0417, "eval_samples_per_second": 14.688, "eval_steps_per_second": 14.688, "step": 1650 }, { "epoch": 1.6403162055335967, "grad_norm": 2.0117132663726807, "learning_rate": 7.700751980829599e-05, "loss": 0.3662, "step": 1660 }, { "epoch": 1.650197628458498, "grad_norm": 2.5356202125549316, "learning_rate": 7.673718438270646e-05, "loss": 0.3671, "step": 1670 }, { "epoch": 1.6600790513833992, "grad_norm": 2.1220240592956543, "learning_rate": 7.646574989618936e-05, "loss": 0.3655, "step": 1680 }, { "epoch": 1.6699604743083003, "grad_norm": 2.049267292022705, "learning_rate": 7.619322750635325e-05, "loss": 0.3916, "step": 1690 }, { "epoch": 1.6798418972332017, "grad_norm": 2.0539910793304443, "learning_rate": 7.591962841552624e-05, "loss": 0.4168, "step": 1700 }, { "epoch": 1.6897233201581028, "grad_norm": 2.229034662246704, "learning_rate": 7.56449638702953e-05, "loss": 0.369, "step": 1710 }, { "epoch": 1.699604743083004, "grad_norm": 2.280418634414673, "learning_rate": 7.536924516104408e-05, "loss": 0.375, "step": 1720 }, { "epoch": 1.7094861660079053, "grad_norm": 1.9317281246185303, "learning_rate": 7.509248362148886e-05, "loss": 0.3602, "step": 1730 }, { "epoch": 1.7193675889328062, "grad_norm": 2.0074923038482666, "learning_rate": 7.481469062821249e-05, "loss": 0.3763, "step": 1740 }, { "epoch": 1.7292490118577075, "grad_norm": 2.6529626846313477, "learning_rate": 7.453587760019688e-05, "loss": 0.3867, "step": 1750 }, { "epoch": 1.7391304347826086, "grad_norm": 2.64829421043396, "learning_rate": 7.425605599835358e-05, "loss": 0.3459, "step": 1760 }, { "epoch": 1.7490118577075098, "grad_norm": 2.139469861984253, "learning_rate": 7.397523732505269e-05, "loss": 0.3763, "step": 1770 }, { "epoch": 1.7588932806324111, "grad_norm": 2.043088674545288, "learning_rate": 7.369343312364992e-05, "loss": 0.3313, "step": 1780 }, { "epoch": 1.7687747035573123, "grad_norm": 2.4256412982940674, "learning_rate": 7.341065497801227e-05, "loss": 0.3607, "step": 1790 }, { "epoch": 1.7786561264822134, "grad_norm": 2.4966022968292236, "learning_rate": 7.312691451204175e-05, "loss": 0.3413, "step": 1800 }, { "epoch": 1.7786561264822134, "eval_loss": 0.39886847138404846, "eval_runtime": 34.0089, "eval_samples_per_second": 14.702, "eval_steps_per_second": 14.702, "step": 1800 }, { "epoch": 1.7885375494071147, "grad_norm": 2.126098394393921, "learning_rate": 7.284222338919757e-05, "loss": 0.3505, "step": 1810 }, { "epoch": 1.7984189723320159, "grad_norm": 2.32716965675354, "learning_rate": 7.25565933120167e-05, "loss": 0.3706, "step": 1820 }, { "epoch": 1.808300395256917, "grad_norm": 2.4196839332580566, "learning_rate": 7.227003602163294e-05, "loss": 0.3672, "step": 1830 }, { "epoch": 1.8181818181818183, "grad_norm": 2.1417181491851807, "learning_rate": 7.19825632972941e-05, "loss": 0.3467, "step": 1840 }, { "epoch": 1.8280632411067192, "grad_norm": 1.9946470260620117, "learning_rate": 7.169418695587788e-05, "loss": 0.3639, "step": 1850 }, { "epoch": 1.8379446640316206, "grad_norm": 2.3900909423828125, "learning_rate": 7.140491885140627e-05, "loss": 0.354, "step": 1860 }, { "epoch": 1.8478260869565217, "grad_norm": 2.3250668048858643, "learning_rate": 7.111477087455798e-05, "loss": 0.3829, "step": 1870 }, { "epoch": 1.8577075098814229, "grad_norm": 2.3011209964752197, "learning_rate": 7.082375495217994e-05, "loss": 0.3567, "step": 1880 }, { "epoch": 1.8675889328063242, "grad_norm": 2.4620919227600098, "learning_rate": 7.053188304679689e-05, "loss": 0.3729, "step": 1890 }, { "epoch": 1.8774703557312253, "grad_norm": 1.9825767278671265, "learning_rate": 7.023916715611966e-05, "loss": 0.367, "step": 1900 }, { "epoch": 1.8873517786561265, "grad_norm": 2.1703319549560547, "learning_rate": 6.994561931255207e-05, "loss": 0.3818, "step": 1910 }, { "epoch": 1.8972332015810278, "grad_norm": 1.79076087474823, "learning_rate": 6.965125158269616e-05, "loss": 0.3553, "step": 1920 }, { "epoch": 1.9071146245059287, "grad_norm": 2.1293234825134277, "learning_rate": 6.935607606685639e-05, "loss": 0.3665, "step": 1930 }, { "epoch": 1.91699604743083, "grad_norm": 2.227125883102417, "learning_rate": 6.906010489854208e-05, "loss": 0.3753, "step": 1940 }, { "epoch": 1.9268774703557312, "grad_norm": 1.7864975929260254, "learning_rate": 6.87633502439687e-05, "loss": 0.3534, "step": 1950 }, { "epoch": 1.9268774703557312, "eval_loss": 0.3949296772480011, "eval_runtime": 34.1382, "eval_samples_per_second": 14.646, "eval_steps_per_second": 14.646, "step": 1950 }, { "epoch": 1.9367588932806323, "grad_norm": 1.8437656164169312, "learning_rate": 6.84658243015578e-05, "loss": 0.3605, "step": 1960 }, { "epoch": 1.9466403162055337, "grad_norm": 2.3836982250213623, "learning_rate": 6.816753930143555e-05, "loss": 0.3686, "step": 1970 }, { "epoch": 1.9565217391304348, "grad_norm": 2.2831881046295166, "learning_rate": 6.786850750493004e-05, "loss": 0.3655, "step": 1980 }, { "epoch": 1.966403162055336, "grad_norm": 2.3932294845581055, "learning_rate": 6.756874120406713e-05, "loss": 0.3802, "step": 1990 }, { "epoch": 1.9762845849802373, "grad_norm": 1.7772880792617798, "learning_rate": 6.726825272106537e-05, "loss": 0.3454, "step": 2000 }, { "epoch": 1.9861660079051382, "grad_norm": 1.9510533809661865, "learning_rate": 6.696705440782937e-05, "loss": 0.3789, "step": 2010 }, { "epoch": 1.9960474308300395, "grad_norm": 2.113067150115967, "learning_rate": 6.666515864544208e-05, "loss": 0.3718, "step": 2020 }, { "epoch": 2.005928853754941, "grad_norm": 2.007193088531494, "learning_rate": 6.636257784365583e-05, "loss": 0.2785, "step": 2030 }, { "epoch": 2.015810276679842, "grad_norm": 1.9907631874084473, "learning_rate": 6.605932444038227e-05, "loss": 0.284, "step": 2040 }, { "epoch": 2.025691699604743, "grad_norm": 2.3482143878936768, "learning_rate": 6.575541090118102e-05, "loss": 0.2744, "step": 2050 }, { "epoch": 2.035573122529644, "grad_norm": 2.2984330654144287, "learning_rate": 6.545084971874736e-05, "loss": 0.2763, "step": 2060 }, { "epoch": 2.0454545454545454, "grad_norm": 2.089308261871338, "learning_rate": 6.51456534123986e-05, "loss": 0.2993, "step": 2070 }, { "epoch": 2.0553359683794468, "grad_norm": 1.9980093240737915, "learning_rate": 6.483983452755952e-05, "loss": 0.295, "step": 2080 }, { "epoch": 2.0652173913043477, "grad_norm": 2.138206958770752, "learning_rate": 6.453340563524668e-05, "loss": 0.302, "step": 2090 }, { "epoch": 2.075098814229249, "grad_norm": 2.199354887008667, "learning_rate": 6.422637933155161e-05, "loss": 0.2791, "step": 2100 }, { "epoch": 2.075098814229249, "eval_loss": 0.4022028148174286, "eval_runtime": 34.24, "eval_samples_per_second": 14.603, "eval_steps_per_second": 14.603, "step": 2100 }, { "epoch": 2.0849802371541504, "grad_norm": 1.7849321365356445, "learning_rate": 6.391876823712316e-05, "loss": 0.2882, "step": 2110 }, { "epoch": 2.0948616600790513, "grad_norm": 2.316427230834961, "learning_rate": 6.361058499664854e-05, "loss": 0.2893, "step": 2120 }, { "epoch": 2.1047430830039526, "grad_norm": 2.092482328414917, "learning_rate": 6.330184227833374e-05, "loss": 0.2851, "step": 2130 }, { "epoch": 2.1146245059288535, "grad_norm": 2.0520172119140625, "learning_rate": 6.299255277338263e-05, "loss": 0.2893, "step": 2140 }, { "epoch": 2.124505928853755, "grad_norm": 2.0501887798309326, "learning_rate": 6.268272919547534e-05, "loss": 0.2877, "step": 2150 }, { "epoch": 2.1343873517786562, "grad_norm": 1.967375636100769, "learning_rate": 6.23723842802457e-05, "loss": 0.2838, "step": 2160 }, { "epoch": 2.144268774703557, "grad_norm": 2.331676959991455, "learning_rate": 6.20615307847576e-05, "loss": 0.3169, "step": 2170 }, { "epoch": 2.1541501976284585, "grad_norm": 2.251298666000366, "learning_rate": 6.175018148698074e-05, "loss": 0.2882, "step": 2180 }, { "epoch": 2.16403162055336, "grad_norm": 2.0839710235595703, "learning_rate": 6.143834918526526e-05, "loss": 0.2862, "step": 2190 }, { "epoch": 2.1739130434782608, "grad_norm": 2.633404493331909, "learning_rate": 6.112604669781571e-05, "loss": 0.2815, "step": 2200 }, { "epoch": 2.183794466403162, "grad_norm": 2.089813709259033, "learning_rate": 6.081328686216416e-05, "loss": 0.3046, "step": 2210 }, { "epoch": 2.1936758893280635, "grad_norm": 2.1596994400024414, "learning_rate": 6.050008253464245e-05, "loss": 0.2834, "step": 2220 }, { "epoch": 2.2035573122529644, "grad_norm": 1.879882574081421, "learning_rate": 6.018644658985377e-05, "loss": 0.2797, "step": 2230 }, { "epoch": 2.2134387351778657, "grad_norm": 2.6231043338775635, "learning_rate": 5.987239192014334e-05, "loss": 0.2761, "step": 2240 }, { "epoch": 2.2233201581027666, "grad_norm": 2.381791114807129, "learning_rate": 5.9557931435068606e-05, "loss": 0.3281, "step": 2250 }, { "epoch": 2.2233201581027666, "eval_loss": 0.40134918689727783, "eval_runtime": 34.1432, "eval_samples_per_second": 14.644, "eval_steps_per_second": 14.644, "step": 2250 }, { "epoch": 2.233201581027668, "grad_norm": 2.037327527999878, "learning_rate": 5.9243078060868426e-05, "loss": 0.2772, "step": 2260 }, { "epoch": 2.2430830039525693, "grad_norm": 2.2997586727142334, "learning_rate": 5.892784473993182e-05, "loss": 0.2905, "step": 2270 }, { "epoch": 2.2529644268774702, "grad_norm": 1.9744611978530884, "learning_rate": 5.861224443026593e-05, "loss": 0.2868, "step": 2280 }, { "epoch": 2.2628458498023716, "grad_norm": 2.116672992706299, "learning_rate": 5.8296290104963387e-05, "loss": 0.2858, "step": 2290 }, { "epoch": 2.2727272727272725, "grad_norm": 2.145845890045166, "learning_rate": 5.797999475166895e-05, "loss": 0.2752, "step": 2300 }, { "epoch": 2.282608695652174, "grad_norm": 2.2736833095550537, "learning_rate": 5.766337137204578e-05, "loss": 0.2984, "step": 2310 }, { "epoch": 2.292490118577075, "grad_norm": 2.218451499938965, "learning_rate": 5.734643298124089e-05, "loss": 0.2912, "step": 2320 }, { "epoch": 2.302371541501976, "grad_norm": 1.9524636268615723, "learning_rate": 5.702919260735013e-05, "loss": 0.2919, "step": 2330 }, { "epoch": 2.3122529644268774, "grad_norm": 2.35251784324646, "learning_rate": 5.671166329088276e-05, "loss": 0.3256, "step": 2340 }, { "epoch": 2.322134387351779, "grad_norm": 2.4238321781158447, "learning_rate": 5.639385808422529e-05, "loss": 0.3056, "step": 2350 }, { "epoch": 2.3320158102766797, "grad_norm": 2.408384084701538, "learning_rate": 5.607579005110501e-05, "loss": 0.2833, "step": 2360 }, { "epoch": 2.341897233201581, "grad_norm": 2.345621109008789, "learning_rate": 5.575747226605297e-05, "loss": 0.2961, "step": 2370 }, { "epoch": 2.3517786561264824, "grad_norm": 2.226508140563965, "learning_rate": 5.543891781386654e-05, "loss": 0.3138, "step": 2380 }, { "epoch": 2.3616600790513833, "grad_norm": 2.230583429336548, "learning_rate": 5.5120139789071554e-05, "loss": 0.2837, "step": 2390 }, { "epoch": 2.3715415019762847, "grad_norm": 2.449136972427368, "learning_rate": 5.480115129538408e-05, "loss": 0.2763, "step": 2400 }, { "epoch": 2.3715415019762847, "eval_loss": 0.399911105632782, "eval_runtime": 34.0722, "eval_samples_per_second": 14.675, "eval_steps_per_second": 14.675, "step": 2400 }, { "epoch": 2.3814229249011856, "grad_norm": 2.04034161567688, "learning_rate": 5.4481965445171666e-05, "loss": 0.2889, "step": 2410 }, { "epoch": 2.391304347826087, "grad_norm": 2.589332103729248, "learning_rate": 5.416259535891445e-05, "loss": 0.2882, "step": 2420 }, { "epoch": 2.4011857707509883, "grad_norm": 1.8965773582458496, "learning_rate": 5.384305416466583e-05, "loss": 0.2664, "step": 2430 }, { "epoch": 2.411067193675889, "grad_norm": 1.9925569295883179, "learning_rate": 5.3523354997512684e-05, "loss": 0.3079, "step": 2440 }, { "epoch": 2.4209486166007905, "grad_norm": 1.982309341430664, "learning_rate": 5.320351099903564e-05, "loss": 0.2893, "step": 2450 }, { "epoch": 2.430830039525692, "grad_norm": 2.4800615310668945, "learning_rate": 5.288353531676871e-05, "loss": 0.2576, "step": 2460 }, { "epoch": 2.440711462450593, "grad_norm": 2.2494561672210693, "learning_rate": 5.256344110365895e-05, "loss": 0.292, "step": 2470 }, { "epoch": 2.450592885375494, "grad_norm": 1.8020946979522705, "learning_rate": 5.224324151752574e-05, "loss": 0.2863, "step": 2480 }, { "epoch": 2.4604743083003955, "grad_norm": 2.187232494354248, "learning_rate": 5.192294972051991e-05, "loss": 0.2753, "step": 2490 }, { "epoch": 2.4703557312252964, "grad_norm": 2.2467732429504395, "learning_rate": 5.160257887858276e-05, "loss": 0.3188, "step": 2500 }, { "epoch": 2.4802371541501977, "grad_norm": 1.9890021085739136, "learning_rate": 5.128214216090477e-05, "loss": 0.2838, "step": 2510 }, { "epoch": 2.4901185770750986, "grad_norm": 1.962117075920105, "learning_rate": 5.096165273938434e-05, "loss": 0.2858, "step": 2520 }, { "epoch": 2.5, "grad_norm": 1.8473106622695923, "learning_rate": 5.064112378808635e-05, "loss": 0.2692, "step": 2530 }, { "epoch": 2.5098814229249014, "grad_norm": 1.9031504392623901, "learning_rate": 5.032056848270054e-05, "loss": 0.2993, "step": 2540 }, { "epoch": 2.5197628458498023, "grad_norm": 2.2502200603485107, "learning_rate": 4.999999999999999e-05, "loss": 0.2696, "step": 2550 }, { "epoch": 2.5197628458498023, "eval_loss": 0.39579418301582336, "eval_runtime": 34.1604, "eval_samples_per_second": 14.637, "eval_steps_per_second": 14.637, "step": 2550 }, { "epoch": 2.5296442687747036, "grad_norm": 2.3031277656555176, "learning_rate": 4.9679431517299435e-05, "loss": 0.3062, "step": 2560 }, { "epoch": 2.5395256916996045, "grad_norm": 2.183401107788086, "learning_rate": 4.9358876211913624e-05, "loss": 0.325, "step": 2570 }, { "epoch": 2.549407114624506, "grad_norm": 1.9418132305145264, "learning_rate": 4.9038347260615636e-05, "loss": 0.2874, "step": 2580 }, { "epoch": 2.559288537549407, "grad_norm": 2.340853214263916, "learning_rate": 4.871785783909522e-05, "loss": 0.2914, "step": 2590 }, { "epoch": 2.5691699604743086, "grad_norm": 1.8216912746429443, "learning_rate": 4.839742112141723e-05, "loss": 0.2935, "step": 2600 }, { "epoch": 2.5790513833992095, "grad_norm": 2.1227974891662598, "learning_rate": 4.807705027948006e-05, "loss": 0.2903, "step": 2610 }, { "epoch": 2.588932806324111, "grad_norm": 2.1689720153808594, "learning_rate": 4.775675848247426e-05, "loss": 0.2919, "step": 2620 }, { "epoch": 2.5988142292490117, "grad_norm": 2.3520572185516357, "learning_rate": 4.7436558896341037e-05, "loss": 0.2947, "step": 2630 }, { "epoch": 2.608695652173913, "grad_norm": 2.0316853523254395, "learning_rate": 4.711646468323127e-05, "loss": 0.2921, "step": 2640 }, { "epoch": 2.6185770750988144, "grad_norm": 2.334075450897217, "learning_rate": 4.6796489000964345e-05, "loss": 0.3109, "step": 2650 }, { "epoch": 2.6284584980237153, "grad_norm": 2.4072225093841553, "learning_rate": 4.6476645002487286e-05, "loss": 0.2886, "step": 2660 }, { "epoch": 2.6383399209486167, "grad_norm": 2.3423006534576416, "learning_rate": 4.615694583533417e-05, "loss": 0.3002, "step": 2670 }, { "epoch": 2.6482213438735176, "grad_norm": 2.290945291519165, "learning_rate": 4.5837404641085526e-05, "loss": 0.3013, "step": 2680 }, { "epoch": 2.658102766798419, "grad_norm": 2.3166189193725586, "learning_rate": 4.551803455482832e-05, "loss": 0.2855, "step": 2690 }, { "epoch": 2.6679841897233203, "grad_norm": 2.2672386169433594, "learning_rate": 4.51988487046159e-05, "loss": 0.2732, "step": 2700 }, { "epoch": 2.6679841897233203, "eval_loss": 0.3915008008480072, "eval_runtime": 34.1864, "eval_samples_per_second": 14.626, "eval_steps_per_second": 14.626, "step": 2700 }, { "epoch": 2.677865612648221, "grad_norm": 2.359931468963623, "learning_rate": 4.487986021092842e-05, "loss": 0.3326, "step": 2710 }, { "epoch": 2.6877470355731226, "grad_norm": 1.9921296834945679, "learning_rate": 4.456108218613345e-05, "loss": 0.28, "step": 2720 }, { "epoch": 2.6976284584980235, "grad_norm": 2.083142042160034, "learning_rate": 4.4242527733947024e-05, "loss": 0.2936, "step": 2730 }, { "epoch": 2.707509881422925, "grad_norm": 2.4226627349853516, "learning_rate": 4.3924209948894975e-05, "loss": 0.3011, "step": 2740 }, { "epoch": 2.717391304347826, "grad_norm": 2.4604616165161133, "learning_rate": 4.360614191577469e-05, "loss": 0.2702, "step": 2750 }, { "epoch": 2.7272727272727275, "grad_norm": 1.890394926071167, "learning_rate": 4.3288336709117236e-05, "loss": 0.2793, "step": 2760 }, { "epoch": 2.7371541501976284, "grad_norm": 2.458721160888672, "learning_rate": 4.297080739264986e-05, "loss": 0.2882, "step": 2770 }, { "epoch": 2.7470355731225298, "grad_norm": 2.1499905586242676, "learning_rate": 4.2653567018759094e-05, "loss": 0.2955, "step": 2780 }, { "epoch": 2.7569169960474307, "grad_norm": 2.4274818897247314, "learning_rate": 4.233662862795419e-05, "loss": 0.302, "step": 2790 }, { "epoch": 2.766798418972332, "grad_norm": 2.062570571899414, "learning_rate": 4.202000524833104e-05, "loss": 0.2715, "step": 2800 }, { "epoch": 2.7766798418972334, "grad_norm": 2.0354299545288086, "learning_rate": 4.170370989503661e-05, "loss": 0.3038, "step": 2810 }, { "epoch": 2.7865612648221343, "grad_norm": 2.3393170833587646, "learning_rate": 4.1387755569734046e-05, "loss": 0.2905, "step": 2820 }, { "epoch": 2.7964426877470356, "grad_norm": 2.224705934524536, "learning_rate": 4.1072155260068164e-05, "loss": 0.2989, "step": 2830 }, { "epoch": 2.8063241106719365, "grad_norm": 1.9835401773452759, "learning_rate": 4.075692193913155e-05, "loss": 0.2879, "step": 2840 }, { "epoch": 2.816205533596838, "grad_norm": 2.378260850906372, "learning_rate": 4.0442068564931385e-05, "loss": 0.3009, "step": 2850 }, { "epoch": 2.816205533596838, "eval_loss": 0.38761380314826965, "eval_runtime": 34.3575, "eval_samples_per_second": 14.553, "eval_steps_per_second": 14.553, "step": 2850 }, { "epoch": 2.8260869565217392, "grad_norm": 2.5100574493408203, "learning_rate": 4.012760807985664e-05, "loss": 0.2982, "step": 2860 }, { "epoch": 2.83596837944664, "grad_norm": 1.639036774635315, "learning_rate": 3.9813553410146214e-05, "loss": 0.3087, "step": 2870 }, { "epoch": 2.8458498023715415, "grad_norm": 2.490206241607666, "learning_rate": 3.949991746535752e-05, "loss": 0.2989, "step": 2880 }, { "epoch": 2.8557312252964424, "grad_norm": 2.3250222206115723, "learning_rate": 3.918671313783582e-05, "loss": 0.3034, "step": 2890 }, { "epoch": 2.8656126482213438, "grad_norm": 2.325777292251587, "learning_rate": 3.8873953302184275e-05, "loss": 0.2747, "step": 2900 }, { "epoch": 2.875494071146245, "grad_norm": 2.2305283546447754, "learning_rate": 3.856165081473473e-05, "loss": 0.2856, "step": 2910 }, { "epoch": 2.8853754940711465, "grad_norm": 2.2349300384521484, "learning_rate": 3.824981851301923e-05, "loss": 0.2716, "step": 2920 }, { "epoch": 2.8952569169960474, "grad_norm": 1.9070847034454346, "learning_rate": 3.793846921524236e-05, "loss": 0.3053, "step": 2930 }, { "epoch": 2.9051383399209487, "grad_norm": 2.1934654712677, "learning_rate": 3.7627615719754287e-05, "loss": 0.3143, "step": 2940 }, { "epoch": 2.9150197628458496, "grad_norm": 1.9646544456481934, "learning_rate": 3.7317270804524626e-05, "loss": 0.2865, "step": 2950 }, { "epoch": 2.924901185770751, "grad_norm": 2.232283353805542, "learning_rate": 3.700744722661735e-05, "loss": 0.2958, "step": 2960 }, { "epoch": 2.9347826086956523, "grad_norm": 2.3013436794281006, "learning_rate": 3.669815772166624e-05, "loss": 0.2834, "step": 2970 }, { "epoch": 2.9446640316205532, "grad_norm": 2.4497499465942383, "learning_rate": 3.6389415003351434e-05, "loss": 0.2978, "step": 2980 }, { "epoch": 2.9545454545454546, "grad_norm": 1.9342460632324219, "learning_rate": 3.608123176287684e-05, "loss": 0.2826, "step": 2990 }, { "epoch": 2.9644268774703555, "grad_norm": 2.4852263927459717, "learning_rate": 3.577362066844837e-05, "loss": 0.2766, "step": 3000 }, { "epoch": 2.9644268774703555, "eval_loss": 0.3866276741027832, "eval_runtime": 34.3485, "eval_samples_per_second": 14.557, "eval_steps_per_second": 14.557, "step": 3000 }, { "epoch": 2.974308300395257, "grad_norm": 1.9710100889205933, "learning_rate": 3.546659436475331e-05, "loss": 0.282, "step": 3010 }, { "epoch": 2.984189723320158, "grad_norm": 2.269618511199951, "learning_rate": 3.516016547244046e-05, "loss": 0.2974, "step": 3020 }, { "epoch": 2.9940711462450595, "grad_norm": 2.038463830947876, "learning_rate": 3.485434658760139e-05, "loss": 0.2931, "step": 3030 }, { "epoch": 3.0039525691699605, "grad_norm": 1.9927372932434082, "learning_rate": 3.454915028125262e-05, "loss": 0.2778, "step": 3040 }, { "epoch": 3.013833992094862, "grad_norm": 2.1115493774414062, "learning_rate": 3.424458909881896e-05, "loss": 0.228, "step": 3050 }, { "epoch": 3.0237154150197627, "grad_norm": 2.1515655517578125, "learning_rate": 3.394067555961772e-05, "loss": 0.2172, "step": 3060 }, { "epoch": 3.033596837944664, "grad_norm": 2.6619913578033447, "learning_rate": 3.3637422156344146e-05, "loss": 0.2158, "step": 3070 }, { "epoch": 3.0434782608695654, "grad_norm": 2.441153049468994, "learning_rate": 3.333484135455791e-05, "loss": 0.2305, "step": 3080 }, { "epoch": 3.0533596837944663, "grad_norm": 1.8996953964233398, "learning_rate": 3.3032945592170616e-05, "loss": 0.2436, "step": 3090 }, { "epoch": 3.0632411067193677, "grad_norm": 2.410764217376709, "learning_rate": 3.2731747278934616e-05, "loss": 0.2198, "step": 3100 }, { "epoch": 3.0731225296442686, "grad_norm": 1.9897364377975464, "learning_rate": 3.243125879593285e-05, "loss": 0.2309, "step": 3110 }, { "epoch": 3.08300395256917, "grad_norm": 2.573441982269287, "learning_rate": 3.213149249506996e-05, "loss": 0.213, "step": 3120 }, { "epoch": 3.0928853754940713, "grad_norm": 2.5531418323516846, "learning_rate": 3.1832460698564424e-05, "loss": 0.234, "step": 3130 }, { "epoch": 3.102766798418972, "grad_norm": 2.101729154586792, "learning_rate": 3.1534175698442184e-05, "loss": 0.239, "step": 3140 }, { "epoch": 3.1126482213438735, "grad_norm": 1.9485621452331543, "learning_rate": 3.123664975603129e-05, "loss": 0.2145, "step": 3150 }, { "epoch": 3.1126482213438735, "eval_loss": 0.40044403076171875, "eval_runtime": 34.1191, "eval_samples_per_second": 14.655, "eval_steps_per_second": 14.655, "step": 3150 }, { "epoch": 3.122529644268775, "grad_norm": 2.287511110305786, "learning_rate": 3.093989510145791e-05, "loss": 0.2225, "step": 3160 }, { "epoch": 3.132411067193676, "grad_norm": 2.047910451889038, "learning_rate": 3.064392393314359e-05, "loss": 0.2178, "step": 3170 }, { "epoch": 3.142292490118577, "grad_norm": 1.6208367347717285, "learning_rate": 3.0348748417303817e-05, "loss": 0.222, "step": 3180 }, { "epoch": 3.1521739130434785, "grad_norm": 2.4111440181732178, "learning_rate": 3.005438068744791e-05, "loss": 0.2177, "step": 3190 }, { "epoch": 3.1620553359683794, "grad_norm": 2.368447780609131, "learning_rate": 2.9760832843880303e-05, "loss": 0.2012, "step": 3200 }, { "epoch": 3.1719367588932808, "grad_norm": 2.473605155944824, "learning_rate": 2.94681169532031e-05, "loss": 0.2277, "step": 3210 }, { "epoch": 3.1818181818181817, "grad_norm": 2.02504301071167, "learning_rate": 2.9176245047820055e-05, "loss": 0.2193, "step": 3220 }, { "epoch": 3.191699604743083, "grad_norm": 2.4214389324188232, "learning_rate": 2.8885229125442014e-05, "loss": 0.2196, "step": 3230 }, { "epoch": 3.2015810276679844, "grad_norm": 2.1690824031829834, "learning_rate": 2.859508114859373e-05, "loss": 0.2267, "step": 3240 }, { "epoch": 3.2114624505928853, "grad_norm": 1.960707426071167, "learning_rate": 2.830581304412209e-05, "loss": 0.2409, "step": 3250 }, { "epoch": 3.2213438735177866, "grad_norm": 2.320850133895874, "learning_rate": 2.8017436702705894e-05, "loss": 0.2245, "step": 3260 }, { "epoch": 3.2312252964426875, "grad_norm": 2.268925428390503, "learning_rate": 2.7729963978367035e-05, "loss": 0.2373, "step": 3270 }, { "epoch": 3.241106719367589, "grad_norm": 2.374448776245117, "learning_rate": 2.7443406687983255e-05, "loss": 0.2188, "step": 3280 }, { "epoch": 3.2509881422924902, "grad_norm": 1.9990499019622803, "learning_rate": 2.7157776610802408e-05, "loss": 0.253, "step": 3290 }, { "epoch": 3.260869565217391, "grad_norm": 2.6509578227996826, "learning_rate": 2.6873085487958243e-05, "loss": 0.2471, "step": 3300 }, { "epoch": 3.260869565217391, "eval_loss": 0.4012451171875, "eval_runtime": 34.3577, "eval_samples_per_second": 14.553, "eval_steps_per_second": 14.553, "step": 3300 }, { "epoch": 3.2707509881422925, "grad_norm": 2.4065935611724854, "learning_rate": 2.6589345021987714e-05, "loss": 0.2455, "step": 3310 }, { "epoch": 3.280632411067194, "grad_norm": 2.315992593765259, "learning_rate": 2.6306566876350062e-05, "loss": 0.2137, "step": 3320 }, { "epoch": 3.2905138339920947, "grad_norm": 2.162156820297241, "learning_rate": 2.6024762674947306e-05, "loss": 0.2075, "step": 3330 }, { "epoch": 3.300395256916996, "grad_norm": 2.3874671459198, "learning_rate": 2.5743944001646384e-05, "loss": 0.2452, "step": 3340 }, { "epoch": 3.3102766798418974, "grad_norm": 2.3413357734680176, "learning_rate": 2.5464122399803118e-05, "loss": 0.2491, "step": 3350 }, { "epoch": 3.3201581027667983, "grad_norm": 2.0517518520355225, "learning_rate": 2.5185309371787506e-05, "loss": 0.2304, "step": 3360 }, { "epoch": 3.3300395256916997, "grad_norm": 2.5444791316986084, "learning_rate": 2.490751637851113e-05, "loss": 0.2252, "step": 3370 }, { "epoch": 3.3399209486166006, "grad_norm": 3.271428108215332, "learning_rate": 2.4630754838955894e-05, "loss": 0.2221, "step": 3380 }, { "epoch": 3.349802371541502, "grad_norm": 2.159346103668213, "learning_rate": 2.4355036129704693e-05, "loss": 0.2355, "step": 3390 }, { "epoch": 3.3596837944664033, "grad_norm": 1.997672438621521, "learning_rate": 2.408037158447374e-05, "loss": 0.2128, "step": 3400 }, { "epoch": 3.369565217391304, "grad_norm": 1.8913171291351318, "learning_rate": 2.3806772493646716e-05, "loss": 0.2133, "step": 3410 }, { "epoch": 3.3794466403162056, "grad_norm": 2.4527885913848877, "learning_rate": 2.3534250103810622e-05, "loss": 0.2171, "step": 3420 }, { "epoch": 3.3893280632411065, "grad_norm": 2.4104301929473877, "learning_rate": 2.326281561729351e-05, "loss": 0.2268, "step": 3430 }, { "epoch": 3.399209486166008, "grad_norm": 1.9531340599060059, "learning_rate": 2.2992480191703996e-05, "loss": 0.2158, "step": 3440 }, { "epoch": 3.409090909090909, "grad_norm": 1.6948319673538208, "learning_rate": 2.2723254939472564e-05, "loss": 0.2223, "step": 3450 }, { "epoch": 3.409090909090909, "eval_loss": 0.40440815687179565, "eval_runtime": 34.4399, "eval_samples_per_second": 14.518, "eval_steps_per_second": 14.518, "step": 3450 }, { "epoch": 3.4189723320158105, "grad_norm": 2.2849342823028564, "learning_rate": 2.2455150927394874e-05, "loss": 0.2354, "step": 3460 }, { "epoch": 3.4288537549407114, "grad_norm": 2.416132688522339, "learning_rate": 2.218817917617676e-05, "loss": 0.2322, "step": 3470 }, { "epoch": 3.438735177865613, "grad_norm": 2.2576396465301514, "learning_rate": 2.1922350659981254e-05, "loss": 0.2366, "step": 3480 }, { "epoch": 3.4486166007905137, "grad_norm": 2.38684344291687, "learning_rate": 2.1657676305977515e-05, "loss": 0.2322, "step": 3490 }, { "epoch": 3.458498023715415, "grad_norm": 2.6015143394470215, "learning_rate": 2.1394166993891523e-05, "loss": 0.2412, "step": 3500 }, { "epoch": 3.4683794466403164, "grad_norm": 2.134091854095459, "learning_rate": 2.1131833555559034e-05, "loss": 0.2176, "step": 3510 }, { "epoch": 3.4782608695652173, "grad_norm": 2.0862109661102295, "learning_rate": 2.0870686774480193e-05, "loss": 0.212, "step": 3520 }, { "epoch": 3.4881422924901186, "grad_norm": 2.139421224594116, "learning_rate": 2.0610737385376345e-05, "loss": 0.2398, "step": 3530 }, { "epoch": 3.4980237154150196, "grad_norm": 2.5852270126342773, "learning_rate": 2.035199607374871e-05, "loss": 0.2417, "step": 3540 }, { "epoch": 3.507905138339921, "grad_norm": 2.2502431869506836, "learning_rate": 2.0094473475439195e-05, "loss": 0.2277, "step": 3550 }, { "epoch": 3.5177865612648223, "grad_norm": 1.9478706121444702, "learning_rate": 1.983818017619317e-05, "loss": 0.247, "step": 3560 }, { "epoch": 3.527667984189723, "grad_norm": 2.4664876461029053, "learning_rate": 1.9583126711224336e-05, "loss": 0.2398, "step": 3570 }, { "epoch": 3.5375494071146245, "grad_norm": 2.0479726791381836, "learning_rate": 1.9329323564781675e-05, "loss": 0.2291, "step": 3580 }, { "epoch": 3.5474308300395254, "grad_norm": 1.9972285032272339, "learning_rate": 1.907678116971842e-05, "loss": 0.2287, "step": 3590 }, { "epoch": 3.5573122529644268, "grad_norm": 2.7339351177215576, "learning_rate": 1.882550990706332e-05, "loss": 0.2329, "step": 3600 }, { "epoch": 3.5573122529644268, "eval_loss": 0.39887359738349915, "eval_runtime": 34.1894, "eval_samples_per_second": 14.624, "eval_steps_per_second": 14.624, "step": 3600 }, { "epoch": 3.567193675889328, "grad_norm": 2.279087781906128, "learning_rate": 1.8575520105593814e-05, "loss": 0.2165, "step": 3610 }, { "epoch": 3.5770750988142295, "grad_norm": 2.1736888885498047, "learning_rate": 1.8326822041411518e-05, "loss": 0.2408, "step": 3620 }, { "epoch": 3.5869565217391304, "grad_norm": 2.6511847972869873, "learning_rate": 1.8079425937519722e-05, "loss": 0.2075, "step": 3630 }, { "epoch": 3.5968379446640317, "grad_norm": 2.688720226287842, "learning_rate": 1.7833341963403307e-05, "loss": 0.2257, "step": 3640 }, { "epoch": 3.6067193675889326, "grad_norm": 2.398272752761841, "learning_rate": 1.7588580234610588e-05, "loss": 0.2248, "step": 3650 }, { "epoch": 3.616600790513834, "grad_norm": 2.060293197631836, "learning_rate": 1.7345150812337557e-05, "loss": 0.2491, "step": 3660 }, { "epoch": 3.6264822134387353, "grad_norm": 2.143730401992798, "learning_rate": 1.7103063703014366e-05, "loss": 0.2194, "step": 3670 }, { "epoch": 3.6363636363636362, "grad_norm": 2.20381760597229, "learning_rate": 1.686232885789385e-05, "loss": 0.2299, "step": 3680 }, { "epoch": 3.6462450592885376, "grad_norm": 2.0883138179779053, "learning_rate": 1.6622956172642597e-05, "loss": 0.2377, "step": 3690 }, { "epoch": 3.6561264822134385, "grad_norm": 2.045193672180176, "learning_rate": 1.6384955486934152e-05, "loss": 0.2105, "step": 3700 }, { "epoch": 3.66600790513834, "grad_norm": 1.9251530170440674, "learning_rate": 1.6148336584044533e-05, "loss": 0.2149, "step": 3710 }, { "epoch": 3.675889328063241, "grad_norm": 2.162121534347534, "learning_rate": 1.591310919045003e-05, "loss": 0.2359, "step": 3720 }, { "epoch": 3.6857707509881426, "grad_norm": 2.423417329788208, "learning_rate": 1.5679282975427484e-05, "loss": 0.2305, "step": 3730 }, { "epoch": 3.6956521739130435, "grad_norm": 2.2101898193359375, "learning_rate": 1.5446867550656765e-05, "loss": 0.2174, "step": 3740 }, { "epoch": 3.705533596837945, "grad_norm": 2.565885066986084, "learning_rate": 1.5215872469825677e-05, "loss": 0.2184, "step": 3750 }, { "epoch": 3.705533596837945, "eval_loss": 0.39941611886024475, "eval_runtime": 34.6521, "eval_samples_per_second": 14.429, "eval_steps_per_second": 14.429, "step": 3750 }, { "epoch": 3.7154150197628457, "grad_norm": 2.2909016609191895, "learning_rate": 1.4986307228237263e-05, "loss": 0.2328, "step": 3760 }, { "epoch": 3.725296442687747, "grad_norm": 2.3509652614593506, "learning_rate": 1.475818126241942e-05, "loss": 0.2126, "step": 3770 }, { "epoch": 3.7351778656126484, "grad_norm": 2.112107515335083, "learning_rate": 1.4531503949737103e-05, "loss": 0.2093, "step": 3780 }, { "epoch": 3.7450592885375493, "grad_norm": 2.65423846244812, "learning_rate": 1.4306284608006833e-05, "loss": 0.2398, "step": 3790 }, { "epoch": 3.7549407114624507, "grad_norm": 2.521843194961548, "learning_rate": 1.4082532495113623e-05, "loss": 0.2215, "step": 3800 }, { "epoch": 3.7648221343873516, "grad_norm": 1.9202762842178345, "learning_rate": 1.3860256808630425e-05, "loss": 0.2201, "step": 3810 }, { "epoch": 3.774703557312253, "grad_norm": 2.7013628482818604, "learning_rate": 1.3639466685440129e-05, "loss": 0.2446, "step": 3820 }, { "epoch": 3.7845849802371543, "grad_norm": 1.9625245332717896, "learning_rate": 1.3420171201359928e-05, "loss": 0.2197, "step": 3830 }, { "epoch": 3.794466403162055, "grad_norm": 2.7485172748565674, "learning_rate": 1.3202379370768249e-05, "loss": 0.222, "step": 3840 }, { "epoch": 3.8043478260869565, "grad_norm": 2.577033281326294, "learning_rate": 1.2986100146234227e-05, "loss": 0.2217, "step": 3850 }, { "epoch": 3.8142292490118574, "grad_norm": 2.4234845638275146, "learning_rate": 1.2771342418149653e-05, "loss": 0.2205, "step": 3860 }, { "epoch": 3.824110671936759, "grad_norm": 2.127350330352783, "learning_rate": 1.2558115014363589e-05, "loss": 0.2076, "step": 3870 }, { "epoch": 3.83399209486166, "grad_norm": 2.254338502883911, "learning_rate": 1.2346426699819455e-05, "loss": 0.2408, "step": 3880 }, { "epoch": 3.8438735177865615, "grad_norm": 2.1729793548583984, "learning_rate": 1.2136286176194741e-05, "loss": 0.2253, "step": 3890 }, { "epoch": 3.8537549407114624, "grad_norm": 2.07226824760437, "learning_rate": 1.1927702081543275e-05, "loss": 0.2332, "step": 3900 }, { "epoch": 3.8537549407114624, "eval_loss": 0.39704427123069763, "eval_runtime": 34.4753, "eval_samples_per_second": 14.503, "eval_steps_per_second": 14.503, "step": 3900 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 150, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.436859427356672e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }