{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.423800044238001, "eval_steps": 500, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022119000221190004, "grad_norm": 2561.85302734375, "learning_rate": 2.5e-06, "loss": 7244.9438, "step": 100 }, { "epoch": 0.04423800044238001, "grad_norm": 1641.6300048828125, "learning_rate": 5e-06, "loss": 278.0641, "step": 200 }, { "epoch": 0.06635700066357, "grad_norm": 413.9412841796875, "learning_rate": 7.5e-06, "loss": 305.1475, "step": 300 }, { "epoch": 0.08847600088476001, "grad_norm": 298.65997314453125, "learning_rate": 1e-05, "loss": 296.6687, "step": 400 }, { "epoch": 0.11059500110595001, "grad_norm": 612.1175537109375, "learning_rate": 1.25e-05, "loss": 247.2131, "step": 500 }, { "epoch": 0.13271400132714, "grad_norm": 1398.299560546875, "learning_rate": 1.5e-05, "loss": 273.615, "step": 600 }, { "epoch": 0.15483300154833002, "grad_norm": 538.8013305664062, "learning_rate": 1.75e-05, "loss": 277.284, "step": 700 }, { "epoch": 0.17695200176952003, "grad_norm": 834.0810546875, "learning_rate": 2e-05, "loss": 343.3682, "step": 800 }, { "epoch": 0.19907100199071, "grad_norm": 514.625732421875, "learning_rate": 2.25e-05, "loss": 172.4172, "step": 900 }, { "epoch": 0.22119000221190002, "grad_norm": 720.7930908203125, "learning_rate": 2.5e-05, "loss": 243.5434, "step": 1000 }, { "epoch": 0.24330900243309003, "grad_norm": 538.9693603515625, "learning_rate": 2.7500000000000004e-05, "loss": 227.322, "step": 1100 }, { "epoch": 0.26542800265428, "grad_norm": 365.2975769042969, "learning_rate": 3e-05, "loss": 289.7278, "step": 1200 }, { "epoch": 0.28754700287547, "grad_norm": 1308.2694091796875, "learning_rate": 3.2500000000000004e-05, "loss": 191.7522, "step": 1300 }, { "epoch": 0.30966600309666004, "grad_norm": 6047.15625, "learning_rate": 3.5e-05, "loss": 316.0845, "step": 1400 }, { "epoch": 0.33178500331785005, "grad_norm": 482.5833740234375, "learning_rate": 3.7500000000000003e-05, "loss": 275.0296, "step": 1500 }, { "epoch": 0.35390400353904006, "grad_norm": 811.97607421875, "learning_rate": 4e-05, "loss": 232.9227, "step": 1600 }, { "epoch": 0.37602300376023, "grad_norm": 585.74365234375, "learning_rate": 4.25e-05, "loss": 157.1875, "step": 1700 }, { "epoch": 0.39814200398142, "grad_norm": 2657.204833984375, "learning_rate": 4.5e-05, "loss": 202.9244, "step": 1800 }, { "epoch": 0.42026100420261003, "grad_norm": 788.7911376953125, "learning_rate": 4.75e-05, "loss": 230.3789, "step": 1900 }, { "epoch": 0.44238000442380004, "grad_norm": 1466.7137451171875, "learning_rate": 5e-05, "loss": 184.6603, "step": 2000 }, { "epoch": 0.46449900464499005, "grad_norm": 717.4619750976562, "learning_rate": 4.972222222222223e-05, "loss": 160.0884, "step": 2100 }, { "epoch": 0.48661800486618007, "grad_norm": 1592.6282958984375, "learning_rate": 4.9444444444444446e-05, "loss": 159.9456, "step": 2200 }, { "epoch": 0.5087370050873701, "grad_norm": 258.3177795410156, "learning_rate": 4.9166666666666665e-05, "loss": 272.3899, "step": 2300 }, { "epoch": 0.53085600530856, "grad_norm": 2940.7119140625, "learning_rate": 4.888888888888889e-05, "loss": 108.9058, "step": 2400 }, { "epoch": 0.5529750055297501, "grad_norm": 1595.506103515625, "learning_rate": 4.8611111111111115e-05, "loss": 137.0249, "step": 2500 }, { "epoch": 0.57509400575094, "grad_norm": 476.0992126464844, "learning_rate": 4.8333333333333334e-05, "loss": 157.8203, "step": 2600 }, { "epoch": 0.59721300597213, "grad_norm": 462.79119873046875, "learning_rate": 4.805555555555556e-05, "loss": 202.3364, "step": 2700 }, { "epoch": 0.6193320061933201, "grad_norm": 7305.47900390625, "learning_rate": 4.7777777777777784e-05, "loss": 176.8393, "step": 2800 }, { "epoch": 0.64145100641451, "grad_norm": 1471.9434814453125, "learning_rate": 4.75e-05, "loss": 192.2721, "step": 2900 }, { "epoch": 0.6635700066357001, "grad_norm": 253.4272003173828, "learning_rate": 4.722222222222222e-05, "loss": 204.4471, "step": 3000 }, { "epoch": 0.68568900685689, "grad_norm": 4707.13916015625, "learning_rate": 4.6944444444444446e-05, "loss": 154.4462, "step": 3100 }, { "epoch": 0.7078080070780801, "grad_norm": 477.1028747558594, "learning_rate": 4.666666666666667e-05, "loss": 93.3721, "step": 3200 }, { "epoch": 0.7299270072992701, "grad_norm": 492.2008361816406, "learning_rate": 4.638888888888889e-05, "loss": 129.002, "step": 3300 }, { "epoch": 0.75204600752046, "grad_norm": 414.3881530761719, "learning_rate": 4.6111111111111115e-05, "loss": 88.6518, "step": 3400 }, { "epoch": 0.7741650077416501, "grad_norm": 170.11138916015625, "learning_rate": 4.5833333333333334e-05, "loss": 193.1568, "step": 3500 }, { "epoch": 0.79628400796284, "grad_norm": 787.8067016601562, "learning_rate": 4.555555555555556e-05, "loss": 153.1195, "step": 3600 }, { "epoch": 0.8184030081840301, "grad_norm": 940.7820434570312, "learning_rate": 4.527777777777778e-05, "loss": 312.0311, "step": 3700 }, { "epoch": 0.8405220084052201, "grad_norm": 719.3717041015625, "learning_rate": 4.5e-05, "loss": 160.2157, "step": 3800 }, { "epoch": 0.8626410086264101, "grad_norm": 710.9347534179688, "learning_rate": 4.472222222222223e-05, "loss": 149.8423, "step": 3900 }, { "epoch": 0.8847600088476001, "grad_norm": 812.0198364257812, "learning_rate": 4.4444444444444447e-05, "loss": 171.4201, "step": 4000 }, { "epoch": 0.90687900906879, "grad_norm": 347.59246826171875, "learning_rate": 4.4166666666666665e-05, "loss": 85.4954, "step": 4100 }, { "epoch": 0.9289980092899801, "grad_norm": 494.03570556640625, "learning_rate": 4.388888888888889e-05, "loss": 122.7032, "step": 4200 }, { "epoch": 0.9511170095111701, "grad_norm": 1107.2685546875, "learning_rate": 4.3611111111111116e-05, "loss": 110.0679, "step": 4300 }, { "epoch": 0.9732360097323601, "grad_norm": 554.3077392578125, "learning_rate": 4.3333333333333334e-05, "loss": 162.5302, "step": 4400 }, { "epoch": 0.9953550099535501, "grad_norm": 1022.9378662109375, "learning_rate": 4.305555555555556e-05, "loss": 123.3146, "step": 4500 }, { "epoch": 1.0, "eval_loss": 317.1508483886719, "eval_runtime": 20.7601, "eval_samples_per_second": 96.82, "eval_steps_per_second": 24.229, "step": 4521 }, { "epoch": 1.0174740101747402, "grad_norm": 604.2803344726562, "learning_rate": 4.277777777777778e-05, "loss": 87.0649, "step": 4600 }, { "epoch": 1.03959301039593, "grad_norm": 655.1663208007812, "learning_rate": 4.25e-05, "loss": 60.0608, "step": 4700 }, { "epoch": 1.06171201061712, "grad_norm": 613.4644165039062, "learning_rate": 4.222222222222222e-05, "loss": 152.5058, "step": 4800 }, { "epoch": 1.08383101083831, "grad_norm": 943.52392578125, "learning_rate": 4.194444444444445e-05, "loss": 112.7452, "step": 4900 }, { "epoch": 1.1059500110595002, "grad_norm": 741.4888916015625, "learning_rate": 4.166666666666667e-05, "loss": 180.4975, "step": 5000 }, { "epoch": 1.1280690112806901, "grad_norm": 984.3648071289062, "learning_rate": 4.138888888888889e-05, "loss": 109.8211, "step": 5100 }, { "epoch": 1.15018801150188, "grad_norm": 4328.1767578125, "learning_rate": 4.111111111111111e-05, "loss": 86.7452, "step": 5200 }, { "epoch": 1.17230701172307, "grad_norm": 440.3537902832031, "learning_rate": 4.0833333333333334e-05, "loss": 145.0485, "step": 5300 }, { "epoch": 1.1944260119442602, "grad_norm": 1904.060302734375, "learning_rate": 4.055555555555556e-05, "loss": 137.598, "step": 5400 }, { "epoch": 1.2165450121654502, "grad_norm": 325.7207336425781, "learning_rate": 4.027777777777778e-05, "loss": 133.5432, "step": 5500 }, { "epoch": 1.2386640123866401, "grad_norm": 602.099609375, "learning_rate": 4e-05, "loss": 90.936, "step": 5600 }, { "epoch": 1.26078301260783, "grad_norm": 373.9248962402344, "learning_rate": 3.972222222222222e-05, "loss": 126.1215, "step": 5700 }, { "epoch": 1.28290201282902, "grad_norm": 1740.2845458984375, "learning_rate": 3.944444444444445e-05, "loss": 106.4278, "step": 5800 }, { "epoch": 1.3050210130502102, "grad_norm": 619.2306518554688, "learning_rate": 3.9166666666666665e-05, "loss": 69.9521, "step": 5900 }, { "epoch": 1.3271400132714002, "grad_norm": 397.7071838378906, "learning_rate": 3.888888888888889e-05, "loss": 92.6775, "step": 6000 }, { "epoch": 1.3492590134925901, "grad_norm": 261.9856872558594, "learning_rate": 3.8611111111111116e-05, "loss": 139.3951, "step": 6100 }, { "epoch": 1.37137801371378, "grad_norm": 326.20806884765625, "learning_rate": 3.8333333333333334e-05, "loss": 102.4056, "step": 6200 }, { "epoch": 1.39349701393497, "grad_norm": 5344.56005859375, "learning_rate": 3.805555555555555e-05, "loss": 108.2192, "step": 6300 }, { "epoch": 1.4156160141561602, "grad_norm": 1083.1983642578125, "learning_rate": 3.777777777777778e-05, "loss": 121.1852, "step": 6400 }, { "epoch": 1.4377350143773502, "grad_norm": 285.3344421386719, "learning_rate": 3.7500000000000003e-05, "loss": 113.1963, "step": 6500 }, { "epoch": 1.4598540145985401, "grad_norm": 1590.1190185546875, "learning_rate": 3.722222222222222e-05, "loss": 101.6212, "step": 6600 }, { "epoch": 1.48197301481973, "grad_norm": 242.43016052246094, "learning_rate": 3.694444444444445e-05, "loss": 100.9942, "step": 6700 }, { "epoch": 1.50409201504092, "grad_norm": 419.2840576171875, "learning_rate": 3.6666666666666666e-05, "loss": 91.4742, "step": 6800 }, { "epoch": 1.5262110152621102, "grad_norm": 145.44712829589844, "learning_rate": 3.638888888888889e-05, "loss": 108.68, "step": 6900 }, { "epoch": 1.5483300154833002, "grad_norm": 866.0416259765625, "learning_rate": 3.611111111111111e-05, "loss": 139.873, "step": 7000 }, { "epoch": 1.5704490157044901, "grad_norm": 543.1744995117188, "learning_rate": 3.5833333333333335e-05, "loss": 84.9976, "step": 7100 }, { "epoch": 1.5925680159256803, "grad_norm": 961.9137573242188, "learning_rate": 3.555555555555556e-05, "loss": 91.9599, "step": 7200 }, { "epoch": 1.61468701614687, "grad_norm": 1136.900146484375, "learning_rate": 3.527777777777778e-05, "loss": 100.5749, "step": 7300 }, { "epoch": 1.6368060163680602, "grad_norm": 845.6568603515625, "learning_rate": 3.5e-05, "loss": 128.4079, "step": 7400 }, { "epoch": 1.6589250165892502, "grad_norm": 1455.4808349609375, "learning_rate": 3.472222222222222e-05, "loss": 124.5921, "step": 7500 }, { "epoch": 1.6810440168104401, "grad_norm": 370.6483459472656, "learning_rate": 3.444444444444445e-05, "loss": 103.7647, "step": 7600 }, { "epoch": 1.7031630170316303, "grad_norm": 9296.732421875, "learning_rate": 3.4166666666666666e-05, "loss": 104.9438, "step": 7700 }, { "epoch": 1.72528201725282, "grad_norm": 2471.158447265625, "learning_rate": 3.388888888888889e-05, "loss": 109.1481, "step": 7800 }, { "epoch": 1.7474010174740102, "grad_norm": 230.1200408935547, "learning_rate": 3.3611111111111116e-05, "loss": 87.4035, "step": 7900 }, { "epoch": 1.7695200176952002, "grad_norm": 875.7386474609375, "learning_rate": 3.3333333333333335e-05, "loss": 87.8452, "step": 8000 }, { "epoch": 1.7916390179163901, "grad_norm": 1371.0994873046875, "learning_rate": 3.3055555555555553e-05, "loss": 98.3798, "step": 8100 }, { "epoch": 1.8137580181375803, "grad_norm": 682.475341796875, "learning_rate": 3.277777777777778e-05, "loss": 104.5044, "step": 8200 }, { "epoch": 1.83587701835877, "grad_norm": 512.0708618164062, "learning_rate": 3.2500000000000004e-05, "loss": 64.1792, "step": 8300 }, { "epoch": 1.8579960185799602, "grad_norm": 1042.0284423828125, "learning_rate": 3.222222222222223e-05, "loss": 74.1599, "step": 8400 }, { "epoch": 1.8801150188011502, "grad_norm": 358.9875183105469, "learning_rate": 3.194444444444444e-05, "loss": 93.9574, "step": 8500 }, { "epoch": 1.9022340190223401, "grad_norm": 4613.166015625, "learning_rate": 3.1666666666666666e-05, "loss": 103.6234, "step": 8600 }, { "epoch": 1.9243530192435303, "grad_norm": 436.9840393066406, "learning_rate": 3.138888888888889e-05, "loss": 98.4847, "step": 8700 }, { "epoch": 1.94647201946472, "grad_norm": 29151.51171875, "learning_rate": 3.111111111111111e-05, "loss": 96.9473, "step": 8800 }, { "epoch": 1.9685910196859102, "grad_norm": 189.884033203125, "learning_rate": 3.0833333333333335e-05, "loss": 65.5306, "step": 8900 }, { "epoch": 1.9907100199071002, "grad_norm": 416.74969482421875, "learning_rate": 3.055555555555556e-05, "loss": 104.1101, "step": 9000 }, { "epoch": 2.0, "eval_loss": 227.4776611328125, "eval_runtime": 20.702, "eval_samples_per_second": 97.092, "eval_steps_per_second": 24.297, "step": 9042 }, { "epoch": 2.01282902012829, "grad_norm": 345.6188049316406, "learning_rate": 3.0277777777777776e-05, "loss": 103.7359, "step": 9100 }, { "epoch": 2.0349480203494803, "grad_norm": 2839.026611328125, "learning_rate": 3e-05, "loss": 83.4189, "step": 9200 }, { "epoch": 2.05706702057067, "grad_norm": 493.89093017578125, "learning_rate": 2.9722222222222223e-05, "loss": 93.5555, "step": 9300 }, { "epoch": 2.07918602079186, "grad_norm": 239.73707580566406, "learning_rate": 2.9444444444444448e-05, "loss": 102.3052, "step": 9400 }, { "epoch": 2.1013050210130504, "grad_norm": 293.8266296386719, "learning_rate": 2.916666666666667e-05, "loss": 80.1726, "step": 9500 }, { "epoch": 2.12342402123424, "grad_norm": 251.94790649414062, "learning_rate": 2.8888888888888888e-05, "loss": 57.1753, "step": 9600 }, { "epoch": 2.1455430214554303, "grad_norm": 1789.1011962890625, "learning_rate": 2.861111111111111e-05, "loss": 68.5661, "step": 9700 }, { "epoch": 2.16766202167662, "grad_norm": 1088.1776123046875, "learning_rate": 2.8333333333333335e-05, "loss": 67.3807, "step": 9800 }, { "epoch": 2.18978102189781, "grad_norm": 511.173828125, "learning_rate": 2.8055555555555557e-05, "loss": 71.9351, "step": 9900 }, { "epoch": 2.2119000221190004, "grad_norm": 372.8301696777344, "learning_rate": 2.777777777777778e-05, "loss": 116.7822, "step": 10000 }, { "epoch": 2.23401902234019, "grad_norm": 673.3489379882812, "learning_rate": 2.7500000000000004e-05, "loss": 119.5753, "step": 10100 }, { "epoch": 2.2561380225613803, "grad_norm": 188.46298217773438, "learning_rate": 2.7222222222222223e-05, "loss": 77.327, "step": 10200 }, { "epoch": 2.2782570227825705, "grad_norm": 1344.5301513671875, "learning_rate": 2.6944444444444445e-05, "loss": 81.0025, "step": 10300 }, { "epoch": 2.30037602300376, "grad_norm": 950.5435791015625, "learning_rate": 2.6666666666666667e-05, "loss": 77.1479, "step": 10400 }, { "epoch": 2.3224950232249504, "grad_norm": 253.42047119140625, "learning_rate": 2.6388888888888892e-05, "loss": 78.325, "step": 10500 }, { "epoch": 2.34461402344614, "grad_norm": 537.2485961914062, "learning_rate": 2.6111111111111114e-05, "loss": 114.1938, "step": 10600 }, { "epoch": 2.3667330236673303, "grad_norm": 391.37054443359375, "learning_rate": 2.5833333333333336e-05, "loss": 74.7277, "step": 10700 }, { "epoch": 2.3888520238885205, "grad_norm": 777.1848754882812, "learning_rate": 2.5555555555555554e-05, "loss": 87.6657, "step": 10800 }, { "epoch": 2.41097102410971, "grad_norm": 233.57789611816406, "learning_rate": 2.527777777777778e-05, "loss": 74.0286, "step": 10900 }, { "epoch": 2.4330900243309004, "grad_norm": 683.5059814453125, "learning_rate": 2.5e-05, "loss": 120.9458, "step": 11000 }, { "epoch": 2.45520902455209, "grad_norm": 495.2901916503906, "learning_rate": 2.4722222222222223e-05, "loss": 86.7892, "step": 11100 }, { "epoch": 2.4773280247732803, "grad_norm": 550.5516967773438, "learning_rate": 2.4444444444444445e-05, "loss": 89.1761, "step": 11200 }, { "epoch": 2.4994470249944705, "grad_norm": 403.9693298339844, "learning_rate": 2.4166666666666667e-05, "loss": 68.0416, "step": 11300 }, { "epoch": 2.52156602521566, "grad_norm": 156.51942443847656, "learning_rate": 2.3888888888888892e-05, "loss": 102.727, "step": 11400 }, { "epoch": 2.5436850254368504, "grad_norm": 1048.75439453125, "learning_rate": 2.361111111111111e-05, "loss": 101.3001, "step": 11500 }, { "epoch": 2.56580402565804, "grad_norm": 1137.298828125, "learning_rate": 2.3333333333333336e-05, "loss": 80.9905, "step": 11600 }, { "epoch": 2.5879230258792303, "grad_norm": 600.23681640625, "learning_rate": 2.3055555555555558e-05, "loss": 68.7155, "step": 11700 }, { "epoch": 2.6100420261004205, "grad_norm": 840.565185546875, "learning_rate": 2.277777777777778e-05, "loss": 97.0005, "step": 11800 }, { "epoch": 2.63216102632161, "grad_norm": 722.6634521484375, "learning_rate": 2.25e-05, "loss": 43.6906, "step": 11900 }, { "epoch": 2.6542800265428004, "grad_norm": 399.0193176269531, "learning_rate": 2.2222222222222223e-05, "loss": 87.6618, "step": 12000 }, { "epoch": 2.67639902676399, "grad_norm": 1078.0657958984375, "learning_rate": 2.1944444444444445e-05, "loss": 90.9304, "step": 12100 }, { "epoch": 2.6985180269851803, "grad_norm": 719.8499145507812, "learning_rate": 2.1666666666666667e-05, "loss": 60.1098, "step": 12200 }, { "epoch": 2.7206370272063705, "grad_norm": 2423.7578125, "learning_rate": 2.138888888888889e-05, "loss": 80.944, "step": 12300 }, { "epoch": 2.74275602742756, "grad_norm": 1538.58447265625, "learning_rate": 2.111111111111111e-05, "loss": 84.306, "step": 12400 }, { "epoch": 2.7648750276487504, "grad_norm": 1219.7099609375, "learning_rate": 2.0833333333333336e-05, "loss": 61.9064, "step": 12500 }, { "epoch": 2.78699402786994, "grad_norm": 419.0851135253906, "learning_rate": 2.0555555555555555e-05, "loss": 58.863, "step": 12600 }, { "epoch": 2.8091130280911303, "grad_norm": 10715.943359375, "learning_rate": 2.027777777777778e-05, "loss": 74.7694, "step": 12700 }, { "epoch": 2.8312320283123205, "grad_norm": 201.3819122314453, "learning_rate": 2e-05, "loss": 83.1654, "step": 12800 }, { "epoch": 2.85335102853351, "grad_norm": 3191.744873046875, "learning_rate": 1.9722222222222224e-05, "loss": 79.8109, "step": 12900 }, { "epoch": 2.8754700287547004, "grad_norm": 284.8264465332031, "learning_rate": 1.9444444444444445e-05, "loss": 91.2385, "step": 13000 }, { "epoch": 2.89758902897589, "grad_norm": 355.34674072265625, "learning_rate": 1.9166666666666667e-05, "loss": 53.8721, "step": 13100 }, { "epoch": 2.9197080291970803, "grad_norm": 760.095703125, "learning_rate": 1.888888888888889e-05, "loss": 70.6495, "step": 13200 }, { "epoch": 2.9418270294182705, "grad_norm": 163.60043334960938, "learning_rate": 1.861111111111111e-05, "loss": 61.0286, "step": 13300 }, { "epoch": 2.96394602963946, "grad_norm": 694.377197265625, "learning_rate": 1.8333333333333333e-05, "loss": 70.8661, "step": 13400 }, { "epoch": 2.9860650298606504, "grad_norm": 789.5768432617188, "learning_rate": 1.8055555555555555e-05, "loss": 51.5301, "step": 13500 }, { "epoch": 3.0, "eval_loss": 166.22483825683594, "eval_runtime": 20.711, "eval_samples_per_second": 97.05, "eval_steps_per_second": 24.287, "step": 13563 }, { "epoch": 3.00818403008184, "grad_norm": 588.47265625, "learning_rate": 1.777777777777778e-05, "loss": 60.6116, "step": 13600 }, { "epoch": 3.0303030303030303, "grad_norm": 637.5720825195312, "learning_rate": 1.75e-05, "loss": 75.5694, "step": 13700 }, { "epoch": 3.0524220305242205, "grad_norm": 271.2422180175781, "learning_rate": 1.7222222222222224e-05, "loss": 67.7535, "step": 13800 }, { "epoch": 3.07454103074541, "grad_norm": 1230.089111328125, "learning_rate": 1.6944444444444446e-05, "loss": 84.6008, "step": 13900 }, { "epoch": 3.0966600309666004, "grad_norm": 493.8188781738281, "learning_rate": 1.6666666666666667e-05, "loss": 57.6848, "step": 14000 }, { "epoch": 3.11877903118779, "grad_norm": 927.6559448242188, "learning_rate": 1.638888888888889e-05, "loss": 77.8659, "step": 14100 }, { "epoch": 3.1408980314089803, "grad_norm": 1547.560302734375, "learning_rate": 1.6111111111111115e-05, "loss": 56.6124, "step": 14200 }, { "epoch": 3.1630170316301705, "grad_norm": 1353.518798828125, "learning_rate": 1.5833333333333333e-05, "loss": 52.2098, "step": 14300 }, { "epoch": 3.18513603185136, "grad_norm": 20034.76171875, "learning_rate": 1.5555555555555555e-05, "loss": 47.6291, "step": 14400 }, { "epoch": 3.2072550320725504, "grad_norm": 295.83868408203125, "learning_rate": 1.527777777777778e-05, "loss": 76.8827, "step": 14500 }, { "epoch": 3.22937403229374, "grad_norm": 1203.1685791015625, "learning_rate": 1.5e-05, "loss": 75.1167, "step": 14600 }, { "epoch": 3.2514930325149303, "grad_norm": 514.1155395507812, "learning_rate": 1.4722222222222224e-05, "loss": 56.7415, "step": 14700 }, { "epoch": 3.2736120327361204, "grad_norm": 254.90188598632812, "learning_rate": 1.4444444444444444e-05, "loss": 65.6232, "step": 14800 }, { "epoch": 3.29573103295731, "grad_norm": 2475.375, "learning_rate": 1.4166666666666668e-05, "loss": 82.0212, "step": 14900 }, { "epoch": 3.3178500331785004, "grad_norm": 695.0991821289062, "learning_rate": 1.388888888888889e-05, "loss": 70.9937, "step": 15000 }, { "epoch": 3.33996903339969, "grad_norm": 3679.786376953125, "learning_rate": 1.3611111111111111e-05, "loss": 87.9886, "step": 15100 }, { "epoch": 3.3620880336208803, "grad_norm": 335.57244873046875, "learning_rate": 1.3333333333333333e-05, "loss": 81.2132, "step": 15200 }, { "epoch": 3.3842070338420704, "grad_norm": 3056.2080078125, "learning_rate": 1.3055555555555557e-05, "loss": 84.2817, "step": 15300 }, { "epoch": 3.40632603406326, "grad_norm": 633.110107421875, "learning_rate": 1.2777777777777777e-05, "loss": 91.3568, "step": 15400 }, { "epoch": 3.4284450342844504, "grad_norm": 1400.1513671875, "learning_rate": 1.25e-05, "loss": 64.4974, "step": 15500 }, { "epoch": 3.4505640345056405, "grad_norm": 366.1932678222656, "learning_rate": 1.2222222222222222e-05, "loss": 65.3457, "step": 15600 }, { "epoch": 3.4726830347268303, "grad_norm": 77.0812759399414, "learning_rate": 1.1944444444444446e-05, "loss": 94.2989, "step": 15700 }, { "epoch": 3.4948020349480204, "grad_norm": 517.035888671875, "learning_rate": 1.1666666666666668e-05, "loss": 54.1213, "step": 15800 }, { "epoch": 3.5169210351692106, "grad_norm": 582.3343505859375, "learning_rate": 1.138888888888889e-05, "loss": 80.1348, "step": 15900 }, { "epoch": 3.5390400353904004, "grad_norm": 134.23880004882812, "learning_rate": 1.1111111111111112e-05, "loss": 76.6385, "step": 16000 }, { "epoch": 3.56115903561159, "grad_norm": 341.63604736328125, "learning_rate": 1.0833333333333334e-05, "loss": 78.0171, "step": 16100 }, { "epoch": 3.5832780358327803, "grad_norm": 1244.135009765625, "learning_rate": 1.0555555555555555e-05, "loss": 51.3064, "step": 16200 }, { "epoch": 3.6053970360539704, "grad_norm": 916.8475952148438, "learning_rate": 1.0277777777777777e-05, "loss": 79.2429, "step": 16300 }, { "epoch": 3.6275160362751606, "grad_norm": 269.9622802734375, "learning_rate": 1e-05, "loss": 68.2658, "step": 16400 }, { "epoch": 3.6496350364963503, "grad_norm": 272.8192138671875, "learning_rate": 9.722222222222223e-06, "loss": 54.9957, "step": 16500 }, { "epoch": 3.6717540367175405, "grad_norm": 529.631591796875, "learning_rate": 9.444444444444445e-06, "loss": 55.4425, "step": 16600 }, { "epoch": 3.6938730369387303, "grad_norm": 478.92327880859375, "learning_rate": 9.166666666666666e-06, "loss": 49.7692, "step": 16700 }, { "epoch": 3.7159920371599204, "grad_norm": 499.35406494140625, "learning_rate": 8.88888888888889e-06, "loss": 46.7543, "step": 16800 }, { "epoch": 3.7381110373811106, "grad_norm": 179.26344299316406, "learning_rate": 8.611111111111112e-06, "loss": 64.0389, "step": 16900 }, { "epoch": 3.7602300376023003, "grad_norm": 1287.7215576171875, "learning_rate": 8.333333333333334e-06, "loss": 59.5381, "step": 17000 }, { "epoch": 3.7823490378234905, "grad_norm": 916.1105346679688, "learning_rate": 8.055555555555557e-06, "loss": 60.6414, "step": 17100 }, { "epoch": 3.8044680380446803, "grad_norm": 398.42724609375, "learning_rate": 7.777777777777777e-06, "loss": 56.4996, "step": 17200 }, { "epoch": 3.8265870382658704, "grad_norm": 466.2370910644531, "learning_rate": 7.5e-06, "loss": 55.1325, "step": 17300 }, { "epoch": 3.8487060384870606, "grad_norm": 426.53271484375, "learning_rate": 7.222222222222222e-06, "loss": 74.4013, "step": 17400 }, { "epoch": 3.8708250387082503, "grad_norm": 595.925537109375, "learning_rate": 6.944444444444445e-06, "loss": 65.174, "step": 17500 }, { "epoch": 3.8929440389294405, "grad_norm": 960.3365478515625, "learning_rate": 6.666666666666667e-06, "loss": 61.204, "step": 17600 }, { "epoch": 3.9150630391506303, "grad_norm": 488.77978515625, "learning_rate": 6.3888888888888885e-06, "loss": 64.5583, "step": 17700 }, { "epoch": 3.9371820393718204, "grad_norm": 386.27911376953125, "learning_rate": 6.111111111111111e-06, "loss": 53.6574, "step": 17800 }, { "epoch": 3.9593010395930106, "grad_norm": 741.3096313476562, "learning_rate": 5.833333333333334e-06, "loss": 56.0408, "step": 17900 }, { "epoch": 3.9814200398142003, "grad_norm": 125.60317993164062, "learning_rate": 5.555555555555556e-06, "loss": 51.7872, "step": 18000 }, { "epoch": 4.0, "eval_loss": 174.72474670410156, "eval_runtime": 20.9561, "eval_samples_per_second": 95.915, "eval_steps_per_second": 24.003, "step": 18084 }, { "epoch": 4.00353904003539, "grad_norm": 206.36895751953125, "learning_rate": 5.277777777777778e-06, "loss": 40.9699, "step": 18100 }, { "epoch": 4.02565804025658, "grad_norm": 399.98455810546875, "learning_rate": 5e-06, "loss": 56.6913, "step": 18200 }, { "epoch": 4.04777704047777, "grad_norm": 1047.0919189453125, "learning_rate": 4.722222222222222e-06, "loss": 48.5654, "step": 18300 }, { "epoch": 4.069896040698961, "grad_norm": 882.0901489257812, "learning_rate": 4.444444444444445e-06, "loss": 47.7457, "step": 18400 }, { "epoch": 4.092015040920151, "grad_norm": 515.3323364257812, "learning_rate": 4.166666666666667e-06, "loss": 66.938, "step": 18500 }, { "epoch": 4.11413404114134, "grad_norm": 1115.885009765625, "learning_rate": 3.888888888888889e-06, "loss": 58.0705, "step": 18600 }, { "epoch": 4.13625304136253, "grad_norm": 672.346435546875, "learning_rate": 3.611111111111111e-06, "loss": 68.3134, "step": 18700 }, { "epoch": 4.15837204158372, "grad_norm": 592.4734497070312, "learning_rate": 3.3333333333333333e-06, "loss": 55.2237, "step": 18800 }, { "epoch": 4.180491041804911, "grad_norm": 642.23583984375, "learning_rate": 3.0555555555555556e-06, "loss": 77.2789, "step": 18900 }, { "epoch": 4.202610042026101, "grad_norm": 595.8189086914062, "learning_rate": 2.777777777777778e-06, "loss": 54.4822, "step": 19000 }, { "epoch": 4.22472904224729, "grad_norm": 440.1129150390625, "learning_rate": 2.5e-06, "loss": 50.1236, "step": 19100 }, { "epoch": 4.24684804246848, "grad_norm": 8661.6494140625, "learning_rate": 2.2222222222222225e-06, "loss": 81.7971, "step": 19200 }, { "epoch": 4.26896704268967, "grad_norm": 214.70748901367188, "learning_rate": 1.9444444444444444e-06, "loss": 51.6579, "step": 19300 }, { "epoch": 4.291086042910861, "grad_norm": 724.3253173828125, "learning_rate": 1.6666666666666667e-06, "loss": 68.406, "step": 19400 }, { "epoch": 4.313205043132051, "grad_norm": 1145.943115234375, "learning_rate": 1.388888888888889e-06, "loss": 62.2609, "step": 19500 }, { "epoch": 4.33532404335324, "grad_norm": 1727.2354736328125, "learning_rate": 1.1111111111111112e-06, "loss": 67.2163, "step": 19600 }, { "epoch": 4.35744304357443, "grad_norm": 734.8032836914062, "learning_rate": 8.333333333333333e-07, "loss": 57.1625, "step": 19700 }, { "epoch": 4.37956204379562, "grad_norm": 1056.7657470703125, "learning_rate": 5.555555555555556e-07, "loss": 61.3986, "step": 19800 }, { "epoch": 4.401681044016811, "grad_norm": 550.4881591796875, "learning_rate": 2.777777777777778e-07, "loss": 60.5436, "step": 19900 }, { "epoch": 4.423800044238001, "grad_norm": 435.5983581542969, "learning_rate": 0.0, "loss": 60.0424, "step": 20000 } ], "logging_steps": 100, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }