{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.423800044238001,
  "eval_steps": 500,
  "global_step": 20000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.022119000221190004, "grad_norm": 329782.40625, "learning_rate": 2.5e-06, "loss": 11183.8312, "step": 100 },
    { "epoch": 0.04423800044238001, "grad_norm": 578.6302490234375, "learning_rate": 5e-06, "loss": 501.9779, "step": 200 },
    { "epoch": 0.06635700066357, "grad_norm": 3675.665283203125, "learning_rate": 7.5e-06, "loss": 299.6076, "step": 300 },
    { "epoch": 0.08847600088476001, "grad_norm": 2041.4144287109375, "learning_rate": 1e-05, "loss": 327.6098, "step": 400 },
    { "epoch": 0.11059500110595001, "grad_norm": 229.52532958984375, "learning_rate": 1.25e-05, "loss": 383.8903, "step": 500 },
    { "epoch": 0.13271400132714, "grad_norm": 839.3260498046875, "learning_rate": 1.5e-05, "loss": 229.51, "step": 600 },
    { "epoch": 0.15483300154833002, "grad_norm": 2449.822021484375, "learning_rate": 1.75e-05, "loss": 393.3861, "step": 700 },
    { "epoch": 0.17695200176952003, "grad_norm": 596.9613037109375, "learning_rate": 2e-05, "loss": 170.8953, "step": 800 },
    { "epoch": 0.19907100199071, "grad_norm": 718.4219970703125, "learning_rate": 2.25e-05, "loss": 270.5029, "step": 900 },
    { "epoch": 0.22119000221190002, "grad_norm": 765.644775390625, "learning_rate": 2.5e-05, "loss": 240.2428, "step": 1000 },
    { "epoch": 0.24330900243309003, "grad_norm": 757.395751953125, "learning_rate": 2.7500000000000004e-05, "loss": 202.8483, "step": 1100 },
    { "epoch": 0.26542800265428, "grad_norm": 2641.68408203125, "learning_rate": 3e-05, "loss": 197.4493, "step": 1200 },
    { "epoch": 0.28754700287547, "grad_norm": 295.59307861328125, "learning_rate": 3.2500000000000004e-05, "loss": 166.7725, "step": 1300 },
    { "epoch": 0.30966600309666004, "grad_norm": 735.7752075195312, "learning_rate": 3.5e-05, "loss": 157.2355, "step": 1400 },
    { "epoch": 0.33178500331785005, "grad_norm": 596.8727416992188, "learning_rate": 3.7500000000000003e-05, "loss": 206.7806, "step": 1500 },
    { "epoch": 0.35390400353904006, "grad_norm": 1486.5274658203125, "learning_rate": 4e-05, "loss": 187.7721, "step": 1600 },
    { "epoch": 0.37602300376023, "grad_norm": 1032.82763671875, "learning_rate": 4.25e-05, "loss": 186.7065, "step": 1700 },
    { "epoch": 0.39814200398142, "grad_norm": 563.8107299804688, "learning_rate": 4.5e-05, "loss": 182.7577, "step": 1800 },
    { "epoch": 0.42026100420261003, "grad_norm": 287.0451354980469, "learning_rate": 4.75e-05, "loss": 279.2049, "step": 1900 },
    { "epoch": 0.44238000442380004, "grad_norm": 984.99853515625, "learning_rate": 5e-05, "loss": 126.9518, "step": 2000 },
    { "epoch": 0.46449900464499005, "grad_norm": 3091.0703125, "learning_rate": 4.972222222222223e-05, "loss": 172.3027, "step": 2100 },
    { "epoch": 0.48661800486618007, "grad_norm": 3833.43798828125, "learning_rate": 4.9444444444444446e-05, "loss": 252.3367, "step": 2200 },
    { "epoch": 0.5087370050873701, "grad_norm": 765.114013671875, "learning_rate": 4.9166666666666665e-05, "loss": 226.8686, "step": 2300 },
    { "epoch": 0.53085600530856, "grad_norm": 452.7674255371094, "learning_rate": 4.888888888888889e-05, "loss": 134.841, "step": 2400 },
    { "epoch": 0.5529750055297501, "grad_norm": 836.827880859375, "learning_rate": 4.8611111111111115e-05, "loss": 225.3301, "step": 2500 },
    { "epoch": 0.57509400575094, "grad_norm": 363.256591796875, "learning_rate": 4.8333333333333334e-05, "loss": 148.2828, "step": 2600 },
    { "epoch": 0.59721300597213, "grad_norm": 501.0595397949219, "learning_rate": 4.805555555555556e-05, "loss": 93.7487, "step": 2700 },
    { "epoch": 0.6193320061933201, "grad_norm": 334.7666931152344, "learning_rate": 4.7777777777777784e-05, "loss": 88.7822, "step": 2800 },
    { "epoch": 0.64145100641451, "grad_norm": 503.07720947265625, "learning_rate": 4.75e-05, "loss": 253.4651, "step": 2900 },
    { "epoch": 0.6635700066357001, "grad_norm": 53895.6015625, "learning_rate": 4.722222222222222e-05, "loss": 147.7988, "step": 3000 },
    { "epoch": 0.68568900685689, "grad_norm": 828.54296875, "learning_rate": 4.6944444444444446e-05, "loss": 138.5155, "step": 3100 },
    { "epoch": 0.7078080070780801, "grad_norm": 278.7898864746094, "learning_rate": 4.666666666666667e-05, "loss": 132.9807, "step": 3200 },
    { "epoch": 0.7299270072992701, "grad_norm": 330.3460693359375, "learning_rate": 4.638888888888889e-05, "loss": 118.2184, "step": 3300 },
    { "epoch": 0.75204600752046, "grad_norm": 1073.325439453125, "learning_rate": 4.6111111111111115e-05, "loss": 131.6948, "step": 3400 },
    { "epoch": 0.7741650077416501, "grad_norm": 679.9262084960938, "learning_rate": 4.5833333333333334e-05, "loss": 116.4453, "step": 3500 },
    { "epoch": 0.79628400796284, "grad_norm": 828.5316162109375, "learning_rate": 4.555555555555556e-05, "loss": 101.1161, "step": 3600 },
    { "epoch": 0.8184030081840301, "grad_norm": 26458.435546875, "learning_rate": 4.527777777777778e-05, "loss": 210.2304, "step": 3700 },
    { "epoch": 0.8405220084052201, "grad_norm": 485.78900146484375, "learning_rate": 4.5e-05, "loss": 138.0961, "step": 3800 },
    { "epoch": 0.8626410086264101, "grad_norm": 2006.3243408203125, "learning_rate": 4.472222222222223e-05, "loss": 88.3029, "step": 3900 },
    { "epoch": 0.8847600088476001, "grad_norm": 7547.01123046875, "learning_rate": 4.4444444444444447e-05, "loss": 134.9624, "step": 4000 },
    { "epoch": 0.90687900906879, "grad_norm": 778.521728515625, "learning_rate": 4.4166666666666665e-05, "loss": 115.0598, "step": 4100 },
    { "epoch": 0.9289980092899801, "grad_norm": 593.7921752929688, "learning_rate": 4.388888888888889e-05, "loss": 161.5183, "step": 4200 },
    { "epoch": 0.9511170095111701, "grad_norm": 2227.79248046875, "learning_rate": 4.3611111111111116e-05, "loss": 111.9892, "step": 4300 },
    { "epoch": 0.9732360097323601, "grad_norm": 422.0806579589844, "learning_rate": 4.3333333333333334e-05, "loss": 76.96, "step": 4400 },
    { "epoch": 0.9953550099535501, "grad_norm": 922.49169921875, "learning_rate": 4.305555555555556e-05, "loss": 64.7625, "step": 4500 },
    { "epoch": 1.0, "eval_loss": 218.93350219726562, "eval_runtime": 62.6127, "eval_samples_per_second": 32.102, "eval_steps_per_second": 8.034, "step": 4521 },
    { "epoch": 1.0174740101747402, "grad_norm": 1710.8536376953125, "learning_rate": 4.277777777777778e-05, "loss": 88.4653, "step": 4600 },
    { "epoch": 1.03959301039593, "grad_norm": 2913.376708984375, "learning_rate": 4.25e-05, "loss": 71.536, "step": 4700 },
    { "epoch": 1.06171201061712, "grad_norm": 170.909912109375, "learning_rate": 4.222222222222222e-05, "loss": 98.4133, "step": 4800 },
    { "epoch": 1.08383101083831, "grad_norm": 1084.526611328125, "learning_rate": 4.194444444444445e-05, "loss": 136.51, "step": 4900 },
    { "epoch": 1.1059500110595002, "grad_norm": 1848.7337646484375, "learning_rate": 4.166666666666667e-05, "loss": 92.1182, "step": 5000 },
    { "epoch": 1.1280690112806901, "grad_norm": 446.60064697265625, "learning_rate": 4.138888888888889e-05, "loss": 90.7799, "step": 5100 },
    { "epoch": 1.15018801150188, "grad_norm": 1472.4832763671875, "learning_rate": 4.111111111111111e-05, "loss": 91.3228, "step": 5200 },
    { "epoch": 1.17230701172307, "grad_norm": 2101.265625, "learning_rate": 4.0833333333333334e-05, "loss": 95.7063, "step": 5300 },
    { "epoch": 1.1944260119442602, "grad_norm": 99.53836059570312, "learning_rate": 4.055555555555556e-05, "loss": 74.0728, "step": 5400 },
    { "epoch": 1.2165450121654502, "grad_norm": 4299.07958984375, "learning_rate": 4.027777777777778e-05, "loss": 98.044, "step": 5500 },
    { "epoch": 1.2386640123866401, "grad_norm": 574.811279296875, "learning_rate": 4e-05, "loss": 83.4273, "step": 5600 },
    { "epoch": 1.26078301260783, "grad_norm": 792.8590087890625, "learning_rate": 3.972222222222222e-05, "loss": 138.9129, "step": 5700 },
    { "epoch": 1.28290201282902, "grad_norm": 1043.4178466796875, "learning_rate": 3.944444444444445e-05, "loss": 82.8142, "step": 5800 },
    { "epoch": 1.3050210130502102, "grad_norm": 793.00830078125, "learning_rate": 3.9166666666666665e-05, "loss": 80.1177, "step": 5900 },
    { "epoch": 1.3271400132714002, "grad_norm": 645.7152709960938, "learning_rate": 3.888888888888889e-05, "loss": 67.7325, "step": 6000 },
    { "epoch": 1.3492590134925901, "grad_norm": 284.2798767089844, "learning_rate": 3.8611111111111116e-05, "loss": 115.4644, "step": 6100 },
    { "epoch": 1.37137801371378, "grad_norm": 1235.421142578125, "learning_rate": 3.8333333333333334e-05, "loss": 85.7536, "step": 6200 },
    { "epoch": 1.39349701393497, "grad_norm": 861.7305908203125, "learning_rate": 3.805555555555555e-05, "loss": 72.2837, "step": 6300 },
    { "epoch": 1.4156160141561602, "grad_norm": 2955.749755859375, "learning_rate": 3.777777777777778e-05, "loss": 88.7108, "step": 6400 },
    { "epoch": 1.4377350143773502, "grad_norm": 131.04417419433594, "learning_rate": 3.7500000000000003e-05, "loss": 55.0814, "step": 6500 },
    { "epoch": 1.4598540145985401, "grad_norm": 1373.429931640625, "learning_rate": 3.722222222222222e-05, "loss": 75.0703, "step": 6600 },
    { "epoch": 1.48197301481973, "grad_norm": 515.3665771484375, "learning_rate": 3.694444444444445e-05, "loss": 91.6801, "step": 6700 },
    { "epoch": 1.50409201504092, "grad_norm": 3621.946044921875, "learning_rate": 3.6666666666666666e-05, "loss": 120.4573, "step": 6800 },
    { "epoch": 1.5262110152621102, "grad_norm": 139.5118408203125, "learning_rate": 3.638888888888889e-05, "loss": 59.9174, "step": 6900 },
    { "epoch": 1.5483300154833002, "grad_norm": 6172.529296875, "learning_rate": 3.611111111111111e-05, "loss": 74.676, "step": 7000 },
    { "epoch": 1.5704490157044901, "grad_norm": 1593.488037109375, "learning_rate": 3.5833333333333335e-05, "loss": 72.2317, "step": 7100 },
    { "epoch": 1.5925680159256803, "grad_norm": 1400.1859130859375, "learning_rate": 3.555555555555556e-05, "loss": 93.2606, "step": 7200 },
    { "epoch": 1.61468701614687, "grad_norm": 687.2742309570312, "learning_rate": 3.527777777777778e-05, "loss": 83.7336, "step": 7300 },
    { "epoch": 1.6368060163680602, "grad_norm": 237.36953735351562, "learning_rate": 3.5e-05, "loss": 68.9703, "step": 7400 },
    { "epoch": 1.6589250165892502, "grad_norm": 2617.44482421875, "learning_rate": 3.472222222222222e-05, "loss": 55.2706, "step": 7500 },
    { "epoch": 1.6810440168104401, "grad_norm": 164.34164428710938, "learning_rate": 3.444444444444445e-05, "loss": 64.0422, "step": 7600 },
    { "epoch": 1.7031630170316303, "grad_norm": 550.0332641601562, "learning_rate": 3.4166666666666666e-05, "loss": 93.087, "step": 7700 },
    { "epoch": 1.72528201725282, "grad_norm": 411.0719909667969, "learning_rate": 3.388888888888889e-05, "loss": 82.8394, "step": 7800 },
    { "epoch": 1.7474010174740102, "grad_norm": 383.0977783203125, "learning_rate": 3.3611111111111116e-05, "loss": 95.9364, "step": 7900 },
    { "epoch": 1.7695200176952002, "grad_norm": 100.02352905273438, "learning_rate": 3.3333333333333335e-05, "loss": 74.7701, "step": 8000 },
    { "epoch": 1.7916390179163901, "grad_norm": 930.63818359375, "learning_rate": 3.3055555555555553e-05, "loss": 97.197, "step": 8100 },
    { "epoch": 1.8137580181375803, "grad_norm": 201.84938049316406, "learning_rate": 3.277777777777778e-05, "loss": 78.3827, "step": 8200 },
    { "epoch": 1.83587701835877, "grad_norm": 429.5751953125, "learning_rate": 3.2500000000000004e-05, "loss": 87.7946, "step": 8300 },
    { "epoch": 1.8579960185799602, "grad_norm": 839.8375854492188, "learning_rate": 3.222222222222223e-05, "loss": 101.715, "step": 8400 },
    { "epoch": 1.8801150188011502, "grad_norm": 723.384765625, "learning_rate": 3.194444444444444e-05, "loss": 87.1683, "step": 8500 },
    { "epoch": 1.9022340190223401, "grad_norm": 695.84619140625, "learning_rate": 3.1666666666666666e-05, "loss": 53.4962, "step": 8600 },
    { "epoch": 1.9243530192435303, "grad_norm": 447.4176330566406, "learning_rate": 3.138888888888889e-05, "loss": 72.4244, "step": 8700 },
    { "epoch": 1.94647201946472, "grad_norm": 418.347900390625, "learning_rate": 3.111111111111111e-05, "loss": 84.3226, "step": 8800 },
    { "epoch": 1.9685910196859102, "grad_norm": 616.8546752929688, "learning_rate": 3.0833333333333335e-05, "loss": 57.9826, "step": 8900 },
    { "epoch": 1.9907100199071002, "grad_norm": 664.2003784179688, "learning_rate": 3.055555555555556e-05, "loss": 51.4834, "step": 9000 },
    { "epoch": 2.0, "eval_loss": 142.8988037109375, "eval_runtime": 61.1029, "eval_samples_per_second": 32.895, "eval_steps_per_second": 8.232, "step": 9042 },
    { "epoch": 2.01282902012829, "grad_norm": 418.1151123046875, "learning_rate": 3.0277777777777776e-05, "loss": 73.7381, "step": 9100 },
    { "epoch": 2.0349480203494803, "grad_norm": 404.5308532714844, "learning_rate": 3e-05, "loss": 58.1056, "step": 9200 },
    { "epoch": 2.05706702057067, "grad_norm": 2588.05322265625, "learning_rate": 2.9722222222222223e-05, "loss": 64.0834, "step": 9300 },
    { "epoch": 2.07918602079186, "grad_norm": 145.60726928710938, "learning_rate": 2.9444444444444448e-05, "loss": 62.3793, "step": 9400 },
    { "epoch": 2.1013050210130504, "grad_norm": 950.2636108398438, "learning_rate": 2.916666666666667e-05, "loss": 67.93, "step": 9500 },
    { "epoch": 2.12342402123424, "grad_norm": 557.125244140625, "learning_rate": 2.8888888888888888e-05, "loss": 54.7434, "step": 9600 },
    { "epoch": 2.1455430214554303, "grad_norm": 295.7925109863281, "learning_rate": 2.861111111111111e-05, "loss": 65.4703, "step": 9700 },
    { "epoch": 2.16766202167662, "grad_norm": 193.5789031982422, "learning_rate": 2.8333333333333335e-05, "loss": 58.263, "step": 9800 },
    { "epoch": 2.18978102189781, "grad_norm": 512.2490844726562, "learning_rate": 2.8055555555555557e-05, "loss": 47.4564, "step": 9900 },
    { "epoch": 2.2119000221190004, "grad_norm": 1932.7911376953125, "learning_rate": 2.777777777777778e-05, "loss": 61.474, "step": 10000 },
    { "epoch": 2.23401902234019, "grad_norm": 659.2971801757812, "learning_rate": 2.7500000000000004e-05, "loss": 38.0625, "step": 10100 },
    { "epoch": 2.2561380225613803, "grad_norm": 107.96562194824219, "learning_rate": 2.7222222222222223e-05, "loss": 75.2617, "step": 10200 },
    { "epoch": 2.2782570227825705, "grad_norm": 3581.160888671875, "learning_rate": 2.6944444444444445e-05, "loss": 83.6121, "step": 10300 },
    { "epoch": 2.30037602300376, "grad_norm": 1640.62158203125, "learning_rate": 2.6666666666666667e-05, "loss": 57.5173, "step": 10400 },
    { "epoch": 2.3224950232249504, "grad_norm": 758.580078125, "learning_rate": 2.6388888888888892e-05, "loss": 66.4868, "step": 10500 },
    { "epoch": 2.34461402344614, "grad_norm": 842.703857421875, "learning_rate": 2.6111111111111114e-05, "loss": 68.3939, "step": 10600 },
    { "epoch": 2.3667330236673303, "grad_norm": 452.1793518066406, "learning_rate": 2.5833333333333336e-05, "loss": 57.7072, "step": 10700 },
    { "epoch": 2.3888520238885205, "grad_norm": 357.23828125, "learning_rate": 2.5555555555555554e-05, "loss": 54.2972, "step": 10800 },
    { "epoch": 2.41097102410971, "grad_norm": 2001.415771484375, "learning_rate": 2.527777777777778e-05, "loss": 73.0164, "step": 10900 },
    { "epoch": 2.4330900243309004, "grad_norm": 2387.650634765625, "learning_rate": 2.5e-05, "loss": 56.2423, "step": 11000 },
    { "epoch": 2.45520902455209, "grad_norm": 1005.8760375976562, "learning_rate": 2.4722222222222223e-05, "loss": 57.5367, "step": 11100 },
    { "epoch": 2.4773280247732803, "grad_norm": 711.6175537109375, "learning_rate": 2.4444444444444445e-05, "loss": 54.2811, "step": 11200 },
    { "epoch": 2.4994470249944705, "grad_norm": 384.83013916015625, "learning_rate": 2.4166666666666667e-05, "loss": 58.3445, "step": 11300 },
    { "epoch": 2.52156602521566, "grad_norm": 280.5820007324219, "learning_rate": 2.3888888888888892e-05, "loss": 52.6407, "step": 11400 },
    { "epoch": 2.5436850254368504, "grad_norm": 3489.7578125, "learning_rate": 2.361111111111111e-05, "loss": 77.9429, "step": 11500 },
    { "epoch": 2.56580402565804, "grad_norm": 2384.225341796875, "learning_rate": 2.3333333333333336e-05, "loss": 60.7436, "step": 11600 },
    { "epoch": 2.5879230258792303, "grad_norm": 287.5812072753906, "learning_rate": 2.3055555555555558e-05, "loss": 42.1971, "step": 11700 },
    { "epoch": 2.6100420261004205, "grad_norm": 1218.7347412109375, "learning_rate": 2.277777777777778e-05, "loss": 46.3975, "step": 11800 },
    { "epoch": 2.63216102632161, "grad_norm": 1416.937255859375, "learning_rate": 2.25e-05, "loss": 59.6613, "step": 11900 },
    { "epoch": 2.6542800265428004, "grad_norm": 1019.8450317382812, "learning_rate": 2.2222222222222223e-05, "loss": 38.9072, "step": 12000 },
    { "epoch": 2.67639902676399, "grad_norm": 195.94032287597656, "learning_rate": 2.1944444444444445e-05, "loss": 61.2397, "step": 12100 },
    { "epoch": 2.6985180269851803, "grad_norm": 352.8352355957031, "learning_rate": 2.1666666666666667e-05, "loss": 58.8343, "step": 12200 },
    { "epoch": 2.7206370272063705, "grad_norm": 1884.443115234375, "learning_rate": 2.138888888888889e-05, "loss": 63.6136, "step": 12300 },
    { "epoch": 2.74275602742756, "grad_norm": 599.9818725585938, "learning_rate": 2.111111111111111e-05, "loss": 39.0019, "step": 12400 },
    { "epoch": 2.7648750276487504, "grad_norm": 887.4273071289062, "learning_rate": 2.0833333333333336e-05, "loss": 58.0432, "step": 12500 },
    { "epoch": 2.78699402786994, "grad_norm": 363.2604675292969, "learning_rate": 2.0555555555555555e-05, "loss": 67.5791, "step": 12600 },
    { "epoch": 2.8091130280911303, "grad_norm": 199.94940185546875, "learning_rate": 2.027777777777778e-05, "loss": 56.6002, "step": 12700 },
    { "epoch": 2.8312320283123205, "grad_norm": 945.8538208007812, "learning_rate": 2e-05, "loss": 43.1726, "step": 12800 },
    { "epoch": 2.85335102853351, "grad_norm": 1024.704345703125, "learning_rate": 1.9722222222222224e-05, "loss": 38.9053, "step": 12900 },
    { "epoch": 2.8754700287547004, "grad_norm": 1009.4866943359375, "learning_rate": 1.9444444444444445e-05, "loss": 58.8109, "step": 13000 },
    { "epoch": 2.89758902897589, "grad_norm": 537.6126098632812, "learning_rate": 1.9166666666666667e-05, "loss": 73.4815, "step": 13100 },
    { "epoch": 2.9197080291970803, "grad_norm": 269.6388244628906, "learning_rate": 1.888888888888889e-05, "loss": 55.6755, "step": 13200 },
    { "epoch": 2.9418270294182705, "grad_norm": 1045.892578125, "learning_rate": 1.861111111111111e-05, "loss": 52.2818, "step": 13300 },
    { "epoch": 2.96394602963946, "grad_norm": 2102.285400390625, "learning_rate": 1.8333333333333333e-05, "loss": 47.7583, "step": 13400 },
    { "epoch": 2.9860650298606504, "grad_norm": 565.085693359375, "learning_rate": 1.8055555555555555e-05, "loss": 49.5788, "step": 13500 },
    { "epoch": 3.0, "eval_loss": 122.14683532714844, "eval_runtime": 61.9195, "eval_samples_per_second": 32.462, "eval_steps_per_second": 8.123, "step": 13563 },
    { "epoch": 3.00818403008184, "grad_norm": 1329.616455078125, "learning_rate": 1.777777777777778e-05, "loss": 35.1822, "step": 13600 },
    { "epoch": 3.0303030303030303, "grad_norm": 206.73377990722656, "learning_rate": 1.75e-05, "loss": 45.8486, "step": 13700 },
    { "epoch": 3.0524220305242205, "grad_norm": 171.621826171875, "learning_rate": 1.7222222222222224e-05, "loss": 51.2093, "step": 13800 },
    { "epoch": 3.07454103074541, "grad_norm": 707.8844604492188, "learning_rate": 1.6944444444444446e-05, "loss": 30.8107, "step": 13900 },
    { "epoch": 3.0966600309666004, "grad_norm": 454.2152099609375, "learning_rate": 1.6666666666666667e-05, "loss": 44.615, "step": 14000 },
    { "epoch": 3.11877903118779, "grad_norm": 527.0735473632812, "learning_rate": 1.638888888888889e-05, "loss": 68.0012, "step": 14100 },
    { "epoch": 3.1408980314089803, "grad_norm": 378.765869140625, "learning_rate": 1.6111111111111115e-05, "loss": 37.9635, "step": 14200 },
    { "epoch": 3.1630170316301705, "grad_norm": 637.0385131835938, "learning_rate": 1.5833333333333333e-05, "loss": 47.3296, "step": 14300 },
    { "epoch": 3.18513603185136, "grad_norm": 409.57025146484375, "learning_rate": 1.5555555555555555e-05, "loss": 40.9838, "step": 14400 },
    { "epoch": 3.2072550320725504, "grad_norm": 809.7942504882812, "learning_rate": 1.527777777777778e-05, "loss": 46.4379, "step": 14500 },
    { "epoch": 3.22937403229374, "grad_norm": 23.185964584350586, "learning_rate": 1.5e-05, "loss": 31.5362, "step": 14600 },
    { "epoch": 3.2514930325149303, "grad_norm": 1714.423095703125, "learning_rate": 1.4722222222222224e-05, "loss": 52.3923, "step": 14700 },
    { "epoch": 3.2736120327361204, "grad_norm": 395.1560363769531, "learning_rate": 1.4444444444444444e-05, "loss": 49.203, "step": 14800 },
    { "epoch": 3.29573103295731, "grad_norm": 376.0379638671875, "learning_rate": 1.4166666666666668e-05, "loss": 30.0454, "step": 14900 },
    { "epoch": 3.3178500331785004, "grad_norm": 815.3323364257812, "learning_rate": 1.388888888888889e-05, "loss": 43.2695, "step": 15000 },
    { "epoch": 3.33996903339969, "grad_norm": 332.2477722167969, "learning_rate": 1.3611111111111111e-05, "loss": 39.2527, "step": 15100 },
    { "epoch": 3.3620880336208803, "grad_norm": 983.4076538085938, "learning_rate": 1.3333333333333333e-05, "loss": 61.8092, "step": 15200 },
    { "epoch": 3.3842070338420704, "grad_norm": 321.3147888183594, "learning_rate": 1.3055555555555557e-05, "loss": 57.5652, "step": 15300 },
    { "epoch": 3.40632603406326, "grad_norm": 477.24224853515625, "learning_rate": 1.2777777777777777e-05, "loss": 53.3966, "step": 15400 },
    { "epoch": 3.4284450342844504, "grad_norm": 1399.400146484375, "learning_rate": 1.25e-05, "loss": 65.4909, "step": 15500 },
    { "epoch": 3.4505640345056405, "grad_norm": 441.70849609375, "learning_rate": 1.2222222222222222e-05, "loss": 56.8207, "step": 15600 },
    { "epoch": 3.4726830347268303, "grad_norm": 373.0791931152344, "learning_rate": 1.1944444444444446e-05, "loss": 54.7861, "step": 15700 },
    { "epoch": 3.4948020349480204, "grad_norm": 855.823974609375, "learning_rate": 1.1666666666666668e-05, "loss": 43.0153, "step": 15800 },
    { "epoch": 3.5169210351692106, "grad_norm": 1049.0374755859375, "learning_rate": 1.138888888888889e-05, "loss": 55.1925, "step": 15900 },
    { "epoch": 3.5390400353904004, "grad_norm": 1020.545654296875, "learning_rate": 1.1111111111111112e-05, "loss": 35.9507, "step": 16000 },
    { "epoch": 3.56115903561159, "grad_norm": 1310.59033203125, "learning_rate": 1.0833333333333334e-05, "loss": 40.9387, "step": 16100 },
    { "epoch": 3.5832780358327803, "grad_norm": 212.0251007080078, "learning_rate": 1.0555555555555555e-05, "loss": 37.7763, "step": 16200 },
    { "epoch": 3.6053970360539704, "grad_norm": 360.9208984375, "learning_rate": 1.0277777777777777e-05, "loss": 34.6411, "step": 16300 },
    { "epoch": 3.6275160362751606, "grad_norm": 973.3055419921875, "learning_rate": 1e-05, "loss": 35.7516, "step": 16400 },
    { "epoch": 3.6496350364963503, "grad_norm": 2761.760498046875, "learning_rate": 9.722222222222223e-06, "loss": 43.8869, "step": 16500 },
    { "epoch": 3.6717540367175405, "grad_norm": 905.3215942382812, "learning_rate": 9.444444444444445e-06, "loss": 25.8201, "step": 16600 },
    { "epoch": 3.6938730369387303, "grad_norm": 252.27920532226562, "learning_rate": 9.166666666666666e-06, "loss": 60.4341, "step": 16700 },
    { "epoch": 3.7159920371599204, "grad_norm": 343.59344482421875, "learning_rate": 8.88888888888889e-06, "loss": 36.6514, "step": 16800 },
    { "epoch": 3.7381110373811106, "grad_norm": 922.6008911132812, "learning_rate": 8.611111111111112e-06, "loss": 47.1865, "step": 16900 },
    { "epoch": 3.7602300376023003, "grad_norm": 68.8092269897461, "learning_rate": 8.333333333333334e-06, "loss": 48.1257, "step": 17000 },
    { "epoch": 3.7823490378234905, "grad_norm": 785.6138305664062, "learning_rate": 8.055555555555557e-06, "loss": 56.9577, "step": 17100 },
    { "epoch": 3.8044680380446803, "grad_norm": 746.76416015625, "learning_rate": 7.777777777777777e-06, "loss": 39.9381, "step": 17200 },
    { "epoch": 3.8265870382658704, "grad_norm": 362.0380554199219, "learning_rate": 7.5e-06, "loss": 34.4047, "step": 17300 },
    { "epoch": 3.8487060384870606, "grad_norm": 180.6659698486328, "learning_rate": 7.222222222222222e-06, "loss": 31.7799, "step": 17400 },
    { "epoch": 3.8708250387082503, "grad_norm": 298.1046447753906, "learning_rate": 6.944444444444445e-06, "loss": 47.4674, "step": 17500 },
    { "epoch": 3.8929440389294405, "grad_norm": 1920.2069091796875, "learning_rate": 6.666666666666667e-06, "loss": 49.4887, "step": 17600 },
    { "epoch": 3.9150630391506303, "grad_norm": 2852.952392578125, "learning_rate": 6.3888888888888885e-06, "loss": 48.4204, "step": 17700 },
    { "epoch": 3.9371820393718204, "grad_norm": 496.2744445800781, "learning_rate": 6.111111111111111e-06, "loss": 37.0487, "step": 17800 },
    { "epoch": 3.9593010395930106, "grad_norm": 812.6398315429688, "learning_rate": 5.833333333333334e-06, "loss": 52.909, "step": 17900 },
    { "epoch": 3.9814200398142003, "grad_norm": 705.7808837890625, "learning_rate": 5.555555555555556e-06, "loss": 23.5713, "step": 18000 },
    { "epoch": 4.0, "eval_loss": 119.30766296386719, "eval_runtime": 61.5463, "eval_samples_per_second": 32.658, "eval_steps_per_second": 8.173, "step": 18084 },
    { "epoch": 4.00353904003539, "grad_norm": 192.50172424316406, "learning_rate": 5.277777777777778e-06, "loss": 43.4573, "step": 18100 },
    { "epoch": 4.02565804025658, "grad_norm": 657.6646118164062, "learning_rate": 5e-06, "loss": 33.3628, "step": 18200 },
    { "epoch": 4.04777704047777, "grad_norm": 1364.3040771484375, "learning_rate": 4.722222222222222e-06, "loss": 46.6342, "step": 18300 },
    { "epoch": 4.069896040698961, "grad_norm": 553.6497802734375, "learning_rate": 4.444444444444445e-06, "loss": 47.8158, "step": 18400 },
    { "epoch": 4.092015040920151, "grad_norm": 86.2293472290039, "learning_rate": 4.166666666666667e-06, "loss": 50.5882, "step": 18500 },
    { "epoch": 4.11413404114134, "grad_norm": 860.7024536132812, "learning_rate": 3.888888888888889e-06, "loss": 35.6031, "step": 18600 },
    { "epoch": 4.13625304136253, "grad_norm": 646.49267578125, "learning_rate": 3.611111111111111e-06, "loss": 26.6029, "step": 18700 },
    { "epoch": 4.15837204158372, "grad_norm": 586.1218872070312, "learning_rate": 3.3333333333333333e-06, "loss": 42.9749, "step": 18800 },
    { "epoch": 4.180491041804911, "grad_norm": 973.4674072265625, "learning_rate": 3.0555555555555556e-06, "loss": 37.9648, "step": 18900 },
    { "epoch": 4.202610042026101, "grad_norm": 694.5072631835938, "learning_rate": 2.777777777777778e-06, "loss": 40.874, "step": 19000 },
    { "epoch": 4.22472904224729, "grad_norm": 960.8961791992188, "learning_rate": 2.5e-06, "loss": 38.9788, "step": 19100 },
    { "epoch": 4.24684804246848, "grad_norm": 494.7688293457031, "learning_rate": 2.2222222222222225e-06, "loss": 41.4962, "step": 19200 },
    { "epoch": 4.26896704268967, "grad_norm": 1084.4683837890625, "learning_rate": 1.9444444444444444e-06, "loss": 37.17, "step": 19300 },
    { "epoch": 4.291086042910861, "grad_norm": 961.61572265625, "learning_rate": 1.6666666666666667e-06, "loss": 27.5056, "step": 19400 },
    { "epoch": 4.313205043132051, "grad_norm": 596.8368530273438, "learning_rate": 1.388888888888889e-06, "loss": 36.0704, "step": 19500 },
    { "epoch": 4.33532404335324, "grad_norm": 202.31369018554688, "learning_rate": 1.1111111111111112e-06, "loss": 37.608, "step": 19600 },
    { "epoch": 4.35744304357443, "grad_norm": 461.15460205078125, "learning_rate": 8.333333333333333e-07, "loss": 31.0886, "step": 19700 },
    { "epoch": 4.37956204379562, "grad_norm": 995.475830078125, "learning_rate": 5.555555555555556e-07, "loss": 31.7115, "step": 19800 },
    { "epoch": 4.401681044016811, "grad_norm": 435.1724548339844, "learning_rate": 2.777777777777778e-07, "loss": 39.3167, "step": 19900 },
    { "epoch": 4.423800044238001, "grad_norm": 1266.6600341796875, "learning_rate": 0.0, "loss": 39.4665, "step": 20000 }
  ],
  "logging_steps": 100,
  "max_steps": 20000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 10000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}