|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 19.908045977011493, |
|
"eval_steps": 500, |
|
"global_step": 1732, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.11494252873563218, |
|
"grad_norm": 5.833221435546875, |
|
"learning_rate": 2.2988505747126437e-05, |
|
"loss": 1.1691, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.22988505747126436, |
|
"grad_norm": 3.3890936374664307, |
|
"learning_rate": 4.597701149425287e-05, |
|
"loss": 0.4962, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.3448275862068966, |
|
"grad_norm": 1.4635858535766602, |
|
"learning_rate": 6.896551724137931e-05, |
|
"loss": 0.2852, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.45977011494252873, |
|
"grad_norm": 1.3263287544250488, |
|
"learning_rate": 9.195402298850575e-05, |
|
"loss": 0.2068, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.5747126436781609, |
|
"grad_norm": 1.4933586120605469, |
|
"learning_rate": 0.00011494252873563218, |
|
"loss": 0.1671, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.6896551724137931, |
|
"grad_norm": 3.461280584335327, |
|
"learning_rate": 0.00013793103448275863, |
|
"loss": 0.163, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.8045977011494253, |
|
"grad_norm": 0.8741048574447632, |
|
"learning_rate": 0.00016091954022988506, |
|
"loss": 0.1625, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.9195402298850575, |
|
"grad_norm": 0.9238091111183167, |
|
"learning_rate": 0.0001839080459770115, |
|
"loss": 0.1292, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.0344827586206897, |
|
"grad_norm": 1.1964523792266846, |
|
"learning_rate": 0.00019999835873288997, |
|
"loss": 0.1137, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.1494252873563218, |
|
"grad_norm": 0.6014072895050049, |
|
"learning_rate": 0.0001999691821496584, |
|
"loss": 0.1066, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.264367816091954, |
|
"grad_norm": 0.6493586897850037, |
|
"learning_rate": 0.00019990354521250803, |
|
"loss": 0.1051, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.3793103448275863, |
|
"grad_norm": 0.6839510202407837, |
|
"learning_rate": 0.00019980147186027586, |
|
"loss": 0.0926, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.4942528735632183, |
|
"grad_norm": 0.5401099920272827, |
|
"learning_rate": 0.00019966299932074023, |
|
"loss": 0.0895, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.6091954022988506, |
|
"grad_norm": 1.1563774347305298, |
|
"learning_rate": 0.000199488178097043, |
|
"loss": 0.0915, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.7241379310344827, |
|
"grad_norm": 2.1474926471710205, |
|
"learning_rate": 0.00019927707194927066, |
|
"loss": 0.0853, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.839080459770115, |
|
"grad_norm": 0.7267495393753052, |
|
"learning_rate": 0.00019902975787119956, |
|
"loss": 0.0873, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.9540229885057472, |
|
"grad_norm": 0.5461205244064331, |
|
"learning_rate": 0.00019874632606221545, |
|
"loss": 0.0739, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.0689655172413794, |
|
"grad_norm": 0.43381860852241516, |
|
"learning_rate": 0.00019842687989441604, |
|
"loss": 0.0682, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.1839080459770113, |
|
"grad_norm": 0.619968593120575, |
|
"learning_rate": 0.00019807153587490963, |
|
"loss": 0.0725, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.2988505747126435, |
|
"grad_norm": 0.5531813502311707, |
|
"learning_rate": 0.00019768042360332325, |
|
"loss": 0.0649, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.413793103448276, |
|
"grad_norm": 0.4325454831123352, |
|
"learning_rate": 0.00019725368572453539, |
|
"loss": 0.0629, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.528735632183908, |
|
"grad_norm": 1.132866382598877, |
|
"learning_rate": 0.00019679147787665126, |
|
"loss": 0.0597, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.6436781609195403, |
|
"grad_norm": 0.5158783793449402, |
|
"learning_rate": 0.00019629396863423911, |
|
"loss": 0.0658, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.7586206896551726, |
|
"grad_norm": 0.5275442600250244, |
|
"learning_rate": 0.0001957613394468484, |
|
"loss": 0.0624, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.873563218390805, |
|
"grad_norm": 0.26212960481643677, |
|
"learning_rate": 0.0001951937845728321, |
|
"loss": 0.0565, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.9885057471264367, |
|
"grad_norm": 0.4064450263977051, |
|
"learning_rate": 0.00019459151100849784, |
|
"loss": 0.0586, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 3.103448275862069, |
|
"grad_norm": 0.497156023979187, |
|
"learning_rate": 0.0001939547384126128, |
|
"loss": 0.0569, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 3.218390804597701, |
|
"grad_norm": 0.23238833248615265, |
|
"learning_rate": 0.00019328369902629083, |
|
"loss": 0.048, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 0.33057811856269836, |
|
"learning_rate": 0.00019257863758829035, |
|
"loss": 0.0508, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 3.4482758620689653, |
|
"grad_norm": 0.2923976182937622, |
|
"learning_rate": 0.00019183981124575418, |
|
"loss": 0.059, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 3.5632183908045976, |
|
"grad_norm": 0.40444961190223694, |
|
"learning_rate": 0.00019106748946042407, |
|
"loss": 0.0589, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 3.67816091954023, |
|
"grad_norm": 0.3618530333042145, |
|
"learning_rate": 0.00019026195391036338, |
|
"loss": 0.0493, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 3.793103448275862, |
|
"grad_norm": 0.2655580937862396, |
|
"learning_rate": 0.00018942349838722486, |
|
"loss": 0.0502, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 3.9080459770114944, |
|
"grad_norm": 0.30788642168045044, |
|
"learning_rate": 0.0001885524286891002, |
|
"loss": 0.0562, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 4.022988505747127, |
|
"grad_norm": 0.3801023066043854, |
|
"learning_rate": 0.00018764906250899027, |
|
"loss": 0.0536, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 4.137931034482759, |
|
"grad_norm": 0.34299996495246887, |
|
"learning_rate": 0.00018671372931893773, |
|
"loss": 0.0583, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 4.252873563218391, |
|
"grad_norm": 0.5825142860412598, |
|
"learning_rate": 0.0001857467702498633, |
|
"loss": 0.0465, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 4.3678160919540225, |
|
"grad_norm": 0.46258264780044556, |
|
"learning_rate": 0.0001847485379671496, |
|
"loss": 0.0469, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 4.482758620689655, |
|
"grad_norm": 0.23550616204738617, |
|
"learning_rate": 0.0001837193965420188, |
|
"loss": 0.0477, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 4.597701149425287, |
|
"grad_norm": 0.609255850315094, |
|
"learning_rate": 0.00018265972131874987, |
|
"loss": 0.0467, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 4.712643678160919, |
|
"grad_norm": 0.3701482117176056, |
|
"learning_rate": 0.00018156989877778461, |
|
"loss": 0.0471, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 4.827586206896552, |
|
"grad_norm": 0.4651474356651306, |
|
"learning_rate": 0.00018045032639477194, |
|
"loss": 0.0434, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 4.942528735632184, |
|
"grad_norm": 0.34028705954551697, |
|
"learning_rate": 0.00017930141249560233, |
|
"loss": 0.0452, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 5.057471264367816, |
|
"grad_norm": 0.2748933732509613, |
|
"learning_rate": 0.00017812357610748488, |
|
"loss": 0.0413, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 5.172413793103448, |
|
"grad_norm": 0.4612014591693878, |
|
"learning_rate": 0.00017691724680612118, |
|
"loss": 0.0423, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 5.287356321839081, |
|
"grad_norm": 0.37961891293525696, |
|
"learning_rate": 0.00017568286455903258, |
|
"loss": 0.0422, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 5.402298850574713, |
|
"grad_norm": 0.3245999813079834, |
|
"learning_rate": 0.00017442087956509665, |
|
"loss": 0.0396, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 5.517241379310345, |
|
"grad_norm": 0.5230941772460938, |
|
"learning_rate": 0.00017313175209035268, |
|
"loss": 0.0405, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 5.6321839080459775, |
|
"grad_norm": 0.2870311737060547, |
|
"learning_rate": 0.00017181595230013525, |
|
"loss": 0.0343, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 5.747126436781609, |
|
"grad_norm": 0.2876773774623871, |
|
"learning_rate": 0.00017047396008759754, |
|
"loss": 0.0436, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 5.862068965517241, |
|
"grad_norm": 0.4095667898654938, |
|
"learning_rate": 0.00016910626489868649, |
|
"loss": 0.0408, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 5.977011494252873, |
|
"grad_norm": 0.377605140209198, |
|
"learning_rate": 0.00016771336555363418, |
|
"loss": 0.0415, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 6.091954022988506, |
|
"grad_norm": 0.28248798847198486, |
|
"learning_rate": 0.00016629577006503009, |
|
"loss": 0.0386, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 6.206896551724138, |
|
"grad_norm": 0.36199840903282166, |
|
"learning_rate": 0.0001648539954525409, |
|
"loss": 0.0405, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 6.32183908045977, |
|
"grad_norm": 0.2778664529323578, |
|
"learning_rate": 0.00016338856755434503, |
|
"loss": 0.0359, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 6.436781609195402, |
|
"grad_norm": 0.23418012261390686, |
|
"learning_rate": 0.00016190002083535122, |
|
"loss": 0.0382, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 6.551724137931035, |
|
"grad_norm": 0.3027312457561493, |
|
"learning_rate": 0.00016038889819227045, |
|
"loss": 0.0394, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 6.666666666666667, |
|
"grad_norm": 0.2858007550239563, |
|
"learning_rate": 0.00015885575075561326, |
|
"loss": 0.042, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 6.781609195402299, |
|
"grad_norm": 0.2762337923049927, |
|
"learning_rate": 0.00015730113768868312, |
|
"loss": 0.039, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 6.896551724137931, |
|
"grad_norm": 0.40732237696647644, |
|
"learning_rate": 0.0001557256259836412, |
|
"loss": 0.0404, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 7.011494252873563, |
|
"grad_norm": 0.36847805976867676, |
|
"learning_rate": 0.00015412979025471488, |
|
"loss": 0.0368, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 7.126436781609195, |
|
"grad_norm": 0.2492237538099289, |
|
"learning_rate": 0.00015251421252862707, |
|
"loss": 0.0336, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 7.241379310344827, |
|
"grad_norm": 0.2626156210899353, |
|
"learning_rate": 0.00015087948203232156, |
|
"loss": 0.0352, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 7.35632183908046, |
|
"grad_norm": 0.6365396976470947, |
|
"learning_rate": 0.00014922619497806277, |
|
"loss": 0.0342, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 7.471264367816092, |
|
"grad_norm": 0.3000635802745819, |
|
"learning_rate": 0.00014755495434598745, |
|
"loss": 0.037, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 7.586206896551724, |
|
"grad_norm": 0.21869853138923645, |
|
"learning_rate": 0.0001458663696641884, |
|
"loss": 0.0365, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 7.7011494252873565, |
|
"grad_norm": 0.22284150123596191, |
|
"learning_rate": 0.0001441610567864096, |
|
"loss": 0.035, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 7.816091954022989, |
|
"grad_norm": 0.267621248960495, |
|
"learning_rate": 0.00014243963766743495, |
|
"loss": 0.029, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 7.931034482758621, |
|
"grad_norm": 0.2817297875881195, |
|
"learning_rate": 0.00014070274013625096, |
|
"loss": 0.0303, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 8.045977011494253, |
|
"grad_norm": 0.3712492287158966, |
|
"learning_rate": 0.00013895099766706784, |
|
"loss": 0.0297, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 8.160919540229886, |
|
"grad_norm": 0.4549995958805084, |
|
"learning_rate": 0.00013718504914828135, |
|
"loss": 0.033, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 8.275862068965518, |
|
"grad_norm": 0.28695234656333923, |
|
"learning_rate": 0.00013540553864945976, |
|
"loss": 0.0306, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 8.39080459770115, |
|
"grad_norm": 0.34577062726020813, |
|
"learning_rate": 0.00013361311518644172, |
|
"loss": 0.0325, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 8.505747126436782, |
|
"grad_norm": 0.3214464783668518, |
|
"learning_rate": 0.00013180843248462973, |
|
"loss": 0.0337, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 8.620689655172415, |
|
"grad_norm": 0.33310961723327637, |
|
"learning_rate": 0.00012999214874056595, |
|
"loss": 0.0344, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 8.735632183908045, |
|
"grad_norm": 0.25606226921081543, |
|
"learning_rate": 0.00012816492638187762, |
|
"loss": 0.0396, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 8.850574712643677, |
|
"grad_norm": 0.36330148577690125, |
|
"learning_rate": 0.00012632743182567905, |
|
"loss": 0.0348, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 8.96551724137931, |
|
"grad_norm": 0.314394474029541, |
|
"learning_rate": 0.00012448033523551865, |
|
"loss": 0.0399, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 9.080459770114942, |
|
"grad_norm": 0.28129351139068604, |
|
"learning_rate": 0.00012262431027695964, |
|
"loss": 0.0298, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 9.195402298850574, |
|
"grad_norm": 0.256881982088089, |
|
"learning_rate": 0.00012076003387188353, |
|
"loss": 0.0292, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 9.310344827586206, |
|
"grad_norm": 0.1919921189546585, |
|
"learning_rate": 0.00011888818595160584, |
|
"loss": 0.0269, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 9.425287356321839, |
|
"grad_norm": 0.2719796895980835, |
|
"learning_rate": 0.00011700944920889436, |
|
"loss": 0.0265, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 9.540229885057471, |
|
"grad_norm": 0.2269754707813263, |
|
"learning_rate": 0.00011512450884898022, |
|
"loss": 0.0316, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 9.655172413793103, |
|
"grad_norm": 0.23504453897476196, |
|
"learning_rate": 0.00011323405233965256, |
|
"loss": 0.0273, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 9.770114942528735, |
|
"grad_norm": 0.22570957243442535, |
|
"learning_rate": 0.00011133876916052821, |
|
"loss": 0.0304, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 9.885057471264368, |
|
"grad_norm": 0.19824576377868652, |
|
"learning_rate": 0.00010943935055158734, |
|
"loss": 0.0283, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.41852012276649475, |
|
"learning_rate": 0.00010753648926106723, |
|
"loss": 0.0319, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 10.114942528735632, |
|
"grad_norm": 0.20548714697360992, |
|
"learning_rate": 0.00010563087929280613, |
|
"loss": 0.0285, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 10.229885057471265, |
|
"grad_norm": 0.22767336666584015, |
|
"learning_rate": 0.00010372321565312872, |
|
"loss": 0.031, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 10.344827586206897, |
|
"grad_norm": 0.20542040467262268, |
|
"learning_rate": 0.00010181419409736647, |
|
"loss": 0.0316, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 10.459770114942529, |
|
"grad_norm": 0.3105849027633667, |
|
"learning_rate": 9.990451087610448e-05, |
|
"loss": 0.027, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 10.574712643678161, |
|
"grad_norm": 0.31816890835762024, |
|
"learning_rate": 9.799486248124775e-05, |
|
"loss": 0.025, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 10.689655172413794, |
|
"grad_norm": 0.3295416235923767, |
|
"learning_rate": 9.608594539199957e-05, |
|
"loss": 0.0247, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 10.804597701149426, |
|
"grad_norm": 0.17071272432804108, |
|
"learning_rate": 9.417845582084448e-05, |
|
"loss": 0.0291, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 10.919540229885058, |
|
"grad_norm": 0.189552441239357, |
|
"learning_rate": 9.227308945962827e-05, |
|
"loss": 0.0243, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 11.03448275862069, |
|
"grad_norm": 0.3179641664028168, |
|
"learning_rate": 9.037054122582839e-05, |
|
"loss": 0.0308, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 11.149425287356323, |
|
"grad_norm": 0.3051457703113556, |
|
"learning_rate": 8.847150500910618e-05, |
|
"loss": 0.0275, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 11.264367816091955, |
|
"grad_norm": 0.29757606983184814, |
|
"learning_rate": 8.657667341823448e-05, |
|
"loss": 0.0264, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 11.379310344827585, |
|
"grad_norm": 0.2845855951309204, |
|
"learning_rate": 8.4686737528492e-05, |
|
"loss": 0.0249, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 11.494252873563218, |
|
"grad_norm": 0.239737406373024, |
|
"learning_rate": 8.280238662961728e-05, |
|
"loss": 0.027, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 11.60919540229885, |
|
"grad_norm": 0.2692360281944275, |
|
"learning_rate": 8.092430797441364e-05, |
|
"loss": 0.0216, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 11.724137931034482, |
|
"grad_norm": 0.18495500087738037, |
|
"learning_rate": 7.905318652809728e-05, |
|
"loss": 0.0255, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 11.839080459770114, |
|
"grad_norm": 0.2230875939130783, |
|
"learning_rate": 7.718970471847923e-05, |
|
"loss": 0.0262, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 11.954022988505747, |
|
"grad_norm": 0.14376775920391083, |
|
"learning_rate": 7.53345421870735e-05, |
|
"loss": 0.0209, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 12.068965517241379, |
|
"grad_norm": 0.20623371005058289, |
|
"learning_rate": 7.348837554122057e-05, |
|
"loss": 0.0192, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 12.183908045977011, |
|
"grad_norm": 0.27209600806236267, |
|
"learning_rate": 7.165187810731823e-05, |
|
"loss": 0.0208, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 12.298850574712644, |
|
"grad_norm": 0.19447851181030273, |
|
"learning_rate": 6.982571968524847e-05, |
|
"loss": 0.0201, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 12.413793103448276, |
|
"grad_norm": 0.18613241612911224, |
|
"learning_rate": 6.801056630409098e-05, |
|
"loss": 0.0248, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 12.528735632183908, |
|
"grad_norm": 0.24156583845615387, |
|
"learning_rate": 6.620707997921157e-05, |
|
"loss": 0.0197, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 12.64367816091954, |
|
"grad_norm": 0.16912145912647247, |
|
"learning_rate": 6.441591847081476e-05, |
|
"loss": 0.022, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 12.758620689655173, |
|
"grad_norm": 0.14165754616260529, |
|
"learning_rate": 6.263773504404801e-05, |
|
"loss": 0.0199, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 12.873563218390805, |
|
"grad_norm": 0.3424724340438843, |
|
"learning_rate": 6.087317823074565e-05, |
|
"loss": 0.0209, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 12.988505747126437, |
|
"grad_norm": 0.2658204138278961, |
|
"learning_rate": 5.912289159289883e-05, |
|
"loss": 0.0242, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 13.10344827586207, |
|
"grad_norm": 0.21321730315685272, |
|
"learning_rate": 5.7387513487938386e-05, |
|
"loss": 0.0216, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 13.218390804597702, |
|
"grad_norm": 0.2854823172092438, |
|
"learning_rate": 5.566767683591553e-05, |
|
"loss": 0.0227, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 13.333333333333334, |
|
"grad_norm": 0.28919658064842224, |
|
"learning_rate": 5.396400888866601e-05, |
|
"loss": 0.0195, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 13.448275862068966, |
|
"grad_norm": 0.22510255873203278, |
|
"learning_rate": 5.2277131001041125e-05, |
|
"loss": 0.0241, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 13.563218390804598, |
|
"grad_norm": 0.21545900404453278, |
|
"learning_rate": 5.060765840429e-05, |
|
"loss": 0.023, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 13.678160919540229, |
|
"grad_norm": 0.20618782937526703, |
|
"learning_rate": 4.8956199981674656e-05, |
|
"loss": 0.0181, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 13.793103448275861, |
|
"grad_norm": 0.22331970930099487, |
|
"learning_rate": 4.7323358046400844e-05, |
|
"loss": 0.0212, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 13.908045977011493, |
|
"grad_norm": 0.14791706204414368, |
|
"learning_rate": 4.570972812194457e-05, |
|
"loss": 0.0195, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 14.022988505747126, |
|
"grad_norm": 0.1526448130607605, |
|
"learning_rate": 4.4115898724855455e-05, |
|
"loss": 0.0188, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 14.137931034482758, |
|
"grad_norm": 0.18956783413887024, |
|
"learning_rate": 4.254245115011506e-05, |
|
"loss": 0.0188, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 14.25287356321839, |
|
"grad_norm": 0.1313301920890808, |
|
"learning_rate": 4.098995925912972e-05, |
|
"loss": 0.019, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 14.367816091954023, |
|
"grad_norm": 0.13764789700508118, |
|
"learning_rate": 3.945898927043372e-05, |
|
"loss": 0.0175, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 14.482758620689655, |
|
"grad_norm": 0.19556942582130432, |
|
"learning_rate": 3.7950099553180804e-05, |
|
"loss": 0.0196, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 14.597701149425287, |
|
"grad_norm": 0.14027345180511475, |
|
"learning_rate": 3.646384042349764e-05, |
|
"loss": 0.0177, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 14.71264367816092, |
|
"grad_norm": 0.2918284833431244, |
|
"learning_rate": 3.500075394377511e-05, |
|
"loss": 0.0204, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 14.827586206896552, |
|
"grad_norm": 0.12948164343833923, |
|
"learning_rate": 3.3561373724969224e-05, |
|
"loss": 0.0188, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 14.942528735632184, |
|
"grad_norm": 0.1773224174976349, |
|
"learning_rate": 3.214622473198492e-05, |
|
"loss": 0.0212, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 15.057471264367816, |
|
"grad_norm": 0.29863160848617554, |
|
"learning_rate": 3.075582309221289e-05, |
|
"loss": 0.0157, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 15.172413793103448, |
|
"grad_norm": 0.18764474987983704, |
|
"learning_rate": 2.939067590728972e-05, |
|
"loss": 0.0175, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 15.28735632183908, |
|
"grad_norm": 0.16273990273475647, |
|
"learning_rate": 2.8051281068149803e-05, |
|
"loss": 0.0135, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 15.402298850574713, |
|
"grad_norm": 0.25088945031166077, |
|
"learning_rate": 2.673812707343669e-05, |
|
"loss": 0.0242, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 15.517241379310345, |
|
"grad_norm": 0.25521960854530334, |
|
"learning_rate": 2.545169285133965e-05, |
|
"loss": 0.0188, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 15.632183908045977, |
|
"grad_norm": 0.15780223906040192, |
|
"learning_rate": 2.4192447584921195e-05, |
|
"loss": 0.0194, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 15.74712643678161, |
|
"grad_norm": 0.13658417761325836, |
|
"learning_rate": 2.296085054099828e-05, |
|
"loss": 0.0179, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 15.862068965517242, |
|
"grad_norm": 0.14593394100666046, |
|
"learning_rate": 2.175735090264058e-05, |
|
"loss": 0.016, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 15.977011494252874, |
|
"grad_norm": 0.20093883574008942, |
|
"learning_rate": 2.0582387605346088e-05, |
|
"loss": 0.0157, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 16.091954022988507, |
|
"grad_norm": 0.22261527180671692, |
|
"learning_rate": 1.943638917695453e-05, |
|
"loss": 0.0175, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 16.20689655172414, |
|
"grad_norm": 0.17486433684825897, |
|
"learning_rate": 1.831977358135625e-05, |
|
"loss": 0.0166, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 16.32183908045977, |
|
"grad_norm": 0.2138216346502304, |
|
"learning_rate": 1.723294806605428e-05, |
|
"loss": 0.0146, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 16.436781609195403, |
|
"grad_norm": 0.20112960040569305, |
|
"learning_rate": 1.6176309013634517e-05, |
|
"loss": 0.0159, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 16.551724137931036, |
|
"grad_norm": 0.15377485752105713, |
|
"learning_rate": 1.5150241797198883e-05, |
|
"loss": 0.016, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 16.666666666666668, |
|
"grad_norm": 0.23132722079753876, |
|
"learning_rate": 1.415512063981339e-05, |
|
"loss": 0.0134, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 16.7816091954023, |
|
"grad_norm": 0.15262600779533386, |
|
"learning_rate": 1.3191308478023212e-05, |
|
"loss": 0.017, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 16.896551724137932, |
|
"grad_norm": 0.0991855040192604, |
|
"learning_rate": 1.2259156829483842e-05, |
|
"loss": 0.0167, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 17.011494252873565, |
|
"grad_norm": 0.12278055399656296, |
|
"learning_rate": 1.1359005664756994e-05, |
|
"loss": 0.0146, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 17.126436781609197, |
|
"grad_norm": 0.17124158143997192, |
|
"learning_rate": 1.0491183283317997e-05, |
|
"loss": 0.0173, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 17.24137931034483, |
|
"grad_norm": 0.1300356686115265, |
|
"learning_rate": 9.656006193819633e-06, |
|
"loss": 0.0143, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 17.35632183908046, |
|
"grad_norm": 0.17946338653564453, |
|
"learning_rate": 8.853778998656537e-06, |
|
"loss": 0.0154, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 17.47126436781609, |
|
"grad_norm": 0.28736400604248047, |
|
"learning_rate": 8.084794282871689e-06, |
|
"loss": 0.0166, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 17.586206896551722, |
|
"grad_norm": 0.13112574815750122, |
|
"learning_rate": 7.3493325074461165e-06, |
|
"loss": 0.0132, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 17.701149425287355, |
|
"grad_norm": 0.12864838540554047, |
|
"learning_rate": 6.647661907010183e-06, |
|
"loss": 0.0171, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 17.816091954022987, |
|
"grad_norm": 0.16958807408809662, |
|
"learning_rate": 5.980038392014309e-06, |
|
"loss": 0.0161, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 17.93103448275862, |
|
"grad_norm": 0.36121729016304016, |
|
"learning_rate": 5.3467054553941405e-06, |
|
"loss": 0.0158, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 18.04597701149425, |
|
"grad_norm": 0.2107989490032196, |
|
"learning_rate": 4.7478940837649924e-06, |
|
"loss": 0.0147, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 18.160919540229884, |
|
"grad_norm": 0.15654149651527405, |
|
"learning_rate": 4.183822673177229e-06, |
|
"loss": 0.0164, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 18.275862068965516, |
|
"grad_norm": 0.1438828557729721, |
|
"learning_rate": 3.6546969494637986e-06, |
|
"loss": 0.0131, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 18.39080459770115, |
|
"grad_norm": 0.2543192207813263, |
|
"learning_rate": 3.16070989320868e-06, |
|
"loss": 0.0157, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 18.50574712643678, |
|
"grad_norm": 0.13453112542629242, |
|
"learning_rate": 2.702041669363875e-06, |
|
"loss": 0.017, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 18.620689655172413, |
|
"grad_norm": 0.16369780898094177, |
|
"learning_rate": 2.2788595615403474e-06, |
|
"loss": 0.0157, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 18.735632183908045, |
|
"grad_norm": 0.14639818668365479, |
|
"learning_rate": 1.8913179109969482e-06, |
|
"loss": 0.0122, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 18.850574712643677, |
|
"grad_norm": 0.23813354969024658, |
|
"learning_rate": 1.5395580603498328e-06, |
|
"loss": 0.0157, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 18.96551724137931, |
|
"grad_norm": 0.15577834844589233, |
|
"learning_rate": 1.2237083020224526e-06, |
|
"loss": 0.0144, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 19.080459770114942, |
|
"grad_norm": 0.06880059838294983, |
|
"learning_rate": 9.438838314553056e-07, |
|
"loss": 0.0109, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 19.195402298850574, |
|
"grad_norm": 0.19819270074367523, |
|
"learning_rate": 7.001867050923095e-07, |
|
"loss": 0.0134, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 19.310344827586206, |
|
"grad_norm": 0.10673543065786362, |
|
"learning_rate": 4.92705803159188e-07, |
|
"loss": 0.0155, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 19.42528735632184, |
|
"grad_norm": 0.16529639065265656, |
|
"learning_rate": 3.2151679724748975e-07, |
|
"loss": 0.0175, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 19.54022988505747, |
|
"grad_norm": 0.1206677109003067, |
|
"learning_rate": 1.8668212271585327e-07, |
|
"loss": 0.0188, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 19.655172413793103, |
|
"grad_norm": 0.10180158913135529, |
|
"learning_rate": 8.825095591891152e-08, |
|
"loss": 0.0158, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 19.770114942528735, |
|
"grad_norm": 0.14544282853603363, |
|
"learning_rate": 2.625919627188278e-08, |
|
"loss": 0.015, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 19.885057471264368, |
|
"grad_norm": 0.14054107666015625, |
|
"learning_rate": 7.294531574553176e-10, |
|
"loss": 0.0139, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 19.908045977011493, |
|
"step": 1732, |
|
"total_flos": 2.431760592612004e+17, |
|
"train_loss": 0.04785273781403314, |
|
"train_runtime": 1980.9382, |
|
"train_samples_per_second": 55.957, |
|
"train_steps_per_second": 0.874 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1732, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.431760592612004e+17, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|