{ "best_metric": null, "best_model_checkpoint": null, "epoch": 19.908045977011493, "eval_steps": 500, "global_step": 1732, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11494252873563218, "grad_norm": 5.833221435546875, "learning_rate": 2.2988505747126437e-05, "loss": 1.1691, "step": 10 }, { "epoch": 0.22988505747126436, "grad_norm": 3.3890936374664307, "learning_rate": 4.597701149425287e-05, "loss": 0.4962, "step": 20 }, { "epoch": 0.3448275862068966, "grad_norm": 1.4635858535766602, "learning_rate": 6.896551724137931e-05, "loss": 0.2852, "step": 30 }, { "epoch": 0.45977011494252873, "grad_norm": 1.3263287544250488, "learning_rate": 9.195402298850575e-05, "loss": 0.2068, "step": 40 }, { "epoch": 0.5747126436781609, "grad_norm": 1.4933586120605469, "learning_rate": 0.00011494252873563218, "loss": 0.1671, "step": 50 }, { "epoch": 0.6896551724137931, "grad_norm": 3.461280584335327, "learning_rate": 0.00013793103448275863, "loss": 0.163, "step": 60 }, { "epoch": 0.8045977011494253, "grad_norm": 0.8741048574447632, "learning_rate": 0.00016091954022988506, "loss": 0.1625, "step": 70 }, { "epoch": 0.9195402298850575, "grad_norm": 0.9238091111183167, "learning_rate": 0.0001839080459770115, "loss": 0.1292, "step": 80 }, { "epoch": 1.0344827586206897, "grad_norm": 1.1964523792266846, "learning_rate": 0.00019999835873288997, "loss": 0.1137, "step": 90 }, { "epoch": 1.1494252873563218, "grad_norm": 0.6014072895050049, "learning_rate": 0.0001999691821496584, "loss": 0.1066, "step": 100 }, { "epoch": 1.264367816091954, "grad_norm": 0.6493586897850037, "learning_rate": 0.00019990354521250803, "loss": 0.1051, "step": 110 }, { "epoch": 1.3793103448275863, "grad_norm": 0.6839510202407837, "learning_rate": 0.00019980147186027586, "loss": 0.0926, "step": 120 }, { "epoch": 1.4942528735632183, "grad_norm": 0.5401099920272827, "learning_rate": 0.00019966299932074023, "loss": 0.0895, "step": 130 }, { "epoch": 1.6091954022988506, "grad_norm": 1.1563774347305298, "learning_rate": 0.000199488178097043, "loss": 0.0915, "step": 140 }, { "epoch": 1.7241379310344827, "grad_norm": 2.1474926471710205, "learning_rate": 0.00019927707194927066, "loss": 0.0853, "step": 150 }, { "epoch": 1.839080459770115, "grad_norm": 0.7267495393753052, "learning_rate": 0.00019902975787119956, "loss": 0.0873, "step": 160 }, { "epoch": 1.9540229885057472, "grad_norm": 0.5461205244064331, "learning_rate": 0.00019874632606221545, "loss": 0.0739, "step": 170 }, { "epoch": 2.0689655172413794, "grad_norm": 0.43381860852241516, "learning_rate": 0.00019842687989441604, "loss": 0.0682, "step": 180 }, { "epoch": 2.1839080459770113, "grad_norm": 0.619968593120575, "learning_rate": 0.00019807153587490963, "loss": 0.0725, "step": 190 }, { "epoch": 2.2988505747126435, "grad_norm": 0.5531813502311707, "learning_rate": 0.00019768042360332325, "loss": 0.0649, "step": 200 }, { "epoch": 2.413793103448276, "grad_norm": 0.4325454831123352, "learning_rate": 0.00019725368572453539, "loss": 0.0629, "step": 210 }, { "epoch": 2.528735632183908, "grad_norm": 1.132866382598877, "learning_rate": 0.00019679147787665126, "loss": 0.0597, "step": 220 }, { "epoch": 2.6436781609195403, "grad_norm": 0.5158783793449402, "learning_rate": 0.00019629396863423911, "loss": 0.0658, "step": 230 }, { "epoch": 2.7586206896551726, "grad_norm": 0.5275442600250244, "learning_rate": 0.0001957613394468484, "loss": 0.0624, "step": 240 }, { "epoch": 2.873563218390805, "grad_norm": 0.26212960481643677, "learning_rate": 0.0001951937845728321, "loss": 0.0565, "step": 250 }, { "epoch": 2.9885057471264367, "grad_norm": 0.4064450263977051, "learning_rate": 0.00019459151100849784, "loss": 0.0586, "step": 260 }, { "epoch": 3.103448275862069, "grad_norm": 0.497156023979187, "learning_rate": 0.0001939547384126128, "loss": 0.0569, "step": 270 }, { "epoch": 3.218390804597701, "grad_norm": 0.23238833248615265, "learning_rate": 0.00019328369902629083, "loss": 0.048, "step": 280 }, { "epoch": 3.3333333333333335, "grad_norm": 0.33057811856269836, "learning_rate": 0.00019257863758829035, "loss": 0.0508, "step": 290 }, { "epoch": 3.4482758620689653, "grad_norm": 0.2923976182937622, "learning_rate": 0.00019183981124575418, "loss": 0.059, "step": 300 }, { "epoch": 3.5632183908045976, "grad_norm": 0.40444961190223694, "learning_rate": 0.00019106748946042407, "loss": 0.0589, "step": 310 }, { "epoch": 3.67816091954023, "grad_norm": 0.3618530333042145, "learning_rate": 0.00019026195391036338, "loss": 0.0493, "step": 320 }, { "epoch": 3.793103448275862, "grad_norm": 0.2655580937862396, "learning_rate": 0.00018942349838722486, "loss": 0.0502, "step": 330 }, { "epoch": 3.9080459770114944, "grad_norm": 0.30788642168045044, "learning_rate": 0.0001885524286891002, "loss": 0.0562, "step": 340 }, { "epoch": 4.022988505747127, "grad_norm": 0.3801023066043854, "learning_rate": 0.00018764906250899027, "loss": 0.0536, "step": 350 }, { "epoch": 4.137931034482759, "grad_norm": 0.34299996495246887, "learning_rate": 0.00018671372931893773, "loss": 0.0583, "step": 360 }, { "epoch": 4.252873563218391, "grad_norm": 0.5825142860412598, "learning_rate": 0.0001857467702498633, "loss": 0.0465, "step": 370 }, { "epoch": 4.3678160919540225, "grad_norm": 0.46258264780044556, "learning_rate": 0.0001847485379671496, "loss": 0.0469, "step": 380 }, { "epoch": 4.482758620689655, "grad_norm": 0.23550616204738617, "learning_rate": 0.0001837193965420188, "loss": 0.0477, "step": 390 }, { "epoch": 4.597701149425287, "grad_norm": 0.609255850315094, "learning_rate": 0.00018265972131874987, "loss": 0.0467, "step": 400 }, { "epoch": 4.712643678160919, "grad_norm": 0.3701482117176056, "learning_rate": 0.00018156989877778461, "loss": 0.0471, "step": 410 }, { "epoch": 4.827586206896552, "grad_norm": 0.4651474356651306, "learning_rate": 0.00018045032639477194, "loss": 0.0434, "step": 420 }, { "epoch": 4.942528735632184, "grad_norm": 0.34028705954551697, "learning_rate": 0.00017930141249560233, "loss": 0.0452, "step": 430 }, { "epoch": 5.057471264367816, "grad_norm": 0.2748933732509613, "learning_rate": 0.00017812357610748488, "loss": 0.0413, "step": 440 }, { "epoch": 5.172413793103448, "grad_norm": 0.4612014591693878, "learning_rate": 0.00017691724680612118, "loss": 0.0423, "step": 450 }, { "epoch": 5.287356321839081, "grad_norm": 0.37961891293525696, "learning_rate": 0.00017568286455903258, "loss": 0.0422, "step": 460 }, { "epoch": 5.402298850574713, "grad_norm": 0.3245999813079834, "learning_rate": 0.00017442087956509665, "loss": 0.0396, "step": 470 }, { "epoch": 5.517241379310345, "grad_norm": 0.5230941772460938, "learning_rate": 0.00017313175209035268, "loss": 0.0405, "step": 480 }, { "epoch": 5.6321839080459775, "grad_norm": 0.2870311737060547, "learning_rate": 0.00017181595230013525, "loss": 0.0343, "step": 490 }, { "epoch": 5.747126436781609, "grad_norm": 0.2876773774623871, "learning_rate": 0.00017047396008759754, "loss": 0.0436, "step": 500 }, { "epoch": 5.862068965517241, "grad_norm": 0.4095667898654938, "learning_rate": 0.00016910626489868649, "loss": 0.0408, "step": 510 }, { "epoch": 5.977011494252873, "grad_norm": 0.377605140209198, "learning_rate": 0.00016771336555363418, "loss": 0.0415, "step": 520 }, { "epoch": 6.091954022988506, "grad_norm": 0.28248798847198486, "learning_rate": 0.00016629577006503009, "loss": 0.0386, "step": 530 }, { "epoch": 6.206896551724138, "grad_norm": 0.36199840903282166, "learning_rate": 0.0001648539954525409, "loss": 0.0405, "step": 540 }, { "epoch": 6.32183908045977, "grad_norm": 0.2778664529323578, "learning_rate": 0.00016338856755434503, "loss": 0.0359, "step": 550 }, { "epoch": 6.436781609195402, "grad_norm": 0.23418012261390686, "learning_rate": 0.00016190002083535122, "loss": 0.0382, "step": 560 }, { "epoch": 6.551724137931035, "grad_norm": 0.3027312457561493, "learning_rate": 0.00016038889819227045, "loss": 0.0394, "step": 570 }, { "epoch": 6.666666666666667, "grad_norm": 0.2858007550239563, "learning_rate": 0.00015885575075561326, "loss": 0.042, "step": 580 }, { "epoch": 6.781609195402299, "grad_norm": 0.2762337923049927, "learning_rate": 0.00015730113768868312, "loss": 0.039, "step": 590 }, { "epoch": 6.896551724137931, "grad_norm": 0.40732237696647644, "learning_rate": 0.0001557256259836412, "loss": 0.0404, "step": 600 }, { "epoch": 7.011494252873563, "grad_norm": 0.36847805976867676, "learning_rate": 0.00015412979025471488, "loss": 0.0368, "step": 610 }, { "epoch": 7.126436781609195, "grad_norm": 0.2492237538099289, "learning_rate": 0.00015251421252862707, "loss": 0.0336, "step": 620 }, { "epoch": 7.241379310344827, "grad_norm": 0.2626156210899353, "learning_rate": 0.00015087948203232156, "loss": 0.0352, "step": 630 }, { "epoch": 7.35632183908046, "grad_norm": 0.6365396976470947, "learning_rate": 0.00014922619497806277, "loss": 0.0342, "step": 640 }, { "epoch": 7.471264367816092, "grad_norm": 0.3000635802745819, "learning_rate": 0.00014755495434598745, "loss": 0.037, "step": 650 }, { "epoch": 7.586206896551724, "grad_norm": 0.21869853138923645, "learning_rate": 0.0001458663696641884, "loss": 0.0365, "step": 660 }, { "epoch": 7.7011494252873565, "grad_norm": 0.22284150123596191, "learning_rate": 0.0001441610567864096, "loss": 0.035, "step": 670 }, { "epoch": 7.816091954022989, "grad_norm": 0.267621248960495, "learning_rate": 0.00014243963766743495, "loss": 0.029, "step": 680 }, { "epoch": 7.931034482758621, "grad_norm": 0.2817297875881195, "learning_rate": 0.00014070274013625096, "loss": 0.0303, "step": 690 }, { "epoch": 8.045977011494253, "grad_norm": 0.3712492287158966, "learning_rate": 0.00013895099766706784, "loss": 0.0297, "step": 700 }, { "epoch": 8.160919540229886, "grad_norm": 0.4549995958805084, "learning_rate": 0.00013718504914828135, "loss": 0.033, "step": 710 }, { "epoch": 8.275862068965518, "grad_norm": 0.28695234656333923, "learning_rate": 0.00013540553864945976, "loss": 0.0306, "step": 720 }, { "epoch": 8.39080459770115, "grad_norm": 0.34577062726020813, "learning_rate": 0.00013361311518644172, "loss": 0.0325, "step": 730 }, { "epoch": 8.505747126436782, "grad_norm": 0.3214464783668518, "learning_rate": 0.00013180843248462973, "loss": 0.0337, "step": 740 }, { "epoch": 8.620689655172415, "grad_norm": 0.33310961723327637, "learning_rate": 0.00012999214874056595, "loss": 0.0344, "step": 750 }, { "epoch": 8.735632183908045, "grad_norm": 0.25606226921081543, "learning_rate": 0.00012816492638187762, "loss": 0.0396, "step": 760 }, { "epoch": 8.850574712643677, "grad_norm": 0.36330148577690125, "learning_rate": 0.00012632743182567905, "loss": 0.0348, "step": 770 }, { "epoch": 8.96551724137931, "grad_norm": 0.314394474029541, "learning_rate": 0.00012448033523551865, "loss": 0.0399, "step": 780 }, { "epoch": 9.080459770114942, "grad_norm": 0.28129351139068604, "learning_rate": 0.00012262431027695964, "loss": 0.0298, "step": 790 }, { "epoch": 9.195402298850574, "grad_norm": 0.256881982088089, "learning_rate": 0.00012076003387188353, "loss": 0.0292, "step": 800 }, { "epoch": 9.310344827586206, "grad_norm": 0.1919921189546585, "learning_rate": 0.00011888818595160584, "loss": 0.0269, "step": 810 }, { "epoch": 9.425287356321839, "grad_norm": 0.2719796895980835, "learning_rate": 0.00011700944920889436, "loss": 0.0265, "step": 820 }, { "epoch": 9.540229885057471, "grad_norm": 0.2269754707813263, "learning_rate": 0.00011512450884898022, "loss": 0.0316, "step": 830 }, { "epoch": 9.655172413793103, "grad_norm": 0.23504453897476196, "learning_rate": 0.00011323405233965256, "loss": 0.0273, "step": 840 }, { "epoch": 9.770114942528735, "grad_norm": 0.22570957243442535, "learning_rate": 0.00011133876916052821, "loss": 0.0304, "step": 850 }, { "epoch": 9.885057471264368, "grad_norm": 0.19824576377868652, "learning_rate": 0.00010943935055158734, "loss": 0.0283, "step": 860 }, { "epoch": 10.0, "grad_norm": 0.41852012276649475, "learning_rate": 0.00010753648926106723, "loss": 0.0319, "step": 870 }, { "epoch": 10.114942528735632, "grad_norm": 0.20548714697360992, "learning_rate": 0.00010563087929280613, "loss": 0.0285, "step": 880 }, { "epoch": 10.229885057471265, "grad_norm": 0.22767336666584015, "learning_rate": 0.00010372321565312872, "loss": 0.031, "step": 890 }, { "epoch": 10.344827586206897, "grad_norm": 0.20542040467262268, "learning_rate": 0.00010181419409736647, "loss": 0.0316, "step": 900 }, { "epoch": 10.459770114942529, "grad_norm": 0.3105849027633667, "learning_rate": 9.990451087610448e-05, "loss": 0.027, "step": 910 }, { "epoch": 10.574712643678161, "grad_norm": 0.31816890835762024, "learning_rate": 9.799486248124775e-05, "loss": 0.025, "step": 920 }, { "epoch": 10.689655172413794, "grad_norm": 0.3295416235923767, "learning_rate": 9.608594539199957e-05, "loss": 0.0247, "step": 930 }, { "epoch": 10.804597701149426, "grad_norm": 0.17071272432804108, "learning_rate": 9.417845582084448e-05, "loss": 0.0291, "step": 940 }, { "epoch": 10.919540229885058, "grad_norm": 0.189552441239357, "learning_rate": 9.227308945962827e-05, "loss": 0.0243, "step": 950 }, { "epoch": 11.03448275862069, "grad_norm": 0.3179641664028168, "learning_rate": 9.037054122582839e-05, "loss": 0.0308, "step": 960 }, { "epoch": 11.149425287356323, "grad_norm": 0.3051457703113556, "learning_rate": 8.847150500910618e-05, "loss": 0.0275, "step": 970 }, { "epoch": 11.264367816091955, "grad_norm": 0.29757606983184814, "learning_rate": 8.657667341823448e-05, "loss": 0.0264, "step": 980 }, { "epoch": 11.379310344827585, "grad_norm": 0.2845855951309204, "learning_rate": 8.4686737528492e-05, "loss": 0.0249, "step": 990 }, { "epoch": 11.494252873563218, "grad_norm": 0.239737406373024, "learning_rate": 8.280238662961728e-05, "loss": 0.027, "step": 1000 }, { "epoch": 11.60919540229885, "grad_norm": 0.2692360281944275, "learning_rate": 8.092430797441364e-05, "loss": 0.0216, "step": 1010 }, { "epoch": 11.724137931034482, "grad_norm": 0.18495500087738037, "learning_rate": 7.905318652809728e-05, "loss": 0.0255, "step": 1020 }, { "epoch": 11.839080459770114, "grad_norm": 0.2230875939130783, "learning_rate": 7.718970471847923e-05, "loss": 0.0262, "step": 1030 }, { "epoch": 11.954022988505747, "grad_norm": 0.14376775920391083, "learning_rate": 7.53345421870735e-05, "loss": 0.0209, "step": 1040 }, { "epoch": 12.068965517241379, "grad_norm": 0.20623371005058289, "learning_rate": 7.348837554122057e-05, "loss": 0.0192, "step": 1050 }, { "epoch": 12.183908045977011, "grad_norm": 0.27209600806236267, "learning_rate": 7.165187810731823e-05, "loss": 0.0208, "step": 1060 }, { "epoch": 12.298850574712644, "grad_norm": 0.19447851181030273, "learning_rate": 6.982571968524847e-05, "loss": 0.0201, "step": 1070 }, { "epoch": 12.413793103448276, "grad_norm": 0.18613241612911224, "learning_rate": 6.801056630409098e-05, "loss": 0.0248, "step": 1080 }, { "epoch": 12.528735632183908, "grad_norm": 0.24156583845615387, "learning_rate": 6.620707997921157e-05, "loss": 0.0197, "step": 1090 }, { "epoch": 12.64367816091954, "grad_norm": 0.16912145912647247, "learning_rate": 6.441591847081476e-05, "loss": 0.022, "step": 1100 }, { "epoch": 12.758620689655173, "grad_norm": 0.14165754616260529, "learning_rate": 6.263773504404801e-05, "loss": 0.0199, "step": 1110 }, { "epoch": 12.873563218390805, "grad_norm": 0.3424724340438843, "learning_rate": 6.087317823074565e-05, "loss": 0.0209, "step": 1120 }, { "epoch": 12.988505747126437, "grad_norm": 0.2658204138278961, "learning_rate": 5.912289159289883e-05, "loss": 0.0242, "step": 1130 }, { "epoch": 13.10344827586207, "grad_norm": 0.21321730315685272, "learning_rate": 5.7387513487938386e-05, "loss": 0.0216, "step": 1140 }, { "epoch": 13.218390804597702, "grad_norm": 0.2854823172092438, "learning_rate": 5.566767683591553e-05, "loss": 0.0227, "step": 1150 }, { "epoch": 13.333333333333334, "grad_norm": 0.28919658064842224, "learning_rate": 5.396400888866601e-05, "loss": 0.0195, "step": 1160 }, { "epoch": 13.448275862068966, "grad_norm": 0.22510255873203278, "learning_rate": 5.2277131001041125e-05, "loss": 0.0241, "step": 1170 }, { "epoch": 13.563218390804598, "grad_norm": 0.21545900404453278, "learning_rate": 5.060765840429e-05, "loss": 0.023, "step": 1180 }, { "epoch": 13.678160919540229, "grad_norm": 0.20618782937526703, "learning_rate": 4.8956199981674656e-05, "loss": 0.0181, "step": 1190 }, { "epoch": 13.793103448275861, "grad_norm": 0.22331970930099487, "learning_rate": 4.7323358046400844e-05, "loss": 0.0212, "step": 1200 }, { "epoch": 13.908045977011493, "grad_norm": 0.14791706204414368, "learning_rate": 4.570972812194457e-05, "loss": 0.0195, "step": 1210 }, { "epoch": 14.022988505747126, "grad_norm": 0.1526448130607605, "learning_rate": 4.4115898724855455e-05, "loss": 0.0188, "step": 1220 }, { "epoch": 14.137931034482758, "grad_norm": 0.18956783413887024, "learning_rate": 4.254245115011506e-05, "loss": 0.0188, "step": 1230 }, { "epoch": 14.25287356321839, "grad_norm": 0.1313301920890808, "learning_rate": 4.098995925912972e-05, "loss": 0.019, "step": 1240 }, { "epoch": 14.367816091954023, "grad_norm": 0.13764789700508118, "learning_rate": 3.945898927043372e-05, "loss": 0.0175, "step": 1250 }, { "epoch": 14.482758620689655, "grad_norm": 0.19556942582130432, "learning_rate": 3.7950099553180804e-05, "loss": 0.0196, "step": 1260 }, { "epoch": 14.597701149425287, "grad_norm": 0.14027345180511475, "learning_rate": 3.646384042349764e-05, "loss": 0.0177, "step": 1270 }, { "epoch": 14.71264367816092, "grad_norm": 0.2918284833431244, "learning_rate": 3.500075394377511e-05, "loss": 0.0204, "step": 1280 }, { "epoch": 14.827586206896552, "grad_norm": 0.12948164343833923, "learning_rate": 3.3561373724969224e-05, "loss": 0.0188, "step": 1290 }, { "epoch": 14.942528735632184, "grad_norm": 0.1773224174976349, "learning_rate": 3.214622473198492e-05, "loss": 0.0212, "step": 1300 }, { "epoch": 15.057471264367816, "grad_norm": 0.29863160848617554, "learning_rate": 3.075582309221289e-05, "loss": 0.0157, "step": 1310 }, { "epoch": 15.172413793103448, "grad_norm": 0.18764474987983704, "learning_rate": 2.939067590728972e-05, "loss": 0.0175, "step": 1320 }, { "epoch": 15.28735632183908, "grad_norm": 0.16273990273475647, "learning_rate": 2.8051281068149803e-05, "loss": 0.0135, "step": 1330 }, { "epoch": 15.402298850574713, "grad_norm": 0.25088945031166077, "learning_rate": 2.673812707343669e-05, "loss": 0.0242, "step": 1340 }, { "epoch": 15.517241379310345, "grad_norm": 0.25521960854530334, "learning_rate": 2.545169285133965e-05, "loss": 0.0188, "step": 1350 }, { "epoch": 15.632183908045977, "grad_norm": 0.15780223906040192, "learning_rate": 2.4192447584921195e-05, "loss": 0.0194, "step": 1360 }, { "epoch": 15.74712643678161, "grad_norm": 0.13658417761325836, "learning_rate": 2.296085054099828e-05, "loss": 0.0179, "step": 1370 }, { "epoch": 15.862068965517242, "grad_norm": 0.14593394100666046, "learning_rate": 2.175735090264058e-05, "loss": 0.016, "step": 1380 }, { "epoch": 15.977011494252874, "grad_norm": 0.20093883574008942, "learning_rate": 2.0582387605346088e-05, "loss": 0.0157, "step": 1390 }, { "epoch": 16.091954022988507, "grad_norm": 0.22261527180671692, "learning_rate": 1.943638917695453e-05, "loss": 0.0175, "step": 1400 }, { "epoch": 16.20689655172414, "grad_norm": 0.17486433684825897, "learning_rate": 1.831977358135625e-05, "loss": 0.0166, "step": 1410 }, { "epoch": 16.32183908045977, "grad_norm": 0.2138216346502304, "learning_rate": 1.723294806605428e-05, "loss": 0.0146, "step": 1420 }, { "epoch": 16.436781609195403, "grad_norm": 0.20112960040569305, "learning_rate": 1.6176309013634517e-05, "loss": 0.0159, "step": 1430 }, { "epoch": 16.551724137931036, "grad_norm": 0.15377485752105713, "learning_rate": 1.5150241797198883e-05, "loss": 0.016, "step": 1440 }, { "epoch": 16.666666666666668, "grad_norm": 0.23132722079753876, "learning_rate": 1.415512063981339e-05, "loss": 0.0134, "step": 1450 }, { "epoch": 16.7816091954023, "grad_norm": 0.15262600779533386, "learning_rate": 1.3191308478023212e-05, "loss": 0.017, "step": 1460 }, { "epoch": 16.896551724137932, "grad_norm": 0.0991855040192604, "learning_rate": 1.2259156829483842e-05, "loss": 0.0167, "step": 1470 }, { "epoch": 17.011494252873565, "grad_norm": 0.12278055399656296, "learning_rate": 1.1359005664756994e-05, "loss": 0.0146, "step": 1480 }, { "epoch": 17.126436781609197, "grad_norm": 0.17124158143997192, "learning_rate": 1.0491183283317997e-05, "loss": 0.0173, "step": 1490 }, { "epoch": 17.24137931034483, "grad_norm": 0.1300356686115265, "learning_rate": 9.656006193819633e-06, "loss": 0.0143, "step": 1500 }, { "epoch": 17.35632183908046, "grad_norm": 0.17946338653564453, "learning_rate": 8.853778998656537e-06, "loss": 0.0154, "step": 1510 }, { "epoch": 17.47126436781609, "grad_norm": 0.28736400604248047, "learning_rate": 8.084794282871689e-06, "loss": 0.0166, "step": 1520 }, { "epoch": 17.586206896551722, "grad_norm": 0.13112574815750122, "learning_rate": 7.3493325074461165e-06, "loss": 0.0132, "step": 1530 }, { "epoch": 17.701149425287355, "grad_norm": 0.12864838540554047, "learning_rate": 6.647661907010183e-06, "loss": 0.0171, "step": 1540 }, { "epoch": 17.816091954022987, "grad_norm": 0.16958807408809662, "learning_rate": 5.980038392014309e-06, "loss": 0.0161, "step": 1550 }, { "epoch": 17.93103448275862, "grad_norm": 0.36121729016304016, "learning_rate": 5.3467054553941405e-06, "loss": 0.0158, "step": 1560 }, { "epoch": 18.04597701149425, "grad_norm": 0.2107989490032196, "learning_rate": 4.7478940837649924e-06, "loss": 0.0147, "step": 1570 }, { "epoch": 18.160919540229884, "grad_norm": 0.15654149651527405, "learning_rate": 4.183822673177229e-06, "loss": 0.0164, "step": 1580 }, { "epoch": 18.275862068965516, "grad_norm": 0.1438828557729721, "learning_rate": 3.6546969494637986e-06, "loss": 0.0131, "step": 1590 }, { "epoch": 18.39080459770115, "grad_norm": 0.2543192207813263, "learning_rate": 3.16070989320868e-06, "loss": 0.0157, "step": 1600 }, { "epoch": 18.50574712643678, "grad_norm": 0.13453112542629242, "learning_rate": 2.702041669363875e-06, "loss": 0.017, "step": 1610 }, { "epoch": 18.620689655172413, "grad_norm": 0.16369780898094177, "learning_rate": 2.2788595615403474e-06, "loss": 0.0157, "step": 1620 }, { "epoch": 18.735632183908045, "grad_norm": 0.14639818668365479, "learning_rate": 1.8913179109969482e-06, "loss": 0.0122, "step": 1630 }, { "epoch": 18.850574712643677, "grad_norm": 0.23813354969024658, "learning_rate": 1.5395580603498328e-06, "loss": 0.0157, "step": 1640 }, { "epoch": 18.96551724137931, "grad_norm": 0.15577834844589233, "learning_rate": 1.2237083020224526e-06, "loss": 0.0144, "step": 1650 }, { "epoch": 19.080459770114942, "grad_norm": 0.06880059838294983, "learning_rate": 9.438838314553056e-07, "loss": 0.0109, "step": 1660 }, { "epoch": 19.195402298850574, "grad_norm": 0.19819270074367523, "learning_rate": 7.001867050923095e-07, "loss": 0.0134, "step": 1670 }, { "epoch": 19.310344827586206, "grad_norm": 0.10673543065786362, "learning_rate": 4.92705803159188e-07, "loss": 0.0155, "step": 1680 }, { "epoch": 19.42528735632184, "grad_norm": 0.16529639065265656, "learning_rate": 3.2151679724748975e-07, "loss": 0.0175, "step": 1690 }, { "epoch": 19.54022988505747, "grad_norm": 0.1206677109003067, "learning_rate": 1.8668212271585327e-07, "loss": 0.0188, "step": 1700 }, { "epoch": 19.655172413793103, "grad_norm": 0.10180158913135529, "learning_rate": 8.825095591891152e-08, "loss": 0.0158, "step": 1710 }, { "epoch": 19.770114942528735, "grad_norm": 0.14544282853603363, "learning_rate": 2.625919627188278e-08, "loss": 0.015, "step": 1720 }, { "epoch": 19.885057471264368, "grad_norm": 0.14054107666015625, "learning_rate": 7.294531574553176e-10, "loss": 0.0139, "step": 1730 }, { "epoch": 19.908045977011493, "step": 1732, "total_flos": 2.431760592612004e+17, "train_loss": 0.04785273781403314, "train_runtime": 1980.9382, "train_samples_per_second": 55.957, "train_steps_per_second": 0.874 } ], "logging_steps": 10, "max_steps": 1732, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.431760592612004e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }