{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7476476710350247, "eval_steps": 100, "global_step": 2200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 18.108646392822266, "learning_rate": 1e-05, "loss": 21.0115, "step": 1 }, { "epoch": 0.0, "grad_norm": 19.891128540039062, "learning_rate": 1e-05, "loss": 20.6659, "step": 10 }, { "epoch": 0.01, "grad_norm": 20.71453285217285, "learning_rate": 1e-05, "loss": 20.2163, "step": 20 }, { "epoch": 0.01, "grad_norm": 21.3765926361084, "learning_rate": 1e-05, "loss": 18.8949, "step": 30 }, { "epoch": 0.01, "grad_norm": 19.546733856201172, "learning_rate": 1e-05, "loss": 17.7048, "step": 40 }, { "epoch": 0.02, "grad_norm": 20.177675247192383, "learning_rate": 1e-05, "loss": 16.1373, "step": 50 }, { "epoch": 0.02, "grad_norm": 20.215681076049805, "learning_rate": 1e-05, "loss": 15.0542, "step": 60 }, { "epoch": 0.02, "grad_norm": 20.990703582763672, "learning_rate": 1e-05, "loss": 13.6239, "step": 70 }, { "epoch": 0.03, "grad_norm": 18.313385009765625, "learning_rate": 1e-05, "loss": 12.5539, "step": 80 }, { "epoch": 0.03, "grad_norm": 23.427459716796875, "learning_rate": 1e-05, "loss": 11.1956, "step": 90 }, { "epoch": 0.03, "grad_norm": 38.51707077026367, "learning_rate": 1e-05, "loss": 9.8649, "step": 100 }, { "epoch": 0.04, "grad_norm": 17.18704605102539, "learning_rate": 1e-05, "loss": 8.7252, "step": 110 }, { "epoch": 0.04, "grad_norm": 20.252742767333984, "learning_rate": 1e-05, "loss": 7.8294, "step": 120 }, { "epoch": 0.04, "grad_norm": 20.109561920166016, "learning_rate": 1e-05, "loss": 6.7392, "step": 130 }, { "epoch": 0.05, "grad_norm": 17.344636917114258, "learning_rate": 1e-05, "loss": 5.7289, "step": 140 }, { "epoch": 0.05, "grad_norm": 73.92993927001953, "learning_rate": 1e-05, "loss": 4.6879, "step": 150 }, { "epoch": 0.05, "grad_norm": 12.572184562683105, "learning_rate": 1e-05, "loss": 3.5502, "step": 160 }, { "epoch": 0.06, "grad_norm": 11.667272567749023, "learning_rate": 1e-05, "loss": 2.801, "step": 170 }, { "epoch": 0.06, "grad_norm": 9.249564170837402, "learning_rate": 1e-05, "loss": 2.3337, "step": 180 }, { "epoch": 0.06, "grad_norm": 13.55443000793457, "learning_rate": 1e-05, "loss": 1.9857, "step": 190 }, { "epoch": 0.07, "grad_norm": 248.29791259765625, "learning_rate": 1e-05, "loss": 1.7209, "step": 200 }, { "epoch": 0.07, "grad_norm": 2.2151737213134766, "learning_rate": 1e-05, "loss": 1.5336, "step": 210 }, { "epoch": 0.07, "grad_norm": 1.544109582901001, "learning_rate": 1e-05, "loss": 1.3586, "step": 220 }, { "epoch": 0.08, "grad_norm": 2.0416696071624756, "learning_rate": 1e-05, "loss": 1.3511, "step": 230 }, { "epoch": 0.08, "grad_norm": 1.0984798669815063, "learning_rate": 1e-05, "loss": 1.2404, "step": 240 }, { "epoch": 0.08, "grad_norm": 1.1240887641906738, "learning_rate": 1e-05, "loss": 1.1461, "step": 250 }, { "epoch": 0.09, "grad_norm": 0.9642849564552307, "learning_rate": 1e-05, "loss": 1.1479, "step": 260 }, { "epoch": 0.09, "grad_norm": 9.870434761047363, "learning_rate": 1e-05, "loss": 1.0765, "step": 270 }, { "epoch": 0.1, "grad_norm": 0.7763110399246216, "learning_rate": 1e-05, "loss": 1.0369, "step": 280 }, { "epoch": 0.1, "grad_norm": 0.542849063873291, "learning_rate": 1e-05, "loss": 1.0358, "step": 290 }, { "epoch": 0.1, "grad_norm": 41.260128021240234, "learning_rate": 1e-05, "loss": 0.9912, "step": 300 }, { "epoch": 0.11, "grad_norm": 0.45591601729393005, "learning_rate": 1e-05, "loss": 1.0068, "step": 310 }, { "epoch": 0.11, "grad_norm": 0.4061857759952545, "learning_rate": 1e-05, "loss": 0.9619, "step": 320 }, { "epoch": 0.11, "grad_norm": 0.6108375787734985, "learning_rate": 1e-05, "loss": 0.998, "step": 330 }, { "epoch": 0.12, "grad_norm": 54.73992156982422, "learning_rate": 1e-05, "loss": 0.9671, "step": 340 }, { "epoch": 0.12, "grad_norm": 1.1031421422958374, "learning_rate": 1e-05, "loss": 0.9131, "step": 350 }, { "epoch": 0.12, "grad_norm": 0.8087090849876404, "learning_rate": 1e-05, "loss": 0.8975, "step": 360 }, { "epoch": 0.13, "grad_norm": 0.830881655216217, "learning_rate": 1e-05, "loss": 0.8793, "step": 370 }, { "epoch": 0.13, "grad_norm": 0.6168428659439087, "learning_rate": 1e-05, "loss": 0.8614, "step": 380 }, { "epoch": 0.13, "grad_norm": 0.6438579559326172, "learning_rate": 1e-05, "loss": 0.844, "step": 390 }, { "epoch": 0.14, "grad_norm": 0.7063890099525452, "learning_rate": 1e-05, "loss": 0.8112, "step": 400 }, { "epoch": 0.14, "grad_norm": 0.6345189809799194, "learning_rate": 1e-05, "loss": 0.7793, "step": 410 }, { "epoch": 0.14, "grad_norm": 0.7021321654319763, "learning_rate": 1e-05, "loss": 0.7494, "step": 420 }, { "epoch": 0.15, "grad_norm": 0.6041214466094971, "learning_rate": 1e-05, "loss": 0.7391, "step": 430 }, { "epoch": 0.15, "grad_norm": 0.7526829242706299, "learning_rate": 1e-05, "loss": 0.6812, "step": 440 }, { "epoch": 0.15, "grad_norm": 0.9282905459403992, "learning_rate": 1e-05, "loss": 0.7046, "step": 450 }, { "epoch": 0.16, "grad_norm": 0.7867602109909058, "learning_rate": 1e-05, "loss": 0.6556, "step": 460 }, { "epoch": 0.16, "grad_norm": 0.7166017889976501, "learning_rate": 1e-05, "loss": 0.6598, "step": 470 }, { "epoch": 0.16, "grad_norm": 0.48004817962646484, "learning_rate": 1e-05, "loss": 0.6207, "step": 480 }, { "epoch": 0.17, "grad_norm": 0.5272298455238342, "learning_rate": 1e-05, "loss": 0.6203, "step": 490 }, { "epoch": 0.17, "grad_norm": 0.7496746182441711, "learning_rate": 1e-05, "loss": 0.5562, "step": 500 }, { "epoch": 0.17, "grad_norm": 0.7099195718765259, "learning_rate": 1e-05, "loss": 0.532, "step": 510 }, { "epoch": 0.18, "grad_norm": 0.6780660152435303, "learning_rate": 1e-05, "loss": 0.5432, "step": 520 }, { "epoch": 0.18, "grad_norm": 0.4947499632835388, "learning_rate": 1e-05, "loss": 0.5428, "step": 530 }, { "epoch": 0.18, "grad_norm": 1.102908968925476, "learning_rate": 1e-05, "loss": 0.5605, "step": 540 }, { "epoch": 0.19, "grad_norm": 13.204094886779785, "learning_rate": 1e-05, "loss": 0.5213, "step": 550 }, { "epoch": 0.19, "grad_norm": 0.5231891870498657, "learning_rate": 1e-05, "loss": 0.4716, "step": 560 }, { "epoch": 0.19, "grad_norm": 1.3716018199920654, "learning_rate": 1e-05, "loss": 0.4907, "step": 570 }, { "epoch": 0.2, "grad_norm": 0.6284762620925903, "learning_rate": 1e-05, "loss": 0.4753, "step": 580 }, { "epoch": 0.2, "grad_norm": 0.4310474097728729, "learning_rate": 1e-05, "loss": 0.4351, "step": 590 }, { "epoch": 0.2, "grad_norm": 1.1138970851898193, "learning_rate": 1e-05, "loss": 0.4556, "step": 600 }, { "epoch": 0.21, "grad_norm": 2.0587685108184814, "learning_rate": 1e-05, "loss": 0.4312, "step": 610 }, { "epoch": 0.21, "grad_norm": 0.4586256742477417, "learning_rate": 1e-05, "loss": 0.4448, "step": 620 }, { "epoch": 0.21, "grad_norm": 0.5893386006355286, "learning_rate": 1e-05, "loss": 0.4097, "step": 630 }, { "epoch": 0.22, "grad_norm": 0.6643804311752319, "learning_rate": 1e-05, "loss": 0.3823, "step": 640 }, { "epoch": 0.22, "grad_norm": 0.758120059967041, "learning_rate": 1e-05, "loss": 0.3963, "step": 650 }, { "epoch": 0.22, "grad_norm": 1.0745816230773926, "learning_rate": 1e-05, "loss": 0.3846, "step": 660 }, { "epoch": 0.23, "grad_norm": 53.08203887939453, "learning_rate": 1e-05, "loss": 0.3815, "step": 670 }, { "epoch": 0.23, "grad_norm": 0.9295670390129089, "learning_rate": 1e-05, "loss": 0.3577, "step": 680 }, { "epoch": 0.23, "grad_norm": 0.4713806211948395, "learning_rate": 1e-05, "loss": 0.3429, "step": 690 }, { "epoch": 0.24, "grad_norm": 0.426747590303421, "learning_rate": 1e-05, "loss": 0.3568, "step": 700 }, { "epoch": 0.24, "grad_norm": 0.32080256938934326, "learning_rate": 1e-05, "loss": 0.3549, "step": 710 }, { "epoch": 0.24, "grad_norm": 0.3621009290218353, "learning_rate": 1e-05, "loss": 0.3292, "step": 720 }, { "epoch": 0.25, "grad_norm": 0.7618722915649414, "learning_rate": 1e-05, "loss": 0.3325, "step": 730 }, { "epoch": 0.25, "grad_norm": 0.679786205291748, "learning_rate": 1e-05, "loss": 0.3231, "step": 740 }, { "epoch": 0.25, "grad_norm": 0.5011792182922363, "learning_rate": 1e-05, "loss": 0.3287, "step": 750 }, { "epoch": 0.26, "grad_norm": 0.47687655687332153, "learning_rate": 1e-05, "loss": 0.302, "step": 760 }, { "epoch": 0.26, "grad_norm": 0.5612205862998962, "learning_rate": 1e-05, "loss": 0.3053, "step": 770 }, { "epoch": 0.27, "grad_norm": 1.2981828451156616, "learning_rate": 1e-05, "loss": 0.2853, "step": 780 }, { "epoch": 0.27, "grad_norm": 0.4281419813632965, "learning_rate": 1e-05, "loss": 0.2746, "step": 790 }, { "epoch": 0.27, "grad_norm": 0.6802565455436707, "learning_rate": 1e-05, "loss": 0.2708, "step": 800 }, { "epoch": 0.28, "grad_norm": 3.3523566722869873, "learning_rate": 1e-05, "loss": 0.5448, "step": 810 }, { "epoch": 0.28, "grad_norm": 2.0021793842315674, "learning_rate": 1e-05, "loss": 0.3757, "step": 820 }, { "epoch": 0.28, "grad_norm": 1.0229527950286865, "learning_rate": 1e-05, "loss": 0.3256, "step": 830 }, { "epoch": 0.29, "grad_norm": 0.5520893931388855, "learning_rate": 1e-05, "loss": 0.292, "step": 840 }, { "epoch": 0.29, "grad_norm": 0.5874707698822021, "learning_rate": 1e-05, "loss": 0.2686, "step": 850 }, { "epoch": 0.29, "grad_norm": 0.38817650079727173, "learning_rate": 1e-05, "loss": 0.2527, "step": 860 }, { "epoch": 0.3, "grad_norm": 0.7592063546180725, "learning_rate": 1e-05, "loss": 0.243, "step": 870 }, { "epoch": 0.3, "grad_norm": 0.42729002237319946, "learning_rate": 1e-05, "loss": 0.2297, "step": 880 }, { "epoch": 0.3, "grad_norm": 0.3907771706581116, "learning_rate": 1e-05, "loss": 0.223, "step": 890 }, { "epoch": 0.31, "grad_norm": 0.38824358582496643, "learning_rate": 1e-05, "loss": 0.2173, "step": 900 }, { "epoch": 0.31, "grad_norm": 0.2964054048061371, "learning_rate": 1e-05, "loss": 0.2081, "step": 910 }, { "epoch": 0.31, "grad_norm": 0.37019163370132446, "learning_rate": 1e-05, "loss": 0.2032, "step": 920 }, { "epoch": 0.32, "grad_norm": 0.2846088111400604, "learning_rate": 1e-05, "loss": 0.1971, "step": 930 }, { "epoch": 0.32, "grad_norm": 0.2604421377182007, "learning_rate": 1e-05, "loss": 0.1936, "step": 940 }, { "epoch": 0.32, "grad_norm": 0.3167709410190582, "learning_rate": 1e-05, "loss": 0.1894, "step": 950 }, { "epoch": 0.33, "grad_norm": 0.4469132423400879, "learning_rate": 1e-05, "loss": 0.1817, "step": 960 }, { "epoch": 0.33, "grad_norm": 0.3273838758468628, "learning_rate": 1e-05, "loss": 0.1769, "step": 970 }, { "epoch": 0.33, "grad_norm": 0.3062090277671814, "learning_rate": 1e-05, "loss": 0.1758, "step": 980 }, { "epoch": 0.34, "grad_norm": 0.1933787316083908, "learning_rate": 1e-05, "loss": 0.1701, "step": 990 }, { "epoch": 0.34, "grad_norm": 0.2302626371383667, "learning_rate": 1e-05, "loss": 0.1655, "step": 1000 }, { "epoch": 0.34, "grad_norm": 0.1817045956850052, "learning_rate": 1e-05, "loss": 0.1649, "step": 1010 }, { "epoch": 0.35, "grad_norm": 0.2034452259540558, "learning_rate": 1e-05, "loss": 0.1596, "step": 1020 }, { "epoch": 0.35, "grad_norm": 0.393022745847702, "learning_rate": 1e-05, "loss": 0.1565, "step": 1030 }, { "epoch": 0.35, "grad_norm": 0.45332619547843933, "learning_rate": 1e-05, "loss": 0.1525, "step": 1040 }, { "epoch": 0.36, "grad_norm": 0.3920886516571045, "learning_rate": 1e-05, "loss": 0.1519, "step": 1050 }, { "epoch": 0.36, "grad_norm": 0.2492988407611847, "learning_rate": 1e-05, "loss": 0.1486, "step": 1060 }, { "epoch": 0.36, "grad_norm": 0.18808089196681976, "learning_rate": 1e-05, "loss": 0.144, "step": 1070 }, { "epoch": 0.37, "grad_norm": 0.17545698583126068, "learning_rate": 1e-05, "loss": 0.1436, "step": 1080 }, { "epoch": 0.37, "grad_norm": 0.39826542139053345, "learning_rate": 1e-05, "loss": 0.142, "step": 1090 }, { "epoch": 0.37, "grad_norm": 0.25920161604881287, "learning_rate": 1e-05, "loss": 0.1377, "step": 1100 }, { "epoch": 0.38, "grad_norm": 0.24598653614521027, "learning_rate": 1e-05, "loss": 0.1343, "step": 1110 }, { "epoch": 0.38, "grad_norm": 0.3771628439426422, "learning_rate": 1e-05, "loss": 0.1308, "step": 1120 }, { "epoch": 0.38, "grad_norm": 0.17581653594970703, "learning_rate": 1e-05, "loss": 0.1304, "step": 1130 }, { "epoch": 0.39, "grad_norm": 0.29346442222595215, "learning_rate": 1e-05, "loss": 0.1272, "step": 1140 }, { "epoch": 0.39, "grad_norm": 0.32129135727882385, "learning_rate": 1e-05, "loss": 0.1249, "step": 1150 }, { "epoch": 0.39, "grad_norm": 0.24236874282360077, "learning_rate": 1e-05, "loss": 0.1246, "step": 1160 }, { "epoch": 0.4, "grad_norm": 0.2639343738555908, "learning_rate": 1e-05, "loss": 0.1221, "step": 1170 }, { "epoch": 0.4, "grad_norm": 0.1657143533229828, "learning_rate": 1e-05, "loss": 0.1196, "step": 1180 }, { "epoch": 0.4, "grad_norm": 0.20446737110614777, "learning_rate": 1e-05, "loss": 0.1159, "step": 1190 }, { "epoch": 0.41, "grad_norm": 0.20889534056186676, "learning_rate": 1e-05, "loss": 0.1147, "step": 1200 }, { "epoch": 0.41, "grad_norm": 0.34177979826927185, "learning_rate": 1e-05, "loss": 0.1141, "step": 1210 }, { "epoch": 0.41, "grad_norm": 0.12536485493183136, "learning_rate": 1e-05, "loss": 0.1126, "step": 1220 }, { "epoch": 0.42, "grad_norm": 0.20015142858028412, "learning_rate": 1e-05, "loss": 0.1099, "step": 1230 }, { "epoch": 0.42, "grad_norm": 0.29639932513237, "learning_rate": 1e-05, "loss": 0.1102, "step": 1240 }, { "epoch": 0.42, "grad_norm": 0.41865894198417664, "learning_rate": 1e-05, "loss": 0.1085, "step": 1250 }, { "epoch": 0.43, "grad_norm": 0.14246885478496552, "learning_rate": 1e-05, "loss": 0.1054, "step": 1260 }, { "epoch": 0.43, "grad_norm": 0.29295045137405396, "learning_rate": 1e-05, "loss": 0.1048, "step": 1270 }, { "epoch": 0.43, "grad_norm": 0.5125443339347839, "learning_rate": 1e-05, "loss": 0.1038, "step": 1280 }, { "epoch": 0.44, "grad_norm": 0.7473035454750061, "learning_rate": 1e-05, "loss": 0.1025, "step": 1290 }, { "epoch": 0.44, "grad_norm": 0.7184324264526367, "learning_rate": 1e-05, "loss": 0.101, "step": 1300 }, { "epoch": 0.45, "grad_norm": 0.2857365906238556, "learning_rate": 1e-05, "loss": 0.0988, "step": 1310 }, { "epoch": 0.45, "grad_norm": 0.15471303462982178, "learning_rate": 1e-05, "loss": 0.0987, "step": 1320 }, { "epoch": 0.45, "grad_norm": 0.5794267654418945, "learning_rate": 1e-05, "loss": 0.0986, "step": 1330 }, { "epoch": 0.46, "grad_norm": 0.3085970878601074, "learning_rate": 1e-05, "loss": 0.0964, "step": 1340 }, { "epoch": 0.46, "grad_norm": 0.38039007782936096, "learning_rate": 1e-05, "loss": 0.0939, "step": 1350 }, { "epoch": 0.46, "grad_norm": 0.20592401921749115, "learning_rate": 1e-05, "loss": 0.0932, "step": 1360 }, { "epoch": 0.47, "grad_norm": 0.20163756608963013, "learning_rate": 1e-05, "loss": 0.0931, "step": 1370 }, { "epoch": 0.47, "grad_norm": 0.3857424855232239, "learning_rate": 1e-05, "loss": 0.0913, "step": 1380 }, { "epoch": 0.47, "grad_norm": 0.13959680497646332, "learning_rate": 1e-05, "loss": 0.092, "step": 1390 }, { "epoch": 0.48, "grad_norm": 0.4001765251159668, "learning_rate": 1e-05, "loss": 0.0902, "step": 1400 }, { "epoch": 0.48, "grad_norm": 0.28235796093940735, "learning_rate": 1e-05, "loss": 0.0896, "step": 1410 }, { "epoch": 0.48, "grad_norm": 0.4206916391849518, "learning_rate": 1e-05, "loss": 0.0874, "step": 1420 }, { "epoch": 0.49, "grad_norm": 0.20532837510108948, "learning_rate": 1e-05, "loss": 0.0859, "step": 1430 }, { "epoch": 0.49, "grad_norm": 0.5433242917060852, "learning_rate": 1e-05, "loss": 0.0868, "step": 1440 }, { "epoch": 0.49, "grad_norm": 0.19398020207881927, "learning_rate": 1e-05, "loss": 0.0862, "step": 1450 }, { "epoch": 0.5, "grad_norm": 0.8159850835800171, "learning_rate": 1e-05, "loss": 0.0848, "step": 1460 }, { "epoch": 0.5, "grad_norm": 0.18659061193466187, "learning_rate": 1e-05, "loss": 0.0839, "step": 1470 }, { "epoch": 0.5, "grad_norm": 0.2018926590681076, "learning_rate": 1e-05, "loss": 0.0826, "step": 1480 }, { "epoch": 0.51, "grad_norm": 0.2839762568473816, "learning_rate": 1e-05, "loss": 0.0824, "step": 1490 }, { "epoch": 0.51, "grad_norm": 0.1700347363948822, "learning_rate": 1e-05, "loss": 0.0814, "step": 1500 }, { "epoch": 0.51, "grad_norm": 0.13129307329654694, "learning_rate": 1e-05, "loss": 0.0815, "step": 1510 }, { "epoch": 0.52, "grad_norm": 0.397320032119751, "learning_rate": 1e-05, "loss": 0.0805, "step": 1520 }, { "epoch": 0.52, "grad_norm": 0.2305113822221756, "learning_rate": 1e-05, "loss": 0.0789, "step": 1530 }, { "epoch": 0.52, "grad_norm": 0.37059059739112854, "learning_rate": 1e-05, "loss": 0.0775, "step": 1540 }, { "epoch": 0.53, "grad_norm": 0.2454056739807129, "learning_rate": 1e-05, "loss": 0.0781, "step": 1550 }, { "epoch": 0.53, "grad_norm": 0.340133398771286, "learning_rate": 1e-05, "loss": 0.0778, "step": 1560 }, { "epoch": 0.53, "grad_norm": 0.250377893447876, "learning_rate": 1e-05, "loss": 0.0753, "step": 1570 }, { "epoch": 0.54, "grad_norm": 0.18379095196723938, "learning_rate": 1e-05, "loss": 0.0755, "step": 1580 }, { "epoch": 0.54, "grad_norm": 0.16987499594688416, "learning_rate": 1e-05, "loss": 0.0743, "step": 1590 }, { "epoch": 0.54, "grad_norm": 0.1600923091173172, "learning_rate": 1e-05, "loss": 0.0739, "step": 1600 }, { "epoch": 0.55, "grad_norm": 0.15362544357776642, "learning_rate": 1e-05, "loss": 0.0732, "step": 1610 }, { "epoch": 0.55, "grad_norm": 0.40313267707824707, "learning_rate": 1e-05, "loss": 0.0726, "step": 1620 }, { "epoch": 0.55, "grad_norm": 0.43576550483703613, "learning_rate": 1e-05, "loss": 0.0722, "step": 1630 }, { "epoch": 0.56, "grad_norm": 0.255543977022171, "learning_rate": 1e-05, "loss": 0.0709, "step": 1640 }, { "epoch": 0.56, "grad_norm": 0.23503994941711426, "learning_rate": 1e-05, "loss": 0.0719, "step": 1650 }, { "epoch": 0.56, "grad_norm": 0.27073901891708374, "learning_rate": 1e-05, "loss": 0.0702, "step": 1660 }, { "epoch": 0.57, "grad_norm": 0.15940971672534943, "learning_rate": 1e-05, "loss": 0.0699, "step": 1670 }, { "epoch": 0.57, "grad_norm": 0.3071956932544708, "learning_rate": 1e-05, "loss": 0.0701, "step": 1680 }, { "epoch": 0.57, "grad_norm": 0.13690294325351715, "learning_rate": 1e-05, "loss": 0.0687, "step": 1690 }, { "epoch": 0.58, "grad_norm": 0.11188670992851257, "learning_rate": 1e-05, "loss": 0.0685, "step": 1700 }, { "epoch": 0.58, "grad_norm": 0.22904923558235168, "learning_rate": 1e-05, "loss": 0.069, "step": 1710 }, { "epoch": 0.58, "grad_norm": 0.15047454833984375, "learning_rate": 1e-05, "loss": 0.0676, "step": 1720 }, { "epoch": 0.59, "grad_norm": 0.1788366287946701, "learning_rate": 1e-05, "loss": 0.0663, "step": 1730 }, { "epoch": 0.59, "grad_norm": 0.1737486571073532, "learning_rate": 1e-05, "loss": 0.0665, "step": 1740 }, { "epoch": 0.59, "grad_norm": 0.18163177371025085, "learning_rate": 1e-05, "loss": 0.066, "step": 1750 }, { "epoch": 0.6, "grad_norm": 0.15237963199615479, "learning_rate": 1e-05, "loss": 0.0647, "step": 1760 }, { "epoch": 0.6, "grad_norm": 0.1539287269115448, "learning_rate": 1e-05, "loss": 0.0655, "step": 1770 }, { "epoch": 0.6, "grad_norm": 0.13951052725315094, "learning_rate": 1e-05, "loss": 0.0649, "step": 1780 }, { "epoch": 0.61, "grad_norm": 0.1300867348909378, "learning_rate": 1e-05, "loss": 0.0641, "step": 1790 }, { "epoch": 0.61, "grad_norm": 0.12030315399169922, "learning_rate": 1e-05, "loss": 0.0636, "step": 1800 }, { "epoch": 0.62, "grad_norm": 0.10913581401109695, "learning_rate": 1e-05, "loss": 0.0636, "step": 1810 }, { "epoch": 0.62, "grad_norm": 0.17842526733875275, "learning_rate": 1e-05, "loss": 0.0637, "step": 1820 }, { "epoch": 0.62, "grad_norm": 0.20770031213760376, "learning_rate": 1e-05, "loss": 0.0632, "step": 1830 }, { "epoch": 0.63, "grad_norm": 0.12772506475448608, "learning_rate": 1e-05, "loss": 0.0625, "step": 1840 }, { "epoch": 0.63, "grad_norm": 0.10420256853103638, "learning_rate": 1e-05, "loss": 0.0625, "step": 1850 }, { "epoch": 0.63, "grad_norm": 0.2480468451976776, "learning_rate": 1e-05, "loss": 0.062, "step": 1860 }, { "epoch": 0.64, "grad_norm": 0.17825984954833984, "learning_rate": 1e-05, "loss": 0.0605, "step": 1870 }, { "epoch": 0.64, "grad_norm": 0.12127891182899475, "learning_rate": 1e-05, "loss": 0.0608, "step": 1880 }, { "epoch": 0.64, "grad_norm": 0.09041409939527512, "learning_rate": 1e-05, "loss": 0.0602, "step": 1890 }, { "epoch": 0.65, "grad_norm": 0.10765266418457031, "learning_rate": 1e-05, "loss": 0.0604, "step": 1900 }, { "epoch": 0.65, "grad_norm": 0.12556296586990356, "learning_rate": 1e-05, "loss": 0.0592, "step": 1910 }, { "epoch": 0.65, "grad_norm": 0.2558988928794861, "learning_rate": 1e-05, "loss": 0.0598, "step": 1920 }, { "epoch": 0.66, "grad_norm": 0.12372375279664993, "learning_rate": 1e-05, "loss": 0.0582, "step": 1930 }, { "epoch": 0.66, "grad_norm": 0.0909271314740181, "learning_rate": 1e-05, "loss": 0.0586, "step": 1940 }, { "epoch": 0.66, "grad_norm": 0.19711463153362274, "learning_rate": 1e-05, "loss": 0.058, "step": 1950 }, { "epoch": 0.67, "grad_norm": 0.13144218921661377, "learning_rate": 1e-05, "loss": 0.0592, "step": 1960 }, { "epoch": 0.67, "grad_norm": 0.10313715785741806, "learning_rate": 1e-05, "loss": 0.0577, "step": 1970 }, { "epoch": 0.67, "grad_norm": 0.15561428666114807, "learning_rate": 1e-05, "loss": 0.0581, "step": 1980 }, { "epoch": 0.68, "grad_norm": 0.1571255624294281, "learning_rate": 1e-05, "loss": 0.0576, "step": 1990 }, { "epoch": 0.68, "grad_norm": 0.34297963976860046, "learning_rate": 1e-05, "loss": 0.0574, "step": 2000 }, { "epoch": 0.68, "grad_norm": 0.3346453011035919, "learning_rate": 1e-05, "loss": 0.0573, "step": 2010 }, { "epoch": 0.69, "grad_norm": 0.09346891939640045, "learning_rate": 1e-05, "loss": 0.0558, "step": 2020 }, { "epoch": 0.69, "grad_norm": 0.3849794566631317, "learning_rate": 1e-05, "loss": 0.0576, "step": 2030 }, { "epoch": 0.69, "grad_norm": 0.3006366789340973, "learning_rate": 1e-05, "loss": 0.0561, "step": 2040 }, { "epoch": 0.7, "grad_norm": 0.08292259275913239, "learning_rate": 1e-05, "loss": 0.0567, "step": 2050 }, { "epoch": 0.7, "grad_norm": 0.37851089239120483, "learning_rate": 1e-05, "loss": 0.0556, "step": 2060 }, { "epoch": 0.7, "grad_norm": 0.2993626892566681, "learning_rate": 1e-05, "loss": 0.0551, "step": 2070 }, { "epoch": 0.71, "grad_norm": 1.3539237976074219, "learning_rate": 1e-05, "loss": 0.0556, "step": 2080 }, { "epoch": 0.71, "grad_norm": 0.6489169597625732, "learning_rate": 1e-05, "loss": 0.0556, "step": 2090 }, { "epoch": 0.71, "grad_norm": 0.20589084923267365, "learning_rate": 1e-05, "loss": 0.0545, "step": 2100 }, { "epoch": 0.72, "grad_norm": 0.08394920825958252, "learning_rate": 1e-05, "loss": 0.0542, "step": 2110 }, { "epoch": 0.72, "grad_norm": 0.0861760601401329, "learning_rate": 1e-05, "loss": 0.0534, "step": 2120 }, { "epoch": 0.72, "grad_norm": 0.22998709976673126, "learning_rate": 1e-05, "loss": 0.0538, "step": 2130 }, { "epoch": 0.73, "grad_norm": 0.1493302583694458, "learning_rate": 1e-05, "loss": 0.0542, "step": 2140 }, { "epoch": 0.73, "grad_norm": 0.11739397794008255, "learning_rate": 1e-05, "loss": 0.0539, "step": 2150 }, { "epoch": 0.73, "grad_norm": 0.10503773391246796, "learning_rate": 1e-05, "loss": 0.053, "step": 2160 }, { "epoch": 0.74, "grad_norm": 0.2653915286064148, "learning_rate": 1e-05, "loss": 0.0533, "step": 2170 }, { "epoch": 0.74, "grad_norm": 0.1554844081401825, "learning_rate": 1e-05, "loss": 0.0528, "step": 2180 }, { "epoch": 0.74, "grad_norm": 0.20956221222877502, "learning_rate": 1e-05, "loss": 0.0524, "step": 2190 }, { "epoch": 0.75, "grad_norm": 0.07734131067991257, "learning_rate": 1e-05, "loss": 0.0524, "step": 2200 } ], "logging_steps": 10, "max_steps": 17652, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 200, "total_flos": 3.689592339116851e+18, "train_batch_size": 3, "trial_name": null, "trial_params": null }