{ "best_metric": 0.51440329218107, "best_model_checkpoint": "./save_phi2_ft_lora/checkpoint-2500", "epoch": 0.7268498328245384, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0029073993312981536, "grad_norm": 4.615232944488525, "learning_rate": 1.0000000000000002e-06, "loss": 6.6211, "step": 10 }, { "epoch": 0.005814798662596307, "grad_norm": 5.528043746948242, "learning_rate": 2.0000000000000003e-06, "loss": 6.5222, "step": 20 }, { "epoch": 0.00872219799389446, "grad_norm": 6.019925594329834, "learning_rate": 3e-06, "loss": 6.4104, "step": 30 }, { "epoch": 0.011629597325192614, "grad_norm": 7.163036823272705, "learning_rate": 4.000000000000001e-06, "loss": 6.0366, "step": 40 }, { "epoch": 0.01453699665649077, "grad_norm": 7.888458251953125, "learning_rate": 5e-06, "loss": 5.4618, "step": 50 }, { "epoch": 0.01744439598778892, "grad_norm": 9.86289119720459, "learning_rate": 6e-06, "loss": 4.526, "step": 60 }, { "epoch": 0.020351795319087075, "grad_norm": 9.114004135131836, "learning_rate": 7.000000000000001e-06, "loss": 3.2234, "step": 70 }, { "epoch": 0.02325919465038523, "grad_norm": 6.198110103607178, "learning_rate": 8.000000000000001e-06, "loss": 1.6796, "step": 80 }, { "epoch": 0.026166593981683386, "grad_norm": 1.2424126863479614, "learning_rate": 9e-06, "loss": 0.6189, "step": 90 }, { "epoch": 0.02907399331298154, "grad_norm": 0.20757392048835754, "learning_rate": 1e-05, "loss": 0.2989, "step": 100 }, { "epoch": 0.03198139264427969, "grad_norm": 0.19180330634117126, "learning_rate": 1.1000000000000001e-05, "loss": 0.3434, "step": 110 }, { "epoch": 0.03488879197557784, "grad_norm": 0.17373745143413544, "learning_rate": 1.2e-05, "loss": 0.2775, "step": 120 }, { "epoch": 0.037796191306876, "grad_norm": 0.14247311651706696, "learning_rate": 1.3000000000000001e-05, "loss": 0.291, "step": 130 }, { "epoch": 0.04070359063817415, "grad_norm": 0.1189289391040802, "learning_rate": 1.4000000000000001e-05, "loss": 0.2549, "step": 140 }, { "epoch": 0.043610989969472304, "grad_norm": 0.11715172231197357, "learning_rate": 1.5e-05, "loss": 0.2334, "step": 150 }, { "epoch": 0.04651838930077046, "grad_norm": 0.16550108790397644, "learning_rate": 1.6000000000000003e-05, "loss": 0.2515, "step": 160 }, { "epoch": 0.04942578863206862, "grad_norm": 0.13685165345668793, "learning_rate": 1.7000000000000003e-05, "loss": 0.2757, "step": 170 }, { "epoch": 0.05233318796336677, "grad_norm": 0.13247624039649963, "learning_rate": 1.8e-05, "loss": 0.2336, "step": 180 }, { "epoch": 0.055240587294664925, "grad_norm": 0.13763906061649323, "learning_rate": 1.9e-05, "loss": 0.2169, "step": 190 }, { "epoch": 0.05814798662596308, "grad_norm": 0.1327727884054184, "learning_rate": 2e-05, "loss": 0.2007, "step": 200 }, { "epoch": 0.06105538595726123, "grad_norm": 0.1083875447511673, "learning_rate": 2.1e-05, "loss": 0.2181, "step": 210 }, { "epoch": 0.06396278528855938, "grad_norm": 0.12445604056119919, "learning_rate": 2.2000000000000003e-05, "loss": 0.2969, "step": 220 }, { "epoch": 0.06687018461985754, "grad_norm": 0.15511994063854218, "learning_rate": 2.3000000000000003e-05, "loss": 0.1868, "step": 230 }, { "epoch": 0.06977758395115569, "grad_norm": 0.13466577231884003, "learning_rate": 2.4e-05, "loss": 0.25, "step": 240 }, { "epoch": 0.07268498328245385, "grad_norm": 0.06756726652383804, "learning_rate": 2.5e-05, "loss": 0.1809, "step": 250 }, { "epoch": 0.075592382613752, "grad_norm": 0.1573372632265091, "learning_rate": 2.6000000000000002e-05, "loss": 0.1963, "step": 260 }, { "epoch": 0.07849978194505015, "grad_norm": 0.13099811971187592, "learning_rate": 2.7000000000000002e-05, "loss": 0.1984, "step": 270 }, { "epoch": 0.0814071812763483, "grad_norm": 0.11419626325368881, "learning_rate": 2.8000000000000003e-05, "loss": 0.1665, "step": 280 }, { "epoch": 0.08431458060764646, "grad_norm": 0.10189209878444672, "learning_rate": 2.9e-05, "loss": 0.1749, "step": 290 }, { "epoch": 0.08722197993894461, "grad_norm": 0.12042666226625443, "learning_rate": 3e-05, "loss": 0.1528, "step": 300 }, { "epoch": 0.09012937927024277, "grad_norm": 0.08344505727291107, "learning_rate": 3.1e-05, "loss": 0.181, "step": 310 }, { "epoch": 0.09303677860154091, "grad_norm": 0.11566774547100067, "learning_rate": 3.2000000000000005e-05, "loss": 0.1363, "step": 320 }, { "epoch": 0.09594417793283908, "grad_norm": 0.1531219631433487, "learning_rate": 3.3e-05, "loss": 0.1564, "step": 330 }, { "epoch": 0.09885157726413724, "grad_norm": 0.08966905623674393, "learning_rate": 3.4000000000000007e-05, "loss": 0.1281, "step": 340 }, { "epoch": 0.10175897659543538, "grad_norm": 0.08444681763648987, "learning_rate": 3.5e-05, "loss": 0.1258, "step": 350 }, { "epoch": 0.10466637592673354, "grad_norm": 0.10482101142406464, "learning_rate": 3.6e-05, "loss": 0.1292, "step": 360 }, { "epoch": 0.10757377525803169, "grad_norm": 0.14638499915599823, "learning_rate": 3.7e-05, "loss": 0.1829, "step": 370 }, { "epoch": 0.11048117458932985, "grad_norm": 0.1461925059556961, "learning_rate": 3.8e-05, "loss": 0.2158, "step": 380 }, { "epoch": 0.113388573920628, "grad_norm": 0.16068245470523834, "learning_rate": 3.9000000000000006e-05, "loss": 0.2219, "step": 390 }, { "epoch": 0.11629597325192616, "grad_norm": 0.13314808905124664, "learning_rate": 4e-05, "loss": 0.1622, "step": 400 }, { "epoch": 0.1192033725832243, "grad_norm": 0.13541767001152039, "learning_rate": 4.1e-05, "loss": 0.1363, "step": 410 }, { "epoch": 0.12211077191452246, "grad_norm": 0.14549389481544495, "learning_rate": 4.2e-05, "loss": 0.1664, "step": 420 }, { "epoch": 0.12501817124582063, "grad_norm": 0.09966716170310974, "learning_rate": 4.3e-05, "loss": 0.1446, "step": 430 }, { "epoch": 0.12792557057711876, "grad_norm": 0.10374356061220169, "learning_rate": 4.4000000000000006e-05, "loss": 0.1302, "step": 440 }, { "epoch": 0.13083296990841692, "grad_norm": 0.10534253716468811, "learning_rate": 4.5e-05, "loss": 0.1612, "step": 450 }, { "epoch": 0.13374036923971508, "grad_norm": 0.14981906116008759, "learning_rate": 4.600000000000001e-05, "loss": 0.1782, "step": 460 }, { "epoch": 0.13664776857101324, "grad_norm": 0.08772515505552292, "learning_rate": 4.7e-05, "loss": 0.1303, "step": 470 }, { "epoch": 0.13955516790231137, "grad_norm": 0.14555956423282623, "learning_rate": 4.8e-05, "loss": 0.1259, "step": 480 }, { "epoch": 0.14246256723360953, "grad_norm": 0.08366917818784714, "learning_rate": 4.9e-05, "loss": 0.1827, "step": 490 }, { "epoch": 0.1453699665649077, "grad_norm": 0.13676565885543823, "learning_rate": 5e-05, "loss": 0.1563, "step": 500 }, { "epoch": 0.1453699665649077, "eval_accuracy": 0.4444444444444444, "step": 500 }, { "epoch": 0.1453699665649077, "eval_loss": 0.15286579728126526, "eval_runtime": 120.6585, "eval_samples_per_second": 2.014, "eval_steps_per_second": 2.014, "step": 500 }, { "epoch": 0.14827736589620585, "grad_norm": 0.16602879762649536, "learning_rate": 4.982987410683906e-05, "loss": 0.1322, "step": 510 }, { "epoch": 0.151184765227504, "grad_norm": 0.12974093854427338, "learning_rate": 4.965974821367813e-05, "loss": 0.1432, "step": 520 }, { "epoch": 0.15409216455880215, "grad_norm": 0.10730555653572083, "learning_rate": 4.948962232051719e-05, "loss": 0.1344, "step": 530 }, { "epoch": 0.1569995638901003, "grad_norm": 0.09881391376256943, "learning_rate": 4.931949642735625e-05, "loss": 0.1207, "step": 540 }, { "epoch": 0.15990696322139847, "grad_norm": 0.15530119836330414, "learning_rate": 4.9149370534195306e-05, "loss": 0.2084, "step": 550 }, { "epoch": 0.1628143625526966, "grad_norm": 0.1018906682729721, "learning_rate": 4.8979244641034366e-05, "loss": 0.1465, "step": 560 }, { "epoch": 0.16572176188399476, "grad_norm": 0.11174057424068451, "learning_rate": 4.8809118747873425e-05, "loss": 0.1704, "step": 570 }, { "epoch": 0.16862916121529292, "grad_norm": 0.13199737668037415, "learning_rate": 4.863899285471249e-05, "loss": 0.1376, "step": 580 }, { "epoch": 0.17153656054659108, "grad_norm": 0.12376833707094193, "learning_rate": 4.846886696155155e-05, "loss": 0.1848, "step": 590 }, { "epoch": 0.17444395987788922, "grad_norm": 0.10838288813829422, "learning_rate": 4.829874106839061e-05, "loss": 0.1327, "step": 600 }, { "epoch": 0.17735135920918738, "grad_norm": 0.09848768264055252, "learning_rate": 4.8128615175229676e-05, "loss": 0.1807, "step": 610 }, { "epoch": 0.18025875854048554, "grad_norm": 0.10199768096208572, "learning_rate": 4.7958489282068736e-05, "loss": 0.1475, "step": 620 }, { "epoch": 0.1831661578717837, "grad_norm": 0.1114463359117508, "learning_rate": 4.7788363388907795e-05, "loss": 0.1458, "step": 630 }, { "epoch": 0.18607355720308183, "grad_norm": 0.09931254386901855, "learning_rate": 4.7618237495746855e-05, "loss": 0.1572, "step": 640 }, { "epoch": 0.18898095653438, "grad_norm": 0.07883786410093307, "learning_rate": 4.7448111602585914e-05, "loss": 0.1502, "step": 650 }, { "epoch": 0.19188835586567815, "grad_norm": 0.1312653124332428, "learning_rate": 4.7277985709424973e-05, "loss": 0.1306, "step": 660 }, { "epoch": 0.1947957551969763, "grad_norm": 0.10453817993402481, "learning_rate": 4.710785981626404e-05, "loss": 0.1516, "step": 670 }, { "epoch": 0.19770315452827447, "grad_norm": 0.13967570662498474, "learning_rate": 4.69377339231031e-05, "loss": 0.1976, "step": 680 }, { "epoch": 0.2006105538595726, "grad_norm": 0.09553707391023636, "learning_rate": 4.676760802994216e-05, "loss": 0.1291, "step": 690 }, { "epoch": 0.20351795319087077, "grad_norm": 0.14968940615653992, "learning_rate": 4.6597482136781225e-05, "loss": 0.1669, "step": 700 }, { "epoch": 0.20642535252216893, "grad_norm": 0.10163428634405136, "learning_rate": 4.642735624362028e-05, "loss": 0.1727, "step": 710 }, { "epoch": 0.2093327518534671, "grad_norm": 0.13607974350452423, "learning_rate": 4.6257230350459343e-05, "loss": 0.1473, "step": 720 }, { "epoch": 0.21224015118476522, "grad_norm": 0.0766085609793663, "learning_rate": 4.60871044572984e-05, "loss": 0.1419, "step": 730 }, { "epoch": 0.21514755051606338, "grad_norm": 0.11004997789859772, "learning_rate": 4.591697856413746e-05, "loss": 0.1344, "step": 740 }, { "epoch": 0.21805494984736154, "grad_norm": 0.10307681560516357, "learning_rate": 4.574685267097653e-05, "loss": 0.1355, "step": 750 }, { "epoch": 0.2209623491786597, "grad_norm": 0.098714679479599, "learning_rate": 4.557672677781559e-05, "loss": 0.1565, "step": 760 }, { "epoch": 0.22386974850995783, "grad_norm": 0.1346523016691208, "learning_rate": 4.540660088465465e-05, "loss": 0.1675, "step": 770 }, { "epoch": 0.226777147841256, "grad_norm": 0.12962771952152252, "learning_rate": 4.5236474991493714e-05, "loss": 0.1058, "step": 780 }, { "epoch": 0.22968454717255415, "grad_norm": 0.14412884414196014, "learning_rate": 4.5066349098332766e-05, "loss": 0.198, "step": 790 }, { "epoch": 0.23259194650385231, "grad_norm": 0.14253386855125427, "learning_rate": 4.4896223205171826e-05, "loss": 0.1319, "step": 800 }, { "epoch": 0.23549934583515045, "grad_norm": 0.2203107625246048, "learning_rate": 4.472609731201089e-05, "loss": 0.1729, "step": 810 }, { "epoch": 0.2384067451664486, "grad_norm": 0.09438087791204453, "learning_rate": 4.455597141884995e-05, "loss": 0.1558, "step": 820 }, { "epoch": 0.24131414449774677, "grad_norm": 0.09754593670368195, "learning_rate": 4.438584552568901e-05, "loss": 0.1729, "step": 830 }, { "epoch": 0.24422154382904493, "grad_norm": 0.1403294950723648, "learning_rate": 4.421571963252808e-05, "loss": 0.1871, "step": 840 }, { "epoch": 0.24712894316034306, "grad_norm": 0.11251681298017502, "learning_rate": 4.4045593739367136e-05, "loss": 0.1726, "step": 850 }, { "epoch": 0.25003634249164125, "grad_norm": 0.13492991030216217, "learning_rate": 4.3875467846206196e-05, "loss": 0.1349, "step": 860 }, { "epoch": 0.2529437418229394, "grad_norm": 0.11674369871616364, "learning_rate": 4.3705341953045255e-05, "loss": 0.2088, "step": 870 }, { "epoch": 0.2558511411542375, "grad_norm": 0.09339413791894913, "learning_rate": 4.3535216059884315e-05, "loss": 0.1656, "step": 880 }, { "epoch": 0.2587585404855357, "grad_norm": 0.10234147310256958, "learning_rate": 4.3365090166723374e-05, "loss": 0.1495, "step": 890 }, { "epoch": 0.26166593981683384, "grad_norm": 0.09566229581832886, "learning_rate": 4.319496427356244e-05, "loss": 0.109, "step": 900 }, { "epoch": 0.26457333914813197, "grad_norm": 0.10038291662931442, "learning_rate": 4.30248383804015e-05, "loss": 0.1849, "step": 910 }, { "epoch": 0.26748073847943016, "grad_norm": 0.12414570152759552, "learning_rate": 4.285471248724056e-05, "loss": 0.1476, "step": 920 }, { "epoch": 0.2703881378107283, "grad_norm": 0.0866188034415245, "learning_rate": 4.2684586594079625e-05, "loss": 0.1339, "step": 930 }, { "epoch": 0.2732955371420265, "grad_norm": 0.20008312165737152, "learning_rate": 4.2514460700918685e-05, "loss": 0.1865, "step": 940 }, { "epoch": 0.2762029364733246, "grad_norm": 0.09849795699119568, "learning_rate": 4.2344334807757744e-05, "loss": 0.1293, "step": 950 }, { "epoch": 0.27911033580462274, "grad_norm": 0.11204036325216293, "learning_rate": 4.2174208914596803e-05, "loss": 0.1817, "step": 960 }, { "epoch": 0.28201773513592093, "grad_norm": 0.0960935726761818, "learning_rate": 4.200408302143586e-05, "loss": 0.1454, "step": 970 }, { "epoch": 0.28492513446721907, "grad_norm": 0.12531103193759918, "learning_rate": 4.183395712827492e-05, "loss": 0.126, "step": 980 }, { "epoch": 0.28783253379851725, "grad_norm": 0.1162460595369339, "learning_rate": 4.166383123511399e-05, "loss": 0.1634, "step": 990 }, { "epoch": 0.2907399331298154, "grad_norm": 0.06354565173387527, "learning_rate": 4.149370534195305e-05, "loss": 0.0986, "step": 1000 }, { "epoch": 0.2907399331298154, "eval_accuracy": 0.48148148148148145, "step": 1000 }, { "epoch": 0.2907399331298154, "eval_loss": 0.14855271577835083, "eval_runtime": 120.8042, "eval_samples_per_second": 2.012, "eval_steps_per_second": 2.012, "step": 1000 }, { "epoch": 0.2936473324611135, "grad_norm": 0.0838373601436615, "learning_rate": 4.132357944879211e-05, "loss": 0.1266, "step": 1010 }, { "epoch": 0.2965547317924117, "grad_norm": 0.10404589027166367, "learning_rate": 4.1153453555631173e-05, "loss": 0.1411, "step": 1020 }, { "epoch": 0.29946213112370984, "grad_norm": 0.137540802359581, "learning_rate": 4.0983327662470226e-05, "loss": 0.1719, "step": 1030 }, { "epoch": 0.302369530455008, "grad_norm": 0.10742737352848053, "learning_rate": 4.081320176930929e-05, "loss": 0.181, "step": 1040 }, { "epoch": 0.30527692978630616, "grad_norm": 0.14736784994602203, "learning_rate": 4.064307587614835e-05, "loss": 0.1544, "step": 1050 }, { "epoch": 0.3081843291176043, "grad_norm": 0.15068358182907104, "learning_rate": 4.047294998298741e-05, "loss": 0.1359, "step": 1060 }, { "epoch": 0.3110917284489025, "grad_norm": 0.12078270316123962, "learning_rate": 4.030282408982648e-05, "loss": 0.1304, "step": 1070 }, { "epoch": 0.3139991277802006, "grad_norm": 0.12046414613723755, "learning_rate": 4.013269819666554e-05, "loss": 0.1379, "step": 1080 }, { "epoch": 0.31690652711149875, "grad_norm": 0.13933445513248444, "learning_rate": 3.9962572303504596e-05, "loss": 0.1413, "step": 1090 }, { "epoch": 0.31981392644279694, "grad_norm": 0.09172947704792023, "learning_rate": 3.9792446410343656e-05, "loss": 0.1316, "step": 1100 }, { "epoch": 0.32272132577409507, "grad_norm": 0.11812902987003326, "learning_rate": 3.9622320517182715e-05, "loss": 0.1322, "step": 1110 }, { "epoch": 0.3256287251053932, "grad_norm": 0.1187342032790184, "learning_rate": 3.9452194624021774e-05, "loss": 0.138, "step": 1120 }, { "epoch": 0.3285361244366914, "grad_norm": 0.10506665706634521, "learning_rate": 3.928206873086084e-05, "loss": 0.1324, "step": 1130 }, { "epoch": 0.3314435237679895, "grad_norm": 0.06862190365791321, "learning_rate": 3.91119428376999e-05, "loss": 0.1375, "step": 1140 }, { "epoch": 0.3343509230992877, "grad_norm": 0.07644759863615036, "learning_rate": 3.894181694453896e-05, "loss": 0.1384, "step": 1150 }, { "epoch": 0.33725832243058584, "grad_norm": 0.12909413874149323, "learning_rate": 3.8771691051378026e-05, "loss": 0.1251, "step": 1160 }, { "epoch": 0.340165721761884, "grad_norm": 0.10902974754571915, "learning_rate": 3.8601565158217085e-05, "loss": 0.1, "step": 1170 }, { "epoch": 0.34307312109318217, "grad_norm": 0.09570310264825821, "learning_rate": 3.8431439265056145e-05, "loss": 0.1901, "step": 1180 }, { "epoch": 0.3459805204244803, "grad_norm": 0.11525869369506836, "learning_rate": 3.8261313371895204e-05, "loss": 0.1072, "step": 1190 }, { "epoch": 0.34888791975577843, "grad_norm": 0.1647326499223709, "learning_rate": 3.809118747873426e-05, "loss": 0.2477, "step": 1200 }, { "epoch": 0.3517953190870766, "grad_norm": 0.1333259642124176, "learning_rate": 3.792106158557332e-05, "loss": 0.1682, "step": 1210 }, { "epoch": 0.35470271841837475, "grad_norm": 0.11220316588878632, "learning_rate": 3.775093569241239e-05, "loss": 0.1399, "step": 1220 }, { "epoch": 0.35761011774967294, "grad_norm": 0.11679007858037949, "learning_rate": 3.758080979925145e-05, "loss": 0.1792, "step": 1230 }, { "epoch": 0.3605175170809711, "grad_norm": 0.11699705570936203, "learning_rate": 3.741068390609051e-05, "loss": 0.159, "step": 1240 }, { "epoch": 0.3634249164122692, "grad_norm": 0.13930341601371765, "learning_rate": 3.7240558012929574e-05, "loss": 0.1233, "step": 1250 }, { "epoch": 0.3663323157435674, "grad_norm": 0.11168822646141052, "learning_rate": 3.7070432119768633e-05, "loss": 0.1277, "step": 1260 }, { "epoch": 0.3692397150748655, "grad_norm": 0.10844167321920395, "learning_rate": 3.6900306226607686e-05, "loss": 0.1857, "step": 1270 }, { "epoch": 0.37214711440616366, "grad_norm": 0.1075616255402565, "learning_rate": 3.673018033344675e-05, "loss": 0.1169, "step": 1280 }, { "epoch": 0.37505451373746185, "grad_norm": 0.12448868900537491, "learning_rate": 3.656005444028581e-05, "loss": 0.2439, "step": 1290 }, { "epoch": 0.37796191306876, "grad_norm": 0.10500267148017883, "learning_rate": 3.638992854712487e-05, "loss": 0.117, "step": 1300 }, { "epoch": 0.38086931240005817, "grad_norm": 0.12791551649570465, "learning_rate": 3.621980265396394e-05, "loss": 0.13, "step": 1310 }, { "epoch": 0.3837767117313563, "grad_norm": 0.1264408826828003, "learning_rate": 3.6049676760803e-05, "loss": 0.1121, "step": 1320 }, { "epoch": 0.38668411106265443, "grad_norm": 0.1253873109817505, "learning_rate": 3.5879550867642056e-05, "loss": 0.1384, "step": 1330 }, { "epoch": 0.3895915103939526, "grad_norm": 0.08832169324159622, "learning_rate": 3.570942497448112e-05, "loss": 0.1477, "step": 1340 }, { "epoch": 0.39249890972525076, "grad_norm": 0.14027167856693268, "learning_rate": 3.5539299081320175e-05, "loss": 0.2171, "step": 1350 }, { "epoch": 0.39540630905654894, "grad_norm": 0.11315055191516876, "learning_rate": 3.536917318815924e-05, "loss": 0.1606, "step": 1360 }, { "epoch": 0.3983137083878471, "grad_norm": 0.09444567561149597, "learning_rate": 3.51990472949983e-05, "loss": 0.1284, "step": 1370 }, { "epoch": 0.4012211077191452, "grad_norm": 0.11262746155261993, "learning_rate": 3.502892140183736e-05, "loss": 0.1411, "step": 1380 }, { "epoch": 0.4041285070504434, "grad_norm": 0.17679522931575775, "learning_rate": 3.4858795508676426e-05, "loss": 0.1981, "step": 1390 }, { "epoch": 0.40703590638174153, "grad_norm": 0.18890735507011414, "learning_rate": 3.4688669615515486e-05, "loss": 0.1676, "step": 1400 }, { "epoch": 0.40994330571303966, "grad_norm": 0.09281571209430695, "learning_rate": 3.4518543722354545e-05, "loss": 0.1136, "step": 1410 }, { "epoch": 0.41285070504433785, "grad_norm": 0.13167564570903778, "learning_rate": 3.4348417829193604e-05, "loss": 0.1586, "step": 1420 }, { "epoch": 0.415758104375636, "grad_norm": 0.09986082464456558, "learning_rate": 3.4178291936032664e-05, "loss": 0.1607, "step": 1430 }, { "epoch": 0.4186655037069342, "grad_norm": 0.13981035351753235, "learning_rate": 3.400816604287172e-05, "loss": 0.1439, "step": 1440 }, { "epoch": 0.4215729030382323, "grad_norm": 0.11107143759727478, "learning_rate": 3.383804014971079e-05, "loss": 0.1382, "step": 1450 }, { "epoch": 0.42448030236953044, "grad_norm": 0.06783778220415115, "learning_rate": 3.366791425654985e-05, "loss": 0.1138, "step": 1460 }, { "epoch": 0.4273877017008286, "grad_norm": 0.11150864511728287, "learning_rate": 3.349778836338891e-05, "loss": 0.1709, "step": 1470 }, { "epoch": 0.43029510103212676, "grad_norm": 0.1245250254869461, "learning_rate": 3.3327662470227975e-05, "loss": 0.1323, "step": 1480 }, { "epoch": 0.4332025003634249, "grad_norm": 0.13299217820167542, "learning_rate": 3.3157536577067034e-05, "loss": 0.1311, "step": 1490 }, { "epoch": 0.4361098996947231, "grad_norm": 0.09196653217077255, "learning_rate": 3.298741068390609e-05, "loss": 0.121, "step": 1500 }, { "epoch": 0.4361098996947231, "eval_accuracy": 0.46502057613168724, "step": 1500 }, { "epoch": 0.4361098996947231, "eval_loss": 0.1465623825788498, "eval_runtime": 120.6266, "eval_samples_per_second": 2.014, "eval_steps_per_second": 2.014, "step": 1500 }, { "epoch": 0.4390172990260212, "grad_norm": 0.08671022951602936, "learning_rate": 3.281728479074515e-05, "loss": 0.1774, "step": 1510 }, { "epoch": 0.4419246983573194, "grad_norm": 0.1013597920536995, "learning_rate": 3.264715889758421e-05, "loss": 0.147, "step": 1520 }, { "epoch": 0.44483209768861753, "grad_norm": 0.13395969569683075, "learning_rate": 3.247703300442327e-05, "loss": 0.141, "step": 1530 }, { "epoch": 0.44773949701991567, "grad_norm": 0.08484592288732529, "learning_rate": 3.230690711126234e-05, "loss": 0.1315, "step": 1540 }, { "epoch": 0.45064689635121385, "grad_norm": 0.10540632903575897, "learning_rate": 3.21367812181014e-05, "loss": 0.2186, "step": 1550 }, { "epoch": 0.453554295682512, "grad_norm": 0.08647104352712631, "learning_rate": 3.196665532494046e-05, "loss": 0.1464, "step": 1560 }, { "epoch": 0.4564616950138101, "grad_norm": 0.11477908492088318, "learning_rate": 3.179652943177952e-05, "loss": 0.1229, "step": 1570 }, { "epoch": 0.4593690943451083, "grad_norm": 0.139444500207901, "learning_rate": 3.162640353861858e-05, "loss": 0.1722, "step": 1580 }, { "epoch": 0.46227649367640644, "grad_norm": 0.12302656471729279, "learning_rate": 3.1456277645457635e-05, "loss": 0.161, "step": 1590 }, { "epoch": 0.46518389300770463, "grad_norm": 0.13583935797214508, "learning_rate": 3.12861517522967e-05, "loss": 0.2173, "step": 1600 }, { "epoch": 0.46809129233900276, "grad_norm": 0.10586286336183548, "learning_rate": 3.111602585913576e-05, "loss": 0.1394, "step": 1610 }, { "epoch": 0.4709986916703009, "grad_norm": 0.1407371312379837, "learning_rate": 3.094589996597482e-05, "loss": 0.1452, "step": 1620 }, { "epoch": 0.4739060910015991, "grad_norm": 0.15980814397335052, "learning_rate": 3.0775774072813886e-05, "loss": 0.1929, "step": 1630 }, { "epoch": 0.4768134903328972, "grad_norm": 0.1253122240304947, "learning_rate": 3.0605648179652946e-05, "loss": 0.1423, "step": 1640 }, { "epoch": 0.4797208896641954, "grad_norm": 0.17438402771949768, "learning_rate": 3.043552228649201e-05, "loss": 0.1604, "step": 1650 }, { "epoch": 0.48262828899549354, "grad_norm": 0.13970227539539337, "learning_rate": 3.0265396393331068e-05, "loss": 0.1355, "step": 1660 }, { "epoch": 0.48553568832679167, "grad_norm": 0.12440519034862518, "learning_rate": 3.0095270500170124e-05, "loss": 0.1693, "step": 1670 }, { "epoch": 0.48844308765808986, "grad_norm": 0.136091947555542, "learning_rate": 2.9925144607009187e-05, "loss": 0.118, "step": 1680 }, { "epoch": 0.491350486989388, "grad_norm": 0.13912110030651093, "learning_rate": 2.975501871384825e-05, "loss": 0.1481, "step": 1690 }, { "epoch": 0.4942578863206861, "grad_norm": 0.10817007720470428, "learning_rate": 2.958489282068731e-05, "loss": 0.1343, "step": 1700 }, { "epoch": 0.4971652856519843, "grad_norm": 0.12969084084033966, "learning_rate": 2.941476692752637e-05, "loss": 0.1333, "step": 1710 }, { "epoch": 0.5000726849832825, "grad_norm": 0.13414813578128815, "learning_rate": 2.9244641034365434e-05, "loss": 0.1526, "step": 1720 }, { "epoch": 0.5029800843145806, "grad_norm": 0.13201922178268433, "learning_rate": 2.9074515141204494e-05, "loss": 0.1567, "step": 1730 }, { "epoch": 0.5058874836458788, "grad_norm": 0.14282363653182983, "learning_rate": 2.8904389248043557e-05, "loss": 0.1667, "step": 1740 }, { "epoch": 0.5087948829771769, "grad_norm": 0.09702622145414352, "learning_rate": 2.8734263354882613e-05, "loss": 0.1246, "step": 1750 }, { "epoch": 0.511702282308475, "grad_norm": 0.1287912130355835, "learning_rate": 2.8564137461721676e-05, "loss": 0.099, "step": 1760 }, { "epoch": 0.5146096816397733, "grad_norm": 0.10358964651823044, "learning_rate": 2.8394011568560735e-05, "loss": 0.122, "step": 1770 }, { "epoch": 0.5175170809710714, "grad_norm": 0.12403380870819092, "learning_rate": 2.8223885675399798e-05, "loss": 0.1731, "step": 1780 }, { "epoch": 0.5204244803023695, "grad_norm": 0.11310795694589615, "learning_rate": 2.8053759782238857e-05, "loss": 0.1394, "step": 1790 }, { "epoch": 0.5233318796336677, "grad_norm": 0.15226072072982788, "learning_rate": 2.788363388907792e-05, "loss": 0.1521, "step": 1800 }, { "epoch": 0.5262392789649658, "grad_norm": 0.11242180317640305, "learning_rate": 2.7713507995916983e-05, "loss": 0.167, "step": 1810 }, { "epoch": 0.5291466782962639, "grad_norm": 0.10229990631341934, "learning_rate": 2.7543382102756042e-05, "loss": 0.115, "step": 1820 }, { "epoch": 0.5320540776275622, "grad_norm": 0.11430121213197708, "learning_rate": 2.7373256209595098e-05, "loss": 0.1323, "step": 1830 }, { "epoch": 0.5349614769588603, "grad_norm": 0.0750517025589943, "learning_rate": 2.720313031643416e-05, "loss": 0.1896, "step": 1840 }, { "epoch": 0.5378688762901584, "grad_norm": 0.10639439523220062, "learning_rate": 2.7033004423273224e-05, "loss": 0.1511, "step": 1850 }, { "epoch": 0.5407762756214566, "grad_norm": 0.10970495641231537, "learning_rate": 2.6862878530112283e-05, "loss": 0.1456, "step": 1860 }, { "epoch": 0.5436836749527547, "grad_norm": 0.14610768854618073, "learning_rate": 2.6692752636951346e-05, "loss": 0.1407, "step": 1870 }, { "epoch": 0.546591074284053, "grad_norm": 0.0982876867055893, "learning_rate": 2.652262674379041e-05, "loss": 0.1484, "step": 1880 }, { "epoch": 0.5494984736153511, "grad_norm": 0.10405632108449936, "learning_rate": 2.635250085062947e-05, "loss": 0.147, "step": 1890 }, { "epoch": 0.5524058729466492, "grad_norm": 0.1323215663433075, "learning_rate": 2.618237495746853e-05, "loss": 0.1866, "step": 1900 }, { "epoch": 0.5553132722779474, "grad_norm": 0.08775471895933151, "learning_rate": 2.6012249064307587e-05, "loss": 0.098, "step": 1910 }, { "epoch": 0.5582206716092455, "grad_norm": 0.09187202900648117, "learning_rate": 2.584212317114665e-05, "loss": 0.1555, "step": 1920 }, { "epoch": 0.5611280709405437, "grad_norm": 0.11878269165754318, "learning_rate": 2.567199727798571e-05, "loss": 0.1181, "step": 1930 }, { "epoch": 0.5640354702718419, "grad_norm": 0.07616838067770004, "learning_rate": 2.5501871384824772e-05, "loss": 0.1979, "step": 1940 }, { "epoch": 0.56694286960314, "grad_norm": 0.12157191336154938, "learning_rate": 2.533174549166383e-05, "loss": 0.1479, "step": 1950 }, { "epoch": 0.5698502689344381, "grad_norm": 0.08891995251178741, "learning_rate": 2.5161619598502894e-05, "loss": 0.1184, "step": 1960 }, { "epoch": 0.5727576682657363, "grad_norm": 0.12914909422397614, "learning_rate": 2.4991493705341954e-05, "loss": 0.1533, "step": 1970 }, { "epoch": 0.5756650675970345, "grad_norm": 0.1337321698665619, "learning_rate": 2.4821367812181017e-05, "loss": 0.1564, "step": 1980 }, { "epoch": 0.5785724669283326, "grad_norm": 0.13483311235904694, "learning_rate": 2.4651241919020076e-05, "loss": 0.1648, "step": 1990 }, { "epoch": 0.5814798662596308, "grad_norm": 0.10340965539216995, "learning_rate": 2.448111602585914e-05, "loss": 0.125, "step": 2000 }, { "epoch": 0.5814798662596308, "eval_accuracy": 0.49794238683127573, "step": 2000 }, { "epoch": 0.5814798662596308, "eval_loss": 0.14546644687652588, "eval_runtime": 121.0771, "eval_samples_per_second": 2.007, "eval_steps_per_second": 2.007, "step": 2000 }, { "epoch": 0.5843872655909289, "grad_norm": 0.10115203261375427, "learning_rate": 2.4310990132698198e-05, "loss": 0.1131, "step": 2010 }, { "epoch": 0.587294664922227, "grad_norm": 0.11895623058080673, "learning_rate": 2.4140864239537258e-05, "loss": 0.1153, "step": 2020 }, { "epoch": 0.5902020642535252, "grad_norm": 0.0888802632689476, "learning_rate": 2.397073834637632e-05, "loss": 0.1441, "step": 2030 }, { "epoch": 0.5931094635848234, "grad_norm": 0.12660273909568787, "learning_rate": 2.3800612453215383e-05, "loss": 0.1529, "step": 2040 }, { "epoch": 0.5960168629161215, "grad_norm": 0.10489198565483093, "learning_rate": 2.363048656005444e-05, "loss": 0.1342, "step": 2050 }, { "epoch": 0.5989242622474197, "grad_norm": 0.13198301196098328, "learning_rate": 2.3460360666893502e-05, "loss": 0.1622, "step": 2060 }, { "epoch": 0.6018316615787178, "grad_norm": 0.08899589627981186, "learning_rate": 2.3290234773732565e-05, "loss": 0.1691, "step": 2070 }, { "epoch": 0.604739060910016, "grad_norm": 0.11825095117092133, "learning_rate": 2.3120108880571624e-05, "loss": 0.1397, "step": 2080 }, { "epoch": 0.6076464602413142, "grad_norm": 0.11306699365377426, "learning_rate": 2.2949982987410684e-05, "loss": 0.1317, "step": 2090 }, { "epoch": 0.6105538595726123, "grad_norm": 0.085251085460186, "learning_rate": 2.2779857094249747e-05, "loss": 0.1163, "step": 2100 }, { "epoch": 0.6134612589039105, "grad_norm": 0.09645051509141922, "learning_rate": 2.2609731201088806e-05, "loss": 0.1138, "step": 2110 }, { "epoch": 0.6163686582352086, "grad_norm": 0.10509444028139114, "learning_rate": 2.243960530792787e-05, "loss": 0.149, "step": 2120 }, { "epoch": 0.6192760575665067, "grad_norm": 0.13281555473804474, "learning_rate": 2.2269479414766928e-05, "loss": 0.1502, "step": 2130 }, { "epoch": 0.622183456897805, "grad_norm": 0.08020918071269989, "learning_rate": 2.209935352160599e-05, "loss": 0.1294, "step": 2140 }, { "epoch": 0.6250908562291031, "grad_norm": 0.10006445646286011, "learning_rate": 2.192922762844505e-05, "loss": 0.1153, "step": 2150 }, { "epoch": 0.6279982555604012, "grad_norm": 0.20892204344272614, "learning_rate": 2.1759101735284113e-05, "loss": 0.1805, "step": 2160 }, { "epoch": 0.6309056548916994, "grad_norm": 0.13679496943950653, "learning_rate": 2.1588975842123173e-05, "loss": 0.1123, "step": 2170 }, { "epoch": 0.6338130542229975, "grad_norm": 0.20436090230941772, "learning_rate": 2.1418849948962232e-05, "loss": 0.2282, "step": 2180 }, { "epoch": 0.6367204535542956, "grad_norm": 0.0943446233868599, "learning_rate": 2.1248724055801295e-05, "loss": 0.1288, "step": 2190 }, { "epoch": 0.6396278528855939, "grad_norm": 0.08362103998661041, "learning_rate": 2.1078598162640358e-05, "loss": 0.1278, "step": 2200 }, { "epoch": 0.642535252216892, "grad_norm": 0.11366961151361465, "learning_rate": 2.0908472269479414e-05, "loss": 0.115, "step": 2210 }, { "epoch": 0.6454426515481901, "grad_norm": 0.090823233127594, "learning_rate": 2.0738346376318477e-05, "loss": 0.1179, "step": 2220 }, { "epoch": 0.6483500508794883, "grad_norm": 0.12734675407409668, "learning_rate": 2.056822048315754e-05, "loss": 0.1367, "step": 2230 }, { "epoch": 0.6512574502107864, "grad_norm": 0.5848974585533142, "learning_rate": 2.03980945899966e-05, "loss": 0.1157, "step": 2240 }, { "epoch": 0.6541648495420846, "grad_norm": 0.08575470000505447, "learning_rate": 2.0227968696835658e-05, "loss": 0.1735, "step": 2250 }, { "epoch": 0.6570722488733828, "grad_norm": 0.09076444059610367, "learning_rate": 2.005784280367472e-05, "loss": 0.1414, "step": 2260 }, { "epoch": 0.6599796482046809, "grad_norm": 0.15823446214199066, "learning_rate": 1.988771691051378e-05, "loss": 0.1464, "step": 2270 }, { "epoch": 0.662887047535979, "grad_norm": 0.1579420417547226, "learning_rate": 1.9717591017352843e-05, "loss": 0.1669, "step": 2280 }, { "epoch": 0.6657944468672772, "grad_norm": 0.11068382859230042, "learning_rate": 1.9547465124191903e-05, "loss": 0.1743, "step": 2290 }, { "epoch": 0.6687018461985754, "grad_norm": 0.09305164217948914, "learning_rate": 1.9377339231030962e-05, "loss": 0.1481, "step": 2300 }, { "epoch": 0.6716092455298736, "grad_norm": 0.1341511458158493, "learning_rate": 1.9207213337870025e-05, "loss": 0.1199, "step": 2310 }, { "epoch": 0.6745166448611717, "grad_norm": 0.09602607041597366, "learning_rate": 1.9037087444709088e-05, "loss": 0.1393, "step": 2320 }, { "epoch": 0.6774240441924698, "grad_norm": 0.14141052961349487, "learning_rate": 1.8866961551548147e-05, "loss": 0.1189, "step": 2330 }, { "epoch": 0.680331443523768, "grad_norm": 0.10962352156639099, "learning_rate": 1.8696835658387207e-05, "loss": 0.1972, "step": 2340 }, { "epoch": 0.6832388428550662, "grad_norm": 0.11629267781972885, "learning_rate": 1.852670976522627e-05, "loss": 0.1648, "step": 2350 }, { "epoch": 0.6861462421863643, "grad_norm": 0.14257384836673737, "learning_rate": 1.8356583872065332e-05, "loss": 0.1906, "step": 2360 }, { "epoch": 0.6890536415176625, "grad_norm": 0.1146107092499733, "learning_rate": 1.8186457978904388e-05, "loss": 0.1632, "step": 2370 }, { "epoch": 0.6919610408489606, "grad_norm": 0.16012772917747498, "learning_rate": 1.801633208574345e-05, "loss": 0.138, "step": 2380 }, { "epoch": 0.6948684401802587, "grad_norm": 0.14589470624923706, "learning_rate": 1.7846206192582514e-05, "loss": 0.1451, "step": 2390 }, { "epoch": 0.6977758395115569, "grad_norm": 0.1040879562497139, "learning_rate": 1.7676080299421573e-05, "loss": 0.1402, "step": 2400 }, { "epoch": 0.7006832388428551, "grad_norm": 0.11645939201116562, "learning_rate": 1.7505954406260633e-05, "loss": 0.1637, "step": 2410 }, { "epoch": 0.7035906381741532, "grad_norm": 0.1045243889093399, "learning_rate": 1.7335828513099695e-05, "loss": 0.1235, "step": 2420 }, { "epoch": 0.7064980375054514, "grad_norm": 0.08904137462377548, "learning_rate": 1.7165702619938755e-05, "loss": 0.1348, "step": 2430 }, { "epoch": 0.7094054368367495, "grad_norm": 0.1161481961607933, "learning_rate": 1.6995576726777818e-05, "loss": 0.1225, "step": 2440 }, { "epoch": 0.7123128361680476, "grad_norm": 0.10629335045814514, "learning_rate": 1.6825450833616877e-05, "loss": 0.1173, "step": 2450 }, { "epoch": 0.7152202354993459, "grad_norm": 0.11692572385072708, "learning_rate": 1.6655324940455937e-05, "loss": 0.1225, "step": 2460 }, { "epoch": 0.718127634830644, "grad_norm": 0.13012786209583282, "learning_rate": 1.6485199047295e-05, "loss": 0.1336, "step": 2470 }, { "epoch": 0.7210350341619421, "grad_norm": 0.1442546844482422, "learning_rate": 1.6315073154134062e-05, "loss": 0.1755, "step": 2480 }, { "epoch": 0.7239424334932403, "grad_norm": 0.10278443247079849, "learning_rate": 1.614494726097312e-05, "loss": 0.1191, "step": 2490 }, { "epoch": 0.7268498328245384, "grad_norm": 0.12166904658079147, "learning_rate": 1.597482136781218e-05, "loss": 0.1458, "step": 2500 }, { "epoch": 0.7268498328245384, "eval_accuracy": 0.51440329218107, "step": 2500 }, { "epoch": 0.7268498328245384, "eval_loss": 0.14491653442382812, "eval_runtime": 121.2093, "eval_samples_per_second": 2.005, "eval_steps_per_second": 2.005, "step": 2500 } ], "logging_steps": 10, "max_steps": 3439, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.286826483712e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }