{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9992659652556888, "eval_steps": 500, "global_step": 1021, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009787129924149743, "grad_norm": 3.160645008087158, "learning_rate": 8.737864077669904e-07, "loss": 1.6202, "step": 10 }, { "epoch": 0.019574259848299486, "grad_norm": 1.8336282968521118, "learning_rate": 1.8446601941747574e-06, "loss": 1.5766, "step": 20 }, { "epoch": 0.029361389772449228, "grad_norm": 1.5718436241149902, "learning_rate": 2.8155339805825245e-06, "loss": 1.4962, "step": 30 }, { "epoch": 0.03914851969659897, "grad_norm": 1.2343661785125732, "learning_rate": 3.7864077669902915e-06, "loss": 1.3827, "step": 40 }, { "epoch": 0.048935649620748714, "grad_norm": 1.0965512990951538, "learning_rate": 4.7572815533980585e-06, "loss": 1.3328, "step": 50 }, { "epoch": 0.058722779544898455, "grad_norm": 1.0640696287155151, "learning_rate": 5.728155339805825e-06, "loss": 1.2826, "step": 60 }, { "epoch": 0.0685099094690482, "grad_norm": 1.076289415359497, "learning_rate": 6.6990291262135935e-06, "loss": 1.234, "step": 70 }, { "epoch": 0.07829703939319795, "grad_norm": 1.036766529083252, "learning_rate": 7.66990291262136e-06, "loss": 1.2398, "step": 80 }, { "epoch": 0.0880841693173477, "grad_norm": 1.0808024406433105, "learning_rate": 8.640776699029127e-06, "loss": 1.2031, "step": 90 }, { "epoch": 0.09787129924149743, "grad_norm": 1.1380928754806519, "learning_rate": 9.611650485436894e-06, "loss": 1.2039, "step": 100 }, { "epoch": 0.10765842916564718, "grad_norm": 1.0566169023513794, "learning_rate": 9.998945997517957e-06, "loss": 1.1918, "step": 110 }, { "epoch": 0.11744555908979691, "grad_norm": 1.080167293548584, "learning_rate": 9.992506480404137e-06, "loss": 1.1916, "step": 120 }, { "epoch": 0.12723268901394666, "grad_norm": 1.0032321214675903, "learning_rate": 9.980220534903889e-06, "loss": 1.1654, "step": 130 }, { "epoch": 0.1370198189380964, "grad_norm": 1.101294994354248, "learning_rate": 9.96210254835968e-06, "loss": 1.1627, "step": 140 }, { "epoch": 0.14680694886224616, "grad_norm": 1.1083616018295288, "learning_rate": 9.938173737671531e-06, "loss": 1.1622, "step": 150 }, { "epoch": 0.1565940787863959, "grad_norm": 0.9857867360115051, "learning_rate": 9.908462124451152e-06, "loss": 1.1588, "step": 160 }, { "epoch": 0.16638120871054562, "grad_norm": 1.0299389362335205, "learning_rate": 9.873002502207502e-06, "loss": 1.1774, "step": 170 }, { "epoch": 0.1761683386346954, "grad_norm": 1.0562578439712524, "learning_rate": 9.831836395602164e-06, "loss": 1.1632, "step": 180 }, { "epoch": 0.18595546855884512, "grad_norm": 1.0918824672698975, "learning_rate": 9.78501201182228e-06, "loss": 1.1575, "step": 190 }, { "epoch": 0.19574259848299486, "grad_norm": 0.9658213257789612, "learning_rate": 9.732584184127973e-06, "loss": 1.145, "step": 200 }, { "epoch": 0.20552972840714462, "grad_norm": 0.9882791042327881, "learning_rate": 9.674614307640368e-06, "loss": 1.1595, "step": 210 }, { "epoch": 0.21531685833129435, "grad_norm": 1.0786079168319702, "learning_rate": 9.611170267445401e-06, "loss": 1.1383, "step": 220 }, { "epoch": 0.2251039882554441, "grad_norm": 1.0685032606124878, "learning_rate": 9.542326359097619e-06, "loss": 1.1329, "step": 230 }, { "epoch": 0.23489111817959382, "grad_norm": 1.0679073333740234, "learning_rate": 9.468163201617063e-06, "loss": 1.1323, "step": 240 }, { "epoch": 0.24467824810374358, "grad_norm": 
0.9797915816307068, "learning_rate": 9.388767643081109e-06, "loss": 1.1332, "step": 250 }, { "epoch": 0.2544653780278933, "grad_norm": 1.055120587348938, "learning_rate": 9.30423265892184e-06, "loss": 1.1301, "step": 260 }, { "epoch": 0.2642525079520431, "grad_norm": 1.034970998764038, "learning_rate": 9.214657243048021e-06, "loss": 1.1483, "step": 270 }, { "epoch": 0.2740396378761928, "grad_norm": 1.105406641960144, "learning_rate": 9.120146291919206e-06, "loss": 1.1249, "step": 280 }, { "epoch": 0.28382676780034255, "grad_norm": 0.9189131259918213, "learning_rate": 9.020810481707709e-06, "loss": 1.1385, "step": 290 }, { "epoch": 0.2936138977244923, "grad_norm": 1.0855603218078613, "learning_rate": 8.916766138692303e-06, "loss": 1.1326, "step": 300 }, { "epoch": 0.303401027648642, "grad_norm": 0.9882815480232239, "learning_rate": 8.808135103035407e-06, "loss": 1.1223, "step": 310 }, { "epoch": 0.3131881575727918, "grad_norm": 0.9927927255630493, "learning_rate": 8.695044586103297e-06, "loss": 1.1314, "step": 320 }, { "epoch": 0.32297528749694154, "grad_norm": 1.0153377056121826, "learning_rate": 8.577627021496413e-06, "loss": 1.1254, "step": 330 }, { "epoch": 0.33276241742109125, "grad_norm": 0.9959344267845154, "learning_rate": 8.456019909964224e-06, "loss": 1.1115, "step": 340 }, { "epoch": 0.342549547345241, "grad_norm": 1.044042706489563, "learning_rate": 8.330365658386252e-06, "loss": 1.142, "step": 350 }, { "epoch": 0.3523366772693908, "grad_norm": 0.9094734787940979, "learning_rate": 8.200811413007808e-06, "loss": 1.1254, "step": 360 }, { "epoch": 0.3621238071935405, "grad_norm": 1.0200663805007935, "learning_rate": 8.06750888712576e-06, "loss": 1.127, "step": 370 }, { "epoch": 0.37191093711769024, "grad_norm": 1.0179405212402344, "learning_rate": 7.930614183426074e-06, "loss": 1.0985, "step": 380 }, { "epoch": 0.38169806704184, "grad_norm": 0.9346506595611572, "learning_rate": 7.790287611181217e-06, "loss": 1.1061, "step": 390 }, { "epoch": 0.3914851969659897, "grad_norm": 0.9131274223327637, "learning_rate": 7.646693498521472e-06, "loss": 1.1178, "step": 400 }, { "epoch": 0.4012723268901395, "grad_norm": 1.1208817958831787, "learning_rate": 7.500000000000001e-06, "loss": 1.1076, "step": 410 }, { "epoch": 0.41105945681428924, "grad_norm": 0.9498858451843262, "learning_rate": 7.35037889967702e-06, "loss": 1.123, "step": 420 }, { "epoch": 0.42084658673843894, "grad_norm": 0.9704530239105225, "learning_rate": 7.19800540995367e-06, "loss": 1.1142, "step": 430 }, { "epoch": 0.4306337166625887, "grad_norm": 1.083723545074463, "learning_rate": 7.043057966391158e-06, "loss": 1.1068, "step": 440 }, { "epoch": 0.4404208465867384, "grad_norm": 0.959192156791687, "learning_rate": 6.885718018755448e-06, "loss": 1.1024, "step": 450 }, { "epoch": 0.4502079765108882, "grad_norm": 0.9829457402229309, "learning_rate": 6.7261698185322e-06, "loss": 1.0918, "step": 460 }, { "epoch": 0.45999510643503794, "grad_norm": 0.9769061207771301, "learning_rate": 6.5646002031607726e-06, "loss": 1.1092, "step": 470 }, { "epoch": 0.46978223635918764, "grad_norm": 0.9248567223548889, "learning_rate": 6.401198377239979e-06, "loss": 1.1096, "step": 480 }, { "epoch": 0.4795693662833374, "grad_norm": 1.0321615934371948, "learning_rate": 6.236155690961795e-06, "loss": 1.1118, "step": 490 }, { "epoch": 0.48935649620748717, "grad_norm": 0.9511229395866394, "learning_rate": 6.0696654160324875e-06, "loss": 1.0969, "step": 500 }, { "epoch": 0.4991436261316369, "grad_norm": 0.9163072109222412, "learning_rate": 
5.901922519343586e-06, "loss": 1.107, "step": 510 }, { "epoch": 0.5089307560557866, "grad_norm": 0.9588350653648376, "learning_rate": 5.733123434657704e-06, "loss": 1.1008, "step": 520 }, { "epoch": 0.5187178859799364, "grad_norm": 0.9725273251533508, "learning_rate": 5.5634658325766066e-06, "loss": 1.1122, "step": 530 }, { "epoch": 0.5285050159040862, "grad_norm": 0.954433798789978, "learning_rate": 5.393148389060893e-06, "loss": 1.1052, "step": 540 }, { "epoch": 0.5382921458282359, "grad_norm": 1.1059950590133667, "learning_rate": 5.222370552772353e-06, "loss": 1.1014, "step": 550 }, { "epoch": 0.5480792757523856, "grad_norm": 0.9268523454666138, "learning_rate": 5.05133231151145e-06, "loss": 1.0981, "step": 560 }, { "epoch": 0.5578664056765353, "grad_norm": 0.9672321677207947, "learning_rate": 4.880233958023486e-06, "loss": 1.1086, "step": 570 }, { "epoch": 0.5676535356006851, "grad_norm": 1.0383234024047852, "learning_rate": 4.7092758554476215e-06, "loss": 1.0938, "step": 580 }, { "epoch": 0.5774406655248349, "grad_norm": 0.9663616418838501, "learning_rate": 4.53865820268349e-06, "loss": 1.1038, "step": 590 }, { "epoch": 0.5872277954489846, "grad_norm": 0.9746555685997009, "learning_rate": 4.368580799950133e-06, "loss": 1.0784, "step": 600 }, { "epoch": 0.5970149253731343, "grad_norm": 0.8926970362663269, "learning_rate": 4.199242814811807e-06, "loss": 1.1028, "step": 610 }, { "epoch": 0.606802055297284, "grad_norm": 0.9124487638473511, "learning_rate": 4.03084254894465e-06, "loss": 1.0938, "step": 620 }, { "epoch": 0.6165891852214338, "grad_norm": 0.9407256841659546, "learning_rate": 3.863577205917356e-06, "loss": 1.0917, "step": 630 }, { "epoch": 0.6263763151455836, "grad_norm": 0.9204623103141785, "learning_rate": 3.6976426602577565e-06, "loss": 1.0837, "step": 640 }, { "epoch": 0.6361634450697333, "grad_norm": 0.9785788059234619, "learning_rate": 3.5332332280757706e-06, "loss": 1.0964, "step": 650 }, { "epoch": 0.6459505749938831, "grad_norm": 0.9679555892944336, "learning_rate": 3.3705414395113354e-06, "loss": 1.0897, "step": 660 }, { "epoch": 0.6557377049180327, "grad_norm": 0.8785240054130554, "learning_rate": 3.2097578132737716e-06, "loss": 1.0986, "step": 670 }, { "epoch": 0.6655248348421825, "grad_norm": 0.9295753240585327, "learning_rate": 3.0510706335366034e-06, "loss": 1.0877, "step": 680 }, { "epoch": 0.6753119647663323, "grad_norm": 0.8944317102432251, "learning_rate": 2.8946657294491452e-06, "loss": 1.0941, "step": 690 }, { "epoch": 0.685099094690482, "grad_norm": 0.9147707223892212, "learning_rate": 2.740726257522987e-06, "loss": 1.0857, "step": 700 }, { "epoch": 0.6948862246146318, "grad_norm": 0.889441967010498, "learning_rate": 2.5894324871482557e-06, "loss": 1.0781, "step": 710 }, { "epoch": 0.7046733545387815, "grad_norm": 0.8948488235473633, "learning_rate": 2.4409615894908407e-06, "loss": 1.0713, "step": 720 }, { "epoch": 0.7144604844629312, "grad_norm": 0.881840169429779, "learning_rate": 2.2954874300177197e-06, "loss": 1.0765, "step": 730 }, { "epoch": 0.724247614387081, "grad_norm": 0.9054113030433655, "learning_rate": 2.1531803648934333e-06, "loss": 1.0835, "step": 740 }, { "epoch": 0.7340347443112307, "grad_norm": 0.8451863527297974, "learning_rate": 2.0142070414860704e-06, "loss": 1.0836, "step": 750 }, { "epoch": 0.7438218742353805, "grad_norm": 0.8607450127601624, "learning_rate": 1.8787302032164168e-06, "loss": 1.0816, "step": 760 }, { "epoch": 0.7536090041595302, "grad_norm": 0.8773781657218933, "learning_rate": 1.746908498978791e-06, "loss": 1.0807, 
"step": 770 }, { "epoch": 0.76339613408368, "grad_norm": 0.9455496668815613, "learning_rate": 1.6188962973567068e-06, "loss": 1.0828, "step": 780 }, { "epoch": 0.7731832640078297, "grad_norm": 0.9180036783218384, "learning_rate": 1.4948435058510036e-06, "loss": 1.1013, "step": 790 }, { "epoch": 0.7829703939319794, "grad_norm": 0.900719165802002, "learning_rate": 1.374895395332037e-06, "loss": 1.07, "step": 800 }, { "epoch": 0.7927575238561292, "grad_norm": 0.8657832145690918, "learning_rate": 1.259192429921584e-06, "loss": 1.0796, "step": 810 }, { "epoch": 0.802544653780279, "grad_norm": 1.0004676580429077, "learning_rate": 1.1478701025036359e-06, "loss": 1.1017, "step": 820 }, { "epoch": 0.8123317837044287, "grad_norm": 0.9102425575256348, "learning_rate": 1.0410587760567104e-06, "loss": 1.0805, "step": 830 }, { "epoch": 0.8221189136285785, "grad_norm": 0.8790053129196167, "learning_rate": 9.388835309934985e-07, "loss": 1.068, "step": 840 }, { "epoch": 0.8319060435527281, "grad_norm": 0.8961747288703918, "learning_rate": 8.414640186866063e-07, "loss": 1.0832, "step": 850 }, { "epoch": 0.8416931734768779, "grad_norm": 0.874332845211029, "learning_rate": 7.489143213519301e-07, "loss": 1.0858, "step": 860 }, { "epoch": 0.8514803034010276, "grad_norm": 0.9413198828697205, "learning_rate": 6.613428184537235e-07, "loss": 1.0938, "step": 870 }, { "epoch": 0.8612674333251774, "grad_norm": 0.8453653454780579, "learning_rate": 5.788520597878477e-07, "loss": 1.0722, "step": 880 }, { "epoch": 0.8710545632493272, "grad_norm": 0.880970299243927, "learning_rate": 5.015386453917742e-07, "loss": 1.0721, "step": 890 }, { "epoch": 0.8808416931734768, "grad_norm": 0.8894344568252563, "learning_rate": 4.29493112422007e-07, "loss": 1.0691, "step": 900 }, { "epoch": 0.8906288230976266, "grad_norm": 0.8694501519203186, "learning_rate": 3.627998291313939e-07, "loss": 1.086, "step": 910 }, { "epoch": 0.9004159530217763, "grad_norm": 0.8791433572769165, "learning_rate": 3.015368960704584e-07, "loss": 1.0739, "step": 920 }, { "epoch": 0.9102030829459261, "grad_norm": 0.9409059882164001, "learning_rate": 2.4577605462847764e-07, "loss": 1.0684, "step": 930 }, { "epoch": 0.9199902128700759, "grad_norm": 0.9185978770256042, "learning_rate": 1.9558260302139642e-07, "loss": 1.0857, "step": 940 }, { "epoch": 0.9297773427942256, "grad_norm": 0.8623930811882019, "learning_rate": 1.510153198249531e-07, "loss": 1.0742, "step": 950 }, { "epoch": 0.9395644727183753, "grad_norm": 0.9032447338104248, "learning_rate": 1.1212639514257829e-07, "loss": 1.0909, "step": 960 }, { "epoch": 0.949351602642525, "grad_norm": 0.9127726554870605, "learning_rate": 7.896136948865429e-08, "loss": 1.0675, "step": 970 }, { "epoch": 0.9591387325666748, "grad_norm": 0.8940075635910034, "learning_rate": 5.155908045872349e-08, "loss": 1.0865, "step": 980 }, { "epoch": 0.9689258624908246, "grad_norm": 0.8495497703552246, "learning_rate": 2.995161724907658e-08, "loss": 1.0787, "step": 990 }, { "epoch": 0.9787129924149743, "grad_norm": 0.8723552227020264, "learning_rate": 1.4164283079001196e-08, "loss": 1.0988, "step": 1000 }, { "epoch": 0.9885001223391241, "grad_norm": 0.8567104935646057, "learning_rate": 4.2155655596809455e-09, "loss": 1.0825, "step": 1010 }, { "epoch": 0.9982872522632737, "grad_norm": 0.8691814541816711, "learning_rate": 1.1711504444733567e-10, "loss": 1.0754, "step": 1020 }, { "epoch": 0.9992659652556888, "step": 1021, "total_flos": 2.8896472925732864e+18, "train_loss": 1.1314670071662583, "train_runtime": 7504.4933, 
"train_samples_per_second": 17.424, "train_steps_per_second": 0.136 } ], "logging_steps": 10, "max_steps": 1021, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.8896472925732864e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }