diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4583 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.119138149556401, + "eval_steps": 500, + "global_step": 6500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0063371356147021544, + "grad_norm": 9.841416358947754, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.3229, + "step": 10 + }, + { + "epoch": 0.012674271229404309, + "grad_norm": 10.776036262512207, + "learning_rate": 4.000000000000001e-06, + "loss": 1.3359, + "step": 20 + }, + { + "epoch": 0.019011406844106463, + "grad_norm": 8.647089004516602, + "learning_rate": 6e-06, + "loss": 1.2916, + "step": 30 + }, + { + "epoch": 0.025348542458808618, + "grad_norm": 3.833814859390259, + "learning_rate": 8.000000000000001e-06, + "loss": 0.8317, + "step": 40 + }, + { + "epoch": 0.031685678073510776, + "grad_norm": 4.45695686340332, + "learning_rate": 1e-05, + "loss": 0.3917, + "step": 50 + }, + { + "epoch": 0.03802281368821293, + "grad_norm": 4.198420524597168, + "learning_rate": 1.2e-05, + "loss": 0.3646, + "step": 60 + }, + { + "epoch": 0.044359949302915085, + "grad_norm": 1.619905948638916, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.2154, + "step": 70 + }, + { + "epoch": 0.050697084917617236, + "grad_norm": 1.2290397882461548, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.1981, + "step": 80 + }, + { + "epoch": 0.057034220532319393, + "grad_norm": 1.4010839462280273, + "learning_rate": 1.8e-05, + "loss": 0.1628, + "step": 90 + }, + { + "epoch": 0.06337135614702155, + "grad_norm": 1.5164788961410522, + "learning_rate": 2e-05, + "loss": 0.1349, + "step": 100 + }, + { + "epoch": 0.0697084917617237, + "grad_norm": 1.504683017730713, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.1351, + "step": 110 + }, + { + "epoch": 0.07604562737642585, + "grad_norm": 1.1718053817749023, + "learning_rate": 2.4e-05, + "loss": 0.1315, + "step": 120 + }, + { + "epoch": 0.08238276299112801, + "grad_norm": 1.2100762128829956, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.1088, + "step": 130 + }, + { + "epoch": 0.08871989860583017, + "grad_norm": 1.060626745223999, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.1019, + "step": 140 + }, + { + "epoch": 0.09505703422053231, + "grad_norm": 1.1521592140197754, + "learning_rate": 3e-05, + "loss": 0.0981, + "step": 150 + }, + { + "epoch": 0.10139416983523447, + "grad_norm": 0.9289679527282715, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.0904, + "step": 160 + }, + { + "epoch": 0.10773130544993663, + "grad_norm": 1.2749850749969482, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.0903, + "step": 170 + }, + { + "epoch": 0.11406844106463879, + "grad_norm": 0.6532027721405029, + "learning_rate": 3.6e-05, + "loss": 0.0844, + "step": 180 + }, + { + "epoch": 0.12040557667934093, + "grad_norm": 1.0130654573440552, + "learning_rate": 3.8e-05, + "loss": 0.0952, + "step": 190 + }, + { + "epoch": 0.1267427122940431, + "grad_norm": 1.218198299407959, + "learning_rate": 4e-05, + "loss": 0.0875, + "step": 200 + }, + { + "epoch": 0.13307984790874525, + "grad_norm": 0.7836218476295471, + "learning_rate": 4.2e-05, + "loss": 0.0813, + "step": 210 + }, + { + "epoch": 0.1394169835234474, + "grad_norm": 0.7320715188980103, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.0865, + "step": 220 + }, + { + "epoch": 0.14575411913814956, + "grad_norm": 0.6925137639045715, + "learning_rate": 4.600000000000001e-05, + "loss": 0.0651, + "step": 230 + }, + { + "epoch": 0.1520912547528517, + "grad_norm": 1.0975725650787354, + "learning_rate": 4.8e-05, + "loss": 0.0735, + "step": 240 + }, + { + "epoch": 0.15842839036755388, + "grad_norm": 0.7768546342849731, + "learning_rate": 5e-05, + "loss": 0.0647, + "step": 250 + }, + { + "epoch": 0.16476552598225602, + "grad_norm": 0.5613494515419006, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.0608, + "step": 260 + }, + { + "epoch": 0.17110266159695817, + "grad_norm": 0.6478670239448547, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.0526, + "step": 270 + }, + { + "epoch": 0.17743979721166034, + "grad_norm": 0.9302045106887817, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.0657, + "step": 280 + }, + { + "epoch": 0.18377693282636248, + "grad_norm": 0.5966052412986755, + "learning_rate": 5.8e-05, + "loss": 0.0518, + "step": 290 + }, + { + "epoch": 0.19011406844106463, + "grad_norm": 0.6293818354606628, + "learning_rate": 6e-05, + "loss": 0.0617, + "step": 300 + }, + { + "epoch": 0.1964512040557668, + "grad_norm": 0.7958998084068298, + "learning_rate": 6.2e-05, + "loss": 0.0559, + "step": 310 + }, + { + "epoch": 0.20278833967046894, + "grad_norm": 0.7211484909057617, + "learning_rate": 6.400000000000001e-05, + "loss": 0.0534, + "step": 320 + }, + { + "epoch": 0.20912547528517111, + "grad_norm": 1.108765959739685, + "learning_rate": 6.6e-05, + "loss": 0.0572, + "step": 330 + }, + { + "epoch": 0.21546261089987326, + "grad_norm": 0.9579587578773499, + "learning_rate": 6.800000000000001e-05, + "loss": 0.0513, + "step": 340 + }, + { + "epoch": 0.2217997465145754, + "grad_norm": 1.0826679468154907, + "learning_rate": 7e-05, + "loss": 0.0591, + "step": 350 + }, + { + "epoch": 0.22813688212927757, + "grad_norm": 0.9620176553726196, + "learning_rate": 7.2e-05, + "loss": 0.0614, + "step": 360 + }, + { + "epoch": 0.23447401774397972, + "grad_norm": 0.3815687298774719, + "learning_rate": 7.4e-05, + "loss": 0.0452, + "step": 370 + }, + { + "epoch": 0.24081115335868186, + "grad_norm": 1.0042985677719116, + "learning_rate": 7.6e-05, + "loss": 0.0561, + "step": 380 + }, + { + "epoch": 0.24714828897338403, + "grad_norm": 0.6386390328407288, + "learning_rate": 7.800000000000001e-05, + "loss": 0.0619, + "step": 390 + }, + { + "epoch": 0.2534854245880862, + "grad_norm": 0.7774208784103394, + "learning_rate": 8e-05, + "loss": 0.0478, + "step": 400 + }, + { + "epoch": 0.2598225602027883, + "grad_norm": 0.7507523894309998, + "learning_rate": 8.2e-05, + "loss": 0.053, + "step": 410 + }, + { + "epoch": 0.2661596958174905, + "grad_norm": 0.8982664346694946, + "learning_rate": 8.4e-05, + "loss": 0.0585, + "step": 420 + }, + { + "epoch": 0.27249683143219267, + "grad_norm": 0.83918297290802, + "learning_rate": 8.6e-05, + "loss": 0.0527, + "step": 430 + }, + { + "epoch": 0.2788339670468948, + "grad_norm": 0.5620638132095337, + "learning_rate": 8.800000000000001e-05, + "loss": 0.0477, + "step": 440 + }, + { + "epoch": 0.28517110266159695, + "grad_norm": 0.7724658250808716, + "learning_rate": 9e-05, + "loss": 0.0503, + "step": 450 + }, + { + "epoch": 0.2915082382762991, + "grad_norm": 0.5999326109886169, + "learning_rate": 9.200000000000001e-05, + "loss": 0.0426, + "step": 460 + }, + { + "epoch": 0.29784537389100124, + "grad_norm": 0.6010283827781677, + "learning_rate": 9.4e-05, + "loss": 0.0464, + "step": 470 + }, + { + "epoch": 0.3041825095057034, + "grad_norm": 0.8040053248405457, + "learning_rate": 9.6e-05, + "loss": 0.0455, + "step": 480 + }, + { + "epoch": 0.3105196451204056, + "grad_norm": 1.1320879459381104, + "learning_rate": 9.8e-05, + "loss": 0.0513, + "step": 490 + }, + { + "epoch": 0.31685678073510776, + "grad_norm": 0.7393860816955566, + "learning_rate": 0.0001, + "loss": 0.0506, + "step": 500 + }, + { + "epoch": 0.3231939163498099, + "grad_norm": 0.9977770447731018, + "learning_rate": 9.999972660400536e-05, + "loss": 0.0511, + "step": 510 + }, + { + "epoch": 0.32953105196451205, + "grad_norm": 0.9153000116348267, + "learning_rate": 9.999890641901125e-05, + "loss": 0.0446, + "step": 520 + }, + { + "epoch": 0.3358681875792142, + "grad_norm": 0.7802489399909973, + "learning_rate": 9.999753945398704e-05, + "loss": 0.045, + "step": 530 + }, + { + "epoch": 0.34220532319391633, + "grad_norm": 0.7009689211845398, + "learning_rate": 9.99956257238817e-05, + "loss": 0.0537, + "step": 540 + }, + { + "epoch": 0.3485424588086185, + "grad_norm": 0.5555402636528015, + "learning_rate": 9.999316524962345e-05, + "loss": 0.0416, + "step": 550 + }, + { + "epoch": 0.3548795944233207, + "grad_norm": 0.613745391368866, + "learning_rate": 9.999015805811965e-05, + "loss": 0.0463, + "step": 560 + }, + { + "epoch": 0.3612167300380228, + "grad_norm": 0.9725404381752014, + "learning_rate": 9.998660418225645e-05, + "loss": 0.0534, + "step": 570 + }, + { + "epoch": 0.36755386565272496, + "grad_norm": 0.6908450126647949, + "learning_rate": 9.998250366089848e-05, + "loss": 0.0486, + "step": 580 + }, + { + "epoch": 0.37389100126742714, + "grad_norm": 0.8201190829277039, + "learning_rate": 9.997785653888835e-05, + "loss": 0.0381, + "step": 590 + }, + { + "epoch": 0.38022813688212925, + "grad_norm": 0.7427898645401001, + "learning_rate": 9.997266286704631e-05, + "loss": 0.0504, + "step": 600 + }, + { + "epoch": 0.3865652724968314, + "grad_norm": 0.5355698466300964, + "learning_rate": 9.996692270216947e-05, + "loss": 0.0454, + "step": 610 + }, + { + "epoch": 0.3929024081115336, + "grad_norm": 0.6287090182304382, + "learning_rate": 9.996063610703137e-05, + "loss": 0.0481, + "step": 620 + }, + { + "epoch": 0.39923954372623577, + "grad_norm": 0.5858126878738403, + "learning_rate": 9.995380315038119e-05, + "loss": 0.0379, + "step": 630 + }, + { + "epoch": 0.4055766793409379, + "grad_norm": 0.3876695930957794, + "learning_rate": 9.994642390694308e-05, + "loss": 0.0387, + "step": 640 + }, + { + "epoch": 0.41191381495564006, + "grad_norm": 0.5101262331008911, + "learning_rate": 9.993849845741524e-05, + "loss": 0.0499, + "step": 650 + }, + { + "epoch": 0.41825095057034223, + "grad_norm": 0.7811976075172424, + "learning_rate": 9.993002688846913e-05, + "loss": 0.0458, + "step": 660 + }, + { + "epoch": 0.42458808618504434, + "grad_norm": 0.6726566553115845, + "learning_rate": 9.992100929274846e-05, + "loss": 0.0359, + "step": 670 + }, + { + "epoch": 0.4309252217997465, + "grad_norm": 0.43720176815986633, + "learning_rate": 9.991144576886823e-05, + "loss": 0.0525, + "step": 680 + }, + { + "epoch": 0.4372623574144487, + "grad_norm": 1.0024137496948242, + "learning_rate": 9.990133642141359e-05, + "loss": 0.04, + "step": 690 + }, + { + "epoch": 0.4435994930291508, + "grad_norm": 0.6060113310813904, + "learning_rate": 9.989068136093873e-05, + "loss": 0.0354, + "step": 700 + }, + { + "epoch": 0.449936628643853, + "grad_norm": 0.8775745630264282, + "learning_rate": 9.987948070396571e-05, + "loss": 0.0432, + "step": 710 + }, + { + "epoch": 0.45627376425855515, + "grad_norm": 0.7044681310653687, + "learning_rate": 9.986773457298311e-05, + "loss": 0.0347, + "step": 720 + }, + { + "epoch": 0.46261089987325726, + "grad_norm": 0.5258815884590149, + "learning_rate": 9.985544309644475e-05, + "loss": 0.0428, + "step": 730 + }, + { + "epoch": 0.46894803548795944, + "grad_norm": 0.5640819072723389, + "learning_rate": 9.984260640876821e-05, + "loss": 0.0477, + "step": 740 + }, + { + "epoch": 0.4752851711026616, + "grad_norm": 0.9232666492462158, + "learning_rate": 9.98292246503335e-05, + "loss": 0.0398, + "step": 750 + }, + { + "epoch": 0.4816223067173637, + "grad_norm": 0.7809548377990723, + "learning_rate": 9.981529796748134e-05, + "loss": 0.042, + "step": 760 + }, + { + "epoch": 0.4879594423320659, + "grad_norm": 0.5453559756278992, + "learning_rate": 9.980082651251175e-05, + "loss": 0.0413, + "step": 770 + }, + { + "epoch": 0.49429657794676807, + "grad_norm": 0.43146708607673645, + "learning_rate": 9.97858104436822e-05, + "loss": 0.0395, + "step": 780 + }, + { + "epoch": 0.5006337135614702, + "grad_norm": 0.6322312951087952, + "learning_rate": 9.977024992520602e-05, + "loss": 0.0303, + "step": 790 + }, + { + "epoch": 0.5069708491761724, + "grad_norm": 0.3923812508583069, + "learning_rate": 9.975414512725057e-05, + "loss": 0.0417, + "step": 800 + }, + { + "epoch": 0.5133079847908745, + "grad_norm": 0.43715372681617737, + "learning_rate": 9.973749622593534e-05, + "loss": 0.0357, + "step": 810 + }, + { + "epoch": 0.5196451204055766, + "grad_norm": 0.4882504343986511, + "learning_rate": 9.972030340333001e-05, + "loss": 0.0483, + "step": 820 + }, + { + "epoch": 0.5259822560202788, + "grad_norm": 0.47166863083839417, + "learning_rate": 9.970256684745258e-05, + "loss": 0.033, + "step": 830 + }, + { + "epoch": 0.532319391634981, + "grad_norm": 0.6284196972846985, + "learning_rate": 9.968428675226714e-05, + "loss": 0.034, + "step": 840 + }, + { + "epoch": 0.5386565272496832, + "grad_norm": 0.34937915205955505, + "learning_rate": 9.966546331768191e-05, + "loss": 0.0288, + "step": 850 + }, + { + "epoch": 0.5449936628643853, + "grad_norm": 0.6910156607627869, + "learning_rate": 9.964609674954696e-05, + "loss": 0.0413, + "step": 860 + }, + { + "epoch": 0.5513307984790875, + "grad_norm": 0.3398362994194031, + "learning_rate": 9.962618725965196e-05, + "loss": 0.0295, + "step": 870 + }, + { + "epoch": 0.5576679340937896, + "grad_norm": 0.5590093731880188, + "learning_rate": 9.96057350657239e-05, + "loss": 0.0332, + "step": 880 + }, + { + "epoch": 0.5640050697084917, + "grad_norm": 0.444553405046463, + "learning_rate": 9.95847403914247e-05, + "loss": 0.0331, + "step": 890 + }, + { + "epoch": 0.5703422053231939, + "grad_norm": 0.4849596619606018, + "learning_rate": 9.956320346634876e-05, + "loss": 0.0388, + "step": 900 + }, + { + "epoch": 0.5766793409378961, + "grad_norm": 0.33844664692878723, + "learning_rate": 9.954112452602045e-05, + "loss": 0.0324, + "step": 910 + }, + { + "epoch": 0.5830164765525983, + "grad_norm": 0.2623671889305115, + "learning_rate": 9.95185038118915e-05, + "loss": 0.0403, + "step": 920 + }, + { + "epoch": 0.5893536121673004, + "grad_norm": 0.5219829678535461, + "learning_rate": 9.949534157133844e-05, + "loss": 0.0466, + "step": 930 + }, + { + "epoch": 0.5956907477820025, + "grad_norm": 0.40922799706459045, + "learning_rate": 9.94716380576598e-05, + "loss": 0.0295, + "step": 940 + }, + { + "epoch": 0.6020278833967047, + "grad_norm": 0.37367522716522217, + "learning_rate": 9.944739353007344e-05, + "loss": 0.0352, + "step": 950 + }, + { + "epoch": 0.6083650190114068, + "grad_norm": 0.8563676476478577, + "learning_rate": 9.942260825371358e-05, + "loss": 0.0424, + "step": 960 + }, + { + "epoch": 0.614702154626109, + "grad_norm": 0.396813303232193, + "learning_rate": 9.939728249962807e-05, + "loss": 0.0386, + "step": 970 + }, + { + "epoch": 0.6210392902408112, + "grad_norm": 0.5535157918930054, + "learning_rate": 9.937141654477528e-05, + "loss": 0.0396, + "step": 980 + }, + { + "epoch": 0.6273764258555133, + "grad_norm": 0.4769563376903534, + "learning_rate": 9.934501067202117e-05, + "loss": 0.0372, + "step": 990 + }, + { + "epoch": 0.6337135614702155, + "grad_norm": 0.5703756809234619, + "learning_rate": 9.931806517013612e-05, + "loss": 0.0325, + "step": 1000 + }, + { + "epoch": 0.6400506970849176, + "grad_norm": 0.31805068254470825, + "learning_rate": 9.929058033379181e-05, + "loss": 0.0365, + "step": 1010 + }, + { + "epoch": 0.6463878326996197, + "grad_norm": 0.5349836945533752, + "learning_rate": 9.926255646355804e-05, + "loss": 0.0265, + "step": 1020 + }, + { + "epoch": 0.6527249683143219, + "grad_norm": 0.5509829521179199, + "learning_rate": 9.923399386589933e-05, + "loss": 0.0273, + "step": 1030 + }, + { + "epoch": 0.6590621039290241, + "grad_norm": 0.5818154215812683, + "learning_rate": 9.92048928531717e-05, + "loss": 0.0334, + "step": 1040 + }, + { + "epoch": 0.6653992395437263, + "grad_norm": 0.633552074432373, + "learning_rate": 9.917525374361912e-05, + "loss": 0.0297, + "step": 1050 + }, + { + "epoch": 0.6717363751584284, + "grad_norm": 0.505277156829834, + "learning_rate": 9.914507686137019e-05, + "loss": 0.0386, + "step": 1060 + }, + { + "epoch": 0.6780735107731305, + "grad_norm": 0.30526870489120483, + "learning_rate": 9.911436253643445e-05, + "loss": 0.0315, + "step": 1070 + }, + { + "epoch": 0.6844106463878327, + "grad_norm": 0.47638586163520813, + "learning_rate": 9.90831111046988e-05, + "loss": 0.036, + "step": 1080 + }, + { + "epoch": 0.6907477820025348, + "grad_norm": 0.38751357793807983, + "learning_rate": 9.905132290792394e-05, + "loss": 0.0324, + "step": 1090 + }, + { + "epoch": 0.697084917617237, + "grad_norm": 0.20010784268379211, + "learning_rate": 9.901899829374047e-05, + "loss": 0.0231, + "step": 1100 + }, + { + "epoch": 0.7034220532319392, + "grad_norm": 0.4107750654220581, + "learning_rate": 9.89861376156452e-05, + "loss": 0.0251, + "step": 1110 + }, + { + "epoch": 0.7097591888466414, + "grad_norm": 0.24435921013355255, + "learning_rate": 9.895274123299723e-05, + "loss": 0.027, + "step": 1120 + }, + { + "epoch": 0.7160963244613435, + "grad_norm": 0.4334017336368561, + "learning_rate": 9.891880951101407e-05, + "loss": 0.0345, + "step": 1130 + }, + { + "epoch": 0.7224334600760456, + "grad_norm": 0.4201258420944214, + "learning_rate": 9.888434282076758e-05, + "loss": 0.0283, + "step": 1140 + }, + { + "epoch": 0.7287705956907478, + "grad_norm": 0.41078707575798035, + "learning_rate": 9.884934153917997e-05, + "loss": 0.0335, + "step": 1150 + }, + { + "epoch": 0.7351077313054499, + "grad_norm": 0.37457314133644104, + "learning_rate": 9.881380604901964e-05, + "loss": 0.0298, + "step": 1160 + }, + { + "epoch": 0.7414448669201521, + "grad_norm": 0.7165825366973877, + "learning_rate": 9.877773673889701e-05, + "loss": 0.0338, + "step": 1170 + }, + { + "epoch": 0.7477820025348543, + "grad_norm": 0.3498821556568146, + "learning_rate": 9.87411340032603e-05, + "loss": 0.0335, + "step": 1180 + }, + { + "epoch": 0.7541191381495564, + "grad_norm": 0.3448495864868164, + "learning_rate": 9.870399824239117e-05, + "loss": 0.0273, + "step": 1190 + }, + { + "epoch": 0.7604562737642585, + "grad_norm": 0.5568084716796875, + "learning_rate": 9.86663298624003e-05, + "loss": 0.0396, + "step": 1200 + }, + { + "epoch": 0.7667934093789607, + "grad_norm": 0.4418658912181854, + "learning_rate": 9.862812927522309e-05, + "loss": 0.0376, + "step": 1210 + }, + { + "epoch": 0.7731305449936628, + "grad_norm": 0.5665590167045593, + "learning_rate": 9.858939689861506e-05, + "loss": 0.0307, + "step": 1220 + }, + { + "epoch": 0.779467680608365, + "grad_norm": 0.45505577325820923, + "learning_rate": 9.855013315614725e-05, + "loss": 0.0263, + "step": 1230 + }, + { + "epoch": 0.7858048162230672, + "grad_norm": 0.4842069745063782, + "learning_rate": 9.851033847720166e-05, + "loss": 0.0308, + "step": 1240 + }, + { + "epoch": 0.7921419518377694, + "grad_norm": 0.4048508405685425, + "learning_rate": 9.847001329696653e-05, + "loss": 0.0243, + "step": 1250 + }, + { + "epoch": 0.7984790874524715, + "grad_norm": 0.24489720165729523, + "learning_rate": 9.842915805643155e-05, + "loss": 0.0281, + "step": 1260 + }, + { + "epoch": 0.8048162230671736, + "grad_norm": 0.533295750617981, + "learning_rate": 9.838777320238312e-05, + "loss": 0.0263, + "step": 1270 + }, + { + "epoch": 0.8111533586818758, + "grad_norm": 0.36894357204437256, + "learning_rate": 9.834585918739936e-05, + "loss": 0.0316, + "step": 1280 + }, + { + "epoch": 0.8174904942965779, + "grad_norm": 0.2535454332828522, + "learning_rate": 9.830341646984521e-05, + "loss": 0.0294, + "step": 1290 + }, + { + "epoch": 0.8238276299112801, + "grad_norm": 0.263375461101532, + "learning_rate": 9.826044551386744e-05, + "loss": 0.031, + "step": 1300 + }, + { + "epoch": 0.8301647655259823, + "grad_norm": 0.4550211429595947, + "learning_rate": 9.821694678938953e-05, + "loss": 0.0278, + "step": 1310 + }, + { + "epoch": 0.8365019011406845, + "grad_norm": 0.5714216828346252, + "learning_rate": 9.817292077210659e-05, + "loss": 0.0295, + "step": 1320 + }, + { + "epoch": 0.8428390367553865, + "grad_norm": 0.6447302103042603, + "learning_rate": 9.812836794348004e-05, + "loss": 0.0331, + "step": 1330 + }, + { + "epoch": 0.8491761723700887, + "grad_norm": 0.4367750585079193, + "learning_rate": 9.808328879073251e-05, + "loss": 0.0363, + "step": 1340 + }, + { + "epoch": 0.8555133079847909, + "grad_norm": 0.34794682264328003, + "learning_rate": 9.803768380684242e-05, + "loss": 0.0241, + "step": 1350 + }, + { + "epoch": 0.861850443599493, + "grad_norm": 0.6189941167831421, + "learning_rate": 9.799155349053851e-05, + "loss": 0.0325, + "step": 1360 + }, + { + "epoch": 0.8681875792141952, + "grad_norm": 0.4560254216194153, + "learning_rate": 9.794489834629455e-05, + "loss": 0.0242, + "step": 1370 + }, + { + "epoch": 0.8745247148288974, + "grad_norm": 0.5019411444664001, + "learning_rate": 9.789771888432375e-05, + "loss": 0.0321, + "step": 1380 + }, + { + "epoch": 0.8808618504435995, + "grad_norm": 0.7201408743858337, + "learning_rate": 9.785001562057309e-05, + "loss": 0.0334, + "step": 1390 + }, + { + "epoch": 0.8871989860583016, + "grad_norm": 0.3263881206512451, + "learning_rate": 9.780178907671789e-05, + "loss": 0.0292, + "step": 1400 + }, + { + "epoch": 0.8935361216730038, + "grad_norm": 0.552166759967804, + "learning_rate": 9.775303978015585e-05, + "loss": 0.04, + "step": 1410 + }, + { + "epoch": 0.899873257287706, + "grad_norm": 0.520951509475708, + "learning_rate": 9.77037682640015e-05, + "loss": 0.0321, + "step": 1420 + }, + { + "epoch": 0.9062103929024081, + "grad_norm": 0.6059055924415588, + "learning_rate": 9.765397506708023e-05, + "loss": 0.0344, + "step": 1430 + }, + { + "epoch": 0.9125475285171103, + "grad_norm": 0.6472659707069397, + "learning_rate": 9.760366073392246e-05, + "loss": 0.0272, + "step": 1440 + }, + { + "epoch": 0.9188846641318125, + "grad_norm": 0.2647618055343628, + "learning_rate": 9.755282581475769e-05, + "loss": 0.0316, + "step": 1450 + }, + { + "epoch": 0.9252217997465145, + "grad_norm": 0.4789205491542816, + "learning_rate": 9.750147086550844e-05, + "loss": 0.0269, + "step": 1460 + }, + { + "epoch": 0.9315589353612167, + "grad_norm": 0.3620862364768982, + "learning_rate": 9.744959644778422e-05, + "loss": 0.0198, + "step": 1470 + }, + { + "epoch": 0.9378960709759189, + "grad_norm": 0.33386313915252686, + "learning_rate": 9.739720312887535e-05, + "loss": 0.0248, + "step": 1480 + }, + { + "epoch": 0.944233206590621, + "grad_norm": 0.3961511254310608, + "learning_rate": 9.734429148174675e-05, + "loss": 0.0244, + "step": 1490 + }, + { + "epoch": 0.9505703422053232, + "grad_norm": 0.29284903407096863, + "learning_rate": 9.729086208503174e-05, + "loss": 0.0278, + "step": 1500 + }, + { + "epoch": 0.9569074778200254, + "grad_norm": 0.2947191596031189, + "learning_rate": 9.723691552302562e-05, + "loss": 0.0202, + "step": 1510 + }, + { + "epoch": 0.9632446134347274, + "grad_norm": 0.5471847653388977, + "learning_rate": 9.718245238567939e-05, + "loss": 0.0235, + "step": 1520 + }, + { + "epoch": 0.9695817490494296, + "grad_norm": 0.6192600727081299, + "learning_rate": 9.712747326859315e-05, + "loss": 0.0265, + "step": 1530 + }, + { + "epoch": 0.9759188846641318, + "grad_norm": 0.49848631024360657, + "learning_rate": 9.707197877300974e-05, + "loss": 0.0211, + "step": 1540 + }, + { + "epoch": 0.982256020278834, + "grad_norm": 0.5650535225868225, + "learning_rate": 9.701596950580806e-05, + "loss": 0.0323, + "step": 1550 + }, + { + "epoch": 0.9885931558935361, + "grad_norm": 0.4596526324748993, + "learning_rate": 9.695944607949649e-05, + "loss": 0.0214, + "step": 1560 + }, + { + "epoch": 0.9949302915082383, + "grad_norm": 0.43541160225868225, + "learning_rate": 9.690240911220618e-05, + "loss": 0.0249, + "step": 1570 + }, + { + "epoch": 1.0012674271229405, + "grad_norm": 0.3652093708515167, + "learning_rate": 9.684485922768422e-05, + "loss": 0.0217, + "step": 1580 + }, + { + "epoch": 1.0076045627376427, + "grad_norm": 0.6113967895507812, + "learning_rate": 9.6786797055287e-05, + "loss": 0.0288, + "step": 1590 + }, + { + "epoch": 1.0139416983523448, + "grad_norm": 0.35289207100868225, + "learning_rate": 9.672822322997305e-05, + "loss": 0.0233, + "step": 1600 + }, + { + "epoch": 1.020278833967047, + "grad_norm": 0.4643147587776184, + "learning_rate": 9.66691383922964e-05, + "loss": 0.0255, + "step": 1610 + }, + { + "epoch": 1.026615969581749, + "grad_norm": 0.186619833111763, + "learning_rate": 9.660954318839933e-05, + "loss": 0.0239, + "step": 1620 + }, + { + "epoch": 1.0329531051964511, + "grad_norm": 0.46518614888191223, + "learning_rate": 9.654943827000548e-05, + "loss": 0.0275, + "step": 1630 + }, + { + "epoch": 1.0392902408111533, + "grad_norm": 0.32950979471206665, + "learning_rate": 9.648882429441257e-05, + "loss": 0.0289, + "step": 1640 + }, + { + "epoch": 1.0456273764258555, + "grad_norm": 0.35503265261650085, + "learning_rate": 9.642770192448536e-05, + "loss": 0.023, + "step": 1650 + }, + { + "epoch": 1.0519645120405576, + "grad_norm": 0.29523852467536926, + "learning_rate": 9.636607182864827e-05, + "loss": 0.029, + "step": 1660 + }, + { + "epoch": 1.0583016476552598, + "grad_norm": 0.5499984622001648, + "learning_rate": 9.630393468087818e-05, + "loss": 0.0232, + "step": 1670 + }, + { + "epoch": 1.064638783269962, + "grad_norm": 0.3574666678905487, + "learning_rate": 9.624129116069694e-05, + "loss": 0.0232, + "step": 1680 + }, + { + "epoch": 1.0709759188846641, + "grad_norm": 0.4880392551422119, + "learning_rate": 9.617814195316411e-05, + "loss": 0.0348, + "step": 1690 + }, + { + "epoch": 1.0773130544993663, + "grad_norm": 0.6365160942077637, + "learning_rate": 9.611448774886924e-05, + "loss": 0.0299, + "step": 1700 + }, + { + "epoch": 1.0836501901140685, + "grad_norm": 0.27736377716064453, + "learning_rate": 9.605032924392457e-05, + "loss": 0.0219, + "step": 1710 + }, + { + "epoch": 1.0899873257287707, + "grad_norm": 0.34657785296440125, + "learning_rate": 9.598566713995718e-05, + "loss": 0.0291, + "step": 1720 + }, + { + "epoch": 1.0963244613434728, + "grad_norm": 0.3848840296268463, + "learning_rate": 9.59205021441015e-05, + "loss": 0.0244, + "step": 1730 + }, + { + "epoch": 1.102661596958175, + "grad_norm": 0.29880332946777344, + "learning_rate": 9.58548349689915e-05, + "loss": 0.0276, + "step": 1740 + }, + { + "epoch": 1.1089987325728772, + "grad_norm": 0.32027584314346313, + "learning_rate": 9.578866633275288e-05, + "loss": 0.0178, + "step": 1750 + }, + { + "epoch": 1.1153358681875791, + "grad_norm": 0.32828688621520996, + "learning_rate": 9.572199695899522e-05, + "loss": 0.0221, + "step": 1760 + }, + { + "epoch": 1.1216730038022813, + "grad_norm": 0.3575758934020996, + "learning_rate": 9.565482757680415e-05, + "loss": 0.0226, + "step": 1770 + }, + { + "epoch": 1.1280101394169835, + "grad_norm": 0.39491409063339233, + "learning_rate": 9.558715892073323e-05, + "loss": 0.0213, + "step": 1780 + }, + { + "epoch": 1.1343472750316856, + "grad_norm": 0.3913297951221466, + "learning_rate": 9.551899173079607e-05, + "loss": 0.0242, + "step": 1790 + }, + { + "epoch": 1.1406844106463878, + "grad_norm": 0.3732500374317169, + "learning_rate": 9.545032675245813e-05, + "loss": 0.0262, + "step": 1800 + }, + { + "epoch": 1.14702154626109, + "grad_norm": 0.3922960162162781, + "learning_rate": 9.538116473662861e-05, + "loss": 0.0244, + "step": 1810 + }, + { + "epoch": 1.1533586818757922, + "grad_norm": 0.30090585350990295, + "learning_rate": 9.531150643965223e-05, + "loss": 0.019, + "step": 1820 + }, + { + "epoch": 1.1596958174904943, + "grad_norm": 0.20526911318302155, + "learning_rate": 9.524135262330098e-05, + "loss": 0.021, + "step": 1830 + }, + { + "epoch": 1.1660329531051965, + "grad_norm": 0.281562864780426, + "learning_rate": 9.517070405476575e-05, + "loss": 0.0237, + "step": 1840 + }, + { + "epoch": 1.1723700887198987, + "grad_norm": 0.3962242603302002, + "learning_rate": 9.509956150664796e-05, + "loss": 0.023, + "step": 1850 + }, + { + "epoch": 1.1787072243346008, + "grad_norm": 0.4091711938381195, + "learning_rate": 9.502792575695112e-05, + "loss": 0.0227, + "step": 1860 + }, + { + "epoch": 1.1850443599493028, + "grad_norm": 0.5421469807624817, + "learning_rate": 9.49557975890723e-05, + "loss": 0.0222, + "step": 1870 + }, + { + "epoch": 1.1913814955640052, + "grad_norm": 0.3753933906555176, + "learning_rate": 9.488317779179361e-05, + "loss": 0.027, + "step": 1880 + }, + { + "epoch": 1.1977186311787071, + "grad_norm": 0.2167666256427765, + "learning_rate": 9.481006715927351e-05, + "loss": 0.0263, + "step": 1890 + }, + { + "epoch": 1.2040557667934093, + "grad_norm": 0.4261206090450287, + "learning_rate": 9.473646649103818e-05, + "loss": 0.0309, + "step": 1900 + }, + { + "epoch": 1.2103929024081115, + "grad_norm": 0.5334855318069458, + "learning_rate": 9.46623765919727e-05, + "loss": 0.0242, + "step": 1910 + }, + { + "epoch": 1.2167300380228137, + "grad_norm": 0.2908576726913452, + "learning_rate": 9.458779827231237e-05, + "loss": 0.0199, + "step": 1920 + }, + { + "epoch": 1.2230671736375158, + "grad_norm": 0.3591972589492798, + "learning_rate": 9.451273234763371e-05, + "loss": 0.0247, + "step": 1930 + }, + { + "epoch": 1.229404309252218, + "grad_norm": 0.3506092131137848, + "learning_rate": 9.443717963884569e-05, + "loss": 0.0213, + "step": 1940 + }, + { + "epoch": 1.2357414448669202, + "grad_norm": 0.2472591996192932, + "learning_rate": 9.43611409721806e-05, + "loss": 0.0237, + "step": 1950 + }, + { + "epoch": 1.2420785804816223, + "grad_norm": 0.33952316641807556, + "learning_rate": 9.428461717918511e-05, + "loss": 0.0219, + "step": 1960 + }, + { + "epoch": 1.2484157160963245, + "grad_norm": 0.17157837748527527, + "learning_rate": 9.420760909671118e-05, + "loss": 0.0268, + "step": 1970 + }, + { + "epoch": 1.2547528517110267, + "grad_norm": 0.2349107414484024, + "learning_rate": 9.413011756690685e-05, + "loss": 0.0323, + "step": 1980 + }, + { + "epoch": 1.2610899873257289, + "grad_norm": 0.3310667872428894, + "learning_rate": 9.405214343720707e-05, + "loss": 0.022, + "step": 1990 + }, + { + "epoch": 1.2674271229404308, + "grad_norm": 0.45049792528152466, + "learning_rate": 9.397368756032445e-05, + "loss": 0.0236, + "step": 2000 + }, + { + "epoch": 1.2737642585551332, + "grad_norm": 0.5270654559135437, + "learning_rate": 9.389475079423988e-05, + "loss": 0.0233, + "step": 2010 + }, + { + "epoch": 1.2801013941698351, + "grad_norm": 0.4595666229724884, + "learning_rate": 9.381533400219318e-05, + "loss": 0.0224, + "step": 2020 + }, + { + "epoch": 1.2864385297845373, + "grad_norm": 0.510229766368866, + "learning_rate": 9.373543805267368e-05, + "loss": 0.025, + "step": 2030 + }, + { + "epoch": 1.2927756653992395, + "grad_norm": 0.38161230087280273, + "learning_rate": 9.365506381941066e-05, + "loss": 0.0206, + "step": 2040 + }, + { + "epoch": 1.2991128010139417, + "grad_norm": 0.3577732443809509, + "learning_rate": 9.357421218136386e-05, + "loss": 0.0242, + "step": 2050 + }, + { + "epoch": 1.3054499366286438, + "grad_norm": 0.2776769995689392, + "learning_rate": 9.349288402271388e-05, + "loss": 0.0244, + "step": 2060 + }, + { + "epoch": 1.311787072243346, + "grad_norm": 0.2369217425584793, + "learning_rate": 9.341108023285238e-05, + "loss": 0.0274, + "step": 2070 + }, + { + "epoch": 1.3181242078580482, + "grad_norm": 0.3153173327445984, + "learning_rate": 9.332880170637252e-05, + "loss": 0.0223, + "step": 2080 + }, + { + "epoch": 1.3244613434727504, + "grad_norm": 0.3351927399635315, + "learning_rate": 9.32460493430591e-05, + "loss": 0.0191, + "step": 2090 + }, + { + "epoch": 1.3307984790874525, + "grad_norm": 0.3907555639743805, + "learning_rate": 9.316282404787871e-05, + "loss": 0.0199, + "step": 2100 + }, + { + "epoch": 1.3371356147021547, + "grad_norm": 0.3048558235168457, + "learning_rate": 9.30791267309698e-05, + "loss": 0.0257, + "step": 2110 + }, + { + "epoch": 1.3434727503168569, + "grad_norm": 0.4380377233028412, + "learning_rate": 9.299495830763286e-05, + "loss": 0.0317, + "step": 2120 + }, + { + "epoch": 1.3498098859315588, + "grad_norm": 0.47419941425323486, + "learning_rate": 9.291031969832026e-05, + "loss": 0.0294, + "step": 2130 + }, + { + "epoch": 1.3561470215462612, + "grad_norm": 0.30324944853782654, + "learning_rate": 9.282521182862629e-05, + "loss": 0.0233, + "step": 2140 + }, + { + "epoch": 1.3624841571609632, + "grad_norm": 0.3189409375190735, + "learning_rate": 9.273963562927695e-05, + "loss": 0.025, + "step": 2150 + }, + { + "epoch": 1.3688212927756653, + "grad_norm": 0.34753671288490295, + "learning_rate": 9.265359203611987e-05, + "loss": 0.0302, + "step": 2160 + }, + { + "epoch": 1.3751584283903675, + "grad_norm": 0.2331438660621643, + "learning_rate": 9.256708199011401e-05, + "loss": 0.0165, + "step": 2170 + }, + { + "epoch": 1.3814955640050697, + "grad_norm": 0.27667322754859924, + "learning_rate": 9.248010643731935e-05, + "loss": 0.0175, + "step": 2180 + }, + { + "epoch": 1.3878326996197718, + "grad_norm": 0.5424685478210449, + "learning_rate": 9.239266632888659e-05, + "loss": 0.024, + "step": 2190 + }, + { + "epoch": 1.394169835234474, + "grad_norm": 0.2840613126754761, + "learning_rate": 9.230476262104677e-05, + "loss": 0.0187, + "step": 2200 + }, + { + "epoch": 1.4005069708491762, + "grad_norm": 0.43624839186668396, + "learning_rate": 9.221639627510076e-05, + "loss": 0.0253, + "step": 2210 + }, + { + "epoch": 1.4068441064638784, + "grad_norm": 0.27831771969795227, + "learning_rate": 9.212756825740873e-05, + "loss": 0.0206, + "step": 2220 + }, + { + "epoch": 1.4131812420785805, + "grad_norm": 0.2653898298740387, + "learning_rate": 9.20382795393797e-05, + "loss": 0.0228, + "step": 2230 + }, + { + "epoch": 1.4195183776932827, + "grad_norm": 0.4191180467605591, + "learning_rate": 9.194853109746074e-05, + "loss": 0.0204, + "step": 2240 + }, + { + "epoch": 1.4258555133079849, + "grad_norm": 0.3698088526725769, + "learning_rate": 9.185832391312644e-05, + "loss": 0.0201, + "step": 2250 + }, + { + "epoch": 1.4321926489226868, + "grad_norm": 0.32586073875427246, + "learning_rate": 9.176765897286813e-05, + "loss": 0.0311, + "step": 2260 + }, + { + "epoch": 1.4385297845373892, + "grad_norm": 0.26891350746154785, + "learning_rate": 9.167653726818305e-05, + "loss": 0.0228, + "step": 2270 + }, + { + "epoch": 1.4448669201520912, + "grad_norm": 0.4289737939834595, + "learning_rate": 9.158495979556358e-05, + "loss": 0.0221, + "step": 2280 + }, + { + "epoch": 1.4512040557667933, + "grad_norm": 0.3612203598022461, + "learning_rate": 9.14929275564863e-05, + "loss": 0.0215, + "step": 2290 + }, + { + "epoch": 1.4575411913814955, + "grad_norm": 0.511667013168335, + "learning_rate": 9.140044155740101e-05, + "loss": 0.0214, + "step": 2300 + }, + { + "epoch": 1.4638783269961977, + "grad_norm": 0.31154608726501465, + "learning_rate": 9.130750280971978e-05, + "loss": 0.0228, + "step": 2310 + }, + { + "epoch": 1.4702154626108999, + "grad_norm": 0.33954188227653503, + "learning_rate": 9.121411232980588e-05, + "loss": 0.0267, + "step": 2320 + }, + { + "epoch": 1.476552598225602, + "grad_norm": 0.4101477563381195, + "learning_rate": 9.112027113896262e-05, + "loss": 0.0204, + "step": 2330 + }, + { + "epoch": 1.4828897338403042, + "grad_norm": 0.3624727427959442, + "learning_rate": 9.102598026342222e-05, + "loss": 0.0194, + "step": 2340 + }, + { + "epoch": 1.4892268694550064, + "grad_norm": 0.29435303807258606, + "learning_rate": 9.093124073433463e-05, + "loss": 0.0227, + "step": 2350 + }, + { + "epoch": 1.4955640050697085, + "grad_norm": 0.3966459035873413, + "learning_rate": 9.083605358775612e-05, + "loss": 0.02, + "step": 2360 + }, + { + "epoch": 1.5019011406844105, + "grad_norm": 0.459172785282135, + "learning_rate": 9.074041986463808e-05, + "loss": 0.025, + "step": 2370 + }, + { + "epoch": 1.508238276299113, + "grad_norm": 0.32978248596191406, + "learning_rate": 9.064434061081562e-05, + "loss": 0.0186, + "step": 2380 + }, + { + "epoch": 1.5145754119138148, + "grad_norm": 0.38167238235473633, + "learning_rate": 9.0547816876996e-05, + "loss": 0.0245, + "step": 2390 + }, + { + "epoch": 1.5209125475285172, + "grad_norm": 0.32322126626968384, + "learning_rate": 9.045084971874738e-05, + "loss": 0.022, + "step": 2400 + }, + { + "epoch": 1.5272496831432192, + "grad_norm": 0.2745392322540283, + "learning_rate": 9.035344019648702e-05, + "loss": 0.0209, + "step": 2410 + }, + { + "epoch": 1.5335868187579216, + "grad_norm": 0.3605414032936096, + "learning_rate": 9.025558937546988e-05, + "loss": 0.0213, + "step": 2420 + }, + { + "epoch": 1.5399239543726235, + "grad_norm": 0.32733994722366333, + "learning_rate": 9.015729832577681e-05, + "loss": 0.0172, + "step": 2430 + }, + { + "epoch": 1.5462610899873257, + "grad_norm": 0.6043369174003601, + "learning_rate": 9.005856812230304e-05, + "loss": 0.0239, + "step": 2440 + }, + { + "epoch": 1.5525982256020279, + "grad_norm": 0.22895248234272003, + "learning_rate": 8.995939984474624e-05, + "loss": 0.019, + "step": 2450 + }, + { + "epoch": 1.55893536121673, + "grad_norm": 0.4059270918369293, + "learning_rate": 8.98597945775948e-05, + "loss": 0.0224, + "step": 2460 + }, + { + "epoch": 1.5652724968314322, + "grad_norm": 0.3693525791168213, + "learning_rate": 8.975975341011596e-05, + "loss": 0.0187, + "step": 2470 + }, + { + "epoch": 1.5716096324461344, + "grad_norm": 0.47394832968711853, + "learning_rate": 8.965927743634391e-05, + "loss": 0.021, + "step": 2480 + }, + { + "epoch": 1.5779467680608366, + "grad_norm": 0.3060173988342285, + "learning_rate": 8.955836775506776e-05, + "loss": 0.0216, + "step": 2490 + }, + { + "epoch": 1.5842839036755385, + "grad_norm": 0.2988053560256958, + "learning_rate": 8.945702546981969e-05, + "loss": 0.024, + "step": 2500 + }, + { + "epoch": 1.590621039290241, + "grad_norm": 0.5819903612136841, + "learning_rate": 8.935525168886262e-05, + "loss": 0.0225, + "step": 2510 + }, + { + "epoch": 1.5969581749049429, + "grad_norm": 0.34001755714416504, + "learning_rate": 8.92530475251784e-05, + "loss": 0.02, + "step": 2520 + }, + { + "epoch": 1.6032953105196452, + "grad_norm": 0.45877692103385925, + "learning_rate": 8.91504140964553e-05, + "loss": 0.0215, + "step": 2530 + }, + { + "epoch": 1.6096324461343472, + "grad_norm": 0.4635033905506134, + "learning_rate": 8.90473525250761e-05, + "loss": 0.0325, + "step": 2540 + }, + { + "epoch": 1.6159695817490496, + "grad_norm": 0.42221376299858093, + "learning_rate": 8.894386393810563e-05, + "loss": 0.0196, + "step": 2550 + }, + { + "epoch": 1.6223067173637515, + "grad_norm": 0.34406083822250366, + "learning_rate": 8.883994946727849e-05, + "loss": 0.0245, + "step": 2560 + }, + { + "epoch": 1.6286438529784537, + "grad_norm": 0.4621538519859314, + "learning_rate": 8.873561024898668e-05, + "loss": 0.0236, + "step": 2570 + }, + { + "epoch": 1.6349809885931559, + "grad_norm": 0.3699153959751129, + "learning_rate": 8.863084742426719e-05, + "loss": 0.024, + "step": 2580 + }, + { + "epoch": 1.641318124207858, + "grad_norm": 0.4299221336841583, + "learning_rate": 8.852566213878947e-05, + "loss": 0.0173, + "step": 2590 + }, + { + "epoch": 1.6476552598225602, + "grad_norm": 0.25442177057266235, + "learning_rate": 8.842005554284296e-05, + "loss": 0.021, + "step": 2600 + }, + { + "epoch": 1.6539923954372624, + "grad_norm": 0.31282278895378113, + "learning_rate": 8.831402879132446e-05, + "loss": 0.0238, + "step": 2610 + }, + { + "epoch": 1.6603295310519646, + "grad_norm": 0.400104820728302, + "learning_rate": 8.820758304372557e-05, + "loss": 0.0194, + "step": 2620 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.5096003413200378, + "learning_rate": 8.810071946411989e-05, + "loss": 0.0289, + "step": 2630 + }, + { + "epoch": 1.673003802281369, + "grad_norm": 0.36454638838768005, + "learning_rate": 8.799343922115044e-05, + "loss": 0.0245, + "step": 2640 + }, + { + "epoch": 1.6793409378960709, + "grad_norm": 0.3085906207561493, + "learning_rate": 8.788574348801675e-05, + "loss": 0.0225, + "step": 2650 + }, + { + "epoch": 1.6856780735107733, + "grad_norm": 0.25602665543556213, + "learning_rate": 8.77776334424621e-05, + "loss": 0.0202, + "step": 2660 + }, + { + "epoch": 1.6920152091254752, + "grad_norm": 0.3288155496120453, + "learning_rate": 8.766911026676064e-05, + "loss": 0.0307, + "step": 2670 + }, + { + "epoch": 1.6983523447401776, + "grad_norm": 0.2693244218826294, + "learning_rate": 8.756017514770443e-05, + "loss": 0.0169, + "step": 2680 + }, + { + "epoch": 1.7046894803548795, + "grad_norm": 0.20038799941539764, + "learning_rate": 8.745082927659047e-05, + "loss": 0.0177, + "step": 2690 + }, + { + "epoch": 1.7110266159695817, + "grad_norm": 0.2368573248386383, + "learning_rate": 8.73410738492077e-05, + "loss": 0.02, + "step": 2700 + }, + { + "epoch": 1.717363751584284, + "grad_norm": 0.3750855326652527, + "learning_rate": 8.723091006582389e-05, + "loss": 0.0157, + "step": 2710 + }, + { + "epoch": 1.723700887198986, + "grad_norm": 0.468447744846344, + "learning_rate": 8.71203391311725e-05, + "loss": 0.0212, + "step": 2720 + }, + { + "epoch": 1.7300380228136882, + "grad_norm": 0.32794639468193054, + "learning_rate": 8.700936225443959e-05, + "loss": 0.015, + "step": 2730 + }, + { + "epoch": 1.7363751584283904, + "grad_norm": 0.5568444132804871, + "learning_rate": 8.689798064925049e-05, + "loss": 0.0182, + "step": 2740 + }, + { + "epoch": 1.7427122940430926, + "grad_norm": 0.36910462379455566, + "learning_rate": 8.678619553365659e-05, + "loss": 0.0213, + "step": 2750 + }, + { + "epoch": 1.7490494296577945, + "grad_norm": 0.21325695514678955, + "learning_rate": 8.6674008130122e-05, + "loss": 0.0195, + "step": 2760 + }, + { + "epoch": 1.755386565272497, + "grad_norm": 0.22268667817115784, + "learning_rate": 8.656141966551019e-05, + "loss": 0.0192, + "step": 2770 + }, + { + "epoch": 1.7617237008871989, + "grad_norm": 0.5589450597763062, + "learning_rate": 8.644843137107059e-05, + "loss": 0.0227, + "step": 2780 + }, + { + "epoch": 1.7680608365019013, + "grad_norm": 0.22853608429431915, + "learning_rate": 8.633504448242505e-05, + "loss": 0.0234, + "step": 2790 + }, + { + "epoch": 1.7743979721166032, + "grad_norm": 0.3304268419742584, + "learning_rate": 8.622126023955446e-05, + "loss": 0.0202, + "step": 2800 + }, + { + "epoch": 1.7807351077313056, + "grad_norm": 0.30013397336006165, + "learning_rate": 8.610707988678503e-05, + "loss": 0.0189, + "step": 2810 + }, + { + "epoch": 1.7870722433460076, + "grad_norm": 0.3276195228099823, + "learning_rate": 8.599250467277483e-05, + "loss": 0.0174, + "step": 2820 + }, + { + "epoch": 1.7934093789607097, + "grad_norm": 0.37568747997283936, + "learning_rate": 8.587753585050004e-05, + "loss": 0.0217, + "step": 2830 + }, + { + "epoch": 1.799746514575412, + "grad_norm": 0.6022403836250305, + "learning_rate": 8.576217467724128e-05, + "loss": 0.0253, + "step": 2840 + }, + { + "epoch": 1.806083650190114, + "grad_norm": 0.3828267455101013, + "learning_rate": 8.564642241456986e-05, + "loss": 0.0219, + "step": 2850 + }, + { + "epoch": 1.8124207858048162, + "grad_norm": 0.37504634261131287, + "learning_rate": 8.553028032833397e-05, + "loss": 0.021, + "step": 2860 + }, + { + "epoch": 1.8187579214195184, + "grad_norm": 0.6533935070037842, + "learning_rate": 8.541374968864487e-05, + "loss": 0.027, + "step": 2870 + }, + { + "epoch": 1.8250950570342206, + "grad_norm": 0.4715104103088379, + "learning_rate": 8.529683176986295e-05, + "loss": 0.0165, + "step": 2880 + }, + { + "epoch": 1.8314321926489225, + "grad_norm": 0.5481289029121399, + "learning_rate": 8.517952785058385e-05, + "loss": 0.0253, + "step": 2890 + }, + { + "epoch": 1.837769328263625, + "grad_norm": 0.3077695667743683, + "learning_rate": 8.506183921362443e-05, + "loss": 0.0192, + "step": 2900 + }, + { + "epoch": 1.8441064638783269, + "grad_norm": 0.5000676512718201, + "learning_rate": 8.494376714600878e-05, + "loss": 0.0234, + "step": 2910 + }, + { + "epoch": 1.8504435994930293, + "grad_norm": 0.253926545381546, + "learning_rate": 8.482531293895412e-05, + "loss": 0.0225, + "step": 2920 + }, + { + "epoch": 1.8567807351077312, + "grad_norm": 0.3240850269794464, + "learning_rate": 8.470647788785665e-05, + "loss": 0.0185, + "step": 2930 + }, + { + "epoch": 1.8631178707224336, + "grad_norm": 0.4573562741279602, + "learning_rate": 8.458726329227747e-05, + "loss": 0.0218, + "step": 2940 + }, + { + "epoch": 1.8694550063371356, + "grad_norm": 0.443369597196579, + "learning_rate": 8.44676704559283e-05, + "loss": 0.0244, + "step": 2950 + }, + { + "epoch": 1.8757921419518377, + "grad_norm": 0.3431839942932129, + "learning_rate": 8.434770068665723e-05, + "loss": 0.0253, + "step": 2960 + }, + { + "epoch": 1.88212927756654, + "grad_norm": 0.37696412205696106, + "learning_rate": 8.422735529643444e-05, + "loss": 0.0202, + "step": 2970 + }, + { + "epoch": 1.888466413181242, + "grad_norm": 0.6237473487854004, + "learning_rate": 8.410663560133784e-05, + "loss": 0.0197, + "step": 2980 + }, + { + "epoch": 1.8948035487959443, + "grad_norm": 0.39705732464790344, + "learning_rate": 8.398554292153866e-05, + "loss": 0.0239, + "step": 2990 + }, + { + "epoch": 1.9011406844106464, + "grad_norm": 0.37278616428375244, + "learning_rate": 8.386407858128706e-05, + "loss": 0.0289, + "step": 3000 + }, + { + "epoch": 1.9074778200253486, + "grad_norm": 0.3443812131881714, + "learning_rate": 8.37422439088976e-05, + "loss": 0.0217, + "step": 3010 + }, + { + "epoch": 1.9138149556400506, + "grad_norm": 0.28766191005706787, + "learning_rate": 8.362004023673474e-05, + "loss": 0.024, + "step": 3020 + }, + { + "epoch": 1.920152091254753, + "grad_norm": 0.17464351654052734, + "learning_rate": 8.349746890119826e-05, + "loss": 0.0201, + "step": 3030 + }, + { + "epoch": 1.926489226869455, + "grad_norm": 0.37782022356987, + "learning_rate": 8.337453124270863e-05, + "loss": 0.0225, + "step": 3040 + }, + { + "epoch": 1.9328263624841573, + "grad_norm": 0.28459540009498596, + "learning_rate": 8.32512286056924e-05, + "loss": 0.0247, + "step": 3050 + }, + { + "epoch": 1.9391634980988592, + "grad_norm": 0.3449212610721588, + "learning_rate": 8.31275623385675e-05, + "loss": 0.0185, + "step": 3060 + }, + { + "epoch": 1.9455006337135616, + "grad_norm": 0.451109379529953, + "learning_rate": 8.300353379372834e-05, + "loss": 0.0144, + "step": 3070 + }, + { + "epoch": 1.9518377693282636, + "grad_norm": 0.44775041937828064, + "learning_rate": 8.287914432753123e-05, + "loss": 0.0239, + "step": 3080 + }, + { + "epoch": 1.9581749049429658, + "grad_norm": 0.3843693435192108, + "learning_rate": 8.275439530027948e-05, + "loss": 0.0205, + "step": 3090 + }, + { + "epoch": 1.964512040557668, + "grad_norm": 0.32762160897254944, + "learning_rate": 8.262928807620843e-05, + "loss": 0.0222, + "step": 3100 + }, + { + "epoch": 1.97084917617237, + "grad_norm": 0.323930025100708, + "learning_rate": 8.250382402347065e-05, + "loss": 0.0191, + "step": 3110 + }, + { + "epoch": 1.9771863117870723, + "grad_norm": 0.26865267753601074, + "learning_rate": 8.237800451412095e-05, + "loss": 0.0205, + "step": 3120 + }, + { + "epoch": 1.9835234474017744, + "grad_norm": 0.27057474851608276, + "learning_rate": 8.225183092410128e-05, + "loss": 0.0181, + "step": 3130 + }, + { + "epoch": 1.9898605830164766, + "grad_norm": 0.38607391715049744, + "learning_rate": 8.212530463322583e-05, + "loss": 0.0206, + "step": 3140 + }, + { + "epoch": 1.9961977186311786, + "grad_norm": 0.3551093637943268, + "learning_rate": 8.199842702516583e-05, + "loss": 0.0226, + "step": 3150 + }, + { + "epoch": 2.002534854245881, + "grad_norm": 0.3289443552494049, + "learning_rate": 8.18711994874345e-05, + "loss": 0.0159, + "step": 3160 + }, + { + "epoch": 2.008871989860583, + "grad_norm": 0.29715219140052795, + "learning_rate": 8.174362341137177e-05, + "loss": 0.0188, + "step": 3170 + }, + { + "epoch": 2.0152091254752853, + "grad_norm": 0.25433430075645447, + "learning_rate": 8.161570019212921e-05, + "loss": 0.0185, + "step": 3180 + }, + { + "epoch": 2.0215462610899873, + "grad_norm": 0.2330443561077118, + "learning_rate": 8.148743122865463e-05, + "loss": 0.016, + "step": 3190 + }, + { + "epoch": 2.0278833967046896, + "grad_norm": 0.3372531235218048, + "learning_rate": 8.135881792367686e-05, + "loss": 0.0216, + "step": 3200 + }, + { + "epoch": 2.0342205323193916, + "grad_norm": 0.24772562086582184, + "learning_rate": 8.12298616836904e-05, + "loss": 0.0156, + "step": 3210 + }, + { + "epoch": 2.040557667934094, + "grad_norm": 0.3528910279273987, + "learning_rate": 8.110056391894005e-05, + "loss": 0.0149, + "step": 3220 + }, + { + "epoch": 2.046894803548796, + "grad_norm": 0.2773556113243103, + "learning_rate": 8.097092604340542e-05, + "loss": 0.0172, + "step": 3230 + }, + { + "epoch": 2.053231939163498, + "grad_norm": 0.39497804641723633, + "learning_rate": 8.084094947478556e-05, + "loss": 0.0193, + "step": 3240 + }, + { + "epoch": 2.0595690747782003, + "grad_norm": 0.2211255133152008, + "learning_rate": 8.07106356344834e-05, + "loss": 0.0155, + "step": 3250 + }, + { + "epoch": 2.0659062103929022, + "grad_norm": 0.3327501118183136, + "learning_rate": 8.057998594759022e-05, + "loss": 0.0154, + "step": 3260 + }, + { + "epoch": 2.0722433460076046, + "grad_norm": 0.3289473056793213, + "learning_rate": 8.044900184287007e-05, + "loss": 0.0199, + "step": 3270 + }, + { + "epoch": 2.0785804816223066, + "grad_norm": 0.36586564779281616, + "learning_rate": 8.031768475274413e-05, + "loss": 0.0164, + "step": 3280 + }, + { + "epoch": 2.084917617237009, + "grad_norm": 0.3189631402492523, + "learning_rate": 8.018603611327504e-05, + "loss": 0.0168, + "step": 3290 + }, + { + "epoch": 2.091254752851711, + "grad_norm": 0.36633414030075073, + "learning_rate": 8.005405736415126e-05, + "loss": 0.0317, + "step": 3300 + }, + { + "epoch": 2.0975918884664133, + "grad_norm": 0.2375175505876541, + "learning_rate": 7.992174994867123e-05, + "loss": 0.0123, + "step": 3310 + }, + { + "epoch": 2.1039290240811153, + "grad_norm": 0.39194151759147644, + "learning_rate": 7.978911531372765e-05, + "loss": 0.0148, + "step": 3320 + }, + { + "epoch": 2.1102661596958177, + "grad_norm": 0.4107789099216461, + "learning_rate": 7.965615490979163e-05, + "loss": 0.0205, + "step": 3330 + }, + { + "epoch": 2.1166032953105196, + "grad_norm": 0.3235548436641693, + "learning_rate": 7.952287019089685e-05, + "loss": 0.0159, + "step": 3340 + }, + { + "epoch": 2.122940430925222, + "grad_norm": 0.2603886127471924, + "learning_rate": 7.938926261462366e-05, + "loss": 0.0188, + "step": 3350 + }, + { + "epoch": 2.129277566539924, + "grad_norm": 0.387168288230896, + "learning_rate": 7.925533364208309e-05, + "loss": 0.0223, + "step": 3360 + }, + { + "epoch": 2.1356147021546263, + "grad_norm": 0.4185580015182495, + "learning_rate": 7.912108473790092e-05, + "loss": 0.0242, + "step": 3370 + }, + { + "epoch": 2.1419518377693283, + "grad_norm": 0.25904420018196106, + "learning_rate": 7.898651737020166e-05, + "loss": 0.0169, + "step": 3380 + }, + { + "epoch": 2.1482889733840302, + "grad_norm": 0.29916971921920776, + "learning_rate": 7.88516330105925e-05, + "loss": 0.0155, + "step": 3390 + }, + { + "epoch": 2.1546261089987326, + "grad_norm": 0.3949578106403351, + "learning_rate": 7.871643313414718e-05, + "loss": 0.018, + "step": 3400 + }, + { + "epoch": 2.1609632446134346, + "grad_norm": 0.31313779950141907, + "learning_rate": 7.858091921938988e-05, + "loss": 0.0139, + "step": 3410 + }, + { + "epoch": 2.167300380228137, + "grad_norm": 0.31156864762306213, + "learning_rate": 7.844509274827907e-05, + "loss": 0.0231, + "step": 3420 + }, + { + "epoch": 2.173637515842839, + "grad_norm": 0.27319496870040894, + "learning_rate": 7.830895520619128e-05, + "loss": 0.0173, + "step": 3430 + }, + { + "epoch": 2.1799746514575413, + "grad_norm": 0.5082638263702393, + "learning_rate": 7.817250808190483e-05, + "loss": 0.0206, + "step": 3440 + }, + { + "epoch": 2.1863117870722433, + "grad_norm": 0.3610362410545349, + "learning_rate": 7.803575286758364e-05, + "loss": 0.0187, + "step": 3450 + }, + { + "epoch": 2.1926489226869457, + "grad_norm": 0.6687695384025574, + "learning_rate": 7.789869105876083e-05, + "loss": 0.0188, + "step": 3460 + }, + { + "epoch": 2.1989860583016476, + "grad_norm": 0.5592470765113831, + "learning_rate": 7.776132415432234e-05, + "loss": 0.0178, + "step": 3470 + }, + { + "epoch": 2.20532319391635, + "grad_norm": 0.27790650725364685, + "learning_rate": 7.762365365649067e-05, + "loss": 0.0171, + "step": 3480 + }, + { + "epoch": 2.211660329531052, + "grad_norm": 0.4534870684146881, + "learning_rate": 7.748568107080832e-05, + "loss": 0.0219, + "step": 3490 + }, + { + "epoch": 2.2179974651457544, + "grad_norm": 0.33770209550857544, + "learning_rate": 7.734740790612136e-05, + "loss": 0.0184, + "step": 3500 + }, + { + "epoch": 2.2243346007604563, + "grad_norm": 0.29577895998954773, + "learning_rate": 7.720883567456298e-05, + "loss": 0.0194, + "step": 3510 + }, + { + "epoch": 2.2306717363751583, + "grad_norm": 0.45840632915496826, + "learning_rate": 7.70699658915369e-05, + "loss": 0.023, + "step": 3520 + }, + { + "epoch": 2.2370088719898606, + "grad_norm": 0.2918465733528137, + "learning_rate": 7.693080007570084e-05, + "loss": 0.0176, + "step": 3530 + }, + { + "epoch": 2.2433460076045626, + "grad_norm": 0.3843630254268646, + "learning_rate": 7.679133974894983e-05, + "loss": 0.0265, + "step": 3540 + }, + { + "epoch": 2.249683143219265, + "grad_norm": 0.24515217542648315, + "learning_rate": 7.66515864363997e-05, + "loss": 0.0174, + "step": 3550 + }, + { + "epoch": 2.256020278833967, + "grad_norm": 0.19862592220306396, + "learning_rate": 7.651154166637025e-05, + "loss": 0.0215, + "step": 3560 + }, + { + "epoch": 2.2623574144486693, + "grad_norm": 0.17352795600891113, + "learning_rate": 7.637120697036866e-05, + "loss": 0.0278, + "step": 3570 + }, + { + "epoch": 2.2686945500633713, + "grad_norm": 0.34404289722442627, + "learning_rate": 7.623058388307269e-05, + "loss": 0.0208, + "step": 3580 + }, + { + "epoch": 2.2750316856780737, + "grad_norm": 0.2361377477645874, + "learning_rate": 7.608967394231387e-05, + "loss": 0.0215, + "step": 3590 + }, + { + "epoch": 2.2813688212927756, + "grad_norm": 0.24867765605449677, + "learning_rate": 7.594847868906076e-05, + "loss": 0.0197, + "step": 3600 + }, + { + "epoch": 2.2877059569074776, + "grad_norm": 0.33056747913360596, + "learning_rate": 7.580699966740201e-05, + "loss": 0.0206, + "step": 3610 + }, + { + "epoch": 2.29404309252218, + "grad_norm": 0.4908096194267273, + "learning_rate": 7.566523842452958e-05, + "loss": 0.0193, + "step": 3620 + }, + { + "epoch": 2.3003802281368824, + "grad_norm": 0.3107684254646301, + "learning_rate": 7.552319651072164e-05, + "loss": 0.0179, + "step": 3630 + }, + { + "epoch": 2.3067173637515843, + "grad_norm": 0.2108030468225479, + "learning_rate": 7.538087547932585e-05, + "loss": 0.0167, + "step": 3640 + }, + { + "epoch": 2.3130544993662863, + "grad_norm": 0.3508223295211792, + "learning_rate": 7.52382768867422e-05, + "loss": 0.0226, + "step": 3650 + }, + { + "epoch": 2.3193916349809887, + "grad_norm": 0.33375847339630127, + "learning_rate": 7.509540229240601e-05, + "loss": 0.0159, + "step": 3660 + }, + { + "epoch": 2.3257287705956906, + "grad_norm": 0.3554519712924957, + "learning_rate": 7.495225325877103e-05, + "loss": 0.0182, + "step": 3670 + }, + { + "epoch": 2.332065906210393, + "grad_norm": 0.38898682594299316, + "learning_rate": 7.480883135129211e-05, + "loss": 0.0145, + "step": 3680 + }, + { + "epoch": 2.338403041825095, + "grad_norm": 0.33638787269592285, + "learning_rate": 7.466513813840825e-05, + "loss": 0.0211, + "step": 3690 + }, + { + "epoch": 2.3447401774397973, + "grad_norm": 0.40711718797683716, + "learning_rate": 7.452117519152542e-05, + "loss": 0.0196, + "step": 3700 + }, + { + "epoch": 2.3510773130544993, + "grad_norm": 0.3523763418197632, + "learning_rate": 7.437694408499933e-05, + "loss": 0.0216, + "step": 3710 + }, + { + "epoch": 2.3574144486692017, + "grad_norm": 0.2812027633190155, + "learning_rate": 7.423244639611826e-05, + "loss": 0.0207, + "step": 3720 + }, + { + "epoch": 2.3637515842839036, + "grad_norm": 0.2593352496623993, + "learning_rate": 7.408768370508576e-05, + "loss": 0.0198, + "step": 3730 + }, + { + "epoch": 2.3700887198986056, + "grad_norm": 0.5507994890213013, + "learning_rate": 7.394265759500348e-05, + "loss": 0.0249, + "step": 3740 + }, + { + "epoch": 2.376425855513308, + "grad_norm": 0.33799615502357483, + "learning_rate": 7.379736965185368e-05, + "loss": 0.0222, + "step": 3750 + }, + { + "epoch": 2.3827629911280104, + "grad_norm": 0.3209291398525238, + "learning_rate": 7.365182146448205e-05, + "loss": 0.02, + "step": 3760 + }, + { + "epoch": 2.3891001267427123, + "grad_norm": 0.34429389238357544, + "learning_rate": 7.350601462458024e-05, + "loss": 0.0156, + "step": 3770 + }, + { + "epoch": 2.3954372623574143, + "grad_norm": 0.29221874475479126, + "learning_rate": 7.335995072666848e-05, + "loss": 0.0141, + "step": 3780 + }, + { + "epoch": 2.4017743979721167, + "grad_norm": 0.37552404403686523, + "learning_rate": 7.32136313680782e-05, + "loss": 0.0193, + "step": 3790 + }, + { + "epoch": 2.4081115335868186, + "grad_norm": 0.25026583671569824, + "learning_rate": 7.30670581489344e-05, + "loss": 0.0215, + "step": 3800 + }, + { + "epoch": 2.414448669201521, + "grad_norm": 0.3728311061859131, + "learning_rate": 7.292023267213835e-05, + "loss": 0.0173, + "step": 3810 + }, + { + "epoch": 2.420785804816223, + "grad_norm": 0.35615667700767517, + "learning_rate": 7.277315654334997e-05, + "loss": 0.021, + "step": 3820 + }, + { + "epoch": 2.4271229404309254, + "grad_norm": 0.25696536898612976, + "learning_rate": 7.262583137097018e-05, + "loss": 0.018, + "step": 3830 + }, + { + "epoch": 2.4334600760456273, + "grad_norm": 0.3925075829029083, + "learning_rate": 7.247825876612353e-05, + "loss": 0.0168, + "step": 3840 + }, + { + "epoch": 2.4397972116603297, + "grad_norm": 0.2698250710964203, + "learning_rate": 7.233044034264034e-05, + "loss": 0.0178, + "step": 3850 + }, + { + "epoch": 2.4461343472750317, + "grad_norm": 0.29354268312454224, + "learning_rate": 7.218237771703921e-05, + "loss": 0.0183, + "step": 3860 + }, + { + "epoch": 2.4524714828897336, + "grad_norm": 0.3076222240924835, + "learning_rate": 7.203407250850928e-05, + "loss": 0.0167, + "step": 3870 + }, + { + "epoch": 2.458808618504436, + "grad_norm": 0.23654355108737946, + "learning_rate": 7.188552633889259e-05, + "loss": 0.0165, + "step": 3880 + }, + { + "epoch": 2.4651457541191384, + "grad_norm": 0.2536214292049408, + "learning_rate": 7.173674083266624e-05, + "loss": 0.0138, + "step": 3890 + }, + { + "epoch": 2.4714828897338403, + "grad_norm": 0.2710602879524231, + "learning_rate": 7.158771761692464e-05, + "loss": 0.0148, + "step": 3900 + }, + { + "epoch": 2.4778200253485423, + "grad_norm": 0.15324455499649048, + "learning_rate": 7.143845832136188e-05, + "loss": 0.0142, + "step": 3910 + }, + { + "epoch": 2.4841571609632447, + "grad_norm": 0.13363996148109436, + "learning_rate": 7.128896457825364e-05, + "loss": 0.0144, + "step": 3920 + }, + { + "epoch": 2.4904942965779466, + "grad_norm": 0.2499360889196396, + "learning_rate": 7.113923802243957e-05, + "loss": 0.0177, + "step": 3930 + }, + { + "epoch": 2.496831432192649, + "grad_norm": 0.1742558777332306, + "learning_rate": 7.09892802913053e-05, + "loss": 0.0161, + "step": 3940 + }, + { + "epoch": 2.503168567807351, + "grad_norm": 0.33764421939849854, + "learning_rate": 7.083909302476453e-05, + "loss": 0.0189, + "step": 3950 + }, + { + "epoch": 2.5095057034220534, + "grad_norm": 0.35038939118385315, + "learning_rate": 7.068867786524116e-05, + "loss": 0.0236, + "step": 3960 + }, + { + "epoch": 2.5158428390367553, + "grad_norm": 0.16515985131263733, + "learning_rate": 7.053803645765128e-05, + "loss": 0.0169, + "step": 3970 + }, + { + "epoch": 2.5221799746514577, + "grad_norm": 0.49210289120674133, + "learning_rate": 7.038717044938519e-05, + "loss": 0.0188, + "step": 3980 + }, + { + "epoch": 2.5285171102661597, + "grad_norm": 0.2284458875656128, + "learning_rate": 7.023608149028937e-05, + "loss": 0.0159, + "step": 3990 + }, + { + "epoch": 2.5348542458808616, + "grad_norm": 0.19401752948760986, + "learning_rate": 7.008477123264848e-05, + "loss": 0.0168, + "step": 4000 + }, + { + "epoch": 2.541191381495564, + "grad_norm": 0.14309851825237274, + "learning_rate": 6.993324133116726e-05, + "loss": 0.0118, + "step": 4010 + }, + { + "epoch": 2.5475285171102664, + "grad_norm": 0.22239387035369873, + "learning_rate": 6.978149344295242e-05, + "loss": 0.0141, + "step": 4020 + }, + { + "epoch": 2.5538656527249683, + "grad_norm": 0.4320535957813263, + "learning_rate": 6.962952922749457e-05, + "loss": 0.0178, + "step": 4030 + }, + { + "epoch": 2.5602027883396703, + "grad_norm": 0.32748010754585266, + "learning_rate": 6.947735034665002e-05, + "loss": 0.0137, + "step": 4040 + }, + { + "epoch": 2.5665399239543727, + "grad_norm": 0.1492803394794464, + "learning_rate": 6.932495846462261e-05, + "loss": 0.0219, + "step": 4050 + }, + { + "epoch": 2.5728770595690746, + "grad_norm": 0.3322237432003021, + "learning_rate": 6.917235524794558e-05, + "loss": 0.0188, + "step": 4060 + }, + { + "epoch": 2.579214195183777, + "grad_norm": 0.40152451395988464, + "learning_rate": 6.901954236546323e-05, + "loss": 0.0263, + "step": 4070 + }, + { + "epoch": 2.585551330798479, + "grad_norm": 0.43540433049201965, + "learning_rate": 6.886652148831279e-05, + "loss": 0.0146, + "step": 4080 + }, + { + "epoch": 2.5918884664131814, + "grad_norm": 0.24758180975914001, + "learning_rate": 6.871329428990602e-05, + "loss": 0.0146, + "step": 4090 + }, + { + "epoch": 2.5982256020278833, + "grad_norm": 0.34365662932395935, + "learning_rate": 6.855986244591104e-05, + "loss": 0.0166, + "step": 4100 + }, + { + "epoch": 2.6045627376425857, + "grad_norm": 0.2038048356771469, + "learning_rate": 6.840622763423391e-05, + "loss": 0.0151, + "step": 4110 + }, + { + "epoch": 2.6108998732572877, + "grad_norm": 0.182295024394989, + "learning_rate": 6.825239153500029e-05, + "loss": 0.0145, + "step": 4120 + }, + { + "epoch": 2.6172370088719896, + "grad_norm": 0.3425464928150177, + "learning_rate": 6.809835583053715e-05, + "loss": 0.0121, + "step": 4130 + }, + { + "epoch": 2.623574144486692, + "grad_norm": 0.2870509624481201, + "learning_rate": 6.794412220535426e-05, + "loss": 0.0127, + "step": 4140 + }, + { + "epoch": 2.6299112801013944, + "grad_norm": 0.34516963362693787, + "learning_rate": 6.778969234612584e-05, + "loss": 0.0158, + "step": 4150 + }, + { + "epoch": 2.6362484157160964, + "grad_norm": 0.27682995796203613, + "learning_rate": 6.763506794167208e-05, + "loss": 0.0132, + "step": 4160 + }, + { + "epoch": 2.6425855513307983, + "grad_norm": 0.3806769549846649, + "learning_rate": 6.748025068294067e-05, + "loss": 0.0158, + "step": 4170 + }, + { + "epoch": 2.6489226869455007, + "grad_norm": 0.20911303162574768, + "learning_rate": 6.732524226298841e-05, + "loss": 0.0164, + "step": 4180 + }, + { + "epoch": 2.6552598225602027, + "grad_norm": 0.263217568397522, + "learning_rate": 6.71700443769625e-05, + "loss": 0.0174, + "step": 4190 + }, + { + "epoch": 2.661596958174905, + "grad_norm": 0.17768670618534088, + "learning_rate": 6.701465872208216e-05, + "loss": 0.0156, + "step": 4200 + }, + { + "epoch": 2.667934093789607, + "grad_norm": 0.30574101209640503, + "learning_rate": 6.685908699762002e-05, + "loss": 0.0127, + "step": 4210 + }, + { + "epoch": 2.6742712294043094, + "grad_norm": 0.2742626965045929, + "learning_rate": 6.670333090488356e-05, + "loss": 0.0168, + "step": 4220 + }, + { + "epoch": 2.6806083650190113, + "grad_norm": 0.24799436330795288, + "learning_rate": 6.654739214719641e-05, + "loss": 0.0141, + "step": 4230 + }, + { + "epoch": 2.6869455006337137, + "grad_norm": 0.33699798583984375, + "learning_rate": 6.639127242987988e-05, + "loss": 0.0115, + "step": 4240 + }, + { + "epoch": 2.6932826362484157, + "grad_norm": 0.4366147816181183, + "learning_rate": 6.623497346023418e-05, + "loss": 0.0201, + "step": 4250 + }, + { + "epoch": 2.6996197718631176, + "grad_norm": 0.29953205585479736, + "learning_rate": 6.607849694751977e-05, + "loss": 0.0143, + "step": 4260 + }, + { + "epoch": 2.70595690747782, + "grad_norm": 0.33452221751213074, + "learning_rate": 6.592184460293877e-05, + "loss": 0.0144, + "step": 4270 + }, + { + "epoch": 2.7122940430925224, + "grad_norm": 0.2584233283996582, + "learning_rate": 6.576501813961609e-05, + "loss": 0.0161, + "step": 4280 + }, + { + "epoch": 2.7186311787072244, + "grad_norm": 0.3979489505290985, + "learning_rate": 6.56080192725808e-05, + "loss": 0.0282, + "step": 4290 + }, + { + "epoch": 2.7249683143219263, + "grad_norm": 0.4084644615650177, + "learning_rate": 6.545084971874738e-05, + "loss": 0.0155, + "step": 4300 + }, + { + "epoch": 2.7313054499366287, + "grad_norm": 0.3948853611946106, + "learning_rate": 6.529351119689688e-05, + "loss": 0.014, + "step": 4310 + }, + { + "epoch": 2.7376425855513307, + "grad_norm": 0.3730246424674988, + "learning_rate": 6.513600542765817e-05, + "loss": 0.0205, + "step": 4320 + }, + { + "epoch": 2.743979721166033, + "grad_norm": 0.30106091499328613, + "learning_rate": 6.497833413348909e-05, + "loss": 0.0162, + "step": 4330 + }, + { + "epoch": 2.750316856780735, + "grad_norm": 0.14317883551120758, + "learning_rate": 6.48204990386577e-05, + "loss": 0.0134, + "step": 4340 + }, + { + "epoch": 2.7566539923954374, + "grad_norm": 0.33047810196876526, + "learning_rate": 6.466250186922325e-05, + "loss": 0.0138, + "step": 4350 + }, + { + "epoch": 2.7629911280101394, + "grad_norm": 0.29450923204421997, + "learning_rate": 6.450434435301751e-05, + "loss": 0.0166, + "step": 4360 + }, + { + "epoch": 2.7693282636248417, + "grad_norm": 0.22328028082847595, + "learning_rate": 6.43460282196257e-05, + "loss": 0.0159, + "step": 4370 + }, + { + "epoch": 2.7756653992395437, + "grad_norm": 0.170170858502388, + "learning_rate": 6.418755520036775e-05, + "loss": 0.0147, + "step": 4380 + }, + { + "epoch": 2.7820025348542456, + "grad_norm": 0.3056323826313019, + "learning_rate": 6.402892702827916e-05, + "loss": 0.0193, + "step": 4390 + }, + { + "epoch": 2.788339670468948, + "grad_norm": 0.19363383948802948, + "learning_rate": 6.387014543809223e-05, + "loss": 0.0134, + "step": 4400 + }, + { + "epoch": 2.7946768060836504, + "grad_norm": 0.4496561586856842, + "learning_rate": 6.371121216621698e-05, + "loss": 0.0132, + "step": 4410 + }, + { + "epoch": 2.8010139416983524, + "grad_norm": 0.17226959764957428, + "learning_rate": 6.355212895072223e-05, + "loss": 0.0119, + "step": 4420 + }, + { + "epoch": 2.8073510773130543, + "grad_norm": 0.2828828692436218, + "learning_rate": 6.339289753131649e-05, + "loss": 0.016, + "step": 4430 + }, + { + "epoch": 2.8136882129277567, + "grad_norm": 0.28239116072654724, + "learning_rate": 6.323351964932908e-05, + "loss": 0.015, + "step": 4440 + }, + { + "epoch": 2.8200253485424587, + "grad_norm": 0.2226468324661255, + "learning_rate": 6.307399704769099e-05, + "loss": 0.0135, + "step": 4450 + }, + { + "epoch": 2.826362484157161, + "grad_norm": 0.23484410345554352, + "learning_rate": 6.291433147091583e-05, + "loss": 0.015, + "step": 4460 + }, + { + "epoch": 2.832699619771863, + "grad_norm": 0.2702759802341461, + "learning_rate": 6.275452466508077e-05, + "loss": 0.016, + "step": 4470 + }, + { + "epoch": 2.8390367553865654, + "grad_norm": 0.39190873503685, + "learning_rate": 6.259457837780742e-05, + "loss": 0.0116, + "step": 4480 + }, + { + "epoch": 2.8453738910012674, + "grad_norm": 0.4797994792461395, + "learning_rate": 6.243449435824276e-05, + "loss": 0.0148, + "step": 4490 + }, + { + "epoch": 2.8517110266159698, + "grad_norm": 0.29361021518707275, + "learning_rate": 6.227427435703997e-05, + "loss": 0.025, + "step": 4500 + }, + { + "epoch": 2.8580481622306717, + "grad_norm": 0.26431670784950256, + "learning_rate": 6.211392012633932e-05, + "loss": 0.0116, + "step": 4510 + }, + { + "epoch": 2.8643852978453737, + "grad_norm": 0.2542441189289093, + "learning_rate": 6.195343341974899e-05, + "loss": 0.0191, + "step": 4520 + }, + { + "epoch": 2.870722433460076, + "grad_norm": 0.21877507865428925, + "learning_rate": 6.179281599232591e-05, + "loss": 0.016, + "step": 4530 + }, + { + "epoch": 2.8770595690747784, + "grad_norm": 0.3358696401119232, + "learning_rate": 6.163206960055651e-05, + "loss": 0.0152, + "step": 4540 + }, + { + "epoch": 2.8833967046894804, + "grad_norm": 0.3265288174152374, + "learning_rate": 6.147119600233758e-05, + "loss": 0.0199, + "step": 4550 + }, + { + "epoch": 2.8897338403041823, + "grad_norm": 0.3462512195110321, + "learning_rate": 6.131019695695702e-05, + "loss": 0.0141, + "step": 4560 + }, + { + "epoch": 2.8960709759188847, + "grad_norm": 0.24054935574531555, + "learning_rate": 6.11490742250746e-05, + "loss": 0.0133, + "step": 4570 + }, + { + "epoch": 2.9024081115335867, + "grad_norm": 0.2769206166267395, + "learning_rate": 6.0987829568702656e-05, + "loss": 0.0177, + "step": 4580 + }, + { + "epoch": 2.908745247148289, + "grad_norm": 0.23850680887699127, + "learning_rate": 6.0826464751186994e-05, + "loss": 0.0112, + "step": 4590 + }, + { + "epoch": 2.915082382762991, + "grad_norm": 0.20703135430812836, + "learning_rate": 6.066498153718735e-05, + "loss": 0.0117, + "step": 4600 + }, + { + "epoch": 2.9214195183776934, + "grad_norm": 0.22506844997406006, + "learning_rate": 6.05033816926583e-05, + "loss": 0.0162, + "step": 4610 + }, + { + "epoch": 2.9277566539923954, + "grad_norm": 0.30291593074798584, + "learning_rate": 6.034166698482984e-05, + "loss": 0.0147, + "step": 4620 + }, + { + "epoch": 2.9340937896070978, + "grad_norm": 0.24053440988063812, + "learning_rate": 6.017983918218812e-05, + "loss": 0.0091, + "step": 4630 + }, + { + "epoch": 2.9404309252217997, + "grad_norm": 0.27635499835014343, + "learning_rate": 6.001790005445607e-05, + "loss": 0.0154, + "step": 4640 + }, + { + "epoch": 2.9467680608365017, + "grad_norm": 0.29039266705513, + "learning_rate": 5.985585137257401e-05, + "loss": 0.0165, + "step": 4650 + }, + { + "epoch": 2.953105196451204, + "grad_norm": 0.1605728417634964, + "learning_rate": 5.969369490868042e-05, + "loss": 0.0154, + "step": 4660 + }, + { + "epoch": 2.9594423320659065, + "grad_norm": 0.2104729562997818, + "learning_rate": 5.953143243609235e-05, + "loss": 0.0224, + "step": 4670 + }, + { + "epoch": 2.9657794676806084, + "grad_norm": 0.21257384121418, + "learning_rate": 5.9369065729286245e-05, + "loss": 0.0118, + "step": 4680 + }, + { + "epoch": 2.9721166032953104, + "grad_norm": 0.2738170027732849, + "learning_rate": 5.9206596563878357e-05, + "loss": 0.0183, + "step": 4690 + }, + { + "epoch": 2.9784537389100127, + "grad_norm": 0.2844460904598236, + "learning_rate": 5.90440267166055e-05, + "loss": 0.0156, + "step": 4700 + }, + { + "epoch": 2.9847908745247147, + "grad_norm": 0.3566923439502716, + "learning_rate": 5.888135796530544e-05, + "loss": 0.0134, + "step": 4710 + }, + { + "epoch": 2.991128010139417, + "grad_norm": 0.1965407133102417, + "learning_rate": 5.871859208889759e-05, + "loss": 0.013, + "step": 4720 + }, + { + "epoch": 2.997465145754119, + "grad_norm": 0.18356238305568695, + "learning_rate": 5.85557308673635e-05, + "loss": 0.0113, + "step": 4730 + }, + { + "epoch": 3.0038022813688214, + "grad_norm": 0.23230914771556854, + "learning_rate": 5.8392776081727385e-05, + "loss": 0.0164, + "step": 4740 + }, + { + "epoch": 3.0101394169835234, + "grad_norm": 0.27572014927864075, + "learning_rate": 5.8229729514036705e-05, + "loss": 0.0113, + "step": 4750 + }, + { + "epoch": 3.016476552598226, + "grad_norm": 0.36763879656791687, + "learning_rate": 5.8066592947342555e-05, + "loss": 0.0124, + "step": 4760 + }, + { + "epoch": 3.0228136882129277, + "grad_norm": 0.18714648485183716, + "learning_rate": 5.7903368165680327e-05, + "loss": 0.0143, + "step": 4770 + }, + { + "epoch": 3.02915082382763, + "grad_norm": 0.22437100112438202, + "learning_rate": 5.7740056954050084e-05, + "loss": 0.0176, + "step": 4780 + }, + { + "epoch": 3.035487959442332, + "grad_norm": 0.33260634541511536, + "learning_rate": 5.757666109839702e-05, + "loss": 0.0195, + "step": 4790 + }, + { + "epoch": 3.041825095057034, + "grad_norm": 0.29105865955352783, + "learning_rate": 5.74131823855921e-05, + "loss": 0.015, + "step": 4800 + }, + { + "epoch": 3.0481622306717364, + "grad_norm": 0.20448502898216248, + "learning_rate": 5.72496226034123e-05, + "loss": 0.0141, + "step": 4810 + }, + { + "epoch": 3.0544993662864384, + "grad_norm": 0.2469986230134964, + "learning_rate": 5.7085983540521216e-05, + "loss": 0.0117, + "step": 4820 + }, + { + "epoch": 3.0608365019011408, + "grad_norm": 0.2351512908935547, + "learning_rate": 5.692226698644938e-05, + "loss": 0.0138, + "step": 4830 + }, + { + "epoch": 3.0671736375158427, + "grad_norm": 0.2490169107913971, + "learning_rate": 5.675847473157485e-05, + "loss": 0.0144, + "step": 4840 + }, + { + "epoch": 3.073510773130545, + "grad_norm": 0.3461778461933136, + "learning_rate": 5.6594608567103456e-05, + "loss": 0.0126, + "step": 4850 + }, + { + "epoch": 3.079847908745247, + "grad_norm": 0.28383558988571167, + "learning_rate": 5.6430670285049314e-05, + "loss": 0.0117, + "step": 4860 + }, + { + "epoch": 3.0861850443599494, + "grad_norm": 0.25864022970199585, + "learning_rate": 5.6266661678215216e-05, + "loss": 0.0146, + "step": 4870 + }, + { + "epoch": 3.0925221799746514, + "grad_norm": 0.4212886095046997, + "learning_rate": 5.6102584540173006e-05, + "loss": 0.0135, + "step": 4880 + }, + { + "epoch": 3.098859315589354, + "grad_norm": 0.2258593887090683, + "learning_rate": 5.5938440665244006e-05, + "loss": 0.0164, + "step": 4890 + }, + { + "epoch": 3.1051964512040557, + "grad_norm": 0.3095914423465729, + "learning_rate": 5.577423184847932e-05, + "loss": 0.0134, + "step": 4900 + }, + { + "epoch": 3.111533586818758, + "grad_norm": 0.14614951610565186, + "learning_rate": 5.560995988564023e-05, + "loss": 0.0131, + "step": 4910 + }, + { + "epoch": 3.11787072243346, + "grad_norm": 0.30523428320884705, + "learning_rate": 5.544562657317863e-05, + "loss": 0.015, + "step": 4920 + }, + { + "epoch": 3.124207858048162, + "grad_norm": 0.17284521460533142, + "learning_rate": 5.52812337082173e-05, + "loss": 0.0126, + "step": 4930 + }, + { + "epoch": 3.1305449936628644, + "grad_norm": 0.33044707775115967, + "learning_rate": 5.511678308853026e-05, + "loss": 0.0142, + "step": 4940 + }, + { + "epoch": 3.1368821292775664, + "grad_norm": 0.313587486743927, + "learning_rate": 5.495227651252315e-05, + "loss": 0.011, + "step": 4950 + }, + { + "epoch": 3.1432192648922688, + "grad_norm": 0.3139721751213074, + "learning_rate": 5.478771577921351e-05, + "loss": 0.0132, + "step": 4960 + }, + { + "epoch": 3.1495564005069707, + "grad_norm": 0.313909649848938, + "learning_rate": 5.462310268821118e-05, + "loss": 0.0129, + "step": 4970 + }, + { + "epoch": 3.155893536121673, + "grad_norm": 0.17512694001197815, + "learning_rate": 5.445843903969854e-05, + "loss": 0.0102, + "step": 4980 + }, + { + "epoch": 3.162230671736375, + "grad_norm": 0.21401318907737732, + "learning_rate": 5.4293726634410855e-05, + "loss": 0.012, + "step": 4990 + }, + { + "epoch": 3.1685678073510775, + "grad_norm": 0.15441134572029114, + "learning_rate": 5.4128967273616625e-05, + "loss": 0.0119, + "step": 5000 + }, + { + "epoch": 3.1749049429657794, + "grad_norm": 0.23418472707271576, + "learning_rate": 5.396416275909779e-05, + "loss": 0.015, + "step": 5010 + }, + { + "epoch": 3.181242078580482, + "grad_norm": 0.2487867772579193, + "learning_rate": 5.379931489313016e-05, + "loss": 0.0133, + "step": 5020 + }, + { + "epoch": 3.1875792141951838, + "grad_norm": 0.2844524681568146, + "learning_rate": 5.363442547846356e-05, + "loss": 0.014, + "step": 5030 + }, + { + "epoch": 3.1939163498098857, + "grad_norm": 0.20346881449222565, + "learning_rate": 5.3469496318302204e-05, + "loss": 0.0114, + "step": 5040 + }, + { + "epoch": 3.200253485424588, + "grad_norm": 0.3049124479293823, + "learning_rate": 5.330452921628497e-05, + "loss": 0.0153, + "step": 5050 + }, + { + "epoch": 3.20659062103929, + "grad_norm": 0.29535266757011414, + "learning_rate": 5.313952597646568e-05, + "loss": 0.0132, + "step": 5060 + }, + { + "epoch": 3.2129277566539924, + "grad_norm": 0.3959789276123047, + "learning_rate": 5.297448840329329e-05, + "loss": 0.0157, + "step": 5070 + }, + { + "epoch": 3.2192648922686944, + "grad_norm": 0.23339158296585083, + "learning_rate": 5.280941830159227e-05, + "loss": 0.0128, + "step": 5080 + }, + { + "epoch": 3.225602027883397, + "grad_norm": 0.2398107349872589, + "learning_rate": 5.264431747654284e-05, + "loss": 0.0137, + "step": 5090 + }, + { + "epoch": 3.2319391634980987, + "grad_norm": 0.24322153627872467, + "learning_rate": 5.247918773366112e-05, + "loss": 0.0184, + "step": 5100 + }, + { + "epoch": 3.238276299112801, + "grad_norm": 0.15239882469177246, + "learning_rate": 5.231403087877955e-05, + "loss": 0.0161, + "step": 5110 + }, + { + "epoch": 3.244613434727503, + "grad_norm": 0.31975507736206055, + "learning_rate": 5.214884871802703e-05, + "loss": 0.0147, + "step": 5120 + }, + { + "epoch": 3.2509505703422055, + "grad_norm": 0.22015072405338287, + "learning_rate": 5.198364305780922e-05, + "loss": 0.0105, + "step": 5130 + }, + { + "epoch": 3.2572877059569074, + "grad_norm": 0.34388023614883423, + "learning_rate": 5.1818415704788725e-05, + "loss": 0.0156, + "step": 5140 + }, + { + "epoch": 3.26362484157161, + "grad_norm": 0.3033609092235565, + "learning_rate": 5.165316846586541e-05, + "loss": 0.0148, + "step": 5150 + }, + { + "epoch": 3.2699619771863118, + "grad_norm": 0.1909927874803543, + "learning_rate": 5.148790314815663e-05, + "loss": 0.0123, + "step": 5160 + }, + { + "epoch": 3.2762991128010137, + "grad_norm": 0.23113128542900085, + "learning_rate": 5.132262155897739e-05, + "loss": 0.0164, + "step": 5170 + }, + { + "epoch": 3.282636248415716, + "grad_norm": 0.19848641753196716, + "learning_rate": 5.1157325505820694e-05, + "loss": 0.0173, + "step": 5180 + }, + { + "epoch": 3.288973384030418, + "grad_norm": 0.27411961555480957, + "learning_rate": 5.0992016796337686e-05, + "loss": 0.0105, + "step": 5190 + }, + { + "epoch": 3.2953105196451205, + "grad_norm": 0.19896908104419708, + "learning_rate": 5.0826697238317935e-05, + "loss": 0.0153, + "step": 5200 + }, + { + "epoch": 3.3016476552598224, + "grad_norm": 0.20173603296279907, + "learning_rate": 5.066136863966963e-05, + "loss": 0.0121, + "step": 5210 + }, + { + "epoch": 3.307984790874525, + "grad_norm": 0.1532195508480072, + "learning_rate": 5.0496032808399815e-05, + "loss": 0.0116, + "step": 5220 + }, + { + "epoch": 3.3143219264892267, + "grad_norm": 0.14388404786586761, + "learning_rate": 5.033069155259471e-05, + "loss": 0.0134, + "step": 5230 + }, + { + "epoch": 3.320659062103929, + "grad_norm": 0.2755822241306305, + "learning_rate": 5.016534668039976e-05, + "loss": 0.0103, + "step": 5240 + }, + { + "epoch": 3.326996197718631, + "grad_norm": 0.2270147204399109, + "learning_rate": 5e-05, + "loss": 0.0093, + "step": 5250 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.17542065680027008, + "learning_rate": 4.9834653319600246e-05, + "loss": 0.0119, + "step": 5260 + }, + { + "epoch": 3.3396704689480354, + "grad_norm": 0.26830822229385376, + "learning_rate": 4.96693084474053e-05, + "loss": 0.0139, + "step": 5270 + }, + { + "epoch": 3.346007604562738, + "grad_norm": 0.23319189250469208, + "learning_rate": 4.950396719160018e-05, + "loss": 0.0115, + "step": 5280 + }, + { + "epoch": 3.3523447401774398, + "grad_norm": 0.28234973549842834, + "learning_rate": 4.93386313603304e-05, + "loss": 0.0096, + "step": 5290 + }, + { + "epoch": 3.3586818757921417, + "grad_norm": 0.2358221560716629, + "learning_rate": 4.917330276168208e-05, + "loss": 0.0126, + "step": 5300 + }, + { + "epoch": 3.365019011406844, + "grad_norm": 0.29058966040611267, + "learning_rate": 4.9007983203662326e-05, + "loss": 0.0281, + "step": 5310 + }, + { + "epoch": 3.371356147021546, + "grad_norm": 0.23800086975097656, + "learning_rate": 4.884267449417931e-05, + "loss": 0.0177, + "step": 5320 + }, + { + "epoch": 3.3776932826362485, + "grad_norm": 0.26259422302246094, + "learning_rate": 4.867737844102261e-05, + "loss": 0.014, + "step": 5330 + }, + { + "epoch": 3.3840304182509504, + "grad_norm": 0.2746807336807251, + "learning_rate": 4.851209685184338e-05, + "loss": 0.0102, + "step": 5340 + }, + { + "epoch": 3.390367553865653, + "grad_norm": 0.46735528111457825, + "learning_rate": 4.834683153413459e-05, + "loss": 0.0207, + "step": 5350 + }, + { + "epoch": 3.3967046894803548, + "grad_norm": 0.2719608247280121, + "learning_rate": 4.818158429521129e-05, + "loss": 0.0129, + "step": 5360 + }, + { + "epoch": 3.403041825095057, + "grad_norm": 0.28547102212905884, + "learning_rate": 4.801635694219079e-05, + "loss": 0.0101, + "step": 5370 + }, + { + "epoch": 3.409378960709759, + "grad_norm": 0.37190404534339905, + "learning_rate": 4.785115128197298e-05, + "loss": 0.0156, + "step": 5380 + }, + { + "epoch": 3.4157160963244615, + "grad_norm": 0.3981722295284271, + "learning_rate": 4.7685969121220456e-05, + "loss": 0.0119, + "step": 5390 + }, + { + "epoch": 3.4220532319391634, + "grad_norm": 0.325492262840271, + "learning_rate": 4.7520812266338885e-05, + "loss": 0.0135, + "step": 5400 + }, + { + "epoch": 3.428390367553866, + "grad_norm": 0.24744857847690582, + "learning_rate": 4.735568252345718e-05, + "loss": 0.0152, + "step": 5410 + }, + { + "epoch": 3.434727503168568, + "grad_norm": 0.15038323402404785, + "learning_rate": 4.7190581698407725e-05, + "loss": 0.0161, + "step": 5420 + }, + { + "epoch": 3.4410646387832697, + "grad_norm": 0.2310582995414734, + "learning_rate": 4.702551159670672e-05, + "loss": 0.0121, + "step": 5430 + }, + { + "epoch": 3.447401774397972, + "grad_norm": 0.17245414853096008, + "learning_rate": 4.6860474023534335e-05, + "loss": 0.0127, + "step": 5440 + }, + { + "epoch": 3.453738910012674, + "grad_norm": 0.2583564817905426, + "learning_rate": 4.669547078371504e-05, + "loss": 0.0171, + "step": 5450 + }, + { + "epoch": 3.4600760456273765, + "grad_norm": 0.3162192702293396, + "learning_rate": 4.65305036816978e-05, + "loss": 0.0113, + "step": 5460 + }, + { + "epoch": 3.4664131812420784, + "grad_norm": 0.29524022340774536, + "learning_rate": 4.6365574521536445e-05, + "loss": 0.0126, + "step": 5470 + }, + { + "epoch": 3.472750316856781, + "grad_norm": 0.14694812893867493, + "learning_rate": 4.620068510686985e-05, + "loss": 0.0119, + "step": 5480 + }, + { + "epoch": 3.4790874524714828, + "grad_norm": 0.1337551772594452, + "learning_rate": 4.60358372409022e-05, + "loss": 0.0088, + "step": 5490 + }, + { + "epoch": 3.485424588086185, + "grad_norm": 0.3237602114677429, + "learning_rate": 4.5871032726383386e-05, + "loss": 0.0091, + "step": 5500 + }, + { + "epoch": 3.491761723700887, + "grad_norm": 0.1705574095249176, + "learning_rate": 4.570627336558915e-05, + "loss": 0.0097, + "step": 5510 + }, + { + "epoch": 3.4980988593155895, + "grad_norm": 0.2950294613838196, + "learning_rate": 4.554156096030149e-05, + "loss": 0.0117, + "step": 5520 + }, + { + "epoch": 3.5044359949302915, + "grad_norm": 0.27918121218681335, + "learning_rate": 4.537689731178883e-05, + "loss": 0.0104, + "step": 5530 + }, + { + "epoch": 3.510773130544994, + "grad_norm": 0.2752488851547241, + "learning_rate": 4.5212284220786494e-05, + "loss": 0.009, + "step": 5540 + }, + { + "epoch": 3.517110266159696, + "grad_norm": 0.24830137193202972, + "learning_rate": 4.504772348747687e-05, + "loss": 0.0133, + "step": 5550 + }, + { + "epoch": 3.5234474017743977, + "grad_norm": 0.38127654790878296, + "learning_rate": 4.488321691146975e-05, + "loss": 0.0142, + "step": 5560 + }, + { + "epoch": 3.5297845373891, + "grad_norm": 0.3115534782409668, + "learning_rate": 4.471876629178273e-05, + "loss": 0.0158, + "step": 5570 + }, + { + "epoch": 3.5361216730038025, + "grad_norm": 0.18441027402877808, + "learning_rate": 4.4554373426821374e-05, + "loss": 0.0087, + "step": 5580 + }, + { + "epoch": 3.5424588086185045, + "grad_norm": 0.19032280147075653, + "learning_rate": 4.439004011435979e-05, + "loss": 0.0128, + "step": 5590 + }, + { + "epoch": 3.5487959442332064, + "grad_norm": 0.16672304272651672, + "learning_rate": 4.4225768151520694e-05, + "loss": 0.0182, + "step": 5600 + }, + { + "epoch": 3.555133079847909, + "grad_norm": 0.24737948179244995, + "learning_rate": 4.406155933475599e-05, + "loss": 0.0112, + "step": 5610 + }, + { + "epoch": 3.5614702154626108, + "grad_norm": 0.17746677994728088, + "learning_rate": 4.3897415459827e-05, + "loss": 0.0151, + "step": 5620 + }, + { + "epoch": 3.567807351077313, + "grad_norm": 0.2580298185348511, + "learning_rate": 4.373333832178478e-05, + "loss": 0.0092, + "step": 5630 + }, + { + "epoch": 3.574144486692015, + "grad_norm": 0.24834871292114258, + "learning_rate": 4.3569329714950704e-05, + "loss": 0.0133, + "step": 5640 + }, + { + "epoch": 3.5804816223067175, + "grad_norm": 0.23635822534561157, + "learning_rate": 4.3405391432896555e-05, + "loss": 0.0132, + "step": 5650 + }, + { + "epoch": 3.5868187579214195, + "grad_norm": 0.2585609555244446, + "learning_rate": 4.324152526842517e-05, + "loss": 0.014, + "step": 5660 + }, + { + "epoch": 3.593155893536122, + "grad_norm": 0.2859804034233093, + "learning_rate": 4.307773301355062e-05, + "loss": 0.0101, + "step": 5670 + }, + { + "epoch": 3.599493029150824, + "grad_norm": 0.18319383263587952, + "learning_rate": 4.291401645947879e-05, + "loss": 0.0202, + "step": 5680 + }, + { + "epoch": 3.6058301647655258, + "grad_norm": 0.2706187665462494, + "learning_rate": 4.275037739658771e-05, + "loss": 0.0067, + "step": 5690 + }, + { + "epoch": 3.612167300380228, + "grad_norm": 0.2773619592189789, + "learning_rate": 4.2586817614407895e-05, + "loss": 0.0114, + "step": 5700 + }, + { + "epoch": 3.6185044359949305, + "grad_norm": 0.24154767394065857, + "learning_rate": 4.2423338901602985e-05, + "loss": 0.0178, + "step": 5710 + }, + { + "epoch": 3.6248415716096325, + "grad_norm": 0.1268390715122223, + "learning_rate": 4.2259943045949934e-05, + "loss": 0.0098, + "step": 5720 + }, + { + "epoch": 3.6311787072243344, + "grad_norm": 0.14447900652885437, + "learning_rate": 4.209663183431969e-05, + "loss": 0.0121, + "step": 5730 + }, + { + "epoch": 3.637515842839037, + "grad_norm": 0.18617920577526093, + "learning_rate": 4.1933407052657456e-05, + "loss": 0.0092, + "step": 5740 + }, + { + "epoch": 3.643852978453739, + "grad_norm": 0.3276727497577667, + "learning_rate": 4.17702704859633e-05, + "loss": 0.0109, + "step": 5750 + }, + { + "epoch": 3.650190114068441, + "grad_norm": 0.20072652399539948, + "learning_rate": 4.160722391827262e-05, + "loss": 0.0106, + "step": 5760 + }, + { + "epoch": 3.656527249683143, + "grad_norm": 0.2064548283815384, + "learning_rate": 4.14442691326365e-05, + "loss": 0.0097, + "step": 5770 + }, + { + "epoch": 3.6628643852978455, + "grad_norm": 0.20051850378513336, + "learning_rate": 4.1281407911102425e-05, + "loss": 0.0127, + "step": 5780 + }, + { + "epoch": 3.6692015209125475, + "grad_norm": 0.27055758237838745, + "learning_rate": 4.111864203469457e-05, + "loss": 0.0125, + "step": 5790 + }, + { + "epoch": 3.67553865652725, + "grad_norm": 0.3249766230583191, + "learning_rate": 4.095597328339452e-05, + "loss": 0.013, + "step": 5800 + }, + { + "epoch": 3.681875792141952, + "grad_norm": 0.36412471532821655, + "learning_rate": 4.079340343612165e-05, + "loss": 0.0135, + "step": 5810 + }, + { + "epoch": 3.6882129277566538, + "grad_norm": 0.43938231468200684, + "learning_rate": 4.063093427071376e-05, + "loss": 0.0142, + "step": 5820 + }, + { + "epoch": 3.694550063371356, + "grad_norm": 0.2211577147245407, + "learning_rate": 4.046856756390767e-05, + "loss": 0.0092, + "step": 5830 + }, + { + "epoch": 3.7008871989860586, + "grad_norm": 0.2317885160446167, + "learning_rate": 4.0306305091319595e-05, + "loss": 0.0103, + "step": 5840 + }, + { + "epoch": 3.7072243346007605, + "grad_norm": 0.30124250054359436, + "learning_rate": 4.0144148627425993e-05, + "loss": 0.0128, + "step": 5850 + }, + { + "epoch": 3.7135614702154625, + "grad_norm": 0.22802747786045074, + "learning_rate": 3.9982099945543945e-05, + "loss": 0.0092, + "step": 5860 + }, + { + "epoch": 3.719898605830165, + "grad_norm": 0.3372552990913391, + "learning_rate": 3.982016081781189e-05, + "loss": 0.009, + "step": 5870 + }, + { + "epoch": 3.726235741444867, + "grad_norm": 0.3483387529850006, + "learning_rate": 3.965833301517017e-05, + "loss": 0.015, + "step": 5880 + }, + { + "epoch": 3.732572877059569, + "grad_norm": 0.17584814131259918, + "learning_rate": 3.949661830734172e-05, + "loss": 0.0077, + "step": 5890 + }, + { + "epoch": 3.738910012674271, + "grad_norm": 0.26330336928367615, + "learning_rate": 3.933501846281267e-05, + "loss": 0.0101, + "step": 5900 + }, + { + "epoch": 3.7452471482889735, + "grad_norm": 0.19230346381664276, + "learning_rate": 3.917353524881302e-05, + "loss": 0.014, + "step": 5910 + }, + { + "epoch": 3.7515842839036755, + "grad_norm": 0.19629403948783875, + "learning_rate": 3.901217043129735e-05, + "loss": 0.0124, + "step": 5920 + }, + { + "epoch": 3.757921419518378, + "grad_norm": 0.22321918606758118, + "learning_rate": 3.8850925774925425e-05, + "loss": 0.0116, + "step": 5930 + }, + { + "epoch": 3.76425855513308, + "grad_norm": 0.20382849872112274, + "learning_rate": 3.8689803043043e-05, + "loss": 0.0142, + "step": 5940 + }, + { + "epoch": 3.770595690747782, + "grad_norm": 0.16952167451381683, + "learning_rate": 3.852880399766243e-05, + "loss": 0.0125, + "step": 5950 + }, + { + "epoch": 3.776932826362484, + "grad_norm": 0.23627594113349915, + "learning_rate": 3.836793039944349e-05, + "loss": 0.0132, + "step": 5960 + }, + { + "epoch": 3.7832699619771866, + "grad_norm": 0.20818836987018585, + "learning_rate": 3.820718400767409e-05, + "loss": 0.0121, + "step": 5970 + }, + { + "epoch": 3.7896070975918885, + "grad_norm": 0.37023457884788513, + "learning_rate": 3.8046566580251e-05, + "loss": 0.0093, + "step": 5980 + }, + { + "epoch": 3.7959442332065905, + "grad_norm": 0.24624159932136536, + "learning_rate": 3.788607987366069e-05, + "loss": 0.0105, + "step": 5990 + }, + { + "epoch": 3.802281368821293, + "grad_norm": 0.2265588641166687, + "learning_rate": 3.772572564296005e-05, + "loss": 0.0148, + "step": 6000 + }, + { + "epoch": 3.808618504435995, + "grad_norm": 0.24321384727954865, + "learning_rate": 3.756550564175727e-05, + "loss": 0.014, + "step": 6010 + }, + { + "epoch": 3.814955640050697, + "grad_norm": 0.15189670026302338, + "learning_rate": 3.74054216221926e-05, + "loss": 0.0112, + "step": 6020 + }, + { + "epoch": 3.821292775665399, + "grad_norm": 0.366676926612854, + "learning_rate": 3.7245475334919246e-05, + "loss": 0.0163, + "step": 6030 + }, + { + "epoch": 3.8276299112801015, + "grad_norm": 0.22733189165592194, + "learning_rate": 3.7085668529084184e-05, + "loss": 0.014, + "step": 6040 + }, + { + "epoch": 3.8339670468948035, + "grad_norm": 0.17254877090454102, + "learning_rate": 3.6926002952309016e-05, + "loss": 0.0112, + "step": 6050 + }, + { + "epoch": 3.840304182509506, + "grad_norm": 0.22515353560447693, + "learning_rate": 3.676648035067093e-05, + "loss": 0.0113, + "step": 6060 + }, + { + "epoch": 3.846641318124208, + "grad_norm": 0.239216610789299, + "learning_rate": 3.6607102468683526e-05, + "loss": 0.011, + "step": 6070 + }, + { + "epoch": 3.85297845373891, + "grad_norm": 0.11387116461992264, + "learning_rate": 3.6447871049277796e-05, + "loss": 0.0072, + "step": 6080 + }, + { + "epoch": 3.859315589353612, + "grad_norm": 0.18705078959465027, + "learning_rate": 3.628878783378302e-05, + "loss": 0.0118, + "step": 6090 + }, + { + "epoch": 3.8656527249683146, + "grad_norm": 0.1804051548242569, + "learning_rate": 3.612985456190778e-05, + "loss": 0.0134, + "step": 6100 + }, + { + "epoch": 3.8719898605830165, + "grad_norm": 0.4679585099220276, + "learning_rate": 3.597107297172084e-05, + "loss": 0.011, + "step": 6110 + }, + { + "epoch": 3.8783269961977185, + "grad_norm": 0.14044460654258728, + "learning_rate": 3.581244479963225e-05, + "loss": 0.0106, + "step": 6120 + }, + { + "epoch": 3.884664131812421, + "grad_norm": 0.332378089427948, + "learning_rate": 3.5653971780374295e-05, + "loss": 0.0166, + "step": 6130 + }, + { + "epoch": 3.891001267427123, + "grad_norm": 0.2639565169811249, + "learning_rate": 3.5495655646982505e-05, + "loss": 0.0139, + "step": 6140 + }, + { + "epoch": 3.897338403041825, + "grad_norm": 0.2334318906068802, + "learning_rate": 3.533749813077677e-05, + "loss": 0.0137, + "step": 6150 + }, + { + "epoch": 3.903675538656527, + "grad_norm": 0.3366606831550598, + "learning_rate": 3.517950096134232e-05, + "loss": 0.0121, + "step": 6160 + }, + { + "epoch": 3.9100126742712296, + "grad_norm": 0.32218649983406067, + "learning_rate": 3.5021665866510925e-05, + "loss": 0.0113, + "step": 6170 + }, + { + "epoch": 3.9163498098859315, + "grad_norm": 0.14572682976722717, + "learning_rate": 3.4863994572341843e-05, + "loss": 0.0147, + "step": 6180 + }, + { + "epoch": 3.922686945500634, + "grad_norm": 0.2537533938884735, + "learning_rate": 3.470648880310313e-05, + "loss": 0.011, + "step": 6190 + }, + { + "epoch": 3.929024081115336, + "grad_norm": 0.25570058822631836, + "learning_rate": 3.4549150281252636e-05, + "loss": 0.0107, + "step": 6200 + }, + { + "epoch": 3.935361216730038, + "grad_norm": 0.18532606959342957, + "learning_rate": 3.439198072741921e-05, + "loss": 0.0109, + "step": 6210 + }, + { + "epoch": 3.94169835234474, + "grad_norm": 0.1890391707420349, + "learning_rate": 3.423498186038393e-05, + "loss": 0.0168, + "step": 6220 + }, + { + "epoch": 3.9480354879594426, + "grad_norm": 0.17653165757656097, + "learning_rate": 3.407815539706124e-05, + "loss": 0.01, + "step": 6230 + }, + { + "epoch": 3.9543726235741445, + "grad_norm": 0.127501979470253, + "learning_rate": 3.392150305248024e-05, + "loss": 0.0122, + "step": 6240 + }, + { + "epoch": 3.9607097591888465, + "grad_norm": 0.16189290583133698, + "learning_rate": 3.3765026539765834e-05, + "loss": 0.0114, + "step": 6250 + }, + { + "epoch": 3.967046894803549, + "grad_norm": 0.17998936772346497, + "learning_rate": 3.360872757012011e-05, + "loss": 0.0148, + "step": 6260 + }, + { + "epoch": 3.973384030418251, + "grad_norm": 0.17054511606693268, + "learning_rate": 3.3452607852803584e-05, + "loss": 0.0112, + "step": 6270 + }, + { + "epoch": 3.9797211660329532, + "grad_norm": 0.2786466181278229, + "learning_rate": 3.329666909511645e-05, + "loss": 0.0123, + "step": 6280 + }, + { + "epoch": 3.986058301647655, + "grad_norm": 0.21183429658412933, + "learning_rate": 3.3140913002379995e-05, + "loss": 0.0099, + "step": 6290 + }, + { + "epoch": 3.9923954372623576, + "grad_norm": 0.2610870897769928, + "learning_rate": 3.298534127791785e-05, + "loss": 0.0136, + "step": 6300 + }, + { + "epoch": 3.9987325728770595, + "grad_norm": 0.22637638449668884, + "learning_rate": 3.282995562303754e-05, + "loss": 0.011, + "step": 6310 + }, + { + "epoch": 4.005069708491762, + "grad_norm": 0.18758799135684967, + "learning_rate": 3.267475773701161e-05, + "loss": 0.009, + "step": 6320 + }, + { + "epoch": 4.011406844106464, + "grad_norm": 0.22371801733970642, + "learning_rate": 3.251974931705933e-05, + "loss": 0.0107, + "step": 6330 + }, + { + "epoch": 4.017743979721166, + "grad_norm": 0.22646395862102509, + "learning_rate": 3.236493205832795e-05, + "loss": 0.0117, + "step": 6340 + }, + { + "epoch": 4.024081115335868, + "grad_norm": 0.19698502123355865, + "learning_rate": 3.221030765387417e-05, + "loss": 0.0119, + "step": 6350 + }, + { + "epoch": 4.030418250950571, + "grad_norm": 0.22287245094776154, + "learning_rate": 3.205587779464576e-05, + "loss": 0.0093, + "step": 6360 + }, + { + "epoch": 4.0367553865652726, + "grad_norm": 0.27699437737464905, + "learning_rate": 3.190164416946285e-05, + "loss": 0.0093, + "step": 6370 + }, + { + "epoch": 4.0430925221799745, + "grad_norm": 0.1392444372177124, + "learning_rate": 3.1747608464999725e-05, + "loss": 0.0072, + "step": 6380 + }, + { + "epoch": 4.0494296577946765, + "grad_norm": 0.2287701815366745, + "learning_rate": 3.1593772365766105e-05, + "loss": 0.0154, + "step": 6390 + }, + { + "epoch": 4.055766793409379, + "grad_norm": 0.24777956306934357, + "learning_rate": 3.144013755408895e-05, + "loss": 0.012, + "step": 6400 + }, + { + "epoch": 4.062103929024081, + "grad_norm": 0.1426210254430771, + "learning_rate": 3.128670571009399e-05, + "loss": 0.012, + "step": 6410 + }, + { + "epoch": 4.068441064638783, + "grad_norm": 0.11875156313180923, + "learning_rate": 3.113347851168721e-05, + "loss": 0.0088, + "step": 6420 + }, + { + "epoch": 4.074778200253485, + "grad_norm": 0.1875237077474594, + "learning_rate": 3.098045763453678e-05, + "loss": 0.0074, + "step": 6430 + }, + { + "epoch": 4.081115335868188, + "grad_norm": 0.09495358169078827, + "learning_rate": 3.082764475205442e-05, + "loss": 0.0095, + "step": 6440 + }, + { + "epoch": 4.08745247148289, + "grad_norm": 0.2620109021663666, + "learning_rate": 3.0675041535377405e-05, + "loss": 0.014, + "step": 6450 + }, + { + "epoch": 4.093789607097592, + "grad_norm": 0.30358102917671204, + "learning_rate": 3.052264965335e-05, + "loss": 0.0068, + "step": 6460 + }, + { + "epoch": 4.100126742712294, + "grad_norm": 0.2840622663497925, + "learning_rate": 3.0370470772505433e-05, + "loss": 0.008, + "step": 6470 + }, + { + "epoch": 4.106463878326996, + "grad_norm": 0.12551774084568024, + "learning_rate": 3.0218506557047598e-05, + "loss": 0.0069, + "step": 6480 + }, + { + "epoch": 4.112801013941699, + "grad_norm": 0.2494884580373764, + "learning_rate": 3.006675866883275e-05, + "loss": 0.0076, + "step": 6490 + }, + { + "epoch": 4.119138149556401, + "grad_norm": 0.11765393614768982, + "learning_rate": 2.991522876735154e-05, + "loss": 0.0065, + "step": 6500 + } + ], + "logging_steps": 10, + "max_steps": 10000, + "num_input_tokens_seen": 0, + "num_train_epochs": 7, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.278893812003424e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}