{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 2020, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04950495049504951, "grad_norm": 8.046815872192383, "learning_rate": 9.900990099009901e-06, "loss": 1.1392, "step": 10 }, { "epoch": 0.09900990099009901, "grad_norm": 4.050646781921387, "learning_rate": 1.9801980198019803e-05, "loss": 0.6438, "step": 20 }, { "epoch": 0.1485148514851485, "grad_norm": 1.2656201124191284, "learning_rate": 2.9702970297029702e-05, "loss": 0.3058, "step": 30 }, { "epoch": 0.19801980198019803, "grad_norm": 1.2472892999649048, "learning_rate": 3.9603960396039605e-05, "loss": 0.2381, "step": 40 }, { "epoch": 0.24752475247524752, "grad_norm": 1.2082743644714355, "learning_rate": 4.950495049504951e-05, "loss": 0.1576, "step": 50 }, { "epoch": 0.297029702970297, "grad_norm": 0.9364326596260071, "learning_rate": 5.9405940594059404e-05, "loss": 0.1169, "step": 60 }, { "epoch": 0.3465346534653465, "grad_norm": 1.3442914485931396, "learning_rate": 6.93069306930693e-05, "loss": 0.1142, "step": 70 }, { "epoch": 0.39603960396039606, "grad_norm": 0.5791213512420654, "learning_rate": 7.920792079207921e-05, "loss": 0.0886, "step": 80 }, { "epoch": 0.44554455445544555, "grad_norm": 0.8255408406257629, "learning_rate": 8.910891089108912e-05, "loss": 0.0945, "step": 90 }, { "epoch": 0.49504950495049505, "grad_norm": 0.7826765775680542, "learning_rate": 9.900990099009902e-05, "loss": 0.0805, "step": 100 }, { "epoch": 0.5445544554455446, "grad_norm": 0.546211302280426, "learning_rate": 9.999457291071173e-05, "loss": 0.0766, "step": 110 }, { "epoch": 0.594059405940594, "grad_norm": 0.7814540863037109, "learning_rate": 9.997581411459941e-05, "loss": 0.0597, "step": 120 }, { "epoch": 0.6435643564356436, "grad_norm": 0.5694003701210022, "learning_rate": 9.994366163678231e-05, "loss": 0.0552, "step": 130 }, { "epoch": 0.693069306930693, "grad_norm": 0.6408165097236633, "learning_rate": 9.98981240942297e-05, "loss": 0.065, "step": 140 }, { "epoch": 0.7425742574257426, "grad_norm": 0.6475124955177307, "learning_rate": 9.983921369115196e-05, "loss": 0.0528, "step": 150 }, { "epoch": 0.7920792079207921, "grad_norm": 0.4199792742729187, "learning_rate": 9.976694621572991e-05, "loss": 0.0563, "step": 160 }, { "epoch": 0.8415841584158416, "grad_norm": 0.44479602575302124, "learning_rate": 9.96813410358834e-05, "loss": 0.0443, "step": 170 }, { "epoch": 0.8910891089108911, "grad_norm": 0.5495424866676331, "learning_rate": 9.958242109408072e-05, "loss": 0.0549, "step": 180 }, { "epoch": 0.9405940594059405, "grad_norm": 0.533566951751709, "learning_rate": 9.947021290118996e-05, "loss": 0.0515, "step": 190 }, { "epoch": 0.9900990099009901, "grad_norm": 0.3997371792793274, "learning_rate": 9.934474652937395e-05, "loss": 0.0488, "step": 200 }, { "epoch": 1.0396039603960396, "grad_norm": 0.5989133715629578, "learning_rate": 9.920605560403089e-05, "loss": 0.0449, "step": 210 }, { "epoch": 1.0891089108910892, "grad_norm": 0.4750981032848358, "learning_rate": 9.905417729478255e-05, "loss": 0.0428, "step": 220 }, { "epoch": 1.1386138613861387, "grad_norm": 0.494711697101593, "learning_rate": 9.888915230551276e-05, "loss": 0.0393, "step": 230 }, { "epoch": 1.188118811881188, "grad_norm": 0.4073871970176697, "learning_rate": 9.871102486345862e-05, "loss": 0.044, "step": 240 }, { "epoch": 1.2376237623762376, "grad_norm": 0.48300009965896606, "learning_rate": 9.851984270735742e-05, "loss": 0.0378, "step": 250 }, { "epoch": 1.2871287128712872, "grad_norm": 0.5179994702339172, "learning_rate": 9.831565707465256e-05, "loss": 0.0363, "step": 260 }, { "epoch": 1.3366336633663367, "grad_norm": 0.5359159111976624, "learning_rate": 9.809852268776172e-05, "loss": 0.045, "step": 270 }, { "epoch": 1.386138613861386, "grad_norm": 0.37923434376716614, "learning_rate": 9.786849773941103e-05, "loss": 0.0449, "step": 280 }, { "epoch": 1.4356435643564356, "grad_norm": 0.5288142561912537, "learning_rate": 9.762564387703929e-05, "loss": 0.0416, "step": 290 }, { "epoch": 1.4851485148514851, "grad_norm": 0.5554555058479309, "learning_rate": 9.737002618627626e-05, "loss": 0.036, "step": 300 }, { "epoch": 1.5346534653465347, "grad_norm": 0.3759332299232483, "learning_rate": 9.710171317349945e-05, "loss": 0.0447, "step": 310 }, { "epoch": 1.5841584158415842, "grad_norm": 0.5597942471504211, "learning_rate": 9.682077674747421e-05, "loss": 0.0442, "step": 320 }, { "epoch": 1.6336633663366338, "grad_norm": 0.461290180683136, "learning_rate": 9.652729220008194e-05, "loss": 0.0377, "step": 330 }, { "epoch": 1.6831683168316833, "grad_norm": 0.3928579092025757, "learning_rate": 9.622133818614165e-05, "loss": 0.038, "step": 340 }, { "epoch": 1.7326732673267327, "grad_norm": 0.344993531703949, "learning_rate": 9.590299670233012e-05, "loss": 0.033, "step": 350 }, { "epoch": 1.7821782178217822, "grad_norm": 0.3679508864879608, "learning_rate": 9.557235306520667e-05, "loss": 0.0378, "step": 360 }, { "epoch": 1.8316831683168315, "grad_norm": 0.4860198497772217, "learning_rate": 9.522949588834791e-05, "loss": 0.0311, "step": 370 }, { "epoch": 1.881188118811881, "grad_norm": 0.37521180510520935, "learning_rate": 9.487451705859908e-05, "loss": 0.0383, "step": 380 }, { "epoch": 1.9306930693069306, "grad_norm": 0.3351835608482361, "learning_rate": 9.450751171144804e-05, "loss": 0.0373, "step": 390 }, { "epoch": 1.9801980198019802, "grad_norm": 0.5943893790245056, "learning_rate": 9.41285782055287e-05, "loss": 0.0383, "step": 400 }, { "epoch": 2.0297029702970297, "grad_norm": 0.6122928857803345, "learning_rate": 9.37378180962605e-05, "loss": 0.0367, "step": 410 }, { "epoch": 2.0792079207920793, "grad_norm": 0.4920903146266937, "learning_rate": 9.333533610863132e-05, "loss": 0.0295, "step": 420 }, { "epoch": 2.128712871287129, "grad_norm": 0.32264697551727295, "learning_rate": 9.292124010913075e-05, "loss": 0.0364, "step": 430 }, { "epoch": 2.1782178217821784, "grad_norm": 0.425932914018631, "learning_rate": 9.249564107684164e-05, "loss": 0.0359, "step": 440 }, { "epoch": 2.227722772277228, "grad_norm": 0.36188560724258423, "learning_rate": 9.205865307369714e-05, "loss": 0.0323, "step": 450 }, { "epoch": 2.2772277227722775, "grad_norm": 0.4893958568572998, "learning_rate": 9.161039321391198e-05, "loss": 0.0352, "step": 460 }, { "epoch": 2.3267326732673266, "grad_norm": 0.33045169711112976, "learning_rate": 9.11509816325953e-05, "loss": 0.0287, "step": 470 }, { "epoch": 2.376237623762376, "grad_norm": 0.4212692379951477, "learning_rate": 9.068054145355406e-05, "loss": 0.0341, "step": 480 }, { "epoch": 2.4257425742574257, "grad_norm": 0.5016960501670837, "learning_rate": 9.019919875629558e-05, "loss": 0.0345, "step": 490 }, { "epoch": 2.4752475247524752, "grad_norm": 0.34293660521507263, "learning_rate": 8.970708254223768e-05, "loss": 0.0282, "step": 500 }, { "epoch": 2.5247524752475248, "grad_norm": 0.46188557147979736, "learning_rate": 8.920432470013593e-05, "loss": 0.0381, "step": 510 }, { "epoch": 2.5742574257425743, "grad_norm": 0.3944372534751892, "learning_rate": 8.869105997073703e-05, "loss": 0.0315, "step": 520 }, { "epoch": 2.623762376237624, "grad_norm": 0.3470475971698761, "learning_rate": 8.816742591066791e-05, "loss": 0.0327, "step": 530 }, { "epoch": 2.6732673267326734, "grad_norm": 0.42088139057159424, "learning_rate": 8.763356285556999e-05, "loss": 0.0386, "step": 540 }, { "epoch": 2.7227722772277225, "grad_norm": 0.5269019603729248, "learning_rate": 8.708961388248886e-05, "loss": 0.0345, "step": 550 }, { "epoch": 2.772277227722772, "grad_norm": 0.5604432821273804, "learning_rate": 8.653572477152914e-05, "loss": 0.0363, "step": 560 }, { "epoch": 2.8217821782178216, "grad_norm": 0.5375505685806274, "learning_rate": 8.597204396678485e-05, "loss": 0.0327, "step": 570 }, { "epoch": 2.871287128712871, "grad_norm": 0.3595617413520813, "learning_rate": 8.539872253655607e-05, "loss": 0.0293, "step": 580 }, { "epoch": 2.9207920792079207, "grad_norm": 0.3501165807247162, "learning_rate": 8.481591413286205e-05, "loss": 0.0316, "step": 590 }, { "epoch": 2.9702970297029703, "grad_norm": 0.2660023868083954, "learning_rate": 8.42237749502619e-05, "loss": 0.0258, "step": 600 }, { "epoch": 3.01980198019802, "grad_norm": 0.38347044587135315, "learning_rate": 8.362246368399407e-05, "loss": 0.026, "step": 610 }, { "epoch": 3.0693069306930694, "grad_norm": 0.3493596911430359, "learning_rate": 8.301214148744543e-05, "loss": 0.0279, "step": 620 }, { "epoch": 3.118811881188119, "grad_norm": 0.3242345154285431, "learning_rate": 8.239297192896161e-05, "loss": 0.0301, "step": 630 }, { "epoch": 3.1683168316831685, "grad_norm": 0.2825470268726349, "learning_rate": 8.176512094801027e-05, "loss": 0.0266, "step": 640 }, { "epoch": 3.217821782178218, "grad_norm": 0.24379639327526093, "learning_rate": 8.112875681070861e-05, "loss": 0.026, "step": 650 }, { "epoch": 3.2673267326732676, "grad_norm": 0.2989807724952698, "learning_rate": 8.048405006472766e-05, "loss": 0.0279, "step": 660 }, { "epoch": 3.3168316831683167, "grad_norm": 0.2257174700498581, "learning_rate": 7.983117349358482e-05, "loss": 0.0222, "step": 670 }, { "epoch": 3.366336633663366, "grad_norm": 0.17012077569961548, "learning_rate": 7.917030207033736e-05, "loss": 0.0279, "step": 680 }, { "epoch": 3.4158415841584158, "grad_norm": 0.2790309190750122, "learning_rate": 7.850161291068913e-05, "loss": 0.0239, "step": 690 }, { "epoch": 3.4653465346534653, "grad_norm": 0.27188029885292053, "learning_rate": 7.78252852255229e-05, "loss": 0.0263, "step": 700 }, { "epoch": 3.514851485148515, "grad_norm": 0.2633598744869232, "learning_rate": 7.71415002728712e-05, "loss": 0.0266, "step": 710 }, { "epoch": 3.5643564356435644, "grad_norm": 0.22800429165363312, "learning_rate": 7.645044130933874e-05, "loss": 0.0248, "step": 720 }, { "epoch": 3.613861386138614, "grad_norm": 0.19601000845432281, "learning_rate": 7.57522935409889e-05, "loss": 0.0309, "step": 730 }, { "epoch": 3.6633663366336635, "grad_norm": 0.3007054626941681, "learning_rate": 7.504724407370796e-05, "loss": 0.0314, "step": 740 }, { "epoch": 3.7128712871287126, "grad_norm": 0.31969642639160156, "learning_rate": 7.433548186306006e-05, "loss": 0.0279, "step": 750 }, { "epoch": 3.762376237623762, "grad_norm": 0.334820032119751, "learning_rate": 7.361719766364658e-05, "loss": 0.0235, "step": 760 }, { "epoch": 3.8118811881188117, "grad_norm": 0.3683372437953949, "learning_rate": 7.289258397798322e-05, "loss": 0.0273, "step": 770 }, { "epoch": 3.8613861386138613, "grad_norm": 0.3286759555339813, "learning_rate": 7.216183500490867e-05, "loss": 0.0282, "step": 780 }, { "epoch": 3.910891089108911, "grad_norm": 0.4011049270629883, "learning_rate": 7.142514658753877e-05, "loss": 0.0271, "step": 790 }, { "epoch": 3.9603960396039604, "grad_norm": 0.24703174829483032, "learning_rate": 7.068271616077979e-05, "loss": 0.0243, "step": 800 }, { "epoch": 4.00990099009901, "grad_norm": 1.2378054857254028, "learning_rate": 6.99347426984154e-05, "loss": 0.0242, "step": 810 }, { "epoch": 4.0594059405940595, "grad_norm": 0.3182317018508911, "learning_rate": 6.918142665978086e-05, "loss": 0.0331, "step": 820 }, { "epoch": 4.108910891089109, "grad_norm": 0.26656270027160645, "learning_rate": 6.842296993603941e-05, "loss": 0.0247, "step": 830 }, { "epoch": 4.158415841584159, "grad_norm": 0.314934641122818, "learning_rate": 6.76595757960747e-05, "loss": 0.0258, "step": 840 }, { "epoch": 4.207920792079208, "grad_norm": 0.23565533757209778, "learning_rate": 6.68914488320141e-05, "loss": 0.0246, "step": 850 }, { "epoch": 4.257425742574258, "grad_norm": 0.4688224196434021, "learning_rate": 6.611879490439729e-05, "loss": 0.0248, "step": 860 }, { "epoch": 4.306930693069307, "grad_norm": 0.474172979593277, "learning_rate": 6.534182108700501e-05, "loss": 0.0242, "step": 870 }, { "epoch": 4.356435643564357, "grad_norm": 0.2465556263923645, "learning_rate": 6.456073561136261e-05, "loss": 0.0184, "step": 880 }, { "epoch": 4.405940594059406, "grad_norm": 0.18616925179958344, "learning_rate": 6.377574781093329e-05, "loss": 0.0269, "step": 890 }, { "epoch": 4.455445544554456, "grad_norm": 0.39356115460395813, "learning_rate": 6.298706806501606e-05, "loss": 0.0185, "step": 900 }, { "epoch": 4.5049504950495045, "grad_norm": 0.25094884634017944, "learning_rate": 6.219490774236341e-05, "loss": 0.0249, "step": 910 }, { "epoch": 4.554455445544555, "grad_norm": 0.27497196197509766, "learning_rate": 6.139947914453378e-05, "loss": 0.0242, "step": 920 }, { "epoch": 4.603960396039604, "grad_norm": 0.21262158453464508, "learning_rate": 6.0600995448994036e-05, "loss": 0.0231, "step": 930 }, { "epoch": 4.653465346534653, "grad_norm": 0.30226609110832214, "learning_rate": 5.979967065198726e-05, "loss": 0.0261, "step": 940 }, { "epoch": 4.702970297029703, "grad_norm": 0.15758584439754486, "learning_rate": 5.899571951118099e-05, "loss": 0.0254, "step": 950 }, { "epoch": 4.752475247524752, "grad_norm": 0.3880341947078705, "learning_rate": 5.8189357488111516e-05, "loss": 0.0215, "step": 960 }, { "epoch": 4.801980198019802, "grad_norm": 0.3557332158088684, "learning_rate": 5.7380800690439443e-05, "loss": 0.022, "step": 970 }, { "epoch": 4.851485148514851, "grad_norm": 0.23131351172924042, "learning_rate": 5.657026581403218e-05, "loss": 0.0249, "step": 980 }, { "epoch": 4.900990099009901, "grad_norm": 0.29675719141960144, "learning_rate": 5.575797008488869e-05, "loss": 0.0234, "step": 990 }, { "epoch": 4.9504950495049505, "grad_norm": 0.18082201480865479, "learning_rate": 5.494413120092223e-05, "loss": 0.0258, "step": 1000 }, { "epoch": 5.0, "grad_norm": 0.33945131301879883, "learning_rate": 5.4128967273616625e-05, "loss": 0.0267, "step": 1010 }, { "epoch": 5.0495049504950495, "grad_norm": 0.3185798227787018, "learning_rate": 5.331269676957158e-05, "loss": 0.0252, "step": 1020 }, { "epoch": 5.099009900990099, "grad_norm": 0.19595389068126678, "learning_rate": 5.249553845195309e-05, "loss": 0.0239, "step": 1030 }, { "epoch": 5.148514851485149, "grad_norm": 0.1753993183374405, "learning_rate": 5.167771132186401e-05, "loss": 0.0165, "step": 1040 }, { "epoch": 5.198019801980198, "grad_norm": 0.19938364624977112, "learning_rate": 5.0859434559651164e-05, "loss": 0.0198, "step": 1050 }, { "epoch": 5.247524752475248, "grad_norm": 0.2329648733139038, "learning_rate": 5.0040927466164235e-05, "loss": 0.0208, "step": 1060 }, { "epoch": 5.297029702970297, "grad_norm": 0.16190586984157562, "learning_rate": 4.9222409403982453e-05, "loss": 0.0211, "step": 1070 }, { "epoch": 5.346534653465347, "grad_norm": 0.21188730001449585, "learning_rate": 4.840409973862472e-05, "loss": 0.0208, "step": 1080 }, { "epoch": 5.396039603960396, "grad_norm": 0.17959924042224884, "learning_rate": 4.7586217779758834e-05, "loss": 0.0165, "step": 1090 }, { "epoch": 5.445544554455446, "grad_norm": 0.29471927881240845, "learning_rate": 4.676898272242593e-05, "loss": 0.0236, "step": 1100 }, { "epoch": 5.4950495049504955, "grad_norm": 0.23320813477039337, "learning_rate": 4.59526135882954e-05, "loss": 0.0174, "step": 1110 }, { "epoch": 5.544554455445544, "grad_norm": 0.7296366095542908, "learning_rate": 4.5137329166966326e-05, "loss": 0.0231, "step": 1120 }, { "epoch": 5.594059405940594, "grad_norm": 0.2331215739250183, "learning_rate": 4.4323347957331306e-05, "loss": 0.0196, "step": 1130 }, { "epoch": 5.643564356435643, "grad_norm": 0.36525505781173706, "learning_rate": 4.351088810901785e-05, "loss": 0.0202, "step": 1140 }, { "epoch": 5.693069306930693, "grad_norm": 0.2810383141040802, "learning_rate": 4.270016736392371e-05, "loss": 0.0221, "step": 1150 }, { "epoch": 5.742574257425742, "grad_norm": 0.4460838735103607, "learning_rate": 4.1891402997861254e-05, "loss": 0.0222, "step": 1160 }, { "epoch": 5.792079207920792, "grad_norm": 0.30691349506378174, "learning_rate": 4.108481176232692e-05, "loss": 0.0197, "step": 1170 }, { "epoch": 5.841584158415841, "grad_norm": 0.2769784927368164, "learning_rate": 4.028060982641097e-05, "loss": 0.0193, "step": 1180 }, { "epoch": 5.891089108910891, "grad_norm": 0.3283490240573883, "learning_rate": 3.947901271886364e-05, "loss": 0.0213, "step": 1190 }, { "epoch": 5.9405940594059405, "grad_norm": 0.17090275883674622, "learning_rate": 3.8680235270332556e-05, "loss": 0.019, "step": 1200 }, { "epoch": 5.99009900990099, "grad_norm": 0.3182573914527893, "learning_rate": 3.788449155578758e-05, "loss": 0.0178, "step": 1210 }, { "epoch": 6.03960396039604, "grad_norm": 0.3006100356578827, "learning_rate": 3.7091994837147876e-05, "loss": 0.0217, "step": 1220 }, { "epoch": 6.089108910891089, "grad_norm": 0.30521124601364136, "learning_rate": 3.630295750612711e-05, "loss": 0.0172, "step": 1230 }, { "epoch": 6.138613861386139, "grad_norm": 0.5223678946495056, "learning_rate": 3.551759102731178e-05, "loss": 0.0228, "step": 1240 }, { "epoch": 6.188118811881188, "grad_norm": 0.3733428716659546, "learning_rate": 3.4736105881487904e-05, "loss": 0.0191, "step": 1250 }, { "epoch": 6.237623762376238, "grad_norm": 0.3236416280269623, "learning_rate": 3.395871150923163e-05, "loss": 0.0184, "step": 1260 }, { "epoch": 6.287128712871287, "grad_norm": 0.20324452221393585, "learning_rate": 3.318561625477834e-05, "loss": 0.0214, "step": 1270 }, { "epoch": 6.336633663366337, "grad_norm": 0.21963098645210266, "learning_rate": 3.241702731018586e-05, "loss": 0.0175, "step": 1280 }, { "epoch": 6.3861386138613865, "grad_norm": 0.21820610761642456, "learning_rate": 3.165315065980614e-05, "loss": 0.0169, "step": 1290 }, { "epoch": 6.435643564356436, "grad_norm": 0.17898957431316376, "learning_rate": 3.0894191025080946e-05, "loss": 0.0235, "step": 1300 }, { "epoch": 6.485148514851485, "grad_norm": 0.2586633563041687, "learning_rate": 3.014035180967567e-05, "loss": 0.0185, "step": 1310 }, { "epoch": 6.534653465346535, "grad_norm": 0.1782713085412979, "learning_rate": 2.939183504496667e-05, "loss": 0.0161, "step": 1320 }, { "epoch": 6.584158415841584, "grad_norm": 0.582782506942749, "learning_rate": 2.8648841335896006e-05, "loss": 0.0209, "step": 1330 }, { "epoch": 6.633663366336633, "grad_norm": 0.21330958604812622, "learning_rate": 2.7911569807208847e-05, "loss": 0.0159, "step": 1340 }, { "epoch": 6.683168316831683, "grad_norm": 0.17104759812355042, "learning_rate": 2.718021805008721e-05, "loss": 0.0143, "step": 1350 }, { "epoch": 6.732673267326732, "grad_norm": 0.22734080255031586, "learning_rate": 2.6454982069195023e-05, "loss": 0.0201, "step": 1360 }, { "epoch": 6.782178217821782, "grad_norm": 0.27949661016464233, "learning_rate": 2.5736056230148175e-05, "loss": 0.0195, "step": 1370 }, { "epoch": 6.8316831683168315, "grad_norm": 0.11735572665929794, "learning_rate": 2.5023633207423836e-05, "loss": 0.0192, "step": 1380 }, { "epoch": 6.881188118811881, "grad_norm": 0.29782113432884216, "learning_rate": 2.4317903932723196e-05, "loss": 0.0185, "step": 1390 }, { "epoch": 6.930693069306931, "grad_norm": 0.20763148367404938, "learning_rate": 2.3619057543800998e-05, "loss": 0.0195, "step": 1400 }, { "epoch": 6.98019801980198, "grad_norm": 0.5672227144241333, "learning_rate": 2.2927281333776163e-05, "loss": 0.0156, "step": 1410 }, { "epoch": 7.02970297029703, "grad_norm": 0.35093945264816284, "learning_rate": 2.2242760700936437e-05, "loss": 0.0176, "step": 1420 }, { "epoch": 7.079207920792079, "grad_norm": 0.3087887465953827, "learning_rate": 2.156567909905124e-05, "loss": 0.0197, "step": 1430 }, { "epoch": 7.128712871287129, "grad_norm": 0.19749677181243896, "learning_rate": 2.0896217988205253e-05, "loss": 0.0176, "step": 1440 }, { "epoch": 7.178217821782178, "grad_norm": 0.24461252987384796, "learning_rate": 2.0234556786166715e-05, "loss": 0.0158, "step": 1450 }, { "epoch": 7.227722772277228, "grad_norm": 0.24674923717975616, "learning_rate": 1.9580872820302716e-05, "loss": 0.0193, "step": 1460 }, { "epoch": 7.2772277227722775, "grad_norm": 0.12396609783172607, "learning_rate": 1.8935341280055e-05, "loss": 0.0172, "step": 1470 }, { "epoch": 7.326732673267327, "grad_norm": 0.12449830770492554, "learning_rate": 1.8298135169988534e-05, "loss": 0.0196, "step": 1480 }, { "epoch": 7.376237623762377, "grad_norm": 0.27104389667510986, "learning_rate": 1.7669425263425765e-05, "loss": 0.0157, "step": 1490 }, { "epoch": 7.425742574257426, "grad_norm": 0.22338876128196716, "learning_rate": 1.7049380056678832e-05, "loss": 0.012, "step": 1500 }, { "epoch": 7.475247524752476, "grad_norm": 0.2775212228298187, "learning_rate": 1.6438165723891906e-05, "loss": 0.0203, "step": 1510 }, { "epoch": 7.524752475247524, "grad_norm": 0.18439438939094543, "learning_rate": 1.583594607250613e-05, "loss": 0.0151, "step": 1520 }, { "epoch": 7.574257425742574, "grad_norm": 0.2981467843055725, "learning_rate": 1.5242882499358508e-05, "loss": 0.0174, "step": 1530 }, { "epoch": 7.623762376237623, "grad_norm": 0.21263225376605988, "learning_rate": 1.4659133947427189e-05, "loss": 0.0138, "step": 1540 }, { "epoch": 7.673267326732673, "grad_norm": 0.3413788080215454, "learning_rate": 1.4084856863234113e-05, "loss": 0.0132, "step": 1550 }, { "epoch": 7.7227722772277225, "grad_norm": 0.16698209941387177, "learning_rate": 1.3520205154916898e-05, "loss": 0.0171, "step": 1560 }, { "epoch": 7.772277227722772, "grad_norm": 0.22324088215827942, "learning_rate": 1.2965330150980943e-05, "loss": 0.0172, "step": 1570 }, { "epoch": 7.821782178217822, "grad_norm": 0.14203061163425446, "learning_rate": 1.2420380559742839e-05, "loss": 0.0175, "step": 1580 }, { "epoch": 7.871287128712871, "grad_norm": 0.1801433265209198, "learning_rate": 1.1885502429476159e-05, "loss": 0.0127, "step": 1590 }, { "epoch": 7.920792079207921, "grad_norm": 0.3111434578895569, "learning_rate": 1.1360839109269911e-05, "loss": 0.0153, "step": 1600 }, { "epoch": 7.97029702970297, "grad_norm": 0.19579961895942688, "learning_rate": 1.0846531210610644e-05, "loss": 0.0172, "step": 1610 }, { "epoch": 8.01980198019802, "grad_norm": 0.1957446187734604, "learning_rate": 1.034271656969803e-05, "loss": 0.0147, "step": 1620 }, { "epoch": 8.069306930693068, "grad_norm": 0.16515988111495972, "learning_rate": 9.849530210504315e-06, "loss": 0.0155, "step": 1630 }, { "epoch": 8.118811881188119, "grad_norm": 0.25872567296028137, "learning_rate": 9.367104308587494e-06, "loss": 0.0156, "step": 1640 }, { "epoch": 8.168316831683168, "grad_norm": 0.23563534021377563, "learning_rate": 8.895568155667767e-06, "loss": 0.0164, "step": 1650 }, { "epoch": 8.217821782178218, "grad_norm": 0.33589187264442444, "learning_rate": 8.435048124977019e-06, "loss": 0.0143, "step": 1660 }, { "epoch": 8.267326732673267, "grad_norm": 0.28475892543792725, "learning_rate": 7.985667637390287e-06, "loss": 0.0185, "step": 1670 }, { "epoch": 8.316831683168317, "grad_norm": 0.16741429269313812, "learning_rate": 7.547547128348642e-06, "loss": 0.0152, "step": 1680 }, { "epoch": 8.366336633663366, "grad_norm": 0.2126518040895462, "learning_rate": 7.12080401558205e-06, "loss": 0.0138, "step": 1690 }, { "epoch": 8.415841584158416, "grad_norm": 0.1645060032606125, "learning_rate": 6.7055526676410355e-06, "loss": 0.0151, "step": 1700 }, { "epoch": 8.465346534653465, "grad_norm": 0.7550293207168579, "learning_rate": 6.301904373245565e-06, "loss": 0.0148, "step": 1710 }, { "epoch": 8.514851485148515, "grad_norm": 0.21006666123867035, "learning_rate": 5.90996731145928e-06, "loss": 0.0138, "step": 1720 }, { "epoch": 8.564356435643564, "grad_norm": 0.1500163972377777, "learning_rate": 5.529846522697174e-06, "loss": 0.0122, "step": 1730 }, { "epoch": 8.613861386138614, "grad_norm": 0.1438021957874298, "learning_rate": 5.161643880574385e-06, "loss": 0.0194, "step": 1740 }, { "epoch": 8.663366336633663, "grad_norm": 0.16550536453723907, "learning_rate": 4.805458064603791e-06, "loss": 0.0183, "step": 1750 }, { "epoch": 8.712871287128714, "grad_norm": 0.2206612527370453, "learning_rate": 4.461384533749496e-06, "loss": 0.013, "step": 1760 }, { "epoch": 8.762376237623762, "grad_norm": 0.21099242568016052, "learning_rate": 4.129515500843601e-06, "loss": 0.0141, "step": 1770 }, { "epoch": 8.811881188118813, "grad_norm": 0.31080639362335205, "learning_rate": 3.8099399078728117e-06, "loss": 0.0147, "step": 1780 }, { "epoch": 8.861386138613861, "grad_norm": 0.15181687474250793, "learning_rate": 3.502743402141695e-06, "loss": 0.0152, "step": 1790 }, { "epoch": 8.910891089108912, "grad_norm": 0.1974124163389206, "learning_rate": 3.2080083133189443e-06, "loss": 0.013, "step": 1800 }, { "epoch": 8.96039603960396, "grad_norm": 0.16074234247207642, "learning_rate": 2.9258136313727424e-06, "loss": 0.0143, "step": 1810 }, { "epoch": 9.009900990099009, "grad_norm": 0.14589504897594452, "learning_rate": 2.6562349854012012e-06, "loss": 0.0107, "step": 1820 }, { "epoch": 9.05940594059406, "grad_norm": 0.23190949857234955, "learning_rate": 2.399344623363503e-06, "loss": 0.014, "step": 1830 }, { "epoch": 9.108910891089108, "grad_norm": 0.20311018824577332, "learning_rate": 2.155211392717238e-06, "loss": 0.013, "step": 1840 }, { "epoch": 9.158415841584159, "grad_norm": 0.208504319190979, "learning_rate": 1.9239007219670146e-06, "loss": 0.0139, "step": 1850 }, { "epoch": 9.207920792079207, "grad_norm": 0.19491755962371826, "learning_rate": 1.705474603129459e-06, "loss": 0.0116, "step": 1860 }, { "epoch": 9.257425742574258, "grad_norm": 0.226653590798378, "learning_rate": 1.4999915751191029e-06, "loss": 0.0137, "step": 1870 }, { "epoch": 9.306930693069306, "grad_norm": 0.12431623786687851, "learning_rate": 1.3075067080598136e-06, "loss": 0.0107, "step": 1880 }, { "epoch": 9.356435643564357, "grad_norm": 0.11901762336492538, "learning_rate": 1.1280715885257643e-06, "loss": 0.0141, "step": 1890 }, { "epoch": 9.405940594059405, "grad_norm": 0.1728558987379074, "learning_rate": 9.617343057161165e-07, "loss": 0.0165, "step": 1900 }, { "epoch": 9.455445544554456, "grad_norm": 0.16231480240821838, "learning_rate": 8.085394385669187e-07, "loss": 0.0142, "step": 1910 }, { "epoch": 9.504950495049505, "grad_norm": 0.18116702139377594, "learning_rate": 6.685280438038233e-07, "loss": 0.0123, "step": 1920 }, { "epoch": 9.554455445544555, "grad_norm": 0.17093735933303833, "learning_rate": 5.417376449387501e-07, "loss": 0.0181, "step": 1930 }, { "epoch": 9.603960396039604, "grad_norm": 0.1444690227508545, "learning_rate": 4.2820222221344743e-07, "loss": 0.0172, "step": 1940 }, { "epoch": 9.653465346534654, "grad_norm": 0.23010997474193573, "learning_rate": 3.279522034927096e-07, "loss": 0.0114, "step": 1950 }, { "epoch": 9.702970297029703, "grad_norm": 0.1534404456615448, "learning_rate": 2.410144561095673e-07, "loss": 0.0135, "step": 1960 }, { "epoch": 9.752475247524753, "grad_norm": 0.09397874027490616, "learning_rate": 1.6741227966478036e-07, "loss": 0.014, "step": 1970 }, { "epoch": 9.801980198019802, "grad_norm": 0.23087909817695618, "learning_rate": 1.071653997824662e-07, "loss": 0.0144, "step": 1980 }, { "epoch": 9.851485148514852, "grad_norm": 0.2112501859664917, "learning_rate": 6.028996282356758e-08, "loss": 0.0133, "step": 1990 }, { "epoch": 9.900990099009901, "grad_norm": 0.15660247206687927, "learning_rate": 2.6798531558552832e-08, "loss": 0.0146, "step": 2000 }, { "epoch": 9.950495049504951, "grad_norm": 0.4677261412143707, "learning_rate": 6.700081800598001e-09, "loss": 0.015, "step": 2010 }, { "epoch": 10.0, "grad_norm": 0.22400698065757751, "learning_rate": 0.0, "loss": 0.014, "step": 2020 }, { "epoch": 10.0, "step": 2020, "total_flos": 2.292289049907792e+17, "train_loss": 0.0384152461064629, "train_runtime": 2020.0898, "train_samples_per_second": 48.973, "train_steps_per_second": 1.0 } ], "logging_steps": 10, "max_steps": 2020, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.292289049907792e+17, "train_batch_size": 49, "trial_name": null, "trial_params": null }