diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15413 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.2618031500583697, + "eval_steps": 500, + "global_step": 21978, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005741297150402847, + "grad_norm": 47.5, + "learning_rate": 8.604206500956023e-07, + "loss": 12.2289, + "step": 10 + }, + { + "epoch": 0.0011482594300805695, + "grad_norm": 41.0, + "learning_rate": 1.8164435946462718e-06, + "loss": 12.1447, + "step": 20 + }, + { + "epoch": 0.0017223891451208543, + "grad_norm": 38.75, + "learning_rate": 2.772466539196941e-06, + "loss": 12.0576, + "step": 30 + }, + { + "epoch": 0.002296518860161139, + "grad_norm": 75.0, + "learning_rate": 3.7284894837476104e-06, + "loss": 11.9311, + "step": 40 + }, + { + "epoch": 0.0028706485752014238, + "grad_norm": 38.25, + "learning_rate": 4.68451242829828e-06, + "loss": 11.6058, + "step": 50 + }, + { + "epoch": 0.0034447782902417086, + "grad_norm": 35.0, + "learning_rate": 5.6405353728489485e-06, + "loss": 10.9243, + "step": 60 + }, + { + "epoch": 0.0040189080052819934, + "grad_norm": 59.25, + "learning_rate": 6.596558317399617e-06, + "loss": 10.1567, + "step": 70 + }, + { + "epoch": 0.004593037720322278, + "grad_norm": 52.0, + "learning_rate": 7.552581261950287e-06, + "loss": 9.6776, + "step": 80 + }, + { + "epoch": 0.005167167435362563, + "grad_norm": 18.75, + "learning_rate": 8.508604206500955e-06, + "loss": 9.3481, + "step": 90 + }, + { + "epoch": 0.0057412971504028475, + "grad_norm": 19.25, + "learning_rate": 9.464627151051626e-06, + "loss": 9.0556, + "step": 100 + }, + { + "epoch": 0.006315426865443133, + "grad_norm": 70.0, + "learning_rate": 1.0420650095602295e-05, + "loss": 8.7948, + "step": 110 + }, + { + "epoch": 0.006889556580483417, + "grad_norm": 54.0, + "learning_rate": 1.1376673040152965e-05, + "loss": 8.6711, + "step": 120 + }, + { + "epoch": 0.007463686295523702, + "grad_norm": 22.25, + "learning_rate": 1.2332695984703634e-05, + "loss": 8.5725, + "step": 130 + }, + { + "epoch": 0.008037816010563987, + "grad_norm": 5.03125, + "learning_rate": 1.3288718929254305e-05, + "loss": 8.3944, + "step": 140 + }, + { + "epoch": 0.008611945725604272, + "grad_norm": 4.84375, + "learning_rate": 1.4244741873804973e-05, + "loss": 8.1984, + "step": 150 + }, + { + "epoch": 0.009186075440644556, + "grad_norm": 2.640625, + "learning_rate": 1.5200764818355642e-05, + "loss": 8.0769, + "step": 160 + }, + { + "epoch": 0.009760205155684841, + "grad_norm": 2.21875, + "learning_rate": 1.615678776290631e-05, + "loss": 8.0288, + "step": 170 + }, + { + "epoch": 0.010334334870725126, + "grad_norm": 4.0625, + "learning_rate": 1.7112810707456982e-05, + "loss": 7.9832, + "step": 180 + }, + { + "epoch": 0.01090846458576541, + "grad_norm": 5.71875, + "learning_rate": 1.806883365200765e-05, + "loss": 7.956, + "step": 190 + }, + { + "epoch": 0.011482594300805695, + "grad_norm": 2.046875, + "learning_rate": 1.902485659655832e-05, + "loss": 7.9347, + "step": 200 + }, + { + "epoch": 0.01205672401584598, + "grad_norm": 2.453125, + "learning_rate": 1.9980879541108987e-05, + "loss": 7.9107, + "step": 210 + }, + { + "epoch": 0.012630853730886266, + "grad_norm": 2.234375, + "learning_rate": 2.0936902485659657e-05, + "loss": 7.8908, + "step": 220 + }, + { + "epoch": 0.01320498344592655, + "grad_norm": 2.375, + "learning_rate": 2.1892925430210324e-05, + "loss": 7.8553, + "step": 230 + }, + { + "epoch": 0.013779113160966834, + "grad_norm": 2.140625, + "learning_rate": 2.2848948374760995e-05, + "loss": 7.8601, + "step": 240 + }, + { + "epoch": 0.01435324287600712, + "grad_norm": 2.265625, + "learning_rate": 2.3804971319311666e-05, + "loss": 7.8253, + "step": 250 + }, + { + "epoch": 0.014927372591047403, + "grad_norm": 2.84375, + "learning_rate": 2.4760994263862333e-05, + "loss": 7.8335, + "step": 260 + }, + { + "epoch": 0.015501502306087689, + "grad_norm": 2.15625, + "learning_rate": 2.5717017208413003e-05, + "loss": 7.8141, + "step": 270 + }, + { + "epoch": 0.016075632021127974, + "grad_norm": 2.03125, + "learning_rate": 2.6673040152963674e-05, + "loss": 7.8022, + "step": 280 + }, + { + "epoch": 0.016649761736168257, + "grad_norm": 4.15625, + "learning_rate": 2.762906309751434e-05, + "loss": 7.7881, + "step": 290 + }, + { + "epoch": 0.017223891451208544, + "grad_norm": 1.96875, + "learning_rate": 2.858508604206501e-05, + "loss": 7.7425, + "step": 300 + }, + { + "epoch": 0.017798021166248828, + "grad_norm": 1.953125, + "learning_rate": 2.954110898661568e-05, + "loss": 7.7538, + "step": 310 + }, + { + "epoch": 0.01837215088128911, + "grad_norm": 1.90625, + "learning_rate": 3.049713193116635e-05, + "loss": 7.7481, + "step": 320 + }, + { + "epoch": 0.0189462805963294, + "grad_norm": 1.9453125, + "learning_rate": 3.145315487571702e-05, + "loss": 7.7378, + "step": 330 + }, + { + "epoch": 0.019520410311369682, + "grad_norm": 1.875, + "learning_rate": 3.240917782026769e-05, + "loss": 7.7184, + "step": 340 + }, + { + "epoch": 0.020094540026409966, + "grad_norm": 2.25, + "learning_rate": 3.3365200764818354e-05, + "loss": 7.6801, + "step": 350 + }, + { + "epoch": 0.020668669741450253, + "grad_norm": 2.171875, + "learning_rate": 3.432122370936903e-05, + "loss": 7.6844, + "step": 360 + }, + { + "epoch": 0.021242799456490536, + "grad_norm": 1.875, + "learning_rate": 3.5277246653919695e-05, + "loss": 7.6768, + "step": 370 + }, + { + "epoch": 0.02181692917153082, + "grad_norm": 1.8515625, + "learning_rate": 3.623326959847036e-05, + "loss": 7.6567, + "step": 380 + }, + { + "epoch": 0.022391058886571107, + "grad_norm": 2.0625, + "learning_rate": 3.7189292543021036e-05, + "loss": 7.6527, + "step": 390 + }, + { + "epoch": 0.02296518860161139, + "grad_norm": 2.15625, + "learning_rate": 3.8145315487571704e-05, + "loss": 7.6469, + "step": 400 + }, + { + "epoch": 0.023539318316651677, + "grad_norm": 2.015625, + "learning_rate": 3.910133843212238e-05, + "loss": 7.6354, + "step": 410 + }, + { + "epoch": 0.02411344803169196, + "grad_norm": 1.8828125, + "learning_rate": 4.0057361376673045e-05, + "loss": 7.6377, + "step": 420 + }, + { + "epoch": 0.024687577746732244, + "grad_norm": 3.0, + "learning_rate": 4.101338432122371e-05, + "loss": 7.6348, + "step": 430 + }, + { + "epoch": 0.02526170746177253, + "grad_norm": 1.9296875, + "learning_rate": 4.196940726577438e-05, + "loss": 7.6112, + "step": 440 + }, + { + "epoch": 0.025835837176812815, + "grad_norm": 1.875, + "learning_rate": 4.292543021032505e-05, + "loss": 7.6331, + "step": 450 + }, + { + "epoch": 0.0264099668918531, + "grad_norm": 2.0, + "learning_rate": 4.388145315487572e-05, + "loss": 7.5849, + "step": 460 + }, + { + "epoch": 0.026984096606893385, + "grad_norm": 2.046875, + "learning_rate": 4.483747609942639e-05, + "loss": 7.5836, + "step": 470 + }, + { + "epoch": 0.02755822632193367, + "grad_norm": 2.78125, + "learning_rate": 4.5793499043977055e-05, + "loss": 7.5977, + "step": 480 + }, + { + "epoch": 0.028132356036973952, + "grad_norm": 2.109375, + "learning_rate": 4.674952198852773e-05, + "loss": 7.5915, + "step": 490 + }, + { + "epoch": 0.02870648575201424, + "grad_norm": 1.9921875, + "learning_rate": 4.7705544933078396e-05, + "loss": 7.5873, + "step": 500 + }, + { + "epoch": 0.029280615467054523, + "grad_norm": 1.90625, + "learning_rate": 4.866156787762906e-05, + "loss": 7.5587, + "step": 510 + }, + { + "epoch": 0.029854745182094806, + "grad_norm": 2.078125, + "learning_rate": 4.961759082217973e-05, + "loss": 7.5544, + "step": 520 + }, + { + "epoch": 0.030428874897135093, + "grad_norm": 2.140625, + "learning_rate": 5.05736137667304e-05, + "loss": 7.5507, + "step": 530 + }, + { + "epoch": 0.031003004612175377, + "grad_norm": 1.96875, + "learning_rate": 5.152963671128107e-05, + "loss": 7.5432, + "step": 540 + }, + { + "epoch": 0.031577134327215664, + "grad_norm": 2.0625, + "learning_rate": 5.2485659655831745e-05, + "loss": 7.5419, + "step": 550 + }, + { + "epoch": 0.03215126404225595, + "grad_norm": 2.078125, + "learning_rate": 5.344168260038241e-05, + "loss": 7.5418, + "step": 560 + }, + { + "epoch": 0.03272539375729623, + "grad_norm": 2.25, + "learning_rate": 5.4397705544933086e-05, + "loss": 7.5354, + "step": 570 + }, + { + "epoch": 0.033299523472336515, + "grad_norm": 2.0, + "learning_rate": 5.535372848948375e-05, + "loss": 7.5175, + "step": 580 + }, + { + "epoch": 0.0338736531873768, + "grad_norm": 2.25, + "learning_rate": 5.630975143403442e-05, + "loss": 7.5083, + "step": 590 + }, + { + "epoch": 0.03444778290241709, + "grad_norm": 2.25, + "learning_rate": 5.726577437858509e-05, + "loss": 7.509, + "step": 600 + }, + { + "epoch": 0.03502191261745737, + "grad_norm": 2.0, + "learning_rate": 5.822179732313576e-05, + "loss": 7.4949, + "step": 610 + }, + { + "epoch": 0.035596042332497656, + "grad_norm": 2.296875, + "learning_rate": 5.917782026768642e-05, + "loss": 7.5187, + "step": 620 + }, + { + "epoch": 0.03617017204753794, + "grad_norm": 2.015625, + "learning_rate": 6.0133843212237096e-05, + "loss": 7.471, + "step": 630 + }, + { + "epoch": 0.03674430176257822, + "grad_norm": 2.125, + "learning_rate": 6.108986615678777e-05, + "loss": 7.4908, + "step": 640 + }, + { + "epoch": 0.03731843147761851, + "grad_norm": 2.078125, + "learning_rate": 6.204588910133844e-05, + "loss": 7.4648, + "step": 650 + }, + { + "epoch": 0.0378925611926588, + "grad_norm": 2.203125, + "learning_rate": 6.30019120458891e-05, + "loss": 7.4751, + "step": 660 + }, + { + "epoch": 0.03846669090769908, + "grad_norm": 1.9453125, + "learning_rate": 6.395793499043978e-05, + "loss": 7.451, + "step": 670 + }, + { + "epoch": 0.039040820622739364, + "grad_norm": 2.25, + "learning_rate": 6.491395793499044e-05, + "loss": 7.4656, + "step": 680 + }, + { + "epoch": 0.03961495033777965, + "grad_norm": 2.03125, + "learning_rate": 6.586998087954111e-05, + "loss": 7.4733, + "step": 690 + }, + { + "epoch": 0.04018908005281993, + "grad_norm": 2.34375, + "learning_rate": 6.682600382409177e-05, + "loss": 7.4538, + "step": 700 + }, + { + "epoch": 0.04076320976786022, + "grad_norm": 2.109375, + "learning_rate": 6.778202676864245e-05, + "loss": 7.4687, + "step": 710 + }, + { + "epoch": 0.041337339482900505, + "grad_norm": 1.9140625, + "learning_rate": 6.873804971319312e-05, + "loss": 7.4557, + "step": 720 + }, + { + "epoch": 0.04191146919794079, + "grad_norm": 2.15625, + "learning_rate": 6.96940726577438e-05, + "loss": 7.4609, + "step": 730 + }, + { + "epoch": 0.04248559891298107, + "grad_norm": 2.203125, + "learning_rate": 7.065009560229447e-05, + "loss": 7.4635, + "step": 740 + }, + { + "epoch": 0.043059728628021356, + "grad_norm": 2.046875, + "learning_rate": 7.160611854684513e-05, + "loss": 7.4266, + "step": 750 + }, + { + "epoch": 0.04363385834306164, + "grad_norm": 2.171875, + "learning_rate": 7.256214149139579e-05, + "loss": 7.4294, + "step": 760 + }, + { + "epoch": 0.04420798805810193, + "grad_norm": 1.9921875, + "learning_rate": 7.351816443594646e-05, + "loss": 7.4687, + "step": 770 + }, + { + "epoch": 0.04478211777314221, + "grad_norm": 2.1875, + "learning_rate": 7.447418738049714e-05, + "loss": 7.441, + "step": 780 + }, + { + "epoch": 0.0453562474881825, + "grad_norm": 2.09375, + "learning_rate": 7.54302103250478e-05, + "loss": 7.429, + "step": 790 + }, + { + "epoch": 0.04593037720322278, + "grad_norm": 1.9140625, + "learning_rate": 7.638623326959847e-05, + "loss": 7.4286, + "step": 800 + }, + { + "epoch": 0.046504506918263064, + "grad_norm": 2.046875, + "learning_rate": 7.734225621414915e-05, + "loss": 7.433, + "step": 810 + }, + { + "epoch": 0.047078636633303354, + "grad_norm": 2.1875, + "learning_rate": 7.829827915869982e-05, + "loss": 7.4379, + "step": 820 + }, + { + "epoch": 0.04765276634834364, + "grad_norm": 2.78125, + "learning_rate": 7.925430210325048e-05, + "loss": 7.4205, + "step": 830 + }, + { + "epoch": 0.04822689606338392, + "grad_norm": 2.1875, + "learning_rate": 8.021032504780115e-05, + "loss": 7.4257, + "step": 840 + }, + { + "epoch": 0.048801025778424205, + "grad_norm": 2.046875, + "learning_rate": 8.116634799235181e-05, + "loss": 7.4371, + "step": 850 + }, + { + "epoch": 0.04937515549346449, + "grad_norm": 2.0, + "learning_rate": 8.212237093690249e-05, + "loss": 7.4293, + "step": 860 + }, + { + "epoch": 0.04994928520850477, + "grad_norm": 2.046875, + "learning_rate": 8.307839388145315e-05, + "loss": 7.4071, + "step": 870 + }, + { + "epoch": 0.05052341492354506, + "grad_norm": 2.125, + "learning_rate": 8.403441682600382e-05, + "loss": 7.4027, + "step": 880 + }, + { + "epoch": 0.051097544638585346, + "grad_norm": 1.984375, + "learning_rate": 8.49904397705545e-05, + "loss": 7.4302, + "step": 890 + }, + { + "epoch": 0.05167167435362563, + "grad_norm": 2.140625, + "learning_rate": 8.594646271510517e-05, + "loss": 7.4265, + "step": 900 + }, + { + "epoch": 0.05224580406866591, + "grad_norm": 1.890625, + "learning_rate": 8.690248565965584e-05, + "loss": 7.4176, + "step": 910 + }, + { + "epoch": 0.0528199337837062, + "grad_norm": 2.0625, + "learning_rate": 8.78585086042065e-05, + "loss": 7.4276, + "step": 920 + }, + { + "epoch": 0.05339406349874648, + "grad_norm": 2.109375, + "learning_rate": 8.881453154875718e-05, + "loss": 7.3996, + "step": 930 + }, + { + "epoch": 0.05396819321378677, + "grad_norm": 2.125, + "learning_rate": 8.977055449330784e-05, + "loss": 7.4079, + "step": 940 + }, + { + "epoch": 0.054542322928827054, + "grad_norm": 1.96875, + "learning_rate": 9.072657743785851e-05, + "loss": 7.3955, + "step": 950 + }, + { + "epoch": 0.05511645264386734, + "grad_norm": 1.96875, + "learning_rate": 9.168260038240917e-05, + "loss": 7.4203, + "step": 960 + }, + { + "epoch": 0.05569058235890762, + "grad_norm": 2.046875, + "learning_rate": 9.263862332695985e-05, + "loss": 7.39, + "step": 970 + }, + { + "epoch": 0.056264712073947905, + "grad_norm": 2.046875, + "learning_rate": 9.359464627151052e-05, + "loss": 7.4234, + "step": 980 + }, + { + "epoch": 0.056838841788988195, + "grad_norm": 2.015625, + "learning_rate": 9.45506692160612e-05, + "loss": 7.3976, + "step": 990 + }, + { + "epoch": 0.05741297150402848, + "grad_norm": 2.046875, + "learning_rate": 9.550669216061186e-05, + "loss": 7.4, + "step": 1000 + }, + { + "epoch": 0.05798710121906876, + "grad_norm": 2.25, + "learning_rate": 9.646271510516253e-05, + "loss": 7.3905, + "step": 1010 + }, + { + "epoch": 0.058561230934109046, + "grad_norm": 1.9765625, + "learning_rate": 9.74187380497132e-05, + "loss": 7.3955, + "step": 1020 + }, + { + "epoch": 0.05913536064914933, + "grad_norm": 2.0625, + "learning_rate": 9.837476099426386e-05, + "loss": 7.3945, + "step": 1030 + }, + { + "epoch": 0.05970949036418961, + "grad_norm": 1.90625, + "learning_rate": 9.933078393881452e-05, + "loss": 7.3953, + "step": 1040 + }, + { + "epoch": 0.0602836200792299, + "grad_norm": 1.984375, + "learning_rate": 9.999999805483122e-05, + "loss": 7.3903, + "step": 1050 + }, + { + "epoch": 0.06085774979427019, + "grad_norm": 2.21875, + "learning_rate": 9.99999634740572e-05, + "loss": 7.3952, + "step": 1060 + }, + { + "epoch": 0.06143187950931047, + "grad_norm": 2.25, + "learning_rate": 9.999988566734478e-05, + "loss": 7.3853, + "step": 1070 + }, + { + "epoch": 0.062006009224350754, + "grad_norm": 1.796875, + "learning_rate": 9.999976463476122e-05, + "loss": 7.3878, + "step": 1080 + }, + { + "epoch": 0.06258013893939104, + "grad_norm": 2.140625, + "learning_rate": 9.999960037641117e-05, + "loss": 7.3885, + "step": 1090 + }, + { + "epoch": 0.06315426865443133, + "grad_norm": 2.03125, + "learning_rate": 9.999939289243663e-05, + "loss": 7.377, + "step": 1100 + }, + { + "epoch": 0.06372839836947161, + "grad_norm": 1.8671875, + "learning_rate": 9.999914218301699e-05, + "loss": 7.3959, + "step": 1110 + }, + { + "epoch": 0.0643025280845119, + "grad_norm": 1.875, + "learning_rate": 9.999884824836898e-05, + "loss": 7.3735, + "step": 1120 + }, + { + "epoch": 0.06487665779955218, + "grad_norm": 1.8984375, + "learning_rate": 9.99985110887467e-05, + "loss": 7.3862, + "step": 1130 + }, + { + "epoch": 0.06545078751459246, + "grad_norm": 2.28125, + "learning_rate": 9.999813070444166e-05, + "loss": 7.3695, + "step": 1140 + }, + { + "epoch": 0.06602491722963275, + "grad_norm": 2.21875, + "learning_rate": 9.999770709578267e-05, + "loss": 7.3761, + "step": 1150 + }, + { + "epoch": 0.06659904694467303, + "grad_norm": 1.8515625, + "learning_rate": 9.999724026313598e-05, + "loss": 7.355, + "step": 1160 + }, + { + "epoch": 0.06717317665971331, + "grad_norm": 1.6953125, + "learning_rate": 9.999673020690516e-05, + "loss": 7.3493, + "step": 1170 + }, + { + "epoch": 0.0677473063747536, + "grad_norm": 1.84375, + "learning_rate": 9.999617692753119e-05, + "loss": 7.3738, + "step": 1180 + }, + { + "epoch": 0.0683214360897939, + "grad_norm": 1.8671875, + "learning_rate": 9.999558042549236e-05, + "loss": 7.3737, + "step": 1190 + }, + { + "epoch": 0.06889556580483418, + "grad_norm": 1.7421875, + "learning_rate": 9.999494070130435e-05, + "loss": 7.3867, + "step": 1200 + }, + { + "epoch": 0.06946969551987446, + "grad_norm": 3.265625, + "learning_rate": 9.999425775552025e-05, + "loss": 7.3736, + "step": 1210 + }, + { + "epoch": 0.07004382523491474, + "grad_norm": 1.828125, + "learning_rate": 9.999353158873045e-05, + "loss": 7.3807, + "step": 1220 + }, + { + "epoch": 0.07061795494995503, + "grad_norm": 1.9921875, + "learning_rate": 9.999276220156276e-05, + "loss": 7.3413, + "step": 1230 + }, + { + "epoch": 0.07119208466499531, + "grad_norm": 2.296875, + "learning_rate": 9.99919495946823e-05, + "loss": 7.3401, + "step": 1240 + }, + { + "epoch": 0.0717662143800356, + "grad_norm": 2.03125, + "learning_rate": 9.999109376879163e-05, + "loss": 7.3607, + "step": 1250 + }, + { + "epoch": 0.07234034409507588, + "grad_norm": 1.875, + "learning_rate": 9.999019472463057e-05, + "loss": 7.353, + "step": 1260 + }, + { + "epoch": 0.07291447381011616, + "grad_norm": 1.8203125, + "learning_rate": 9.998925246297641e-05, + "loss": 7.3633, + "step": 1270 + }, + { + "epoch": 0.07348860352515645, + "grad_norm": 1.8984375, + "learning_rate": 9.998826698464372e-05, + "loss": 7.3313, + "step": 1280 + }, + { + "epoch": 0.07406273324019673, + "grad_norm": 1.8203125, + "learning_rate": 9.998723829048449e-05, + "loss": 7.371, + "step": 1290 + }, + { + "epoch": 0.07463686295523703, + "grad_norm": 1.828125, + "learning_rate": 9.998616638138802e-05, + "loss": 7.3106, + "step": 1300 + }, + { + "epoch": 0.07521099267027731, + "grad_norm": 1.9375, + "learning_rate": 9.998505125828104e-05, + "loss": 7.3412, + "step": 1310 + }, + { + "epoch": 0.0757851223853176, + "grad_norm": 1.8515625, + "learning_rate": 9.998389292212755e-05, + "loss": 7.3174, + "step": 1320 + }, + { + "epoch": 0.07635925210035788, + "grad_norm": 1.78125, + "learning_rate": 9.998269137392897e-05, + "loss": 7.34, + "step": 1330 + }, + { + "epoch": 0.07693338181539816, + "grad_norm": 1.8671875, + "learning_rate": 9.998144661472406e-05, + "loss": 7.3135, + "step": 1340 + }, + { + "epoch": 0.07750751153043844, + "grad_norm": 1.78125, + "learning_rate": 9.998015864558895e-05, + "loss": 7.3592, + "step": 1350 + }, + { + "epoch": 0.07808164124547873, + "grad_norm": 2.1875, + "learning_rate": 9.99788274676371e-05, + "loss": 7.3103, + "step": 1360 + }, + { + "epoch": 0.07865577096051901, + "grad_norm": 2.015625, + "learning_rate": 9.997745308201935e-05, + "loss": 7.3127, + "step": 1370 + }, + { + "epoch": 0.0792299006755593, + "grad_norm": 1.984375, + "learning_rate": 9.997603548992387e-05, + "loss": 7.3279, + "step": 1380 + }, + { + "epoch": 0.07980403039059958, + "grad_norm": 1.6640625, + "learning_rate": 9.99745746925762e-05, + "loss": 7.3282, + "step": 1390 + }, + { + "epoch": 0.08037816010563986, + "grad_norm": 1.8125, + "learning_rate": 9.997307069123925e-05, + "loss": 7.3304, + "step": 1400 + }, + { + "epoch": 0.08095228982068015, + "grad_norm": 1.765625, + "learning_rate": 9.997152348721324e-05, + "loss": 7.3042, + "step": 1410 + }, + { + "epoch": 0.08152641953572044, + "grad_norm": 1.765625, + "learning_rate": 9.996993308183575e-05, + "loss": 7.3112, + "step": 1420 + }, + { + "epoch": 0.08210054925076073, + "grad_norm": 1.8828125, + "learning_rate": 9.996829947648172e-05, + "loss": 7.3347, + "step": 1430 + }, + { + "epoch": 0.08267467896580101, + "grad_norm": 1.796875, + "learning_rate": 9.996662267256344e-05, + "loss": 7.3229, + "step": 1440 + }, + { + "epoch": 0.0832488086808413, + "grad_norm": 1.734375, + "learning_rate": 9.996490267153053e-05, + "loss": 7.3004, + "step": 1450 + }, + { + "epoch": 0.08382293839588158, + "grad_norm": 1.875, + "learning_rate": 9.996313947486999e-05, + "loss": 7.3087, + "step": 1460 + }, + { + "epoch": 0.08439706811092186, + "grad_norm": 1.71875, + "learning_rate": 9.996133308410609e-05, + "loss": 7.3292, + "step": 1470 + }, + { + "epoch": 0.08497119782596214, + "grad_norm": 1.8125, + "learning_rate": 9.995948350080055e-05, + "loss": 7.314, + "step": 1480 + }, + { + "epoch": 0.08554532754100243, + "grad_norm": 2.015625, + "learning_rate": 9.995759072655231e-05, + "loss": 7.2709, + "step": 1490 + }, + { + "epoch": 0.08611945725604271, + "grad_norm": 1.7734375, + "learning_rate": 9.995565476299777e-05, + "loss": 7.314, + "step": 1500 + }, + { + "epoch": 0.086693586971083, + "grad_norm": 1.796875, + "learning_rate": 9.995367561181057e-05, + "loss": 7.3032, + "step": 1510 + }, + { + "epoch": 0.08726771668612328, + "grad_norm": 1.828125, + "learning_rate": 9.99516532747017e-05, + "loss": 7.3275, + "step": 1520 + }, + { + "epoch": 0.08784184640116358, + "grad_norm": 1.8828125, + "learning_rate": 9.994958775341958e-05, + "loss": 7.3007, + "step": 1530 + }, + { + "epoch": 0.08841597611620386, + "grad_norm": 1.7109375, + "learning_rate": 9.994747904974983e-05, + "loss": 7.3164, + "step": 1540 + }, + { + "epoch": 0.08899010583124414, + "grad_norm": 1.796875, + "learning_rate": 9.994532716551551e-05, + "loss": 7.2968, + "step": 1550 + }, + { + "epoch": 0.08956423554628443, + "grad_norm": 1.7109375, + "learning_rate": 9.994313210257694e-05, + "loss": 7.2734, + "step": 1560 + }, + { + "epoch": 0.09013836526132471, + "grad_norm": 1.765625, + "learning_rate": 9.994089386283181e-05, + "loss": 7.3032, + "step": 1570 + }, + { + "epoch": 0.090712494976365, + "grad_norm": 1.6953125, + "learning_rate": 9.99386124482151e-05, + "loss": 7.295, + "step": 1580 + }, + { + "epoch": 0.09128662469140528, + "grad_norm": 1.734375, + "learning_rate": 9.993628786069914e-05, + "loss": 7.2955, + "step": 1590 + }, + { + "epoch": 0.09186075440644556, + "grad_norm": 1.765625, + "learning_rate": 9.993392010229361e-05, + "loss": 7.2999, + "step": 1600 + }, + { + "epoch": 0.09243488412148584, + "grad_norm": 1.6953125, + "learning_rate": 9.993150917504545e-05, + "loss": 7.2974, + "step": 1610 + }, + { + "epoch": 0.09300901383652613, + "grad_norm": 1.671875, + "learning_rate": 9.992905508103897e-05, + "loss": 7.3055, + "step": 1620 + }, + { + "epoch": 0.09358314355156641, + "grad_norm": 1.875, + "learning_rate": 9.992655782239577e-05, + "loss": 7.2906, + "step": 1630 + }, + { + "epoch": 0.09415727326660671, + "grad_norm": 1.8125, + "learning_rate": 9.992401740127481e-05, + "loss": 7.2949, + "step": 1640 + }, + { + "epoch": 0.09473140298164699, + "grad_norm": 1.6953125, + "learning_rate": 9.992143381987229e-05, + "loss": 7.2723, + "step": 1650 + }, + { + "epoch": 0.09530553269668728, + "grad_norm": 1.75, + "learning_rate": 9.99188070804218e-05, + "loss": 7.3072, + "step": 1660 + }, + { + "epoch": 0.09587966241172756, + "grad_norm": 1.8984375, + "learning_rate": 9.991613718519419e-05, + "loss": 7.2643, + "step": 1670 + }, + { + "epoch": 0.09645379212676784, + "grad_norm": 1.8515625, + "learning_rate": 9.991342413649764e-05, + "loss": 7.2772, + "step": 1680 + }, + { + "epoch": 0.09702792184180813, + "grad_norm": 1.6640625, + "learning_rate": 9.991066793667763e-05, + "loss": 7.2988, + "step": 1690 + }, + { + "epoch": 0.09760205155684841, + "grad_norm": 1.8359375, + "learning_rate": 9.990786858811695e-05, + "loss": 7.291, + "step": 1700 + }, + { + "epoch": 0.0981761812718887, + "grad_norm": 1.671875, + "learning_rate": 9.99050260932357e-05, + "loss": 7.281, + "step": 1710 + }, + { + "epoch": 0.09875031098692898, + "grad_norm": 1.78125, + "learning_rate": 9.990214045449127e-05, + "loss": 7.2702, + "step": 1720 + }, + { + "epoch": 0.09932444070196926, + "grad_norm": 1.7421875, + "learning_rate": 9.989921167437833e-05, + "loss": 7.2667, + "step": 1730 + }, + { + "epoch": 0.09989857041700954, + "grad_norm": 1.671875, + "learning_rate": 9.989623975542888e-05, + "loss": 7.2564, + "step": 1740 + }, + { + "epoch": 0.10047270013204983, + "grad_norm": 1.796875, + "learning_rate": 9.989322470021221e-05, + "loss": 7.2945, + "step": 1750 + }, + { + "epoch": 0.10104682984709012, + "grad_norm": 1.6953125, + "learning_rate": 9.98901665113349e-05, + "loss": 7.2481, + "step": 1760 + }, + { + "epoch": 0.10162095956213041, + "grad_norm": 1.734375, + "learning_rate": 9.98870651914408e-05, + "loss": 7.2643, + "step": 1770 + }, + { + "epoch": 0.10219508927717069, + "grad_norm": 1.6796875, + "learning_rate": 9.988392074321105e-05, + "loss": 7.2704, + "step": 1780 + }, + { + "epoch": 0.10276921899221098, + "grad_norm": 1.75, + "learning_rate": 9.98807331693641e-05, + "loss": 7.2702, + "step": 1790 + }, + { + "epoch": 0.10334334870725126, + "grad_norm": 1.734375, + "learning_rate": 9.987750247265568e-05, + "loss": 7.2794, + "step": 1800 + }, + { + "epoch": 0.10391747842229154, + "grad_norm": 1.875, + "learning_rate": 9.987422865587878e-05, + "loss": 7.2656, + "step": 1810 + }, + { + "epoch": 0.10449160813733183, + "grad_norm": 1.7890625, + "learning_rate": 9.987091172186367e-05, + "loss": 7.2393, + "step": 1820 + }, + { + "epoch": 0.10506573785237211, + "grad_norm": 1.7109375, + "learning_rate": 9.986755167347791e-05, + "loss": 7.2547, + "step": 1830 + }, + { + "epoch": 0.1056398675674124, + "grad_norm": 1.609375, + "learning_rate": 9.986414851362633e-05, + "loss": 7.2572, + "step": 1840 + }, + { + "epoch": 0.10621399728245268, + "grad_norm": 1.6484375, + "learning_rate": 9.986070224525101e-05, + "loss": 7.2767, + "step": 1850 + }, + { + "epoch": 0.10678812699749296, + "grad_norm": 1.8046875, + "learning_rate": 9.985721287133136e-05, + "loss": 7.2495, + "step": 1860 + }, + { + "epoch": 0.10736225671253326, + "grad_norm": 1.671875, + "learning_rate": 9.985368039488397e-05, + "loss": 7.2741, + "step": 1870 + }, + { + "epoch": 0.10793638642757354, + "grad_norm": 1.71875, + "learning_rate": 9.985010481896274e-05, + "loss": 7.267, + "step": 1880 + }, + { + "epoch": 0.10851051614261382, + "grad_norm": 1.7109375, + "learning_rate": 9.984648614665884e-05, + "loss": 7.2617, + "step": 1890 + }, + { + "epoch": 0.10908464585765411, + "grad_norm": 1.796875, + "learning_rate": 9.984282438110067e-05, + "loss": 7.2747, + "step": 1900 + }, + { + "epoch": 0.10965877557269439, + "grad_norm": 1.7890625, + "learning_rate": 9.983911952545391e-05, + "loss": 7.2657, + "step": 1910 + }, + { + "epoch": 0.11023290528773468, + "grad_norm": 1.6875, + "learning_rate": 9.983537158292145e-05, + "loss": 7.2559, + "step": 1920 + }, + { + "epoch": 0.11080703500277496, + "grad_norm": 1.6640625, + "learning_rate": 9.98315805567435e-05, + "loss": 7.271, + "step": 1930 + }, + { + "epoch": 0.11138116471781524, + "grad_norm": 1.734375, + "learning_rate": 9.982774645019746e-05, + "loss": 7.2564, + "step": 1940 + }, + { + "epoch": 0.11195529443285553, + "grad_norm": 1.7734375, + "learning_rate": 9.982386926659798e-05, + "loss": 7.2654, + "step": 1950 + }, + { + "epoch": 0.11252942414789581, + "grad_norm": 1.671875, + "learning_rate": 9.981994900929694e-05, + "loss": 7.2521, + "step": 1960 + }, + { + "epoch": 0.1131035538629361, + "grad_norm": 1.75, + "learning_rate": 9.981598568168354e-05, + "loss": 7.2378, + "step": 1970 + }, + { + "epoch": 0.11367768357797639, + "grad_norm": 1.8125, + "learning_rate": 9.98119792871841e-05, + "loss": 7.2558, + "step": 1980 + }, + { + "epoch": 0.11425181329301667, + "grad_norm": 1.6875, + "learning_rate": 9.980792982926224e-05, + "loss": 7.2372, + "step": 1990 + }, + { + "epoch": 0.11482594300805696, + "grad_norm": 1.625, + "learning_rate": 9.98038373114188e-05, + "loss": 7.2435, + "step": 2000 + }, + { + "epoch": 0.11540007272309724, + "grad_norm": 1.578125, + "learning_rate": 9.979970173719186e-05, + "loss": 7.2298, + "step": 2010 + }, + { + "epoch": 0.11597420243813752, + "grad_norm": 1.7109375, + "learning_rate": 9.979552311015666e-05, + "loss": 7.2634, + "step": 2020 + }, + { + "epoch": 0.11654833215317781, + "grad_norm": 1.59375, + "learning_rate": 9.979130143392575e-05, + "loss": 7.2516, + "step": 2030 + }, + { + "epoch": 0.11712246186821809, + "grad_norm": 1.78125, + "learning_rate": 9.978703671214881e-05, + "loss": 7.2689, + "step": 2040 + }, + { + "epoch": 0.11769659158325838, + "grad_norm": 1.578125, + "learning_rate": 9.97827289485128e-05, + "loss": 7.2549, + "step": 2050 + }, + { + "epoch": 0.11827072129829866, + "grad_norm": 1.6953125, + "learning_rate": 9.977837814674186e-05, + "loss": 7.2468, + "step": 2060 + }, + { + "epoch": 0.11884485101333894, + "grad_norm": 1.765625, + "learning_rate": 9.977398431059734e-05, + "loss": 7.2569, + "step": 2070 + }, + { + "epoch": 0.11941898072837923, + "grad_norm": 1.7890625, + "learning_rate": 9.97695474438778e-05, + "loss": 7.2475, + "step": 2080 + }, + { + "epoch": 0.11999311044341952, + "grad_norm": 1.6796875, + "learning_rate": 9.976506755041898e-05, + "loss": 7.2458, + "step": 2090 + }, + { + "epoch": 0.1205672401584598, + "grad_norm": 1.6875, + "learning_rate": 9.976054463409388e-05, + "loss": 7.2387, + "step": 2100 + }, + { + "epoch": 0.12114136987350009, + "grad_norm": 1.65625, + "learning_rate": 9.97559786988126e-05, + "loss": 7.2162, + "step": 2110 + }, + { + "epoch": 0.12171549958854037, + "grad_norm": 1.8828125, + "learning_rate": 9.97513697485225e-05, + "loss": 7.2453, + "step": 2120 + }, + { + "epoch": 0.12228962930358066, + "grad_norm": 1.6015625, + "learning_rate": 9.97467177872081e-05, + "loss": 7.2229, + "step": 2130 + }, + { + "epoch": 0.12286375901862094, + "grad_norm": 1.71875, + "learning_rate": 9.974202281889114e-05, + "loss": 7.2433, + "step": 2140 + }, + { + "epoch": 0.12343788873366122, + "grad_norm": 1.6875, + "learning_rate": 9.973728484763047e-05, + "loss": 7.232, + "step": 2150 + }, + { + "epoch": 0.12401201844870151, + "grad_norm": 1.7265625, + "learning_rate": 9.973250387752217e-05, + "loss": 7.2322, + "step": 2160 + }, + { + "epoch": 0.12458614816374179, + "grad_norm": 1.6796875, + "learning_rate": 9.97276799126995e-05, + "loss": 7.2194, + "step": 2170 + }, + { + "epoch": 0.1251602778787821, + "grad_norm": 1.6015625, + "learning_rate": 9.972281295733286e-05, + "loss": 7.2392, + "step": 2180 + }, + { + "epoch": 0.12573440759382237, + "grad_norm": 1.765625, + "learning_rate": 9.971790301562981e-05, + "loss": 7.2556, + "step": 2190 + }, + { + "epoch": 0.12630853730886266, + "grad_norm": 1.703125, + "learning_rate": 9.971295009183512e-05, + "loss": 7.2522, + "step": 2200 + }, + { + "epoch": 0.12688266702390294, + "grad_norm": 1.625, + "learning_rate": 9.970795419023065e-05, + "loss": 7.2535, + "step": 2210 + }, + { + "epoch": 0.12745679673894322, + "grad_norm": 1.765625, + "learning_rate": 9.970291531513551e-05, + "loss": 7.2208, + "step": 2220 + }, + { + "epoch": 0.1280309264539835, + "grad_norm": 1.734375, + "learning_rate": 9.969783347090585e-05, + "loss": 7.2127, + "step": 2230 + }, + { + "epoch": 0.1286050561690238, + "grad_norm": 1.6640625, + "learning_rate": 9.969270866193506e-05, + "loss": 7.2056, + "step": 2240 + }, + { + "epoch": 0.12917918588406407, + "grad_norm": 1.609375, + "learning_rate": 9.968754089265362e-05, + "loss": 7.231, + "step": 2250 + }, + { + "epoch": 0.12975331559910436, + "grad_norm": 1.6875, + "learning_rate": 9.968233016752917e-05, + "loss": 7.2163, + "step": 2260 + }, + { + "epoch": 0.13032744531414464, + "grad_norm": 1.71875, + "learning_rate": 9.967707649106648e-05, + "loss": 7.2007, + "step": 2270 + }, + { + "epoch": 0.13090157502918492, + "grad_norm": 1.65625, + "learning_rate": 9.967177986780746e-05, + "loss": 7.2608, + "step": 2280 + }, + { + "epoch": 0.1314757047442252, + "grad_norm": 1.7890625, + "learning_rate": 9.966644030233114e-05, + "loss": 7.222, + "step": 2290 + }, + { + "epoch": 0.1320498344592655, + "grad_norm": 1.7265625, + "learning_rate": 9.966105779925367e-05, + "loss": 7.2187, + "step": 2300 + }, + { + "epoch": 0.13262396417430578, + "grad_norm": 1.6953125, + "learning_rate": 9.965563236322836e-05, + "loss": 7.2438, + "step": 2310 + }, + { + "epoch": 0.13319809388934606, + "grad_norm": 1.7890625, + "learning_rate": 9.965016399894556e-05, + "loss": 7.1962, + "step": 2320 + }, + { + "epoch": 0.13377222360438634, + "grad_norm": 1.671875, + "learning_rate": 9.964465271113282e-05, + "loss": 7.222, + "step": 2330 + }, + { + "epoch": 0.13434635331942663, + "grad_norm": 1.6484375, + "learning_rate": 9.963909850455473e-05, + "loss": 7.1955, + "step": 2340 + }, + { + "epoch": 0.1349204830344669, + "grad_norm": 1.671875, + "learning_rate": 9.963350138401299e-05, + "loss": 7.2496, + "step": 2350 + }, + { + "epoch": 0.1354946127495072, + "grad_norm": 1.7578125, + "learning_rate": 9.962786135434648e-05, + "loss": 7.2438, + "step": 2360 + }, + { + "epoch": 0.13606874246454748, + "grad_norm": 1.703125, + "learning_rate": 9.962217842043106e-05, + "loss": 7.2093, + "step": 2370 + }, + { + "epoch": 0.1366428721795878, + "grad_norm": 1.6953125, + "learning_rate": 9.961645258717976e-05, + "loss": 7.1862, + "step": 2380 + }, + { + "epoch": 0.13721700189462807, + "grad_norm": 1.7265625, + "learning_rate": 9.961068385954265e-05, + "loss": 7.2067, + "step": 2390 + }, + { + "epoch": 0.13779113160966835, + "grad_norm": 1.640625, + "learning_rate": 9.960487224250694e-05, + "loss": 7.2353, + "step": 2400 + }, + { + "epoch": 0.13836526132470864, + "grad_norm": 1.6328125, + "learning_rate": 9.959901774109687e-05, + "loss": 7.1877, + "step": 2410 + }, + { + "epoch": 0.13893939103974892, + "grad_norm": 1.6484375, + "learning_rate": 9.959312036037379e-05, + "loss": 7.2055, + "step": 2420 + }, + { + "epoch": 0.1395135207547892, + "grad_norm": 1.7109375, + "learning_rate": 9.958718010543607e-05, + "loss": 7.2765, + "step": 2430 + }, + { + "epoch": 0.1400876504698295, + "grad_norm": 1.6875, + "learning_rate": 9.958119698141917e-05, + "loss": 7.2015, + "step": 2440 + }, + { + "epoch": 0.14066178018486977, + "grad_norm": 1.6640625, + "learning_rate": 9.957517099349564e-05, + "loss": 7.2288, + "step": 2450 + }, + { + "epoch": 0.14123590989991006, + "grad_norm": 1.65625, + "learning_rate": 9.956910214687507e-05, + "loss": 7.251, + "step": 2460 + }, + { + "epoch": 0.14181003961495034, + "grad_norm": 1.6484375, + "learning_rate": 9.956299044680409e-05, + "loss": 7.2318, + "step": 2470 + }, + { + "epoch": 0.14238416932999062, + "grad_norm": 1.890625, + "learning_rate": 9.955683589856634e-05, + "loss": 7.2268, + "step": 2480 + }, + { + "epoch": 0.1429582990450309, + "grad_norm": 1.5859375, + "learning_rate": 9.955063850748263e-05, + "loss": 7.2451, + "step": 2490 + }, + { + "epoch": 0.1435324287600712, + "grad_norm": 1.671875, + "learning_rate": 9.954439827891065e-05, + "loss": 7.2232, + "step": 2500 + }, + { + "epoch": 0.14410655847511147, + "grad_norm": 1.6875, + "learning_rate": 9.953811521824522e-05, + "loss": 7.2054, + "step": 2510 + }, + { + "epoch": 0.14468068819015176, + "grad_norm": 1.609375, + "learning_rate": 9.953178933091818e-05, + "loss": 7.2127, + "step": 2520 + }, + { + "epoch": 0.14525481790519204, + "grad_norm": 1.6875, + "learning_rate": 9.952542062239838e-05, + "loss": 7.2002, + "step": 2530 + }, + { + "epoch": 0.14582894762023232, + "grad_norm": 1.71875, + "learning_rate": 9.951900909819169e-05, + "loss": 7.235, + "step": 2540 + }, + { + "epoch": 0.1464030773352726, + "grad_norm": 1.7265625, + "learning_rate": 9.9512554763841e-05, + "loss": 7.2237, + "step": 2550 + }, + { + "epoch": 0.1469772070503129, + "grad_norm": 1.6953125, + "learning_rate": 9.95060576249262e-05, + "loss": 7.2328, + "step": 2560 + }, + { + "epoch": 0.14755133676535317, + "grad_norm": 1.6875, + "learning_rate": 9.949951768706419e-05, + "loss": 7.2167, + "step": 2570 + }, + { + "epoch": 0.14812546648039346, + "grad_norm": 1.65625, + "learning_rate": 9.949293495590888e-05, + "loss": 7.1892, + "step": 2580 + }, + { + "epoch": 0.14869959619543374, + "grad_norm": 1.6015625, + "learning_rate": 9.948630943715118e-05, + "loss": 7.1999, + "step": 2590 + }, + { + "epoch": 0.14927372591047405, + "grad_norm": 1.65625, + "learning_rate": 9.947964113651896e-05, + "loss": 7.2392, + "step": 2600 + }, + { + "epoch": 0.14984785562551434, + "grad_norm": 1.6640625, + "learning_rate": 9.947293005977709e-05, + "loss": 7.1989, + "step": 2610 + }, + { + "epoch": 0.15042198534055462, + "grad_norm": 1.71875, + "learning_rate": 9.946617621272745e-05, + "loss": 7.1833, + "step": 2620 + }, + { + "epoch": 0.1509961150555949, + "grad_norm": 1.6953125, + "learning_rate": 9.945937960120886e-05, + "loss": 7.2117, + "step": 2630 + }, + { + "epoch": 0.1515702447706352, + "grad_norm": 1.6953125, + "learning_rate": 9.945254023109712e-05, + "loss": 7.1973, + "step": 2640 + }, + { + "epoch": 0.15214437448567547, + "grad_norm": 1.6171875, + "learning_rate": 9.9445658108305e-05, + "loss": 7.1816, + "step": 2650 + }, + { + "epoch": 0.15271850420071575, + "grad_norm": 1.640625, + "learning_rate": 9.943873323878221e-05, + "loss": 7.1913, + "step": 2660 + }, + { + "epoch": 0.15329263391575604, + "grad_norm": 1.625, + "learning_rate": 9.943176562851548e-05, + "loss": 7.2218, + "step": 2670 + }, + { + "epoch": 0.15386676363079632, + "grad_norm": 1.75, + "learning_rate": 9.942475528352842e-05, + "loss": 7.2328, + "step": 2680 + }, + { + "epoch": 0.1544408933458366, + "grad_norm": 1.6796875, + "learning_rate": 9.941770220988158e-05, + "loss": 7.1863, + "step": 2690 + }, + { + "epoch": 0.1550150230608769, + "grad_norm": 1.6015625, + "learning_rate": 9.941060641367253e-05, + "loss": 7.2014, + "step": 2700 + }, + { + "epoch": 0.15558915277591717, + "grad_norm": 1.703125, + "learning_rate": 9.940346790103569e-05, + "loss": 7.2158, + "step": 2710 + }, + { + "epoch": 0.15616328249095746, + "grad_norm": 1.7890625, + "learning_rate": 9.939628667814244e-05, + "loss": 7.1862, + "step": 2720 + }, + { + "epoch": 0.15673741220599774, + "grad_norm": 1.6796875, + "learning_rate": 9.938906275120112e-05, + "loss": 7.169, + "step": 2730 + }, + { + "epoch": 0.15731154192103802, + "grad_norm": 1.671875, + "learning_rate": 9.938179612645693e-05, + "loss": 7.2047, + "step": 2740 + }, + { + "epoch": 0.1578856716360783, + "grad_norm": 1.6484375, + "learning_rate": 9.9374486810192e-05, + "loss": 7.2038, + "step": 2750 + }, + { + "epoch": 0.1584598013511186, + "grad_norm": 1.65625, + "learning_rate": 9.93671348087254e-05, + "loss": 7.2069, + "step": 2760 + }, + { + "epoch": 0.15903393106615887, + "grad_norm": 1.671875, + "learning_rate": 9.935974012841305e-05, + "loss": 7.2183, + "step": 2770 + }, + { + "epoch": 0.15960806078119916, + "grad_norm": 1.5859375, + "learning_rate": 9.935230277564782e-05, + "loss": 7.2102, + "step": 2780 + }, + { + "epoch": 0.16018219049623944, + "grad_norm": 1.8515625, + "learning_rate": 9.934482275685943e-05, + "loss": 7.2052, + "step": 2790 + }, + { + "epoch": 0.16075632021127972, + "grad_norm": 1.6796875, + "learning_rate": 9.933730007851451e-05, + "loss": 7.1924, + "step": 2800 + }, + { + "epoch": 0.16133044992632, + "grad_norm": 1.6484375, + "learning_rate": 9.932973474711655e-05, + "loss": 7.2124, + "step": 2810 + }, + { + "epoch": 0.1619045796413603, + "grad_norm": 1.625, + "learning_rate": 9.932212676920595e-05, + "loss": 7.2017, + "step": 2820 + }, + { + "epoch": 0.1624787093564006, + "grad_norm": 1.5859375, + "learning_rate": 9.931447615135994e-05, + "loss": 7.2266, + "step": 2830 + }, + { + "epoch": 0.16305283907144089, + "grad_norm": 1.5703125, + "learning_rate": 9.93067829001926e-05, + "loss": 7.1716, + "step": 2840 + }, + { + "epoch": 0.16362696878648117, + "grad_norm": 1.703125, + "learning_rate": 9.929904702235495e-05, + "loss": 7.1671, + "step": 2850 + }, + { + "epoch": 0.16420109850152145, + "grad_norm": 1.671875, + "learning_rate": 9.929126852453477e-05, + "loss": 7.1841, + "step": 2860 + }, + { + "epoch": 0.16477522821656174, + "grad_norm": 1.6953125, + "learning_rate": 9.928344741345672e-05, + "loss": 7.2104, + "step": 2870 + }, + { + "epoch": 0.16534935793160202, + "grad_norm": 1.578125, + "learning_rate": 9.92755836958823e-05, + "loss": 7.1866, + "step": 2880 + }, + { + "epoch": 0.1659234876466423, + "grad_norm": 1.7109375, + "learning_rate": 9.926767737860988e-05, + "loss": 7.1634, + "step": 2890 + }, + { + "epoch": 0.1664976173616826, + "grad_norm": 1.6484375, + "learning_rate": 9.92597284684746e-05, + "loss": 7.2023, + "step": 2900 + }, + { + "epoch": 0.16707174707672287, + "grad_norm": 1.7578125, + "learning_rate": 9.925173697234844e-05, + "loss": 7.1846, + "step": 2910 + }, + { + "epoch": 0.16764587679176315, + "grad_norm": 1.6953125, + "learning_rate": 9.924370289714022e-05, + "loss": 7.1909, + "step": 2920 + }, + { + "epoch": 0.16822000650680344, + "grad_norm": 1.5859375, + "learning_rate": 9.923562624979555e-05, + "loss": 7.1852, + "step": 2930 + }, + { + "epoch": 0.16879413622184372, + "grad_norm": 1.6484375, + "learning_rate": 9.922750703729684e-05, + "loss": 7.1982, + "step": 2940 + }, + { + "epoch": 0.169368265936884, + "grad_norm": 1.65625, + "learning_rate": 9.921934526666332e-05, + "loss": 7.1717, + "step": 2950 + }, + { + "epoch": 0.1699423956519243, + "grad_norm": 1.6796875, + "learning_rate": 9.921114094495099e-05, + "loss": 7.1865, + "step": 2960 + }, + { + "epoch": 0.17051652536696457, + "grad_norm": 1.6328125, + "learning_rate": 9.920289407925263e-05, + "loss": 7.194, + "step": 2970 + }, + { + "epoch": 0.17109065508200486, + "grad_norm": 1.65625, + "learning_rate": 9.919460467669785e-05, + "loss": 7.1622, + "step": 2980 + }, + { + "epoch": 0.17166478479704514, + "grad_norm": 1.6953125, + "learning_rate": 9.918627274445297e-05, + "loss": 7.1727, + "step": 2990 + }, + { + "epoch": 0.17223891451208542, + "grad_norm": 1.6328125, + "learning_rate": 9.917789828972113e-05, + "loss": 7.2052, + "step": 3000 + }, + { + "epoch": 0.1728130442271257, + "grad_norm": 1.6015625, + "learning_rate": 9.916948131974217e-05, + "loss": 7.2079, + "step": 3010 + }, + { + "epoch": 0.173387173942166, + "grad_norm": 1.8203125, + "learning_rate": 9.916102184179279e-05, + "loss": 7.1754, + "step": 3020 + }, + { + "epoch": 0.17396130365720627, + "grad_norm": 1.703125, + "learning_rate": 9.91525198631863e-05, + "loss": 7.1914, + "step": 3030 + }, + { + "epoch": 0.17453543337224656, + "grad_norm": 1.6875, + "learning_rate": 9.914397539127289e-05, + "loss": 7.1679, + "step": 3040 + }, + { + "epoch": 0.17510956308728687, + "grad_norm": 1.5859375, + "learning_rate": 9.913538843343936e-05, + "loss": 7.181, + "step": 3050 + }, + { + "epoch": 0.17568369280232715, + "grad_norm": 1.53125, + "learning_rate": 9.912675899710934e-05, + "loss": 7.1929, + "step": 3060 + }, + { + "epoch": 0.17625782251736744, + "grad_norm": 1.65625, + "learning_rate": 9.911808708974315e-05, + "loss": 7.174, + "step": 3070 + }, + { + "epoch": 0.17683195223240772, + "grad_norm": 1.640625, + "learning_rate": 9.91093727188378e-05, + "loss": 7.1991, + "step": 3080 + }, + { + "epoch": 0.177406081947448, + "grad_norm": 1.640625, + "learning_rate": 9.910061589192705e-05, + "loss": 7.1869, + "step": 3090 + }, + { + "epoch": 0.17798021166248829, + "grad_norm": 1.7421875, + "learning_rate": 9.909181661658134e-05, + "loss": 7.1955, + "step": 3100 + }, + { + "epoch": 0.17855434137752857, + "grad_norm": 1.65625, + "learning_rate": 9.908297490040778e-05, + "loss": 7.1625, + "step": 3110 + }, + { + "epoch": 0.17912847109256885, + "grad_norm": 1.78125, + "learning_rate": 9.907409075105028e-05, + "loss": 7.201, + "step": 3120 + }, + { + "epoch": 0.17970260080760914, + "grad_norm": 1.6796875, + "learning_rate": 9.90651641761893e-05, + "loss": 7.167, + "step": 3130 + }, + { + "epoch": 0.18027673052264942, + "grad_norm": 1.671875, + "learning_rate": 9.905619518354205e-05, + "loss": 7.1732, + "step": 3140 + }, + { + "epoch": 0.1808508602376897, + "grad_norm": 1.640625, + "learning_rate": 9.904718378086242e-05, + "loss": 7.175, + "step": 3150 + }, + { + "epoch": 0.18142498995273, + "grad_norm": 1.7109375, + "learning_rate": 9.903812997594092e-05, + "loss": 7.169, + "step": 3160 + }, + { + "epoch": 0.18199911966777027, + "grad_norm": 1.6484375, + "learning_rate": 9.902903377660473e-05, + "loss": 7.1731, + "step": 3170 + }, + { + "epoch": 0.18257324938281055, + "grad_norm": 1.921875, + "learning_rate": 9.901989519071773e-05, + "loss": 7.1691, + "step": 3180 + }, + { + "epoch": 0.18314737909785084, + "grad_norm": 1.6484375, + "learning_rate": 9.901071422618036e-05, + "loss": 7.1727, + "step": 3190 + }, + { + "epoch": 0.18372150881289112, + "grad_norm": 1.59375, + "learning_rate": 9.900149089092978e-05, + "loss": 7.1871, + "step": 3200 + }, + { + "epoch": 0.1842956385279314, + "grad_norm": 1.6875, + "learning_rate": 9.899222519293971e-05, + "loss": 7.1621, + "step": 3210 + }, + { + "epoch": 0.1848697682429717, + "grad_norm": 1.6953125, + "learning_rate": 9.898291714022055e-05, + "loss": 7.1787, + "step": 3220 + }, + { + "epoch": 0.18544389795801197, + "grad_norm": 1.65625, + "learning_rate": 9.897356674081928e-05, + "loss": 7.1843, + "step": 3230 + }, + { + "epoch": 0.18601802767305226, + "grad_norm": 1.671875, + "learning_rate": 9.89641740028195e-05, + "loss": 7.1441, + "step": 3240 + }, + { + "epoch": 0.18659215738809254, + "grad_norm": 1.6171875, + "learning_rate": 9.895473893434142e-05, + "loss": 7.1718, + "step": 3250 + }, + { + "epoch": 0.18716628710313282, + "grad_norm": 1.75, + "learning_rate": 9.894526154354185e-05, + "loss": 7.177, + "step": 3260 + }, + { + "epoch": 0.1877404168181731, + "grad_norm": 1.7109375, + "learning_rate": 9.893574183861417e-05, + "loss": 7.1814, + "step": 3270 + }, + { + "epoch": 0.18831454653321342, + "grad_norm": 1.734375, + "learning_rate": 9.892617982778833e-05, + "loss": 7.166, + "step": 3280 + }, + { + "epoch": 0.1888886762482537, + "grad_norm": 1.7734375, + "learning_rate": 9.89165755193309e-05, + "loss": 7.1755, + "step": 3290 + }, + { + "epoch": 0.18946280596329398, + "grad_norm": 1.65625, + "learning_rate": 9.890692892154498e-05, + "loss": 7.1605, + "step": 3300 + }, + { + "epoch": 0.19003693567833427, + "grad_norm": 1.6640625, + "learning_rate": 9.889724004277023e-05, + "loss": 7.1718, + "step": 3310 + }, + { + "epoch": 0.19061106539337455, + "grad_norm": 1.546875, + "learning_rate": 9.88875088913829e-05, + "loss": 7.174, + "step": 3320 + }, + { + "epoch": 0.19118519510841483, + "grad_norm": 1.78125, + "learning_rate": 9.887773547579574e-05, + "loss": 7.1784, + "step": 3330 + }, + { + "epoch": 0.19175932482345512, + "grad_norm": 1.7109375, + "learning_rate": 9.886791980445806e-05, + "loss": 7.1844, + "step": 3340 + }, + { + "epoch": 0.1923334545384954, + "grad_norm": 1.6796875, + "learning_rate": 9.885806188585571e-05, + "loss": 7.1523, + "step": 3350 + }, + { + "epoch": 0.19290758425353569, + "grad_norm": 1.734375, + "learning_rate": 9.884816172851104e-05, + "loss": 7.1403, + "step": 3360 + }, + { + "epoch": 0.19348171396857597, + "grad_norm": 1.59375, + "learning_rate": 9.883821934098292e-05, + "loss": 7.1545, + "step": 3370 + }, + { + "epoch": 0.19405584368361625, + "grad_norm": 1.578125, + "learning_rate": 9.882823473186675e-05, + "loss": 7.1925, + "step": 3380 + }, + { + "epoch": 0.19462997339865654, + "grad_norm": 1.6328125, + "learning_rate": 9.881820790979443e-05, + "loss": 7.1765, + "step": 3390 + }, + { + "epoch": 0.19520410311369682, + "grad_norm": 1.65625, + "learning_rate": 9.880813888343431e-05, + "loss": 7.1598, + "step": 3400 + }, + { + "epoch": 0.1957782328287371, + "grad_norm": 1.609375, + "learning_rate": 9.87980276614913e-05, + "loss": 7.1651, + "step": 3410 + }, + { + "epoch": 0.1963523625437774, + "grad_norm": 1.609375, + "learning_rate": 9.87878742527067e-05, + "loss": 7.1966, + "step": 3420 + }, + { + "epoch": 0.19692649225881767, + "grad_norm": 1.7109375, + "learning_rate": 9.877767866585837e-05, + "loss": 7.1851, + "step": 3430 + }, + { + "epoch": 0.19750062197385795, + "grad_norm": 1.6796875, + "learning_rate": 9.876744090976056e-05, + "loss": 7.1835, + "step": 3440 + }, + { + "epoch": 0.19807475168889824, + "grad_norm": 1.6953125, + "learning_rate": 9.875716099326404e-05, + "loss": 7.2052, + "step": 3450 + }, + { + "epoch": 0.19864888140393852, + "grad_norm": 1.609375, + "learning_rate": 9.874683892525598e-05, + "loss": 7.1426, + "step": 3460 + }, + { + "epoch": 0.1992230111189788, + "grad_norm": 1.71875, + "learning_rate": 9.873647471466e-05, + "loss": 7.1271, + "step": 3470 + }, + { + "epoch": 0.1997971408340191, + "grad_norm": 1.5859375, + "learning_rate": 9.872606837043617e-05, + "loss": 7.1351, + "step": 3480 + }, + { + "epoch": 0.20037127054905937, + "grad_norm": 1.671875, + "learning_rate": 9.871561990158097e-05, + "loss": 7.1292, + "step": 3490 + }, + { + "epoch": 0.20094540026409966, + "grad_norm": 1.6328125, + "learning_rate": 9.870512931712734e-05, + "loss": 7.1518, + "step": 3500 + }, + { + "epoch": 0.20151952997913997, + "grad_norm": 1.703125, + "learning_rate": 9.869459662614455e-05, + "loss": 7.1549, + "step": 3510 + }, + { + "epoch": 0.20209365969418025, + "grad_norm": 1.6328125, + "learning_rate": 9.868402183773833e-05, + "loss": 7.1718, + "step": 3520 + }, + { + "epoch": 0.20266778940922053, + "grad_norm": 1.6328125, + "learning_rate": 9.867340496105079e-05, + "loss": 7.1645, + "step": 3530 + }, + { + "epoch": 0.20324191912426082, + "grad_norm": 1.625, + "learning_rate": 9.866274600526043e-05, + "loss": 7.1599, + "step": 3540 + }, + { + "epoch": 0.2038160488393011, + "grad_norm": 1.75, + "learning_rate": 9.865204497958211e-05, + "loss": 7.1633, + "step": 3550 + }, + { + "epoch": 0.20439017855434138, + "grad_norm": 1.65625, + "learning_rate": 9.864130189326709e-05, + "loss": 7.1397, + "step": 3560 + }, + { + "epoch": 0.20496430826938167, + "grad_norm": 1.640625, + "learning_rate": 9.863051675560297e-05, + "loss": 7.1739, + "step": 3570 + }, + { + "epoch": 0.20553843798442195, + "grad_norm": 1.8515625, + "learning_rate": 9.861968957591372e-05, + "loss": 7.157, + "step": 3580 + }, + { + "epoch": 0.20611256769946223, + "grad_norm": 1.6484375, + "learning_rate": 9.860882036355962e-05, + "loss": 7.1219, + "step": 3590 + }, + { + "epoch": 0.20668669741450252, + "grad_norm": 1.6953125, + "learning_rate": 9.859790912793737e-05, + "loss": 7.1577, + "step": 3600 + }, + { + "epoch": 0.2072608271295428, + "grad_norm": 1.6640625, + "learning_rate": 9.858695587847987e-05, + "loss": 7.1883, + "step": 3610 + }, + { + "epoch": 0.20783495684458309, + "grad_norm": 1.609375, + "learning_rate": 9.857596062465648e-05, + "loss": 7.1774, + "step": 3620 + }, + { + "epoch": 0.20840908655962337, + "grad_norm": 1.625, + "learning_rate": 9.856492337597276e-05, + "loss": 7.1515, + "step": 3630 + }, + { + "epoch": 0.20898321627466365, + "grad_norm": 1.71875, + "learning_rate": 9.855384414197067e-05, + "loss": 7.1831, + "step": 3640 + }, + { + "epoch": 0.20955734598970394, + "grad_norm": 1.6796875, + "learning_rate": 9.854272293222841e-05, + "loss": 7.1699, + "step": 3650 + }, + { + "epoch": 0.21013147570474422, + "grad_norm": 1.6640625, + "learning_rate": 9.853155975636045e-05, + "loss": 7.1675, + "step": 3660 + }, + { + "epoch": 0.2107056054197845, + "grad_norm": 1.6875, + "learning_rate": 9.852035462401764e-05, + "loss": 7.1572, + "step": 3670 + }, + { + "epoch": 0.2112797351348248, + "grad_norm": 1.5546875, + "learning_rate": 9.850910754488697e-05, + "loss": 7.1519, + "step": 3680 + }, + { + "epoch": 0.21185386484986507, + "grad_norm": 1.6953125, + "learning_rate": 9.849781852869176e-05, + "loss": 7.1442, + "step": 3690 + }, + { + "epoch": 0.21242799456490535, + "grad_norm": 1.609375, + "learning_rate": 9.848648758519161e-05, + "loss": 7.1557, + "step": 3700 + }, + { + "epoch": 0.21300212427994564, + "grad_norm": 1.6171875, + "learning_rate": 9.847511472418235e-05, + "loss": 7.1443, + "step": 3710 + }, + { + "epoch": 0.21357625399498592, + "grad_norm": 1.6796875, + "learning_rate": 9.846369995549601e-05, + "loss": 7.1367, + "step": 3720 + }, + { + "epoch": 0.21415038371002623, + "grad_norm": 1.703125, + "learning_rate": 9.84522432890009e-05, + "loss": 7.1849, + "step": 3730 + }, + { + "epoch": 0.21472451342506652, + "grad_norm": 1.6484375, + "learning_rate": 9.844074473460152e-05, + "loss": 7.1564, + "step": 3740 + }, + { + "epoch": 0.2152986431401068, + "grad_norm": 1.609375, + "learning_rate": 9.842920430223858e-05, + "loss": 7.1428, + "step": 3750 + }, + { + "epoch": 0.21587277285514708, + "grad_norm": 1.7265625, + "learning_rate": 9.841762200188904e-05, + "loss": 7.1381, + "step": 3760 + }, + { + "epoch": 0.21644690257018737, + "grad_norm": 1.609375, + "learning_rate": 9.840599784356601e-05, + "loss": 7.1563, + "step": 3770 + }, + { + "epoch": 0.21702103228522765, + "grad_norm": 1.609375, + "learning_rate": 9.839433183731879e-05, + "loss": 7.1821, + "step": 3780 + }, + { + "epoch": 0.21759516200026793, + "grad_norm": 1.703125, + "learning_rate": 9.838262399323288e-05, + "loss": 7.1049, + "step": 3790 + }, + { + "epoch": 0.21816929171530822, + "grad_norm": 1.625, + "learning_rate": 9.837087432142993e-05, + "loss": 7.1677, + "step": 3800 + }, + { + "epoch": 0.2187434214303485, + "grad_norm": 1.5546875, + "learning_rate": 9.835908283206777e-05, + "loss": 7.1602, + "step": 3810 + }, + { + "epoch": 0.21931755114538878, + "grad_norm": 1.6640625, + "learning_rate": 9.834724953534036e-05, + "loss": 7.158, + "step": 3820 + }, + { + "epoch": 0.21989168086042907, + "grad_norm": 1.75, + "learning_rate": 9.833537444147781e-05, + "loss": 7.1471, + "step": 3830 + }, + { + "epoch": 0.22046581057546935, + "grad_norm": 1.703125, + "learning_rate": 9.832345756074639e-05, + "loss": 7.1652, + "step": 3840 + }, + { + "epoch": 0.22103994029050963, + "grad_norm": 1.5625, + "learning_rate": 9.831149890344846e-05, + "loss": 7.1406, + "step": 3850 + }, + { + "epoch": 0.22161407000554992, + "grad_norm": 1.625, + "learning_rate": 9.82994984799225e-05, + "loss": 7.1428, + "step": 3860 + }, + { + "epoch": 0.2221881997205902, + "grad_norm": 1.6328125, + "learning_rate": 9.828745630054314e-05, + "loss": 7.1159, + "step": 3870 + }, + { + "epoch": 0.22276232943563049, + "grad_norm": 1.671875, + "learning_rate": 9.827537237572107e-05, + "loss": 7.1448, + "step": 3880 + }, + { + "epoch": 0.22333645915067077, + "grad_norm": 1.7109375, + "learning_rate": 9.826324671590303e-05, + "loss": 7.132, + "step": 3890 + }, + { + "epoch": 0.22391058886571105, + "grad_norm": 1.6328125, + "learning_rate": 9.825107933157196e-05, + "loss": 7.1557, + "step": 3900 + }, + { + "epoch": 0.22448471858075134, + "grad_norm": 1.71875, + "learning_rate": 9.823887023324675e-05, + "loss": 7.1258, + "step": 3910 + }, + { + "epoch": 0.22505884829579162, + "grad_norm": 1.6328125, + "learning_rate": 9.822661943148243e-05, + "loss": 7.1438, + "step": 3920 + }, + { + "epoch": 0.2256329780108319, + "grad_norm": 1.59375, + "learning_rate": 9.821432693687004e-05, + "loss": 7.1421, + "step": 3930 + }, + { + "epoch": 0.2262071077258722, + "grad_norm": 1.640625, + "learning_rate": 9.820199276003667e-05, + "loss": 7.1424, + "step": 3940 + }, + { + "epoch": 0.22678123744091247, + "grad_norm": 1.59375, + "learning_rate": 9.818961691164548e-05, + "loss": 7.1514, + "step": 3950 + }, + { + "epoch": 0.22735536715595278, + "grad_norm": 1.609375, + "learning_rate": 9.817719940239563e-05, + "loss": 7.128, + "step": 3960 + }, + { + "epoch": 0.22792949687099306, + "grad_norm": 1.703125, + "learning_rate": 9.816474024302228e-05, + "loss": 7.1122, + "step": 3970 + }, + { + "epoch": 0.22850362658603335, + "grad_norm": 1.78125, + "learning_rate": 9.815223944429662e-05, + "loss": 7.1329, + "step": 3980 + }, + { + "epoch": 0.22907775630107363, + "grad_norm": 1.6015625, + "learning_rate": 9.813969701702583e-05, + "loss": 7.1356, + "step": 3990 + }, + { + "epoch": 0.22965188601611392, + "grad_norm": 1.6953125, + "learning_rate": 9.81271129720531e-05, + "loss": 7.1245, + "step": 4000 + }, + { + "epoch": 0.2302260157311542, + "grad_norm": 1.6328125, + "learning_rate": 9.811448732025757e-05, + "loss": 7.1262, + "step": 4010 + }, + { + "epoch": 0.23080014544619448, + "grad_norm": 1.578125, + "learning_rate": 9.810182007255435e-05, + "loss": 7.1314, + "step": 4020 + }, + { + "epoch": 0.23137427516123477, + "grad_norm": 1.7109375, + "learning_rate": 9.808911123989452e-05, + "loss": 7.134, + "step": 4030 + }, + { + "epoch": 0.23194840487627505, + "grad_norm": 1.640625, + "learning_rate": 9.807636083326515e-05, + "loss": 7.1211, + "step": 4040 + }, + { + "epoch": 0.23252253459131533, + "grad_norm": 1.625, + "learning_rate": 9.806356886368917e-05, + "loss": 7.1423, + "step": 4050 + }, + { + "epoch": 0.23309666430635562, + "grad_norm": 1.640625, + "learning_rate": 9.80507353422255e-05, + "loss": 7.1196, + "step": 4060 + }, + { + "epoch": 0.2336707940213959, + "grad_norm": 1.6171875, + "learning_rate": 9.803786027996899e-05, + "loss": 7.1225, + "step": 4070 + }, + { + "epoch": 0.23424492373643618, + "grad_norm": 1.6875, + "learning_rate": 9.802494368805035e-05, + "loss": 7.1493, + "step": 4080 + }, + { + "epoch": 0.23481905345147647, + "grad_norm": 1.765625, + "learning_rate": 9.801198557763623e-05, + "loss": 7.1441, + "step": 4090 + }, + { + "epoch": 0.23539318316651675, + "grad_norm": 1.671875, + "learning_rate": 9.799898595992919e-05, + "loss": 7.1074, + "step": 4100 + }, + { + "epoch": 0.23596731288155703, + "grad_norm": 1.7109375, + "learning_rate": 9.798594484616762e-05, + "loss": 7.1399, + "step": 4110 + }, + { + "epoch": 0.23654144259659732, + "grad_norm": 1.671875, + "learning_rate": 9.797286224762584e-05, + "loss": 7.1218, + "step": 4120 + }, + { + "epoch": 0.2371155723116376, + "grad_norm": 1.59375, + "learning_rate": 9.7959738175614e-05, + "loss": 7.1224, + "step": 4130 + }, + { + "epoch": 0.23768970202667788, + "grad_norm": 1.640625, + "learning_rate": 9.794657264147811e-05, + "loss": 7.1262, + "step": 4140 + }, + { + "epoch": 0.23826383174171817, + "grad_norm": 1.703125, + "learning_rate": 9.793336565660005e-05, + "loss": 7.1277, + "step": 4150 + }, + { + "epoch": 0.23883796145675845, + "grad_norm": 1.6015625, + "learning_rate": 9.792011723239751e-05, + "loss": 7.1333, + "step": 4160 + }, + { + "epoch": 0.23941209117179874, + "grad_norm": 1.6875, + "learning_rate": 9.790682738032397e-05, + "loss": 7.1199, + "step": 4170 + }, + { + "epoch": 0.23998622088683905, + "grad_norm": 1.609375, + "learning_rate": 9.789349611186882e-05, + "loss": 7.1562, + "step": 4180 + }, + { + "epoch": 0.24056035060187933, + "grad_norm": 1.7265625, + "learning_rate": 9.788012343855716e-05, + "loss": 7.1346, + "step": 4190 + }, + { + "epoch": 0.2411344803169196, + "grad_norm": 1.671875, + "learning_rate": 9.786670937194996e-05, + "loss": 7.1284, + "step": 4200 + }, + { + "epoch": 0.2417086100319599, + "grad_norm": 1.609375, + "learning_rate": 9.785325392364391e-05, + "loss": 7.1572, + "step": 4210 + }, + { + "epoch": 0.24228273974700018, + "grad_norm": 1.6171875, + "learning_rate": 9.783975710527154e-05, + "loss": 7.1039, + "step": 4220 + }, + { + "epoch": 0.24285686946204046, + "grad_norm": 1.6875, + "learning_rate": 9.782621892850106e-05, + "loss": 7.108, + "step": 4230 + }, + { + "epoch": 0.24343099917708075, + "grad_norm": 1.625, + "learning_rate": 9.781263940503653e-05, + "loss": 7.1546, + "step": 4240 + }, + { + "epoch": 0.24400512889212103, + "grad_norm": 1.6328125, + "learning_rate": 9.77990185466177e-05, + "loss": 7.1348, + "step": 4250 + }, + { + "epoch": 0.24457925860716132, + "grad_norm": 1.7578125, + "learning_rate": 9.778535636502005e-05, + "loss": 7.1317, + "step": 4260 + }, + { + "epoch": 0.2451533883222016, + "grad_norm": 1.640625, + "learning_rate": 9.777165287205484e-05, + "loss": 7.1193, + "step": 4270 + }, + { + "epoch": 0.24572751803724188, + "grad_norm": 1.5859375, + "learning_rate": 9.775790807956894e-05, + "loss": 7.1458, + "step": 4280 + }, + { + "epoch": 0.24630164775228217, + "grad_norm": 1.5390625, + "learning_rate": 9.774412199944507e-05, + "loss": 7.0719, + "step": 4290 + }, + { + "epoch": 0.24687577746732245, + "grad_norm": 1.6328125, + "learning_rate": 9.773029464360151e-05, + "loss": 7.1046, + "step": 4300 + }, + { + "epoch": 0.24744990718236273, + "grad_norm": 1.6796875, + "learning_rate": 9.771642602399229e-05, + "loss": 7.1422, + "step": 4310 + }, + { + "epoch": 0.24802403689740302, + "grad_norm": 1.6875, + "learning_rate": 9.77025161526071e-05, + "loss": 7.0738, + "step": 4320 + }, + { + "epoch": 0.2485981666124433, + "grad_norm": 1.5078125, + "learning_rate": 9.76885650414713e-05, + "loss": 7.0958, + "step": 4330 + }, + { + "epoch": 0.24917229632748358, + "grad_norm": 1.640625, + "learning_rate": 9.76745727026459e-05, + "loss": 7.1058, + "step": 4340 + }, + { + "epoch": 0.24974642604252387, + "grad_norm": 1.734375, + "learning_rate": 9.766053914822754e-05, + "loss": 7.1442, + "step": 4350 + }, + { + "epoch": 0.2503205557575642, + "grad_norm": 1.71875, + "learning_rate": 9.764646439034849e-05, + "loss": 7.1123, + "step": 4360 + }, + { + "epoch": 0.25089468547260446, + "grad_norm": 1.546875, + "learning_rate": 9.763234844117666e-05, + "loss": 7.1233, + "step": 4370 + }, + { + "epoch": 0.25146881518764475, + "grad_norm": 1.7109375, + "learning_rate": 9.761819131291557e-05, + "loss": 7.1272, + "step": 4380 + }, + { + "epoch": 0.25204294490268503, + "grad_norm": 1.6796875, + "learning_rate": 9.760399301780433e-05, + "loss": 7.0923, + "step": 4390 + }, + { + "epoch": 0.2526170746177253, + "grad_norm": 1.5859375, + "learning_rate": 9.758975356811763e-05, + "loss": 7.1102, + "step": 4400 + }, + { + "epoch": 0.2531912043327656, + "grad_norm": 1.6328125, + "learning_rate": 9.757547297616576e-05, + "loss": 7.1003, + "step": 4410 + }, + { + "epoch": 0.2537653340478059, + "grad_norm": 1.6953125, + "learning_rate": 9.756115125429457e-05, + "loss": 7.1449, + "step": 4420 + }, + { + "epoch": 0.25433946376284616, + "grad_norm": 1.6875, + "learning_rate": 9.754678841488545e-05, + "loss": 7.1108, + "step": 4430 + }, + { + "epoch": 0.25491359347788645, + "grad_norm": 1.5625, + "learning_rate": 9.753238447035537e-05, + "loss": 7.1287, + "step": 4440 + }, + { + "epoch": 0.25548772319292673, + "grad_norm": 1.703125, + "learning_rate": 9.751793943315683e-05, + "loss": 7.0953, + "step": 4450 + }, + { + "epoch": 0.256061852907967, + "grad_norm": 1.5703125, + "learning_rate": 9.750345331577781e-05, + "loss": 7.1042, + "step": 4460 + }, + { + "epoch": 0.2566359826230073, + "grad_norm": 1.6328125, + "learning_rate": 9.748892613074188e-05, + "loss": 7.1171, + "step": 4470 + }, + { + "epoch": 0.2572101123380476, + "grad_norm": 1.6171875, + "learning_rate": 9.747435789060804e-05, + "loss": 7.1258, + "step": 4480 + }, + { + "epoch": 0.25778424205308786, + "grad_norm": 1.71875, + "learning_rate": 9.745974860797084e-05, + "loss": 7.116, + "step": 4490 + }, + { + "epoch": 0.25835837176812815, + "grad_norm": 1.640625, + "learning_rate": 9.744509829546027e-05, + "loss": 7.1304, + "step": 4500 + }, + { + "epoch": 0.25893250148316843, + "grad_norm": 1.609375, + "learning_rate": 9.743040696574182e-05, + "loss": 7.1077, + "step": 4510 + }, + { + "epoch": 0.2595066311982087, + "grad_norm": 1.6875, + "learning_rate": 9.741567463151642e-05, + "loss": 7.1422, + "step": 4520 + }, + { + "epoch": 0.260080760913249, + "grad_norm": 1.5625, + "learning_rate": 9.740090130552046e-05, + "loss": 7.1365, + "step": 4530 + }, + { + "epoch": 0.2606548906282893, + "grad_norm": 1.71875, + "learning_rate": 9.73860870005258e-05, + "loss": 7.0826, + "step": 4540 + }, + { + "epoch": 0.26122902034332957, + "grad_norm": 1.578125, + "learning_rate": 9.737123172933964e-05, + "loss": 7.1016, + "step": 4550 + }, + { + "epoch": 0.26180315005836985, + "grad_norm": 1.6640625, + "learning_rate": 9.735633550480469e-05, + "loss": 7.1019, + "step": 4560 + }, + { + "epoch": 0.26237727977341013, + "grad_norm": 1.6171875, + "learning_rate": 9.7341398339799e-05, + "loss": 7.0796, + "step": 4570 + }, + { + "epoch": 0.2629514094884504, + "grad_norm": 1.640625, + "learning_rate": 9.732642024723605e-05, + "loss": 7.1059, + "step": 4580 + }, + { + "epoch": 0.2635255392034907, + "grad_norm": 1.578125, + "learning_rate": 9.731140124006471e-05, + "loss": 7.1171, + "step": 4590 + }, + { + "epoch": 0.264099668918531, + "grad_norm": 1.59375, + "learning_rate": 9.729634133126917e-05, + "loss": 7.1195, + "step": 4600 + }, + { + "epoch": 0.26467379863357127, + "grad_norm": 1.765625, + "learning_rate": 9.728124053386905e-05, + "loss": 7.1011, + "step": 4610 + }, + { + "epoch": 0.26524792834861155, + "grad_norm": 1.71875, + "learning_rate": 9.726609886091925e-05, + "loss": 7.1241, + "step": 4620 + }, + { + "epoch": 0.26582205806365183, + "grad_norm": 1.640625, + "learning_rate": 9.725091632551002e-05, + "loss": 7.1232, + "step": 4630 + }, + { + "epoch": 0.2663961877786921, + "grad_norm": 1.65625, + "learning_rate": 9.723569294076702e-05, + "loss": 7.1256, + "step": 4640 + }, + { + "epoch": 0.2669703174937324, + "grad_norm": 1.609375, + "learning_rate": 9.722042871985112e-05, + "loss": 7.1163, + "step": 4650 + }, + { + "epoch": 0.2675444472087727, + "grad_norm": 1.5625, + "learning_rate": 9.720512367595854e-05, + "loss": 7.1269, + "step": 4660 + }, + { + "epoch": 0.26811857692381297, + "grad_norm": 1.609375, + "learning_rate": 9.718977782232079e-05, + "loss": 7.1275, + "step": 4670 + }, + { + "epoch": 0.26869270663885325, + "grad_norm": 1.5703125, + "learning_rate": 9.717439117220468e-05, + "loss": 7.0919, + "step": 4680 + }, + { + "epoch": 0.26926683635389353, + "grad_norm": 1.6953125, + "learning_rate": 9.715896373891222e-05, + "loss": 7.1171, + "step": 4690 + }, + { + "epoch": 0.2698409660689338, + "grad_norm": 1.6484375, + "learning_rate": 9.714349553578077e-05, + "loss": 7.0927, + "step": 4700 + }, + { + "epoch": 0.2704150957839741, + "grad_norm": 1.625, + "learning_rate": 9.712798657618287e-05, + "loss": 7.1006, + "step": 4710 + }, + { + "epoch": 0.2709892254990144, + "grad_norm": 1.625, + "learning_rate": 9.711243687352632e-05, + "loss": 7.0835, + "step": 4720 + }, + { + "epoch": 0.27156335521405467, + "grad_norm": 1.6875, + "learning_rate": 9.709684644125413e-05, + "loss": 7.11, + "step": 4730 + }, + { + "epoch": 0.27213748492909495, + "grad_norm": 1.6640625, + "learning_rate": 9.708121529284455e-05, + "loss": 7.1328, + "step": 4740 + }, + { + "epoch": 0.2727116146441353, + "grad_norm": 1.59375, + "learning_rate": 9.706554344181101e-05, + "loss": 7.084, + "step": 4750 + }, + { + "epoch": 0.2732857443591756, + "grad_norm": 1.625, + "learning_rate": 9.70498309017021e-05, + "loss": 7.1197, + "step": 4760 + }, + { + "epoch": 0.27385987407421586, + "grad_norm": 1.625, + "learning_rate": 9.703407768610164e-05, + "loss": 7.1251, + "step": 4770 + }, + { + "epoch": 0.27443400378925614, + "grad_norm": 1.703125, + "learning_rate": 9.70182838086286e-05, + "loss": 7.0986, + "step": 4780 + }, + { + "epoch": 0.2750081335042964, + "grad_norm": 1.703125, + "learning_rate": 9.700244928293708e-05, + "loss": 7.1065, + "step": 4790 + }, + { + "epoch": 0.2755822632193367, + "grad_norm": 1.703125, + "learning_rate": 9.698657412271634e-05, + "loss": 7.1378, + "step": 4800 + }, + { + "epoch": 0.276156392934377, + "grad_norm": 1.640625, + "learning_rate": 9.697065834169075e-05, + "loss": 7.096, + "step": 4810 + }, + { + "epoch": 0.2767305226494173, + "grad_norm": 1.6015625, + "learning_rate": 9.695470195361982e-05, + "loss": 7.105, + "step": 4820 + }, + { + "epoch": 0.27730465236445756, + "grad_norm": 1.6171875, + "learning_rate": 9.693870497229816e-05, + "loss": 7.1321, + "step": 4830 + }, + { + "epoch": 0.27787878207949784, + "grad_norm": 1.625, + "learning_rate": 9.692266741155547e-05, + "loss": 7.104, + "step": 4840 + }, + { + "epoch": 0.2784529117945381, + "grad_norm": 1.65625, + "learning_rate": 9.690658928525653e-05, + "loss": 7.1017, + "step": 4850 + }, + { + "epoch": 0.2790270415095784, + "grad_norm": 1.6484375, + "learning_rate": 9.689047060730119e-05, + "loss": 7.1238, + "step": 4860 + }, + { + "epoch": 0.2796011712246187, + "grad_norm": 1.7109375, + "learning_rate": 9.687431139162437e-05, + "loss": 7.0816, + "step": 4870 + }, + { + "epoch": 0.280175300939659, + "grad_norm": 1.5859375, + "learning_rate": 9.6858111652196e-05, + "loss": 7.0878, + "step": 4880 + }, + { + "epoch": 0.28074943065469926, + "grad_norm": 1.6640625, + "learning_rate": 9.684187140302113e-05, + "loss": 7.0907, + "step": 4890 + }, + { + "epoch": 0.28132356036973954, + "grad_norm": 1.8203125, + "learning_rate": 9.682559065813969e-05, + "loss": 7.1139, + "step": 4900 + }, + { + "epoch": 0.28189769008477983, + "grad_norm": 1.6796875, + "learning_rate": 9.680926943162674e-05, + "loss": 7.1432, + "step": 4910 + }, + { + "epoch": 0.2824718197998201, + "grad_norm": 1.6484375, + "learning_rate": 9.67929077375923e-05, + "loss": 7.1069, + "step": 4920 + }, + { + "epoch": 0.2830459495148604, + "grad_norm": 1.6328125, + "learning_rate": 9.677650559018137e-05, + "loss": 7.1006, + "step": 4930 + }, + { + "epoch": 0.2836200792299007, + "grad_norm": 1.578125, + "learning_rate": 9.676006300357392e-05, + "loss": 7.1214, + "step": 4940 + }, + { + "epoch": 0.28419420894494096, + "grad_norm": 1.5859375, + "learning_rate": 9.674357999198489e-05, + "loss": 7.1161, + "step": 4950 + }, + { + "epoch": 0.28476833865998125, + "grad_norm": 1.625, + "learning_rate": 9.672705656966417e-05, + "loss": 7.0781, + "step": 4960 + }, + { + "epoch": 0.28534246837502153, + "grad_norm": 1.625, + "learning_rate": 9.671049275089654e-05, + "loss": 7.0649, + "step": 4970 + }, + { + "epoch": 0.2859165980900618, + "grad_norm": 1.703125, + "learning_rate": 9.669388855000178e-05, + "loss": 7.0862, + "step": 4980 + }, + { + "epoch": 0.2864907278051021, + "grad_norm": 1.6015625, + "learning_rate": 9.667724398133455e-05, + "loss": 7.1193, + "step": 4990 + }, + { + "epoch": 0.2870648575201424, + "grad_norm": 1.6171875, + "learning_rate": 9.666055905928437e-05, + "loss": 7.1017, + "step": 5000 + }, + { + "epoch": 0.28763898723518266, + "grad_norm": 1.671875, + "learning_rate": 9.66438337982757e-05, + "loss": 7.0887, + "step": 5010 + }, + { + "epoch": 0.28821311695022295, + "grad_norm": 1.640625, + "learning_rate": 9.662706821276787e-05, + "loss": 7.135, + "step": 5020 + }, + { + "epoch": 0.28878724666526323, + "grad_norm": 1.5859375, + "learning_rate": 9.6610262317255e-05, + "loss": 7.1089, + "step": 5030 + }, + { + "epoch": 0.2893613763803035, + "grad_norm": 1.6171875, + "learning_rate": 9.659341612626618e-05, + "loss": 7.0617, + "step": 5040 + }, + { + "epoch": 0.2899355060953438, + "grad_norm": 1.6953125, + "learning_rate": 9.657652965436521e-05, + "loss": 7.1023, + "step": 5050 + }, + { + "epoch": 0.2905096358103841, + "grad_norm": 1.6328125, + "learning_rate": 9.655960291615081e-05, + "loss": 7.0837, + "step": 5060 + }, + { + "epoch": 0.29108376552542436, + "grad_norm": 1.6171875, + "learning_rate": 9.654263592625645e-05, + "loss": 7.0944, + "step": 5070 + }, + { + "epoch": 0.29165789524046465, + "grad_norm": 1.609375, + "learning_rate": 9.652562869935045e-05, + "loss": 7.0902, + "step": 5080 + }, + { + "epoch": 0.29223202495550493, + "grad_norm": 1.640625, + "learning_rate": 9.650858125013584e-05, + "loss": 7.0896, + "step": 5090 + }, + { + "epoch": 0.2928061546705452, + "grad_norm": 1.6171875, + "learning_rate": 9.649149359335053e-05, + "loss": 7.0806, + "step": 5100 + }, + { + "epoch": 0.2933802843855855, + "grad_norm": 1.6875, + "learning_rate": 9.647436574376708e-05, + "loss": 7.1028, + "step": 5110 + }, + { + "epoch": 0.2939544141006258, + "grad_norm": 1.5859375, + "learning_rate": 9.645719771619288e-05, + "loss": 7.0584, + "step": 5120 + }, + { + "epoch": 0.29452854381566607, + "grad_norm": 1.6015625, + "learning_rate": 9.643998952547002e-05, + "loss": 7.0907, + "step": 5130 + }, + { + "epoch": 0.29510267353070635, + "grad_norm": 1.7265625, + "learning_rate": 9.642274118647529e-05, + "loss": 7.1007, + "step": 5140 + }, + { + "epoch": 0.29567680324574663, + "grad_norm": 1.6328125, + "learning_rate": 9.640545271412024e-05, + "loss": 7.1318, + "step": 5150 + }, + { + "epoch": 0.2962509329607869, + "grad_norm": 1.6796875, + "learning_rate": 9.638812412335108e-05, + "loss": 7.1222, + "step": 5160 + }, + { + "epoch": 0.2968250626758272, + "grad_norm": 1.625, + "learning_rate": 9.63707554291487e-05, + "loss": 7.0565, + "step": 5170 + }, + { + "epoch": 0.2973991923908675, + "grad_norm": 1.6015625, + "learning_rate": 9.63533466465287e-05, + "loss": 7.0986, + "step": 5180 + }, + { + "epoch": 0.29797332210590777, + "grad_norm": 1.546875, + "learning_rate": 9.633589779054131e-05, + "loss": 7.0853, + "step": 5190 + }, + { + "epoch": 0.2985474518209481, + "grad_norm": 1.6875, + "learning_rate": 9.631840887627138e-05, + "loss": 7.1219, + "step": 5200 + }, + { + "epoch": 0.2991215815359884, + "grad_norm": 1.6328125, + "learning_rate": 9.630087991883843e-05, + "loss": 7.0556, + "step": 5210 + }, + { + "epoch": 0.2996957112510287, + "grad_norm": 1.5625, + "learning_rate": 9.628331093339657e-05, + "loss": 7.0984, + "step": 5220 + }, + { + "epoch": 0.30026984096606896, + "grad_norm": 1.6953125, + "learning_rate": 9.626570193513456e-05, + "loss": 7.0877, + "step": 5230 + }, + { + "epoch": 0.30084397068110924, + "grad_norm": 1.6328125, + "learning_rate": 9.624805293927568e-05, + "loss": 7.1067, + "step": 5240 + }, + { + "epoch": 0.3014181003961495, + "grad_norm": 1.5625, + "learning_rate": 9.623036396107785e-05, + "loss": 7.1021, + "step": 5250 + }, + { + "epoch": 0.3019922301111898, + "grad_norm": 1.6484375, + "learning_rate": 9.621263501583356e-05, + "loss": 7.1186, + "step": 5260 + }, + { + "epoch": 0.3025663598262301, + "grad_norm": 1.5, + "learning_rate": 9.619486611886976e-05, + "loss": 7.0824, + "step": 5270 + }, + { + "epoch": 0.3031404895412704, + "grad_norm": 1.6015625, + "learning_rate": 9.617705728554807e-05, + "loss": 7.1194, + "step": 5280 + }, + { + "epoch": 0.30371461925631066, + "grad_norm": 1.671875, + "learning_rate": 9.615920853126456e-05, + "loss": 7.1096, + "step": 5290 + }, + { + "epoch": 0.30428874897135094, + "grad_norm": 1.7265625, + "learning_rate": 9.61413198714498e-05, + "loss": 7.1142, + "step": 5300 + }, + { + "epoch": 0.3048628786863912, + "grad_norm": 1.5703125, + "learning_rate": 9.612339132156889e-05, + "loss": 7.0574, + "step": 5310 + }, + { + "epoch": 0.3054370084014315, + "grad_norm": 1.609375, + "learning_rate": 9.610542289712143e-05, + "loss": 7.1176, + "step": 5320 + }, + { + "epoch": 0.3060111381164718, + "grad_norm": 1.7421875, + "learning_rate": 9.608741461364145e-05, + "loss": 7.09, + "step": 5330 + }, + { + "epoch": 0.3065852678315121, + "grad_norm": 1.71875, + "learning_rate": 9.60693664866975e-05, + "loss": 7.1117, + "step": 5340 + }, + { + "epoch": 0.30715939754655236, + "grad_norm": 1.6953125, + "learning_rate": 9.605127853189246e-05, + "loss": 7.0935, + "step": 5350 + }, + { + "epoch": 0.30773352726159264, + "grad_norm": 1.5625, + "learning_rate": 9.603315076486378e-05, + "loss": 7.0575, + "step": 5360 + }, + { + "epoch": 0.3083076569766329, + "grad_norm": 1.625, + "learning_rate": 9.601498320128324e-05, + "loss": 7.0817, + "step": 5370 + }, + { + "epoch": 0.3088817866916732, + "grad_norm": 1.640625, + "learning_rate": 9.599677585685707e-05, + "loss": 7.0864, + "step": 5380 + }, + { + "epoch": 0.3094559164067135, + "grad_norm": 1.640625, + "learning_rate": 9.597852874732585e-05, + "loss": 7.1067, + "step": 5390 + }, + { + "epoch": 0.3100300461217538, + "grad_norm": 1.6796875, + "learning_rate": 9.596024188846459e-05, + "loss": 7.1235, + "step": 5400 + }, + { + "epoch": 0.31060417583679406, + "grad_norm": 1.6875, + "learning_rate": 9.59419152960826e-05, + "loss": 7.1188, + "step": 5410 + }, + { + "epoch": 0.31117830555183434, + "grad_norm": 1.6015625, + "learning_rate": 9.59235489860236e-05, + "loss": 7.1008, + "step": 5420 + }, + { + "epoch": 0.31175243526687463, + "grad_norm": 1.6171875, + "learning_rate": 9.590514297416561e-05, + "loss": 7.0965, + "step": 5430 + }, + { + "epoch": 0.3123265649819149, + "grad_norm": 1.6953125, + "learning_rate": 9.588669727642099e-05, + "loss": 7.1121, + "step": 5440 + }, + { + "epoch": 0.3129006946969552, + "grad_norm": 1.703125, + "learning_rate": 9.586821190873639e-05, + "loss": 7.1177, + "step": 5450 + }, + { + "epoch": 0.3134748244119955, + "grad_norm": 1.609375, + "learning_rate": 9.584968688709279e-05, + "loss": 7.1023, + "step": 5460 + }, + { + "epoch": 0.31404895412703576, + "grad_norm": 1.6171875, + "learning_rate": 9.58311222275054e-05, + "loss": 7.1225, + "step": 5470 + }, + { + "epoch": 0.31462308384207605, + "grad_norm": 1.6953125, + "learning_rate": 9.581251794602377e-05, + "loss": 7.0741, + "step": 5480 + }, + { + "epoch": 0.31519721355711633, + "grad_norm": 1.640625, + "learning_rate": 9.579387405873164e-05, + "loss": 7.1023, + "step": 5490 + }, + { + "epoch": 0.3157713432721566, + "grad_norm": 1.6015625, + "learning_rate": 9.5775190581747e-05, + "loss": 7.1073, + "step": 5500 + }, + { + "epoch": 0.3163454729871969, + "grad_norm": 1.6171875, + "learning_rate": 9.57564675312221e-05, + "loss": 7.0511, + "step": 5510 + }, + { + "epoch": 0.3169196027022372, + "grad_norm": 1.6171875, + "learning_rate": 9.573770492334338e-05, + "loss": 7.0816, + "step": 5520 + }, + { + "epoch": 0.31749373241727746, + "grad_norm": 1.71875, + "learning_rate": 9.571890277433144e-05, + "loss": 7.1016, + "step": 5530 + }, + { + "epoch": 0.31806786213231775, + "grad_norm": 1.65625, + "learning_rate": 9.570006110044116e-05, + "loss": 7.0907, + "step": 5540 + }, + { + "epoch": 0.31864199184735803, + "grad_norm": 1.6484375, + "learning_rate": 9.568117991796148e-05, + "loss": 7.078, + "step": 5550 + }, + { + "epoch": 0.3192161215623983, + "grad_norm": 1.6328125, + "learning_rate": 9.56622592432156e-05, + "loss": 7.0899, + "step": 5560 + }, + { + "epoch": 0.3197902512774386, + "grad_norm": 1.6484375, + "learning_rate": 9.564329909256078e-05, + "loss": 7.1083, + "step": 5570 + }, + { + "epoch": 0.3203643809924789, + "grad_norm": 1.6015625, + "learning_rate": 9.562429948238842e-05, + "loss": 7.0546, + "step": 5580 + }, + { + "epoch": 0.32093851070751916, + "grad_norm": 1.59375, + "learning_rate": 9.56052604291241e-05, + "loss": 7.0755, + "step": 5590 + }, + { + "epoch": 0.32151264042255945, + "grad_norm": 1.5390625, + "learning_rate": 9.55861819492274e-05, + "loss": 7.0788, + "step": 5600 + }, + { + "epoch": 0.32208677013759973, + "grad_norm": 1.6640625, + "learning_rate": 9.556706405919208e-05, + "loss": 7.0758, + "step": 5610 + }, + { + "epoch": 0.32266089985264, + "grad_norm": 1.5859375, + "learning_rate": 9.55479067755459e-05, + "loss": 7.0838, + "step": 5620 + }, + { + "epoch": 0.3232350295676803, + "grad_norm": 1.5703125, + "learning_rate": 9.552871011485071e-05, + "loss": 7.0924, + "step": 5630 + }, + { + "epoch": 0.3238091592827206, + "grad_norm": 1.625, + "learning_rate": 9.550947409370239e-05, + "loss": 7.0698, + "step": 5640 + }, + { + "epoch": 0.3243832889977609, + "grad_norm": 1.6640625, + "learning_rate": 9.549019872873087e-05, + "loss": 7.0464, + "step": 5650 + }, + { + "epoch": 0.3249574187128012, + "grad_norm": 1.546875, + "learning_rate": 9.547088403660005e-05, + "loss": 7.0699, + "step": 5660 + }, + { + "epoch": 0.3255315484278415, + "grad_norm": 1.5859375, + "learning_rate": 9.545153003400789e-05, + "loss": 7.0841, + "step": 5670 + }, + { + "epoch": 0.32610567814288177, + "grad_norm": 1.59375, + "learning_rate": 9.543213673768627e-05, + "loss": 7.0842, + "step": 5680 + }, + { + "epoch": 0.32667980785792206, + "grad_norm": 1.6875, + "learning_rate": 9.541270416440109e-05, + "loss": 7.0989, + "step": 5690 + }, + { + "epoch": 0.32725393757296234, + "grad_norm": 1.578125, + "learning_rate": 9.539323233095219e-05, + "loss": 7.0961, + "step": 5700 + }, + { + "epoch": 0.3278280672880026, + "grad_norm": 1.59375, + "learning_rate": 9.537372125417333e-05, + "loss": 7.0765, + "step": 5710 + }, + { + "epoch": 0.3284021970030429, + "grad_norm": 1.6171875, + "learning_rate": 9.535417095093222e-05, + "loss": 7.0622, + "step": 5720 + }, + { + "epoch": 0.3289763267180832, + "grad_norm": 1.6484375, + "learning_rate": 9.533458143813048e-05, + "loss": 7.0771, + "step": 5730 + }, + { + "epoch": 0.3295504564331235, + "grad_norm": 1.6484375, + "learning_rate": 9.531495273270363e-05, + "loss": 7.07, + "step": 5740 + }, + { + "epoch": 0.33012458614816376, + "grad_norm": 1.609375, + "learning_rate": 9.529528485162105e-05, + "loss": 7.0796, + "step": 5750 + }, + { + "epoch": 0.33069871586320404, + "grad_norm": 1.7109375, + "learning_rate": 9.527557781188602e-05, + "loss": 7.1156, + "step": 5760 + }, + { + "epoch": 0.3312728455782443, + "grad_norm": 1.6875, + "learning_rate": 9.525583163053566e-05, + "loss": 7.0224, + "step": 5770 + }, + { + "epoch": 0.3318469752932846, + "grad_norm": 1.6640625, + "learning_rate": 9.523604632464092e-05, + "loss": 7.0701, + "step": 5780 + }, + { + "epoch": 0.3324211050083249, + "grad_norm": 1.6015625, + "learning_rate": 9.521622191130656e-05, + "loss": 7.0623, + "step": 5790 + }, + { + "epoch": 0.3329952347233652, + "grad_norm": 1.5703125, + "learning_rate": 9.51963584076712e-05, + "loss": 7.1004, + "step": 5800 + }, + { + "epoch": 0.33356936443840546, + "grad_norm": 1.671875, + "learning_rate": 9.517645583090722e-05, + "loss": 7.0978, + "step": 5810 + }, + { + "epoch": 0.33414349415344574, + "grad_norm": 1.5703125, + "learning_rate": 9.515651419822077e-05, + "loss": 7.0835, + "step": 5820 + }, + { + "epoch": 0.334717623868486, + "grad_norm": 1.6171875, + "learning_rate": 9.513653352685179e-05, + "loss": 7.0948, + "step": 5830 + }, + { + "epoch": 0.3352917535835263, + "grad_norm": 1.6015625, + "learning_rate": 9.511651383407395e-05, + "loss": 7.0186, + "step": 5840 + }, + { + "epoch": 0.3358658832985666, + "grad_norm": 1.6328125, + "learning_rate": 9.509645513719467e-05, + "loss": 7.0853, + "step": 5850 + }, + { + "epoch": 0.3364400130136069, + "grad_norm": 1.5859375, + "learning_rate": 9.507635745355509e-05, + "loss": 7.0859, + "step": 5860 + }, + { + "epoch": 0.33701414272864716, + "grad_norm": 1.7265625, + "learning_rate": 9.505622080053003e-05, + "loss": 7.0422, + "step": 5870 + }, + { + "epoch": 0.33758827244368744, + "grad_norm": 1.6328125, + "learning_rate": 9.503604519552803e-05, + "loss": 7.0574, + "step": 5880 + }, + { + "epoch": 0.3381624021587277, + "grad_norm": 1.5390625, + "learning_rate": 9.501583065599127e-05, + "loss": 7.062, + "step": 5890 + }, + { + "epoch": 0.338736531873768, + "grad_norm": 1.53125, + "learning_rate": 9.499557719939564e-05, + "loss": 7.0634, + "step": 5900 + }, + { + "epoch": 0.3393106615888083, + "grad_norm": 1.6171875, + "learning_rate": 9.497528484325062e-05, + "loss": 7.0739, + "step": 5910 + }, + { + "epoch": 0.3398847913038486, + "grad_norm": 1.6953125, + "learning_rate": 9.495495360509937e-05, + "loss": 7.0782, + "step": 5920 + }, + { + "epoch": 0.34045892101888886, + "grad_norm": 1.71875, + "learning_rate": 9.493458350251862e-05, + "loss": 7.0697, + "step": 5930 + }, + { + "epoch": 0.34103305073392914, + "grad_norm": 1.671875, + "learning_rate": 9.491417455311875e-05, + "loss": 7.0745, + "step": 5940 + }, + { + "epoch": 0.3416071804489694, + "grad_norm": 1.578125, + "learning_rate": 9.489372677454365e-05, + "loss": 7.0831, + "step": 5950 + }, + { + "epoch": 0.3421813101640097, + "grad_norm": 1.7109375, + "learning_rate": 9.487324018447086e-05, + "loss": 7.0725, + "step": 5960 + }, + { + "epoch": 0.34275543987905, + "grad_norm": 1.546875, + "learning_rate": 9.48527148006114e-05, + "loss": 7.103, + "step": 5970 + }, + { + "epoch": 0.3433295695940903, + "grad_norm": 1.5859375, + "learning_rate": 9.48321506407099e-05, + "loss": 7.0543, + "step": 5980 + }, + { + "epoch": 0.34390369930913056, + "grad_norm": 1.640625, + "learning_rate": 9.481154772254444e-05, + "loss": 7.1007, + "step": 5990 + }, + { + "epoch": 0.34447782902417085, + "grad_norm": 1.7109375, + "learning_rate": 9.479090606392664e-05, + "loss": 7.0926, + "step": 6000 + }, + { + "epoch": 0.34505195873921113, + "grad_norm": 1.6875, + "learning_rate": 9.477022568270166e-05, + "loss": 7.0797, + "step": 6010 + }, + { + "epoch": 0.3456260884542514, + "grad_norm": 1.671875, + "learning_rate": 9.474950659674804e-05, + "loss": 7.0643, + "step": 6020 + }, + { + "epoch": 0.3462002181692917, + "grad_norm": 1.59375, + "learning_rate": 9.472874882397786e-05, + "loss": 7.0753, + "step": 6030 + }, + { + "epoch": 0.346774347884332, + "grad_norm": 1.6796875, + "learning_rate": 9.470795238233662e-05, + "loss": 7.0945, + "step": 6040 + }, + { + "epoch": 0.34734847759937226, + "grad_norm": 1.6875, + "learning_rate": 9.468711728980323e-05, + "loss": 7.055, + "step": 6050 + }, + { + "epoch": 0.34792260731441255, + "grad_norm": 1.5703125, + "learning_rate": 9.466624356439004e-05, + "loss": 7.0759, + "step": 6060 + }, + { + "epoch": 0.34849673702945283, + "grad_norm": 1.5859375, + "learning_rate": 9.46453312241428e-05, + "loss": 7.0866, + "step": 6070 + }, + { + "epoch": 0.3490708667444931, + "grad_norm": 1.640625, + "learning_rate": 9.462438028714061e-05, + "loss": 7.0849, + "step": 6080 + }, + { + "epoch": 0.3496449964595334, + "grad_norm": 1.6328125, + "learning_rate": 9.460339077149597e-05, + "loss": 7.101, + "step": 6090 + }, + { + "epoch": 0.35021912617457374, + "grad_norm": 1.640625, + "learning_rate": 9.458236269535476e-05, + "loss": 7.0593, + "step": 6100 + }, + { + "epoch": 0.350793255889614, + "grad_norm": 1.625, + "learning_rate": 9.45612960768961e-05, + "loss": 7.0364, + "step": 6110 + }, + { + "epoch": 0.3513673856046543, + "grad_norm": 1.609375, + "learning_rate": 9.454019093433253e-05, + "loss": 7.0575, + "step": 6120 + }, + { + "epoch": 0.3519415153196946, + "grad_norm": 1.5, + "learning_rate": 9.451904728590983e-05, + "loss": 7.1114, + "step": 6130 + }, + { + "epoch": 0.35251564503473487, + "grad_norm": 1.6484375, + "learning_rate": 9.449786514990713e-05, + "loss": 7.0604, + "step": 6140 + }, + { + "epoch": 0.35308977474977515, + "grad_norm": 1.546875, + "learning_rate": 9.447664454463677e-05, + "loss": 7.0818, + "step": 6150 + }, + { + "epoch": 0.35366390446481544, + "grad_norm": 1.5546875, + "learning_rate": 9.445538548844436e-05, + "loss": 7.0759, + "step": 6160 + }, + { + "epoch": 0.3542380341798557, + "grad_norm": 1.6171875, + "learning_rate": 9.443408799970884e-05, + "loss": 7.0768, + "step": 6170 + }, + { + "epoch": 0.354812163894896, + "grad_norm": 1.6171875, + "learning_rate": 9.441275209684219e-05, + "loss": 7.0732, + "step": 6180 + }, + { + "epoch": 0.3553862936099363, + "grad_norm": 1.6328125, + "learning_rate": 9.43913777982898e-05, + "loss": 7.0547, + "step": 6190 + }, + { + "epoch": 0.35596042332497657, + "grad_norm": 1.640625, + "learning_rate": 9.436996512253013e-05, + "loss": 7.0371, + "step": 6200 + }, + { + "epoch": 0.35653455304001685, + "grad_norm": 1.6328125, + "learning_rate": 9.434851408807487e-05, + "loss": 7.0853, + "step": 6210 + }, + { + "epoch": 0.35710868275505714, + "grad_norm": 1.640625, + "learning_rate": 9.432702471346884e-05, + "loss": 7.0577, + "step": 6220 + }, + { + "epoch": 0.3576828124700974, + "grad_norm": 1.671875, + "learning_rate": 9.430549701729004e-05, + "loss": 7.066, + "step": 6230 + }, + { + "epoch": 0.3582569421851377, + "grad_norm": 1.625, + "learning_rate": 9.428393101814954e-05, + "loss": 7.0415, + "step": 6240 + }, + { + "epoch": 0.358831071900178, + "grad_norm": 1.625, + "learning_rate": 9.426232673469162e-05, + "loss": 7.0349, + "step": 6250 + }, + { + "epoch": 0.3594052016152183, + "grad_norm": 1.5546875, + "learning_rate": 9.424068418559356e-05, + "loss": 7.0604, + "step": 6260 + }, + { + "epoch": 0.35997933133025856, + "grad_norm": 1.609375, + "learning_rate": 9.421900338956578e-05, + "loss": 7.0384, + "step": 6270 + }, + { + "epoch": 0.36055346104529884, + "grad_norm": 1.6015625, + "learning_rate": 9.419728436535176e-05, + "loss": 7.0879, + "step": 6280 + }, + { + "epoch": 0.3611275907603391, + "grad_norm": 1.59375, + "learning_rate": 9.4175527131728e-05, + "loss": 7.0746, + "step": 6290 + }, + { + "epoch": 0.3617017204753794, + "grad_norm": 1.515625, + "learning_rate": 9.415373170750404e-05, + "loss": 7.0225, + "step": 6300 + }, + { + "epoch": 0.3622758501904197, + "grad_norm": 1.5625, + "learning_rate": 9.413189811152247e-05, + "loss": 7.0722, + "step": 6310 + }, + { + "epoch": 0.36284997990546, + "grad_norm": 1.6015625, + "learning_rate": 9.411002636265886e-05, + "loss": 7.0454, + "step": 6320 + }, + { + "epoch": 0.36342410962050026, + "grad_norm": 1.7578125, + "learning_rate": 9.408811647982176e-05, + "loss": 7.0908, + "step": 6330 + }, + { + "epoch": 0.36399823933554054, + "grad_norm": 1.6953125, + "learning_rate": 9.406616848195266e-05, + "loss": 7.0544, + "step": 6340 + }, + { + "epoch": 0.3645723690505808, + "grad_norm": 1.7734375, + "learning_rate": 9.404418238802606e-05, + "loss": 7.0017, + "step": 6350 + }, + { + "epoch": 0.3651464987656211, + "grad_norm": 1.609375, + "learning_rate": 9.402215821704935e-05, + "loss": 7.0659, + "step": 6360 + }, + { + "epoch": 0.3657206284806614, + "grad_norm": 1.5546875, + "learning_rate": 9.400009598806287e-05, + "loss": 7.0969, + "step": 6370 + }, + { + "epoch": 0.3662947581957017, + "grad_norm": 1.5625, + "learning_rate": 9.397799572013982e-05, + "loss": 7.028, + "step": 6380 + }, + { + "epoch": 0.36686888791074196, + "grad_norm": 1.6171875, + "learning_rate": 9.395585743238633e-05, + "loss": 7.0821, + "step": 6390 + }, + { + "epoch": 0.36744301762578224, + "grad_norm": 1.7109375, + "learning_rate": 9.393368114394136e-05, + "loss": 7.0553, + "step": 6400 + }, + { + "epoch": 0.3680171473408225, + "grad_norm": 1.6484375, + "learning_rate": 9.391146687397676e-05, + "loss": 7.0522, + "step": 6410 + }, + { + "epoch": 0.3685912770558628, + "grad_norm": 1.6796875, + "learning_rate": 9.388921464169719e-05, + "loss": 7.017, + "step": 6420 + }, + { + "epoch": 0.3691654067709031, + "grad_norm": 1.578125, + "learning_rate": 9.386692446634016e-05, + "loss": 7.0541, + "step": 6430 + }, + { + "epoch": 0.3697395364859434, + "grad_norm": 1.6484375, + "learning_rate": 9.38445963671759e-05, + "loss": 7.069, + "step": 6440 + }, + { + "epoch": 0.37031366620098366, + "grad_norm": 1.640625, + "learning_rate": 9.382223036350755e-05, + "loss": 7.0844, + "step": 6450 + }, + { + "epoch": 0.37088779591602394, + "grad_norm": 1.671875, + "learning_rate": 9.379982647467091e-05, + "loss": 7.0528, + "step": 6460 + }, + { + "epoch": 0.3714619256310642, + "grad_norm": 1.640625, + "learning_rate": 9.37773847200346e-05, + "loss": 7.0136, + "step": 6470 + }, + { + "epoch": 0.3720360553461045, + "grad_norm": 1.6484375, + "learning_rate": 9.375490511899994e-05, + "loss": 7.0743, + "step": 6480 + }, + { + "epoch": 0.3726101850611448, + "grad_norm": 1.59375, + "learning_rate": 9.373238769100098e-05, + "loss": 7.0748, + "step": 6490 + }, + { + "epoch": 0.3731843147761851, + "grad_norm": 1.59375, + "learning_rate": 9.370983245550449e-05, + "loss": 7.0398, + "step": 6500 + }, + { + "epoch": 0.37375844449122536, + "grad_norm": 1.5390625, + "learning_rate": 9.36872394320099e-05, + "loss": 7.0709, + "step": 6510 + }, + { + "epoch": 0.37433257420626564, + "grad_norm": 1.5546875, + "learning_rate": 9.36646086400493e-05, + "loss": 7.0296, + "step": 6520 + }, + { + "epoch": 0.37490670392130593, + "grad_norm": 1.671875, + "learning_rate": 9.36419400991875e-05, + "loss": 7.0657, + "step": 6530 + }, + { + "epoch": 0.3754808336363462, + "grad_norm": 1.6484375, + "learning_rate": 9.361923382902182e-05, + "loss": 7.0604, + "step": 6540 + }, + { + "epoch": 0.37605496335138655, + "grad_norm": 1.6171875, + "learning_rate": 9.359648984918232e-05, + "loss": 7.0412, + "step": 6550 + }, + { + "epoch": 0.37662909306642683, + "grad_norm": 1.734375, + "learning_rate": 9.35737081793316e-05, + "loss": 7.0582, + "step": 6560 + }, + { + "epoch": 0.3772032227814671, + "grad_norm": 1.6484375, + "learning_rate": 9.355088883916485e-05, + "loss": 7.0994, + "step": 6570 + }, + { + "epoch": 0.3777773524965074, + "grad_norm": 1.6015625, + "learning_rate": 9.352803184840983e-05, + "loss": 7.0822, + "step": 6580 + }, + { + "epoch": 0.3783514822115477, + "grad_norm": 1.484375, + "learning_rate": 9.350513722682687e-05, + "loss": 7.0721, + "step": 6590 + }, + { + "epoch": 0.37892561192658797, + "grad_norm": 1.59375, + "learning_rate": 9.34822049942088e-05, + "loss": 7.0822, + "step": 6600 + }, + { + "epoch": 0.37949974164162825, + "grad_norm": 1.609375, + "learning_rate": 9.3459235170381e-05, + "loss": 7.0448, + "step": 6610 + }, + { + "epoch": 0.38007387135666854, + "grad_norm": 1.625, + "learning_rate": 9.343622777520129e-05, + "loss": 7.037, + "step": 6620 + }, + { + "epoch": 0.3806480010717088, + "grad_norm": 1.6328125, + "learning_rate": 9.341318282856004e-05, + "loss": 7.0463, + "step": 6630 + }, + { + "epoch": 0.3812221307867491, + "grad_norm": 1.6953125, + "learning_rate": 9.339010035038005e-05, + "loss": 7.0267, + "step": 6640 + }, + { + "epoch": 0.3817962605017894, + "grad_norm": 1.625, + "learning_rate": 9.336698036061657e-05, + "loss": 7.0255, + "step": 6650 + }, + { + "epoch": 0.38237039021682967, + "grad_norm": 1.609375, + "learning_rate": 9.334382287925726e-05, + "loss": 7.0543, + "step": 6660 + }, + { + "epoch": 0.38294451993186995, + "grad_norm": 1.609375, + "learning_rate": 9.332062792632223e-05, + "loss": 7.0655, + "step": 6670 + }, + { + "epoch": 0.38351864964691024, + "grad_norm": 1.546875, + "learning_rate": 9.329739552186396e-05, + "loss": 7.012, + "step": 6680 + }, + { + "epoch": 0.3840927793619505, + "grad_norm": 1.609375, + "learning_rate": 9.327412568596735e-05, + "loss": 7.0628, + "step": 6690 + }, + { + "epoch": 0.3846669090769908, + "grad_norm": 1.6015625, + "learning_rate": 9.325081843874954e-05, + "loss": 7.0552, + "step": 6700 + }, + { + "epoch": 0.3852410387920311, + "grad_norm": 1.6875, + "learning_rate": 9.322747380036019e-05, + "loss": 7.0691, + "step": 6710 + }, + { + "epoch": 0.38581516850707137, + "grad_norm": 1.65625, + "learning_rate": 9.320409179098113e-05, + "loss": 7.0627, + "step": 6720 + }, + { + "epoch": 0.38638929822211165, + "grad_norm": 1.6171875, + "learning_rate": 9.31806724308266e-05, + "loss": 7.0231, + "step": 6730 + }, + { + "epoch": 0.38696342793715194, + "grad_norm": 1.6328125, + "learning_rate": 9.315721574014307e-05, + "loss": 7.0489, + "step": 6740 + }, + { + "epoch": 0.3875375576521922, + "grad_norm": 1.6171875, + "learning_rate": 9.31337217392093e-05, + "loss": 7.0758, + "step": 6750 + }, + { + "epoch": 0.3881116873672325, + "grad_norm": 1.6015625, + "learning_rate": 9.311019044833631e-05, + "loss": 7.0796, + "step": 6760 + }, + { + "epoch": 0.3886858170822728, + "grad_norm": 1.671875, + "learning_rate": 9.308662188786738e-05, + "loss": 7.0804, + "step": 6770 + }, + { + "epoch": 0.38925994679731307, + "grad_norm": 1.546875, + "learning_rate": 9.306301607817797e-05, + "loss": 7.0535, + "step": 6780 + }, + { + "epoch": 0.38983407651235336, + "grad_norm": 1.5625, + "learning_rate": 9.303937303967578e-05, + "loss": 7.0569, + "step": 6790 + }, + { + "epoch": 0.39040820622739364, + "grad_norm": 1.5546875, + "learning_rate": 9.301569279280063e-05, + "loss": 7.0705, + "step": 6800 + }, + { + "epoch": 0.3909823359424339, + "grad_norm": 1.5546875, + "learning_rate": 9.29919753580246e-05, + "loss": 7.0721, + "step": 6810 + }, + { + "epoch": 0.3915564656574742, + "grad_norm": 1.6484375, + "learning_rate": 9.296822075585185e-05, + "loss": 7.1056, + "step": 6820 + }, + { + "epoch": 0.3921305953725145, + "grad_norm": 1.6171875, + "learning_rate": 9.29444290068187e-05, + "loss": 7.0806, + "step": 6830 + }, + { + "epoch": 0.3927047250875548, + "grad_norm": 1.65625, + "learning_rate": 9.292060013149357e-05, + "loss": 6.9807, + "step": 6840 + }, + { + "epoch": 0.39327885480259506, + "grad_norm": 1.6171875, + "learning_rate": 9.289673415047701e-05, + "loss": 7.0552, + "step": 6850 + }, + { + "epoch": 0.39385298451763534, + "grad_norm": 1.6796875, + "learning_rate": 9.287283108440159e-05, + "loss": 7.0737, + "step": 6860 + }, + { + "epoch": 0.3944271142326756, + "grad_norm": 1.65625, + "learning_rate": 9.2848890953932e-05, + "loss": 7.0574, + "step": 6870 + }, + { + "epoch": 0.3950012439477159, + "grad_norm": 1.6796875, + "learning_rate": 9.282491377976494e-05, + "loss": 7.0671, + "step": 6880 + }, + { + "epoch": 0.3955753736627562, + "grad_norm": 1.6953125, + "learning_rate": 9.280089958262912e-05, + "loss": 6.9983, + "step": 6890 + }, + { + "epoch": 0.3961495033777965, + "grad_norm": 1.609375, + "learning_rate": 9.277684838328532e-05, + "loss": 7.0531, + "step": 6900 + }, + { + "epoch": 0.39672363309283676, + "grad_norm": 1.6015625, + "learning_rate": 9.275276020252624e-05, + "loss": 7.0274, + "step": 6910 + }, + { + "epoch": 0.39729776280787704, + "grad_norm": 1.640625, + "learning_rate": 9.272863506117659e-05, + "loss": 7.0561, + "step": 6920 + }, + { + "epoch": 0.3978718925229173, + "grad_norm": 1.671875, + "learning_rate": 9.270447298009301e-05, + "loss": 7.0595, + "step": 6930 + }, + { + "epoch": 0.3984460222379576, + "grad_norm": 1.5703125, + "learning_rate": 9.26802739801641e-05, + "loss": 7.0799, + "step": 6940 + }, + { + "epoch": 0.3990201519529979, + "grad_norm": 1.578125, + "learning_rate": 9.265603808231038e-05, + "loss": 7.0158, + "step": 6950 + }, + { + "epoch": 0.3995942816680382, + "grad_norm": 1.765625, + "learning_rate": 9.263176530748422e-05, + "loss": 7.0851, + "step": 6960 + }, + { + "epoch": 0.40016841138307846, + "grad_norm": 1.75, + "learning_rate": 9.260745567666992e-05, + "loss": 7.0673, + "step": 6970 + }, + { + "epoch": 0.40074254109811874, + "grad_norm": 1.65625, + "learning_rate": 9.258310921088363e-05, + "loss": 7.0224, + "step": 6980 + }, + { + "epoch": 0.401316670813159, + "grad_norm": 1.6171875, + "learning_rate": 9.255872593117334e-05, + "loss": 7.0413, + "step": 6990 + }, + { + "epoch": 0.4018908005281993, + "grad_norm": 1.640625, + "learning_rate": 9.253430585861887e-05, + "loss": 7.0657, + "step": 7000 + }, + { + "epoch": 0.40246493024323965, + "grad_norm": 1.6640625, + "learning_rate": 9.250984901433185e-05, + "loss": 6.9849, + "step": 7010 + }, + { + "epoch": 0.40303905995827993, + "grad_norm": 1.7265625, + "learning_rate": 9.248535541945569e-05, + "loss": 7.05, + "step": 7020 + }, + { + "epoch": 0.4036131896733202, + "grad_norm": 1.6015625, + "learning_rate": 9.246082509516558e-05, + "loss": 7.0623, + "step": 7030 + }, + { + "epoch": 0.4041873193883605, + "grad_norm": 1.625, + "learning_rate": 9.243625806266845e-05, + "loss": 7.0092, + "step": 7040 + }, + { + "epoch": 0.4047614491034008, + "grad_norm": 1.6328125, + "learning_rate": 9.2411654343203e-05, + "loss": 7.0477, + "step": 7050 + }, + { + "epoch": 0.40533557881844107, + "grad_norm": 1.6328125, + "learning_rate": 9.238701395803962e-05, + "loss": 7.0425, + "step": 7060 + }, + { + "epoch": 0.40590970853348135, + "grad_norm": 1.5703125, + "learning_rate": 9.236233692848035e-05, + "loss": 7.0353, + "step": 7070 + }, + { + "epoch": 0.40648383824852163, + "grad_norm": 1.6171875, + "learning_rate": 9.233762327585905e-05, + "loss": 7.0285, + "step": 7080 + }, + { + "epoch": 0.4070579679635619, + "grad_norm": 1.625, + "learning_rate": 9.231287302154107e-05, + "loss": 7.0415, + "step": 7090 + }, + { + "epoch": 0.4076320976786022, + "grad_norm": 1.640625, + "learning_rate": 9.228808618692353e-05, + "loss": 7.0454, + "step": 7100 + }, + { + "epoch": 0.4082062273936425, + "grad_norm": 1.671875, + "learning_rate": 9.226326279343512e-05, + "loss": 7.0603, + "step": 7110 + }, + { + "epoch": 0.40878035710868277, + "grad_norm": 1.5859375, + "learning_rate": 9.223840286253613e-05, + "loss": 7.0359, + "step": 7120 + }, + { + "epoch": 0.40935448682372305, + "grad_norm": 1.625, + "learning_rate": 9.221350641571848e-05, + "loss": 7.0449, + "step": 7130 + }, + { + "epoch": 0.40992861653876334, + "grad_norm": 1.6640625, + "learning_rate": 9.21885734745056e-05, + "loss": 7.0498, + "step": 7140 + }, + { + "epoch": 0.4105027462538036, + "grad_norm": 1.6171875, + "learning_rate": 9.216360406045254e-05, + "loss": 7.0622, + "step": 7150 + }, + { + "epoch": 0.4110768759688439, + "grad_norm": 1.6328125, + "learning_rate": 9.213859819514581e-05, + "loss": 7.0269, + "step": 7160 + }, + { + "epoch": 0.4116510056838842, + "grad_norm": 1.6953125, + "learning_rate": 9.211355590020348e-05, + "loss": 7.061, + "step": 7170 + }, + { + "epoch": 0.41222513539892447, + "grad_norm": 1.6171875, + "learning_rate": 9.208847719727509e-05, + "loss": 7.0471, + "step": 7180 + }, + { + "epoch": 0.41279926511396475, + "grad_norm": 1.578125, + "learning_rate": 9.206336210804167e-05, + "loss": 7.0315, + "step": 7190 + }, + { + "epoch": 0.41337339482900504, + "grad_norm": 1.6796875, + "learning_rate": 9.203821065421571e-05, + "loss": 7.0379, + "step": 7200 + }, + { + "epoch": 0.4139475245440453, + "grad_norm": 1.6328125, + "learning_rate": 9.201302285754114e-05, + "loss": 7.0435, + "step": 7210 + }, + { + "epoch": 0.4145216542590856, + "grad_norm": 1.703125, + "learning_rate": 9.198779873979326e-05, + "loss": 7.016, + "step": 7220 + }, + { + "epoch": 0.4150957839741259, + "grad_norm": 1.5703125, + "learning_rate": 9.196253832277883e-05, + "loss": 7.0658, + "step": 7230 + }, + { + "epoch": 0.41566991368916617, + "grad_norm": 1.5234375, + "learning_rate": 9.193724162833598e-05, + "loss": 7.0362, + "step": 7240 + }, + { + "epoch": 0.41624404340420645, + "grad_norm": 1.6171875, + "learning_rate": 9.191190867833419e-05, + "loss": 7.0765, + "step": 7250 + }, + { + "epoch": 0.41681817311924674, + "grad_norm": 1.546875, + "learning_rate": 9.188653949467427e-05, + "loss": 7.0459, + "step": 7260 + }, + { + "epoch": 0.417392302834287, + "grad_norm": 1.5390625, + "learning_rate": 9.186113409928838e-05, + "loss": 7.0007, + "step": 7270 + }, + { + "epoch": 0.4179664325493273, + "grad_norm": 1.7109375, + "learning_rate": 9.183569251413999e-05, + "loss": 7.0554, + "step": 7280 + }, + { + "epoch": 0.4185405622643676, + "grad_norm": 1.609375, + "learning_rate": 9.181021476122385e-05, + "loss": 7.0235, + "step": 7290 + }, + { + "epoch": 0.41911469197940787, + "grad_norm": 1.65625, + "learning_rate": 9.178470086256593e-05, + "loss": 7.0363, + "step": 7300 + }, + { + "epoch": 0.41968882169444816, + "grad_norm": 1.7265625, + "learning_rate": 9.175915084022353e-05, + "loss": 7.0607, + "step": 7310 + }, + { + "epoch": 0.42026295140948844, + "grad_norm": 1.703125, + "learning_rate": 9.173356471628511e-05, + "loss": 7.0638, + "step": 7320 + }, + { + "epoch": 0.4208370811245287, + "grad_norm": 1.6171875, + "learning_rate": 9.17079425128704e-05, + "loss": 7.0519, + "step": 7330 + }, + { + "epoch": 0.421411210839569, + "grad_norm": 1.625, + "learning_rate": 9.168228425213028e-05, + "loss": 7.0528, + "step": 7340 + }, + { + "epoch": 0.4219853405546093, + "grad_norm": 1.5859375, + "learning_rate": 9.165658995624681e-05, + "loss": 6.9998, + "step": 7350 + }, + { + "epoch": 0.4225594702696496, + "grad_norm": 1.6484375, + "learning_rate": 9.163085964743321e-05, + "loss": 7.0553, + "step": 7360 + }, + { + "epoch": 0.42313359998468986, + "grad_norm": 1.5, + "learning_rate": 9.160509334793384e-05, + "loss": 7.087, + "step": 7370 + }, + { + "epoch": 0.42370772969973014, + "grad_norm": 1.765625, + "learning_rate": 9.157929108002414e-05, + "loss": 7.0492, + "step": 7380 + }, + { + "epoch": 0.4242818594147704, + "grad_norm": 1.6484375, + "learning_rate": 9.155345286601069e-05, + "loss": 7.0492, + "step": 7390 + }, + { + "epoch": 0.4248559891298107, + "grad_norm": 1.6171875, + "learning_rate": 9.152757872823113e-05, + "loss": 7.0173, + "step": 7400 + }, + { + "epoch": 0.425430118844851, + "grad_norm": 1.7109375, + "learning_rate": 9.150166868905414e-05, + "loss": 7.0533, + "step": 7410 + }, + { + "epoch": 0.4260042485598913, + "grad_norm": 1.7265625, + "learning_rate": 9.147572277087948e-05, + "loss": 7.03, + "step": 7420 + }, + { + "epoch": 0.42657837827493156, + "grad_norm": 1.578125, + "learning_rate": 9.144974099613787e-05, + "loss": 7.0407, + "step": 7430 + }, + { + "epoch": 0.42715250798997184, + "grad_norm": 1.5625, + "learning_rate": 9.142372338729108e-05, + "loss": 7.0591, + "step": 7440 + }, + { + "epoch": 0.4277266377050121, + "grad_norm": 1.6015625, + "learning_rate": 9.139766996683181e-05, + "loss": 7.0383, + "step": 7450 + }, + { + "epoch": 0.42830076742005246, + "grad_norm": 1.5390625, + "learning_rate": 9.137158075728377e-05, + "loss": 7.0378, + "step": 7460 + }, + { + "epoch": 0.42887489713509275, + "grad_norm": 1.6015625, + "learning_rate": 9.134545578120157e-05, + "loss": 7.0556, + "step": 7470 + }, + { + "epoch": 0.42944902685013303, + "grad_norm": 1.5859375, + "learning_rate": 9.131929506117078e-05, + "loss": 7.0647, + "step": 7480 + }, + { + "epoch": 0.4300231565651733, + "grad_norm": 1.6484375, + "learning_rate": 9.129309861980783e-05, + "loss": 7.0471, + "step": 7490 + }, + { + "epoch": 0.4305972862802136, + "grad_norm": 1.625, + "learning_rate": 9.126686647976008e-05, + "loss": 7.0375, + "step": 7500 + }, + { + "epoch": 0.4311714159952539, + "grad_norm": 1.5234375, + "learning_rate": 9.124059866370571e-05, + "loss": 7.0423, + "step": 7510 + }, + { + "epoch": 0.43174554571029417, + "grad_norm": 1.578125, + "learning_rate": 9.121429519435374e-05, + "loss": 7.0172, + "step": 7520 + }, + { + "epoch": 0.43231967542533445, + "grad_norm": 1.5390625, + "learning_rate": 9.118795609444403e-05, + "loss": 7.0096, + "step": 7530 + }, + { + "epoch": 0.43289380514037473, + "grad_norm": 1.6171875, + "learning_rate": 9.116158138674729e-05, + "loss": 7.0544, + "step": 7540 + }, + { + "epoch": 0.433467934855415, + "grad_norm": 1.625, + "learning_rate": 9.11351710940649e-05, + "loss": 7.0353, + "step": 7550 + }, + { + "epoch": 0.4340420645704553, + "grad_norm": 1.6328125, + "learning_rate": 9.110872523922911e-05, + "loss": 7.0482, + "step": 7560 + }, + { + "epoch": 0.4346161942854956, + "grad_norm": 1.515625, + "learning_rate": 9.108224384510286e-05, + "loss": 7.0514, + "step": 7570 + }, + { + "epoch": 0.43519032400053587, + "grad_norm": 1.7109375, + "learning_rate": 9.105572693457985e-05, + "loss": 7.0284, + "step": 7580 + }, + { + "epoch": 0.43576445371557615, + "grad_norm": 1.8046875, + "learning_rate": 9.102917453058444e-05, + "loss": 7.0148, + "step": 7590 + }, + { + "epoch": 0.43633858343061643, + "grad_norm": 1.609375, + "learning_rate": 9.100258665607171e-05, + "loss": 7.0613, + "step": 7600 + }, + { + "epoch": 0.4369127131456567, + "grad_norm": 1.6796875, + "learning_rate": 9.097596333402738e-05, + "loss": 7.0254, + "step": 7610 + }, + { + "epoch": 0.437486842860697, + "grad_norm": 1.625, + "learning_rate": 9.094930458746784e-05, + "loss": 7.0785, + "step": 7620 + }, + { + "epoch": 0.4380609725757373, + "grad_norm": 1.4765625, + "learning_rate": 9.09226104394401e-05, + "loss": 7.0231, + "step": 7630 + }, + { + "epoch": 0.43863510229077757, + "grad_norm": 1.578125, + "learning_rate": 9.089588091302176e-05, + "loss": 7.0644, + "step": 7640 + }, + { + "epoch": 0.43920923200581785, + "grad_norm": 1.6015625, + "learning_rate": 9.086911603132103e-05, + "loss": 7.0538, + "step": 7650 + }, + { + "epoch": 0.43978336172085813, + "grad_norm": 1.515625, + "learning_rate": 9.084231581747662e-05, + "loss": 7.0289, + "step": 7660 + }, + { + "epoch": 0.4403574914358984, + "grad_norm": 1.609375, + "learning_rate": 9.081548029465789e-05, + "loss": 7.0333, + "step": 7670 + }, + { + "epoch": 0.4409316211509387, + "grad_norm": 1.609375, + "learning_rate": 9.078860948606464e-05, + "loss": 7.0796, + "step": 7680 + }, + { + "epoch": 0.441505750865979, + "grad_norm": 1.671875, + "learning_rate": 9.076170341492722e-05, + "loss": 7.0289, + "step": 7690 + }, + { + "epoch": 0.44207988058101927, + "grad_norm": 1.671875, + "learning_rate": 9.073476210450646e-05, + "loss": 7.0234, + "step": 7700 + }, + { + "epoch": 0.44265401029605955, + "grad_norm": 1.609375, + "learning_rate": 9.070778557809362e-05, + "loss": 7.0468, + "step": 7710 + }, + { + "epoch": 0.44322814001109984, + "grad_norm": 1.59375, + "learning_rate": 9.068077385901043e-05, + "loss": 7.048, + "step": 7720 + }, + { + "epoch": 0.4438022697261401, + "grad_norm": 1.578125, + "learning_rate": 9.065372697060908e-05, + "loss": 7.0085, + "step": 7730 + }, + { + "epoch": 0.4443763994411804, + "grad_norm": 1.53125, + "learning_rate": 9.062664493627208e-05, + "loss": 7.0594, + "step": 7740 + }, + { + "epoch": 0.4449505291562207, + "grad_norm": 1.609375, + "learning_rate": 9.059952777941241e-05, + "loss": 7.0759, + "step": 7750 + }, + { + "epoch": 0.44552465887126097, + "grad_norm": 1.6875, + "learning_rate": 9.057237552347337e-05, + "loss": 7.0201, + "step": 7760 + }, + { + "epoch": 0.44609878858630125, + "grad_norm": 1.75, + "learning_rate": 9.054518819192862e-05, + "loss": 7.029, + "step": 7770 + }, + { + "epoch": 0.44667291830134154, + "grad_norm": 1.5703125, + "learning_rate": 9.051796580828212e-05, + "loss": 7.0293, + "step": 7780 + }, + { + "epoch": 0.4472470480163818, + "grad_norm": 1.6328125, + "learning_rate": 9.049070839606813e-05, + "loss": 6.9941, + "step": 7790 + }, + { + "epoch": 0.4478211777314221, + "grad_norm": 1.609375, + "learning_rate": 9.046341597885126e-05, + "loss": 7.0569, + "step": 7800 + }, + { + "epoch": 0.4483953074464624, + "grad_norm": 1.625, + "learning_rate": 9.043608858022631e-05, + "loss": 7.0543, + "step": 7810 + }, + { + "epoch": 0.44896943716150267, + "grad_norm": 1.765625, + "learning_rate": 9.040872622381834e-05, + "loss": 7.0131, + "step": 7820 + }, + { + "epoch": 0.44954356687654295, + "grad_norm": 1.578125, + "learning_rate": 9.038132893328264e-05, + "loss": 7.0364, + "step": 7830 + }, + { + "epoch": 0.45011769659158324, + "grad_norm": 1.578125, + "learning_rate": 9.035389673230472e-05, + "loss": 7.0294, + "step": 7840 + }, + { + "epoch": 0.4506918263066235, + "grad_norm": 1.75, + "learning_rate": 9.032642964460023e-05, + "loss": 7.046, + "step": 7850 + }, + { + "epoch": 0.4512659560216638, + "grad_norm": 1.6484375, + "learning_rate": 9.0298927693915e-05, + "loss": 7.0224, + "step": 7860 + }, + { + "epoch": 0.4518400857367041, + "grad_norm": 1.765625, + "learning_rate": 9.027139090402499e-05, + "loss": 7.0183, + "step": 7870 + }, + { + "epoch": 0.4524142154517444, + "grad_norm": 1.7265625, + "learning_rate": 9.024381929873631e-05, + "loss": 7.037, + "step": 7880 + }, + { + "epoch": 0.45298834516678466, + "grad_norm": 1.578125, + "learning_rate": 9.021621290188516e-05, + "loss": 7.0357, + "step": 7890 + }, + { + "epoch": 0.45356247488182494, + "grad_norm": 1.6484375, + "learning_rate": 9.018857173733776e-05, + "loss": 7.0316, + "step": 7900 + }, + { + "epoch": 0.4541366045968653, + "grad_norm": 1.71875, + "learning_rate": 9.016089582899047e-05, + "loss": 7.0234, + "step": 7910 + }, + { + "epoch": 0.45471073431190556, + "grad_norm": 1.5546875, + "learning_rate": 9.013318520076964e-05, + "loss": 7.0453, + "step": 7920 + }, + { + "epoch": 0.45528486402694585, + "grad_norm": 1.609375, + "learning_rate": 9.010543987663165e-05, + "loss": 7.0319, + "step": 7930 + }, + { + "epoch": 0.45585899374198613, + "grad_norm": 1.609375, + "learning_rate": 9.007765988056284e-05, + "loss": 7.0268, + "step": 7940 + }, + { + "epoch": 0.4564331234570264, + "grad_norm": 1.6171875, + "learning_rate": 9.00498452365796e-05, + "loss": 6.972, + "step": 7950 + }, + { + "epoch": 0.4570072531720667, + "grad_norm": 1.6640625, + "learning_rate": 9.002199596872821e-05, + "loss": 7.062, + "step": 7960 + }, + { + "epoch": 0.457581382887107, + "grad_norm": 1.59375, + "learning_rate": 8.99941121010849e-05, + "loss": 7.0066, + "step": 7970 + }, + { + "epoch": 0.45815551260214726, + "grad_norm": 1.59375, + "learning_rate": 8.996619365775583e-05, + "loss": 7.0132, + "step": 7980 + }, + { + "epoch": 0.45872964231718755, + "grad_norm": 1.5390625, + "learning_rate": 8.993824066287699e-05, + "loss": 7.0405, + "step": 7990 + }, + { + "epoch": 0.45930377203222783, + "grad_norm": 1.6015625, + "learning_rate": 8.991025314061434e-05, + "loss": 7.0209, + "step": 8000 + }, + { + "epoch": 0.4598779017472681, + "grad_norm": 1.5859375, + "learning_rate": 8.988223111516363e-05, + "loss": 7.0209, + "step": 8010 + }, + { + "epoch": 0.4604520314623084, + "grad_norm": 1.7578125, + "learning_rate": 8.98541746107504e-05, + "loss": 7.0216, + "step": 8020 + }, + { + "epoch": 0.4610261611773487, + "grad_norm": 1.6328125, + "learning_rate": 8.982608365163009e-05, + "loss": 7.0717, + "step": 8030 + }, + { + "epoch": 0.46160029089238896, + "grad_norm": 1.6875, + "learning_rate": 8.979795826208785e-05, + "loss": 7.0264, + "step": 8040 + }, + { + "epoch": 0.46217442060742925, + "grad_norm": 1.5859375, + "learning_rate": 8.976979846643865e-05, + "loss": 7.0146, + "step": 8050 + }, + { + "epoch": 0.46274855032246953, + "grad_norm": 1.6484375, + "learning_rate": 8.974160428902716e-05, + "loss": 7.012, + "step": 8060 + }, + { + "epoch": 0.4633226800375098, + "grad_norm": 1.734375, + "learning_rate": 8.97133757542278e-05, + "loss": 7.0154, + "step": 8070 + }, + { + "epoch": 0.4638968097525501, + "grad_norm": 1.59375, + "learning_rate": 8.968511288644468e-05, + "loss": 6.9963, + "step": 8080 + }, + { + "epoch": 0.4644709394675904, + "grad_norm": 1.5859375, + "learning_rate": 8.96568157101116e-05, + "loss": 7.0217, + "step": 8090 + }, + { + "epoch": 0.46504506918263067, + "grad_norm": 1.59375, + "learning_rate": 8.962848424969201e-05, + "loss": 7.0337, + "step": 8100 + }, + { + "epoch": 0.46561919889767095, + "grad_norm": 1.6640625, + "learning_rate": 8.960011852967904e-05, + "loss": 7.0316, + "step": 8110 + }, + { + "epoch": 0.46619332861271123, + "grad_norm": 1.6328125, + "learning_rate": 8.957171857459538e-05, + "loss": 7.0226, + "step": 8120 + }, + { + "epoch": 0.4667674583277515, + "grad_norm": 1.6015625, + "learning_rate": 8.954328440899334e-05, + "loss": 7.0425, + "step": 8130 + }, + { + "epoch": 0.4673415880427918, + "grad_norm": 1.6640625, + "learning_rate": 8.95148160574548e-05, + "loss": 7.0096, + "step": 8140 + }, + { + "epoch": 0.4679157177578321, + "grad_norm": 1.5703125, + "learning_rate": 8.948631354459123e-05, + "loss": 7.0395, + "step": 8150 + }, + { + "epoch": 0.46848984747287237, + "grad_norm": 1.6953125, + "learning_rate": 8.945777689504357e-05, + "loss": 7.0209, + "step": 8160 + }, + { + "epoch": 0.46906397718791265, + "grad_norm": 1.6875, + "learning_rate": 8.942920613348235e-05, + "loss": 7.0099, + "step": 8170 + }, + { + "epoch": 0.46963810690295293, + "grad_norm": 1.6328125, + "learning_rate": 8.940060128460752e-05, + "loss": 7.0199, + "step": 8180 + }, + { + "epoch": 0.4702122366179932, + "grad_norm": 1.7109375, + "learning_rate": 8.937196237314853e-05, + "loss": 7.0487, + "step": 8190 + }, + { + "epoch": 0.4707863663330335, + "grad_norm": 1.6015625, + "learning_rate": 8.934328942386427e-05, + "loss": 7.0168, + "step": 8200 + }, + { + "epoch": 0.4713604960480738, + "grad_norm": 1.7578125, + "learning_rate": 8.931458246154307e-05, + "loss": 7.0126, + "step": 8210 + }, + { + "epoch": 0.47193462576311407, + "grad_norm": 1.5234375, + "learning_rate": 8.928584151100265e-05, + "loss": 7.0564, + "step": 8220 + }, + { + "epoch": 0.47250875547815435, + "grad_norm": 1.6640625, + "learning_rate": 8.925706659709014e-05, + "loss": 7.0058, + "step": 8230 + }, + { + "epoch": 0.47308288519319464, + "grad_norm": 1.578125, + "learning_rate": 8.922825774468198e-05, + "loss": 7.0144, + "step": 8240 + }, + { + "epoch": 0.4736570149082349, + "grad_norm": 1.5859375, + "learning_rate": 8.919941497868398e-05, + "loss": 7.0205, + "step": 8250 + }, + { + "epoch": 0.4742311446232752, + "grad_norm": 1.6796875, + "learning_rate": 8.917053832403131e-05, + "loss": 7.0391, + "step": 8260 + }, + { + "epoch": 0.4748052743383155, + "grad_norm": 1.6484375, + "learning_rate": 8.914162780568836e-05, + "loss": 7.004, + "step": 8270 + }, + { + "epoch": 0.47537940405335577, + "grad_norm": 1.59375, + "learning_rate": 8.911268344864885e-05, + "loss": 6.9943, + "step": 8280 + }, + { + "epoch": 0.47595353376839605, + "grad_norm": 1.625, + "learning_rate": 8.908370527793573e-05, + "loss": 7.0023, + "step": 8290 + }, + { + "epoch": 0.47652766348343634, + "grad_norm": 1.6171875, + "learning_rate": 8.905469331860121e-05, + "loss": 6.9991, + "step": 8300 + }, + { + "epoch": 0.4771017931984766, + "grad_norm": 1.6328125, + "learning_rate": 8.902564759572667e-05, + "loss": 7.0314, + "step": 8310 + }, + { + "epoch": 0.4776759229135169, + "grad_norm": 1.7421875, + "learning_rate": 8.899656813442273e-05, + "loss": 7.0244, + "step": 8320 + }, + { + "epoch": 0.4782500526285572, + "grad_norm": 1.609375, + "learning_rate": 8.89674549598291e-05, + "loss": 6.9878, + "step": 8330 + }, + { + "epoch": 0.47882418234359747, + "grad_norm": 1.6171875, + "learning_rate": 8.893830809711472e-05, + "loss": 7.0248, + "step": 8340 + }, + { + "epoch": 0.47939831205863775, + "grad_norm": 1.5703125, + "learning_rate": 8.89091275714776e-05, + "loss": 7.0032, + "step": 8350 + }, + { + "epoch": 0.4799724417736781, + "grad_norm": 1.6875, + "learning_rate": 8.88799134081449e-05, + "loss": 7.0167, + "step": 8360 + }, + { + "epoch": 0.4805465714887184, + "grad_norm": 1.6328125, + "learning_rate": 8.88506656323728e-05, + "loss": 7.0079, + "step": 8370 + }, + { + "epoch": 0.48112070120375866, + "grad_norm": 1.6015625, + "learning_rate": 8.88213842694466e-05, + "loss": 7.0017, + "step": 8380 + }, + { + "epoch": 0.48169483091879894, + "grad_norm": 1.6328125, + "learning_rate": 8.879206934468056e-05, + "loss": 7.0265, + "step": 8390 + }, + { + "epoch": 0.4822689606338392, + "grad_norm": 1.6015625, + "learning_rate": 8.876272088341804e-05, + "loss": 6.9952, + "step": 8400 + }, + { + "epoch": 0.4828430903488795, + "grad_norm": 1.578125, + "learning_rate": 8.873333891103135e-05, + "loss": 6.9986, + "step": 8410 + }, + { + "epoch": 0.4834172200639198, + "grad_norm": 1.6171875, + "learning_rate": 8.870392345292175e-05, + "loss": 6.9903, + "step": 8420 + }, + { + "epoch": 0.4839913497789601, + "grad_norm": 1.625, + "learning_rate": 8.867447453451952e-05, + "loss": 7.0363, + "step": 8430 + }, + { + "epoch": 0.48456547949400036, + "grad_norm": 1.6796875, + "learning_rate": 8.864499218128377e-05, + "loss": 7.0042, + "step": 8440 + }, + { + "epoch": 0.48513960920904065, + "grad_norm": 1.6171875, + "learning_rate": 8.86154764187026e-05, + "loss": 7.0164, + "step": 8450 + }, + { + "epoch": 0.48571373892408093, + "grad_norm": 1.5546875, + "learning_rate": 8.858592727229295e-05, + "loss": 7.0584, + "step": 8460 + }, + { + "epoch": 0.4862878686391212, + "grad_norm": 1.5625, + "learning_rate": 8.855634476760061e-05, + "loss": 7.0018, + "step": 8470 + }, + { + "epoch": 0.4868619983541615, + "grad_norm": 1.5859375, + "learning_rate": 8.852672893020027e-05, + "loss": 6.9839, + "step": 8480 + }, + { + "epoch": 0.4874361280692018, + "grad_norm": 1.640625, + "learning_rate": 8.849707978569537e-05, + "loss": 7.004, + "step": 8490 + }, + { + "epoch": 0.48801025778424206, + "grad_norm": 1.515625, + "learning_rate": 8.846739735971817e-05, + "loss": 7.0146, + "step": 8500 + }, + { + "epoch": 0.48858438749928235, + "grad_norm": 1.59375, + "learning_rate": 8.843768167792971e-05, + "loss": 7.0398, + "step": 8510 + }, + { + "epoch": 0.48915851721432263, + "grad_norm": 1.625, + "learning_rate": 8.840793276601977e-05, + "loss": 7.0074, + "step": 8520 + }, + { + "epoch": 0.4897326469293629, + "grad_norm": 1.578125, + "learning_rate": 8.837815064970687e-05, + "loss": 6.9928, + "step": 8530 + }, + { + "epoch": 0.4903067766444032, + "grad_norm": 1.6171875, + "learning_rate": 8.834833535473822e-05, + "loss": 7.0171, + "step": 8540 + }, + { + "epoch": 0.4908809063594435, + "grad_norm": 1.5625, + "learning_rate": 8.831848690688972e-05, + "loss": 7.0176, + "step": 8550 + }, + { + "epoch": 0.49145503607448376, + "grad_norm": 1.65625, + "learning_rate": 8.828860533196593e-05, + "loss": 7.0408, + "step": 8560 + }, + { + "epoch": 0.49202916578952405, + "grad_norm": 1.625, + "learning_rate": 8.825869065580006e-05, + "loss": 7.0392, + "step": 8570 + }, + { + "epoch": 0.49260329550456433, + "grad_norm": 1.7421875, + "learning_rate": 8.822874290425391e-05, + "loss": 7.0104, + "step": 8580 + }, + { + "epoch": 0.4931774252196046, + "grad_norm": 1.640625, + "learning_rate": 8.819876210321792e-05, + "loss": 7.001, + "step": 8590 + }, + { + "epoch": 0.4937515549346449, + "grad_norm": 1.640625, + "learning_rate": 8.816874827861103e-05, + "loss": 7.0417, + "step": 8600 + }, + { + "epoch": 0.4943256846496852, + "grad_norm": 1.6171875, + "learning_rate": 8.813870145638083e-05, + "loss": 7.0276, + "step": 8610 + }, + { + "epoch": 0.49489981436472547, + "grad_norm": 1.6171875, + "learning_rate": 8.810862166250335e-05, + "loss": 7.0135, + "step": 8620 + }, + { + "epoch": 0.49547394407976575, + "grad_norm": 1.5, + "learning_rate": 8.807850892298315e-05, + "loss": 7.0043, + "step": 8630 + }, + { + "epoch": 0.49604807379480603, + "grad_norm": 1.6796875, + "learning_rate": 8.804836326385328e-05, + "loss": 7.0027, + "step": 8640 + }, + { + "epoch": 0.4966222035098463, + "grad_norm": 1.609375, + "learning_rate": 8.801818471117528e-05, + "loss": 7.0387, + "step": 8650 + }, + { + "epoch": 0.4971963332248866, + "grad_norm": 1.6484375, + "learning_rate": 8.798797329103905e-05, + "loss": 7.0251, + "step": 8660 + }, + { + "epoch": 0.4977704629399269, + "grad_norm": 1.59375, + "learning_rate": 8.795772902956297e-05, + "loss": 7.0148, + "step": 8670 + }, + { + "epoch": 0.49834459265496717, + "grad_norm": 1.6484375, + "learning_rate": 8.792745195289378e-05, + "loss": 7.0388, + "step": 8680 + }, + { + "epoch": 0.49891872237000745, + "grad_norm": 1.5625, + "learning_rate": 8.789714208720661e-05, + "loss": 7.0125, + "step": 8690 + }, + { + "epoch": 0.49949285208504773, + "grad_norm": 1.625, + "learning_rate": 8.786679945870491e-05, + "loss": 7.0172, + "step": 8700 + }, + { + "epoch": 0.500066981800088, + "grad_norm": 1.5546875, + "learning_rate": 8.78364240936205e-05, + "loss": 7.0287, + "step": 8710 + }, + { + "epoch": 0.5006411115151284, + "grad_norm": 1.6171875, + "learning_rate": 8.780601601821345e-05, + "loss": 7.0163, + "step": 8720 + }, + { + "epoch": 0.5012152412301686, + "grad_norm": 1.6796875, + "learning_rate": 8.777557525877216e-05, + "loss": 7.0036, + "step": 8730 + }, + { + "epoch": 0.5017893709452089, + "grad_norm": 1.609375, + "learning_rate": 8.774510184161322e-05, + "loss": 6.9887, + "step": 8740 + }, + { + "epoch": 0.5023635006602492, + "grad_norm": 1.578125, + "learning_rate": 8.77145957930815e-05, + "loss": 7.0021, + "step": 8750 + }, + { + "epoch": 0.5029376303752895, + "grad_norm": 1.6015625, + "learning_rate": 8.768405713955009e-05, + "loss": 7.0171, + "step": 8760 + }, + { + "epoch": 0.5035117600903297, + "grad_norm": 1.59375, + "learning_rate": 8.765348590742021e-05, + "loss": 7.0025, + "step": 8770 + }, + { + "epoch": 0.5040858898053701, + "grad_norm": 1.6953125, + "learning_rate": 8.762288212312133e-05, + "loss": 7.0257, + "step": 8780 + }, + { + "epoch": 0.5046600195204103, + "grad_norm": 1.5234375, + "learning_rate": 8.759224581311098e-05, + "loss": 7.0121, + "step": 8790 + }, + { + "epoch": 0.5052341492354506, + "grad_norm": 1.59375, + "learning_rate": 8.756157700387487e-05, + "loss": 7.0393, + "step": 8800 + }, + { + "epoch": 0.5058082789504909, + "grad_norm": 1.625, + "learning_rate": 8.753087572192675e-05, + "loss": 7.0138, + "step": 8810 + }, + { + "epoch": 0.5063824086655312, + "grad_norm": 1.5625, + "learning_rate": 8.750014199380852e-05, + "loss": 7.0661, + "step": 8820 + }, + { + "epoch": 0.5069565383805714, + "grad_norm": 1.6171875, + "learning_rate": 8.746937584609003e-05, + "loss": 7.0297, + "step": 8830 + }, + { + "epoch": 0.5075306680956118, + "grad_norm": 1.640625, + "learning_rate": 8.743857730536925e-05, + "loss": 7.0149, + "step": 8840 + }, + { + "epoch": 0.508104797810652, + "grad_norm": 1.53125, + "learning_rate": 8.74077463982721e-05, + "loss": 7.0319, + "step": 8850 + }, + { + "epoch": 0.5086789275256923, + "grad_norm": 1.5234375, + "learning_rate": 8.737688315145251e-05, + "loss": 7.0174, + "step": 8860 + }, + { + "epoch": 0.5092530572407326, + "grad_norm": 1.609375, + "learning_rate": 8.734598759159234e-05, + "loss": 7.0097, + "step": 8870 + }, + { + "epoch": 0.5098271869557729, + "grad_norm": 1.671875, + "learning_rate": 8.731505974540139e-05, + "loss": 6.9883, + "step": 8880 + }, + { + "epoch": 0.5104013166708131, + "grad_norm": 1.65625, + "learning_rate": 8.728409963961744e-05, + "loss": 7.0213, + "step": 8890 + }, + { + "epoch": 0.5109754463858535, + "grad_norm": 1.6015625, + "learning_rate": 8.725310730100602e-05, + "loss": 7.0088, + "step": 8900 + }, + { + "epoch": 0.5115495761008937, + "grad_norm": 1.59375, + "learning_rate": 8.722208275636068e-05, + "loss": 6.9966, + "step": 8910 + }, + { + "epoch": 0.512123705815934, + "grad_norm": 1.546875, + "learning_rate": 8.71910260325027e-05, + "loss": 7.0113, + "step": 8920 + }, + { + "epoch": 0.5126978355309743, + "grad_norm": 1.5390625, + "learning_rate": 8.715993715628122e-05, + "loss": 7.0126, + "step": 8930 + }, + { + "epoch": 0.5132719652460146, + "grad_norm": 1.578125, + "learning_rate": 8.71288161545732e-05, + "loss": 7.0036, + "step": 8940 + }, + { + "epoch": 0.5138460949610548, + "grad_norm": 1.6484375, + "learning_rate": 8.709766305428334e-05, + "loss": 7.004, + "step": 8950 + }, + { + "epoch": 0.5144202246760952, + "grad_norm": 1.5234375, + "learning_rate": 8.70664778823441e-05, + "loss": 7.0153, + "step": 8960 + }, + { + "epoch": 0.5149943543911354, + "grad_norm": 1.6484375, + "learning_rate": 8.703526066571565e-05, + "loss": 7.0101, + "step": 8970 + }, + { + "epoch": 0.5155684841061757, + "grad_norm": 1.5546875, + "learning_rate": 8.70040114313859e-05, + "loss": 7.0252, + "step": 8980 + }, + { + "epoch": 0.516142613821216, + "grad_norm": 1.609375, + "learning_rate": 8.697273020637042e-05, + "loss": 6.9998, + "step": 8990 + }, + { + "epoch": 0.5167167435362563, + "grad_norm": 1.734375, + "learning_rate": 8.694141701771241e-05, + "loss": 7.0078, + "step": 9000 + }, + { + "epoch": 0.5172908732512965, + "grad_norm": 1.5859375, + "learning_rate": 8.691007189248276e-05, + "loss": 7.0193, + "step": 9010 + }, + { + "epoch": 0.5178650029663369, + "grad_norm": 1.640625, + "learning_rate": 8.687869485777993e-05, + "loss": 7.0169, + "step": 9020 + }, + { + "epoch": 0.5184391326813771, + "grad_norm": 1.578125, + "learning_rate": 8.684728594072995e-05, + "loss": 6.9945, + "step": 9030 + }, + { + "epoch": 0.5190132623964174, + "grad_norm": 1.6171875, + "learning_rate": 8.681584516848648e-05, + "loss": 7.002, + "step": 9040 + }, + { + "epoch": 0.5195873921114578, + "grad_norm": 1.640625, + "learning_rate": 8.678437256823065e-05, + "loss": 7.0061, + "step": 9050 + }, + { + "epoch": 0.520161521826498, + "grad_norm": 1.46875, + "learning_rate": 8.675286816717114e-05, + "loss": 7.0683, + "step": 9060 + }, + { + "epoch": 0.5207356515415383, + "grad_norm": 1.6015625, + "learning_rate": 8.67213319925441e-05, + "loss": 6.9938, + "step": 9070 + }, + { + "epoch": 0.5213097812565786, + "grad_norm": 1.5546875, + "learning_rate": 8.66897640716132e-05, + "loss": 7.0302, + "step": 9080 + }, + { + "epoch": 0.5218839109716189, + "grad_norm": 1.4921875, + "learning_rate": 8.66581644316695e-05, + "loss": 6.9934, + "step": 9090 + }, + { + "epoch": 0.5224580406866591, + "grad_norm": 1.65625, + "learning_rate": 8.66265331000315e-05, + "loss": 7.0016, + "step": 9100 + }, + { + "epoch": 0.5230321704016995, + "grad_norm": 1.6015625, + "learning_rate": 8.659487010404511e-05, + "loss": 7.0131, + "step": 9110 + }, + { + "epoch": 0.5236063001167397, + "grad_norm": 1.8203125, + "learning_rate": 8.656317547108356e-05, + "loss": 6.9904, + "step": 9120 + }, + { + "epoch": 0.52418042983178, + "grad_norm": 1.640625, + "learning_rate": 8.653144922854755e-05, + "loss": 7.0, + "step": 9130 + }, + { + "epoch": 0.5247545595468203, + "grad_norm": 1.609375, + "learning_rate": 8.649969140386497e-05, + "loss": 7.0017, + "step": 9140 + }, + { + "epoch": 0.5253286892618606, + "grad_norm": 1.625, + "learning_rate": 8.646790202449114e-05, + "loss": 6.9942, + "step": 9150 + }, + { + "epoch": 0.5259028189769008, + "grad_norm": 1.6796875, + "learning_rate": 8.64360811179085e-05, + "loss": 6.9673, + "step": 9160 + }, + { + "epoch": 0.5264769486919412, + "grad_norm": 1.6640625, + "learning_rate": 8.640422871162693e-05, + "loss": 7.0162, + "step": 9170 + }, + { + "epoch": 0.5270510784069814, + "grad_norm": 1.59375, + "learning_rate": 8.637234483318342e-05, + "loss": 7.0132, + "step": 9180 + }, + { + "epoch": 0.5276252081220217, + "grad_norm": 1.53125, + "learning_rate": 8.634042951014219e-05, + "loss": 6.9909, + "step": 9190 + }, + { + "epoch": 0.528199337837062, + "grad_norm": 1.640625, + "learning_rate": 8.630848277009465e-05, + "loss": 6.9867, + "step": 9200 + }, + { + "epoch": 0.5287734675521023, + "grad_norm": 1.6015625, + "learning_rate": 8.627650464065942e-05, + "loss": 7.0013, + "step": 9210 + }, + { + "epoch": 0.5293475972671425, + "grad_norm": 1.7265625, + "learning_rate": 8.624449514948216e-05, + "loss": 7.0366, + "step": 9220 + }, + { + "epoch": 0.5299217269821829, + "grad_norm": 1.6875, + "learning_rate": 8.621245432423575e-05, + "loss": 7.042, + "step": 9230 + }, + { + "epoch": 0.5304958566972231, + "grad_norm": 1.5625, + "learning_rate": 8.618038219262006e-05, + "loss": 7.017, + "step": 9240 + }, + { + "epoch": 0.5310699864122634, + "grad_norm": 1.71875, + "learning_rate": 8.614827878236209e-05, + "loss": 6.9998, + "step": 9250 + }, + { + "epoch": 0.5316441161273037, + "grad_norm": 1.6171875, + "learning_rate": 8.611614412121584e-05, + "loss": 6.9938, + "step": 9260 + }, + { + "epoch": 0.532218245842344, + "grad_norm": 1.65625, + "learning_rate": 8.608397823696239e-05, + "loss": 7.0051, + "step": 9270 + }, + { + "epoch": 0.5327923755573842, + "grad_norm": 1.6796875, + "learning_rate": 8.605178115740975e-05, + "loss": 6.9972, + "step": 9280 + }, + { + "epoch": 0.5333665052724246, + "grad_norm": 1.6171875, + "learning_rate": 8.60195529103929e-05, + "loss": 7.0035, + "step": 9290 + }, + { + "epoch": 0.5339406349874648, + "grad_norm": 1.5390625, + "learning_rate": 8.598729352377381e-05, + "loss": 7.0136, + "step": 9300 + }, + { + "epoch": 0.5345147647025051, + "grad_norm": 1.5390625, + "learning_rate": 8.595500302544133e-05, + "loss": 7.0247, + "step": 9310 + }, + { + "epoch": 0.5350888944175454, + "grad_norm": 1.59375, + "learning_rate": 8.592268144331124e-05, + "loss": 6.9363, + "step": 9320 + }, + { + "epoch": 0.5356630241325857, + "grad_norm": 1.671875, + "learning_rate": 8.589032880532615e-05, + "loss": 6.9205, + "step": 9330 + }, + { + "epoch": 0.5362371538476259, + "grad_norm": 1.640625, + "learning_rate": 8.585794513945557e-05, + "loss": 6.9942, + "step": 9340 + }, + { + "epoch": 0.5368112835626663, + "grad_norm": 1.5703125, + "learning_rate": 8.582553047369579e-05, + "loss": 7.0191, + "step": 9350 + }, + { + "epoch": 0.5373854132777065, + "grad_norm": 1.59375, + "learning_rate": 8.579308483606991e-05, + "loss": 6.9831, + "step": 9360 + }, + { + "epoch": 0.5379595429927468, + "grad_norm": 1.5859375, + "learning_rate": 8.576060825462784e-05, + "loss": 6.9771, + "step": 9370 + }, + { + "epoch": 0.5385336727077871, + "grad_norm": 1.671875, + "learning_rate": 8.57281007574462e-05, + "loss": 7.0119, + "step": 9380 + }, + { + "epoch": 0.5391078024228274, + "grad_norm": 1.671875, + "learning_rate": 8.569556237262834e-05, + "loss": 6.9919, + "step": 9390 + }, + { + "epoch": 0.5396819321378676, + "grad_norm": 1.6171875, + "learning_rate": 8.566299312830433e-05, + "loss": 7.0048, + "step": 9400 + }, + { + "epoch": 0.540256061852908, + "grad_norm": 1.578125, + "learning_rate": 8.563039305263095e-05, + "loss": 7.0365, + "step": 9410 + }, + { + "epoch": 0.5408301915679482, + "grad_norm": 1.546875, + "learning_rate": 8.559776217379154e-05, + "loss": 7.0129, + "step": 9420 + }, + { + "epoch": 0.5414043212829885, + "grad_norm": 1.6796875, + "learning_rate": 8.556510051999616e-05, + "loss": 6.9662, + "step": 9430 + }, + { + "epoch": 0.5419784509980288, + "grad_norm": 1.6328125, + "learning_rate": 8.553240811948144e-05, + "loss": 6.9831, + "step": 9440 + }, + { + "epoch": 0.5425525807130691, + "grad_norm": 1.6015625, + "learning_rate": 8.54996850005106e-05, + "loss": 7.0207, + "step": 9450 + }, + { + "epoch": 0.5431267104281093, + "grad_norm": 1.5703125, + "learning_rate": 8.54669311913734e-05, + "loss": 6.9951, + "step": 9460 + }, + { + "epoch": 0.5437008401431497, + "grad_norm": 1.6640625, + "learning_rate": 8.543414672038615e-05, + "loss": 7.0068, + "step": 9470 + }, + { + "epoch": 0.5442749698581899, + "grad_norm": 1.6015625, + "learning_rate": 8.540133161589165e-05, + "loss": 6.9715, + "step": 9480 + }, + { + "epoch": 0.5448490995732302, + "grad_norm": 1.6484375, + "learning_rate": 8.536848590625923e-05, + "loss": 6.9811, + "step": 9490 + }, + { + "epoch": 0.5454232292882706, + "grad_norm": 1.5859375, + "learning_rate": 8.53356096198846e-05, + "loss": 6.9818, + "step": 9500 + }, + { + "epoch": 0.5459973590033108, + "grad_norm": 1.6328125, + "learning_rate": 8.530270278518997e-05, + "loss": 7.032, + "step": 9510 + }, + { + "epoch": 0.5465714887183512, + "grad_norm": 1.640625, + "learning_rate": 8.52697654306239e-05, + "loss": 7.0155, + "step": 9520 + }, + { + "epoch": 0.5471456184333914, + "grad_norm": 1.5390625, + "learning_rate": 8.523679758466144e-05, + "loss": 7.0373, + "step": 9530 + }, + { + "epoch": 0.5477197481484317, + "grad_norm": 1.5703125, + "learning_rate": 8.520379927580386e-05, + "loss": 7.0093, + "step": 9540 + }, + { + "epoch": 0.548293877863472, + "grad_norm": 1.5625, + "learning_rate": 8.51707705325789e-05, + "loss": 7.0006, + "step": 9550 + }, + { + "epoch": 0.5488680075785123, + "grad_norm": 1.5859375, + "learning_rate": 8.513771138354052e-05, + "loss": 6.9994, + "step": 9560 + }, + { + "epoch": 0.5494421372935525, + "grad_norm": 1.59375, + "learning_rate": 8.5104621857269e-05, + "loss": 7.004, + "step": 9570 + }, + { + "epoch": 0.5500162670085929, + "grad_norm": 1.6328125, + "learning_rate": 8.507150198237087e-05, + "loss": 7.0039, + "step": 9580 + }, + { + "epoch": 0.5505903967236331, + "grad_norm": 1.609375, + "learning_rate": 8.503835178747892e-05, + "loss": 6.9992, + "step": 9590 + }, + { + "epoch": 0.5511645264386734, + "grad_norm": 1.6171875, + "learning_rate": 8.500517130125212e-05, + "loss": 7.0309, + "step": 9600 + }, + { + "epoch": 0.5517386561537136, + "grad_norm": 1.59375, + "learning_rate": 8.497196055237565e-05, + "loss": 6.9839, + "step": 9610 + }, + { + "epoch": 0.552312785868754, + "grad_norm": 1.609375, + "learning_rate": 8.493871956956083e-05, + "loss": 6.9932, + "step": 9620 + }, + { + "epoch": 0.5528869155837942, + "grad_norm": 1.53125, + "learning_rate": 8.490544838154518e-05, + "loss": 7.0001, + "step": 9630 + }, + { + "epoch": 0.5534610452988346, + "grad_norm": 1.5859375, + "learning_rate": 8.487214701709225e-05, + "loss": 7.0033, + "step": 9640 + }, + { + "epoch": 0.5540351750138748, + "grad_norm": 1.7109375, + "learning_rate": 8.48388155049917e-05, + "loss": 7.0179, + "step": 9650 + }, + { + "epoch": 0.5546093047289151, + "grad_norm": 1.6171875, + "learning_rate": 8.480545387405933e-05, + "loss": 7.0158, + "step": 9660 + }, + { + "epoch": 0.5551834344439553, + "grad_norm": 1.6953125, + "learning_rate": 8.477206215313687e-05, + "loss": 6.971, + "step": 9670 + }, + { + "epoch": 0.5557575641589957, + "grad_norm": 1.625, + "learning_rate": 8.473864037109212e-05, + "loss": 6.9604, + "step": 9680 + }, + { + "epoch": 0.5563316938740359, + "grad_norm": 1.65625, + "learning_rate": 8.470518855681886e-05, + "loss": 7.0133, + "step": 9690 + }, + { + "epoch": 0.5569058235890763, + "grad_norm": 1.6484375, + "learning_rate": 8.467170673923684e-05, + "loss": 6.9575, + "step": 9700 + }, + { + "epoch": 0.5574799533041165, + "grad_norm": 1.5625, + "learning_rate": 8.463819494729173e-05, + "loss": 6.9639, + "step": 9710 + }, + { + "epoch": 0.5580540830191568, + "grad_norm": 1.578125, + "learning_rate": 8.460465320995513e-05, + "loss": 6.995, + "step": 9720 + }, + { + "epoch": 0.558628212734197, + "grad_norm": 1.5625, + "learning_rate": 8.45710815562245e-05, + "loss": 7.0093, + "step": 9730 + }, + { + "epoch": 0.5592023424492374, + "grad_norm": 1.6015625, + "learning_rate": 8.453748001512322e-05, + "loss": 6.9967, + "step": 9740 + }, + { + "epoch": 0.5597764721642776, + "grad_norm": 1.625, + "learning_rate": 8.450384861570047e-05, + "loss": 7.0163, + "step": 9750 + }, + { + "epoch": 0.560350601879318, + "grad_norm": 1.578125, + "learning_rate": 8.447018738703122e-05, + "loss": 6.9971, + "step": 9760 + }, + { + "epoch": 0.5609247315943582, + "grad_norm": 1.6328125, + "learning_rate": 8.443649635821629e-05, + "loss": 6.9673, + "step": 9770 + }, + { + "epoch": 0.5614988613093985, + "grad_norm": 1.5546875, + "learning_rate": 8.44027755583822e-05, + "loss": 6.9829, + "step": 9780 + }, + { + "epoch": 0.5620729910244388, + "grad_norm": 1.5859375, + "learning_rate": 8.436902501668124e-05, + "loss": 6.9677, + "step": 9790 + }, + { + "epoch": 0.5626471207394791, + "grad_norm": 1.578125, + "learning_rate": 8.433524476229142e-05, + "loss": 7.0134, + "step": 9800 + }, + { + "epoch": 0.5632212504545193, + "grad_norm": 1.6484375, + "learning_rate": 8.430143482441643e-05, + "loss": 7.0174, + "step": 9810 + }, + { + "epoch": 0.5637953801695597, + "grad_norm": 1.6953125, + "learning_rate": 8.42675952322856e-05, + "loss": 7.0116, + "step": 9820 + }, + { + "epoch": 0.5643695098845999, + "grad_norm": 1.6796875, + "learning_rate": 8.423372601515391e-05, + "loss": 6.9594, + "step": 9830 + }, + { + "epoch": 0.5649436395996402, + "grad_norm": 1.6484375, + "learning_rate": 8.419982720230199e-05, + "loss": 6.9933, + "step": 9840 + }, + { + "epoch": 0.5655177693146805, + "grad_norm": 1.6875, + "learning_rate": 8.416589882303598e-05, + "loss": 6.9948, + "step": 9850 + }, + { + "epoch": 0.5660918990297208, + "grad_norm": 1.671875, + "learning_rate": 8.413194090668766e-05, + "loss": 6.9934, + "step": 9860 + }, + { + "epoch": 0.566666028744761, + "grad_norm": 1.6171875, + "learning_rate": 8.409795348261427e-05, + "loss": 6.9736, + "step": 9870 + }, + { + "epoch": 0.5672401584598014, + "grad_norm": 1.53125, + "learning_rate": 8.40639365801986e-05, + "loss": 6.9948, + "step": 9880 + }, + { + "epoch": 0.5678142881748416, + "grad_norm": 1.59375, + "learning_rate": 8.402989022884896e-05, + "loss": 6.9928, + "step": 9890 + }, + { + "epoch": 0.5683884178898819, + "grad_norm": 1.625, + "learning_rate": 8.399581445799905e-05, + "loss": 6.9894, + "step": 9900 + }, + { + "epoch": 0.5689625476049222, + "grad_norm": 1.6484375, + "learning_rate": 8.396170929710805e-05, + "loss": 6.9837, + "step": 9910 + }, + { + "epoch": 0.5695366773199625, + "grad_norm": 944.0, + "learning_rate": 8.392757477566051e-05, + "loss": 7.0183, + "step": 9920 + }, + { + "epoch": 0.5701108070350027, + "grad_norm": 1.71875, + "learning_rate": 8.389341092316642e-05, + "loss": 6.9961, + "step": 9930 + }, + { + "epoch": 0.5706849367500431, + "grad_norm": 1.6328125, + "learning_rate": 8.385921776916106e-05, + "loss": 7.0278, + "step": 9940 + }, + { + "epoch": 0.5712590664650834, + "grad_norm": 1.5859375, + "learning_rate": 8.382499534320509e-05, + "loss": 7.014, + "step": 9950 + }, + { + "epoch": 0.5718331961801236, + "grad_norm": 1.5703125, + "learning_rate": 8.379074367488446e-05, + "loss": 6.9726, + "step": 9960 + }, + { + "epoch": 0.572407325895164, + "grad_norm": 1.5703125, + "learning_rate": 8.375646279381042e-05, + "loss": 7.0086, + "step": 9970 + }, + { + "epoch": 0.5729814556102042, + "grad_norm": 1.5703125, + "learning_rate": 8.372215272961943e-05, + "loss": 6.9934, + "step": 9980 + }, + { + "epoch": 0.5735555853252445, + "grad_norm": 1.6875, + "learning_rate": 8.368781351197321e-05, + "loss": 7.0058, + "step": 9990 + }, + { + "epoch": 0.5741297150402848, + "grad_norm": 1.6796875, + "learning_rate": 8.36534451705587e-05, + "loss": 6.9995, + "step": 10000 + }, + { + "epoch": 0.5747038447553251, + "grad_norm": 1.5390625, + "learning_rate": 8.361904773508798e-05, + "loss": 7.0011, + "step": 10010 + }, + { + "epoch": 0.5752779744703653, + "grad_norm": 1.546875, + "learning_rate": 8.358462123529829e-05, + "loss": 6.9823, + "step": 10020 + }, + { + "epoch": 0.5758521041854057, + "grad_norm": 1.6484375, + "learning_rate": 8.355016570095204e-05, + "loss": 7.0104, + "step": 10030 + }, + { + "epoch": 0.5764262339004459, + "grad_norm": 1.5234375, + "learning_rate": 8.351568116183667e-05, + "loss": 7.0075, + "step": 10040 + }, + { + "epoch": 0.5770003636154862, + "grad_norm": 1.59375, + "learning_rate": 8.348116764776475e-05, + "loss": 6.9775, + "step": 10050 + }, + { + "epoch": 0.5775744933305265, + "grad_norm": 1.578125, + "learning_rate": 8.344662518857388e-05, + "loss": 6.9956, + "step": 10060 + }, + { + "epoch": 0.5781486230455668, + "grad_norm": 1.59375, + "learning_rate": 8.34120538141267e-05, + "loss": 7.0129, + "step": 10070 + }, + { + "epoch": 0.578722752760607, + "grad_norm": 1.7265625, + "learning_rate": 8.337745355431083e-05, + "loss": 6.985, + "step": 10080 + }, + { + "epoch": 0.5792968824756474, + "grad_norm": 1.65625, + "learning_rate": 8.334282443903886e-05, + "loss": 7.025, + "step": 10090 + }, + { + "epoch": 0.5798710121906876, + "grad_norm": 1.5859375, + "learning_rate": 8.330816649824833e-05, + "loss": 6.9988, + "step": 10100 + }, + { + "epoch": 0.5804451419057279, + "grad_norm": 1.4921875, + "learning_rate": 8.32734797619017e-05, + "loss": 6.9752, + "step": 10110 + }, + { + "epoch": 0.5810192716207682, + "grad_norm": 1.609375, + "learning_rate": 8.323876425998633e-05, + "loss": 6.9748, + "step": 10120 + }, + { + "epoch": 0.5815934013358085, + "grad_norm": 1.6875, + "learning_rate": 8.320402002251446e-05, + "loss": 7.0053, + "step": 10130 + }, + { + "epoch": 0.5821675310508487, + "grad_norm": 1.5859375, + "learning_rate": 8.316924707952312e-05, + "loss": 6.9957, + "step": 10140 + }, + { + "epoch": 0.5827416607658891, + "grad_norm": 1.7109375, + "learning_rate": 8.313444546107423e-05, + "loss": 6.9575, + "step": 10150 + }, + { + "epoch": 0.5833157904809293, + "grad_norm": 1.6171875, + "learning_rate": 8.309961519725444e-05, + "loss": 6.9751, + "step": 10160 + }, + { + "epoch": 0.5838899201959696, + "grad_norm": 1.6484375, + "learning_rate": 8.30647563181752e-05, + "loss": 6.9962, + "step": 10170 + }, + { + "epoch": 0.5844640499110099, + "grad_norm": 1.578125, + "learning_rate": 8.30298688539727e-05, + "loss": 6.9946, + "step": 10180 + }, + { + "epoch": 0.5850381796260502, + "grad_norm": 1.5703125, + "learning_rate": 8.29949528348078e-05, + "loss": 6.9892, + "step": 10190 + }, + { + "epoch": 0.5856123093410904, + "grad_norm": 1.59375, + "learning_rate": 8.296000829086611e-05, + "loss": 6.9865, + "step": 10200 + }, + { + "epoch": 0.5861864390561308, + "grad_norm": 1.6796875, + "learning_rate": 8.292503525235785e-05, + "loss": 6.9942, + "step": 10210 + }, + { + "epoch": 0.586760568771171, + "grad_norm": 1.7734375, + "learning_rate": 8.289003374951786e-05, + "loss": 6.988, + "step": 10220 + }, + { + "epoch": 0.5873346984862113, + "grad_norm": 1.65625, + "learning_rate": 8.285500381260567e-05, + "loss": 6.9678, + "step": 10230 + }, + { + "epoch": 0.5879088282012516, + "grad_norm": 1.5625, + "learning_rate": 8.28199454719053e-05, + "loss": 6.9993, + "step": 10240 + }, + { + "epoch": 0.5884829579162919, + "grad_norm": 1.578125, + "learning_rate": 8.27848587577254e-05, + "loss": 6.9852, + "step": 10250 + }, + { + "epoch": 0.5890570876313321, + "grad_norm": 1.5859375, + "learning_rate": 8.274974370039909e-05, + "loss": 6.9722, + "step": 10260 + }, + { + "epoch": 0.5896312173463725, + "grad_norm": 1.546875, + "learning_rate": 8.271460033028401e-05, + "loss": 7.0244, + "step": 10270 + }, + { + "epoch": 0.5902053470614127, + "grad_norm": 1.5703125, + "learning_rate": 8.267942867776233e-05, + "loss": 7.0165, + "step": 10280 + }, + { + "epoch": 0.590779476776453, + "grad_norm": 1.65625, + "learning_rate": 8.264422877324059e-05, + "loss": 6.9987, + "step": 10290 + }, + { + "epoch": 0.5913536064914933, + "grad_norm": 1.625, + "learning_rate": 8.260900064714978e-05, + "loss": 6.9803, + "step": 10300 + }, + { + "epoch": 0.5919277362065336, + "grad_norm": 1.5546875, + "learning_rate": 8.257374432994532e-05, + "loss": 6.9853, + "step": 10310 + }, + { + "epoch": 0.5925018659215738, + "grad_norm": 1.6015625, + "learning_rate": 8.253845985210697e-05, + "loss": 6.9921, + "step": 10320 + }, + { + "epoch": 0.5930759956366142, + "grad_norm": 1.59375, + "learning_rate": 8.250314724413888e-05, + "loss": 7.0052, + "step": 10330 + }, + { + "epoch": 0.5936501253516544, + "grad_norm": 1.5859375, + "learning_rate": 8.246780653656942e-05, + "loss": 6.9785, + "step": 10340 + }, + { + "epoch": 0.5942242550666947, + "grad_norm": 1.609375, + "learning_rate": 8.243243775995138e-05, + "loss": 7.0001, + "step": 10350 + }, + { + "epoch": 0.594798384781735, + "grad_norm": 1.640625, + "learning_rate": 8.239704094486171e-05, + "loss": 6.9794, + "step": 10360 + }, + { + "epoch": 0.5953725144967753, + "grad_norm": 1.5234375, + "learning_rate": 8.236161612190167e-05, + "loss": 6.9948, + "step": 10370 + }, + { + "epoch": 0.5959466442118155, + "grad_norm": 1.578125, + "learning_rate": 8.232616332169669e-05, + "loss": 6.9464, + "step": 10380 + }, + { + "epoch": 0.5965207739268559, + "grad_norm": 1.6796875, + "learning_rate": 8.229068257489643e-05, + "loss": 6.9786, + "step": 10390 + }, + { + "epoch": 0.5970949036418962, + "grad_norm": 1.5546875, + "learning_rate": 8.225517391217464e-05, + "loss": 7.0033, + "step": 10400 + }, + { + "epoch": 0.5976690333569364, + "grad_norm": 1.703125, + "learning_rate": 8.221963736422929e-05, + "loss": 6.974, + "step": 10410 + }, + { + "epoch": 0.5982431630719768, + "grad_norm": 1.625, + "learning_rate": 8.218407296178238e-05, + "loss": 6.9626, + "step": 10420 + }, + { + "epoch": 0.598817292787017, + "grad_norm": 1.7734375, + "learning_rate": 8.214848073558006e-05, + "loss": 6.9609, + "step": 10430 + }, + { + "epoch": 0.5993914225020573, + "grad_norm": 1.5625, + "learning_rate": 8.211286071639246e-05, + "loss": 6.9778, + "step": 10440 + }, + { + "epoch": 0.5999655522170976, + "grad_norm": 1.6875, + "learning_rate": 8.207721293501383e-05, + "loss": 6.9841, + "step": 10450 + }, + { + "epoch": 0.6005396819321379, + "grad_norm": 1.5546875, + "learning_rate": 8.20415374222623e-05, + "loss": 6.9789, + "step": 10460 + }, + { + "epoch": 0.6011138116471781, + "grad_norm": 1.6171875, + "learning_rate": 8.200583420898012e-05, + "loss": 6.9939, + "step": 10470 + }, + { + "epoch": 0.6016879413622185, + "grad_norm": 1.6484375, + "learning_rate": 8.197010332603336e-05, + "loss": 6.9697, + "step": 10480 + }, + { + "epoch": 0.6022620710772587, + "grad_norm": 1.5546875, + "learning_rate": 8.193434480431206e-05, + "loss": 6.975, + "step": 10490 + }, + { + "epoch": 0.602836200792299, + "grad_norm": 1.5390625, + "learning_rate": 8.189855867473018e-05, + "loss": 7.0106, + "step": 10500 + }, + { + "epoch": 0.6034103305073393, + "grad_norm": 1.609375, + "learning_rate": 8.186274496822552e-05, + "loss": 6.9875, + "step": 10510 + }, + { + "epoch": 0.6039844602223796, + "grad_norm": 1.5859375, + "learning_rate": 8.182690371575971e-05, + "loss": 6.9691, + "step": 10520 + }, + { + "epoch": 0.6045585899374198, + "grad_norm": 1.578125, + "learning_rate": 8.179103494831821e-05, + "loss": 6.9972, + "step": 10530 + }, + { + "epoch": 0.6051327196524602, + "grad_norm": 1.65625, + "learning_rate": 8.175513869691027e-05, + "loss": 6.9925, + "step": 10540 + }, + { + "epoch": 0.6057068493675004, + "grad_norm": 1.609375, + "learning_rate": 8.171921499256891e-05, + "loss": 6.9612, + "step": 10550 + }, + { + "epoch": 0.6062809790825407, + "grad_norm": 1.578125, + "learning_rate": 8.168326386635083e-05, + "loss": 6.9736, + "step": 10560 + }, + { + "epoch": 0.606855108797581, + "grad_norm": 1.6484375, + "learning_rate": 8.164728534933653e-05, + "loss": 6.9866, + "step": 10570 + }, + { + "epoch": 0.6074292385126213, + "grad_norm": 1.7421875, + "learning_rate": 8.161127947263007e-05, + "loss": 7.0039, + "step": 10580 + }, + { + "epoch": 0.6080033682276615, + "grad_norm": 1.5703125, + "learning_rate": 8.15752462673593e-05, + "loss": 7.0022, + "step": 10590 + }, + { + "epoch": 0.6085774979427019, + "grad_norm": 1.6171875, + "learning_rate": 8.153918576467558e-05, + "loss": 7.0146, + "step": 10600 + }, + { + "epoch": 0.6091516276577421, + "grad_norm": 1.59375, + "learning_rate": 8.150309799575394e-05, + "loss": 6.998, + "step": 10610 + }, + { + "epoch": 0.6097257573727825, + "grad_norm": 1.5703125, + "learning_rate": 8.146698299179291e-05, + "loss": 6.9919, + "step": 10620 + }, + { + "epoch": 0.6102998870878227, + "grad_norm": 1.671875, + "learning_rate": 8.143084078401467e-05, + "loss": 6.9656, + "step": 10630 + }, + { + "epoch": 0.610874016802863, + "grad_norm": 1.59375, + "learning_rate": 8.139467140366483e-05, + "loss": 6.9782, + "step": 10640 + }, + { + "epoch": 0.6114481465179032, + "grad_norm": 1.6328125, + "learning_rate": 8.135847488201251e-05, + "loss": 6.9906, + "step": 10650 + }, + { + "epoch": 0.6120222762329436, + "grad_norm": 1.5, + "learning_rate": 8.132225125035032e-05, + "loss": 6.9975, + "step": 10660 + }, + { + "epoch": 0.6125964059479838, + "grad_norm": 1.59375, + "learning_rate": 8.128600053999431e-05, + "loss": 6.9186, + "step": 10670 + }, + { + "epoch": 0.6131705356630242, + "grad_norm": 1.5859375, + "learning_rate": 8.124972278228389e-05, + "loss": 6.9996, + "step": 10680 + }, + { + "epoch": 0.6137446653780644, + "grad_norm": 1.6953125, + "learning_rate": 8.121341800858189e-05, + "loss": 6.9948, + "step": 10690 + }, + { + "epoch": 0.6143187950931047, + "grad_norm": 1.625, + "learning_rate": 8.117708625027451e-05, + "loss": 6.972, + "step": 10700 + }, + { + "epoch": 0.614892924808145, + "grad_norm": 1.53125, + "learning_rate": 8.114072753877125e-05, + "loss": 7.0159, + "step": 10710 + }, + { + "epoch": 0.6154670545231853, + "grad_norm": 1.609375, + "learning_rate": 8.110434190550493e-05, + "loss": 6.9762, + "step": 10720 + }, + { + "epoch": 0.6160411842382255, + "grad_norm": 1.59375, + "learning_rate": 8.106792938193162e-05, + "loss": 6.9775, + "step": 10730 + }, + { + "epoch": 0.6166153139532659, + "grad_norm": 1.59375, + "learning_rate": 8.103148999953065e-05, + "loss": 6.9968, + "step": 10740 + }, + { + "epoch": 0.6171894436683061, + "grad_norm": 1.625, + "learning_rate": 8.099502378980459e-05, + "loss": 6.9856, + "step": 10750 + }, + { + "epoch": 0.6177635733833464, + "grad_norm": 1.5546875, + "learning_rate": 8.095853078427918e-05, + "loss": 7.0284, + "step": 10760 + }, + { + "epoch": 0.6183377030983866, + "grad_norm": 1.6015625, + "learning_rate": 8.092201101450332e-05, + "loss": 6.9898, + "step": 10770 + }, + { + "epoch": 0.618911832813427, + "grad_norm": 1.609375, + "learning_rate": 8.088546451204909e-05, + "loss": 6.9919, + "step": 10780 + }, + { + "epoch": 0.6194859625284672, + "grad_norm": 1.578125, + "learning_rate": 8.084889130851163e-05, + "loss": 7.0162, + "step": 10790 + }, + { + "epoch": 0.6200600922435076, + "grad_norm": 1.6171875, + "learning_rate": 8.081229143550917e-05, + "loss": 6.9979, + "step": 10800 + }, + { + "epoch": 0.6206342219585478, + "grad_norm": 1.640625, + "learning_rate": 8.077566492468302e-05, + "loss": 6.9826, + "step": 10810 + }, + { + "epoch": 0.6212083516735881, + "grad_norm": 1.515625, + "learning_rate": 8.073901180769752e-05, + "loss": 6.9901, + "step": 10820 + }, + { + "epoch": 0.6217824813886283, + "grad_norm": 1.640625, + "learning_rate": 8.070233211623999e-05, + "loss": 6.9824, + "step": 10830 + }, + { + "epoch": 0.6223566111036687, + "grad_norm": 1.65625, + "learning_rate": 8.066562588202073e-05, + "loss": 6.9176, + "step": 10840 + }, + { + "epoch": 0.622930740818709, + "grad_norm": 1.6015625, + "learning_rate": 8.062889313677302e-05, + "loss": 6.9748, + "step": 10850 + }, + { + "epoch": 0.6235048705337493, + "grad_norm": 1.609375, + "learning_rate": 8.059213391225301e-05, + "loss": 7.001, + "step": 10860 + }, + { + "epoch": 0.6240790002487896, + "grad_norm": 1.734375, + "learning_rate": 8.055534824023976e-05, + "loss": 6.9788, + "step": 10870 + }, + { + "epoch": 0.6246531299638298, + "grad_norm": 1.578125, + "learning_rate": 8.05185361525352e-05, + "loss": 6.9838, + "step": 10880 + }, + { + "epoch": 0.6252272596788702, + "grad_norm": 1.53125, + "learning_rate": 8.04816976809641e-05, + "loss": 6.9763, + "step": 10890 + }, + { + "epoch": 0.6258013893939104, + "grad_norm": 1.6015625, + "learning_rate": 8.044483285737401e-05, + "loss": 6.9989, + "step": 10900 + }, + { + "epoch": 0.6263755191089507, + "grad_norm": 1.6328125, + "learning_rate": 8.040794171363531e-05, + "loss": 7.0087, + "step": 10910 + }, + { + "epoch": 0.626949648823991, + "grad_norm": 1.578125, + "learning_rate": 8.037102428164112e-05, + "loss": 6.9784, + "step": 10920 + }, + { + "epoch": 0.6275237785390313, + "grad_norm": 1.6796875, + "learning_rate": 8.033408059330725e-05, + "loss": 6.9524, + "step": 10930 + }, + { + "epoch": 0.6280979082540715, + "grad_norm": 1.59375, + "learning_rate": 8.029711068057224e-05, + "loss": 6.9713, + "step": 10940 + }, + { + "epoch": 0.6286720379691119, + "grad_norm": 1.6171875, + "learning_rate": 8.02601145753973e-05, + "loss": 6.9852, + "step": 10950 + }, + { + "epoch": 0.6292461676841521, + "grad_norm": 1.625, + "learning_rate": 8.022309230976628e-05, + "loss": 7.0048, + "step": 10960 + }, + { + "epoch": 0.6298202973991924, + "grad_norm": 1.5234375, + "learning_rate": 8.018604391568564e-05, + "loss": 6.982, + "step": 10970 + }, + { + "epoch": 0.6303944271142327, + "grad_norm": 1.65625, + "learning_rate": 8.014896942518446e-05, + "loss": 6.9608, + "step": 10980 + }, + { + "epoch": 0.630968556829273, + "grad_norm": 1.6015625, + "learning_rate": 8.011186887031434e-05, + "loss": 6.9604, + "step": 10990 + }, + { + "epoch": 0.6315426865443132, + "grad_norm": 1.578125, + "learning_rate": 8.007474228314942e-05, + "loss": 6.9843, + "step": 11000 + }, + { + "epoch": 0.6321168162593536, + "grad_norm": 1.6875, + "learning_rate": 8.003758969578636e-05, + "loss": 6.988, + "step": 11010 + }, + { + "epoch": 0.6326909459743938, + "grad_norm": 1.625, + "learning_rate": 8.000041114034431e-05, + "loss": 6.9857, + "step": 11020 + }, + { + "epoch": 0.6332650756894341, + "grad_norm": 1.5703125, + "learning_rate": 7.996320664896483e-05, + "loss": 6.9709, + "step": 11030 + }, + { + "epoch": 0.6338392054044744, + "grad_norm": 1.65625, + "learning_rate": 7.992597625381195e-05, + "loss": 6.9667, + "step": 11040 + }, + { + "epoch": 0.6344133351195147, + "grad_norm": 1.6640625, + "learning_rate": 7.988871998707204e-05, + "loss": 6.9538, + "step": 11050 + }, + { + "epoch": 0.6349874648345549, + "grad_norm": 1.65625, + "learning_rate": 7.985143788095389e-05, + "loss": 6.9634, + "step": 11060 + }, + { + "epoch": 0.6355615945495953, + "grad_norm": 1.5546875, + "learning_rate": 7.981412996768858e-05, + "loss": 6.9727, + "step": 11070 + }, + { + "epoch": 0.6361357242646355, + "grad_norm": 1.5625, + "learning_rate": 7.977679627952953e-05, + "loss": 7.0057, + "step": 11080 + }, + { + "epoch": 0.6367098539796758, + "grad_norm": 1.5390625, + "learning_rate": 7.973943684875245e-05, + "loss": 6.9734, + "step": 11090 + }, + { + "epoch": 0.6372839836947161, + "grad_norm": 1.640625, + "learning_rate": 7.970205170765528e-05, + "loss": 6.9587, + "step": 11100 + }, + { + "epoch": 0.6378581134097564, + "grad_norm": 1.7265625, + "learning_rate": 7.966464088855822e-05, + "loss": 6.9155, + "step": 11110 + }, + { + "epoch": 0.6384322431247966, + "grad_norm": 1.59375, + "learning_rate": 7.962720442380364e-05, + "loss": 6.961, + "step": 11120 + }, + { + "epoch": 0.639006372839837, + "grad_norm": 1.6171875, + "learning_rate": 7.958974234575607e-05, + "loss": 6.9626, + "step": 11130 + }, + { + "epoch": 0.6395805025548772, + "grad_norm": 1.734375, + "learning_rate": 7.955225468680223e-05, + "loss": 6.9431, + "step": 11140 + }, + { + "epoch": 0.6401546322699175, + "grad_norm": 1.6328125, + "learning_rate": 7.951474147935091e-05, + "loss": 6.9833, + "step": 11150 + }, + { + "epoch": 0.6407287619849578, + "grad_norm": 1.59375, + "learning_rate": 7.947720275583301e-05, + "loss": 6.9595, + "step": 11160 + }, + { + "epoch": 0.6413028916999981, + "grad_norm": 1.5625, + "learning_rate": 7.943963854870149e-05, + "loss": 6.9866, + "step": 11170 + }, + { + "epoch": 0.6418770214150383, + "grad_norm": 1.6015625, + "learning_rate": 7.940204889043135e-05, + "loss": 6.9559, + "step": 11180 + }, + { + "epoch": 0.6424511511300787, + "grad_norm": 1.578125, + "learning_rate": 7.936443381351954e-05, + "loss": 6.9778, + "step": 11190 + }, + { + "epoch": 0.6430252808451189, + "grad_norm": 1.7109375, + "learning_rate": 7.932679335048506e-05, + "loss": 6.9584, + "step": 11200 + }, + { + "epoch": 0.6435994105601592, + "grad_norm": 1.5859375, + "learning_rate": 7.92891275338688e-05, + "loss": 6.9656, + "step": 11210 + }, + { + "epoch": 0.6441735402751995, + "grad_norm": 1.6328125, + "learning_rate": 7.92514363962336e-05, + "loss": 6.9864, + "step": 11220 + }, + { + "epoch": 0.6447476699902398, + "grad_norm": 1.7890625, + "learning_rate": 7.921371997016416e-05, + "loss": 6.9844, + "step": 11230 + }, + { + "epoch": 0.64532179970528, + "grad_norm": 1.6875, + "learning_rate": 7.91759782882671e-05, + "loss": 6.9804, + "step": 11240 + }, + { + "epoch": 0.6458959294203204, + "grad_norm": 1.6953125, + "learning_rate": 7.913821138317079e-05, + "loss": 6.9777, + "step": 11250 + }, + { + "epoch": 0.6464700591353606, + "grad_norm": 1.578125, + "learning_rate": 7.91004192875255e-05, + "loss": 6.974, + "step": 11260 + }, + { + "epoch": 0.6470441888504009, + "grad_norm": 1.7421875, + "learning_rate": 7.906260203400319e-05, + "loss": 6.9603, + "step": 11270 + }, + { + "epoch": 0.6476183185654412, + "grad_norm": 1.5859375, + "learning_rate": 7.902475965529763e-05, + "loss": 6.9737, + "step": 11280 + }, + { + "epoch": 0.6481924482804815, + "grad_norm": 1.578125, + "learning_rate": 7.898689218412427e-05, + "loss": 6.9711, + "step": 11290 + }, + { + "epoch": 0.6487665779955218, + "grad_norm": 1.703125, + "learning_rate": 7.894899965322031e-05, + "loss": 6.956, + "step": 11300 + }, + { + "epoch": 0.6493407077105621, + "grad_norm": 1.6015625, + "learning_rate": 7.891108209534455e-05, + "loss": 6.9562, + "step": 11310 + }, + { + "epoch": 0.6499148374256024, + "grad_norm": 1.796875, + "learning_rate": 7.887313954327745e-05, + "loss": 7.0049, + "step": 11320 + }, + { + "epoch": 0.6504889671406426, + "grad_norm": 1.6484375, + "learning_rate": 7.88351720298211e-05, + "loss": 7.0021, + "step": 11330 + }, + { + "epoch": 0.651063096855683, + "grad_norm": 1.6484375, + "learning_rate": 7.879717958779915e-05, + "loss": 6.9747, + "step": 11340 + }, + { + "epoch": 0.6516372265707232, + "grad_norm": 1.6171875, + "learning_rate": 7.87591622500568e-05, + "loss": 6.9826, + "step": 11350 + }, + { + "epoch": 0.6522113562857635, + "grad_norm": 1.640625, + "learning_rate": 7.872112004946075e-05, + "loss": 6.9759, + "step": 11360 + }, + { + "epoch": 0.6527854860008038, + "grad_norm": 1.625, + "learning_rate": 7.868305301889927e-05, + "loss": 6.9862, + "step": 11370 + }, + { + "epoch": 0.6533596157158441, + "grad_norm": 1.5, + "learning_rate": 7.864496119128202e-05, + "loss": 6.9906, + "step": 11380 + }, + { + "epoch": 0.6539337454308843, + "grad_norm": 1.7109375, + "learning_rate": 7.860684459954011e-05, + "loss": 6.978, + "step": 11390 + }, + { + "epoch": 0.6545078751459247, + "grad_norm": 1.65625, + "learning_rate": 7.856870327662611e-05, + "loss": 6.9735, + "step": 11400 + }, + { + "epoch": 0.6550820048609649, + "grad_norm": 1.546875, + "learning_rate": 7.853053725551389e-05, + "loss": 6.9732, + "step": 11410 + }, + { + "epoch": 0.6556561345760052, + "grad_norm": 1.5234375, + "learning_rate": 7.849234656919875e-05, + "loss": 7.0027, + "step": 11420 + }, + { + "epoch": 0.6562302642910455, + "grad_norm": 1.6328125, + "learning_rate": 7.845413125069727e-05, + "loss": 6.9497, + "step": 11430 + }, + { + "epoch": 0.6568043940060858, + "grad_norm": 1.5859375, + "learning_rate": 7.841589133304732e-05, + "loss": 6.9555, + "step": 11440 + }, + { + "epoch": 0.657378523721126, + "grad_norm": 1.4921875, + "learning_rate": 7.837762684930806e-05, + "loss": 6.9795, + "step": 11450 + }, + { + "epoch": 0.6579526534361664, + "grad_norm": 1.6640625, + "learning_rate": 7.833933783255988e-05, + "loss": 6.9998, + "step": 11460 + }, + { + "epoch": 0.6585267831512066, + "grad_norm": 1.578125, + "learning_rate": 7.83010243159044e-05, + "loss": 7.0157, + "step": 11470 + }, + { + "epoch": 0.659100912866247, + "grad_norm": 1.6015625, + "learning_rate": 7.826268633246435e-05, + "loss": 6.9558, + "step": 11480 + }, + { + "epoch": 0.6596750425812872, + "grad_norm": 1.578125, + "learning_rate": 7.822432391538371e-05, + "loss": 6.9717, + "step": 11490 + }, + { + "epoch": 0.6602491722963275, + "grad_norm": 1.6640625, + "learning_rate": 7.818593709782749e-05, + "loss": 6.9613, + "step": 11500 + }, + { + "epoch": 0.6608233020113677, + "grad_norm": 1.6875, + "learning_rate": 7.814752591298186e-05, + "loss": 6.9635, + "step": 11510 + }, + { + "epoch": 0.6613974317264081, + "grad_norm": 1.5234375, + "learning_rate": 7.810909039405402e-05, + "loss": 7.0205, + "step": 11520 + }, + { + "epoch": 0.6619715614414483, + "grad_norm": 1.671875, + "learning_rate": 7.807063057427226e-05, + "loss": 6.9883, + "step": 11530 + }, + { + "epoch": 0.6625456911564886, + "grad_norm": 1.609375, + "learning_rate": 7.803214648688581e-05, + "loss": 6.9535, + "step": 11540 + }, + { + "epoch": 0.6631198208715289, + "grad_norm": 1.5859375, + "learning_rate": 7.799363816516491e-05, + "loss": 6.944, + "step": 11550 + }, + { + "epoch": 0.6636939505865692, + "grad_norm": 1.5625, + "learning_rate": 7.795510564240076e-05, + "loss": 6.9636, + "step": 11560 + }, + { + "epoch": 0.6642680803016094, + "grad_norm": 1.5546875, + "learning_rate": 7.791654895190548e-05, + "loss": 7.007, + "step": 11570 + }, + { + "epoch": 0.6648422100166498, + "grad_norm": 1.5546875, + "learning_rate": 7.787796812701204e-05, + "loss": 6.9788, + "step": 11580 + }, + { + "epoch": 0.66541633973169, + "grad_norm": 1.59375, + "learning_rate": 7.783936320107437e-05, + "loss": 6.9309, + "step": 11590 + }, + { + "epoch": 0.6659904694467303, + "grad_norm": 1.625, + "learning_rate": 7.780073420746712e-05, + "loss": 6.9335, + "step": 11600 + }, + { + "epoch": 0.6665645991617706, + "grad_norm": 1.6171875, + "learning_rate": 7.776208117958585e-05, + "loss": 6.9752, + "step": 11610 + }, + { + "epoch": 0.6671387288768109, + "grad_norm": 1.5625, + "learning_rate": 7.772340415084681e-05, + "loss": 6.9545, + "step": 11620 + }, + { + "epoch": 0.6677128585918511, + "grad_norm": 1.8125, + "learning_rate": 7.768470315468707e-05, + "loss": 6.9765, + "step": 11630 + }, + { + "epoch": 0.6682869883068915, + "grad_norm": 1.5859375, + "learning_rate": 7.76459782245644e-05, + "loss": 6.9908, + "step": 11640 + }, + { + "epoch": 0.6688611180219317, + "grad_norm": 1.5703125, + "learning_rate": 7.760722939395724e-05, + "loss": 6.9714, + "step": 11650 + }, + { + "epoch": 0.669435247736972, + "grad_norm": 1.59375, + "learning_rate": 7.756845669636469e-05, + "loss": 6.9427, + "step": 11660 + }, + { + "epoch": 0.6700093774520123, + "grad_norm": 1.671875, + "learning_rate": 7.752966016530652e-05, + "loss": 6.9655, + "step": 11670 + }, + { + "epoch": 0.6705835071670526, + "grad_norm": 1.7265625, + "learning_rate": 7.749083983432308e-05, + "loss": 7.0029, + "step": 11680 + }, + { + "epoch": 0.6711576368820928, + "grad_norm": 1.6875, + "learning_rate": 7.74519957369753e-05, + "loss": 6.9661, + "step": 11690 + }, + { + "epoch": 0.6717317665971332, + "grad_norm": 1.5703125, + "learning_rate": 7.741312790684465e-05, + "loss": 6.9679, + "step": 11700 + }, + { + "epoch": 0.6723058963121734, + "grad_norm": 1.6328125, + "learning_rate": 7.737423637753313e-05, + "loss": 6.9563, + "step": 11710 + }, + { + "epoch": 0.6728800260272138, + "grad_norm": 1.59375, + "learning_rate": 7.73353211826632e-05, + "loss": 6.9657, + "step": 11720 + }, + { + "epoch": 0.673454155742254, + "grad_norm": 1.6640625, + "learning_rate": 7.729638235587783e-05, + "loss": 6.9761, + "step": 11730 + }, + { + "epoch": 0.6740282854572943, + "grad_norm": 1.6484375, + "learning_rate": 7.72574199308404e-05, + "loss": 6.9707, + "step": 11740 + }, + { + "epoch": 0.6746024151723347, + "grad_norm": 1.546875, + "learning_rate": 7.721843394123465e-05, + "loss": 6.9245, + "step": 11750 + }, + { + "epoch": 0.6751765448873749, + "grad_norm": 1.6484375, + "learning_rate": 7.717942442076473e-05, + "loss": 6.9743, + "step": 11760 + }, + { + "epoch": 0.6757506746024152, + "grad_norm": 1.59375, + "learning_rate": 7.714039140315514e-05, + "loss": 6.9635, + "step": 11770 + }, + { + "epoch": 0.6763248043174555, + "grad_norm": 1.609375, + "learning_rate": 7.710133492215066e-05, + "loss": 6.9753, + "step": 11780 + }, + { + "epoch": 0.6768989340324958, + "grad_norm": 1.5703125, + "learning_rate": 7.706225501151641e-05, + "loss": 6.9818, + "step": 11790 + }, + { + "epoch": 0.677473063747536, + "grad_norm": 1.5078125, + "learning_rate": 7.702315170503769e-05, + "loss": 6.9832, + "step": 11800 + }, + { + "epoch": 0.6780471934625764, + "grad_norm": 1.6328125, + "learning_rate": 7.69840250365201e-05, + "loss": 7.0003, + "step": 11810 + }, + { + "epoch": 0.6786213231776166, + "grad_norm": 1.6328125, + "learning_rate": 7.69448750397894e-05, + "loss": 6.9929, + "step": 11820 + }, + { + "epoch": 0.6791954528926569, + "grad_norm": 1.515625, + "learning_rate": 7.690570174869149e-05, + "loss": 6.97, + "step": 11830 + }, + { + "epoch": 0.6797695826076972, + "grad_norm": 1.515625, + "learning_rate": 7.686650519709249e-05, + "loss": 6.9556, + "step": 11840 + }, + { + "epoch": 0.6803437123227375, + "grad_norm": 1.6015625, + "learning_rate": 7.682728541887854e-05, + "loss": 6.9593, + "step": 11850 + }, + { + "epoch": 0.6809178420377777, + "grad_norm": 1.59375, + "learning_rate": 7.678804244795593e-05, + "loss": 6.9603, + "step": 11860 + }, + { + "epoch": 0.6814919717528181, + "grad_norm": 1.5859375, + "learning_rate": 7.674877631825093e-05, + "loss": 6.9669, + "step": 11870 + }, + { + "epoch": 0.6820661014678583, + "grad_norm": 1.6015625, + "learning_rate": 7.670948706370988e-05, + "loss": 6.9686, + "step": 11880 + }, + { + "epoch": 0.6826402311828986, + "grad_norm": 1.6328125, + "learning_rate": 7.667017471829914e-05, + "loss": 6.9484, + "step": 11890 + }, + { + "epoch": 0.6832143608979389, + "grad_norm": 1.59375, + "learning_rate": 7.663083931600497e-05, + "loss": 6.944, + "step": 11900 + }, + { + "epoch": 0.6837884906129792, + "grad_norm": 1.6875, + "learning_rate": 7.659148089083357e-05, + "loss": 6.9765, + "step": 11910 + }, + { + "epoch": 0.6843626203280194, + "grad_norm": 1.5625, + "learning_rate": 7.65520994768111e-05, + "loss": 7.0007, + "step": 11920 + }, + { + "epoch": 0.6849367500430598, + "grad_norm": 1.5703125, + "learning_rate": 7.651269510798353e-05, + "loss": 6.9465, + "step": 11930 + }, + { + "epoch": 0.6855108797581, + "grad_norm": 1.7421875, + "learning_rate": 7.64732678184167e-05, + "loss": 6.9472, + "step": 11940 + }, + { + "epoch": 0.6860850094731403, + "grad_norm": 1.6640625, + "learning_rate": 7.64338176421963e-05, + "loss": 6.9246, + "step": 11950 + }, + { + "epoch": 0.6866591391881806, + "grad_norm": 1.484375, + "learning_rate": 7.639434461342773e-05, + "loss": 6.9619, + "step": 11960 + }, + { + "epoch": 0.6872332689032209, + "grad_norm": 1.546875, + "learning_rate": 7.63548487662362e-05, + "loss": 6.9673, + "step": 11970 + }, + { + "epoch": 0.6878073986182611, + "grad_norm": 1.6953125, + "learning_rate": 7.631533013476665e-05, + "loss": 6.9765, + "step": 11980 + }, + { + "epoch": 0.6883815283333015, + "grad_norm": 1.6171875, + "learning_rate": 7.627578875318372e-05, + "loss": 6.9593, + "step": 11990 + }, + { + "epoch": 0.6889556580483417, + "grad_norm": 1.5546875, + "learning_rate": 7.623622465567166e-05, + "loss": 6.9462, + "step": 12000 + }, + { + "epoch": 0.689529787763382, + "grad_norm": 1.53125, + "learning_rate": 7.619663787643441e-05, + "loss": 6.9357, + "step": 12010 + }, + { + "epoch": 0.6901039174784223, + "grad_norm": 1.6015625, + "learning_rate": 7.615702844969553e-05, + "loss": 6.9577, + "step": 12020 + }, + { + "epoch": 0.6906780471934626, + "grad_norm": 1.6015625, + "learning_rate": 7.611739640969813e-05, + "loss": 6.9309, + "step": 12030 + }, + { + "epoch": 0.6912521769085028, + "grad_norm": 1.65625, + "learning_rate": 7.607774179070485e-05, + "loss": 6.9118, + "step": 12040 + }, + { + "epoch": 0.6918263066235432, + "grad_norm": 1.6484375, + "learning_rate": 7.603806462699792e-05, + "loss": 6.976, + "step": 12050 + }, + { + "epoch": 0.6924004363385834, + "grad_norm": 1.9375, + "learning_rate": 7.599836495287898e-05, + "loss": 6.9466, + "step": 12060 + }, + { + "epoch": 0.6929745660536237, + "grad_norm": 1.5546875, + "learning_rate": 7.59586428026692e-05, + "loss": 6.9853, + "step": 12070 + }, + { + "epoch": 0.693548695768664, + "grad_norm": 1.6796875, + "learning_rate": 7.591889821070913e-05, + "loss": 6.9509, + "step": 12080 + }, + { + "epoch": 0.6941228254837043, + "grad_norm": 1.5625, + "learning_rate": 7.587913121135875e-05, + "loss": 6.9566, + "step": 12090 + }, + { + "epoch": 0.6946969551987445, + "grad_norm": 1.515625, + "learning_rate": 7.583934183899738e-05, + "loss": 6.9765, + "step": 12100 + }, + { + "epoch": 0.6952710849137849, + "grad_norm": 1.515625, + "learning_rate": 7.579953012802374e-05, + "loss": 6.9199, + "step": 12110 + }, + { + "epoch": 0.6958452146288251, + "grad_norm": 1.6484375, + "learning_rate": 7.57596961128558e-05, + "loss": 6.9372, + "step": 12120 + }, + { + "epoch": 0.6964193443438654, + "grad_norm": 1.6484375, + "learning_rate": 7.571983982793086e-05, + "loss": 6.9888, + "step": 12130 + }, + { + "epoch": 0.6969934740589057, + "grad_norm": 1.75, + "learning_rate": 7.567996130770543e-05, + "loss": 6.9627, + "step": 12140 + }, + { + "epoch": 0.697567603773946, + "grad_norm": 1.59375, + "learning_rate": 7.564006058665525e-05, + "loss": 6.9265, + "step": 12150 + }, + { + "epoch": 0.6981417334889862, + "grad_norm": 1.59375, + "learning_rate": 7.560013769927532e-05, + "loss": 6.9392, + "step": 12160 + }, + { + "epoch": 0.6987158632040266, + "grad_norm": 1.796875, + "learning_rate": 7.556019268007972e-05, + "loss": 6.9563, + "step": 12170 + }, + { + "epoch": 0.6992899929190668, + "grad_norm": 1.625, + "learning_rate": 7.55202255636017e-05, + "loss": 6.9561, + "step": 12180 + }, + { + "epoch": 0.6998641226341071, + "grad_norm": 1.6328125, + "learning_rate": 7.548023638439359e-05, + "loss": 6.9558, + "step": 12190 + }, + { + "epoch": 0.7004382523491475, + "grad_norm": 1.5859375, + "learning_rate": 7.544022517702684e-05, + "loss": 6.9474, + "step": 12200 + }, + { + "epoch": 0.7010123820641877, + "grad_norm": 1.609375, + "learning_rate": 7.54001919760919e-05, + "loss": 6.973, + "step": 12210 + }, + { + "epoch": 0.701586511779228, + "grad_norm": 1.625, + "learning_rate": 7.536013681619822e-05, + "loss": 6.9548, + "step": 12220 + }, + { + "epoch": 0.7021606414942683, + "grad_norm": 1.5546875, + "learning_rate": 7.532005973197431e-05, + "loss": 6.9928, + "step": 12230 + }, + { + "epoch": 0.7027347712093086, + "grad_norm": 1.640625, + "learning_rate": 7.527996075806757e-05, + "loss": 6.9436, + "step": 12240 + }, + { + "epoch": 0.7033089009243488, + "grad_norm": 1.515625, + "learning_rate": 7.523983992914435e-05, + "loss": 6.9541, + "step": 12250 + }, + { + "epoch": 0.7038830306393892, + "grad_norm": 1.5546875, + "learning_rate": 7.519969727988984e-05, + "loss": 6.9461, + "step": 12260 + }, + { + "epoch": 0.7044571603544294, + "grad_norm": 1.578125, + "learning_rate": 7.51595328450082e-05, + "loss": 6.9568, + "step": 12270 + }, + { + "epoch": 0.7050312900694697, + "grad_norm": 1.6484375, + "learning_rate": 7.511934665922232e-05, + "loss": 7.0033, + "step": 12280 + }, + { + "epoch": 0.70560541978451, + "grad_norm": 1.8984375, + "learning_rate": 7.507913875727397e-05, + "loss": 6.9849, + "step": 12290 + }, + { + "epoch": 0.7061795494995503, + "grad_norm": 1.6015625, + "learning_rate": 7.503890917392361e-05, + "loss": 6.9881, + "step": 12300 + }, + { + "epoch": 0.7067536792145905, + "grad_norm": 1.546875, + "learning_rate": 7.499865794395057e-05, + "loss": 6.9328, + "step": 12310 + }, + { + "epoch": 0.7073278089296309, + "grad_norm": 1.5390625, + "learning_rate": 7.495838510215276e-05, + "loss": 6.9288, + "step": 12320 + }, + { + "epoch": 0.7079019386446711, + "grad_norm": 1.65625, + "learning_rate": 7.491809068334685e-05, + "loss": 6.9334, + "step": 12330 + }, + { + "epoch": 0.7084760683597114, + "grad_norm": 1.609375, + "learning_rate": 7.487777472236815e-05, + "loss": 6.9809, + "step": 12340 + }, + { + "epoch": 0.7090501980747517, + "grad_norm": 1.75, + "learning_rate": 7.48374372540706e-05, + "loss": 6.9187, + "step": 12350 + }, + { + "epoch": 0.709624327789792, + "grad_norm": 1.6015625, + "learning_rate": 7.47970783133267e-05, + "loss": 6.9398, + "step": 12360 + }, + { + "epoch": 0.7101984575048322, + "grad_norm": 1.6328125, + "learning_rate": 7.475669793502755e-05, + "loss": 6.9453, + "step": 12370 + }, + { + "epoch": 0.7107725872198726, + "grad_norm": 1.609375, + "learning_rate": 7.471629615408278e-05, + "loss": 6.9427, + "step": 12380 + }, + { + "epoch": 0.7113467169349128, + "grad_norm": 1.5859375, + "learning_rate": 7.467587300542049e-05, + "loss": 6.9699, + "step": 12390 + }, + { + "epoch": 0.7119208466499531, + "grad_norm": 1.6171875, + "learning_rate": 7.463542852398728e-05, + "loss": 6.9523, + "step": 12400 + }, + { + "epoch": 0.7124949763649934, + "grad_norm": 1.5703125, + "learning_rate": 7.459496274474822e-05, + "loss": 6.9642, + "step": 12410 + }, + { + "epoch": 0.7130691060800337, + "grad_norm": 1.609375, + "learning_rate": 7.455447570268673e-05, + "loss": 6.9485, + "step": 12420 + }, + { + "epoch": 0.7136432357950739, + "grad_norm": 1.6640625, + "learning_rate": 7.451396743280465e-05, + "loss": 6.922, + "step": 12430 + }, + { + "epoch": 0.7142173655101143, + "grad_norm": 1.625, + "learning_rate": 7.447343797012218e-05, + "loss": 6.9302, + "step": 12440 + }, + { + "epoch": 0.7147914952251545, + "grad_norm": 1.65625, + "learning_rate": 7.443288734967782e-05, + "loss": 6.9553, + "step": 12450 + }, + { + "epoch": 0.7153656249401948, + "grad_norm": 1.5859375, + "learning_rate": 7.439231560652834e-05, + "loss": 6.9574, + "step": 12460 + }, + { + "epoch": 0.7159397546552351, + "grad_norm": 1.6171875, + "learning_rate": 7.435172277574885e-05, + "loss": 6.9633, + "step": 12470 + }, + { + "epoch": 0.7165138843702754, + "grad_norm": 1.6015625, + "learning_rate": 7.431110889243259e-05, + "loss": 6.9022, + "step": 12480 + }, + { + "epoch": 0.7170880140853156, + "grad_norm": 1.6015625, + "learning_rate": 7.427047399169108e-05, + "loss": 6.9603, + "step": 12490 + }, + { + "epoch": 0.717662143800356, + "grad_norm": 1.59375, + "learning_rate": 7.422981810865397e-05, + "loss": 6.9143, + "step": 12500 + }, + { + "epoch": 0.7182362735153962, + "grad_norm": 1.6328125, + "learning_rate": 7.418914127846906e-05, + "loss": 6.917, + "step": 12510 + }, + { + "epoch": 0.7188104032304365, + "grad_norm": 1.484375, + "learning_rate": 7.414844353630226e-05, + "loss": 6.9381, + "step": 12520 + }, + { + "epoch": 0.7193845329454768, + "grad_norm": 1.5859375, + "learning_rate": 7.410772491733756e-05, + "loss": 6.9498, + "step": 12530 + }, + { + "epoch": 0.7199586626605171, + "grad_norm": 1.4375, + "learning_rate": 7.406698545677698e-05, + "loss": 6.9534, + "step": 12540 + }, + { + "epoch": 0.7205327923755573, + "grad_norm": 1.5390625, + "learning_rate": 7.40262251898406e-05, + "loss": 6.9452, + "step": 12550 + }, + { + "epoch": 0.7211069220905977, + "grad_norm": 1.46875, + "learning_rate": 7.398544415176645e-05, + "loss": 6.9431, + "step": 12560 + }, + { + "epoch": 0.7216810518056379, + "grad_norm": 1.5859375, + "learning_rate": 7.394464237781053e-05, + "loss": 6.9655, + "step": 12570 + }, + { + "epoch": 0.7222551815206782, + "grad_norm": 1.671875, + "learning_rate": 7.390381990324674e-05, + "loss": 6.9516, + "step": 12580 + }, + { + "epoch": 0.7228293112357185, + "grad_norm": 1.5859375, + "learning_rate": 7.386297676336696e-05, + "loss": 6.9533, + "step": 12590 + }, + { + "epoch": 0.7234034409507588, + "grad_norm": 1.6171875, + "learning_rate": 7.382211299348081e-05, + "loss": 6.9604, + "step": 12600 + }, + { + "epoch": 0.723977570665799, + "grad_norm": 1.6796875, + "learning_rate": 7.378122862891585e-05, + "loss": 6.9449, + "step": 12610 + }, + { + "epoch": 0.7245517003808394, + "grad_norm": 1.5625, + "learning_rate": 7.37403237050174e-05, + "loss": 6.9143, + "step": 12620 + }, + { + "epoch": 0.7251258300958796, + "grad_norm": 1.5546875, + "learning_rate": 7.369939825714856e-05, + "loss": 6.9502, + "step": 12630 + }, + { + "epoch": 0.72569995981092, + "grad_norm": 1.578125, + "learning_rate": 7.365845232069019e-05, + "loss": 6.9382, + "step": 12640 + }, + { + "epoch": 0.7262740895259603, + "grad_norm": 1.5546875, + "learning_rate": 7.36174859310408e-05, + "loss": 6.9101, + "step": 12650 + }, + { + "epoch": 0.7268482192410005, + "grad_norm": 1.6875, + "learning_rate": 7.357649912361668e-05, + "loss": 6.9055, + "step": 12660 + }, + { + "epoch": 0.7274223489560409, + "grad_norm": 1.5234375, + "learning_rate": 7.353549193385168e-05, + "loss": 6.9336, + "step": 12670 + }, + { + "epoch": 0.7279964786710811, + "grad_norm": 1.546875, + "learning_rate": 7.349446439719734e-05, + "loss": 6.9425, + "step": 12680 + }, + { + "epoch": 0.7285706083861214, + "grad_norm": 1.734375, + "learning_rate": 7.345341654912274e-05, + "loss": 6.9389, + "step": 12690 + }, + { + "epoch": 0.7291447381011616, + "grad_norm": 1.6328125, + "learning_rate": 7.341234842511456e-05, + "loss": 6.9289, + "step": 12700 + }, + { + "epoch": 0.729718867816202, + "grad_norm": 1.546875, + "learning_rate": 7.337126006067699e-05, + "loss": 6.9337, + "step": 12710 + }, + { + "epoch": 0.7302929975312422, + "grad_norm": 1.609375, + "learning_rate": 7.333015149133169e-05, + "loss": 6.9392, + "step": 12720 + }, + { + "epoch": 0.7308671272462826, + "grad_norm": 1.5546875, + "learning_rate": 7.328902275261785e-05, + "loss": 6.9751, + "step": 12730 + }, + { + "epoch": 0.7314412569613228, + "grad_norm": 1.5078125, + "learning_rate": 7.324787388009204e-05, + "loss": 6.9742, + "step": 12740 + }, + { + "epoch": 0.7320153866763631, + "grad_norm": 1.6328125, + "learning_rate": 7.320670490932827e-05, + "loss": 6.9776, + "step": 12750 + }, + { + "epoch": 0.7325895163914034, + "grad_norm": 1.5625, + "learning_rate": 7.31655158759179e-05, + "loss": 6.9609, + "step": 12760 + }, + { + "epoch": 0.7331636461064437, + "grad_norm": 1.640625, + "learning_rate": 7.312430681546966e-05, + "loss": 6.9548, + "step": 12770 + }, + { + "epoch": 0.7337377758214839, + "grad_norm": 1.5625, + "learning_rate": 7.308307776360959e-05, + "loss": 6.9416, + "step": 12780 + }, + { + "epoch": 0.7343119055365243, + "grad_norm": 1.6328125, + "learning_rate": 7.3041828755981e-05, + "loss": 6.9397, + "step": 12790 + }, + { + "epoch": 0.7348860352515645, + "grad_norm": 1.578125, + "learning_rate": 7.300055982824443e-05, + "loss": 6.938, + "step": 12800 + }, + { + "epoch": 0.7354601649666048, + "grad_norm": 1.6484375, + "learning_rate": 7.295927101607771e-05, + "loss": 6.9473, + "step": 12810 + }, + { + "epoch": 0.736034294681645, + "grad_norm": 1.671875, + "learning_rate": 7.29179623551758e-05, + "loss": 6.9411, + "step": 12820 + }, + { + "epoch": 0.7366084243966854, + "grad_norm": 1.6171875, + "learning_rate": 7.287663388125083e-05, + "loss": 6.9525, + "step": 12830 + }, + { + "epoch": 0.7371825541117256, + "grad_norm": 1.578125, + "learning_rate": 7.283528563003208e-05, + "loss": 6.9232, + "step": 12840 + }, + { + "epoch": 0.737756683826766, + "grad_norm": 1.5625, + "learning_rate": 7.27939176372659e-05, + "loss": 6.9248, + "step": 12850 + }, + { + "epoch": 0.7383308135418062, + "grad_norm": 1.5703125, + "learning_rate": 7.275252993871576e-05, + "loss": 6.9764, + "step": 12860 + }, + { + "epoch": 0.7389049432568465, + "grad_norm": 1.625, + "learning_rate": 7.27111225701621e-05, + "loss": 6.9333, + "step": 12870 + }, + { + "epoch": 0.7394790729718868, + "grad_norm": 1.5546875, + "learning_rate": 7.266969556740239e-05, + "loss": 6.9455, + "step": 12880 + }, + { + "epoch": 0.7400532026869271, + "grad_norm": 1.59375, + "learning_rate": 7.262824896625107e-05, + "loss": 6.919, + "step": 12890 + }, + { + "epoch": 0.7406273324019673, + "grad_norm": 1.5546875, + "learning_rate": 7.258678280253954e-05, + "loss": 6.9078, + "step": 12900 + }, + { + "epoch": 0.7412014621170077, + "grad_norm": 1.5859375, + "learning_rate": 7.254529711211612e-05, + "loss": 6.9439, + "step": 12910 + }, + { + "epoch": 0.7417755918320479, + "grad_norm": 1.6171875, + "learning_rate": 7.2503791930846e-05, + "loss": 6.9729, + "step": 12920 + }, + { + "epoch": 0.7423497215470882, + "grad_norm": 1.65625, + "learning_rate": 7.246226729461117e-05, + "loss": 6.9428, + "step": 12930 + }, + { + "epoch": 0.7429238512621285, + "grad_norm": 1.625, + "learning_rate": 7.242072323931051e-05, + "loss": 6.9696, + "step": 12940 + }, + { + "epoch": 0.7434979809771688, + "grad_norm": 1.671875, + "learning_rate": 7.237915980085966e-05, + "loss": 6.9893, + "step": 12950 + }, + { + "epoch": 0.744072110692209, + "grad_norm": 1.65625, + "learning_rate": 7.233757701519103e-05, + "loss": 6.9275, + "step": 12960 + }, + { + "epoch": 0.7446462404072494, + "grad_norm": 1.5234375, + "learning_rate": 7.229597491825374e-05, + "loss": 6.9448, + "step": 12970 + }, + { + "epoch": 0.7452203701222896, + "grad_norm": 1.6171875, + "learning_rate": 7.22543535460136e-05, + "loss": 6.9222, + "step": 12980 + }, + { + "epoch": 0.7457944998373299, + "grad_norm": 1.65625, + "learning_rate": 7.221271293445308e-05, + "loss": 6.9139, + "step": 12990 + }, + { + "epoch": 0.7463686295523702, + "grad_norm": 1.53125, + "learning_rate": 7.217105311957135e-05, + "loss": 6.9575, + "step": 13000 + }, + { + "epoch": 0.7469427592674105, + "grad_norm": 1.65625, + "learning_rate": 7.212937413738408e-05, + "loss": 6.9712, + "step": 13010 + }, + { + "epoch": 0.7475168889824507, + "grad_norm": 1.5625, + "learning_rate": 7.208767602392354e-05, + "loss": 6.9427, + "step": 13020 + }, + { + "epoch": 0.7480910186974911, + "grad_norm": 1.6640625, + "learning_rate": 7.204595881523862e-05, + "loss": 6.8905, + "step": 13030 + }, + { + "epoch": 0.7486651484125313, + "grad_norm": 1.625, + "learning_rate": 7.200422254739463e-05, + "loss": 6.9344, + "step": 13040 + }, + { + "epoch": 0.7492392781275716, + "grad_norm": 1.6328125, + "learning_rate": 7.196246725647338e-05, + "loss": 6.9794, + "step": 13050 + }, + { + "epoch": 0.7498134078426119, + "grad_norm": 1.59375, + "learning_rate": 7.19206929785731e-05, + "loss": 6.9557, + "step": 13060 + }, + { + "epoch": 0.7503875375576522, + "grad_norm": 1.6171875, + "learning_rate": 7.187889974980852e-05, + "loss": 6.9585, + "step": 13070 + }, + { + "epoch": 0.7509616672726924, + "grad_norm": 1.6328125, + "learning_rate": 7.183708760631064e-05, + "loss": 6.9335, + "step": 13080 + }, + { + "epoch": 0.7515357969877328, + "grad_norm": 1.5703125, + "learning_rate": 7.179525658422693e-05, + "loss": 6.9735, + "step": 13090 + }, + { + "epoch": 0.7521099267027731, + "grad_norm": 1.6328125, + "learning_rate": 7.175340671972108e-05, + "loss": 6.9229, + "step": 13100 + }, + { + "epoch": 0.7526840564178133, + "grad_norm": 1.53125, + "learning_rate": 7.17115380489731e-05, + "loss": 6.952, + "step": 13110 + }, + { + "epoch": 0.7532581861328537, + "grad_norm": 1.7109375, + "learning_rate": 7.166965060817929e-05, + "loss": 6.942, + "step": 13120 + }, + { + "epoch": 0.7538323158478939, + "grad_norm": 1.5859375, + "learning_rate": 7.162774443355218e-05, + "loss": 6.9544, + "step": 13130 + }, + { + "epoch": 0.7544064455629342, + "grad_norm": 1.65625, + "learning_rate": 7.158581956132042e-05, + "loss": 6.9458, + "step": 13140 + }, + { + "epoch": 0.7549805752779745, + "grad_norm": 1.6015625, + "learning_rate": 7.154387602772889e-05, + "loss": 6.9283, + "step": 13150 + }, + { + "epoch": 0.7555547049930148, + "grad_norm": 1.6171875, + "learning_rate": 7.150191386903861e-05, + "loss": 6.9262, + "step": 13160 + }, + { + "epoch": 0.756128834708055, + "grad_norm": 1.6796875, + "learning_rate": 7.145993312152666e-05, + "loss": 6.9766, + "step": 13170 + }, + { + "epoch": 0.7567029644230954, + "grad_norm": 1.5390625, + "learning_rate": 7.141793382148621e-05, + "loss": 6.9395, + "step": 13180 + }, + { + "epoch": 0.7572770941381356, + "grad_norm": 1.6328125, + "learning_rate": 7.137591600522649e-05, + "loss": 6.9313, + "step": 13190 + }, + { + "epoch": 0.7578512238531759, + "grad_norm": 1.59375, + "learning_rate": 7.133387970907268e-05, + "loss": 6.9541, + "step": 13200 + }, + { + "epoch": 0.7584253535682162, + "grad_norm": 1.6171875, + "learning_rate": 7.129182496936602e-05, + "loss": 6.9298, + "step": 13210 + }, + { + "epoch": 0.7589994832832565, + "grad_norm": 1.59375, + "learning_rate": 7.124975182246361e-05, + "loss": 6.9438, + "step": 13220 + }, + { + "epoch": 0.7595736129982967, + "grad_norm": 1.65625, + "learning_rate": 7.120766030473854e-05, + "loss": 6.9108, + "step": 13230 + }, + { + "epoch": 0.7601477427133371, + "grad_norm": 1.5390625, + "learning_rate": 7.116555045257969e-05, + "loss": 6.9529, + "step": 13240 + }, + { + "epoch": 0.7607218724283773, + "grad_norm": 1.5859375, + "learning_rate": 7.11234223023919e-05, + "loss": 6.9413, + "step": 13250 + }, + { + "epoch": 0.7612960021434176, + "grad_norm": 1.640625, + "learning_rate": 7.108127589059573e-05, + "loss": 6.9513, + "step": 13260 + }, + { + "epoch": 0.7618701318584579, + "grad_norm": 1.515625, + "learning_rate": 7.103911125362762e-05, + "loss": 6.9433, + "step": 13270 + }, + { + "epoch": 0.7624442615734982, + "grad_norm": 1.53125, + "learning_rate": 7.099692842793964e-05, + "loss": 6.9542, + "step": 13280 + }, + { + "epoch": 0.7630183912885384, + "grad_norm": 1.5859375, + "learning_rate": 7.095472744999973e-05, + "loss": 6.9232, + "step": 13290 + }, + { + "epoch": 0.7635925210035788, + "grad_norm": 1.609375, + "learning_rate": 7.091250835629143e-05, + "loss": 6.8932, + "step": 13300 + }, + { + "epoch": 0.764166650718619, + "grad_norm": 1.59375, + "learning_rate": 7.087027118331397e-05, + "loss": 6.9483, + "step": 13310 + }, + { + "epoch": 0.7647407804336593, + "grad_norm": 1.625, + "learning_rate": 7.082801596758219e-05, + "loss": 6.9503, + "step": 13320 + }, + { + "epoch": 0.7653149101486996, + "grad_norm": 1.546875, + "learning_rate": 7.078574274562657e-05, + "loss": 6.9326, + "step": 13330 + }, + { + "epoch": 0.7658890398637399, + "grad_norm": 1.6171875, + "learning_rate": 7.07434515539931e-05, + "loss": 6.9463, + "step": 13340 + }, + { + "epoch": 0.7664631695787801, + "grad_norm": 1.625, + "learning_rate": 7.070114242924337e-05, + "loss": 6.9759, + "step": 13350 + }, + { + "epoch": 0.7670372992938205, + "grad_norm": 1.6640625, + "learning_rate": 7.06588154079544e-05, + "loss": 6.903, + "step": 13360 + }, + { + "epoch": 0.7676114290088607, + "grad_norm": 1.484375, + "learning_rate": 7.061647052671873e-05, + "loss": 6.9557, + "step": 13370 + }, + { + "epoch": 0.768185558723901, + "grad_norm": 1.6171875, + "learning_rate": 7.057410782214438e-05, + "loss": 6.9652, + "step": 13380 + }, + { + "epoch": 0.7687596884389413, + "grad_norm": 1.578125, + "learning_rate": 7.053172733085466e-05, + "loss": 6.9292, + "step": 13390 + }, + { + "epoch": 0.7693338181539816, + "grad_norm": 1.59375, + "learning_rate": 7.048932908948839e-05, + "loss": 6.9163, + "step": 13400 + }, + { + "epoch": 0.7699079478690218, + "grad_norm": 1.6015625, + "learning_rate": 7.04469131346996e-05, + "loss": 6.9463, + "step": 13410 + }, + { + "epoch": 0.7704820775840622, + "grad_norm": 1.5859375, + "learning_rate": 7.040447950315779e-05, + "loss": 6.9376, + "step": 13420 + }, + { + "epoch": 0.7710562072991024, + "grad_norm": 1.53125, + "learning_rate": 7.03620282315476e-05, + "loss": 6.9563, + "step": 13430 + }, + { + "epoch": 0.7716303370141427, + "grad_norm": 1.59375, + "learning_rate": 7.031955935656899e-05, + "loss": 6.938, + "step": 13440 + }, + { + "epoch": 0.772204466729183, + "grad_norm": 1.6015625, + "learning_rate": 7.027707291493711e-05, + "loss": 6.9266, + "step": 13450 + }, + { + "epoch": 0.7727785964442233, + "grad_norm": 1.6875, + "learning_rate": 7.023456894338235e-05, + "loss": 6.9203, + "step": 13460 + }, + { + "epoch": 0.7733527261592635, + "grad_norm": 1.6015625, + "learning_rate": 7.01920474786502e-05, + "loss": 6.8945, + "step": 13470 + }, + { + "epoch": 0.7739268558743039, + "grad_norm": 1.6171875, + "learning_rate": 7.01495085575013e-05, + "loss": 6.9758, + "step": 13480 + }, + { + "epoch": 0.7745009855893441, + "grad_norm": 1.640625, + "learning_rate": 7.010695221671135e-05, + "loss": 6.9387, + "step": 13490 + }, + { + "epoch": 0.7750751153043844, + "grad_norm": 1.65625, + "learning_rate": 7.006437849307115e-05, + "loss": 6.9299, + "step": 13500 + }, + { + "epoch": 0.7756492450194247, + "grad_norm": 2.15625, + "learning_rate": 7.00217874233865e-05, + "loss": 6.9433, + "step": 13510 + }, + { + "epoch": 0.776223374734465, + "grad_norm": 1.59375, + "learning_rate": 6.997917904447823e-05, + "loss": 6.9512, + "step": 13520 + }, + { + "epoch": 0.7767975044495052, + "grad_norm": 1.6484375, + "learning_rate": 6.993655339318208e-05, + "loss": 6.9717, + "step": 13530 + }, + { + "epoch": 0.7773716341645456, + "grad_norm": 1.5625, + "learning_rate": 6.989391050634877e-05, + "loss": 6.9223, + "step": 13540 + }, + { + "epoch": 0.7779457638795859, + "grad_norm": 1.6640625, + "learning_rate": 6.985125042084388e-05, + "loss": 6.9604, + "step": 13550 + }, + { + "epoch": 0.7785198935946261, + "grad_norm": 1.6171875, + "learning_rate": 6.980857317354792e-05, + "loss": 6.9727, + "step": 13560 + }, + { + "epoch": 0.7790940233096665, + "grad_norm": 1.5546875, + "learning_rate": 6.976587880135617e-05, + "loss": 6.9454, + "step": 13570 + }, + { + "epoch": 0.7796681530247067, + "grad_norm": 1.671875, + "learning_rate": 6.972316734117874e-05, + "loss": 6.9135, + "step": 13580 + }, + { + "epoch": 0.780242282739747, + "grad_norm": 1.6171875, + "learning_rate": 6.968043882994054e-05, + "loss": 6.9029, + "step": 13590 + }, + { + "epoch": 0.7808164124547873, + "grad_norm": 1.6015625, + "learning_rate": 6.963769330458117e-05, + "loss": 6.9762, + "step": 13600 + }, + { + "epoch": 0.7813905421698276, + "grad_norm": 1.609375, + "learning_rate": 6.959493080205499e-05, + "loss": 6.9443, + "step": 13610 + }, + { + "epoch": 0.7819646718848678, + "grad_norm": 1.5390625, + "learning_rate": 6.9552151359331e-05, + "loss": 6.9077, + "step": 13620 + }, + { + "epoch": 0.7825388015999082, + "grad_norm": 1.6328125, + "learning_rate": 6.950935501339284e-05, + "loss": 6.9295, + "step": 13630 + }, + { + "epoch": 0.7831129313149484, + "grad_norm": 1.5546875, + "learning_rate": 6.946654180123883e-05, + "loss": 6.9423, + "step": 13640 + }, + { + "epoch": 0.7836870610299888, + "grad_norm": 1.6015625, + "learning_rate": 6.942371175988178e-05, + "loss": 6.9018, + "step": 13650 + }, + { + "epoch": 0.784261190745029, + "grad_norm": 1.65625, + "learning_rate": 6.93808649263491e-05, + "loss": 6.9466, + "step": 13660 + }, + { + "epoch": 0.7848353204600693, + "grad_norm": 1.6953125, + "learning_rate": 6.933800133768274e-05, + "loss": 6.9222, + "step": 13670 + }, + { + "epoch": 0.7854094501751095, + "grad_norm": 1.6640625, + "learning_rate": 6.929512103093905e-05, + "loss": 6.892, + "step": 13680 + }, + { + "epoch": 0.7859835798901499, + "grad_norm": 1.6328125, + "learning_rate": 6.925222404318892e-05, + "loss": 6.9525, + "step": 13690 + }, + { + "epoch": 0.7865577096051901, + "grad_norm": 1.59375, + "learning_rate": 6.920931041151764e-05, + "loss": 6.9537, + "step": 13700 + }, + { + "epoch": 0.7871318393202305, + "grad_norm": 1.6328125, + "learning_rate": 6.916638017302484e-05, + "loss": 6.9071, + "step": 13710 + }, + { + "epoch": 0.7877059690352707, + "grad_norm": 1.578125, + "learning_rate": 6.912343336482456e-05, + "loss": 6.9564, + "step": 13720 + }, + { + "epoch": 0.788280098750311, + "grad_norm": 1.578125, + "learning_rate": 6.908047002404517e-05, + "loss": 6.937, + "step": 13730 + }, + { + "epoch": 0.7888542284653512, + "grad_norm": 1.640625, + "learning_rate": 6.903749018782928e-05, + "loss": 6.983, + "step": 13740 + }, + { + "epoch": 0.7894283581803916, + "grad_norm": 1.625, + "learning_rate": 6.899449389333382e-05, + "loss": 6.9388, + "step": 13750 + }, + { + "epoch": 0.7900024878954318, + "grad_norm": 1.65625, + "learning_rate": 6.89514811777299e-05, + "loss": 6.9097, + "step": 13760 + }, + { + "epoch": 0.7905766176104722, + "grad_norm": 1.546875, + "learning_rate": 6.890845207820286e-05, + "loss": 6.9503, + "step": 13770 + }, + { + "epoch": 0.7911507473255124, + "grad_norm": 1.6171875, + "learning_rate": 6.886540663195218e-05, + "loss": 6.9628, + "step": 13780 + }, + { + "epoch": 0.7917248770405527, + "grad_norm": 1.5625, + "learning_rate": 6.882234487619149e-05, + "loss": 6.921, + "step": 13790 + }, + { + "epoch": 0.792299006755593, + "grad_norm": 1.53125, + "learning_rate": 6.877926684814853e-05, + "loss": 6.901, + "step": 13800 + }, + { + "epoch": 0.7928731364706333, + "grad_norm": 1.5625, + "learning_rate": 6.873617258506504e-05, + "loss": 6.9464, + "step": 13810 + }, + { + "epoch": 0.7934472661856735, + "grad_norm": 1.5703125, + "learning_rate": 6.86930621241969e-05, + "loss": 6.9005, + "step": 13820 + }, + { + "epoch": 0.7940213959007139, + "grad_norm": 1.5546875, + "learning_rate": 6.864993550281393e-05, + "loss": 6.9325, + "step": 13830 + }, + { + "epoch": 0.7945955256157541, + "grad_norm": 1.6171875, + "learning_rate": 6.86067927581999e-05, + "loss": 6.9075, + "step": 13840 + }, + { + "epoch": 0.7951696553307944, + "grad_norm": 1.5625, + "learning_rate": 6.856363392765257e-05, + "loss": 6.8993, + "step": 13850 + }, + { + "epoch": 0.7957437850458347, + "grad_norm": 1.5859375, + "learning_rate": 6.85204590484836e-05, + "loss": 6.9175, + "step": 13860 + }, + { + "epoch": 0.796317914760875, + "grad_norm": 1.6484375, + "learning_rate": 6.84772681580185e-05, + "loss": 6.9469, + "step": 13870 + }, + { + "epoch": 0.7968920444759152, + "grad_norm": 1.546875, + "learning_rate": 6.843406129359661e-05, + "loss": 6.9306, + "step": 13880 + }, + { + "epoch": 0.7974661741909556, + "grad_norm": 1.515625, + "learning_rate": 6.839083849257113e-05, + "loss": 6.925, + "step": 13890 + }, + { + "epoch": 0.7980403039059958, + "grad_norm": 1.609375, + "learning_rate": 6.8347599792309e-05, + "loss": 6.9039, + "step": 13900 + }, + { + "epoch": 0.7986144336210361, + "grad_norm": 1.5546875, + "learning_rate": 6.830434523019091e-05, + "loss": 6.9328, + "step": 13910 + }, + { + "epoch": 0.7991885633360764, + "grad_norm": 1.640625, + "learning_rate": 6.826107484361129e-05, + "loss": 6.9284, + "step": 13920 + }, + { + "epoch": 0.7997626930511167, + "grad_norm": 1.5703125, + "learning_rate": 6.821778866997822e-05, + "loss": 6.9301, + "step": 13930 + }, + { + "epoch": 0.8003368227661569, + "grad_norm": 1.59375, + "learning_rate": 6.817448674671341e-05, + "loss": 6.9278, + "step": 13940 + }, + { + "epoch": 0.8009109524811973, + "grad_norm": 1.5390625, + "learning_rate": 6.813116911125225e-05, + "loss": 6.9447, + "step": 13950 + }, + { + "epoch": 0.8014850821962375, + "grad_norm": 1.59375, + "learning_rate": 6.808783580104365e-05, + "loss": 6.9523, + "step": 13960 + }, + { + "epoch": 0.8020592119112778, + "grad_norm": 1.578125, + "learning_rate": 6.804448685355011e-05, + "loss": 6.9536, + "step": 13970 + }, + { + "epoch": 0.802633341626318, + "grad_norm": 1.53125, + "learning_rate": 6.800112230624764e-05, + "loss": 6.9397, + "step": 13980 + }, + { + "epoch": 0.8032074713413584, + "grad_norm": 1.578125, + "learning_rate": 6.795774219662569e-05, + "loss": 6.9319, + "step": 13990 + }, + { + "epoch": 0.8037816010563986, + "grad_norm": 1.515625, + "learning_rate": 6.791434656218729e-05, + "loss": 7.0099, + "step": 14000 + }, + { + "epoch": 0.804355730771439, + "grad_norm": 1.65625, + "learning_rate": 6.787093544044873e-05, + "loss": 6.9383, + "step": 14010 + }, + { + "epoch": 0.8049298604864793, + "grad_norm": 1.5859375, + "learning_rate": 6.782750886893981e-05, + "loss": 6.952, + "step": 14020 + }, + { + "epoch": 0.8055039902015195, + "grad_norm": 1.546875, + "learning_rate": 6.778406688520362e-05, + "loss": 6.9498, + "step": 14030 + }, + { + "epoch": 0.8060781199165599, + "grad_norm": 1.609375, + "learning_rate": 6.774060952679661e-05, + "loss": 6.9687, + "step": 14040 + }, + { + "epoch": 0.8066522496316001, + "grad_norm": 1.484375, + "learning_rate": 6.769713683128851e-05, + "loss": 6.9473, + "step": 14050 + }, + { + "epoch": 0.8072263793466404, + "grad_norm": 1.625, + "learning_rate": 6.76536488362623e-05, + "loss": 6.924, + "step": 14060 + }, + { + "epoch": 0.8078005090616807, + "grad_norm": 1.6953125, + "learning_rate": 6.761014557931421e-05, + "loss": 6.9409, + "step": 14070 + }, + { + "epoch": 0.808374638776721, + "grad_norm": 1.65625, + "learning_rate": 6.756662709805363e-05, + "loss": 6.9435, + "step": 14080 + }, + { + "epoch": 0.8089487684917612, + "grad_norm": 1.609375, + "learning_rate": 6.752309343010316e-05, + "loss": 6.9117, + "step": 14090 + }, + { + "epoch": 0.8095228982068016, + "grad_norm": 1.6015625, + "learning_rate": 6.747954461309847e-05, + "loss": 6.9805, + "step": 14100 + }, + { + "epoch": 0.8100970279218418, + "grad_norm": 1.625, + "learning_rate": 6.743598068468837e-05, + "loss": 6.9137, + "step": 14110 + }, + { + "epoch": 0.8106711576368821, + "grad_norm": 1.59375, + "learning_rate": 6.739240168253471e-05, + "loss": 6.9724, + "step": 14120 + }, + { + "epoch": 0.8112452873519224, + "grad_norm": 1.6015625, + "learning_rate": 6.734880764431242e-05, + "loss": 6.8846, + "step": 14130 + }, + { + "epoch": 0.8118194170669627, + "grad_norm": 1.6015625, + "learning_rate": 6.730519860770935e-05, + "loss": 6.9561, + "step": 14140 + }, + { + "epoch": 0.8123935467820029, + "grad_norm": 1.53125, + "learning_rate": 6.726157461042637e-05, + "loss": 6.9365, + "step": 14150 + }, + { + "epoch": 0.8129676764970433, + "grad_norm": 1.5703125, + "learning_rate": 6.721793569017727e-05, + "loss": 6.8861, + "step": 14160 + }, + { + "epoch": 0.8135418062120835, + "grad_norm": 1.546875, + "learning_rate": 6.717428188468875e-05, + "loss": 6.9391, + "step": 14170 + }, + { + "epoch": 0.8141159359271238, + "grad_norm": 1.640625, + "learning_rate": 6.713061323170038e-05, + "loss": 6.9643, + "step": 14180 + }, + { + "epoch": 0.8146900656421641, + "grad_norm": 1.5859375, + "learning_rate": 6.708692976896454e-05, + "loss": 6.9478, + "step": 14190 + }, + { + "epoch": 0.8152641953572044, + "grad_norm": 1.609375, + "learning_rate": 6.704323153424643e-05, + "loss": 6.8882, + "step": 14200 + }, + { + "epoch": 0.8158383250722446, + "grad_norm": 1.734375, + "learning_rate": 6.699951856532405e-05, + "loss": 6.9549, + "step": 14210 + }, + { + "epoch": 0.816412454787285, + "grad_norm": 1.5625, + "learning_rate": 6.695579089998808e-05, + "loss": 6.9198, + "step": 14220 + }, + { + "epoch": 0.8169865845023252, + "grad_norm": 1.5546875, + "learning_rate": 6.691204857604195e-05, + "loss": 6.9332, + "step": 14230 + }, + { + "epoch": 0.8175607142173655, + "grad_norm": 1.6171875, + "learning_rate": 6.686829163130173e-05, + "loss": 6.968, + "step": 14240 + }, + { + "epoch": 0.8181348439324058, + "grad_norm": 1.7421875, + "learning_rate": 6.682452010359616e-05, + "loss": 6.9519, + "step": 14250 + }, + { + "epoch": 0.8187089736474461, + "grad_norm": 1.71875, + "learning_rate": 6.678073403076658e-05, + "loss": 6.9242, + "step": 14260 + }, + { + "epoch": 0.8192831033624863, + "grad_norm": 1.6328125, + "learning_rate": 6.673693345066691e-05, + "loss": 6.961, + "step": 14270 + }, + { + "epoch": 0.8198572330775267, + "grad_norm": 1.59375, + "learning_rate": 6.669311840116357e-05, + "loss": 6.9111, + "step": 14280 + }, + { + "epoch": 0.8204313627925669, + "grad_norm": 1.5859375, + "learning_rate": 6.664928892013553e-05, + "loss": 6.9245, + "step": 14290 + }, + { + "epoch": 0.8210054925076072, + "grad_norm": 1.546875, + "learning_rate": 6.660544504547423e-05, + "loss": 6.9198, + "step": 14300 + }, + { + "epoch": 0.8215796222226475, + "grad_norm": 1.5859375, + "learning_rate": 6.656158681508357e-05, + "loss": 6.9587, + "step": 14310 + }, + { + "epoch": 0.8221537519376878, + "grad_norm": 1.6171875, + "learning_rate": 6.651771426687983e-05, + "loss": 6.9579, + "step": 14320 + }, + { + "epoch": 0.822727881652728, + "grad_norm": 1.6015625, + "learning_rate": 6.647382743879166e-05, + "loss": 6.9472, + "step": 14330 + }, + { + "epoch": 0.8233020113677684, + "grad_norm": 1.5390625, + "learning_rate": 6.642992636876007e-05, + "loss": 6.9644, + "step": 14340 + }, + { + "epoch": 0.8238761410828086, + "grad_norm": 1.6015625, + "learning_rate": 6.638601109473842e-05, + "loss": 6.9212, + "step": 14350 + }, + { + "epoch": 0.8244502707978489, + "grad_norm": 1.609375, + "learning_rate": 6.634208165469231e-05, + "loss": 6.9237, + "step": 14360 + }, + { + "epoch": 0.8250244005128892, + "grad_norm": 1.625, + "learning_rate": 6.629813808659958e-05, + "loss": 6.9283, + "step": 14370 + }, + { + "epoch": 0.8255985302279295, + "grad_norm": 1.578125, + "learning_rate": 6.625418042845028e-05, + "loss": 6.9112, + "step": 14380 + }, + { + "epoch": 0.8261726599429697, + "grad_norm": 1.6328125, + "learning_rate": 6.621020871824668e-05, + "loss": 6.9496, + "step": 14390 + }, + { + "epoch": 0.8267467896580101, + "grad_norm": 1.65625, + "learning_rate": 6.616622299400319e-05, + "loss": 6.9288, + "step": 14400 + }, + { + "epoch": 0.8273209193730503, + "grad_norm": 1.5, + "learning_rate": 6.612222329374631e-05, + "loss": 6.923, + "step": 14410 + }, + { + "epoch": 0.8278950490880906, + "grad_norm": 1.6953125, + "learning_rate": 6.607820965551462e-05, + "loss": 6.9171, + "step": 14420 + }, + { + "epoch": 0.8284691788031309, + "grad_norm": 1.6484375, + "learning_rate": 6.603418211735876e-05, + "loss": 6.9524, + "step": 14430 + }, + { + "epoch": 0.8290433085181712, + "grad_norm": 1.65625, + "learning_rate": 6.599014071734145e-05, + "loss": 6.9356, + "step": 14440 + }, + { + "epoch": 0.8296174382332114, + "grad_norm": 1.53125, + "learning_rate": 6.594608549353725e-05, + "loss": 6.8981, + "step": 14450 + }, + { + "epoch": 0.8301915679482518, + "grad_norm": 1.6640625, + "learning_rate": 6.59020164840328e-05, + "loss": 6.9348, + "step": 14460 + }, + { + "epoch": 0.8307656976632921, + "grad_norm": 1.6328125, + "learning_rate": 6.585793372692663e-05, + "loss": 6.9244, + "step": 14470 + }, + { + "epoch": 0.8313398273783323, + "grad_norm": 1.59375, + "learning_rate": 6.581383726032912e-05, + "loss": 6.9678, + "step": 14480 + }, + { + "epoch": 0.8319139570933727, + "grad_norm": 1.6015625, + "learning_rate": 6.57697271223625e-05, + "loss": 6.9098, + "step": 14490 + }, + { + "epoch": 0.8324880868084129, + "grad_norm": 1.6796875, + "learning_rate": 6.572560335116087e-05, + "loss": 6.9442, + "step": 14500 + }, + { + "epoch": 0.8330622165234532, + "grad_norm": 1.515625, + "learning_rate": 6.568146598487007e-05, + "loss": 6.9094, + "step": 14510 + }, + { + "epoch": 0.8336363462384935, + "grad_norm": 1.640625, + "learning_rate": 6.563731506164772e-05, + "loss": 6.9335, + "step": 14520 + }, + { + "epoch": 0.8342104759535338, + "grad_norm": 1.6015625, + "learning_rate": 6.559315061966314e-05, + "loss": 6.9398, + "step": 14530 + }, + { + "epoch": 0.834784605668574, + "grad_norm": 1.5234375, + "learning_rate": 6.554897269709735e-05, + "loss": 6.9264, + "step": 14540 + }, + { + "epoch": 0.8353587353836144, + "grad_norm": 1.5703125, + "learning_rate": 6.550478133214304e-05, + "loss": 6.9198, + "step": 14550 + }, + { + "epoch": 0.8359328650986546, + "grad_norm": 1.515625, + "learning_rate": 6.546057656300447e-05, + "loss": 6.9202, + "step": 14560 + }, + { + "epoch": 0.836506994813695, + "grad_norm": 1.6171875, + "learning_rate": 6.541635842789752e-05, + "loss": 6.9135, + "step": 14570 + }, + { + "epoch": 0.8370811245287352, + "grad_norm": 1.609375, + "learning_rate": 6.537212696504968e-05, + "loss": 6.9019, + "step": 14580 + }, + { + "epoch": 0.8376552542437755, + "grad_norm": 1.6171875, + "learning_rate": 6.532788221269985e-05, + "loss": 6.891, + "step": 14590 + }, + { + "epoch": 0.8382293839588157, + "grad_norm": 1.6484375, + "learning_rate": 6.528362420909848e-05, + "loss": 6.9124, + "step": 14600 + }, + { + "epoch": 0.8388035136738561, + "grad_norm": 1.59375, + "learning_rate": 6.52393529925075e-05, + "loss": 6.9593, + "step": 14610 + }, + { + "epoch": 0.8393776433888963, + "grad_norm": 1.640625, + "learning_rate": 6.519506860120024e-05, + "loss": 6.9323, + "step": 14620 + }, + { + "epoch": 0.8399517731039366, + "grad_norm": 1.59375, + "learning_rate": 6.515077107346139e-05, + "loss": 6.9259, + "step": 14630 + }, + { + "epoch": 0.8405259028189769, + "grad_norm": 1.6171875, + "learning_rate": 6.5106460447587e-05, + "loss": 6.9423, + "step": 14640 + }, + { + "epoch": 0.8411000325340172, + "grad_norm": 1.7421875, + "learning_rate": 6.506213676188453e-05, + "loss": 6.9264, + "step": 14650 + }, + { + "epoch": 0.8416741622490574, + "grad_norm": 1.6484375, + "learning_rate": 6.501780005467262e-05, + "loss": 6.9357, + "step": 14660 + }, + { + "epoch": 0.8422482919640978, + "grad_norm": 1.6640625, + "learning_rate": 6.497345036428124e-05, + "loss": 6.9022, + "step": 14670 + }, + { + "epoch": 0.842822421679138, + "grad_norm": 1.625, + "learning_rate": 6.492908772905154e-05, + "loss": 6.9009, + "step": 14680 + }, + { + "epoch": 0.8433965513941784, + "grad_norm": 1.6640625, + "learning_rate": 6.488471218733588e-05, + "loss": 6.8893, + "step": 14690 + }, + { + "epoch": 0.8439706811092186, + "grad_norm": 1.5859375, + "learning_rate": 6.484032377749777e-05, + "loss": 6.9118, + "step": 14700 + }, + { + "epoch": 0.8445448108242589, + "grad_norm": 1.5859375, + "learning_rate": 6.479592253791187e-05, + "loss": 6.8947, + "step": 14710 + }, + { + "epoch": 0.8451189405392991, + "grad_norm": 1.578125, + "learning_rate": 6.47515085069639e-05, + "loss": 6.9224, + "step": 14720 + }, + { + "epoch": 0.8456930702543395, + "grad_norm": 1.5703125, + "learning_rate": 6.470708172305065e-05, + "loss": 6.9317, + "step": 14730 + }, + { + "epoch": 0.8462671999693797, + "grad_norm": 1.4921875, + "learning_rate": 6.466264222457997e-05, + "loss": 6.9228, + "step": 14740 + }, + { + "epoch": 0.84684132968442, + "grad_norm": 1.515625, + "learning_rate": 6.46181900499706e-05, + "loss": 6.9008, + "step": 14750 + }, + { + "epoch": 0.8474154593994603, + "grad_norm": 1.609375, + "learning_rate": 6.457372523765238e-05, + "loss": 6.928, + "step": 14760 + }, + { + "epoch": 0.8479895891145006, + "grad_norm": 1.640625, + "learning_rate": 6.452924782606595e-05, + "loss": 6.9059, + "step": 14770 + }, + { + "epoch": 0.8485637188295408, + "grad_norm": 1.5078125, + "learning_rate": 6.448475785366291e-05, + "loss": 6.9193, + "step": 14780 + }, + { + "epoch": 0.8491378485445812, + "grad_norm": 1.5703125, + "learning_rate": 6.444025535890573e-05, + "loss": 6.904, + "step": 14790 + }, + { + "epoch": 0.8497119782596214, + "grad_norm": 1.5859375, + "learning_rate": 6.439574038026766e-05, + "loss": 6.9381, + "step": 14800 + }, + { + "epoch": 0.8502861079746618, + "grad_norm": 1.5546875, + "learning_rate": 6.435121295623276e-05, + "loss": 6.9774, + "step": 14810 + }, + { + "epoch": 0.850860237689702, + "grad_norm": 1.53125, + "learning_rate": 6.430667312529585e-05, + "loss": 6.9291, + "step": 14820 + }, + { + "epoch": 0.8514343674047423, + "grad_norm": 1.578125, + "learning_rate": 6.426212092596248e-05, + "loss": 6.9351, + "step": 14830 + }, + { + "epoch": 0.8520084971197825, + "grad_norm": 1.6875, + "learning_rate": 6.421755639674889e-05, + "loss": 6.9359, + "step": 14840 + }, + { + "epoch": 0.8525826268348229, + "grad_norm": 1.640625, + "learning_rate": 6.4172979576182e-05, + "loss": 6.8811, + "step": 14850 + }, + { + "epoch": 0.8531567565498631, + "grad_norm": 1.671875, + "learning_rate": 6.412839050279929e-05, + "loss": 6.9069, + "step": 14860 + }, + { + "epoch": 0.8537308862649035, + "grad_norm": 1.671875, + "learning_rate": 6.408378921514894e-05, + "loss": 6.936, + "step": 14870 + }, + { + "epoch": 0.8543050159799437, + "grad_norm": 1.7578125, + "learning_rate": 6.403917575178959e-05, + "loss": 6.9129, + "step": 14880 + }, + { + "epoch": 0.854879145694984, + "grad_norm": 1.46875, + "learning_rate": 6.399455015129043e-05, + "loss": 6.912, + "step": 14890 + }, + { + "epoch": 0.8554532754100242, + "grad_norm": 1.609375, + "learning_rate": 6.394991245223121e-05, + "loss": 6.9321, + "step": 14900 + }, + { + "epoch": 0.8560274051250646, + "grad_norm": 1.65625, + "learning_rate": 6.390526269320202e-05, + "loss": 6.8809, + "step": 14910 + }, + { + "epoch": 0.8566015348401049, + "grad_norm": 1.5703125, + "learning_rate": 6.38606009128035e-05, + "loss": 6.9148, + "step": 14920 + }, + { + "epoch": 0.8571756645551452, + "grad_norm": 1.625, + "learning_rate": 6.38159271496466e-05, + "loss": 6.9322, + "step": 14930 + }, + { + "epoch": 0.8577497942701855, + "grad_norm": 1.6171875, + "learning_rate": 6.377124144235265e-05, + "loss": 6.9338, + "step": 14940 + }, + { + "epoch": 0.8583239239852257, + "grad_norm": 1.5859375, + "learning_rate": 6.372654382955334e-05, + "loss": 6.8938, + "step": 14950 + }, + { + "epoch": 0.8588980537002661, + "grad_norm": 1.6328125, + "learning_rate": 6.368183434989058e-05, + "loss": 6.9226, + "step": 14960 + }, + { + "epoch": 0.8594721834153063, + "grad_norm": 1.5546875, + "learning_rate": 6.363711304201661e-05, + "loss": 6.9437, + "step": 14970 + }, + { + "epoch": 0.8600463131303466, + "grad_norm": 1.59375, + "learning_rate": 6.359237994459388e-05, + "loss": 6.9135, + "step": 14980 + }, + { + "epoch": 0.8606204428453869, + "grad_norm": 1.6328125, + "learning_rate": 6.354763509629498e-05, + "loss": 6.9327, + "step": 14990 + }, + { + "epoch": 0.8611945725604272, + "grad_norm": 1.6328125, + "learning_rate": 6.350287853580273e-05, + "loss": 6.8962, + "step": 15000 + }, + { + "epoch": 0.8617687022754674, + "grad_norm": 1.546875, + "learning_rate": 6.345811030181005e-05, + "loss": 6.9345, + "step": 15010 + }, + { + "epoch": 0.8623428319905078, + "grad_norm": 1.6015625, + "learning_rate": 6.341333043301993e-05, + "loss": 6.9071, + "step": 15020 + }, + { + "epoch": 0.862916961705548, + "grad_norm": 1.890625, + "learning_rate": 6.336853896814543e-05, + "loss": 6.9159, + "step": 15030 + }, + { + "epoch": 0.8634910914205883, + "grad_norm": 1.625, + "learning_rate": 6.332373594590964e-05, + "loss": 6.9337, + "step": 15040 + }, + { + "epoch": 0.8640652211356286, + "grad_norm": 1.6484375, + "learning_rate": 6.327892140504567e-05, + "loss": 6.9085, + "step": 15050 + }, + { + "epoch": 0.8646393508506689, + "grad_norm": 1.5234375, + "learning_rate": 6.323409538429656e-05, + "loss": 6.9236, + "step": 15060 + }, + { + "epoch": 0.8652134805657091, + "grad_norm": 1.5625, + "learning_rate": 6.318925792241523e-05, + "loss": 6.9405, + "step": 15070 + }, + { + "epoch": 0.8657876102807495, + "grad_norm": 1.625, + "learning_rate": 6.314440905816457e-05, + "loss": 6.9579, + "step": 15080 + }, + { + "epoch": 0.8663617399957897, + "grad_norm": 1.578125, + "learning_rate": 6.30995488303173e-05, + "loss": 6.9024, + "step": 15090 + }, + { + "epoch": 0.86693586971083, + "grad_norm": 1.578125, + "learning_rate": 6.305467727765592e-05, + "loss": 6.9078, + "step": 15100 + }, + { + "epoch": 0.8675099994258703, + "grad_norm": 1.5703125, + "learning_rate": 6.30097944389728e-05, + "loss": 6.9246, + "step": 15110 + }, + { + "epoch": 0.8680841291409106, + "grad_norm": 1.6875, + "learning_rate": 6.296490035306999e-05, + "loss": 6.9051, + "step": 15120 + }, + { + "epoch": 0.8686582588559508, + "grad_norm": 1.5390625, + "learning_rate": 6.291999505875932e-05, + "loss": 6.9181, + "step": 15130 + }, + { + "epoch": 0.8692323885709912, + "grad_norm": 1.5625, + "learning_rate": 6.287507859486228e-05, + "loss": 6.9262, + "step": 15140 + }, + { + "epoch": 0.8698065182860314, + "grad_norm": 1.5625, + "learning_rate": 6.283015100021002e-05, + "loss": 6.888, + "step": 15150 + }, + { + "epoch": 0.8703806480010717, + "grad_norm": 1.6484375, + "learning_rate": 6.278521231364334e-05, + "loss": 6.9464, + "step": 15160 + }, + { + "epoch": 0.870954777716112, + "grad_norm": 1.59375, + "learning_rate": 6.274026257401258e-05, + "loss": 6.9193, + "step": 15170 + }, + { + "epoch": 0.8715289074311523, + "grad_norm": 1.6796875, + "learning_rate": 6.269530182017766e-05, + "loss": 6.8934, + "step": 15180 + }, + { + "epoch": 0.8721030371461925, + "grad_norm": 1.5, + "learning_rate": 6.265033009100805e-05, + "loss": 6.9851, + "step": 15190 + }, + { + "epoch": 0.8726771668612329, + "grad_norm": 1.546875, + "learning_rate": 6.260534742538267e-05, + "loss": 6.9182, + "step": 15200 + }, + { + "epoch": 0.8732512965762731, + "grad_norm": 1.625, + "learning_rate": 6.256035386218989e-05, + "loss": 6.9386, + "step": 15210 + }, + { + "epoch": 0.8738254262913134, + "grad_norm": 1.640625, + "learning_rate": 6.251534944032754e-05, + "loss": 6.9668, + "step": 15220 + }, + { + "epoch": 0.8743995560063537, + "grad_norm": 1.6328125, + "learning_rate": 6.247033419870281e-05, + "loss": 6.9103, + "step": 15230 + }, + { + "epoch": 0.874973685721394, + "grad_norm": 1.625, + "learning_rate": 6.242530817623225e-05, + "loss": 6.8926, + "step": 15240 + }, + { + "epoch": 0.8755478154364342, + "grad_norm": 1.5625, + "learning_rate": 6.238027141184171e-05, + "loss": 6.9194, + "step": 15250 + }, + { + "epoch": 0.8761219451514746, + "grad_norm": 1.5859375, + "learning_rate": 6.23352239444664e-05, + "loss": 6.8804, + "step": 15260 + }, + { + "epoch": 0.8766960748665148, + "grad_norm": 1.5703125, + "learning_rate": 6.229016581305067e-05, + "loss": 6.9138, + "step": 15270 + }, + { + "epoch": 0.8772702045815551, + "grad_norm": 1.6328125, + "learning_rate": 6.224509705654818e-05, + "loss": 6.8961, + "step": 15280 + }, + { + "epoch": 0.8778443342965954, + "grad_norm": 1.5625, + "learning_rate": 6.220001771392173e-05, + "loss": 6.9099, + "step": 15290 + }, + { + "epoch": 0.8784184640116357, + "grad_norm": 1.6171875, + "learning_rate": 6.21549278241433e-05, + "loss": 6.9214, + "step": 15300 + }, + { + "epoch": 0.8789925937266759, + "grad_norm": 1.6484375, + "learning_rate": 6.210982742619395e-05, + "loss": 6.8928, + "step": 15310 + }, + { + "epoch": 0.8795667234417163, + "grad_norm": 1.5859375, + "learning_rate": 6.206471655906388e-05, + "loss": 6.9285, + "step": 15320 + }, + { + "epoch": 0.8801408531567565, + "grad_norm": 1.578125, + "learning_rate": 6.20195952617523e-05, + "loss": 6.9484, + "step": 15330 + }, + { + "epoch": 0.8807149828717968, + "grad_norm": 1.6640625, + "learning_rate": 6.197446357326745e-05, + "loss": 6.9297, + "step": 15340 + }, + { + "epoch": 0.8812891125868371, + "grad_norm": 1.625, + "learning_rate": 6.192932153262653e-05, + "loss": 6.8879, + "step": 15350 + }, + { + "epoch": 0.8818632423018774, + "grad_norm": 1.53125, + "learning_rate": 6.188416917885572e-05, + "loss": 6.8707, + "step": 15360 + }, + { + "epoch": 0.8824373720169177, + "grad_norm": 1.5390625, + "learning_rate": 6.183900655099013e-05, + "loss": 6.9316, + "step": 15370 + }, + { + "epoch": 0.883011501731958, + "grad_norm": 1.5703125, + "learning_rate": 6.179383368807369e-05, + "loss": 6.9382, + "step": 15380 + }, + { + "epoch": 0.8835856314469983, + "grad_norm": 1.5390625, + "learning_rate": 6.174865062915924e-05, + "loss": 6.9445, + "step": 15390 + }, + { + "epoch": 0.8841597611620385, + "grad_norm": 1.5703125, + "learning_rate": 6.170345741330839e-05, + "loss": 6.8567, + "step": 15400 + }, + { + "epoch": 0.8847338908770789, + "grad_norm": 1.6953125, + "learning_rate": 6.165825407959158e-05, + "loss": 6.9408, + "step": 15410 + }, + { + "epoch": 0.8853080205921191, + "grad_norm": 1.5546875, + "learning_rate": 6.161304066708796e-05, + "loss": 6.9133, + "step": 15420 + }, + { + "epoch": 0.8858821503071594, + "grad_norm": 1.6328125, + "learning_rate": 6.156781721488538e-05, + "loss": 6.8569, + "step": 15430 + }, + { + "epoch": 0.8864562800221997, + "grad_norm": 1.5859375, + "learning_rate": 6.152258376208042e-05, + "loss": 6.8869, + "step": 15440 + }, + { + "epoch": 0.88703040973724, + "grad_norm": 1.671875, + "learning_rate": 6.147734034777828e-05, + "loss": 6.9566, + "step": 15450 + }, + { + "epoch": 0.8876045394522802, + "grad_norm": 1.578125, + "learning_rate": 6.143208701109274e-05, + "loss": 6.8694, + "step": 15460 + }, + { + "epoch": 0.8881786691673206, + "grad_norm": 1.6328125, + "learning_rate": 6.13868237911462e-05, + "loss": 6.9282, + "step": 15470 + }, + { + "epoch": 0.8887527988823608, + "grad_norm": 1.59375, + "learning_rate": 6.13415507270696e-05, + "loss": 6.9229, + "step": 15480 + }, + { + "epoch": 0.8893269285974011, + "grad_norm": 1.6171875, + "learning_rate": 6.129626785800237e-05, + "loss": 6.9499, + "step": 15490 + }, + { + "epoch": 0.8899010583124414, + "grad_norm": 1.640625, + "learning_rate": 6.125097522309245e-05, + "loss": 6.9157, + "step": 15500 + }, + { + "epoch": 0.8904751880274817, + "grad_norm": 1.5703125, + "learning_rate": 6.120567286149618e-05, + "loss": 6.9177, + "step": 15510 + }, + { + "epoch": 0.8910493177425219, + "grad_norm": 1.6953125, + "learning_rate": 6.116036081237834e-05, + "loss": 6.8992, + "step": 15520 + }, + { + "epoch": 0.8916234474575623, + "grad_norm": 1.5546875, + "learning_rate": 6.111503911491207e-05, + "loss": 6.8907, + "step": 15530 + }, + { + "epoch": 0.8921975771726025, + "grad_norm": 1.53125, + "learning_rate": 6.106970780827886e-05, + "loss": 6.8982, + "step": 15540 + }, + { + "epoch": 0.8927717068876428, + "grad_norm": 1.59375, + "learning_rate": 6.102436693166852e-05, + "loss": 6.9049, + "step": 15550 + }, + { + "epoch": 0.8933458366026831, + "grad_norm": 1.546875, + "learning_rate": 6.0979016524279077e-05, + "loss": 6.9312, + "step": 15560 + }, + { + "epoch": 0.8939199663177234, + "grad_norm": 1.515625, + "learning_rate": 6.093365662531686e-05, + "loss": 6.8921, + "step": 15570 + }, + { + "epoch": 0.8944940960327636, + "grad_norm": 1.59375, + "learning_rate": 6.0888287273996404e-05, + "loss": 6.9304, + "step": 15580 + }, + { + "epoch": 0.895068225747804, + "grad_norm": 1.53125, + "learning_rate": 6.084290850954036e-05, + "loss": 6.921, + "step": 15590 + }, + { + "epoch": 0.8956423554628442, + "grad_norm": 1.6171875, + "learning_rate": 6.0797520371179585e-05, + "loss": 6.9617, + "step": 15600 + }, + { + "epoch": 0.8962164851778845, + "grad_norm": 1.546875, + "learning_rate": 6.0752122898152955e-05, + "loss": 6.9234, + "step": 15610 + }, + { + "epoch": 0.8967906148929248, + "grad_norm": 1.6015625, + "learning_rate": 6.070671612970751e-05, + "loss": 6.9161, + "step": 15620 + }, + { + "epoch": 0.8973647446079651, + "grad_norm": 1.59375, + "learning_rate": 6.066130010509827e-05, + "loss": 6.9208, + "step": 15630 + }, + { + "epoch": 0.8979388743230053, + "grad_norm": 1.5703125, + "learning_rate": 6.061587486358826e-05, + "loss": 6.9033, + "step": 15640 + }, + { + "epoch": 0.8985130040380457, + "grad_norm": 1.5625, + "learning_rate": 6.057044044444848e-05, + "loss": 6.9014, + "step": 15650 + }, + { + "epoch": 0.8990871337530859, + "grad_norm": 1.65625, + "learning_rate": 6.0524996886957896e-05, + "loss": 6.8919, + "step": 15660 + }, + { + "epoch": 0.8996612634681262, + "grad_norm": 1.640625, + "learning_rate": 6.047954423040332e-05, + "loss": 6.9082, + "step": 15670 + }, + { + "epoch": 0.9002353931831665, + "grad_norm": 1.625, + "learning_rate": 6.043408251407945e-05, + "loss": 6.9212, + "step": 15680 + }, + { + "epoch": 0.9008095228982068, + "grad_norm": 1.578125, + "learning_rate": 6.038861177728884e-05, + "loss": 6.9136, + "step": 15690 + }, + { + "epoch": 0.901383652613247, + "grad_norm": 1.5859375, + "learning_rate": 6.03431320593418e-05, + "loss": 6.9018, + "step": 15700 + }, + { + "epoch": 0.9019577823282874, + "grad_norm": 1.7109375, + "learning_rate": 6.029764339955646e-05, + "loss": 6.9162, + "step": 15710 + }, + { + "epoch": 0.9025319120433276, + "grad_norm": 1.546875, + "learning_rate": 6.0252145837258636e-05, + "loss": 6.9063, + "step": 15720 + }, + { + "epoch": 0.903106041758368, + "grad_norm": 1.6640625, + "learning_rate": 6.020663941178184e-05, + "loss": 6.8981, + "step": 15730 + }, + { + "epoch": 0.9036801714734082, + "grad_norm": 1.6484375, + "learning_rate": 6.016112416246726e-05, + "loss": 6.8945, + "step": 15740 + }, + { + "epoch": 0.9042543011884485, + "grad_norm": 1.578125, + "learning_rate": 6.011560012866375e-05, + "loss": 6.9036, + "step": 15750 + }, + { + "epoch": 0.9048284309034887, + "grad_norm": 1.65625, + "learning_rate": 6.0070067349727675e-05, + "loss": 6.9011, + "step": 15760 + }, + { + "epoch": 0.9054025606185291, + "grad_norm": 1.625, + "learning_rate": 6.002452586502303e-05, + "loss": 6.9039, + "step": 15770 + }, + { + "epoch": 0.9059766903335693, + "grad_norm": 1.5078125, + "learning_rate": 5.9978975713921294e-05, + "loss": 6.9252, + "step": 15780 + }, + { + "epoch": 0.9065508200486097, + "grad_norm": 1.609375, + "learning_rate": 5.9933416935801466e-05, + "loss": 6.9102, + "step": 15790 + }, + { + "epoch": 0.9071249497636499, + "grad_norm": 1.53125, + "learning_rate": 5.9887849570050016e-05, + "loss": 6.9262, + "step": 15800 + }, + { + "epoch": 0.9076990794786902, + "grad_norm": 1.6875, + "learning_rate": 5.984227365606078e-05, + "loss": 6.877, + "step": 15810 + }, + { + "epoch": 0.9082732091937306, + "grad_norm": 1.703125, + "learning_rate": 5.979668923323503e-05, + "loss": 6.9007, + "step": 15820 + }, + { + "epoch": 0.9088473389087708, + "grad_norm": 1.609375, + "learning_rate": 5.9751096340981373e-05, + "loss": 6.9126, + "step": 15830 + }, + { + "epoch": 0.9094214686238111, + "grad_norm": 1.7265625, + "learning_rate": 5.970549501871578e-05, + "loss": 6.8955, + "step": 15840 + }, + { + "epoch": 0.9099955983388514, + "grad_norm": 1.515625, + "learning_rate": 5.9659885305861476e-05, + "loss": 6.9163, + "step": 15850 + }, + { + "epoch": 0.9105697280538917, + "grad_norm": 1.6171875, + "learning_rate": 5.961426724184892e-05, + "loss": 6.9319, + "step": 15860 + }, + { + "epoch": 0.9111438577689319, + "grad_norm": 1.5546875, + "learning_rate": 5.956864086611581e-05, + "loss": 6.9329, + "step": 15870 + }, + { + "epoch": 0.9117179874839723, + "grad_norm": 1.765625, + "learning_rate": 5.952300621810707e-05, + "loss": 6.9002, + "step": 15880 + }, + { + "epoch": 0.9122921171990125, + "grad_norm": 1.515625, + "learning_rate": 5.9477363337274696e-05, + "loss": 6.8949, + "step": 15890 + }, + { + "epoch": 0.9128662469140528, + "grad_norm": 1.703125, + "learning_rate": 5.9431712263077864e-05, + "loss": 6.9267, + "step": 15900 + }, + { + "epoch": 0.913440376629093, + "grad_norm": 1.6328125, + "learning_rate": 5.938605303498282e-05, + "loss": 6.9459, + "step": 15910 + }, + { + "epoch": 0.9140145063441334, + "grad_norm": 1.5703125, + "learning_rate": 5.934038569246283e-05, + "loss": 6.9093, + "step": 15920 + }, + { + "epoch": 0.9145886360591736, + "grad_norm": 1.640625, + "learning_rate": 5.929471027499822e-05, + "loss": 6.8726, + "step": 15930 + }, + { + "epoch": 0.915162765774214, + "grad_norm": 1.5859375, + "learning_rate": 5.924902682207627e-05, + "loss": 6.8642, + "step": 15940 + }, + { + "epoch": 0.9157368954892542, + "grad_norm": 1.6484375, + "learning_rate": 5.9203335373191195e-05, + "loss": 6.9166, + "step": 15950 + }, + { + "epoch": 0.9163110252042945, + "grad_norm": 1.6171875, + "learning_rate": 5.915763596784415e-05, + "loss": 6.9005, + "step": 15960 + }, + { + "epoch": 0.9168851549193348, + "grad_norm": 1.65625, + "learning_rate": 5.911192864554317e-05, + "loss": 6.912, + "step": 15970 + }, + { + "epoch": 0.9174592846343751, + "grad_norm": 1.59375, + "learning_rate": 5.90662134458031e-05, + "loss": 6.9025, + "step": 15980 + }, + { + "epoch": 0.9180334143494153, + "grad_norm": 1.5703125, + "learning_rate": 5.902049040814563e-05, + "loss": 6.9095, + "step": 15990 + }, + { + "epoch": 0.9186075440644557, + "grad_norm": 1.6796875, + "learning_rate": 5.89747595720992e-05, + "loss": 6.8551, + "step": 16000 + }, + { + "epoch": 0.9191816737794959, + "grad_norm": 1.5625, + "learning_rate": 5.8929020977199034e-05, + "loss": 6.9176, + "step": 16010 + }, + { + "epoch": 0.9197558034945362, + "grad_norm": 1.59375, + "learning_rate": 5.888327466298701e-05, + "loss": 6.9277, + "step": 16020 + }, + { + "epoch": 0.9203299332095765, + "grad_norm": 1.6875, + "learning_rate": 5.88375206690117e-05, + "loss": 6.8961, + "step": 16030 + }, + { + "epoch": 0.9209040629246168, + "grad_norm": 1.6953125, + "learning_rate": 5.879175903482833e-05, + "loss": 6.9116, + "step": 16040 + }, + { + "epoch": 0.921478192639657, + "grad_norm": 1.5703125, + "learning_rate": 5.874598979999873e-05, + "loss": 6.8972, + "step": 16050 + }, + { + "epoch": 0.9220523223546974, + "grad_norm": 1.578125, + "learning_rate": 5.870021300409128e-05, + "loss": 6.9153, + "step": 16060 + }, + { + "epoch": 0.9226264520697376, + "grad_norm": 1.640625, + "learning_rate": 5.8654428686680905e-05, + "loss": 6.8592, + "step": 16070 + }, + { + "epoch": 0.9232005817847779, + "grad_norm": 1.625, + "learning_rate": 5.860863688734903e-05, + "loss": 6.9135, + "step": 16080 + }, + { + "epoch": 0.9237747114998182, + "grad_norm": 1.5078125, + "learning_rate": 5.8562837645683575e-05, + "loss": 6.9168, + "step": 16090 + }, + { + "epoch": 0.9243488412148585, + "grad_norm": 1.5234375, + "learning_rate": 5.851703100127886e-05, + "loss": 6.8931, + "step": 16100 + }, + { + "epoch": 0.9249229709298987, + "grad_norm": 1.609375, + "learning_rate": 5.8471216993735626e-05, + "loss": 6.9196, + "step": 16110 + }, + { + "epoch": 0.9254971006449391, + "grad_norm": 1.5625, + "learning_rate": 5.842539566266095e-05, + "loss": 6.8941, + "step": 16120 + }, + { + "epoch": 0.9260712303599793, + "grad_norm": 1.5859375, + "learning_rate": 5.837956704766829e-05, + "loss": 6.9126, + "step": 16130 + }, + { + "epoch": 0.9266453600750196, + "grad_norm": 1.578125, + "learning_rate": 5.833373118837734e-05, + "loss": 6.9539, + "step": 16140 + }, + { + "epoch": 0.9272194897900599, + "grad_norm": 1.6015625, + "learning_rate": 5.8287888124414126e-05, + "loss": 6.8976, + "step": 16150 + }, + { + "epoch": 0.9277936195051002, + "grad_norm": 1.6875, + "learning_rate": 5.824203789541085e-05, + "loss": 6.8622, + "step": 16160 + }, + { + "epoch": 0.9283677492201404, + "grad_norm": 1.5625, + "learning_rate": 5.819618054100591e-05, + "loss": 6.9183, + "step": 16170 + }, + { + "epoch": 0.9289418789351808, + "grad_norm": 1.6015625, + "learning_rate": 5.8150316100843895e-05, + "loss": 6.9262, + "step": 16180 + }, + { + "epoch": 0.929516008650221, + "grad_norm": 1.6328125, + "learning_rate": 5.81044446145755e-05, + "loss": 6.9072, + "step": 16190 + }, + { + "epoch": 0.9300901383652613, + "grad_norm": 1.609375, + "learning_rate": 5.8058566121857514e-05, + "loss": 6.8663, + "step": 16200 + }, + { + "epoch": 0.9306642680803016, + "grad_norm": 1.6328125, + "learning_rate": 5.8012680662352795e-05, + "loss": 6.9084, + "step": 16210 + }, + { + "epoch": 0.9312383977953419, + "grad_norm": 1.546875, + "learning_rate": 5.796678827573018e-05, + "loss": 6.8419, + "step": 16220 + }, + { + "epoch": 0.9318125275103821, + "grad_norm": 1.6328125, + "learning_rate": 5.792088900166457e-05, + "loss": 6.8997, + "step": 16230 + }, + { + "epoch": 0.9323866572254225, + "grad_norm": 1.59375, + "learning_rate": 5.7874982879836746e-05, + "loss": 6.8838, + "step": 16240 + }, + { + "epoch": 0.9329607869404627, + "grad_norm": 1.65625, + "learning_rate": 5.7829069949933464e-05, + "loss": 6.9244, + "step": 16250 + }, + { + "epoch": 0.933534916655503, + "grad_norm": 1.578125, + "learning_rate": 5.778315025164731e-05, + "loss": 6.9197, + "step": 16260 + }, + { + "epoch": 0.9341090463705434, + "grad_norm": 1.6640625, + "learning_rate": 5.77372238246768e-05, + "loss": 6.8825, + "step": 16270 + }, + { + "epoch": 0.9346831760855836, + "grad_norm": 1.5703125, + "learning_rate": 5.769129070872619e-05, + "loss": 6.9021, + "step": 16280 + }, + { + "epoch": 0.9352573058006239, + "grad_norm": 1.6796875, + "learning_rate": 5.7645350943505547e-05, + "loss": 6.9201, + "step": 16290 + }, + { + "epoch": 0.9358314355156642, + "grad_norm": 1.5703125, + "learning_rate": 5.759940456873071e-05, + "loss": 6.8906, + "step": 16300 + }, + { + "epoch": 0.9364055652307045, + "grad_norm": 1.65625, + "learning_rate": 5.755345162412318e-05, + "loss": 6.9528, + "step": 16310 + }, + { + "epoch": 0.9369796949457447, + "grad_norm": 1.609375, + "learning_rate": 5.7507492149410204e-05, + "loss": 6.8871, + "step": 16320 + }, + { + "epoch": 0.9375538246607851, + "grad_norm": 1.5390625, + "learning_rate": 5.746152618432462e-05, + "loss": 6.9257, + "step": 16330 + }, + { + "epoch": 0.9381279543758253, + "grad_norm": 1.5859375, + "learning_rate": 5.7415553768604904e-05, + "loss": 6.9333, + "step": 16340 + }, + { + "epoch": 0.9387020840908656, + "grad_norm": 1.671875, + "learning_rate": 5.736957494199509e-05, + "loss": 6.9548, + "step": 16350 + }, + { + "epoch": 0.9392762138059059, + "grad_norm": 1.4765625, + "learning_rate": 5.7323589744244765e-05, + "loss": 6.9107, + "step": 16360 + }, + { + "epoch": 0.9398503435209462, + "grad_norm": 1.546875, + "learning_rate": 5.727759821510904e-05, + "loss": 6.9093, + "step": 16370 + }, + { + "epoch": 0.9404244732359864, + "grad_norm": 1.5, + "learning_rate": 5.72316003943485e-05, + "loss": 6.9069, + "step": 16380 + }, + { + "epoch": 0.9409986029510268, + "grad_norm": 1.609375, + "learning_rate": 5.7185596321729106e-05, + "loss": 6.9084, + "step": 16390 + }, + { + "epoch": 0.941572732666067, + "grad_norm": 1.6328125, + "learning_rate": 5.71395860370223e-05, + "loss": 6.8798, + "step": 16400 + }, + { + "epoch": 0.9421468623811073, + "grad_norm": 1.609375, + "learning_rate": 5.7093569580004855e-05, + "loss": 6.9022, + "step": 16410 + }, + { + "epoch": 0.9427209920961476, + "grad_norm": 1.59375, + "learning_rate": 5.704754699045891e-05, + "loss": 6.9142, + "step": 16420 + }, + { + "epoch": 0.9432951218111879, + "grad_norm": 1.5546875, + "learning_rate": 5.700151830817187e-05, + "loss": 6.9078, + "step": 16430 + }, + { + "epoch": 0.9438692515262281, + "grad_norm": 1.625, + "learning_rate": 5.695548357293642e-05, + "loss": 6.966, + "step": 16440 + }, + { + "epoch": 0.9444433812412685, + "grad_norm": 1.59375, + "learning_rate": 5.690944282455049e-05, + "loss": 6.8658, + "step": 16450 + }, + { + "epoch": 0.9450175109563087, + "grad_norm": 1.640625, + "learning_rate": 5.6863396102817214e-05, + "loss": 6.8941, + "step": 16460 + }, + { + "epoch": 0.945591640671349, + "grad_norm": 1.6484375, + "learning_rate": 5.681734344754486e-05, + "loss": 6.895, + "step": 16470 + }, + { + "epoch": 0.9461657703863893, + "grad_norm": 1.6328125, + "learning_rate": 5.677128489854684e-05, + "loss": 6.897, + "step": 16480 + }, + { + "epoch": 0.9467399001014296, + "grad_norm": 1.5234375, + "learning_rate": 5.672522049564165e-05, + "loss": 6.872, + "step": 16490 + }, + { + "epoch": 0.9473140298164698, + "grad_norm": 1.640625, + "learning_rate": 5.6679150278652895e-05, + "loss": 6.9132, + "step": 16500 + }, + { + "epoch": 0.9478881595315102, + "grad_norm": 1.6171875, + "learning_rate": 5.6633074287409135e-05, + "loss": 6.9661, + "step": 16510 + }, + { + "epoch": 0.9484622892465504, + "grad_norm": 1.578125, + "learning_rate": 5.6586992561744e-05, + "loss": 6.9056, + "step": 16520 + }, + { + "epoch": 0.9490364189615907, + "grad_norm": 1.640625, + "learning_rate": 5.654090514149598e-05, + "loss": 6.9056, + "step": 16530 + }, + { + "epoch": 0.949610548676631, + "grad_norm": 1.5625, + "learning_rate": 5.649481206650859e-05, + "loss": 6.8932, + "step": 16540 + }, + { + "epoch": 0.9501846783916713, + "grad_norm": 1.578125, + "learning_rate": 5.6448713376630194e-05, + "loss": 6.8895, + "step": 16550 + }, + { + "epoch": 0.9507588081067115, + "grad_norm": 1.5859375, + "learning_rate": 5.640260911171397e-05, + "loss": 6.9123, + "step": 16560 + }, + { + "epoch": 0.9513329378217519, + "grad_norm": 1.5546875, + "learning_rate": 5.635649931161794e-05, + "loss": 6.8596, + "step": 16570 + }, + { + "epoch": 0.9519070675367921, + "grad_norm": 1.796875, + "learning_rate": 5.6310384016204965e-05, + "loss": 6.899, + "step": 16580 + }, + { + "epoch": 0.9524811972518324, + "grad_norm": 1.65625, + "learning_rate": 5.6264263265342586e-05, + "loss": 6.9051, + "step": 16590 + }, + { + "epoch": 0.9530553269668727, + "grad_norm": 1.53125, + "learning_rate": 5.62181370989031e-05, + "loss": 6.8821, + "step": 16600 + }, + { + "epoch": 0.953629456681913, + "grad_norm": 1.625, + "learning_rate": 5.617200555676344e-05, + "loss": 6.8929, + "step": 16610 + }, + { + "epoch": 0.9542035863969532, + "grad_norm": 1.5859375, + "learning_rate": 5.612586867880525e-05, + "loss": 6.8953, + "step": 16620 + }, + { + "epoch": 0.9547777161119936, + "grad_norm": 1.5703125, + "learning_rate": 5.607972650491476e-05, + "loss": 6.9298, + "step": 16630 + }, + { + "epoch": 0.9553518458270338, + "grad_norm": 1.5390625, + "learning_rate": 5.6033579074982766e-05, + "loss": 6.8771, + "step": 16640 + }, + { + "epoch": 0.9559259755420741, + "grad_norm": 1.5625, + "learning_rate": 5.598742642890461e-05, + "loss": 6.9059, + "step": 16650 + }, + { + "epoch": 0.9565001052571144, + "grad_norm": 1.5546875, + "learning_rate": 5.5941268606580146e-05, + "loss": 6.9305, + "step": 16660 + }, + { + "epoch": 0.9570742349721547, + "grad_norm": 1.6015625, + "learning_rate": 5.5895105647913716e-05, + "loss": 6.9076, + "step": 16670 + }, + { + "epoch": 0.9576483646871949, + "grad_norm": 1.640625, + "learning_rate": 5.58489375928141e-05, + "loss": 6.9448, + "step": 16680 + }, + { + "epoch": 0.9582224944022353, + "grad_norm": 1.546875, + "learning_rate": 5.580276448119447e-05, + "loss": 6.8704, + "step": 16690 + }, + { + "epoch": 0.9587966241172755, + "grad_norm": 1.5625, + "learning_rate": 5.5756586352972374e-05, + "loss": 6.9062, + "step": 16700 + }, + { + "epoch": 0.9593707538323158, + "grad_norm": 1.6171875, + "learning_rate": 5.571040324806969e-05, + "loss": 6.9017, + "step": 16710 + }, + { + "epoch": 0.9599448835473562, + "grad_norm": 1.6328125, + "learning_rate": 5.566421520641263e-05, + "loss": 6.8891, + "step": 16720 + }, + { + "epoch": 0.9605190132623964, + "grad_norm": 1.609375, + "learning_rate": 5.561802226793165e-05, + "loss": 6.9133, + "step": 16730 + }, + { + "epoch": 0.9610931429774368, + "grad_norm": 1.609375, + "learning_rate": 5.557182447256142e-05, + "loss": 6.865, + "step": 16740 + }, + { + "epoch": 0.961667272692477, + "grad_norm": 1.59375, + "learning_rate": 5.552562186024084e-05, + "loss": 6.8862, + "step": 16750 + }, + { + "epoch": 0.9622414024075173, + "grad_norm": 1.5703125, + "learning_rate": 5.547941447091297e-05, + "loss": 6.9085, + "step": 16760 + }, + { + "epoch": 0.9628155321225575, + "grad_norm": 1.6328125, + "learning_rate": 5.5433202344525e-05, + "loss": 6.8982, + "step": 16770 + }, + { + "epoch": 0.9633896618375979, + "grad_norm": 1.6015625, + "learning_rate": 5.53869855210282e-05, + "loss": 6.9161, + "step": 16780 + }, + { + "epoch": 0.9639637915526381, + "grad_norm": 1.6328125, + "learning_rate": 5.53407640403779e-05, + "loss": 6.8866, + "step": 16790 + }, + { + "epoch": 0.9645379212676785, + "grad_norm": 1.5546875, + "learning_rate": 5.52945379425335e-05, + "loss": 6.9069, + "step": 16800 + }, + { + "epoch": 0.9651120509827187, + "grad_norm": 1.578125, + "learning_rate": 5.5248307267458334e-05, + "loss": 6.9002, + "step": 16810 + }, + { + "epoch": 0.965686180697759, + "grad_norm": 1.640625, + "learning_rate": 5.5202072055119715e-05, + "loss": 6.8604, + "step": 16820 + }, + { + "epoch": 0.9662603104127993, + "grad_norm": 1.5, + "learning_rate": 5.5155832345488875e-05, + "loss": 6.8671, + "step": 16830 + }, + { + "epoch": 0.9668344401278396, + "grad_norm": 1.71875, + "learning_rate": 5.510958817854097e-05, + "loss": 6.9308, + "step": 16840 + }, + { + "epoch": 0.9674085698428798, + "grad_norm": 1.546875, + "learning_rate": 5.506333959425497e-05, + "loss": 6.9129, + "step": 16850 + }, + { + "epoch": 0.9679826995579202, + "grad_norm": 1.625, + "learning_rate": 5.501708663261366e-05, + "loss": 6.8769, + "step": 16860 + }, + { + "epoch": 0.9685568292729604, + "grad_norm": 1.53125, + "learning_rate": 5.49708293336036e-05, + "loss": 6.8734, + "step": 16870 + }, + { + "epoch": 0.9691309589880007, + "grad_norm": 1.53125, + "learning_rate": 5.492456773721517e-05, + "loss": 6.9219, + "step": 16880 + }, + { + "epoch": 0.969705088703041, + "grad_norm": 1.5625, + "learning_rate": 5.4878301883442396e-05, + "loss": 6.8936, + "step": 16890 + }, + { + "epoch": 0.9702792184180813, + "grad_norm": 1.5625, + "learning_rate": 5.483203181228301e-05, + "loss": 6.8887, + "step": 16900 + }, + { + "epoch": 0.9708533481331215, + "grad_norm": 1.671875, + "learning_rate": 5.4785757563738396e-05, + "loss": 6.9107, + "step": 16910 + }, + { + "epoch": 0.9714274778481619, + "grad_norm": 1.625, + "learning_rate": 5.4739479177813516e-05, + "loss": 6.9171, + "step": 16920 + }, + { + "epoch": 0.9720016075632021, + "grad_norm": 1.59375, + "learning_rate": 5.469319669451692e-05, + "loss": 6.9316, + "step": 16930 + }, + { + "epoch": 0.9725757372782424, + "grad_norm": 1.5390625, + "learning_rate": 5.4646910153860764e-05, + "loss": 6.8953, + "step": 16940 + }, + { + "epoch": 0.9731498669932827, + "grad_norm": 1.71875, + "learning_rate": 5.460061959586063e-05, + "loss": 6.8968, + "step": 16950 + }, + { + "epoch": 0.973723996708323, + "grad_norm": 1.5625, + "learning_rate": 5.455432506053562e-05, + "loss": 6.8586, + "step": 16960 + }, + { + "epoch": 0.9742981264233632, + "grad_norm": 1.6484375, + "learning_rate": 5.450802658790821e-05, + "loss": 6.8524, + "step": 16970 + }, + { + "epoch": 0.9748722561384036, + "grad_norm": 1.6171875, + "learning_rate": 5.4461724218004386e-05, + "loss": 6.9294, + "step": 16980 + }, + { + "epoch": 0.9754463858534438, + "grad_norm": 1.546875, + "learning_rate": 5.441541799085341e-05, + "loss": 6.8962, + "step": 16990 + }, + { + "epoch": 0.9760205155684841, + "grad_norm": 1.625, + "learning_rate": 5.436910794648794e-05, + "loss": 6.8631, + "step": 17000 + }, + { + "epoch": 0.9765946452835244, + "grad_norm": 1.5859375, + "learning_rate": 5.432279412494386e-05, + "loss": 6.8757, + "step": 17010 + }, + { + "epoch": 0.9771687749985647, + "grad_norm": 1.5390625, + "learning_rate": 5.4276476566260426e-05, + "loss": 6.9315, + "step": 17020 + }, + { + "epoch": 0.9777429047136049, + "grad_norm": 1.609375, + "learning_rate": 5.423015531048003e-05, + "loss": 6.9108, + "step": 17030 + }, + { + "epoch": 0.9783170344286453, + "grad_norm": 1.546875, + "learning_rate": 5.418383039764833e-05, + "loss": 6.851, + "step": 17040 + }, + { + "epoch": 0.9788911641436855, + "grad_norm": 1.5625, + "learning_rate": 5.413750186781406e-05, + "loss": 6.8727, + "step": 17050 + }, + { + "epoch": 0.9794652938587258, + "grad_norm": 1.578125, + "learning_rate": 5.409116976102916e-05, + "loss": 6.8615, + "step": 17060 + }, + { + "epoch": 0.980039423573766, + "grad_norm": 1.6875, + "learning_rate": 5.4044834117348643e-05, + "loss": 6.8752, + "step": 17070 + }, + { + "epoch": 0.9806135532888064, + "grad_norm": 1.6640625, + "learning_rate": 5.3998494976830574e-05, + "loss": 6.8833, + "step": 17080 + }, + { + "epoch": 0.9811876830038466, + "grad_norm": 1.5859375, + "learning_rate": 5.395215237953601e-05, + "loss": 6.8956, + "step": 17090 + }, + { + "epoch": 0.981761812718887, + "grad_norm": 1.59375, + "learning_rate": 5.390580636552904e-05, + "loss": 6.8642, + "step": 17100 + }, + { + "epoch": 0.9823359424339272, + "grad_norm": 1.5234375, + "learning_rate": 5.385945697487672e-05, + "loss": 6.9134, + "step": 17110 + }, + { + "epoch": 0.9829100721489675, + "grad_norm": 1.578125, + "learning_rate": 5.3813104247648973e-05, + "loss": 6.8912, + "step": 17120 + }, + { + "epoch": 0.9834842018640078, + "grad_norm": 1.609375, + "learning_rate": 5.376674822391861e-05, + "loss": 6.8854, + "step": 17130 + }, + { + "epoch": 0.9840583315790481, + "grad_norm": 1.640625, + "learning_rate": 5.372038894376135e-05, + "loss": 6.9169, + "step": 17140 + }, + { + "epoch": 0.9846324612940883, + "grad_norm": 1.7265625, + "learning_rate": 5.367402644725566e-05, + "loss": 6.8841, + "step": 17150 + }, + { + "epoch": 0.9852065910091287, + "grad_norm": 1.5234375, + "learning_rate": 5.3627660774482846e-05, + "loss": 6.9321, + "step": 17160 + }, + { + "epoch": 0.985780720724169, + "grad_norm": 1.5390625, + "learning_rate": 5.3581291965526924e-05, + "loss": 6.9105, + "step": 17170 + }, + { + "epoch": 0.9863548504392092, + "grad_norm": 1.5, + "learning_rate": 5.353492006047461e-05, + "loss": 6.9344, + "step": 17180 + }, + { + "epoch": 0.9869289801542496, + "grad_norm": 1.6171875, + "learning_rate": 5.348854509941533e-05, + "loss": 6.8562, + "step": 17190 + }, + { + "epoch": 0.9875031098692898, + "grad_norm": 1.6640625, + "learning_rate": 5.3442167122441145e-05, + "loss": 6.9075, + "step": 17200 + }, + { + "epoch": 0.9880772395843301, + "grad_norm": 1.75, + "learning_rate": 5.33957861696467e-05, + "loss": 6.894, + "step": 17210 + }, + { + "epoch": 0.9886513692993704, + "grad_norm": 1.5859375, + "learning_rate": 5.3349402281129246e-05, + "loss": 6.8863, + "step": 17220 + }, + { + "epoch": 0.9892254990144107, + "grad_norm": 1.59375, + "learning_rate": 5.330301549698853e-05, + "loss": 6.8475, + "step": 17230 + }, + { + "epoch": 0.9897996287294509, + "grad_norm": 1.59375, + "learning_rate": 5.325662585732683e-05, + "loss": 6.8824, + "step": 17240 + }, + { + "epoch": 0.9903737584444913, + "grad_norm": 1.65625, + "learning_rate": 5.321023340224893e-05, + "loss": 6.8266, + "step": 17250 + }, + { + "epoch": 0.9909478881595315, + "grad_norm": 1.6796875, + "learning_rate": 5.316383817186196e-05, + "loss": 6.8925, + "step": 17260 + }, + { + "epoch": 0.9915220178745718, + "grad_norm": 1.671875, + "learning_rate": 5.3117440206275504e-05, + "loss": 6.916, + "step": 17270 + }, + { + "epoch": 0.9920961475896121, + "grad_norm": 1.5859375, + "learning_rate": 5.307103954560153e-05, + "loss": 6.9125, + "step": 17280 + }, + { + "epoch": 0.9926702773046524, + "grad_norm": 1.484375, + "learning_rate": 5.302463622995429e-05, + "loss": 6.8616, + "step": 17290 + }, + { + "epoch": 0.9932444070196926, + "grad_norm": 1.5859375, + "learning_rate": 5.297823029945036e-05, + "loss": 6.9259, + "step": 17300 + }, + { + "epoch": 0.993818536734733, + "grad_norm": 1.546875, + "learning_rate": 5.293182179420855e-05, + "loss": 6.8977, + "step": 17310 + }, + { + "epoch": 0.9943926664497732, + "grad_norm": 1.609375, + "learning_rate": 5.288541075434992e-05, + "loss": 6.9003, + "step": 17320 + }, + { + "epoch": 0.9949667961648135, + "grad_norm": 1.671875, + "learning_rate": 5.283899721999772e-05, + "loss": 6.8966, + "step": 17330 + }, + { + "epoch": 0.9955409258798538, + "grad_norm": 1.5703125, + "learning_rate": 5.279258123127735e-05, + "loss": 6.9307, + "step": 17340 + }, + { + "epoch": 0.9961150555948941, + "grad_norm": 1.6171875, + "learning_rate": 5.2746162828316334e-05, + "loss": 6.9119, + "step": 17350 + }, + { + "epoch": 0.9966891853099343, + "grad_norm": 1.5703125, + "learning_rate": 5.269974205124426e-05, + "loss": 6.8631, + "step": 17360 + }, + { + "epoch": 0.9972633150249747, + "grad_norm": 1.578125, + "learning_rate": 5.265331894019283e-05, + "loss": 6.8766, + "step": 17370 + }, + { + "epoch": 0.9978374447400149, + "grad_norm": 1.546875, + "learning_rate": 5.260689353529571e-05, + "loss": 6.8461, + "step": 17380 + }, + { + "epoch": 0.9984115744550552, + "grad_norm": 1.609375, + "learning_rate": 5.256046587668855e-05, + "loss": 6.8429, + "step": 17390 + }, + { + "epoch": 0.9989857041700955, + "grad_norm": 1.671875, + "learning_rate": 5.251403600450895e-05, + "loss": 6.8993, + "step": 17400 + }, + { + "epoch": 0.9995598338851358, + "grad_norm": 1.5546875, + "learning_rate": 5.246760395889646e-05, + "loss": 6.8517, + "step": 17410 + }, + { + "epoch": 1.0001148259430082, + "grad_norm": 1.5078125, + "learning_rate": 5.2421169779992486e-05, + "loss": 6.8916, + "step": 17420 + }, + { + "epoch": 1.0006889556580483, + "grad_norm": 1.6015625, + "learning_rate": 5.237473350794026e-05, + "loss": 6.8312, + "step": 17430 + }, + { + "epoch": 1.0012630853730886, + "grad_norm": 1.59375, + "learning_rate": 5.232829518288487e-05, + "loss": 6.8256, + "step": 17440 + }, + { + "epoch": 1.001837215088129, + "grad_norm": 1.5390625, + "learning_rate": 5.22818548449731e-05, + "loss": 6.8157, + "step": 17450 + }, + { + "epoch": 1.0024113448031693, + "grad_norm": 1.578125, + "learning_rate": 5.223541253435356e-05, + "loss": 6.8387, + "step": 17460 + }, + { + "epoch": 1.0029854745182094, + "grad_norm": 1.7421875, + "learning_rate": 5.2188968291176524e-05, + "loss": 6.7973, + "step": 17470 + }, + { + "epoch": 1.0035596042332497, + "grad_norm": 1.5703125, + "learning_rate": 5.214252215559393e-05, + "loss": 6.8648, + "step": 17480 + }, + { + "epoch": 1.00413373394829, + "grad_norm": 1.640625, + "learning_rate": 5.209607416775937e-05, + "loss": 6.8186, + "step": 17490 + }, + { + "epoch": 1.0047078636633304, + "grad_norm": 1.6484375, + "learning_rate": 5.204962436782802e-05, + "loss": 6.8089, + "step": 17500 + }, + { + "epoch": 1.0052819933783705, + "grad_norm": 1.53125, + "learning_rate": 5.200317279595666e-05, + "loss": 6.7777, + "step": 17510 + }, + { + "epoch": 1.0058561230934109, + "grad_norm": 1.5625, + "learning_rate": 5.1956719492303554e-05, + "loss": 6.7948, + "step": 17520 + }, + { + "epoch": 1.0064302528084512, + "grad_norm": 1.6015625, + "learning_rate": 5.191026449702848e-05, + "loss": 6.8556, + "step": 17530 + }, + { + "epoch": 1.0070043825234916, + "grad_norm": 1.5546875, + "learning_rate": 5.186380785029269e-05, + "loss": 6.8099, + "step": 17540 + }, + { + "epoch": 1.0075785122385317, + "grad_norm": 1.53125, + "learning_rate": 5.181734959225886e-05, + "loss": 6.8332, + "step": 17550 + }, + { + "epoch": 1.008152641953572, + "grad_norm": 1.609375, + "learning_rate": 5.177088976309106e-05, + "loss": 6.8506, + "step": 17560 + }, + { + "epoch": 1.0087267716686124, + "grad_norm": 1.5859375, + "learning_rate": 5.1724428402954694e-05, + "loss": 6.805, + "step": 17570 + }, + { + "epoch": 1.0093009013836527, + "grad_norm": 1.6953125, + "learning_rate": 5.1677965552016515e-05, + "loss": 6.8179, + "step": 17580 + }, + { + "epoch": 1.0098750310986928, + "grad_norm": 1.5625, + "learning_rate": 5.163150125044458e-05, + "loss": 6.8217, + "step": 17590 + }, + { + "epoch": 1.0104491608137331, + "grad_norm": 1.5390625, + "learning_rate": 5.1585035538408155e-05, + "loss": 6.7778, + "step": 17600 + }, + { + "epoch": 1.0110232905287735, + "grad_norm": 1.5078125, + "learning_rate": 5.153856845607776e-05, + "loss": 6.8724, + "step": 17610 + }, + { + "epoch": 1.0115974202438138, + "grad_norm": 1.5859375, + "learning_rate": 5.149210004362508e-05, + "loss": 6.8117, + "step": 17620 + }, + { + "epoch": 1.012171549958854, + "grad_norm": 1.609375, + "learning_rate": 5.1445630341222984e-05, + "loss": 6.7884, + "step": 17630 + }, + { + "epoch": 1.0127456796738943, + "grad_norm": 1.5703125, + "learning_rate": 5.1399159389045406e-05, + "loss": 6.8375, + "step": 17640 + }, + { + "epoch": 1.0133198093889346, + "grad_norm": 1.6171875, + "learning_rate": 5.1352687227267395e-05, + "loss": 6.8501, + "step": 17650 + }, + { + "epoch": 1.013893939103975, + "grad_norm": 1.53125, + "learning_rate": 5.1306213896065024e-05, + "loss": 6.8114, + "step": 17660 + }, + { + "epoch": 1.014468068819015, + "grad_norm": 1.5859375, + "learning_rate": 5.12597394356154e-05, + "loss": 6.8388, + "step": 17670 + }, + { + "epoch": 1.0150421985340554, + "grad_norm": 1.5546875, + "learning_rate": 5.121326388609661e-05, + "loss": 6.8205, + "step": 17680 + }, + { + "epoch": 1.0156163282490958, + "grad_norm": 1.59375, + "learning_rate": 5.116678728768764e-05, + "loss": 6.8352, + "step": 17690 + }, + { + "epoch": 1.016190457964136, + "grad_norm": 1.609375, + "learning_rate": 5.112030968056843e-05, + "loss": 6.8312, + "step": 17700 + }, + { + "epoch": 1.0167645876791762, + "grad_norm": 1.546875, + "learning_rate": 5.107383110491978e-05, + "loss": 6.8543, + "step": 17710 + }, + { + "epoch": 1.0173387173942166, + "grad_norm": 1.625, + "learning_rate": 5.102735160092329e-05, + "loss": 6.7808, + "step": 17720 + }, + { + "epoch": 1.017912847109257, + "grad_norm": 2.125, + "learning_rate": 5.0980871208761427e-05, + "loss": 6.815, + "step": 17730 + }, + { + "epoch": 1.0184869768242972, + "grad_norm": 1.671875, + "learning_rate": 5.0934389968617366e-05, + "loss": 6.8204, + "step": 17740 + }, + { + "epoch": 1.0190611065393373, + "grad_norm": 1.5859375, + "learning_rate": 5.088790792067506e-05, + "loss": 6.8204, + "step": 17750 + }, + { + "epoch": 1.0196352362543777, + "grad_norm": 1.578125, + "learning_rate": 5.084142510511911e-05, + "loss": 6.7875, + "step": 17760 + }, + { + "epoch": 1.020209365969418, + "grad_norm": 1.5703125, + "learning_rate": 5.079494156213485e-05, + "loss": 6.8449, + "step": 17770 + }, + { + "epoch": 1.0207834956844584, + "grad_norm": 1.5234375, + "learning_rate": 5.0748457331908186e-05, + "loss": 6.7516, + "step": 17780 + }, + { + "epoch": 1.0213576253994985, + "grad_norm": 1.53125, + "learning_rate": 5.070197245462564e-05, + "loss": 6.7899, + "step": 17790 + }, + { + "epoch": 1.0219317551145388, + "grad_norm": 1.578125, + "learning_rate": 5.065548697047429e-05, + "loss": 6.7944, + "step": 17800 + }, + { + "epoch": 1.0225058848295792, + "grad_norm": 1.5625, + "learning_rate": 5.060900091964174e-05, + "loss": 6.7772, + "step": 17810 + }, + { + "epoch": 1.0230800145446195, + "grad_norm": 1.5859375, + "learning_rate": 5.056251434231607e-05, + "loss": 6.8124, + "step": 17820 + }, + { + "epoch": 1.0236541442596598, + "grad_norm": 1.5234375, + "learning_rate": 5.051602727868585e-05, + "loss": 6.7964, + "step": 17830 + }, + { + "epoch": 1.0242282739747, + "grad_norm": 1.5859375, + "learning_rate": 5.046953976894002e-05, + "loss": 6.8475, + "step": 17840 + }, + { + "epoch": 1.0248024036897403, + "grad_norm": 1.59375, + "learning_rate": 5.0423051853267965e-05, + "loss": 6.8388, + "step": 17850 + }, + { + "epoch": 1.0253765334047806, + "grad_norm": 1.5859375, + "learning_rate": 5.037656357185938e-05, + "loss": 6.824, + "step": 17860 + }, + { + "epoch": 1.025950663119821, + "grad_norm": 1.5, + "learning_rate": 5.0330074964904273e-05, + "loss": 6.7823, + "step": 17870 + }, + { + "epoch": 1.026524792834861, + "grad_norm": 1.640625, + "learning_rate": 5.028358607259297e-05, + "loss": 6.8141, + "step": 17880 + }, + { + "epoch": 1.0270989225499014, + "grad_norm": 1.5390625, + "learning_rate": 5.023709693511598e-05, + "loss": 6.8081, + "step": 17890 + }, + { + "epoch": 1.0276730522649418, + "grad_norm": 1.5703125, + "learning_rate": 5.019060759266411e-05, + "loss": 6.7904, + "step": 17900 + }, + { + "epoch": 1.028247181979982, + "grad_norm": 1.5234375, + "learning_rate": 5.014411808542827e-05, + "loss": 6.8179, + "step": 17910 + }, + { + "epoch": 1.0288213116950222, + "grad_norm": 1.5625, + "learning_rate": 5.009762845359954e-05, + "loss": 6.7808, + "step": 17920 + }, + { + "epoch": 1.0293954414100626, + "grad_norm": 1.4765625, + "learning_rate": 5.005113873736914e-05, + "loss": 6.784, + "step": 17930 + }, + { + "epoch": 1.029969571125103, + "grad_norm": 1.7890625, + "learning_rate": 5.000464897692829e-05, + "loss": 6.7981, + "step": 17940 + }, + { + "epoch": 1.0305437008401432, + "grad_norm": 1.5, + "learning_rate": 4.9958159212468335e-05, + "loss": 6.8296, + "step": 17950 + }, + { + "epoch": 1.0311178305551834, + "grad_norm": 1.46875, + "learning_rate": 4.991166948418054e-05, + "loss": 6.836, + "step": 17960 + }, + { + "epoch": 1.0316919602702237, + "grad_norm": 1.5703125, + "learning_rate": 4.9865179832256174e-05, + "loss": 6.8217, + "step": 17970 + }, + { + "epoch": 1.032266089985264, + "grad_norm": 1.53125, + "learning_rate": 4.9818690296886475e-05, + "loss": 6.86, + "step": 17980 + }, + { + "epoch": 1.0328402197003044, + "grad_norm": 1.7421875, + "learning_rate": 4.977220091826253e-05, + "loss": 6.8151, + "step": 17990 + }, + { + "epoch": 1.0334143494153445, + "grad_norm": 1.53125, + "learning_rate": 4.972571173657531e-05, + "loss": 6.7877, + "step": 18000 + }, + { + "epoch": 1.0339884791303848, + "grad_norm": 1.4765625, + "learning_rate": 4.96792227920156e-05, + "loss": 6.7953, + "step": 18010 + }, + { + "epoch": 1.0345626088454252, + "grad_norm": 1.578125, + "learning_rate": 4.963273412477403e-05, + "loss": 6.8301, + "step": 18020 + }, + { + "epoch": 1.0351367385604655, + "grad_norm": 1.640625, + "learning_rate": 4.9586245775040926e-05, + "loss": 6.8222, + "step": 18030 + }, + { + "epoch": 1.0357108682755056, + "grad_norm": 1.6015625, + "learning_rate": 4.9539757783006376e-05, + "loss": 6.805, + "step": 18040 + }, + { + "epoch": 1.036284997990546, + "grad_norm": 1.5625, + "learning_rate": 4.949327018886013e-05, + "loss": 6.778, + "step": 18050 + }, + { + "epoch": 1.0368591277055863, + "grad_norm": 1.5859375, + "learning_rate": 4.944678303279166e-05, + "loss": 6.7922, + "step": 18060 + }, + { + "epoch": 1.0374332574206266, + "grad_norm": 1.515625, + "learning_rate": 4.9400296354989974e-05, + "loss": 6.8341, + "step": 18070 + }, + { + "epoch": 1.0380073871356668, + "grad_norm": 1.6484375, + "learning_rate": 4.935381019564374e-05, + "loss": 6.853, + "step": 18080 + }, + { + "epoch": 1.038581516850707, + "grad_norm": 1.6171875, + "learning_rate": 4.930732459494113e-05, + "loss": 6.8091, + "step": 18090 + }, + { + "epoch": 1.0391556465657474, + "grad_norm": 1.5625, + "learning_rate": 4.926083959306984e-05, + "loss": 6.8279, + "step": 18100 + }, + { + "epoch": 1.0397297762807878, + "grad_norm": 1.578125, + "learning_rate": 4.9214355230217054e-05, + "loss": 6.844, + "step": 18110 + }, + { + "epoch": 1.040303905995828, + "grad_norm": 1.6171875, + "learning_rate": 4.916787154656942e-05, + "loss": 6.8196, + "step": 18120 + }, + { + "epoch": 1.0408780357108682, + "grad_norm": 1.546875, + "learning_rate": 4.912138858231297e-05, + "loss": 6.8181, + "step": 18130 + }, + { + "epoch": 1.0414521654259086, + "grad_norm": 1.609375, + "learning_rate": 4.907490637763314e-05, + "loss": 6.7709, + "step": 18140 + }, + { + "epoch": 1.042026295140949, + "grad_norm": 1.5859375, + "learning_rate": 4.90284249727147e-05, + "loss": 6.7914, + "step": 18150 + }, + { + "epoch": 1.042600424855989, + "grad_norm": 1.5546875, + "learning_rate": 4.8981944407741704e-05, + "loss": 6.7891, + "step": 18160 + }, + { + "epoch": 1.0431745545710294, + "grad_norm": 1.5859375, + "learning_rate": 4.8935464722897525e-05, + "loss": 6.8074, + "step": 18170 + }, + { + "epoch": 1.0437486842860697, + "grad_norm": 1.5703125, + "learning_rate": 4.888898595836475e-05, + "loss": 6.8193, + "step": 18180 + }, + { + "epoch": 1.04432281400111, + "grad_norm": 1.5234375, + "learning_rate": 4.884250815432516e-05, + "loss": 6.8144, + "step": 18190 + }, + { + "epoch": 1.0448969437161502, + "grad_norm": 1.625, + "learning_rate": 4.879603135095972e-05, + "loss": 6.8019, + "step": 18200 + }, + { + "epoch": 1.0454710734311905, + "grad_norm": 1.546875, + "learning_rate": 4.874955558844854e-05, + "loss": 6.8075, + "step": 18210 + }, + { + "epoch": 1.0460452031462308, + "grad_norm": 1.6015625, + "learning_rate": 4.870308090697082e-05, + "loss": 6.799, + "step": 18220 + }, + { + "epoch": 1.0466193328612712, + "grad_norm": 1.546875, + "learning_rate": 4.8656607346704805e-05, + "loss": 6.7688, + "step": 18230 + }, + { + "epoch": 1.0471934625763115, + "grad_norm": 1.65625, + "learning_rate": 4.861013494782779e-05, + "loss": 6.8544, + "step": 18240 + }, + { + "epoch": 1.0477675922913516, + "grad_norm": 1.5703125, + "learning_rate": 4.8563663750516094e-05, + "loss": 6.7944, + "step": 18250 + }, + { + "epoch": 1.048341722006392, + "grad_norm": 1.5390625, + "learning_rate": 4.8517193794944945e-05, + "loss": 6.8, + "step": 18260 + }, + { + "epoch": 1.0489158517214323, + "grad_norm": 1.5625, + "learning_rate": 4.847072512128852e-05, + "loss": 6.8211, + "step": 18270 + }, + { + "epoch": 1.0494899814364727, + "grad_norm": 1.5859375, + "learning_rate": 4.8424257769719885e-05, + "loss": 6.8798, + "step": 18280 + }, + { + "epoch": 1.0500641111515128, + "grad_norm": 1.546875, + "learning_rate": 4.8377791780411e-05, + "loss": 6.7913, + "step": 18290 + }, + { + "epoch": 1.050638240866553, + "grad_norm": 1.5390625, + "learning_rate": 4.8331327193532585e-05, + "loss": 6.8235, + "step": 18300 + }, + { + "epoch": 1.0512123705815934, + "grad_norm": 1.6640625, + "learning_rate": 4.828486404925418e-05, + "loss": 6.822, + "step": 18310 + }, + { + "epoch": 1.0517865002966338, + "grad_norm": 1.515625, + "learning_rate": 4.823840238774408e-05, + "loss": 6.7757, + "step": 18320 + }, + { + "epoch": 1.052360630011674, + "grad_norm": 1.5546875, + "learning_rate": 4.8191942249169266e-05, + "loss": 6.8006, + "step": 18330 + }, + { + "epoch": 1.0529347597267142, + "grad_norm": 1.578125, + "learning_rate": 4.8145483673695476e-05, + "loss": 6.7836, + "step": 18340 + }, + { + "epoch": 1.0535088894417546, + "grad_norm": 1.6328125, + "learning_rate": 4.8099026701487024e-05, + "loss": 6.8307, + "step": 18350 + }, + { + "epoch": 1.054083019156795, + "grad_norm": 1.5859375, + "learning_rate": 4.8052571372706866e-05, + "loss": 6.7965, + "step": 18360 + }, + { + "epoch": 1.054657148871835, + "grad_norm": 1.515625, + "learning_rate": 4.8006117727516525e-05, + "loss": 6.8181, + "step": 18370 + }, + { + "epoch": 1.0552312785868754, + "grad_norm": 1.6171875, + "learning_rate": 4.79596658060761e-05, + "loss": 6.851, + "step": 18380 + }, + { + "epoch": 1.0558054083019157, + "grad_norm": 1.5625, + "learning_rate": 4.791321564854417e-05, + "loss": 6.8201, + "step": 18390 + }, + { + "epoch": 1.056379538016956, + "grad_norm": 1.609375, + "learning_rate": 4.786676729507779e-05, + "loss": 6.8427, + "step": 18400 + }, + { + "epoch": 1.0569536677319962, + "grad_norm": 1.640625, + "learning_rate": 4.7820320785832454e-05, + "loss": 6.7823, + "step": 18410 + }, + { + "epoch": 1.0575277974470365, + "grad_norm": 1.640625, + "learning_rate": 4.777387616096209e-05, + "loss": 6.7862, + "step": 18420 + }, + { + "epoch": 1.0581019271620768, + "grad_norm": 1.5546875, + "learning_rate": 4.772743346061898e-05, + "loss": 6.845, + "step": 18430 + }, + { + "epoch": 1.0586760568771172, + "grad_norm": 1.6015625, + "learning_rate": 4.768099272495373e-05, + "loss": 6.8106, + "step": 18440 + }, + { + "epoch": 1.0592501865921573, + "grad_norm": 1.5859375, + "learning_rate": 4.763455399411524e-05, + "loss": 6.8431, + "step": 18450 + }, + { + "epoch": 1.0598243163071976, + "grad_norm": 1.546875, + "learning_rate": 4.7588117308250725e-05, + "loss": 6.8614, + "step": 18460 + }, + { + "epoch": 1.060398446022238, + "grad_norm": 1.5625, + "learning_rate": 4.7541682707505584e-05, + "loss": 6.8251, + "step": 18470 + }, + { + "epoch": 1.0609725757372783, + "grad_norm": 1.5859375, + "learning_rate": 4.7495250232023435e-05, + "loss": 6.863, + "step": 18480 + }, + { + "epoch": 1.0615467054523184, + "grad_norm": 1.5078125, + "learning_rate": 4.744881992194605e-05, + "loss": 6.8241, + "step": 18490 + }, + { + "epoch": 1.0621208351673588, + "grad_norm": 1.5, + "learning_rate": 4.7402391817413324e-05, + "loss": 6.8399, + "step": 18500 + }, + { + "epoch": 1.0626949648823991, + "grad_norm": 1.6015625, + "learning_rate": 4.735596595856327e-05, + "loss": 6.8467, + "step": 18510 + }, + { + "epoch": 1.0632690945974395, + "grad_norm": 1.578125, + "learning_rate": 4.730954238553194e-05, + "loss": 6.8012, + "step": 18520 + }, + { + "epoch": 1.0638432243124796, + "grad_norm": 1.5390625, + "learning_rate": 4.7263121138453406e-05, + "loss": 6.7794, + "step": 18530 + }, + { + "epoch": 1.06441735402752, + "grad_norm": 1.53125, + "learning_rate": 4.721670225745974e-05, + "loss": 6.7886, + "step": 18540 + }, + { + "epoch": 1.0649914837425603, + "grad_norm": 1.5390625, + "learning_rate": 4.717028578268097e-05, + "loss": 6.8604, + "step": 18550 + }, + { + "epoch": 1.0655656134576006, + "grad_norm": 1.5625, + "learning_rate": 4.712387175424504e-05, + "loss": 6.7841, + "step": 18560 + }, + { + "epoch": 1.0661397431726407, + "grad_norm": 1.609375, + "learning_rate": 4.7077460212277775e-05, + "loss": 6.8609, + "step": 18570 + }, + { + "epoch": 1.066713872887681, + "grad_norm": 1.6328125, + "learning_rate": 4.703105119690283e-05, + "loss": 6.8164, + "step": 18580 + }, + { + "epoch": 1.0672880026027214, + "grad_norm": 1.65625, + "learning_rate": 4.6984644748241744e-05, + "loss": 6.817, + "step": 18590 + }, + { + "epoch": 1.0678621323177617, + "grad_norm": 1.640625, + "learning_rate": 4.693824090641375e-05, + "loss": 6.8193, + "step": 18600 + }, + { + "epoch": 1.0684362620328018, + "grad_norm": 1.5703125, + "learning_rate": 4.689183971153591e-05, + "loss": 6.7969, + "step": 18610 + }, + { + "epoch": 1.0690103917478422, + "grad_norm": 1.6015625, + "learning_rate": 4.6845441203722926e-05, + "loss": 6.8536, + "step": 18620 + }, + { + "epoch": 1.0695845214628825, + "grad_norm": 1.59375, + "learning_rate": 4.6799045423087225e-05, + "loss": 6.8249, + "step": 18630 + }, + { + "epoch": 1.0701586511779229, + "grad_norm": 1.59375, + "learning_rate": 4.675265240973883e-05, + "loss": 6.8164, + "step": 18640 + }, + { + "epoch": 1.0707327808929632, + "grad_norm": 1.4765625, + "learning_rate": 4.670626220378545e-05, + "loss": 6.8229, + "step": 18650 + }, + { + "epoch": 1.0713069106080033, + "grad_norm": 1.5390625, + "learning_rate": 4.665987484533229e-05, + "loss": 6.8113, + "step": 18660 + }, + { + "epoch": 1.0718810403230437, + "grad_norm": 1.5859375, + "learning_rate": 4.661349037448211e-05, + "loss": 6.8164, + "step": 18670 + }, + { + "epoch": 1.072455170038084, + "grad_norm": 1.5859375, + "learning_rate": 4.6567108831335196e-05, + "loss": 6.8221, + "step": 18680 + }, + { + "epoch": 1.073029299753124, + "grad_norm": 1.5546875, + "learning_rate": 4.65207302559893e-05, + "loss": 6.816, + "step": 18690 + }, + { + "epoch": 1.0736034294681644, + "grad_norm": 1.6875, + "learning_rate": 4.6474354688539596e-05, + "loss": 6.8139, + "step": 18700 + }, + { + "epoch": 1.0741775591832048, + "grad_norm": 1.6015625, + "learning_rate": 4.642798216907866e-05, + "loss": 6.8234, + "step": 18710 + }, + { + "epoch": 1.0747516888982451, + "grad_norm": 1.546875, + "learning_rate": 4.638161273769643e-05, + "loss": 6.8128, + "step": 18720 + }, + { + "epoch": 1.0753258186132855, + "grad_norm": 1.5078125, + "learning_rate": 4.633524643448017e-05, + "loss": 6.7798, + "step": 18730 + }, + { + "epoch": 1.0758999483283256, + "grad_norm": 1.5625, + "learning_rate": 4.6288883299514486e-05, + "loss": 6.8403, + "step": 18740 + }, + { + "epoch": 1.076474078043366, + "grad_norm": 1.59375, + "learning_rate": 4.624252337288117e-05, + "loss": 6.8104, + "step": 18750 + }, + { + "epoch": 1.0770482077584063, + "grad_norm": 1.6484375, + "learning_rate": 4.619616669465929e-05, + "loss": 6.8176, + "step": 18760 + }, + { + "epoch": 1.0776223374734466, + "grad_norm": 1.546875, + "learning_rate": 4.614981330492509e-05, + "loss": 6.8101, + "step": 18770 + }, + { + "epoch": 1.0781964671884867, + "grad_norm": 1.5859375, + "learning_rate": 4.6103463243751995e-05, + "loss": 6.7995, + "step": 18780 + }, + { + "epoch": 1.078770596903527, + "grad_norm": 1.5625, + "learning_rate": 4.6057116551210506e-05, + "loss": 6.8148, + "step": 18790 + }, + { + "epoch": 1.0793447266185674, + "grad_norm": 1.578125, + "learning_rate": 4.601077326736825e-05, + "loss": 6.8365, + "step": 18800 + }, + { + "epoch": 1.0799188563336077, + "grad_norm": 1.5625, + "learning_rate": 4.5964433432289885e-05, + "loss": 6.8396, + "step": 18810 + }, + { + "epoch": 1.0804929860486479, + "grad_norm": 1.5859375, + "learning_rate": 4.5918097086037116e-05, + "loss": 6.7852, + "step": 18820 + }, + { + "epoch": 1.0810671157636882, + "grad_norm": 1.546875, + "learning_rate": 4.5871764268668603e-05, + "loss": 6.7954, + "step": 18830 + }, + { + "epoch": 1.0816412454787285, + "grad_norm": 1.6640625, + "learning_rate": 4.582543502023996e-05, + "loss": 6.8054, + "step": 18840 + }, + { + "epoch": 1.0822153751937689, + "grad_norm": 1.6875, + "learning_rate": 4.577910938080372e-05, + "loss": 6.8172, + "step": 18850 + }, + { + "epoch": 1.082789504908809, + "grad_norm": 1.609375, + "learning_rate": 4.573278739040932e-05, + "loss": 6.826, + "step": 18860 + }, + { + "epoch": 1.0833636346238493, + "grad_norm": 1.546875, + "learning_rate": 4.568646908910299e-05, + "loss": 6.8217, + "step": 18870 + }, + { + "epoch": 1.0839377643388897, + "grad_norm": 1.59375, + "learning_rate": 4.564015451692782e-05, + "loss": 6.7828, + "step": 18880 + }, + { + "epoch": 1.08451189405393, + "grad_norm": 1.625, + "learning_rate": 4.5593843713923645e-05, + "loss": 6.7962, + "step": 18890 + }, + { + "epoch": 1.0850860237689701, + "grad_norm": 1.578125, + "learning_rate": 4.5547536720127045e-05, + "loss": 6.8068, + "step": 18900 + }, + { + "epoch": 1.0856601534840105, + "grad_norm": 1.5078125, + "learning_rate": 4.5501233575571337e-05, + "loss": 6.831, + "step": 18910 + }, + { + "epoch": 1.0862342831990508, + "grad_norm": 1.6171875, + "learning_rate": 4.545493432028648e-05, + "loss": 6.8396, + "step": 18920 + }, + { + "epoch": 1.0868084129140911, + "grad_norm": 1.6953125, + "learning_rate": 4.5408638994299066e-05, + "loss": 6.8095, + "step": 18930 + }, + { + "epoch": 1.0873825426291313, + "grad_norm": 1.6796875, + "learning_rate": 4.536234763763231e-05, + "loss": 6.7861, + "step": 18940 + }, + { + "epoch": 1.0879566723441716, + "grad_norm": 1.640625, + "learning_rate": 4.5316060290305996e-05, + "loss": 6.7677, + "step": 18950 + }, + { + "epoch": 1.088530802059212, + "grad_norm": 1.5703125, + "learning_rate": 4.526977699233643e-05, + "loss": 6.7941, + "step": 18960 + }, + { + "epoch": 1.0891049317742523, + "grad_norm": 1.578125, + "learning_rate": 4.522349778373641e-05, + "loss": 6.7881, + "step": 18970 + }, + { + "epoch": 1.0896790614892924, + "grad_norm": 1.5546875, + "learning_rate": 4.517722270451521e-05, + "loss": 6.8236, + "step": 18980 + }, + { + "epoch": 1.0902531912043327, + "grad_norm": 1.578125, + "learning_rate": 4.513095179467855e-05, + "loss": 6.8218, + "step": 18990 + }, + { + "epoch": 1.090827320919373, + "grad_norm": 1.640625, + "learning_rate": 4.5084685094228524e-05, + "loss": 6.8067, + "step": 19000 + }, + { + "epoch": 1.0914014506344134, + "grad_norm": 1.5859375, + "learning_rate": 4.503842264316359e-05, + "loss": 6.8452, + "step": 19010 + }, + { + "epoch": 1.0919755803494535, + "grad_norm": 1.546875, + "learning_rate": 4.499216448147852e-05, + "loss": 6.8308, + "step": 19020 + }, + { + "epoch": 1.0925497100644939, + "grad_norm": 1.6640625, + "learning_rate": 4.494591064916441e-05, + "loss": 6.8074, + "step": 19030 + }, + { + "epoch": 1.0931238397795342, + "grad_norm": 1.5859375, + "learning_rate": 4.489966118620859e-05, + "loss": 6.7477, + "step": 19040 + }, + { + "epoch": 1.0936979694945745, + "grad_norm": 1.5625, + "learning_rate": 4.485341613259462e-05, + "loss": 6.8247, + "step": 19050 + }, + { + "epoch": 1.0942720992096147, + "grad_norm": 1.6640625, + "learning_rate": 4.4807175528302234e-05, + "loss": 6.808, + "step": 19060 + }, + { + "epoch": 1.094846228924655, + "grad_norm": 1.5703125, + "learning_rate": 4.4760939413307355e-05, + "loss": 6.8517, + "step": 19070 + }, + { + "epoch": 1.0954203586396953, + "grad_norm": 1.578125, + "learning_rate": 4.4714707827581994e-05, + "loss": 6.8317, + "step": 19080 + }, + { + "epoch": 1.0959944883547357, + "grad_norm": 1.546875, + "learning_rate": 4.466848081109424e-05, + "loss": 6.833, + "step": 19090 + }, + { + "epoch": 1.0965686180697758, + "grad_norm": 1.625, + "learning_rate": 4.4622258403808226e-05, + "loss": 6.8285, + "step": 19100 + }, + { + "epoch": 1.0971427477848161, + "grad_norm": 1.546875, + "learning_rate": 4.4576040645684174e-05, + "loss": 6.8295, + "step": 19110 + }, + { + "epoch": 1.0977168774998565, + "grad_norm": 1.6171875, + "learning_rate": 4.452982757667821e-05, + "loss": 6.7981, + "step": 19120 + }, + { + "epoch": 1.0982910072148968, + "grad_norm": 1.5234375, + "learning_rate": 4.44836192367424e-05, + "loss": 6.8319, + "step": 19130 + }, + { + "epoch": 1.0988651369299371, + "grad_norm": 1.640625, + "learning_rate": 4.44374156658248e-05, + "loss": 6.8486, + "step": 19140 + }, + { + "epoch": 1.0994392666449773, + "grad_norm": 1.6328125, + "learning_rate": 4.439121690386926e-05, + "loss": 6.7847, + "step": 19150 + }, + { + "epoch": 1.1000133963600176, + "grad_norm": 1.578125, + "learning_rate": 4.434502299081551e-05, + "loss": 6.7814, + "step": 19160 + }, + { + "epoch": 1.100587526075058, + "grad_norm": 1.6328125, + "learning_rate": 4.429883396659908e-05, + "loss": 6.7948, + "step": 19170 + }, + { + "epoch": 1.101161655790098, + "grad_norm": 1.546875, + "learning_rate": 4.42526498711513e-05, + "loss": 6.7628, + "step": 19180 + }, + { + "epoch": 1.1017357855051384, + "grad_norm": 1.484375, + "learning_rate": 4.42064707443992e-05, + "loss": 6.8418, + "step": 19190 + }, + { + "epoch": 1.1023099152201787, + "grad_norm": 1.578125, + "learning_rate": 4.416029662626553e-05, + "loss": 6.7977, + "step": 19200 + }, + { + "epoch": 1.102884044935219, + "grad_norm": 1.6171875, + "learning_rate": 4.41141275566687e-05, + "loss": 6.7637, + "step": 19210 + }, + { + "epoch": 1.1034581746502594, + "grad_norm": 1.609375, + "learning_rate": 4.40679635755228e-05, + "loss": 6.8116, + "step": 19220 + }, + { + "epoch": 1.1040323043652995, + "grad_norm": 1.6171875, + "learning_rate": 4.4021804722737466e-05, + "loss": 6.8296, + "step": 19230 + }, + { + "epoch": 1.1046064340803399, + "grad_norm": 1.6015625, + "learning_rate": 4.3975651038217916e-05, + "loss": 6.7715, + "step": 19240 + }, + { + "epoch": 1.1051805637953802, + "grad_norm": 1.59375, + "learning_rate": 4.39295025618649e-05, + "loss": 6.821, + "step": 19250 + }, + { + "epoch": 1.1057546935104205, + "grad_norm": 1.46875, + "learning_rate": 4.3883359333574695e-05, + "loss": 6.8011, + "step": 19260 + }, + { + "epoch": 1.1063288232254607, + "grad_norm": 1.5859375, + "learning_rate": 4.3837221393239015e-05, + "loss": 6.8172, + "step": 19270 + }, + { + "epoch": 1.106902952940501, + "grad_norm": 1.4921875, + "learning_rate": 4.3791088780744984e-05, + "loss": 6.7852, + "step": 19280 + }, + { + "epoch": 1.1074770826555413, + "grad_norm": 1.5390625, + "learning_rate": 4.374496153597514e-05, + "loss": 6.8689, + "step": 19290 + }, + { + "epoch": 1.1080512123705817, + "grad_norm": 1.5390625, + "learning_rate": 4.3698839698807415e-05, + "loss": 6.7963, + "step": 19300 + }, + { + "epoch": 1.1086253420856218, + "grad_norm": 1.609375, + "learning_rate": 4.3652723309115e-05, + "loss": 6.7908, + "step": 19310 + }, + { + "epoch": 1.1091994718006621, + "grad_norm": 1.5703125, + "learning_rate": 4.360661240676642e-05, + "loss": 6.8105, + "step": 19320 + }, + { + "epoch": 1.1097736015157025, + "grad_norm": 1.609375, + "learning_rate": 4.3560507031625435e-05, + "loss": 6.7767, + "step": 19330 + }, + { + "epoch": 1.1103477312307428, + "grad_norm": 1.53125, + "learning_rate": 4.351440722355104e-05, + "loss": 6.7945, + "step": 19340 + }, + { + "epoch": 1.110921860945783, + "grad_norm": 1.6171875, + "learning_rate": 4.346831302239743e-05, + "loss": 6.8243, + "step": 19350 + }, + { + "epoch": 1.1114959906608233, + "grad_norm": 1.5859375, + "learning_rate": 4.342222446801392e-05, + "loss": 6.8118, + "step": 19360 + }, + { + "epoch": 1.1120701203758636, + "grad_norm": 1.5234375, + "learning_rate": 4.3376141600244957e-05, + "loss": 6.822, + "step": 19370 + }, + { + "epoch": 1.112644250090904, + "grad_norm": 1.6015625, + "learning_rate": 4.3330064458930076e-05, + "loss": 6.8116, + "step": 19380 + }, + { + "epoch": 1.113218379805944, + "grad_norm": 1.5078125, + "learning_rate": 4.328399308390387e-05, + "loss": 6.8233, + "step": 19390 + }, + { + "epoch": 1.1137925095209844, + "grad_norm": 1.6015625, + "learning_rate": 4.323792751499593e-05, + "loss": 6.8053, + "step": 19400 + }, + { + "epoch": 1.1143666392360247, + "grad_norm": 1.5234375, + "learning_rate": 4.3191867792030834e-05, + "loss": 6.8136, + "step": 19410 + }, + { + "epoch": 1.114940768951065, + "grad_norm": 1.7109375, + "learning_rate": 4.314581395482809e-05, + "loss": 6.7782, + "step": 19420 + }, + { + "epoch": 1.1155148986661052, + "grad_norm": 1.6015625, + "learning_rate": 4.309976604320217e-05, + "loss": 6.7782, + "step": 19430 + }, + { + "epoch": 1.1160890283811455, + "grad_norm": 1.59375, + "learning_rate": 4.305372409696236e-05, + "loss": 6.8324, + "step": 19440 + }, + { + "epoch": 1.1166631580961859, + "grad_norm": 1.546875, + "learning_rate": 4.300768815591282e-05, + "loss": 6.8208, + "step": 19450 + }, + { + "epoch": 1.1172372878112262, + "grad_norm": 1.59375, + "learning_rate": 4.296165825985251e-05, + "loss": 6.8429, + "step": 19460 + }, + { + "epoch": 1.1178114175262663, + "grad_norm": 1.5625, + "learning_rate": 4.2915634448575184e-05, + "loss": 6.8359, + "step": 19470 + }, + { + "epoch": 1.1183855472413067, + "grad_norm": 1.6171875, + "learning_rate": 4.2869616761869304e-05, + "loss": 6.818, + "step": 19480 + }, + { + "epoch": 1.118959676956347, + "grad_norm": 1.53125, + "learning_rate": 4.282360523951806e-05, + "loss": 6.7769, + "step": 19490 + }, + { + "epoch": 1.1195338066713874, + "grad_norm": 1.578125, + "learning_rate": 4.2777599921299304e-05, + "loss": 6.7761, + "step": 19500 + }, + { + "epoch": 1.1201079363864275, + "grad_norm": 1.5390625, + "learning_rate": 4.273160084698552e-05, + "loss": 6.7693, + "step": 19510 + }, + { + "epoch": 1.1206820661014678, + "grad_norm": 1.515625, + "learning_rate": 4.268560805634382e-05, + "loss": 6.7573, + "step": 19520 + }, + { + "epoch": 1.1212561958165081, + "grad_norm": 1.6015625, + "learning_rate": 4.2639621589135845e-05, + "loss": 6.8521, + "step": 19530 + }, + { + "epoch": 1.1218303255315485, + "grad_norm": 1.5625, + "learning_rate": 4.259364148511779e-05, + "loss": 6.787, + "step": 19540 + }, + { + "epoch": 1.1224044552465888, + "grad_norm": 1.5625, + "learning_rate": 4.254766778404034e-05, + "loss": 6.8294, + "step": 19550 + }, + { + "epoch": 1.122978584961629, + "grad_norm": 1.515625, + "learning_rate": 4.250170052564868e-05, + "loss": 6.8143, + "step": 19560 + }, + { + "epoch": 1.1235527146766693, + "grad_norm": 1.5390625, + "learning_rate": 4.2455739749682374e-05, + "loss": 6.8112, + "step": 19570 + }, + { + "epoch": 1.1241268443917096, + "grad_norm": 1.5546875, + "learning_rate": 4.24097854958754e-05, + "loss": 6.8071, + "step": 19580 + }, + { + "epoch": 1.1247009741067497, + "grad_norm": 1.515625, + "learning_rate": 4.2363837803956115e-05, + "loss": 6.8058, + "step": 19590 + }, + { + "epoch": 1.12527510382179, + "grad_norm": 1.71875, + "learning_rate": 4.2317896713647185e-05, + "loss": 6.821, + "step": 19600 + }, + { + "epoch": 1.1258492335368304, + "grad_norm": 1.609375, + "learning_rate": 4.2271962264665575e-05, + "loss": 6.8413, + "step": 19610 + }, + { + "epoch": 1.1264233632518708, + "grad_norm": 1.5234375, + "learning_rate": 4.222603449672249e-05, + "loss": 6.7789, + "step": 19620 + }, + { + "epoch": 1.126997492966911, + "grad_norm": 1.5078125, + "learning_rate": 4.218011344952341e-05, + "loss": 6.7947, + "step": 19630 + }, + { + "epoch": 1.1275716226819512, + "grad_norm": 1.5859375, + "learning_rate": 4.2134199162767956e-05, + "loss": 6.8209, + "step": 19640 + }, + { + "epoch": 1.1281457523969916, + "grad_norm": 1.5546875, + "learning_rate": 4.208829167614991e-05, + "loss": 6.7714, + "step": 19650 + }, + { + "epoch": 1.128719882112032, + "grad_norm": 1.5546875, + "learning_rate": 4.20423910293572e-05, + "loss": 6.8094, + "step": 19660 + }, + { + "epoch": 1.129294011827072, + "grad_norm": 1.4765625, + "learning_rate": 4.199649726207181e-05, + "loss": 6.8415, + "step": 19670 + }, + { + "epoch": 1.1298681415421123, + "grad_norm": 1.546875, + "learning_rate": 4.1950610413969814e-05, + "loss": 6.7762, + "step": 19680 + }, + { + "epoch": 1.1304422712571527, + "grad_norm": 1.5234375, + "learning_rate": 4.190473052472125e-05, + "loss": 6.8122, + "step": 19690 + }, + { + "epoch": 1.131016400972193, + "grad_norm": 1.6875, + "learning_rate": 4.1858857633990204e-05, + "loss": 6.7658, + "step": 19700 + }, + { + "epoch": 1.1315905306872334, + "grad_norm": 1.5390625, + "learning_rate": 4.181299178143467e-05, + "loss": 6.7999, + "step": 19710 + }, + { + "epoch": 1.1321646604022735, + "grad_norm": 1.5703125, + "learning_rate": 4.1767133006706555e-05, + "loss": 6.8033, + "step": 19720 + }, + { + "epoch": 1.1327387901173138, + "grad_norm": 1.5546875, + "learning_rate": 4.172128134945167e-05, + "loss": 6.8088, + "step": 19730 + }, + { + "epoch": 1.1333129198323542, + "grad_norm": 1.59375, + "learning_rate": 4.167543684930966e-05, + "loss": 6.7741, + "step": 19740 + }, + { + "epoch": 1.1338870495473945, + "grad_norm": 1.625, + "learning_rate": 4.162959954591399e-05, + "loss": 6.8355, + "step": 19750 + }, + { + "epoch": 1.1344611792624346, + "grad_norm": 1.65625, + "learning_rate": 4.1583769478891885e-05, + "loss": 6.7529, + "step": 19760 + }, + { + "epoch": 1.135035308977475, + "grad_norm": 1.5625, + "learning_rate": 4.153794668786435e-05, + "loss": 6.8089, + "step": 19770 + }, + { + "epoch": 1.1356094386925153, + "grad_norm": 1.5546875, + "learning_rate": 4.149213121244604e-05, + "loss": 6.7903, + "step": 19780 + }, + { + "epoch": 1.1361835684075556, + "grad_norm": 1.515625, + "learning_rate": 4.144632309224536e-05, + "loss": 6.8052, + "step": 19790 + }, + { + "epoch": 1.1367576981225957, + "grad_norm": 1.625, + "learning_rate": 4.1400522366864306e-05, + "loss": 6.8578, + "step": 19800 + }, + { + "epoch": 1.137331827837636, + "grad_norm": 1.6015625, + "learning_rate": 4.135472907589849e-05, + "loss": 6.8058, + "step": 19810 + }, + { + "epoch": 1.1379059575526764, + "grad_norm": 1.4921875, + "learning_rate": 4.130894325893708e-05, + "loss": 6.8482, + "step": 19820 + }, + { + "epoch": 1.1384800872677168, + "grad_norm": 1.546875, + "learning_rate": 4.126316495556284e-05, + "loss": 6.7568, + "step": 19830 + }, + { + "epoch": 1.1390542169827569, + "grad_norm": 1.6796875, + "learning_rate": 4.121739420535199e-05, + "loss": 6.8415, + "step": 19840 + }, + { + "epoch": 1.1396283466977972, + "grad_norm": 1.6015625, + "learning_rate": 4.117163104787422e-05, + "loss": 6.803, + "step": 19850 + }, + { + "epoch": 1.1402024764128376, + "grad_norm": 1.609375, + "learning_rate": 4.112587552269267e-05, + "loss": 6.7807, + "step": 19860 + }, + { + "epoch": 1.140776606127878, + "grad_norm": 1.4921875, + "learning_rate": 4.108012766936389e-05, + "loss": 6.8434, + "step": 19870 + }, + { + "epoch": 1.141350735842918, + "grad_norm": 1.5546875, + "learning_rate": 4.103438752743778e-05, + "loss": 6.8533, + "step": 19880 + }, + { + "epoch": 1.1419248655579584, + "grad_norm": 1.5390625, + "learning_rate": 4.0988655136457583e-05, + "loss": 6.8059, + "step": 19890 + }, + { + "epoch": 1.1424989952729987, + "grad_norm": 1.625, + "learning_rate": 4.094293053595983e-05, + "loss": 6.7928, + "step": 19900 + }, + { + "epoch": 1.143073124988039, + "grad_norm": 1.5546875, + "learning_rate": 4.089721376547433e-05, + "loss": 6.8311, + "step": 19910 + }, + { + "epoch": 1.1436472547030792, + "grad_norm": 1.6015625, + "learning_rate": 4.085150486452412e-05, + "loss": 6.8427, + "step": 19920 + }, + { + "epoch": 1.1442213844181195, + "grad_norm": 1.6953125, + "learning_rate": 4.0805803872625434e-05, + "loss": 6.7816, + "step": 19930 + }, + { + "epoch": 1.1447955141331598, + "grad_norm": 1.6796875, + "learning_rate": 4.076011082928766e-05, + "loss": 6.8232, + "step": 19940 + }, + { + "epoch": 1.1453696438482002, + "grad_norm": 1.6015625, + "learning_rate": 4.071442577401331e-05, + "loss": 6.7945, + "step": 19950 + }, + { + "epoch": 1.1459437735632405, + "grad_norm": 1.5625, + "learning_rate": 4.0668748746298026e-05, + "loss": 6.813, + "step": 19960 + }, + { + "epoch": 1.1465179032782806, + "grad_norm": 1.484375, + "learning_rate": 4.062307978563047e-05, + "loss": 6.8593, + "step": 19970 + }, + { + "epoch": 1.147092032993321, + "grad_norm": 1.59375, + "learning_rate": 4.057741893149234e-05, + "loss": 6.81, + "step": 19980 + }, + { + "epoch": 1.1476661627083613, + "grad_norm": 1.5859375, + "learning_rate": 4.053176622335834e-05, + "loss": 6.7886, + "step": 19990 + }, + { + "epoch": 1.1482402924234014, + "grad_norm": 1.6171875, + "learning_rate": 4.048612170069612e-05, + "loss": 6.8227, + "step": 20000 + }, + { + "epoch": 1.1488144221384418, + "grad_norm": 1.6328125, + "learning_rate": 4.0440485402966254e-05, + "loss": 6.8347, + "step": 20010 + }, + { + "epoch": 1.149388551853482, + "grad_norm": 1.6484375, + "learning_rate": 4.039485736962221e-05, + "loss": 6.7834, + "step": 20020 + }, + { + "epoch": 1.1499626815685224, + "grad_norm": 1.5859375, + "learning_rate": 4.034923764011029e-05, + "loss": 6.8269, + "step": 20030 + }, + { + "epoch": 1.1505368112835628, + "grad_norm": 1.546875, + "learning_rate": 4.0303626253869655e-05, + "loss": 6.8141, + "step": 20040 + }, + { + "epoch": 1.151110940998603, + "grad_norm": 1.578125, + "learning_rate": 4.0258023250332235e-05, + "loss": 6.7619, + "step": 20050 + }, + { + "epoch": 1.1516850707136432, + "grad_norm": 1.59375, + "learning_rate": 4.02124286689227e-05, + "loss": 6.8245, + "step": 20060 + }, + { + "epoch": 1.1522592004286836, + "grad_norm": 1.6953125, + "learning_rate": 4.016684254905845e-05, + "loss": 6.795, + "step": 20070 + }, + { + "epoch": 1.1528333301437237, + "grad_norm": 1.5078125, + "learning_rate": 4.012126493014957e-05, + "loss": 6.7862, + "step": 20080 + }, + { + "epoch": 1.153407459858764, + "grad_norm": 1.6171875, + "learning_rate": 4.007569585159881e-05, + "loss": 6.8271, + "step": 20090 + }, + { + "epoch": 1.1539815895738044, + "grad_norm": 1.6171875, + "learning_rate": 4.0030135352801505e-05, + "loss": 6.7944, + "step": 20100 + }, + { + "epoch": 1.1545557192888447, + "grad_norm": 1.6015625, + "learning_rate": 3.99845834731456e-05, + "loss": 6.8388, + "step": 20110 + }, + { + "epoch": 1.155129849003885, + "grad_norm": 1.5859375, + "learning_rate": 3.993904025201157e-05, + "loss": 6.7978, + "step": 20120 + }, + { + "epoch": 1.1557039787189252, + "grad_norm": 1.5859375, + "learning_rate": 3.9893505728772423e-05, + "loss": 6.7911, + "step": 20130 + }, + { + "epoch": 1.1562781084339655, + "grad_norm": 1.6015625, + "learning_rate": 3.984797994279363e-05, + "loss": 6.8024, + "step": 20140 + }, + { + "epoch": 1.1568522381490058, + "grad_norm": 1.5859375, + "learning_rate": 3.9802462933433106e-05, + "loss": 6.7964, + "step": 20150 + }, + { + "epoch": 1.1574263678640462, + "grad_norm": 1.65625, + "learning_rate": 3.975695474004123e-05, + "loss": 6.7825, + "step": 20160 + }, + { + "epoch": 1.1580004975790863, + "grad_norm": 1.5078125, + "learning_rate": 3.9711455401960675e-05, + "loss": 6.7821, + "step": 20170 + }, + { + "epoch": 1.1585746272941266, + "grad_norm": 1.5234375, + "learning_rate": 3.9665964958526516e-05, + "loss": 6.8397, + "step": 20180 + }, + { + "epoch": 1.159148757009167, + "grad_norm": 1.578125, + "learning_rate": 3.962048344906612e-05, + "loss": 6.7868, + "step": 20190 + }, + { + "epoch": 1.1597228867242073, + "grad_norm": 1.546875, + "learning_rate": 3.957501091289916e-05, + "loss": 6.821, + "step": 20200 + }, + { + "epoch": 1.1602970164392474, + "grad_norm": 1.6484375, + "learning_rate": 3.95295473893375e-05, + "loss": 6.8652, + "step": 20210 + }, + { + "epoch": 1.1608711461542878, + "grad_norm": 1.59375, + "learning_rate": 3.9484092917685214e-05, + "loss": 6.8374, + "step": 20220 + }, + { + "epoch": 1.161445275869328, + "grad_norm": 1.578125, + "learning_rate": 3.943864753723863e-05, + "loss": 6.7714, + "step": 20230 + }, + { + "epoch": 1.1620194055843684, + "grad_norm": 1.5078125, + "learning_rate": 3.939321128728613e-05, + "loss": 6.8082, + "step": 20240 + }, + { + "epoch": 1.1625935352994086, + "grad_norm": 1.5078125, + "learning_rate": 3.934778420710824e-05, + "loss": 6.8283, + "step": 20250 + }, + { + "epoch": 1.163167665014449, + "grad_norm": 1.6328125, + "learning_rate": 3.9302366335977535e-05, + "loss": 6.8022, + "step": 20260 + }, + { + "epoch": 1.1637417947294892, + "grad_norm": 1.5546875, + "learning_rate": 3.925695771315867e-05, + "loss": 6.7978, + "step": 20270 + }, + { + "epoch": 1.1643159244445296, + "grad_norm": 1.546875, + "learning_rate": 3.921155837790828e-05, + "loss": 6.8335, + "step": 20280 + }, + { + "epoch": 1.1648900541595697, + "grad_norm": 1.78125, + "learning_rate": 3.916616836947495e-05, + "loss": 6.7864, + "step": 20290 + }, + { + "epoch": 1.16546418387461, + "grad_norm": 1.5234375, + "learning_rate": 3.9120787727099226e-05, + "loss": 6.7743, + "step": 20300 + }, + { + "epoch": 1.1660383135896504, + "grad_norm": 1.5234375, + "learning_rate": 3.9075416490013573e-05, + "loss": 6.7906, + "step": 20310 + }, + { + "epoch": 1.1666124433046907, + "grad_norm": 1.5703125, + "learning_rate": 3.90300546974423e-05, + "loss": 6.7976, + "step": 20320 + }, + { + "epoch": 1.1671865730197308, + "grad_norm": 1.59375, + "learning_rate": 3.8984702388601544e-05, + "loss": 6.8192, + "step": 20330 + }, + { + "epoch": 1.1677607027347712, + "grad_norm": 1.5078125, + "learning_rate": 3.893935960269927e-05, + "loss": 6.8099, + "step": 20340 + }, + { + "epoch": 1.1683348324498115, + "grad_norm": 1.5234375, + "learning_rate": 3.889402637893518e-05, + "loss": 6.8522, + "step": 20350 + }, + { + "epoch": 1.1689089621648519, + "grad_norm": 1.515625, + "learning_rate": 3.8848702756500736e-05, + "loss": 6.862, + "step": 20360 + }, + { + "epoch": 1.169483091879892, + "grad_norm": 1.5234375, + "learning_rate": 3.88033887745791e-05, + "loss": 6.8081, + "step": 20370 + }, + { + "epoch": 1.1700572215949323, + "grad_norm": 1.5546875, + "learning_rate": 3.8758084472345064e-05, + "loss": 6.8136, + "step": 20380 + }, + { + "epoch": 1.1706313513099726, + "grad_norm": 1.515625, + "learning_rate": 3.871278988896508e-05, + "loss": 6.8325, + "step": 20390 + }, + { + "epoch": 1.171205481025013, + "grad_norm": 1.578125, + "learning_rate": 3.8667505063597215e-05, + "loss": 6.8118, + "step": 20400 + }, + { + "epoch": 1.171779610740053, + "grad_norm": 1.6640625, + "learning_rate": 3.862223003539107e-05, + "loss": 6.7854, + "step": 20410 + }, + { + "epoch": 1.1723537404550934, + "grad_norm": 1.6171875, + "learning_rate": 3.857696484348777e-05, + "loss": 6.7856, + "step": 20420 + }, + { + "epoch": 1.1729278701701338, + "grad_norm": 1.5625, + "learning_rate": 3.853170952701996e-05, + "loss": 6.8181, + "step": 20430 + }, + { + "epoch": 1.1735019998851741, + "grad_norm": 1.5390625, + "learning_rate": 3.848646412511175e-05, + "loss": 6.7811, + "step": 20440 + }, + { + "epoch": 1.1740761296002145, + "grad_norm": 1.625, + "learning_rate": 3.844122867687867e-05, + "loss": 6.8026, + "step": 20450 + }, + { + "epoch": 1.1746502593152546, + "grad_norm": 1.640625, + "learning_rate": 3.839600322142762e-05, + "loss": 6.8377, + "step": 20460 + }, + { + "epoch": 1.175224389030295, + "grad_norm": 1.5625, + "learning_rate": 3.835078779785689e-05, + "loss": 6.7971, + "step": 20470 + }, + { + "epoch": 1.1757985187453353, + "grad_norm": 1.59375, + "learning_rate": 3.830558244525611e-05, + "loss": 6.7962, + "step": 20480 + }, + { + "epoch": 1.1763726484603754, + "grad_norm": 1.6328125, + "learning_rate": 3.826038720270616e-05, + "loss": 6.8334, + "step": 20490 + }, + { + "epoch": 1.1769467781754157, + "grad_norm": 1.546875, + "learning_rate": 3.821520210927922e-05, + "loss": 6.8429, + "step": 20500 + }, + { + "epoch": 1.177520907890456, + "grad_norm": 1.5234375, + "learning_rate": 3.817002720403868e-05, + "loss": 6.8043, + "step": 20510 + }, + { + "epoch": 1.1780950376054964, + "grad_norm": 1.5390625, + "learning_rate": 3.812486252603909e-05, + "loss": 6.8029, + "step": 20520 + }, + { + "epoch": 1.1786691673205367, + "grad_norm": 1.6015625, + "learning_rate": 3.807970811432625e-05, + "loss": 6.8094, + "step": 20530 + }, + { + "epoch": 1.1792432970355768, + "grad_norm": 1.5234375, + "learning_rate": 3.803456400793698e-05, + "loss": 6.7932, + "step": 20540 + }, + { + "epoch": 1.1798174267506172, + "grad_norm": 1.578125, + "learning_rate": 3.798943024589924e-05, + "loss": 6.8151, + "step": 20550 + }, + { + "epoch": 1.1803915564656575, + "grad_norm": 1.4765625, + "learning_rate": 3.794430686723205e-05, + "loss": 6.8063, + "step": 20560 + }, + { + "epoch": 1.1809656861806976, + "grad_norm": 1.5625, + "learning_rate": 3.789919391094546e-05, + "loss": 6.8127, + "step": 20570 + }, + { + "epoch": 1.181539815895738, + "grad_norm": 1.5546875, + "learning_rate": 3.7854091416040475e-05, + "loss": 6.8294, + "step": 20580 + }, + { + "epoch": 1.1821139456107783, + "grad_norm": 1.5234375, + "learning_rate": 3.780899942150908e-05, + "loss": 6.8127, + "step": 20590 + }, + { + "epoch": 1.1826880753258187, + "grad_norm": 1.6484375, + "learning_rate": 3.776391796633418e-05, + "loss": 6.8079, + "step": 20600 + }, + { + "epoch": 1.183262205040859, + "grad_norm": 1.640625, + "learning_rate": 3.7718847089489584e-05, + "loss": 6.8004, + "step": 20610 + }, + { + "epoch": 1.1838363347558991, + "grad_norm": 1.5390625, + "learning_rate": 3.7673786829939924e-05, + "loss": 6.7828, + "step": 20620 + }, + { + "epoch": 1.1844104644709395, + "grad_norm": 1.6640625, + "learning_rate": 3.762873722664067e-05, + "loss": 6.8034, + "step": 20630 + }, + { + "epoch": 1.1849845941859798, + "grad_norm": 1.6328125, + "learning_rate": 3.758369831853806e-05, + "loss": 6.8227, + "step": 20640 + }, + { + "epoch": 1.1855587239010201, + "grad_norm": 1.46875, + "learning_rate": 3.753867014456914e-05, + "loss": 6.8206, + "step": 20650 + }, + { + "epoch": 1.1861328536160602, + "grad_norm": 1.5234375, + "learning_rate": 3.749365274366163e-05, + "loss": 6.7616, + "step": 20660 + }, + { + "epoch": 1.1867069833311006, + "grad_norm": 1.5546875, + "learning_rate": 3.744864615473391e-05, + "loss": 6.86, + "step": 20670 + }, + { + "epoch": 1.187281113046141, + "grad_norm": 1.5859375, + "learning_rate": 3.74036504166951e-05, + "loss": 6.7983, + "step": 20680 + }, + { + "epoch": 1.1878552427611813, + "grad_norm": 1.578125, + "learning_rate": 3.7358665568444864e-05, + "loss": 6.8181, + "step": 20690 + }, + { + "epoch": 1.1884293724762214, + "grad_norm": 1.6171875, + "learning_rate": 3.731369164887347e-05, + "loss": 6.7961, + "step": 20700 + }, + { + "epoch": 1.1890035021912617, + "grad_norm": 1.453125, + "learning_rate": 3.726872869686176e-05, + "loss": 6.8288, + "step": 20710 + }, + { + "epoch": 1.189577631906302, + "grad_norm": 1.609375, + "learning_rate": 3.722377675128108e-05, + "loss": 6.8376, + "step": 20720 + }, + { + "epoch": 1.1901517616213424, + "grad_norm": 1.6328125, + "learning_rate": 3.717883585099324e-05, + "loss": 6.7921, + "step": 20730 + }, + { + "epoch": 1.1907258913363825, + "grad_norm": 1.515625, + "learning_rate": 3.713390603485053e-05, + "loss": 6.8358, + "step": 20740 + }, + { + "epoch": 1.1913000210514229, + "grad_norm": 1.5703125, + "learning_rate": 3.708898734169563e-05, + "loss": 6.8181, + "step": 20750 + }, + { + "epoch": 1.1918741507664632, + "grad_norm": 1.6484375, + "learning_rate": 3.704407981036167e-05, + "loss": 6.781, + "step": 20760 + }, + { + "epoch": 1.1924482804815035, + "grad_norm": 1.59375, + "learning_rate": 3.699918347967204e-05, + "loss": 6.818, + "step": 20770 + }, + { + "epoch": 1.1930224101965436, + "grad_norm": 1.5546875, + "learning_rate": 3.6954298388440494e-05, + "loss": 6.781, + "step": 20780 + }, + { + "epoch": 1.193596539911584, + "grad_norm": 1.5234375, + "learning_rate": 3.690942457547106e-05, + "loss": 6.8087, + "step": 20790 + }, + { + "epoch": 1.1941706696266243, + "grad_norm": 1.6640625, + "learning_rate": 3.686456207955805e-05, + "loss": 6.8211, + "step": 20800 + }, + { + "epoch": 1.1947447993416647, + "grad_norm": 1.53125, + "learning_rate": 3.681971093948594e-05, + "loss": 6.8242, + "step": 20810 + }, + { + "epoch": 1.1953189290567048, + "grad_norm": 1.625, + "learning_rate": 3.677487119402941e-05, + "loss": 6.7892, + "step": 20820 + }, + { + "epoch": 1.1958930587717451, + "grad_norm": 1.578125, + "learning_rate": 3.673004288195328e-05, + "loss": 6.8204, + "step": 20830 + }, + { + "epoch": 1.1964671884867855, + "grad_norm": 1.53125, + "learning_rate": 3.668522604201252e-05, + "loss": 6.822, + "step": 20840 + }, + { + "epoch": 1.1970413182018258, + "grad_norm": 1.5390625, + "learning_rate": 3.664042071295214e-05, + "loss": 6.8165, + "step": 20850 + }, + { + "epoch": 1.1976154479168661, + "grad_norm": 1.453125, + "learning_rate": 3.659562693350723e-05, + "loss": 6.8037, + "step": 20860 + }, + { + "epoch": 1.1981895776319063, + "grad_norm": 1.5859375, + "learning_rate": 3.655084474240286e-05, + "loss": 6.8077, + "step": 20870 + }, + { + "epoch": 1.1987637073469466, + "grad_norm": 1.4921875, + "learning_rate": 3.650607417835412e-05, + "loss": 6.839, + "step": 20880 + }, + { + "epoch": 1.199337837061987, + "grad_norm": 1.53125, + "learning_rate": 3.646131528006604e-05, + "loss": 6.8457, + "step": 20890 + }, + { + "epoch": 1.199911966777027, + "grad_norm": 1.5859375, + "learning_rate": 3.641656808623353e-05, + "loss": 6.8387, + "step": 20900 + }, + { + "epoch": 1.2004860964920674, + "grad_norm": 1.6171875, + "learning_rate": 3.637183263554143e-05, + "loss": 6.8158, + "step": 20910 + }, + { + "epoch": 1.2010602262071077, + "grad_norm": 1.640625, + "learning_rate": 3.632710896666437e-05, + "loss": 6.797, + "step": 20920 + }, + { + "epoch": 1.201634355922148, + "grad_norm": 1.5859375, + "learning_rate": 3.6282397118266876e-05, + "loss": 6.8049, + "step": 20930 + }, + { + "epoch": 1.2022084856371884, + "grad_norm": 1.625, + "learning_rate": 3.623769712900319e-05, + "loss": 6.7645, + "step": 20940 + }, + { + "epoch": 1.2027826153522285, + "grad_norm": 1.625, + "learning_rate": 3.6193009037517314e-05, + "loss": 6.7906, + "step": 20950 + }, + { + "epoch": 1.2033567450672689, + "grad_norm": 1.5, + "learning_rate": 3.614833288244295e-05, + "loss": 6.8026, + "step": 20960 + }, + { + "epoch": 1.2039308747823092, + "grad_norm": 1.5546875, + "learning_rate": 3.6103668702403546e-05, + "loss": 6.8169, + "step": 20970 + }, + { + "epoch": 1.2045050044973493, + "grad_norm": 1.546875, + "learning_rate": 3.6059016536012124e-05, + "loss": 6.8261, + "step": 20980 + }, + { + "epoch": 1.2050791342123897, + "grad_norm": 1.6171875, + "learning_rate": 3.601437642187135e-05, + "loss": 6.8157, + "step": 20990 + }, + { + "epoch": 1.20565326392743, + "grad_norm": 1.5625, + "learning_rate": 3.5969748398573474e-05, + "loss": 6.7973, + "step": 21000 + }, + { + "epoch": 1.2062273936424703, + "grad_norm": 1.6171875, + "learning_rate": 3.5925132504700286e-05, + "loss": 6.7928, + "step": 21010 + }, + { + "epoch": 1.2068015233575107, + "grad_norm": 1.5, + "learning_rate": 3.58805287788231e-05, + "loss": 6.8132, + "step": 21020 + }, + { + "epoch": 1.2073756530725508, + "grad_norm": 1.5859375, + "learning_rate": 3.583593725950268e-05, + "loss": 6.8118, + "step": 21030 + }, + { + "epoch": 1.2079497827875911, + "grad_norm": 1.5703125, + "learning_rate": 3.5791357985289277e-05, + "loss": 6.821, + "step": 21040 + }, + { + "epoch": 1.2085239125026315, + "grad_norm": 1.6015625, + "learning_rate": 3.5746790994722534e-05, + "loss": 6.8024, + "step": 21050 + }, + { + "epoch": 1.2090980422176718, + "grad_norm": 1.59375, + "learning_rate": 3.570223632633148e-05, + "loss": 6.8069, + "step": 21060 + }, + { + "epoch": 1.209672171932712, + "grad_norm": 1.5390625, + "learning_rate": 3.56576940186345e-05, + "loss": 6.7979, + "step": 21070 + }, + { + "epoch": 1.2102463016477523, + "grad_norm": 1.53125, + "learning_rate": 3.5613164110139275e-05, + "loss": 6.7888, + "step": 21080 + }, + { + "epoch": 1.2108204313627926, + "grad_norm": 1.5859375, + "learning_rate": 3.556864663934275e-05, + "loss": 6.8396, + "step": 21090 + }, + { + "epoch": 1.211394561077833, + "grad_norm": 1.609375, + "learning_rate": 3.552414164473118e-05, + "loss": 6.8034, + "step": 21100 + }, + { + "epoch": 1.211968690792873, + "grad_norm": 1.515625, + "learning_rate": 3.547964916477998e-05, + "loss": 6.8231, + "step": 21110 + }, + { + "epoch": 1.2125428205079134, + "grad_norm": 1.578125, + "learning_rate": 3.543516923795377e-05, + "loss": 6.8226, + "step": 21120 + }, + { + "epoch": 1.2131169502229537, + "grad_norm": 1.6875, + "learning_rate": 3.539070190270629e-05, + "loss": 6.8056, + "step": 21130 + }, + { + "epoch": 1.213691079937994, + "grad_norm": 1.5078125, + "learning_rate": 3.534624719748043e-05, + "loss": 6.7595, + "step": 21140 + }, + { + "epoch": 1.2142652096530342, + "grad_norm": 1.5859375, + "learning_rate": 3.530180516070815e-05, + "loss": 6.7858, + "step": 21150 + }, + { + "epoch": 1.2148393393680745, + "grad_norm": 1.5078125, + "learning_rate": 3.525737583081044e-05, + "loss": 6.8228, + "step": 21160 + }, + { + "epoch": 1.2154134690831149, + "grad_norm": 1.5859375, + "learning_rate": 3.521295924619731e-05, + "loss": 6.7693, + "step": 21170 + }, + { + "epoch": 1.2159875987981552, + "grad_norm": 1.578125, + "learning_rate": 3.516855544526779e-05, + "loss": 6.7931, + "step": 21180 + }, + { + "epoch": 1.2165617285131953, + "grad_norm": 1.7109375, + "learning_rate": 3.51241644664098e-05, + "loss": 6.7883, + "step": 21190 + }, + { + "epoch": 1.2171358582282357, + "grad_norm": 1.5078125, + "learning_rate": 3.50797863480002e-05, + "loss": 6.8149, + "step": 21200 + }, + { + "epoch": 1.217709987943276, + "grad_norm": 1.484375, + "learning_rate": 3.503542112840476e-05, + "loss": 6.8273, + "step": 21210 + }, + { + "epoch": 1.2182841176583163, + "grad_norm": 1.53125, + "learning_rate": 3.4991068845978056e-05, + "loss": 6.7913, + "step": 21220 + }, + { + "epoch": 1.2188582473733565, + "grad_norm": 1.59375, + "learning_rate": 3.494672953906349e-05, + "loss": 6.7956, + "step": 21230 + }, + { + "epoch": 1.2194323770883968, + "grad_norm": 1.53125, + "learning_rate": 3.490240324599328e-05, + "loss": 6.8015, + "step": 21240 + }, + { + "epoch": 1.2200065068034371, + "grad_norm": 1.6171875, + "learning_rate": 3.485809000508834e-05, + "loss": 6.8228, + "step": 21250 + }, + { + "epoch": 1.2205806365184775, + "grad_norm": 1.609375, + "learning_rate": 3.481378985465833e-05, + "loss": 6.8111, + "step": 21260 + }, + { + "epoch": 1.2211547662335176, + "grad_norm": 1.6328125, + "learning_rate": 3.476950283300159e-05, + "loss": 6.779, + "step": 21270 + }, + { + "epoch": 1.221728895948558, + "grad_norm": 1.5625, + "learning_rate": 3.472522897840512e-05, + "loss": 6.808, + "step": 21280 + }, + { + "epoch": 1.2223030256635983, + "grad_norm": 1.5390625, + "learning_rate": 3.468096832914452e-05, + "loss": 6.801, + "step": 21290 + }, + { + "epoch": 1.2228771553786386, + "grad_norm": 1.5, + "learning_rate": 3.463672092348399e-05, + "loss": 6.8156, + "step": 21300 + }, + { + "epoch": 1.2234512850936787, + "grad_norm": 1.5625, + "learning_rate": 3.4592486799676256e-05, + "loss": 6.8023, + "step": 21310 + }, + { + "epoch": 1.224025414808719, + "grad_norm": 1.6328125, + "learning_rate": 3.454826599596256e-05, + "loss": 6.7724, + "step": 21320 + }, + { + "epoch": 1.2245995445237594, + "grad_norm": 1.6015625, + "learning_rate": 3.450405855057268e-05, + "loss": 6.8353, + "step": 21330 + }, + { + "epoch": 1.2251736742387997, + "grad_norm": 1.640625, + "learning_rate": 3.44598645017248e-05, + "loss": 6.8009, + "step": 21340 + }, + { + "epoch": 1.22574780395384, + "grad_norm": 1.6484375, + "learning_rate": 3.441568388762553e-05, + "loss": 6.8197, + "step": 21350 + }, + { + "epoch": 1.2263219336688802, + "grad_norm": 1.640625, + "learning_rate": 3.4371516746469847e-05, + "loss": 6.7526, + "step": 21360 + }, + { + "epoch": 1.2268960633839205, + "grad_norm": 1.546875, + "learning_rate": 3.4327363116441136e-05, + "loss": 6.8039, + "step": 21370 + }, + { + "epoch": 1.2274701930989609, + "grad_norm": 1.515625, + "learning_rate": 3.4283223035711045e-05, + "loss": 6.842, + "step": 21380 + }, + { + "epoch": 1.228044322814001, + "grad_norm": 1.5078125, + "learning_rate": 3.423909654243954e-05, + "loss": 6.8283, + "step": 21390 + }, + { + "epoch": 1.2286184525290413, + "grad_norm": 1.4921875, + "learning_rate": 3.4194983674774805e-05, + "loss": 6.8216, + "step": 21400 + }, + { + "epoch": 1.2291925822440817, + "grad_norm": 1.5234375, + "learning_rate": 3.415088447085332e-05, + "loss": 6.8379, + "step": 21410 + }, + { + "epoch": 1.229766711959122, + "grad_norm": 1.609375, + "learning_rate": 3.410679896879966e-05, + "loss": 6.7836, + "step": 21420 + }, + { + "epoch": 1.2303408416741624, + "grad_norm": 1.4765625, + "learning_rate": 3.4062727206726606e-05, + "loss": 6.817, + "step": 21430 + }, + { + "epoch": 1.2309149713892025, + "grad_norm": 1.625, + "learning_rate": 3.4018669222735054e-05, + "loss": 6.7849, + "step": 21440 + }, + { + "epoch": 1.2314891011042428, + "grad_norm": 1.5234375, + "learning_rate": 3.3974625054914e-05, + "loss": 6.8206, + "step": 21450 + }, + { + "epoch": 1.2320632308192832, + "grad_norm": 1.5703125, + "learning_rate": 3.393059474134047e-05, + "loss": 6.7817, + "step": 21460 + }, + { + "epoch": 1.2326373605343233, + "grad_norm": 1.53125, + "learning_rate": 3.388657832007951e-05, + "loss": 6.8132, + "step": 21470 + }, + { + "epoch": 1.2332114902493636, + "grad_norm": 1.578125, + "learning_rate": 3.384257582918418e-05, + "loss": 6.8044, + "step": 21480 + }, + { + "epoch": 1.233785619964404, + "grad_norm": 1.5390625, + "learning_rate": 3.379858730669551e-05, + "loss": 6.8424, + "step": 21490 + }, + { + "epoch": 1.2343597496794443, + "grad_norm": 1.5546875, + "learning_rate": 3.375461279064239e-05, + "loss": 6.8208, + "step": 21500 + }, + { + "epoch": 1.2349338793944846, + "grad_norm": 1.5, + "learning_rate": 3.371065231904168e-05, + "loss": 6.8039, + "step": 21510 + }, + { + "epoch": 1.2355080091095247, + "grad_norm": 1.5078125, + "learning_rate": 3.366670592989803e-05, + "loss": 6.7942, + "step": 21520 + }, + { + "epoch": 1.236082138824565, + "grad_norm": 1.5859375, + "learning_rate": 3.362277366120397e-05, + "loss": 6.7946, + "step": 21530 + }, + { + "epoch": 1.2366562685396054, + "grad_norm": 1.625, + "learning_rate": 3.357885555093978e-05, + "loss": 6.8253, + "step": 21540 + }, + { + "epoch": 1.2372303982546458, + "grad_norm": 1.546875, + "learning_rate": 3.353495163707353e-05, + "loss": 6.8448, + "step": 21550 + }, + { + "epoch": 1.2378045279696859, + "grad_norm": 1.578125, + "learning_rate": 3.349106195756101e-05, + "loss": 6.8034, + "step": 21560 + }, + { + "epoch": 1.2383786576847262, + "grad_norm": 1.5703125, + "learning_rate": 3.344718655034568e-05, + "loss": 6.7819, + "step": 21570 + }, + { + "epoch": 1.2389527873997666, + "grad_norm": 1.5546875, + "learning_rate": 3.340332545335869e-05, + "loss": 6.8123, + "step": 21580 + }, + { + "epoch": 1.239526917114807, + "grad_norm": 1.4921875, + "learning_rate": 3.335947870451882e-05, + "loss": 6.8522, + "step": 21590 + }, + { + "epoch": 1.240101046829847, + "grad_norm": 1.6171875, + "learning_rate": 3.331564634173243e-05, + "loss": 6.8463, + "step": 21600 + }, + { + "epoch": 1.2406751765448873, + "grad_norm": 1.6328125, + "learning_rate": 3.327182840289343e-05, + "loss": 6.8039, + "step": 21610 + }, + { + "epoch": 1.2412493062599277, + "grad_norm": 1.515625, + "learning_rate": 3.32280249258833e-05, + "loss": 6.8103, + "step": 21620 + }, + { + "epoch": 1.241823435974968, + "grad_norm": 1.5859375, + "learning_rate": 3.3184235948570983e-05, + "loss": 6.8569, + "step": 21630 + }, + { + "epoch": 1.2423975656900081, + "grad_norm": 1.53125, + "learning_rate": 3.3140461508812914e-05, + "loss": 6.7996, + "step": 21640 + }, + { + "epoch": 1.2429716954050485, + "grad_norm": 1.5078125, + "learning_rate": 3.309670164445292e-05, + "loss": 6.8066, + "step": 21650 + }, + { + "epoch": 1.2435458251200888, + "grad_norm": 1.5546875, + "learning_rate": 3.3052956393322287e-05, + "loss": 6.7836, + "step": 21660 + }, + { + "epoch": 1.2441199548351292, + "grad_norm": 1.5546875, + "learning_rate": 3.30092257932396e-05, + "loss": 6.8482, + "step": 21670 + }, + { + "epoch": 1.2446940845501693, + "grad_norm": 1.6015625, + "learning_rate": 3.296550988201083e-05, + "loss": 6.7908, + "step": 21680 + }, + { + "epoch": 1.2452682142652096, + "grad_norm": 1.5703125, + "learning_rate": 3.292180869742924e-05, + "loss": 6.774, + "step": 21690 + }, + { + "epoch": 1.24584234398025, + "grad_norm": 1.546875, + "learning_rate": 3.2878122277275313e-05, + "loss": 6.803, + "step": 21700 + }, + { + "epoch": 1.2464164736952903, + "grad_norm": 1.578125, + "learning_rate": 3.283445065931685e-05, + "loss": 6.7836, + "step": 21710 + }, + { + "epoch": 1.2469906034103304, + "grad_norm": 1.59375, + "learning_rate": 3.279079388130877e-05, + "loss": 6.8021, + "step": 21720 + }, + { + "epoch": 1.2475647331253708, + "grad_norm": 1.6484375, + "learning_rate": 3.274715198099324e-05, + "loss": 6.7848, + "step": 21730 + }, + { + "epoch": 1.248138862840411, + "grad_norm": 1.59375, + "learning_rate": 3.270352499609952e-05, + "loss": 6.8375, + "step": 21740 + }, + { + "epoch": 1.2487129925554514, + "grad_norm": 1.5546875, + "learning_rate": 3.265991296434399e-05, + "loss": 6.7839, + "step": 21750 + }, + { + "epoch": 1.2492871222704918, + "grad_norm": 1.5703125, + "learning_rate": 3.261631592343008e-05, + "loss": 6.8102, + "step": 21760 + }, + { + "epoch": 1.2498612519855319, + "grad_norm": 1.625, + "learning_rate": 3.257273391104829e-05, + "loss": 6.8041, + "step": 21770 + }, + { + "epoch": 1.2504353817005722, + "grad_norm": 1.6953125, + "learning_rate": 3.2529166964876115e-05, + "loss": 6.8141, + "step": 21780 + }, + { + "epoch": 1.2510095114156126, + "grad_norm": 1.5, + "learning_rate": 3.248561512257802e-05, + "loss": 6.8004, + "step": 21790 + }, + { + "epoch": 1.2515836411306527, + "grad_norm": 1.5859375, + "learning_rate": 3.244207842180542e-05, + "loss": 6.7802, + "step": 21800 + }, + { + "epoch": 1.252157770845693, + "grad_norm": 1.5703125, + "learning_rate": 3.2398556900196636e-05, + "loss": 6.8308, + "step": 21810 + }, + { + "epoch": 1.2527319005607334, + "grad_norm": 1.6171875, + "learning_rate": 3.235505059537688e-05, + "loss": 6.7989, + "step": 21820 + }, + { + "epoch": 1.2533060302757737, + "grad_norm": 1.5, + "learning_rate": 3.2311559544958174e-05, + "loss": 6.7923, + "step": 21830 + }, + { + "epoch": 1.253880159990814, + "grad_norm": 1.5703125, + "learning_rate": 3.226808378653938e-05, + "loss": 6.8128, + "step": 21840 + }, + { + "epoch": 1.2544542897058542, + "grad_norm": 1.4765625, + "learning_rate": 3.222462335770615e-05, + "loss": 6.8132, + "step": 21850 + }, + { + "epoch": 1.2550284194208945, + "grad_norm": 1.5859375, + "learning_rate": 3.218117829603087e-05, + "loss": 6.8116, + "step": 21860 + }, + { + "epoch": 1.2556025491359348, + "grad_norm": 1.6015625, + "learning_rate": 3.213774863907262e-05, + "loss": 6.7959, + "step": 21870 + }, + { + "epoch": 1.256176678850975, + "grad_norm": 1.53125, + "learning_rate": 3.2094334424377176e-05, + "loss": 6.8238, + "step": 21880 + }, + { + "epoch": 1.2567508085660153, + "grad_norm": 1.5546875, + "learning_rate": 3.205093568947699e-05, + "loss": 6.7817, + "step": 21890 + }, + { + "epoch": 1.2573249382810556, + "grad_norm": 1.5390625, + "learning_rate": 3.200755247189111e-05, + "loss": 6.8276, + "step": 21900 + }, + { + "epoch": 1.257899067996096, + "grad_norm": 1.5078125, + "learning_rate": 3.196418480912515e-05, + "loss": 6.7674, + "step": 21910 + }, + { + "epoch": 1.2584731977111363, + "grad_norm": 1.6015625, + "learning_rate": 3.192083273867131e-05, + "loss": 6.8002, + "step": 21920 + }, + { + "epoch": 1.2590473274261764, + "grad_norm": 1.671875, + "learning_rate": 3.187749629800829e-05, + "loss": 6.7855, + "step": 21930 + }, + { + "epoch": 1.2596214571412168, + "grad_norm": 1.6328125, + "learning_rate": 3.183417552460129e-05, + "loss": 6.8001, + "step": 21940 + }, + { + "epoch": 1.260195586856257, + "grad_norm": 1.515625, + "learning_rate": 3.179087045590196e-05, + "loss": 6.7425, + "step": 21950 + }, + { + "epoch": 1.2607697165712972, + "grad_norm": 1.546875, + "learning_rate": 3.174758112934836e-05, + "loss": 6.7388, + "step": 21960 + }, + { + "epoch": 1.2613438462863376, + "grad_norm": 1.6171875, + "learning_rate": 3.170430758236495e-05, + "loss": 6.816, + "step": 21970 + } + ], + "logging_steps": 10, + "max_steps": 34834, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.04593957210827e+19, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}