{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.6507508336696333, "eval_steps": 100, "global_step": 7800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.021212395280599594, "learning_rate": 0.0001, "loss": 8.6987, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.018885262310504913, "learning_rate": 0.0001, "loss": 8.7228, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.030739275738596916, "learning_rate": 0.0001, "loss": 8.7384, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.03329615667462349, "learning_rate": 0.0001, "loss": 8.7324, "step": 30 }, { "epoch": 0.01, "grad_norm": 0.021798215806484222, "learning_rate": 0.0001, "loss": 8.7257, "step": 40 }, { "epoch": 0.02, "grad_norm": 0.022552112117409706, "learning_rate": 0.0001, "loss": 8.7345, "step": 50 }, { "epoch": 0.02, "grad_norm": 0.024906286969780922, "learning_rate": 0.0001, "loss": 8.7341, "step": 60 }, { "epoch": 0.02, "grad_norm": 0.029604945331811905, "learning_rate": 0.0001, "loss": 8.7372, "step": 70 }, { "epoch": 0.03, "grad_norm": 0.027017420157790184, "learning_rate": 0.0001, "loss": 8.7178, "step": 80 }, { "epoch": 0.03, "grad_norm": 0.1736399084329605, "learning_rate": 0.0001, "loss": 8.7289, "step": 90 }, { "epoch": 0.03, "grad_norm": 0.032085757702589035, "learning_rate": 0.0001, "loss": 8.6985, "step": 100 }, { "epoch": 0.04, "grad_norm": 0.0356113463640213, "learning_rate": 0.0001, "loss": 8.7198, "step": 110 }, { "epoch": 0.04, "grad_norm": 0.04567497596144676, "learning_rate": 0.0001, "loss": 8.7256, "step": 120 }, { "epoch": 0.04, "grad_norm": 0.10645324736833572, "learning_rate": 0.0001, "loss": 8.7044, "step": 130 }, { "epoch": 0.05, "grad_norm": 0.08604108542203903, "learning_rate": 0.0001, "loss": 8.6868, "step": 140 }, { "epoch": 0.05, "grad_norm": 0.21885468065738678, "learning_rate": 0.0001, "loss": 8.678, "step": 150 }, { "epoch": 0.05, "grad_norm": 0.34108299016952515, "learning_rate": 0.0001, "loss": 8.6014, "step": 160 }, { "epoch": 0.06, "grad_norm": 0.551449179649353, "learning_rate": 0.0001, "loss": 8.5854, "step": 170 }, { "epoch": 0.06, "grad_norm": 0.4222724437713623, "learning_rate": 0.0001, "loss": 8.5096, "step": 180 }, { "epoch": 0.06, "grad_norm": 0.4721892178058624, "learning_rate": 0.0001, "loss": 8.4888, "step": 190 }, { "epoch": 0.07, "grad_norm": 0.4090125262737274, "learning_rate": 0.0001, "loss": 8.4725, "step": 200 }, { "epoch": 0.07, "grad_norm": 0.3760685920715332, "learning_rate": 0.0001, "loss": 8.4548, "step": 210 }, { "epoch": 0.07, "grad_norm": 0.44033998250961304, "learning_rate": 0.0001, "loss": 8.4367, "step": 220 }, { "epoch": 0.08, "grad_norm": 0.47666457295417786, "learning_rate": 0.0001, "loss": 8.4161, "step": 230 }, { "epoch": 0.08, "grad_norm": 0.4455925524234772, "learning_rate": 0.0001, "loss": 8.4375, "step": 240 }, { "epoch": 0.08, "grad_norm": 0.46253493428230286, "learning_rate": 0.0001, "loss": 8.4125, "step": 250 }, { "epoch": 0.09, "grad_norm": 0.5054658651351929, "learning_rate": 0.0001, "loss": 8.3878, "step": 260 }, { "epoch": 0.09, "grad_norm": 0.5697858929634094, "learning_rate": 0.0001, "loss": 8.4173, "step": 270 }, { "epoch": 0.1, "grad_norm": 0.5195950269699097, "learning_rate": 0.0001, "loss": 8.3315, "step": 280 }, { "epoch": 0.1, "grad_norm": 0.5799770355224609, "learning_rate": 0.0001, "loss": 8.3224, "step": 290 }, { "epoch": 0.1, "grad_norm": 0.5585042834281921, "learning_rate": 0.0001, "loss": 8.3111, "step": 300 }, { "epoch": 0.11, "grad_norm": 0.6580124497413635, "learning_rate": 0.0001, "loss": 8.3186, "step": 310 }, { "epoch": 0.11, "grad_norm": 0.7007073760032654, "learning_rate": 0.0001, "loss": 8.2862, "step": 320 }, { "epoch": 0.11, "grad_norm": 0.6406170725822449, "learning_rate": 0.0001, "loss": 8.3202, "step": 330 }, { "epoch": 0.12, "grad_norm": 0.6901131272315979, "learning_rate": 0.0001, "loss": 8.3053, "step": 340 }, { "epoch": 0.12, "grad_norm": 0.7509775161743164, "learning_rate": 0.0001, "loss": 8.2628, "step": 350 }, { "epoch": 0.12, "grad_norm": 0.6910294890403748, "learning_rate": 0.0001, "loss": 8.2785, "step": 360 }, { "epoch": 0.13, "grad_norm": 0.95322185754776, "learning_rate": 0.0001, "loss": 8.2177, "step": 370 }, { "epoch": 0.13, "grad_norm": 0.7333555221557617, "learning_rate": 0.0001, "loss": 8.2437, "step": 380 }, { "epoch": 0.13, "grad_norm": 0.7888531684875488, "learning_rate": 0.0001, "loss": 8.2163, "step": 390 }, { "epoch": 0.14, "grad_norm": 2.3577535152435303, "learning_rate": 0.0001, "loss": 8.1334, "step": 400 }, { "epoch": 0.14, "grad_norm": 0.7684480547904968, "learning_rate": 0.0001, "loss": 8.2015, "step": 410 }, { "epoch": 0.14, "grad_norm": 0.9100524187088013, "learning_rate": 0.0001, "loss": 8.2179, "step": 420 }, { "epoch": 0.15, "grad_norm": 1.0707799196243286, "learning_rate": 0.0001, "loss": 8.2101, "step": 430 }, { "epoch": 0.15, "grad_norm": 0.8474343419075012, "learning_rate": 0.0001, "loss": 8.1322, "step": 440 }, { "epoch": 0.15, "grad_norm": 0.8755463361740112, "learning_rate": 0.0001, "loss": 8.178, "step": 450 }, { "epoch": 0.16, "grad_norm": 0.9474639892578125, "learning_rate": 0.0001, "loss": 8.1057, "step": 460 }, { "epoch": 0.16, "grad_norm": 0.82960045337677, "learning_rate": 0.0001, "loss": 8.1476, "step": 470 }, { "epoch": 0.16, "grad_norm": 0.9738801121711731, "learning_rate": 0.0001, "loss": 8.1107, "step": 480 }, { "epoch": 0.17, "grad_norm": 0.9569644927978516, "learning_rate": 0.0001, "loss": 8.0467, "step": 490 }, { "epoch": 0.17, "grad_norm": 0.9318249225616455, "learning_rate": 0.0001, "loss": 8.1014, "step": 500 }, { "epoch": 0.17, "grad_norm": 1.0197961330413818, "learning_rate": 0.0001, "loss": 8.111, "step": 510 }, { "epoch": 0.18, "grad_norm": 1.0179370641708374, "learning_rate": 0.0001, "loss": 8.1044, "step": 520 }, { "epoch": 0.18, "grad_norm": 0.9549160599708557, "learning_rate": 0.0001, "loss": 8.1088, "step": 530 }, { "epoch": 0.18, "grad_norm": 0.9991666674613953, "learning_rate": 0.0001, "loss": 8.1583, "step": 540 }, { "epoch": 0.19, "grad_norm": 1.237109661102295, "learning_rate": 0.0001, "loss": 8.048, "step": 550 }, { "epoch": 0.19, "grad_norm": 1.0213505029678345, "learning_rate": 0.0001, "loss": 8.0383, "step": 560 }, { "epoch": 0.19, "grad_norm": 1.1742730140686035, "learning_rate": 0.0001, "loss": 8.0809, "step": 570 }, { "epoch": 0.2, "grad_norm": 1.0418646335601807, "learning_rate": 0.0001, "loss": 8.0511, "step": 580 }, { "epoch": 0.2, "grad_norm": 1.026742935180664, "learning_rate": 0.0001, "loss": 7.9717, "step": 590 }, { "epoch": 0.2, "grad_norm": 1.2008246183395386, "learning_rate": 0.0001, "loss": 8.029, "step": 600 }, { "epoch": 0.21, "grad_norm": 1.0359411239624023, "learning_rate": 0.0001, "loss": 8.0179, "step": 610 }, { "epoch": 0.21, "grad_norm": 1.0166168212890625, "learning_rate": 0.0001, "loss": 8.0339, "step": 620 }, { "epoch": 0.21, "grad_norm": 1.0548157691955566, "learning_rate": 0.0001, "loss": 7.962, "step": 630 }, { "epoch": 0.22, "grad_norm": 1.0204545259475708, "learning_rate": 0.0001, "loss": 8.0035, "step": 640 }, { "epoch": 0.22, "grad_norm": 1.122182846069336, "learning_rate": 0.0001, "loss": 8.046, "step": 650 }, { "epoch": 0.22, "grad_norm": 1.0963354110717773, "learning_rate": 0.0001, "loss": 8.0241, "step": 660 }, { "epoch": 0.23, "grad_norm": 1.0327413082122803, "learning_rate": 0.0001, "loss": 7.9474, "step": 670 }, { "epoch": 0.23, "grad_norm": 1.279113531112671, "learning_rate": 0.0001, "loss": 8.011, "step": 680 }, { "epoch": 0.23, "grad_norm": 1.0806117057800293, "learning_rate": 0.0001, "loss": 7.9235, "step": 690 }, { "epoch": 0.24, "grad_norm": 1.0781127214431763, "learning_rate": 0.0001, "loss": 7.9572, "step": 700 }, { "epoch": 0.24, "grad_norm": 1.1420655250549316, "learning_rate": 0.0001, "loss": 7.9394, "step": 710 }, { "epoch": 0.24, "grad_norm": 1.161694049835205, "learning_rate": 0.0001, "loss": 7.9587, "step": 720 }, { "epoch": 0.25, "grad_norm": 1.1545701026916504, "learning_rate": 0.0001, "loss": 7.8929, "step": 730 }, { "epoch": 0.25, "grad_norm": 1.1458271741867065, "learning_rate": 0.0001, "loss": 7.8929, "step": 740 }, { "epoch": 0.25, "grad_norm": 1.1537671089172363, "learning_rate": 0.0001, "loss": 7.9182, "step": 750 }, { "epoch": 0.26, "grad_norm": 1.217034935951233, "learning_rate": 0.0001, "loss": 7.9654, "step": 760 }, { "epoch": 0.26, "grad_norm": 1.1825016736984253, "learning_rate": 0.0001, "loss": 7.8556, "step": 770 }, { "epoch": 0.27, "grad_norm": 1.420203447341919, "learning_rate": 0.0001, "loss": 7.8865, "step": 780 }, { "epoch": 0.27, "grad_norm": 1.126854419708252, "learning_rate": 0.0001, "loss": 7.9135, "step": 790 }, { "epoch": 0.27, "grad_norm": 1.1468126773834229, "learning_rate": 0.0001, "loss": 7.9101, "step": 800 }, { "epoch": 0.28, "grad_norm": 1.2076383829116821, "learning_rate": 0.0001, "loss": 7.8781, "step": 810 }, { "epoch": 0.28, "grad_norm": 1.2358585596084595, "learning_rate": 0.0001, "loss": 7.8729, "step": 820 }, { "epoch": 0.28, "grad_norm": 1.1873291730880737, "learning_rate": 0.0001, "loss": 7.8703, "step": 830 }, { "epoch": 0.29, "grad_norm": 1.1813349723815918, "learning_rate": 0.0001, "loss": 7.9166, "step": 840 }, { "epoch": 0.29, "grad_norm": 1.1983938217163086, "learning_rate": 0.0001, "loss": 7.8662, "step": 850 }, { "epoch": 0.29, "grad_norm": 1.1260148286819458, "learning_rate": 0.0001, "loss": 7.8488, "step": 860 }, { "epoch": 0.3, "grad_norm": 1.1964378356933594, "learning_rate": 0.0001, "loss": 7.8679, "step": 870 }, { "epoch": 0.3, "grad_norm": 1.2019832134246826, "learning_rate": 0.0001, "loss": 7.7503, "step": 880 }, { "epoch": 0.3, "grad_norm": 1.1462825536727905, "learning_rate": 0.0001, "loss": 7.8578, "step": 890 }, { "epoch": 0.31, "grad_norm": 1.561476230621338, "learning_rate": 0.0001, "loss": 7.8796, "step": 900 }, { "epoch": 0.31, "grad_norm": 1.1881765127182007, "learning_rate": 0.0001, "loss": 7.8308, "step": 910 }, { "epoch": 0.31, "grad_norm": 1.2523167133331299, "learning_rate": 0.0001, "loss": 7.7982, "step": 920 }, { "epoch": 0.32, "grad_norm": 1.3139631748199463, "learning_rate": 0.0001, "loss": 7.7945, "step": 930 }, { "epoch": 0.32, "grad_norm": 1.2442649602890015, "learning_rate": 0.0001, "loss": 7.8018, "step": 940 }, { "epoch": 0.32, "grad_norm": 1.2717634439468384, "learning_rate": 0.0001, "loss": 7.8349, "step": 950 }, { "epoch": 0.33, "grad_norm": 1.3789329528808594, "learning_rate": 0.0001, "loss": 7.8695, "step": 960 }, { "epoch": 0.33, "grad_norm": 1.2785015106201172, "learning_rate": 0.0001, "loss": 7.7706, "step": 970 }, { "epoch": 0.33, "grad_norm": 1.3102847337722778, "learning_rate": 0.0001, "loss": 7.7189, "step": 980 }, { "epoch": 0.34, "grad_norm": 1.443925380706787, "learning_rate": 0.0001, "loss": 7.7913, "step": 990 }, { "epoch": 0.34, "grad_norm": 1.3788172006607056, "learning_rate": 0.0001, "loss": 7.7747, "step": 1000 }, { "epoch": 0.34, "grad_norm": 1.3770830631256104, "learning_rate": 0.0001, "loss": 7.7729, "step": 1010 }, { "epoch": 0.35, "grad_norm": 1.492782711982727, "learning_rate": 0.0001, "loss": 7.773, "step": 1020 }, { "epoch": 0.35, "grad_norm": 1.284954309463501, "learning_rate": 0.0001, "loss": 7.819, "step": 1030 }, { "epoch": 0.35, "grad_norm": 1.627119779586792, "learning_rate": 0.0001, "loss": 7.6785, "step": 1040 }, { "epoch": 0.36, "grad_norm": 1.3385231494903564, "learning_rate": 0.0001, "loss": 7.792, "step": 1050 }, { "epoch": 0.36, "grad_norm": 1.3017216920852661, "learning_rate": 0.0001, "loss": 7.7978, "step": 1060 }, { "epoch": 0.36, "grad_norm": 1.3827013969421387, "learning_rate": 0.0001, "loss": 7.712, "step": 1070 }, { "epoch": 0.37, "grad_norm": 1.4803731441497803, "learning_rate": 0.0001, "loss": 7.7092, "step": 1080 }, { "epoch": 0.37, "grad_norm": 1.305528163909912, "learning_rate": 0.0001, "loss": 7.7423, "step": 1090 }, { "epoch": 0.37, "grad_norm": 1.4556981325149536, "learning_rate": 0.0001, "loss": 7.7396, "step": 1100 }, { "epoch": 0.38, "grad_norm": 1.370935320854187, "learning_rate": 0.0001, "loss": 7.7866, "step": 1110 }, { "epoch": 0.38, "grad_norm": 1.5131551027297974, "learning_rate": 0.0001, "loss": 7.6658, "step": 1120 }, { "epoch": 0.38, "grad_norm": 1.4023760557174683, "learning_rate": 0.0001, "loss": 7.7168, "step": 1130 }, { "epoch": 0.39, "grad_norm": 1.4399409294128418, "learning_rate": 0.0001, "loss": 7.776, "step": 1140 }, { "epoch": 0.39, "grad_norm": 1.3380918502807617, "learning_rate": 0.0001, "loss": 7.6652, "step": 1150 }, { "epoch": 0.39, "grad_norm": 1.3455390930175781, "learning_rate": 0.0001, "loss": 7.7044, "step": 1160 }, { "epoch": 0.4, "grad_norm": 1.3240538835525513, "learning_rate": 0.0001, "loss": 7.7477, "step": 1170 }, { "epoch": 0.4, "grad_norm": 1.282774567604065, "learning_rate": 0.0001, "loss": 7.7434, "step": 1180 }, { "epoch": 0.4, "grad_norm": 1.3946601152420044, "learning_rate": 0.0001, "loss": 7.7023, "step": 1190 }, { "epoch": 0.41, "grad_norm": 1.4039177894592285, "learning_rate": 0.0001, "loss": 7.6228, "step": 1200 }, { "epoch": 0.41, "grad_norm": 1.2924675941467285, "learning_rate": 0.0001, "loss": 7.6144, "step": 1210 }, { "epoch": 0.41, "grad_norm": 1.3153537511825562, "learning_rate": 0.0001, "loss": 7.661, "step": 1220 }, { "epoch": 0.42, "grad_norm": 1.4871569871902466, "learning_rate": 0.0001, "loss": 7.6486, "step": 1230 }, { "epoch": 0.42, "grad_norm": 1.386214256286621, "learning_rate": 0.0001, "loss": 7.5779, "step": 1240 }, { "epoch": 0.42, "grad_norm": 1.7129216194152832, "learning_rate": 0.0001, "loss": 7.7321, "step": 1250 }, { "epoch": 0.43, "grad_norm": 1.4541782140731812, "learning_rate": 0.0001, "loss": 7.6519, "step": 1260 }, { "epoch": 0.43, "grad_norm": 1.459065556526184, "learning_rate": 0.0001, "loss": 7.68, "step": 1270 }, { "epoch": 0.43, "grad_norm": 1.4400455951690674, "learning_rate": 0.0001, "loss": 7.6245, "step": 1280 }, { "epoch": 0.44, "grad_norm": 1.4551613330841064, "learning_rate": 0.0001, "loss": 7.6565, "step": 1290 }, { "epoch": 0.44, "grad_norm": 1.3700193166732788, "learning_rate": 0.0001, "loss": 7.629, "step": 1300 }, { "epoch": 0.45, "grad_norm": 1.3817769289016724, "learning_rate": 0.0001, "loss": 7.6016, "step": 1310 }, { "epoch": 0.45, "grad_norm": 1.6119599342346191, "learning_rate": 0.0001, "loss": 7.6298, "step": 1320 }, { "epoch": 0.45, "grad_norm": 1.4976177215576172, "learning_rate": 0.0001, "loss": 7.6669, "step": 1330 }, { "epoch": 0.46, "grad_norm": 1.4302177429199219, "learning_rate": 0.0001, "loss": 7.6144, "step": 1340 }, { "epoch": 0.46, "grad_norm": 1.474238634109497, "learning_rate": 0.0001, "loss": 7.6288, "step": 1350 }, { "epoch": 0.46, "grad_norm": 1.4761487245559692, "learning_rate": 0.0001, "loss": 7.6317, "step": 1360 }, { "epoch": 0.47, "grad_norm": 1.5451174974441528, "learning_rate": 0.0001, "loss": 7.5998, "step": 1370 }, { "epoch": 0.47, "grad_norm": 1.436260461807251, "learning_rate": 0.0001, "loss": 7.5431, "step": 1380 }, { "epoch": 0.47, "grad_norm": 1.5143426656723022, "learning_rate": 0.0001, "loss": 7.701, "step": 1390 }, { "epoch": 0.48, "grad_norm": 1.6152522563934326, "learning_rate": 0.0001, "loss": 7.6366, "step": 1400 }, { "epoch": 0.48, "grad_norm": 1.4636591672897339, "learning_rate": 0.0001, "loss": 7.6032, "step": 1410 }, { "epoch": 0.48, "grad_norm": 1.4276256561279297, "learning_rate": 0.0001, "loss": 7.621, "step": 1420 }, { "epoch": 0.49, "grad_norm": 2.719571590423584, "learning_rate": 0.0001, "loss": 7.6289, "step": 1430 }, { "epoch": 0.49, "grad_norm": 1.5515453815460205, "learning_rate": 0.0001, "loss": 7.5354, "step": 1440 }, { "epoch": 0.49, "grad_norm": 1.448424220085144, "learning_rate": 0.0001, "loss": 7.546, "step": 1450 }, { "epoch": 0.5, "grad_norm": 1.4829095602035522, "learning_rate": 0.0001, "loss": 7.6309, "step": 1460 }, { "epoch": 0.5, "grad_norm": 1.5336196422576904, "learning_rate": 0.0001, "loss": 7.5966, "step": 1470 }, { "epoch": 0.5, "grad_norm": 1.4048209190368652, "learning_rate": 0.0001, "loss": 7.5463, "step": 1480 }, { "epoch": 0.51, "grad_norm": 1.5213136672973633, "learning_rate": 0.0001, "loss": 7.5134, "step": 1490 }, { "epoch": 0.51, "grad_norm": 1.5125114917755127, "learning_rate": 0.0001, "loss": 7.5627, "step": 1500 }, { "epoch": 0.51, "grad_norm": 1.3993090391159058, "learning_rate": 0.0001, "loss": 7.4983, "step": 1510 }, { "epoch": 0.52, "grad_norm": 1.5257900953292847, "learning_rate": 0.0001, "loss": 7.5514, "step": 1520 }, { "epoch": 0.52, "grad_norm": 1.425870418548584, "learning_rate": 0.0001, "loss": 7.5658, "step": 1530 }, { "epoch": 0.52, "grad_norm": 1.5099962949752808, "learning_rate": 0.0001, "loss": 7.5112, "step": 1540 }, { "epoch": 0.53, "grad_norm": 1.5617893934249878, "learning_rate": 0.0001, "loss": 7.4845, "step": 1550 }, { "epoch": 0.53, "grad_norm": 1.4649708271026611, "learning_rate": 0.0001, "loss": 7.5263, "step": 1560 }, { "epoch": 0.53, "grad_norm": 1.5084022283554077, "learning_rate": 0.0001, "loss": 7.6381, "step": 1570 }, { "epoch": 0.54, "grad_norm": 1.5481611490249634, "learning_rate": 0.0001, "loss": 7.5124, "step": 1580 }, { "epoch": 0.54, "grad_norm": 1.4792182445526123, "learning_rate": 0.0001, "loss": 7.4572, "step": 1590 }, { "epoch": 0.54, "grad_norm": 1.4853042364120483, "learning_rate": 0.0001, "loss": 7.4854, "step": 1600 }, { "epoch": 0.55, "grad_norm": 1.365810513496399, "learning_rate": 0.0001, "loss": 8.7212, "step": 1610 }, { "epoch": 0.55, "grad_norm": 1.451943039894104, "learning_rate": 0.0001, "loss": 8.6955, "step": 1620 }, { "epoch": 0.55, "grad_norm": 1.3740949630737305, "learning_rate": 0.0001, "loss": 8.6539, "step": 1630 }, { "epoch": 0.56, "grad_norm": 1.3139616250991821, "learning_rate": 0.0001, "loss": 8.6498, "step": 1640 }, { "epoch": 0.56, "grad_norm": 1.3916865587234497, "learning_rate": 0.0001, "loss": 8.6318, "step": 1650 }, { "epoch": 0.56, "grad_norm": 1.4029638767242432, "learning_rate": 0.0001, "loss": 8.5894, "step": 1660 }, { "epoch": 0.57, "grad_norm": 1.49574613571167, "learning_rate": 0.0001, "loss": 8.568, "step": 1670 }, { "epoch": 0.57, "grad_norm": 1.4445033073425293, "learning_rate": 0.0001, "loss": 8.538, "step": 1680 }, { "epoch": 0.57, "grad_norm": 1.495067834854126, "learning_rate": 0.0001, "loss": 8.5365, "step": 1690 }, { "epoch": 0.58, "grad_norm": 1.4277544021606445, "learning_rate": 0.0001, "loss": 8.5172, "step": 1700 }, { "epoch": 0.58, "grad_norm": 1.4678715467453003, "learning_rate": 0.0001, "loss": 8.4763, "step": 1710 }, { "epoch": 0.58, "grad_norm": 1.5344581604003906, "learning_rate": 0.0001, "loss": 8.4817, "step": 1720 }, { "epoch": 0.59, "grad_norm": 1.4848291873931885, "learning_rate": 0.0001, "loss": 8.4339, "step": 1730 }, { "epoch": 0.59, "grad_norm": 1.8756532669067383, "learning_rate": 0.0001, "loss": 8.4432, "step": 1740 }, { "epoch": 0.59, "grad_norm": 1.491782307624817, "learning_rate": 0.0001, "loss": 8.3983, "step": 1750 }, { "epoch": 0.6, "grad_norm": 1.4570878744125366, "learning_rate": 0.0001, "loss": 8.4038, "step": 1760 }, { "epoch": 0.6, "grad_norm": 1.5003529787063599, "learning_rate": 0.0001, "loss": 8.3846, "step": 1770 }, { "epoch": 0.6, "grad_norm": 1.5071924924850464, "learning_rate": 0.0001, "loss": 8.3769, "step": 1780 }, { "epoch": 0.61, "grad_norm": 2.1756210327148438, "learning_rate": 0.0001, "loss": 8.3569, "step": 1790 }, { "epoch": 0.61, "grad_norm": 1.5465497970581055, "learning_rate": 0.0001, "loss": 8.3511, "step": 1800 }, { "epoch": 0.62, "grad_norm": 1.4990923404693604, "learning_rate": 0.0001, "loss": 8.3228, "step": 1810 }, { "epoch": 0.62, "grad_norm": 1.4488916397094727, "learning_rate": 0.0001, "loss": 8.3425, "step": 1820 }, { "epoch": 0.62, "grad_norm": 1.4601061344146729, "learning_rate": 0.0001, "loss": 8.3078, "step": 1830 }, { "epoch": 0.63, "grad_norm": 1.4482914209365845, "learning_rate": 0.0001, "loss": 8.3215, "step": 1840 }, { "epoch": 0.63, "grad_norm": 1.5367404222488403, "learning_rate": 0.0001, "loss": 8.279, "step": 1850 }, { "epoch": 0.63, "grad_norm": 1.5058660507202148, "learning_rate": 0.0001, "loss": 8.2561, "step": 1860 }, { "epoch": 0.64, "grad_norm": 1.4864140748977661, "learning_rate": 0.0001, "loss": 8.2908, "step": 1870 }, { "epoch": 0.64, "grad_norm": 1.6639902591705322, "learning_rate": 0.0001, "loss": 8.2606, "step": 1880 }, { "epoch": 0.64, "grad_norm": 1.6261042356491089, "learning_rate": 0.0001, "loss": 8.2199, "step": 1890 }, { "epoch": 0.65, "grad_norm": 1.6041576862335205, "learning_rate": 0.0001, "loss": 8.2398, "step": 1900 }, { "epoch": 0.65, "grad_norm": 1.4379785060882568, "learning_rate": 0.0001, "loss": 8.2312, "step": 1910 }, { "epoch": 0.65, "grad_norm": 1.5959835052490234, "learning_rate": 0.0001, "loss": 8.234, "step": 1920 }, { "epoch": 0.66, "grad_norm": 1.6284434795379639, "learning_rate": 0.0001, "loss": 8.1663, "step": 1930 }, { "epoch": 0.66, "grad_norm": 1.622789978981018, "learning_rate": 0.0001, "loss": 8.1825, "step": 1940 }, { "epoch": 0.66, "grad_norm": 1.634716272354126, "learning_rate": 0.0001, "loss": 8.1376, "step": 1950 }, { "epoch": 0.67, "grad_norm": 1.6884840726852417, "learning_rate": 0.0001, "loss": 8.2065, "step": 1960 }, { "epoch": 0.67, "grad_norm": 1.5878825187683105, "learning_rate": 0.0001, "loss": 8.1239, "step": 1970 }, { "epoch": 0.67, "grad_norm": 1.5738948583602905, "learning_rate": 0.0001, "loss": 8.1848, "step": 1980 }, { "epoch": 0.68, "grad_norm": 1.68436861038208, "learning_rate": 0.0001, "loss": 8.1342, "step": 1990 }, { "epoch": 0.68, "grad_norm": 1.5968626737594604, "learning_rate": 0.0001, "loss": 8.1406, "step": 2000 }, { "epoch": 0.68, "grad_norm": 1.5772038698196411, "learning_rate": 0.0001, "loss": 8.1452, "step": 2010 }, { "epoch": 0.69, "grad_norm": 1.751119613647461, "learning_rate": 0.0001, "loss": 8.0963, "step": 2020 }, { "epoch": 0.69, "grad_norm": 1.6801263093948364, "learning_rate": 0.0001, "loss": 8.1235, "step": 2030 }, { "epoch": 0.69, "grad_norm": 1.6694085597991943, "learning_rate": 0.0001, "loss": 8.0969, "step": 2040 }, { "epoch": 0.7, "grad_norm": 1.7246935367584229, "learning_rate": 0.0001, "loss": 8.1064, "step": 2050 }, { "epoch": 0.7, "grad_norm": 1.708856225013733, "learning_rate": 0.0001, "loss": 8.0842, "step": 2060 }, { "epoch": 0.7, "grad_norm": 1.648662805557251, "learning_rate": 0.0001, "loss": 8.0483, "step": 2070 }, { "epoch": 0.71, "grad_norm": 1.6835368871688843, "learning_rate": 0.0001, "loss": 8.0595, "step": 2080 }, { "epoch": 0.71, "grad_norm": 1.725957989692688, "learning_rate": 0.0001, "loss": 8.0901, "step": 2090 }, { "epoch": 0.71, "grad_norm": 1.6435219049453735, "learning_rate": 0.0001, "loss": 8.0726, "step": 2100 }, { "epoch": 0.72, "grad_norm": 1.6143757104873657, "learning_rate": 0.0001, "loss": 7.9797, "step": 2110 }, { "epoch": 0.72, "grad_norm": 1.8680998086929321, "learning_rate": 0.0001, "loss": 8.0061, "step": 2120 }, { "epoch": 0.72, "grad_norm": 1.7306435108184814, "learning_rate": 0.0001, "loss": 8.0243, "step": 2130 }, { "epoch": 0.73, "grad_norm": 1.7048896551132202, "learning_rate": 0.0001, "loss": 8.0616, "step": 2140 }, { "epoch": 0.73, "grad_norm": 1.7379491329193115, "learning_rate": 0.0001, "loss": 8.0672, "step": 2150 }, { "epoch": 0.73, "grad_norm": 1.7977313995361328, "learning_rate": 0.0001, "loss": 8.0118, "step": 2160 }, { "epoch": 0.74, "grad_norm": 1.70112943649292, "learning_rate": 0.0001, "loss": 7.9786, "step": 2170 }, { "epoch": 0.74, "grad_norm": 1.7871379852294922, "learning_rate": 0.0001, "loss": 8.0074, "step": 2180 }, { "epoch": 0.74, "grad_norm": 1.7438380718231201, "learning_rate": 0.0001, "loss": 8.0325, "step": 2190 }, { "epoch": 0.75, "grad_norm": 1.8116419315338135, "learning_rate": 0.0001, "loss": 8.0519, "step": 2200 }, { "epoch": 0.75, "grad_norm": 1.9109293222427368, "learning_rate": 0.0001, "loss": 8.0154, "step": 2210 }, { "epoch": 0.75, "grad_norm": 1.7357542514801025, "learning_rate": 0.0001, "loss": 8.0222, "step": 2220 }, { "epoch": 0.76, "grad_norm": 1.745240330696106, "learning_rate": 0.0001, "loss": 7.9692, "step": 2230 }, { "epoch": 0.76, "grad_norm": 1.757216453552246, "learning_rate": 0.0001, "loss": 7.8783, "step": 2240 }, { "epoch": 0.76, "grad_norm": 1.8003623485565186, "learning_rate": 0.0001, "loss": 7.9668, "step": 2250 }, { "epoch": 0.77, "grad_norm": 1.7825720310211182, "learning_rate": 0.0001, "loss": 7.9397, "step": 2260 }, { "epoch": 0.77, "grad_norm": 1.9213591814041138, "learning_rate": 0.0001, "loss": 8.0387, "step": 2270 }, { "epoch": 0.77, "grad_norm": 1.8614377975463867, "learning_rate": 0.0001, "loss": 7.9599, "step": 2280 }, { "epoch": 0.78, "grad_norm": 1.8351399898529053, "learning_rate": 0.0001, "loss": 7.9109, "step": 2290 }, { "epoch": 0.78, "grad_norm": 1.7677947282791138, "learning_rate": 0.0001, "loss": 7.9349, "step": 2300 }, { "epoch": 0.79, "grad_norm": 1.916437029838562, "learning_rate": 0.0001, "loss": 7.9398, "step": 2310 }, { "epoch": 0.79, "grad_norm": 1.7147809267044067, "learning_rate": 0.0001, "loss": 7.8875, "step": 2320 }, { "epoch": 0.79, "grad_norm": 1.7566685676574707, "learning_rate": 0.0001, "loss": 8.0169, "step": 2330 }, { "epoch": 0.8, "grad_norm": 2.0143845081329346, "learning_rate": 0.0001, "loss": 7.9364, "step": 2340 }, { "epoch": 0.8, "grad_norm": 1.8107824325561523, "learning_rate": 0.0001, "loss": 7.9327, "step": 2350 }, { "epoch": 0.8, "grad_norm": 1.817542314529419, "learning_rate": 0.0001, "loss": 7.923, "step": 2360 }, { "epoch": 0.81, "grad_norm": 1.7315311431884766, "learning_rate": 0.0001, "loss": 7.8709, "step": 2370 }, { "epoch": 0.81, "grad_norm": 1.8613227605819702, "learning_rate": 0.0001, "loss": 7.8522, "step": 2380 }, { "epoch": 0.81, "grad_norm": 1.8293215036392212, "learning_rate": 0.0001, "loss": 7.9198, "step": 2390 }, { "epoch": 0.82, "grad_norm": 1.8429689407348633, "learning_rate": 0.0001, "loss": 7.8768, "step": 2400 }, { "epoch": 0.82, "grad_norm": 1.8556689023971558, "learning_rate": 0.0001, "loss": 7.8559, "step": 2410 }, { "epoch": 0.82, "grad_norm": 1.832610845565796, "learning_rate": 0.0001, "loss": 7.8548, "step": 2420 }, { "epoch": 0.83, "grad_norm": 1.8542442321777344, "learning_rate": 0.0001, "loss": 7.8004, "step": 2430 }, { "epoch": 0.83, "grad_norm": 1.8391692638397217, "learning_rate": 0.0001, "loss": 7.8565, "step": 2440 }, { "epoch": 0.83, "grad_norm": 1.823958396911621, "learning_rate": 0.0001, "loss": 7.8082, "step": 2450 }, { "epoch": 0.84, "grad_norm": 1.9417130947113037, "learning_rate": 0.0001, "loss": 7.8148, "step": 2460 }, { "epoch": 0.84, "grad_norm": 1.882686734199524, "learning_rate": 0.0001, "loss": 7.8759, "step": 2470 }, { "epoch": 0.84, "grad_norm": 2.1273910999298096, "learning_rate": 0.0001, "loss": 7.7771, "step": 2480 }, { "epoch": 0.85, "grad_norm": 1.8162022829055786, "learning_rate": 0.0001, "loss": 7.8569, "step": 2490 }, { "epoch": 0.85, "grad_norm": 1.7690000534057617, "learning_rate": 0.0001, "loss": 7.8044, "step": 2500 }, { "epoch": 0.85, "grad_norm": 1.802037000656128, "learning_rate": 0.0001, "loss": 7.7328, "step": 2510 }, { "epoch": 0.86, "grad_norm": 1.9244282245635986, "learning_rate": 0.0001, "loss": 7.9286, "step": 2520 }, { "epoch": 0.86, "grad_norm": 1.8962161540985107, "learning_rate": 0.0001, "loss": 7.8084, "step": 2530 }, { "epoch": 0.86, "grad_norm": 1.9069162607192993, "learning_rate": 0.0001, "loss": 7.7968, "step": 2540 }, { "epoch": 0.87, "grad_norm": 1.8832062482833862, "learning_rate": 0.0001, "loss": 7.8471, "step": 2550 }, { "epoch": 0.87, "grad_norm": 1.8760149478912354, "learning_rate": 0.0001, "loss": 7.7647, "step": 2560 }, { "epoch": 0.87, "grad_norm": 1.9329017400741577, "learning_rate": 0.0001, "loss": 7.7911, "step": 2570 }, { "epoch": 0.88, "grad_norm": 1.9016661643981934, "learning_rate": 0.0001, "loss": 7.728, "step": 2580 }, { "epoch": 0.88, "grad_norm": 1.900335431098938, "learning_rate": 0.0001, "loss": 7.7218, "step": 2590 }, { "epoch": 0.88, "grad_norm": 1.9823989868164062, "learning_rate": 0.0001, "loss": 7.7499, "step": 2600 }, { "epoch": 0.89, "grad_norm": 1.8205046653747559, "learning_rate": 0.0001, "loss": 7.7044, "step": 2610 }, { "epoch": 0.89, "grad_norm": 1.9539875984191895, "learning_rate": 0.0001, "loss": 7.7712, "step": 2620 }, { "epoch": 0.89, "grad_norm": 1.910416841506958, "learning_rate": 0.0001, "loss": 7.7335, "step": 2630 }, { "epoch": 0.9, "grad_norm": 2.0304157733917236, "learning_rate": 0.0001, "loss": 7.7906, "step": 2640 }, { "epoch": 0.9, "grad_norm": 1.9031319618225098, "learning_rate": 0.0001, "loss": 7.7231, "step": 2650 }, { "epoch": 0.9, "grad_norm": 1.8728690147399902, "learning_rate": 0.0001, "loss": 7.7539, "step": 2660 }, { "epoch": 0.91, "grad_norm": 2.2379097938537598, "learning_rate": 0.0001, "loss": 7.8118, "step": 2670 }, { "epoch": 0.91, "grad_norm": 2.289486885070801, "learning_rate": 0.0001, "loss": 7.7609, "step": 2680 }, { "epoch": 0.91, "grad_norm": 2.0566112995147705, "learning_rate": 0.0001, "loss": 7.7602, "step": 2690 }, { "epoch": 0.92, "grad_norm": 1.9911762475967407, "learning_rate": 0.0001, "loss": 7.6783, "step": 2700 }, { "epoch": 0.92, "grad_norm": 1.8722708225250244, "learning_rate": 0.0001, "loss": 7.7683, "step": 2710 }, { "epoch": 0.92, "grad_norm": 2.0248823165893555, "learning_rate": 0.0001, "loss": 7.6759, "step": 2720 }, { "epoch": 0.93, "grad_norm": 2.038174867630005, "learning_rate": 0.0001, "loss": 7.6805, "step": 2730 }, { "epoch": 0.93, "grad_norm": 1.9096912145614624, "learning_rate": 0.0001, "loss": 7.622, "step": 2740 }, { "epoch": 0.93, "grad_norm": 1.9723740816116333, "learning_rate": 0.0001, "loss": 7.7454, "step": 2750 }, { "epoch": 0.94, "grad_norm": 2.0898845195770264, "learning_rate": 0.0001, "loss": 7.7406, "step": 2760 }, { "epoch": 0.94, "grad_norm": 1.9320049285888672, "learning_rate": 0.0001, "loss": 7.6579, "step": 2770 }, { "epoch": 0.94, "grad_norm": 2.080528497695923, "learning_rate": 0.0001, "loss": 7.7255, "step": 2780 }, { "epoch": 0.95, "grad_norm": 2.2108402252197266, "learning_rate": 0.0001, "loss": 7.6704, "step": 2790 }, { "epoch": 0.95, "grad_norm": 1.9102885723114014, "learning_rate": 0.0001, "loss": 7.6922, "step": 2800 }, { "epoch": 0.95, "grad_norm": 2.090742349624634, "learning_rate": 0.0001, "loss": 7.6837, "step": 2810 }, { "epoch": 0.96, "grad_norm": 2.0163843631744385, "learning_rate": 0.0001, "loss": 7.6791, "step": 2820 }, { "epoch": 0.96, "grad_norm": 2.0053796768188477, "learning_rate": 0.0001, "loss": 7.7171, "step": 2830 }, { "epoch": 0.97, "grad_norm": 1.956453800201416, "learning_rate": 0.0001, "loss": 7.6368, "step": 2840 }, { "epoch": 0.97, "grad_norm": 2.0385525226593018, "learning_rate": 0.0001, "loss": 7.5701, "step": 2850 }, { "epoch": 0.97, "grad_norm": 2.113161325454712, "learning_rate": 0.0001, "loss": 7.666, "step": 2860 }, { "epoch": 0.98, "grad_norm": 2.1949219703674316, "learning_rate": 0.0001, "loss": 7.7483, "step": 2870 }, { "epoch": 0.98, "grad_norm": 1.9858150482177734, "learning_rate": 0.0001, "loss": 7.5969, "step": 2880 }, { "epoch": 0.98, "grad_norm": 2.061603546142578, "learning_rate": 0.0001, "loss": 7.6143, "step": 2890 }, { "epoch": 0.99, "grad_norm": 1.8981903791427612, "learning_rate": 0.0001, "loss": 7.6184, "step": 2900 }, { "epoch": 0.99, "grad_norm": 1.9247734546661377, "learning_rate": 0.0001, "loss": 7.5995, "step": 2910 }, { "epoch": 0.99, "grad_norm": 2.1106338500976562, "learning_rate": 0.0001, "loss": 7.7138, "step": 2920 }, { "epoch": 1.0, "grad_norm": 2.051254987716675, "learning_rate": 0.0001, "loss": 7.7863, "step": 2930 }, { "epoch": 1.0, "grad_norm": 2.010615825653076, "learning_rate": 0.0001, "loss": 7.6377, "step": 2940 }, { "epoch": 1.0, "grad_norm": 1.9900637865066528, "learning_rate": 0.0001, "loss": 7.6389, "step": 2950 }, { "epoch": 1.01, "grad_norm": 2.054337501525879, "learning_rate": 0.0001, "loss": 7.6445, "step": 2960 }, { "epoch": 1.01, "grad_norm": 1.9585248231887817, "learning_rate": 0.0001, "loss": 7.6182, "step": 2970 }, { "epoch": 1.01, "grad_norm": 2.0628204345703125, "learning_rate": 0.0001, "loss": 7.5937, "step": 2980 }, { "epoch": 1.02, "grad_norm": 2.0061392784118652, "learning_rate": 0.0001, "loss": 7.6077, "step": 2990 }, { "epoch": 1.02, "grad_norm": 1.9314968585968018, "learning_rate": 0.0001, "loss": 7.592, "step": 3000 }, { "epoch": 1.02, "grad_norm": 1.9211483001708984, "learning_rate": 0.0001, "loss": 7.5896, "step": 3010 }, { "epoch": 1.03, "grad_norm": 1.967714786529541, "learning_rate": 0.0001, "loss": 7.5541, "step": 3020 }, { "epoch": 1.03, "grad_norm": 1.9651085138320923, "learning_rate": 0.0001, "loss": 7.5424, "step": 3030 }, { "epoch": 1.03, "grad_norm": 2.044400453567505, "learning_rate": 0.0001, "loss": 7.5674, "step": 3040 }, { "epoch": 1.04, "grad_norm": 2.03792142868042, "learning_rate": 0.0001, "loss": 7.5019, "step": 3050 }, { "epoch": 1.04, "grad_norm": 2.0937745571136475, "learning_rate": 0.0001, "loss": 7.6097, "step": 3060 }, { "epoch": 1.04, "grad_norm": 2.1240177154541016, "learning_rate": 0.0001, "loss": 7.5857, "step": 3070 }, { "epoch": 1.05, "grad_norm": 2.1132490634918213, "learning_rate": 0.0001, "loss": 7.5241, "step": 3080 }, { "epoch": 1.05, "grad_norm": 2.132042646408081, "learning_rate": 0.0001, "loss": 7.5651, "step": 3090 }, { "epoch": 1.05, "grad_norm": 2.032273292541504, "learning_rate": 0.0001, "loss": 7.5327, "step": 3100 }, { "epoch": 1.06, "grad_norm": 2.105538845062256, "learning_rate": 0.0001, "loss": 7.5053, "step": 3110 }, { "epoch": 1.06, "grad_norm": 2.1169042587280273, "learning_rate": 0.0001, "loss": 7.5133, "step": 3120 }, { "epoch": 1.06, "grad_norm": 2.0327932834625244, "learning_rate": 0.0001, "loss": 7.5425, "step": 3130 }, { "epoch": 1.07, "grad_norm": 2.1249454021453857, "learning_rate": 0.0001, "loss": 7.5475, "step": 3140 }, { "epoch": 1.07, "grad_norm": 2.0636510848999023, "learning_rate": 0.0001, "loss": 7.5902, "step": 3150 }, { "epoch": 1.07, "grad_norm": 2.4280169010162354, "learning_rate": 0.0001, "loss": 7.5048, "step": 3160 }, { "epoch": 1.08, "grad_norm": 2.1936709880828857, "learning_rate": 0.0001, "loss": 7.5676, "step": 3170 }, { "epoch": 1.08, "grad_norm": 2.1290299892425537, "learning_rate": 0.0001, "loss": 7.45, "step": 3180 }, { "epoch": 1.08, "grad_norm": 2.076470136642456, "learning_rate": 0.0001, "loss": 7.4977, "step": 3190 }, { "epoch": 1.09, "grad_norm": 1.9378889799118042, "learning_rate": 0.0001, "loss": 7.4409, "step": 3200 }, { "epoch": 1.09, "grad_norm": 2.13578724861145, "learning_rate": 0.0001, "loss": 7.5446, "step": 3210 }, { "epoch": 1.09, "grad_norm": 2.1130871772766113, "learning_rate": 0.0001, "loss": 7.5431, "step": 3220 }, { "epoch": 1.1, "grad_norm": 2.072411298751831, "learning_rate": 0.0001, "loss": 7.3864, "step": 3230 }, { "epoch": 1.1, "grad_norm": 2.1405575275421143, "learning_rate": 0.0001, "loss": 7.4161, "step": 3240 }, { "epoch": 1.1, "grad_norm": 2.0535531044006348, "learning_rate": 0.0001, "loss": 7.4866, "step": 3250 }, { "epoch": 1.11, "grad_norm": 2.2262203693389893, "learning_rate": 0.0001, "loss": 7.5469, "step": 3260 }, { "epoch": 1.11, "grad_norm": 2.0053820610046387, "learning_rate": 0.0001, "loss": 7.4489, "step": 3270 }, { "epoch": 1.11, "grad_norm": 2.116321325302124, "learning_rate": 0.0001, "loss": 7.4585, "step": 3280 }, { "epoch": 1.12, "grad_norm": 2.023549795150757, "learning_rate": 0.0001, "loss": 7.544, "step": 3290 }, { "epoch": 1.12, "grad_norm": 2.096761465072632, "learning_rate": 0.0001, "loss": 7.408, "step": 3300 }, { "epoch": 1.12, "grad_norm": 2.1344809532165527, "learning_rate": 0.0001, "loss": 7.4602, "step": 3310 }, { "epoch": 1.13, "grad_norm": 2.1644928455352783, "learning_rate": 0.0001, "loss": 7.4324, "step": 3320 }, { "epoch": 1.13, "grad_norm": 2.2048068046569824, "learning_rate": 0.0001, "loss": 7.5078, "step": 3330 }, { "epoch": 1.14, "grad_norm": 2.1286168098449707, "learning_rate": 0.0001, "loss": 7.4117, "step": 3340 }, { "epoch": 1.14, "grad_norm": 1.9755229949951172, "learning_rate": 0.0001, "loss": 7.4233, "step": 3350 }, { "epoch": 1.14, "grad_norm": 2.165160655975342, "learning_rate": 0.0001, "loss": 7.5074, "step": 3360 }, { "epoch": 1.15, "grad_norm": 2.085435152053833, "learning_rate": 0.0001, "loss": 7.4901, "step": 3370 }, { "epoch": 1.15, "grad_norm": 2.0615718364715576, "learning_rate": 0.0001, "loss": 7.4103, "step": 3380 }, { "epoch": 1.15, "grad_norm": 2.113546371459961, "learning_rate": 0.0001, "loss": 7.4724, "step": 3390 }, { "epoch": 1.16, "grad_norm": 2.0932693481445312, "learning_rate": 0.0001, "loss": 7.4305, "step": 3400 }, { "epoch": 1.16, "grad_norm": 2.0436511039733887, "learning_rate": 0.0001, "loss": 7.4556, "step": 3410 }, { "epoch": 1.16, "grad_norm": 1.9966588020324707, "learning_rate": 0.0001, "loss": 7.3095, "step": 3420 }, { "epoch": 1.17, "grad_norm": 2.1284430027008057, "learning_rate": 0.0001, "loss": 7.3911, "step": 3430 }, { "epoch": 1.17, "grad_norm": 2.105534553527832, "learning_rate": 0.0001, "loss": 7.4409, "step": 3440 }, { "epoch": 1.17, "grad_norm": 2.23077654838562, "learning_rate": 0.0001, "loss": 7.4349, "step": 3450 }, { "epoch": 1.18, "grad_norm": 2.1125681400299072, "learning_rate": 0.0001, "loss": 7.4981, "step": 3460 }, { "epoch": 1.18, "grad_norm": 2.067131757736206, "learning_rate": 0.0001, "loss": 7.4678, "step": 3470 }, { "epoch": 1.18, "grad_norm": 2.220170021057129, "learning_rate": 0.0001, "loss": 7.4274, "step": 3480 }, { "epoch": 1.19, "grad_norm": 2.143610954284668, "learning_rate": 0.0001, "loss": 7.4003, "step": 3490 }, { "epoch": 1.19, "grad_norm": 2.099076747894287, "learning_rate": 0.0001, "loss": 7.383, "step": 3500 }, { "epoch": 1.19, "grad_norm": 2.046013355255127, "learning_rate": 0.0001, "loss": 7.3424, "step": 3510 }, { "epoch": 1.2, "grad_norm": 2.070675849914551, "learning_rate": 0.0001, "loss": 7.4347, "step": 3520 }, { "epoch": 1.2, "grad_norm": 2.0689220428466797, "learning_rate": 0.0001, "loss": 7.402, "step": 3530 }, { "epoch": 1.2, "grad_norm": 2.303022623062134, "learning_rate": 0.0001, "loss": 7.3843, "step": 3540 }, { "epoch": 1.21, "grad_norm": 2.153381586074829, "learning_rate": 0.0001, "loss": 7.3525, "step": 3550 }, { "epoch": 1.21, "grad_norm": 2.286656618118286, "learning_rate": 0.0001, "loss": 7.3751, "step": 3560 }, { "epoch": 1.21, "grad_norm": 2.0820603370666504, "learning_rate": 0.0001, "loss": 7.3258, "step": 3570 }, { "epoch": 1.22, "grad_norm": 3.858593702316284, "learning_rate": 0.0001, "loss": 7.3927, "step": 3580 }, { "epoch": 1.22, "grad_norm": 2.2527058124542236, "learning_rate": 0.0001, "loss": 7.3588, "step": 3590 }, { "epoch": 1.22, "grad_norm": 2.079145669937134, "learning_rate": 0.0001, "loss": 7.374, "step": 3600 }, { "epoch": 1.23, "grad_norm": 2.1242687702178955, "learning_rate": 0.0001, "loss": 7.3516, "step": 3610 }, { "epoch": 1.23, "grad_norm": 2.272284984588623, "learning_rate": 0.0001, "loss": 7.3299, "step": 3620 }, { "epoch": 1.23, "grad_norm": 2.10646653175354, "learning_rate": 0.0001, "loss": 7.4278, "step": 3630 }, { "epoch": 1.24, "grad_norm": 2.0857315063476562, "learning_rate": 0.0001, "loss": 7.3894, "step": 3640 }, { "epoch": 1.24, "grad_norm": 2.268275260925293, "learning_rate": 0.0001, "loss": 7.342, "step": 3650 }, { "epoch": 1.24, "grad_norm": 2.0469658374786377, "learning_rate": 0.0001, "loss": 7.3556, "step": 3660 }, { "epoch": 1.25, "grad_norm": 2.1571669578552246, "learning_rate": 0.0001, "loss": 7.3198, "step": 3670 }, { "epoch": 1.25, "grad_norm": 2.3082923889160156, "learning_rate": 0.0001, "loss": 7.3687, "step": 3680 }, { "epoch": 1.25, "grad_norm": 2.8455331325531006, "learning_rate": 0.0001, "loss": 7.3044, "step": 3690 }, { "epoch": 1.26, "grad_norm": 2.215144634246826, "learning_rate": 0.0001, "loss": 7.2532, "step": 3700 }, { "epoch": 1.26, "grad_norm": 2.159938097000122, "learning_rate": 0.0001, "loss": 7.292, "step": 3710 }, { "epoch": 1.26, "grad_norm": 2.175513744354248, "learning_rate": 0.0001, "loss": 7.4197, "step": 3720 }, { "epoch": 1.27, "grad_norm": 2.287431478500366, "learning_rate": 0.0001, "loss": 7.3119, "step": 3730 }, { "epoch": 1.27, "grad_norm": 2.1502156257629395, "learning_rate": 0.0001, "loss": 7.2915, "step": 3740 }, { "epoch": 1.27, "grad_norm": 2.184866189956665, "learning_rate": 0.0001, "loss": 7.3259, "step": 3750 }, { "epoch": 1.28, "grad_norm": 2.73464298248291, "learning_rate": 0.0001, "loss": 7.299, "step": 3760 }, { "epoch": 1.28, "grad_norm": 2.0966055393218994, "learning_rate": 0.0001, "loss": 7.3202, "step": 3770 }, { "epoch": 1.28, "grad_norm": 2.2224791049957275, "learning_rate": 0.0001, "loss": 7.2796, "step": 3780 }, { "epoch": 1.29, "grad_norm": 2.3259451389312744, "learning_rate": 0.0001, "loss": 7.3665, "step": 3790 }, { "epoch": 1.29, "grad_norm": 2.251563310623169, "learning_rate": 0.0001, "loss": 7.3481, "step": 3800 }, { "epoch": 1.29, "grad_norm": 2.183901786804199, "learning_rate": 0.0001, "loss": 7.2849, "step": 3810 }, { "epoch": 1.3, "grad_norm": 2.1441636085510254, "learning_rate": 0.0001, "loss": 7.3627, "step": 3820 }, { "epoch": 1.3, "grad_norm": 2.17110538482666, "learning_rate": 0.0001, "loss": 7.2228, "step": 3830 }, { "epoch": 1.3, "grad_norm": 2.1089303493499756, "learning_rate": 0.0001, "loss": 7.3896, "step": 3840 }, { "epoch": 1.31, "grad_norm": 2.2746925354003906, "learning_rate": 0.0001, "loss": 7.3009, "step": 3850 }, { "epoch": 1.31, "grad_norm": 2.163635730743408, "learning_rate": 0.0001, "loss": 7.2944, "step": 3860 }, { "epoch": 1.32, "grad_norm": 2.1355977058410645, "learning_rate": 0.0001, "loss": 7.2778, "step": 3870 }, { "epoch": 1.32, "grad_norm": 2.229297161102295, "learning_rate": 0.0001, "loss": 7.3898, "step": 3880 }, { "epoch": 1.32, "grad_norm": 2.233773708343506, "learning_rate": 0.0001, "loss": 7.2333, "step": 3890 }, { "epoch": 1.33, "grad_norm": 2.2442920207977295, "learning_rate": 0.0001, "loss": 7.3783, "step": 3900 }, { "epoch": 1.33, "grad_norm": 2.113609552383423, "learning_rate": 0.0001, "loss": 7.2231, "step": 3910 }, { "epoch": 1.33, "grad_norm": 2.049269199371338, "learning_rate": 0.0001, "loss": 7.2754, "step": 3920 }, { "epoch": 1.34, "grad_norm": 2.2847328186035156, "learning_rate": 0.0001, "loss": 7.2955, "step": 3930 }, { "epoch": 1.34, "grad_norm": 2.0207245349884033, "learning_rate": 0.0001, "loss": 7.2091, "step": 3940 }, { "epoch": 1.34, "grad_norm": 2.120429277420044, "learning_rate": 0.0001, "loss": 7.3541, "step": 3950 }, { "epoch": 1.35, "grad_norm": 2.1240029335021973, "learning_rate": 0.0001, "loss": 7.2128, "step": 3960 }, { "epoch": 1.35, "grad_norm": 2.112290859222412, "learning_rate": 0.0001, "loss": 7.3131, "step": 3970 }, { "epoch": 1.35, "grad_norm": 2.1971917152404785, "learning_rate": 0.0001, "loss": 7.1242, "step": 3980 }, { "epoch": 1.36, "grad_norm": 2.2146995067596436, "learning_rate": 0.0001, "loss": 7.307, "step": 3990 }, { "epoch": 1.36, "grad_norm": 2.1895055770874023, "learning_rate": 0.0001, "loss": 7.2525, "step": 4000 }, { "epoch": 1.36, "grad_norm": 2.1644904613494873, "learning_rate": 0.0001, "loss": 7.2756, "step": 4010 }, { "epoch": 1.37, "grad_norm": 2.307317018508911, "learning_rate": 0.0001, "loss": 7.2806, "step": 4020 }, { "epoch": 1.37, "grad_norm": 2.1317138671875, "learning_rate": 0.0001, "loss": 7.1696, "step": 4030 }, { "epoch": 1.37, "grad_norm": 2.378333568572998, "learning_rate": 0.0001, "loss": 7.2553, "step": 4040 }, { "epoch": 1.38, "grad_norm": 2.156888246536255, "learning_rate": 0.0001, "loss": 7.2355, "step": 4050 }, { "epoch": 1.38, "grad_norm": 2.226720094680786, "learning_rate": 0.0001, "loss": 7.24, "step": 4060 }, { "epoch": 1.38, "grad_norm": 2.2522284984588623, "learning_rate": 0.0001, "loss": 7.2193, "step": 4070 }, { "epoch": 1.39, "grad_norm": 2.2693936824798584, "learning_rate": 0.0001, "loss": 7.2314, "step": 4080 }, { "epoch": 1.39, "grad_norm": 2.3695926666259766, "learning_rate": 0.0001, "loss": 7.2259, "step": 4090 }, { "epoch": 1.39, "grad_norm": 2.290940761566162, "learning_rate": 0.0001, "loss": 7.2789, "step": 4100 }, { "epoch": 1.4, "grad_norm": 2.2457990646362305, "learning_rate": 0.0001, "loss": 7.2184, "step": 4110 }, { "epoch": 1.4, "grad_norm": 2.299743413925171, "learning_rate": 0.0001, "loss": 7.2948, "step": 4120 }, { "epoch": 1.4, "grad_norm": 2.2357537746429443, "learning_rate": 0.0001, "loss": 7.2268, "step": 4130 }, { "epoch": 1.41, "grad_norm": 2.3687903881073, "learning_rate": 0.0001, "loss": 7.2935, "step": 4140 }, { "epoch": 1.41, "grad_norm": 2.255699396133423, "learning_rate": 0.0001, "loss": 7.1908, "step": 4150 }, { "epoch": 1.41, "grad_norm": 2.3741886615753174, "learning_rate": 0.0001, "loss": 7.1706, "step": 4160 }, { "epoch": 1.42, "grad_norm": 2.2647061347961426, "learning_rate": 0.0001, "loss": 7.3281, "step": 4170 }, { "epoch": 1.42, "grad_norm": 2.179403781890869, "learning_rate": 0.0001, "loss": 7.2573, "step": 4180 }, { "epoch": 1.42, "grad_norm": 2.2076845169067383, "learning_rate": 0.0001, "loss": 7.2239, "step": 4190 }, { "epoch": 1.43, "grad_norm": 2.1713955402374268, "learning_rate": 0.0001, "loss": 7.1854, "step": 4200 }, { "epoch": 1.43, "grad_norm": 2.224914312362671, "learning_rate": 0.0001, "loss": 7.1896, "step": 4210 }, { "epoch": 1.43, "grad_norm": 2.277390241622925, "learning_rate": 0.0001, "loss": 7.1617, "step": 4220 }, { "epoch": 1.44, "grad_norm": 2.262953996658325, "learning_rate": 0.0001, "loss": 7.1675, "step": 4230 }, { "epoch": 1.44, "grad_norm": 2.307990312576294, "learning_rate": 0.0001, "loss": 7.2488, "step": 4240 }, { "epoch": 1.44, "grad_norm": 2.343857526779175, "learning_rate": 0.0001, "loss": 7.1453, "step": 4250 }, { "epoch": 1.45, "grad_norm": 2.225097179412842, "learning_rate": 0.0001, "loss": 7.1337, "step": 4260 }, { "epoch": 1.45, "grad_norm": 2.240400791168213, "learning_rate": 0.0001, "loss": 7.2477, "step": 4270 }, { "epoch": 1.45, "grad_norm": 2.144904613494873, "learning_rate": 0.0001, "loss": 7.2, "step": 4280 }, { "epoch": 1.46, "grad_norm": 2.24607253074646, "learning_rate": 0.0001, "loss": 7.1434, "step": 4290 }, { "epoch": 1.46, "grad_norm": 2.3587682247161865, "learning_rate": 0.0001, "loss": 7.2904, "step": 4300 }, { "epoch": 1.46, "grad_norm": 2.3396973609924316, "learning_rate": 0.0001, "loss": 7.157, "step": 4310 }, { "epoch": 1.47, "grad_norm": 2.254444122314453, "learning_rate": 0.0001, "loss": 7.203, "step": 4320 }, { "epoch": 1.47, "grad_norm": 2.3054215908050537, "learning_rate": 0.0001, "loss": 7.177, "step": 4330 }, { "epoch": 1.47, "grad_norm": 2.1978611946105957, "learning_rate": 0.0001, "loss": 7.1707, "step": 4340 }, { "epoch": 1.48, "grad_norm": 2.1555399894714355, "learning_rate": 0.0001, "loss": 7.0313, "step": 4350 }, { "epoch": 1.48, "grad_norm": 2.261692762374878, "learning_rate": 0.0001, "loss": 7.1372, "step": 4360 }, { "epoch": 1.49, "grad_norm": 2.1888833045959473, "learning_rate": 0.0001, "loss": 7.1493, "step": 4370 }, { "epoch": 1.49, "grad_norm": 2.3072457313537598, "learning_rate": 0.0001, "loss": 7.0555, "step": 4380 }, { "epoch": 1.49, "grad_norm": 2.1015727519989014, "learning_rate": 0.0001, "loss": 7.2164, "step": 4390 }, { "epoch": 1.5, "grad_norm": 2.2788422107696533, "learning_rate": 0.0001, "loss": 7.1169, "step": 4400 }, { "epoch": 1.5, "grad_norm": 2.180575132369995, "learning_rate": 0.0001, "loss": 7.1008, "step": 4410 }, { "epoch": 1.5, "grad_norm": 2.234184503555298, "learning_rate": 0.0001, "loss": 7.1023, "step": 4420 }, { "epoch": 1.51, "grad_norm": 2.4093639850616455, "learning_rate": 0.0001, "loss": 7.1913, "step": 4430 }, { "epoch": 1.51, "grad_norm": 2.1984338760375977, "learning_rate": 0.0001, "loss": 7.2237, "step": 4440 }, { "epoch": 1.51, "grad_norm": 2.2517895698547363, "learning_rate": 0.0001, "loss": 7.114, "step": 4450 }, { "epoch": 1.52, "grad_norm": 2.231074094772339, "learning_rate": 0.0001, "loss": 7.1897, "step": 4460 }, { "epoch": 1.52, "grad_norm": 2.7863545417785645, "learning_rate": 0.0001, "loss": 7.0944, "step": 4470 }, { "epoch": 1.52, "grad_norm": 2.2908711433410645, "learning_rate": 0.0001, "loss": 7.1134, "step": 4480 }, { "epoch": 1.53, "grad_norm": 2.216499090194702, "learning_rate": 0.0001, "loss": 7.0969, "step": 4490 }, { "epoch": 1.53, "grad_norm": 2.1753108501434326, "learning_rate": 0.0001, "loss": 7.1187, "step": 4500 }, { "epoch": 1.53, "grad_norm": 2.310413360595703, "learning_rate": 0.0001, "loss": 7.183, "step": 4510 }, { "epoch": 1.54, "grad_norm": 2.270989179611206, "learning_rate": 0.0001, "loss": 7.1332, "step": 4520 }, { "epoch": 1.54, "grad_norm": 2.1965484619140625, "learning_rate": 0.0001, "loss": 7.1457, "step": 4530 }, { "epoch": 1.54, "grad_norm": 2.1956119537353516, "learning_rate": 0.0001, "loss": 7.2644, "step": 4540 }, { "epoch": 1.55, "grad_norm": 2.54412579536438, "learning_rate": 0.0001, "loss": 7.1193, "step": 4550 }, { "epoch": 1.55, "grad_norm": 2.1950461864471436, "learning_rate": 0.0001, "loss": 7.1085, "step": 4560 }, { "epoch": 1.55, "grad_norm": 2.4147634506225586, "learning_rate": 0.0001, "loss": 7.1386, "step": 4570 }, { "epoch": 1.56, "grad_norm": 2.2770273685455322, "learning_rate": 0.0001, "loss": 7.1362, "step": 4580 }, { "epoch": 1.56, "grad_norm": 2.305471181869507, "learning_rate": 0.0001, "loss": 7.0386, "step": 4590 }, { "epoch": 1.56, "grad_norm": 2.2918567657470703, "learning_rate": 0.0001, "loss": 7.1585, "step": 4600 }, { "epoch": 1.57, "grad_norm": 2.2294692993164062, "learning_rate": 0.0001, "loss": 7.1507, "step": 4610 }, { "epoch": 1.57, "grad_norm": 2.0887460708618164, "learning_rate": 0.0001, "loss": 7.099, "step": 4620 }, { "epoch": 1.57, "grad_norm": 2.202714681625366, "learning_rate": 0.0001, "loss": 7.193, "step": 4630 }, { "epoch": 1.58, "grad_norm": 2.342766761779785, "learning_rate": 0.0001, "loss": 7.0854, "step": 4640 }, { "epoch": 1.58, "grad_norm": 2.3064093589782715, "learning_rate": 0.0001, "loss": 7.1019, "step": 4650 }, { "epoch": 1.58, "grad_norm": 2.1647086143493652, "learning_rate": 0.0001, "loss": 6.9739, "step": 4660 }, { "epoch": 1.59, "grad_norm": 2.220046281814575, "learning_rate": 0.0001, "loss": 7.1119, "step": 4670 }, { "epoch": 1.59, "grad_norm": 2.478766918182373, "learning_rate": 0.0001, "loss": 7.0424, "step": 4680 }, { "epoch": 1.59, "grad_norm": 2.349714994430542, "learning_rate": 0.0001, "loss": 7.1136, "step": 4690 }, { "epoch": 1.6, "grad_norm": 2.365969657897949, "learning_rate": 0.0001, "loss": 7.1204, "step": 4700 }, { "epoch": 1.6, "grad_norm": 2.3459932804107666, "learning_rate": 0.0001, "loss": 7.0308, "step": 4710 }, { "epoch": 1.6, "grad_norm": 2.2051196098327637, "learning_rate": 0.0001, "loss": 7.1009, "step": 4720 }, { "epoch": 1.61, "grad_norm": 2.2125892639160156, "learning_rate": 0.0001, "loss": 7.012, "step": 4730 }, { "epoch": 1.61, "grad_norm": 2.402265787124634, "learning_rate": 0.0001, "loss": 7.1284, "step": 4740 }, { "epoch": 1.61, "grad_norm": 2.1848082542419434, "learning_rate": 0.0001, "loss": 7.0313, "step": 4750 }, { "epoch": 1.62, "grad_norm": 2.2981090545654297, "learning_rate": 0.0001, "loss": 7.0365, "step": 4760 }, { "epoch": 1.62, "grad_norm": 2.3186306953430176, "learning_rate": 0.0001, "loss": 7.099, "step": 4770 }, { "epoch": 1.62, "grad_norm": 2.371321678161621, "learning_rate": 0.0001, "loss": 7.0968, "step": 4780 }, { "epoch": 1.63, "grad_norm": 2.278888702392578, "learning_rate": 0.0001, "loss": 7.131, "step": 4790 }, { "epoch": 1.63, "grad_norm": 2.2284281253814697, "learning_rate": 0.0001, "loss": 6.8648, "step": 4800 }, { "epoch": 1.63, "grad_norm": 2.272829532623291, "learning_rate": 0.0001, "loss": 7.0416, "step": 4810 }, { "epoch": 1.64, "grad_norm": 2.224722146987915, "learning_rate": 0.0001, "loss": 7.0871, "step": 4820 }, { "epoch": 1.64, "grad_norm": 2.2158703804016113, "learning_rate": 0.0001, "loss": 7.1796, "step": 4830 }, { "epoch": 1.64, "grad_norm": 2.2695987224578857, "learning_rate": 0.0001, "loss": 7.095, "step": 4840 }, { "epoch": 1.65, "grad_norm": 2.4090540409088135, "learning_rate": 0.0001, "loss": 7.0629, "step": 4850 }, { "epoch": 1.65, "grad_norm": 2.3584811687469482, "learning_rate": 0.0001, "loss": 7.1282, "step": 4860 }, { "epoch": 1.66, "grad_norm": 2.288571357727051, "learning_rate": 0.0001, "loss": 7.0856, "step": 4870 }, { "epoch": 1.66, "grad_norm": 2.195589065551758, "learning_rate": 0.0001, "loss": 6.9848, "step": 4880 }, { "epoch": 1.66, "grad_norm": 2.2008442878723145, "learning_rate": 0.0001, "loss": 7.0626, "step": 4890 }, { "epoch": 1.67, "grad_norm": 2.3728606700897217, "learning_rate": 0.0001, "loss": 7.09, "step": 4900 }, { "epoch": 1.67, "grad_norm": 2.2774598598480225, "learning_rate": 0.0001, "loss": 7.1203, "step": 4910 }, { "epoch": 1.67, "grad_norm": 2.3884873390197754, "learning_rate": 0.0001, "loss": 7.0542, "step": 4920 }, { "epoch": 1.68, "grad_norm": 2.33353590965271, "learning_rate": 0.0001, "loss": 7.0604, "step": 4930 }, { "epoch": 1.68, "grad_norm": 2.2535629272460938, "learning_rate": 0.0001, "loss": 7.04, "step": 4940 }, { "epoch": 1.68, "grad_norm": 2.2882211208343506, "learning_rate": 0.0001, "loss": 7.0488, "step": 4950 }, { "epoch": 1.69, "grad_norm": 2.2261734008789062, "learning_rate": 0.0001, "loss": 7.0044, "step": 4960 }, { "epoch": 1.69, "grad_norm": 2.2294704914093018, "learning_rate": 0.0001, "loss": 7.0224, "step": 4970 }, { "epoch": 1.69, "grad_norm": 2.355883836746216, "learning_rate": 0.0001, "loss": 6.996, "step": 4980 }, { "epoch": 1.7, "grad_norm": 2.255924940109253, "learning_rate": 0.0001, "loss": 7.0028, "step": 4990 }, { "epoch": 1.7, "grad_norm": 2.121638298034668, "learning_rate": 0.0001, "loss": 6.9282, "step": 5000 }, { "epoch": 1.7, "grad_norm": 2.522393226623535, "learning_rate": 0.0001, "loss": 7.0418, "step": 5010 }, { "epoch": 1.71, "grad_norm": 2.3869247436523438, "learning_rate": 0.0001, "loss": 7.1096, "step": 5020 }, { "epoch": 1.71, "grad_norm": 2.2230443954467773, "learning_rate": 0.0001, "loss": 7.0023, "step": 5030 }, { "epoch": 1.71, "grad_norm": 2.2751498222351074, "learning_rate": 0.0001, "loss": 6.9971, "step": 5040 }, { "epoch": 1.72, "grad_norm": 2.3354780673980713, "learning_rate": 0.0001, "loss": 6.9604, "step": 5050 }, { "epoch": 1.72, "grad_norm": 2.2902698516845703, "learning_rate": 0.0001, "loss": 6.9917, "step": 5060 }, { "epoch": 1.72, "grad_norm": 2.3068253993988037, "learning_rate": 0.0001, "loss": 7.0955, "step": 5070 }, { "epoch": 1.73, "grad_norm": 2.175537109375, "learning_rate": 0.0001, "loss": 7.0525, "step": 5080 }, { "epoch": 1.73, "grad_norm": 2.30596923828125, "learning_rate": 0.0001, "loss": 6.9501, "step": 5090 }, { "epoch": 1.73, "grad_norm": 2.3087093830108643, "learning_rate": 0.0001, "loss": 7.068, "step": 5100 }, { "epoch": 1.74, "grad_norm": 2.311290979385376, "learning_rate": 0.0001, "loss": 7.0114, "step": 5110 }, { "epoch": 1.74, "grad_norm": 2.3126213550567627, "learning_rate": 0.0001, "loss": 6.8703, "step": 5120 }, { "epoch": 1.74, "grad_norm": 2.339921236038208, "learning_rate": 0.0001, "loss": 6.9398, "step": 5130 }, { "epoch": 1.75, "grad_norm": 2.385369062423706, "learning_rate": 0.0001, "loss": 7.092, "step": 5140 }, { "epoch": 1.75, "grad_norm": 2.2324063777923584, "learning_rate": 0.0001, "loss": 7.0395, "step": 5150 }, { "epoch": 1.75, "grad_norm": 2.1126644611358643, "learning_rate": 0.0001, "loss": 6.9577, "step": 5160 }, { "epoch": 1.76, "grad_norm": 2.3116745948791504, "learning_rate": 0.0001, "loss": 7.007, "step": 5170 }, { "epoch": 1.76, "grad_norm": 2.302995443344116, "learning_rate": 0.0001, "loss": 7.0157, "step": 5180 }, { "epoch": 1.76, "grad_norm": 2.2893717288970947, "learning_rate": 0.0001, "loss": 7.0359, "step": 5190 }, { "epoch": 1.77, "grad_norm": 2.49497652053833, "learning_rate": 0.0001, "loss": 6.8841, "step": 5200 }, { "epoch": 1.77, "grad_norm": 2.4017276763916016, "learning_rate": 0.0001, "loss": 6.9358, "step": 5210 }, { "epoch": 1.77, "grad_norm": 2.2474894523620605, "learning_rate": 0.0001, "loss": 6.9088, "step": 5220 }, { "epoch": 1.78, "grad_norm": 2.430718183517456, "learning_rate": 0.0001, "loss": 6.9641, "step": 5230 }, { "epoch": 1.78, "grad_norm": 2.2758383750915527, "learning_rate": 0.0001, "loss": 6.8806, "step": 5240 }, { "epoch": 1.78, "grad_norm": 2.4353394508361816, "learning_rate": 0.0001, "loss": 6.9995, "step": 5250 }, { "epoch": 1.79, "grad_norm": 2.167844533920288, "learning_rate": 0.0001, "loss": 6.9315, "step": 5260 }, { "epoch": 1.79, "grad_norm": 2.2327005863189697, "learning_rate": 0.0001, "loss": 6.9489, "step": 5270 }, { "epoch": 1.79, "grad_norm": 2.4071407318115234, "learning_rate": 0.0001, "loss": 6.9209, "step": 5280 }, { "epoch": 1.8, "grad_norm": 2.3251023292541504, "learning_rate": 0.0001, "loss": 7.0063, "step": 5290 }, { "epoch": 1.8, "grad_norm": 2.3144476413726807, "learning_rate": 0.0001, "loss": 6.9339, "step": 5300 }, { "epoch": 1.8, "grad_norm": 2.239985942840576, "learning_rate": 0.0001, "loss": 6.8448, "step": 5310 }, { "epoch": 1.81, "grad_norm": 2.3071162700653076, "learning_rate": 0.0001, "loss": 6.9222, "step": 5320 }, { "epoch": 1.81, "grad_norm": 2.3900599479675293, "learning_rate": 0.0001, "loss": 6.9269, "step": 5330 }, { "epoch": 1.81, "grad_norm": 2.181514024734497, "learning_rate": 0.0001, "loss": 6.8864, "step": 5340 }, { "epoch": 1.82, "grad_norm": 2.2172696590423584, "learning_rate": 0.0001, "loss": 6.9524, "step": 5350 }, { "epoch": 1.82, "grad_norm": 2.4472038745880127, "learning_rate": 0.0001, "loss": 7.0239, "step": 5360 }, { "epoch": 1.82, "grad_norm": 2.205364942550659, "learning_rate": 0.0001, "loss": 6.8968, "step": 5370 }, { "epoch": 1.83, "grad_norm": 2.275344133377075, "learning_rate": 0.0001, "loss": 6.9347, "step": 5380 }, { "epoch": 1.83, "grad_norm": 2.2779319286346436, "learning_rate": 0.0001, "loss": 6.9653, "step": 5390 }, { "epoch": 1.84, "grad_norm": 2.321763038635254, "learning_rate": 0.0001, "loss": 6.9756, "step": 5400 }, { "epoch": 1.84, "grad_norm": 2.4092564582824707, "learning_rate": 0.0001, "loss": 6.9021, "step": 5410 }, { "epoch": 1.84, "grad_norm": 2.5679686069488525, "learning_rate": 0.0001, "loss": 6.9372, "step": 5420 }, { "epoch": 1.85, "grad_norm": 2.332031488418579, "learning_rate": 0.0001, "loss": 6.9713, "step": 5430 }, { "epoch": 1.85, "grad_norm": 2.4025161266326904, "learning_rate": 0.0001, "loss": 6.8268, "step": 5440 }, { "epoch": 1.85, "grad_norm": 2.3501081466674805, "learning_rate": 0.0001, "loss": 6.9569, "step": 5450 }, { "epoch": 1.86, "grad_norm": 2.3788626194000244, "learning_rate": 0.0001, "loss": 6.8993, "step": 5460 }, { "epoch": 1.86, "grad_norm": 2.3662962913513184, "learning_rate": 0.0001, "loss": 6.8359, "step": 5470 }, { "epoch": 1.86, "grad_norm": 2.401604175567627, "learning_rate": 0.0001, "loss": 6.9646, "step": 5480 }, { "epoch": 1.87, "grad_norm": 2.348583698272705, "learning_rate": 0.0001, "loss": 6.913, "step": 5490 }, { "epoch": 1.87, "grad_norm": 2.356675386428833, "learning_rate": 0.0001, "loss": 6.9592, "step": 5500 }, { "epoch": 1.87, "grad_norm": 2.2990479469299316, "learning_rate": 0.0001, "loss": 6.9444, "step": 5510 }, { "epoch": 1.88, "grad_norm": 2.4327516555786133, "learning_rate": 0.0001, "loss": 6.9434, "step": 5520 }, { "epoch": 1.88, "grad_norm": 2.2638111114501953, "learning_rate": 0.0001, "loss": 6.9039, "step": 5530 }, { "epoch": 1.88, "grad_norm": 2.3576090335845947, "learning_rate": 0.0001, "loss": 7.0159, "step": 5540 }, { "epoch": 1.89, "grad_norm": 2.226438522338867, "learning_rate": 0.0001, "loss": 6.9251, "step": 5550 }, { "epoch": 1.89, "grad_norm": 2.3056225776672363, "learning_rate": 0.0001, "loss": 6.915, "step": 5560 }, { "epoch": 1.89, "grad_norm": 2.361037254333496, "learning_rate": 0.0001, "loss": 6.9419, "step": 5570 }, { "epoch": 1.9, "grad_norm": 2.2873964309692383, "learning_rate": 0.0001, "loss": 6.9656, "step": 5580 }, { "epoch": 1.9, "grad_norm": 2.1307671070098877, "learning_rate": 0.0001, "loss": 6.9237, "step": 5590 }, { "epoch": 1.9, "grad_norm": 2.331777572631836, "learning_rate": 0.0001, "loss": 6.8829, "step": 5600 }, { "epoch": 1.91, "grad_norm": 2.4056396484375, "learning_rate": 0.0001, "loss": 6.8417, "step": 5610 }, { "epoch": 1.91, "grad_norm": 2.3956289291381836, "learning_rate": 0.0001, "loss": 6.848, "step": 5620 }, { "epoch": 1.91, "grad_norm": 2.404996156692505, "learning_rate": 0.0001, "loss": 6.8649, "step": 5630 }, { "epoch": 1.92, "grad_norm": 2.32171893119812, "learning_rate": 0.0001, "loss": 6.977, "step": 5640 }, { "epoch": 1.92, "grad_norm": 2.45902681350708, "learning_rate": 0.0001, "loss": 6.905, "step": 5650 }, { "epoch": 1.92, "grad_norm": 2.273726224899292, "learning_rate": 0.0001, "loss": 6.8997, "step": 5660 }, { "epoch": 1.93, "grad_norm": 2.427234172821045, "learning_rate": 0.0001, "loss": 6.8337, "step": 5670 }, { "epoch": 1.93, "grad_norm": 2.4655344486236572, "learning_rate": 0.0001, "loss": 6.8642, "step": 5680 }, { "epoch": 1.93, "grad_norm": 2.356198787689209, "learning_rate": 0.0001, "loss": 6.9213, "step": 5690 }, { "epoch": 1.94, "grad_norm": 2.2886674404144287, "learning_rate": 0.0001, "loss": 6.9349, "step": 5700 }, { "epoch": 1.94, "grad_norm": 2.36144757270813, "learning_rate": 0.0001, "loss": 6.9109, "step": 5710 }, { "epoch": 1.94, "grad_norm": 2.34747576713562, "learning_rate": 0.0001, "loss": 6.875, "step": 5720 }, { "epoch": 1.95, "grad_norm": 2.3149468898773193, "learning_rate": 0.0001, "loss": 6.8872, "step": 5730 }, { "epoch": 1.95, "grad_norm": 2.38944673538208, "learning_rate": 0.0001, "loss": 6.8233, "step": 5740 }, { "epoch": 1.95, "grad_norm": 2.441582441329956, "learning_rate": 0.0001, "loss": 6.9502, "step": 5750 }, { "epoch": 1.96, "grad_norm": 2.380875587463379, "learning_rate": 0.0001, "loss": 6.8632, "step": 5760 }, { "epoch": 1.96, "grad_norm": 2.284801721572876, "learning_rate": 0.0001, "loss": 6.7719, "step": 5770 }, { "epoch": 1.96, "grad_norm": 2.3175783157348633, "learning_rate": 0.0001, "loss": 6.8299, "step": 5780 }, { "epoch": 1.97, "grad_norm": 2.3551747798919678, "learning_rate": 0.0001, "loss": 6.8656, "step": 5790 }, { "epoch": 1.97, "grad_norm": 2.3243958950042725, "learning_rate": 0.0001, "loss": 6.9141, "step": 5800 }, { "epoch": 1.97, "grad_norm": 2.3328282833099365, "learning_rate": 0.0001, "loss": 6.8811, "step": 5810 }, { "epoch": 1.98, "grad_norm": 2.4374606609344482, "learning_rate": 0.0001, "loss": 6.9106, "step": 5820 }, { "epoch": 1.98, "grad_norm": 2.3840887546539307, "learning_rate": 0.0001, "loss": 6.865, "step": 5830 }, { "epoch": 1.98, "grad_norm": 2.421757221221924, "learning_rate": 0.0001, "loss": 6.8836, "step": 5840 }, { "epoch": 1.99, "grad_norm": 2.400735855102539, "learning_rate": 0.0001, "loss": 6.8079, "step": 5850 }, { "epoch": 1.99, "grad_norm": 2.3208372592926025, "learning_rate": 0.0001, "loss": 6.8372, "step": 5860 }, { "epoch": 1.99, "grad_norm": 2.2900898456573486, "learning_rate": 0.0001, "loss": 6.7931, "step": 5870 }, { "epoch": 2.0, "grad_norm": 2.4279627799987793, "learning_rate": 0.0001, "loss": 6.9407, "step": 5880 }, { "epoch": 2.0, "grad_norm": 2.536397695541382, "learning_rate": 0.0001, "loss": 6.7606, "step": 5890 }, { "epoch": 2.01, "grad_norm": 2.3431828022003174, "learning_rate": 0.0001, "loss": 6.7804, "step": 5900 }, { "epoch": 2.01, "grad_norm": 2.306082248687744, "learning_rate": 0.0001, "loss": 6.7596, "step": 5910 }, { "epoch": 2.01, "grad_norm": 2.324713945388794, "learning_rate": 0.0001, "loss": 6.7527, "step": 5920 }, { "epoch": 2.02, "grad_norm": 2.4252986907958984, "learning_rate": 0.0001, "loss": 6.8721, "step": 5930 }, { "epoch": 2.02, "grad_norm": 2.2642593383789062, "learning_rate": 0.0001, "loss": 6.7373, "step": 5940 }, { "epoch": 2.02, "grad_norm": 2.3272111415863037, "learning_rate": 0.0001, "loss": 6.7637, "step": 5950 }, { "epoch": 2.03, "grad_norm": 2.428049325942993, "learning_rate": 0.0001, "loss": 6.77, "step": 5960 }, { "epoch": 2.03, "grad_norm": 2.4869771003723145, "learning_rate": 0.0001, "loss": 6.858, "step": 5970 }, { "epoch": 2.03, "grad_norm": 2.3572263717651367, "learning_rate": 0.0001, "loss": 6.8538, "step": 5980 }, { "epoch": 2.04, "grad_norm": 2.6547634601593018, "learning_rate": 0.0001, "loss": 6.786, "step": 5990 }, { "epoch": 2.04, "grad_norm": 2.362126111984253, "learning_rate": 0.0001, "loss": 6.7479, "step": 6000 }, { "epoch": 2.04, "grad_norm": 2.273364305496216, "learning_rate": 0.0001, "loss": 6.8455, "step": 6010 }, { "epoch": 2.05, "grad_norm": 2.3831865787506104, "learning_rate": 0.0001, "loss": 6.7612, "step": 6020 }, { "epoch": 2.05, "grad_norm": 2.3428845405578613, "learning_rate": 0.0001, "loss": 6.8112, "step": 6030 }, { "epoch": 2.05, "grad_norm": 2.3353376388549805, "learning_rate": 0.0001, "loss": 6.8605, "step": 6040 }, { "epoch": 2.06, "grad_norm": 2.5539698600769043, "learning_rate": 0.0001, "loss": 6.7803, "step": 6050 }, { "epoch": 2.06, "grad_norm": 2.363743305206299, "learning_rate": 0.0001, "loss": 6.7285, "step": 6060 }, { "epoch": 2.06, "grad_norm": 2.30212664604187, "learning_rate": 0.0001, "loss": 6.7403, "step": 6070 }, { "epoch": 2.07, "grad_norm": 2.4871535301208496, "learning_rate": 0.0001, "loss": 6.8449, "step": 6080 }, { "epoch": 2.07, "grad_norm": 2.4670472145080566, "learning_rate": 0.0001, "loss": 6.6744, "step": 6090 }, { "epoch": 2.07, "grad_norm": 2.3175411224365234, "learning_rate": 0.0001, "loss": 6.7953, "step": 6100 }, { "epoch": 2.08, "grad_norm": 2.3637678623199463, "learning_rate": 0.0001, "loss": 6.7753, "step": 6110 }, { "epoch": 2.08, "grad_norm": 2.4457314014434814, "learning_rate": 0.0001, "loss": 6.8608, "step": 6120 }, { "epoch": 2.08, "grad_norm": 2.5131587982177734, "learning_rate": 0.0001, "loss": 6.823, "step": 6130 }, { "epoch": 2.09, "grad_norm": 2.2857093811035156, "learning_rate": 0.0001, "loss": 6.667, "step": 6140 }, { "epoch": 2.09, "grad_norm": 2.4054253101348877, "learning_rate": 0.0001, "loss": 6.6906, "step": 6150 }, { "epoch": 2.09, "grad_norm": 2.414931535720825, "learning_rate": 0.0001, "loss": 6.7225, "step": 6160 }, { "epoch": 2.1, "grad_norm": 2.3389484882354736, "learning_rate": 0.0001, "loss": 6.7414, "step": 6170 }, { "epoch": 2.1, "grad_norm": 2.2892231941223145, "learning_rate": 0.0001, "loss": 6.8815, "step": 6180 }, { "epoch": 2.1, "grad_norm": 2.4546141624450684, "learning_rate": 0.0001, "loss": 6.8125, "step": 6190 }, { "epoch": 2.11, "grad_norm": 2.5399043560028076, "learning_rate": 0.0001, "loss": 6.7728, "step": 6200 }, { "epoch": 2.11, "grad_norm": 2.4791290760040283, "learning_rate": 0.0001, "loss": 6.8109, "step": 6210 }, { "epoch": 2.11, "grad_norm": 2.2605090141296387, "learning_rate": 0.0001, "loss": 6.6687, "step": 6220 }, { "epoch": 2.12, "grad_norm": 2.2917070388793945, "learning_rate": 0.0001, "loss": 6.7358, "step": 6230 }, { "epoch": 2.12, "grad_norm": 2.355299711227417, "learning_rate": 0.0001, "loss": 6.8014, "step": 6240 }, { "epoch": 2.12, "grad_norm": 2.325913667678833, "learning_rate": 0.0001, "loss": 6.6981, "step": 6250 }, { "epoch": 2.13, "grad_norm": 2.4129645824432373, "learning_rate": 0.0001, "loss": 6.7411, "step": 6260 }, { "epoch": 2.13, "grad_norm": 2.5154759883880615, "learning_rate": 0.0001, "loss": 6.6815, "step": 6270 }, { "epoch": 2.13, "grad_norm": 2.3484439849853516, "learning_rate": 0.0001, "loss": 6.7147, "step": 6280 }, { "epoch": 2.14, "grad_norm": 2.524163007736206, "learning_rate": 0.0001, "loss": 6.7066, "step": 6290 }, { "epoch": 2.14, "grad_norm": 2.343323230743408, "learning_rate": 0.0001, "loss": 6.805, "step": 6300 }, { "epoch": 2.14, "grad_norm": 2.652834415435791, "learning_rate": 0.0001, "loss": 6.6583, "step": 6310 }, { "epoch": 2.15, "grad_norm": 2.4346814155578613, "learning_rate": 0.0001, "loss": 6.8433, "step": 6320 }, { "epoch": 2.15, "grad_norm": 2.6968090534210205, "learning_rate": 0.0001, "loss": 6.6649, "step": 6330 }, { "epoch": 2.15, "grad_norm": 2.486971855163574, "learning_rate": 0.0001, "loss": 6.7352, "step": 6340 }, { "epoch": 2.16, "grad_norm": 2.4433326721191406, "learning_rate": 0.0001, "loss": 6.7313, "step": 6350 }, { "epoch": 2.16, "grad_norm": 2.3915398120880127, "learning_rate": 0.0001, "loss": 6.6529, "step": 6360 }, { "epoch": 2.16, "grad_norm": 2.344801902770996, "learning_rate": 0.0001, "loss": 6.6492, "step": 6370 }, { "epoch": 2.17, "grad_norm": 2.4664182662963867, "learning_rate": 0.0001, "loss": 6.7902, "step": 6380 }, { "epoch": 2.17, "grad_norm": 2.3616039752960205, "learning_rate": 0.0001, "loss": 6.7068, "step": 6390 }, { "epoch": 2.17, "grad_norm": 2.502997875213623, "learning_rate": 0.0001, "loss": 6.8245, "step": 6400 }, { "epoch": 2.18, "grad_norm": 2.3835978507995605, "learning_rate": 0.0001, "loss": 6.6895, "step": 6410 }, { "epoch": 2.18, "grad_norm": 2.433032512664795, "learning_rate": 0.0001, "loss": 6.7404, "step": 6420 }, { "epoch": 2.19, "grad_norm": 2.5085232257843018, "learning_rate": 0.0001, "loss": 6.8267, "step": 6430 }, { "epoch": 2.19, "grad_norm": 2.3694140911102295, "learning_rate": 0.0001, "loss": 6.707, "step": 6440 }, { "epoch": 2.19, "grad_norm": 2.4146511554718018, "learning_rate": 0.0001, "loss": 6.7517, "step": 6450 }, { "epoch": 2.2, "grad_norm": 2.4081177711486816, "learning_rate": 0.0001, "loss": 6.6947, "step": 6460 }, { "epoch": 2.2, "grad_norm": 2.3583593368530273, "learning_rate": 0.0001, "loss": 6.6277, "step": 6470 }, { "epoch": 2.2, "grad_norm": 2.445613384246826, "learning_rate": 0.0001, "loss": 6.5899, "step": 6480 }, { "epoch": 2.21, "grad_norm": 2.3775651454925537, "learning_rate": 0.0001, "loss": 6.6387, "step": 6490 }, { "epoch": 2.21, "grad_norm": 2.419865608215332, "learning_rate": 0.0001, "loss": 6.7063, "step": 6500 }, { "epoch": 2.21, "grad_norm": 2.3780555725097656, "learning_rate": 0.0001, "loss": 6.7098, "step": 6510 }, { "epoch": 2.22, "grad_norm": 2.3151533603668213, "learning_rate": 0.0001, "loss": 6.7985, "step": 6520 }, { "epoch": 2.22, "grad_norm": 2.419513463973999, "learning_rate": 0.0001, "loss": 6.7104, "step": 6530 }, { "epoch": 2.22, "grad_norm": 2.4487075805664062, "learning_rate": 0.0001, "loss": 6.7433, "step": 6540 }, { "epoch": 2.23, "grad_norm": 2.4610846042633057, "learning_rate": 0.0001, "loss": 6.7578, "step": 6550 }, { "epoch": 2.23, "grad_norm": 2.4538321495056152, "learning_rate": 0.0001, "loss": 6.7022, "step": 6560 }, { "epoch": 2.23, "grad_norm": 2.5543904304504395, "learning_rate": 0.0001, "loss": 6.5997, "step": 6570 }, { "epoch": 2.24, "grad_norm": 2.461637020111084, "learning_rate": 0.0001, "loss": 6.647, "step": 6580 }, { "epoch": 2.24, "grad_norm": 2.477705240249634, "learning_rate": 0.0001, "loss": 6.6654, "step": 6590 }, { "epoch": 2.24, "grad_norm": 2.4382028579711914, "learning_rate": 0.0001, "loss": 6.6494, "step": 6600 }, { "epoch": 2.25, "grad_norm": 2.578843832015991, "learning_rate": 0.0001, "loss": 6.679, "step": 6610 }, { "epoch": 2.25, "grad_norm": 2.321014165878296, "learning_rate": 0.0001, "loss": 6.6854, "step": 6620 }, { "epoch": 2.25, "grad_norm": 2.3366098403930664, "learning_rate": 0.0001, "loss": 6.7736, "step": 6630 }, { "epoch": 2.26, "grad_norm": 2.338735580444336, "learning_rate": 0.0001, "loss": 6.6428, "step": 6640 }, { "epoch": 2.26, "grad_norm": 2.5356178283691406, "learning_rate": 0.0001, "loss": 6.631, "step": 6650 }, { "epoch": 2.26, "grad_norm": 2.184365749359131, "learning_rate": 0.0001, "loss": 6.6344, "step": 6660 }, { "epoch": 2.27, "grad_norm": 2.3265435695648193, "learning_rate": 0.0001, "loss": 6.5805, "step": 6670 }, { "epoch": 2.27, "grad_norm": 2.421135425567627, "learning_rate": 0.0001, "loss": 6.6785, "step": 6680 }, { "epoch": 2.27, "grad_norm": 2.321599006652832, "learning_rate": 0.0001, "loss": 6.704, "step": 6690 }, { "epoch": 2.28, "grad_norm": 2.3654885292053223, "learning_rate": 0.0001, "loss": 6.7158, "step": 6700 }, { "epoch": 2.28, "grad_norm": 2.5177831649780273, "learning_rate": 0.0001, "loss": 6.6503, "step": 6710 }, { "epoch": 2.28, "grad_norm": 2.313040018081665, "learning_rate": 0.0001, "loss": 6.5879, "step": 6720 }, { "epoch": 2.29, "grad_norm": 2.3558781147003174, "learning_rate": 0.0001, "loss": 6.7163, "step": 6730 }, { "epoch": 2.29, "grad_norm": 2.357043504714966, "learning_rate": 0.0001, "loss": 6.7328, "step": 6740 }, { "epoch": 2.29, "grad_norm": 2.4913806915283203, "learning_rate": 0.0001, "loss": 6.6572, "step": 6750 }, { "epoch": 2.3, "grad_norm": 2.4111156463623047, "learning_rate": 0.0001, "loss": 6.6991, "step": 6760 }, { "epoch": 2.3, "grad_norm": 2.5237784385681152, "learning_rate": 0.0001, "loss": 6.7251, "step": 6770 }, { "epoch": 2.3, "grad_norm": 2.480854034423828, "learning_rate": 0.0001, "loss": 6.5924, "step": 6780 }, { "epoch": 2.31, "grad_norm": 2.342013359069824, "learning_rate": 0.0001, "loss": 6.6509, "step": 6790 }, { "epoch": 2.31, "grad_norm": 2.5419070720672607, "learning_rate": 0.0001, "loss": 6.7271, "step": 6800 }, { "epoch": 2.31, "grad_norm": 2.566915512084961, "learning_rate": 0.0001, "loss": 6.6383, "step": 6810 }, { "epoch": 2.32, "grad_norm": 2.4860012531280518, "learning_rate": 0.0001, "loss": 6.6661, "step": 6820 }, { "epoch": 2.32, "grad_norm": 2.4008114337921143, "learning_rate": 0.0001, "loss": 6.6478, "step": 6830 }, { "epoch": 2.32, "grad_norm": 2.404358386993408, "learning_rate": 0.0001, "loss": 6.5861, "step": 6840 }, { "epoch": 2.33, "grad_norm": 2.487964391708374, "learning_rate": 0.0001, "loss": 6.6282, "step": 6850 }, { "epoch": 2.33, "grad_norm": 2.448789358139038, "learning_rate": 0.0001, "loss": 6.5526, "step": 6860 }, { "epoch": 2.33, "grad_norm": 2.5977635383605957, "learning_rate": 0.0001, "loss": 6.7969, "step": 6870 }, { "epoch": 2.34, "grad_norm": 2.435678720474243, "learning_rate": 0.0001, "loss": 6.6237, "step": 6880 }, { "epoch": 2.34, "grad_norm": 2.430389881134033, "learning_rate": 0.0001, "loss": 6.6098, "step": 6890 }, { "epoch": 2.34, "grad_norm": 2.3108503818511963, "learning_rate": 0.0001, "loss": 6.5015, "step": 6900 }, { "epoch": 2.35, "grad_norm": 2.4835143089294434, "learning_rate": 0.0001, "loss": 6.5734, "step": 6910 }, { "epoch": 2.35, "grad_norm": 2.467932939529419, "learning_rate": 0.0001, "loss": 6.5835, "step": 6920 }, { "epoch": 2.36, "grad_norm": 2.4854071140289307, "learning_rate": 0.0001, "loss": 6.6903, "step": 6930 }, { "epoch": 2.36, "grad_norm": 2.3827455043792725, "learning_rate": 0.0001, "loss": 6.6991, "step": 6940 }, { "epoch": 2.36, "grad_norm": 2.40759539604187, "learning_rate": 0.0001, "loss": 6.6489, "step": 6950 }, { "epoch": 2.37, "grad_norm": 2.494851589202881, "learning_rate": 0.0001, "loss": 6.6051, "step": 6960 }, { "epoch": 2.37, "grad_norm": 2.38189435005188, "learning_rate": 0.0001, "loss": 6.6455, "step": 6970 }, { "epoch": 2.37, "grad_norm": 2.2972683906555176, "learning_rate": 0.0001, "loss": 6.6477, "step": 6980 }, { "epoch": 2.38, "grad_norm": 2.3755240440368652, "learning_rate": 0.0001, "loss": 6.6287, "step": 6990 }, { "epoch": 2.38, "grad_norm": 2.4599435329437256, "learning_rate": 0.0001, "loss": 6.497, "step": 7000 }, { "epoch": 2.38, "grad_norm": 2.5824034214019775, "learning_rate": 0.0001, "loss": 6.5575, "step": 7010 }, { "epoch": 2.39, "grad_norm": 2.6116127967834473, "learning_rate": 0.0001, "loss": 6.6656, "step": 7020 }, { "epoch": 2.39, "grad_norm": 2.4924373626708984, "learning_rate": 0.0001, "loss": 6.5141, "step": 7030 }, { "epoch": 2.39, "grad_norm": 2.6754868030548096, "learning_rate": 0.0001, "loss": 6.6132, "step": 7040 }, { "epoch": 2.4, "grad_norm": 2.6676344871520996, "learning_rate": 0.0001, "loss": 6.6724, "step": 7050 }, { "epoch": 2.4, "grad_norm": 2.3747360706329346, "learning_rate": 0.0001, "loss": 6.6109, "step": 7060 }, { "epoch": 2.4, "grad_norm": 2.4569485187530518, "learning_rate": 0.0001, "loss": 6.5757, "step": 7070 }, { "epoch": 2.41, "grad_norm": 2.354471445083618, "learning_rate": 0.0001, "loss": 6.6857, "step": 7080 }, { "epoch": 2.41, "grad_norm": 2.4351491928100586, "learning_rate": 0.0001, "loss": 6.7106, "step": 7090 }, { "epoch": 2.41, "grad_norm": 2.420145034790039, "learning_rate": 0.0001, "loss": 6.593, "step": 7100 }, { "epoch": 2.42, "grad_norm": 2.480693817138672, "learning_rate": 0.0001, "loss": 6.7345, "step": 7110 }, { "epoch": 2.42, "grad_norm": 2.398705005645752, "learning_rate": 0.0001, "loss": 6.573, "step": 7120 }, { "epoch": 2.42, "grad_norm": 2.4723408222198486, "learning_rate": 0.0001, "loss": 6.6368, "step": 7130 }, { "epoch": 2.43, "grad_norm": 2.353672981262207, "learning_rate": 0.0001, "loss": 6.5385, "step": 7140 }, { "epoch": 2.43, "grad_norm": 2.4100985527038574, "learning_rate": 0.0001, "loss": 6.5854, "step": 7150 }, { "epoch": 2.43, "grad_norm": 2.4232449531555176, "learning_rate": 0.0001, "loss": 6.4987, "step": 7160 }, { "epoch": 2.44, "grad_norm": 2.713700532913208, "learning_rate": 0.0001, "loss": 6.5794, "step": 7170 }, { "epoch": 2.44, "grad_norm": 2.6835007667541504, "learning_rate": 0.0001, "loss": 6.6516, "step": 7180 }, { "epoch": 2.44, "grad_norm": 2.540566921234131, "learning_rate": 0.0001, "loss": 6.5251, "step": 7190 }, { "epoch": 2.45, "grad_norm": 2.3966240882873535, "learning_rate": 0.0001, "loss": 6.6168, "step": 7200 }, { "epoch": 2.45, "grad_norm": 2.3429226875305176, "learning_rate": 0.0001, "loss": 6.7033, "step": 7210 }, { "epoch": 2.45, "grad_norm": 2.2682909965515137, "learning_rate": 0.0001, "loss": 6.5747, "step": 7220 }, { "epoch": 2.46, "grad_norm": 2.472114324569702, "learning_rate": 0.0001, "loss": 6.6046, "step": 7230 }, { "epoch": 2.46, "grad_norm": 2.375086784362793, "learning_rate": 0.0001, "loss": 6.5942, "step": 7240 }, { "epoch": 2.46, "grad_norm": 2.3834826946258545, "learning_rate": 0.0001, "loss": 6.6195, "step": 7250 }, { "epoch": 2.47, "grad_norm": 2.443159818649292, "learning_rate": 0.0001, "loss": 6.5226, "step": 7260 }, { "epoch": 2.47, "grad_norm": 2.3855631351470947, "learning_rate": 0.0001, "loss": 6.5294, "step": 7270 }, { "epoch": 2.47, "grad_norm": 2.5067007541656494, "learning_rate": 0.0001, "loss": 6.5628, "step": 7280 }, { "epoch": 2.48, "grad_norm": 2.2735891342163086, "learning_rate": 0.0001, "loss": 6.5801, "step": 7290 }, { "epoch": 2.48, "grad_norm": 2.338221549987793, "learning_rate": 0.0001, "loss": 6.5488, "step": 7300 }, { "epoch": 2.48, "grad_norm": 2.345191240310669, "learning_rate": 0.0001, "loss": 6.3966, "step": 7310 }, { "epoch": 2.49, "grad_norm": 2.5452232360839844, "learning_rate": 0.0001, "loss": 6.582, "step": 7320 }, { "epoch": 2.49, "grad_norm": 2.5252342224121094, "learning_rate": 0.0001, "loss": 6.4424, "step": 7330 }, { "epoch": 2.49, "grad_norm": 2.3293297290802, "learning_rate": 0.0001, "loss": 6.4929, "step": 7340 }, { "epoch": 2.5, "grad_norm": 2.3897252082824707, "learning_rate": 0.0001, "loss": 6.5216, "step": 7350 }, { "epoch": 2.5, "grad_norm": 2.3954155445098877, "learning_rate": 0.0001, "loss": 6.6009, "step": 7360 }, { "epoch": 2.5, "grad_norm": 2.4169862270355225, "learning_rate": 0.0001, "loss": 6.5498, "step": 7370 }, { "epoch": 2.51, "grad_norm": 2.3694748878479004, "learning_rate": 0.0001, "loss": 6.6092, "step": 7380 }, { "epoch": 2.51, "grad_norm": 2.1525871753692627, "learning_rate": 0.0001, "loss": 6.4587, "step": 7390 }, { "epoch": 2.51, "grad_norm": 2.3603813648223877, "learning_rate": 0.0001, "loss": 6.5468, "step": 7400 }, { "epoch": 2.52, "grad_norm": 2.374941110610962, "learning_rate": 0.0001, "loss": 6.5933, "step": 7410 }, { "epoch": 2.52, "grad_norm": 2.58593487739563, "learning_rate": 0.0001, "loss": 6.5777, "step": 7420 }, { "epoch": 2.53, "grad_norm": 2.3613600730895996, "learning_rate": 0.0001, "loss": 6.4953, "step": 7430 }, { "epoch": 2.53, "grad_norm": 2.400717258453369, "learning_rate": 0.0001, "loss": 6.5551, "step": 7440 }, { "epoch": 2.53, "grad_norm": 2.4552764892578125, "learning_rate": 0.0001, "loss": 6.595, "step": 7450 }, { "epoch": 2.54, "grad_norm": 2.431891679763794, "learning_rate": 0.0001, "loss": 6.5206, "step": 7460 }, { "epoch": 2.54, "grad_norm": 2.622408628463745, "learning_rate": 0.0001, "loss": 6.6461, "step": 7470 }, { "epoch": 2.54, "grad_norm": 2.530582904815674, "learning_rate": 0.0001, "loss": 6.5773, "step": 7480 }, { "epoch": 2.55, "grad_norm": 2.5170624256134033, "learning_rate": 0.0001, "loss": 6.5591, "step": 7490 }, { "epoch": 2.55, "grad_norm": 2.525491714477539, "learning_rate": 0.0001, "loss": 6.6803, "step": 7500 }, { "epoch": 2.55, "grad_norm": 2.6200342178344727, "learning_rate": 0.0001, "loss": 6.4981, "step": 7510 }, { "epoch": 2.56, "grad_norm": 2.6727588176727295, "learning_rate": 0.0001, "loss": 6.565, "step": 7520 }, { "epoch": 2.56, "grad_norm": 2.4518978595733643, "learning_rate": 0.0001, "loss": 6.5723, "step": 7530 }, { "epoch": 2.56, "grad_norm": 2.5073177814483643, "learning_rate": 0.0001, "loss": 6.5436, "step": 7540 }, { "epoch": 2.57, "grad_norm": 2.4163527488708496, "learning_rate": 0.0001, "loss": 6.5592, "step": 7550 }, { "epoch": 2.57, "grad_norm": 2.3171191215515137, "learning_rate": 0.0001, "loss": 6.4982, "step": 7560 }, { "epoch": 2.57, "grad_norm": 2.4813661575317383, "learning_rate": 0.0001, "loss": 6.5612, "step": 7570 }, { "epoch": 2.58, "grad_norm": 2.391533613204956, "learning_rate": 0.0001, "loss": 6.5979, "step": 7580 }, { "epoch": 2.58, "grad_norm": 2.3852477073669434, "learning_rate": 0.0001, "loss": 6.5185, "step": 7590 }, { "epoch": 2.58, "grad_norm": 2.3847193717956543, "learning_rate": 0.0001, "loss": 6.5222, "step": 7600 }, { "epoch": 2.59, "grad_norm": 2.582272529602051, "learning_rate": 0.0001, "loss": 6.6064, "step": 7610 }, { "epoch": 2.59, "grad_norm": 2.2116689682006836, "learning_rate": 0.0001, "loss": 6.4506, "step": 7620 }, { "epoch": 2.59, "grad_norm": 2.3123679161071777, "learning_rate": 0.0001, "loss": 6.4978, "step": 7630 }, { "epoch": 2.6, "grad_norm": 2.4657137393951416, "learning_rate": 0.0001, "loss": 6.5312, "step": 7640 }, { "epoch": 2.6, "grad_norm": 2.466048002243042, "learning_rate": 0.0001, "loss": 6.5716, "step": 7650 }, { "epoch": 2.6, "grad_norm": 2.464794874191284, "learning_rate": 0.0001, "loss": 6.5371, "step": 7660 }, { "epoch": 2.61, "grad_norm": 2.514782428741455, "learning_rate": 0.0001, "loss": 6.485, "step": 7670 }, { "epoch": 2.61, "grad_norm": 2.3275089263916016, "learning_rate": 0.0001, "loss": 6.4874, "step": 7680 }, { "epoch": 2.61, "grad_norm": 2.54858136177063, "learning_rate": 0.0001, "loss": 6.4682, "step": 7690 }, { "epoch": 2.62, "grad_norm": 2.384810447692871, "learning_rate": 0.0001, "loss": 6.6035, "step": 7700 }, { "epoch": 2.62, "grad_norm": 2.4897470474243164, "learning_rate": 0.0001, "loss": 6.5514, "step": 7710 }, { "epoch": 2.62, "grad_norm": 2.4654860496520996, "learning_rate": 0.0001, "loss": 6.4483, "step": 7720 }, { "epoch": 2.63, "grad_norm": 2.2626514434814453, "learning_rate": 0.0001, "loss": 6.4642, "step": 7730 }, { "epoch": 2.63, "grad_norm": 2.4894731044769287, "learning_rate": 0.0001, "loss": 6.5365, "step": 7740 }, { "epoch": 2.63, "grad_norm": 2.3856000900268555, "learning_rate": 0.0001, "loss": 6.4537, "step": 7750 }, { "epoch": 2.64, "grad_norm": 2.338667392730713, "learning_rate": 0.0001, "loss": 6.5448, "step": 7760 }, { "epoch": 2.64, "grad_norm": 2.4658851623535156, "learning_rate": 0.0001, "loss": 6.638, "step": 7770 }, { "epoch": 2.64, "grad_norm": 2.5155911445617676, "learning_rate": 0.0001, "loss": 6.4092, "step": 7780 }, { "epoch": 2.65, "grad_norm": 2.4454259872436523, "learning_rate": 0.0001, "loss": 6.4091, "step": 7790 }, { "epoch": 2.65, "grad_norm": 2.4755043983459473, "learning_rate": 0.0001, "loss": 6.4448, "step": 7800 } ], "logging_steps": 10, "max_steps": 17652, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 200, "total_flos": 1.3151692470722863e+19, "train_batch_size": 3, "trial_name": null, "trial_params": null }