{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025, "grad_norm": 276.0, "learning_rate": 1.1250000000000001e-07, "loss": 0.4619, "step": 10 }, { "epoch": 0.005, "grad_norm": 236.0, "learning_rate": 2.3750000000000003e-07, "loss": 0.457, "step": 20 }, { "epoch": 0.0075, "grad_norm": 276.0, "learning_rate": 3.625e-07, "loss": 0.51, "step": 30 }, { "epoch": 0.01, "grad_norm": 194.0, "learning_rate": 4.875000000000001e-07, "loss": 0.4631, "step": 40 }, { "epoch": 0.0125, "grad_norm": 264.0, "learning_rate": 6.125000000000001e-07, "loss": 0.5266, "step": 50 }, { "epoch": 0.015, "grad_norm": 21.75, "learning_rate": 7.375e-07, "loss": 0.5422, "step": 60 }, { "epoch": 0.0175, "grad_norm": 21.875, "learning_rate": 8.625e-07, "loss": 0.5314, "step": 70 }, { "epoch": 0.02, "grad_norm": 170.0, "learning_rate": 9.875e-07, "loss": 0.4754, "step": 80 }, { "epoch": 0.0225, "grad_norm": 5.625, "learning_rate": 1.1125000000000001e-06, "loss": 0.5029, "step": 90 }, { "epoch": 0.025, "grad_norm": 278.0, "learning_rate": 1.2375e-06, "loss": 0.4709, "step": 100 }, { "epoch": 0.0275, "grad_norm": 2.546875, "learning_rate": 1.3625000000000003e-06, "loss": 0.3525, "step": 110 }, { "epoch": 0.03, "grad_norm": 74.0, "learning_rate": 1.4875000000000002e-06, "loss": 0.3973, "step": 120 }, { "epoch": 0.0325, "grad_norm": 2.4375, "learning_rate": 1.6125000000000002e-06, "loss": 0.3656, "step": 130 }, { "epoch": 0.035, "grad_norm": 232.0, "learning_rate": 1.7375e-06, "loss": 0.3996, "step": 140 }, { "epoch": 0.0375, "grad_norm": 4.09375, "learning_rate": 1.8625000000000002e-06, "loss": 0.3652, "step": 150 }, { "epoch": 0.04, "grad_norm": 12.875, "learning_rate": 1.9875000000000005e-06, "loss": 0.4156, "step": 160 }, { "epoch": 0.0425, "grad_norm": 22.625, "learning_rate": 2.1125e-06, "loss": 0.3318, "step": 170 }, { "epoch": 0.045, "grad_norm": 12.75, "learning_rate": 2.2375e-06, "loss": 0.3256, "step": 180 }, { "epoch": 0.0475, "grad_norm": 5.09375, "learning_rate": 2.3625000000000003e-06, "loss": 0.3281, "step": 190 }, { "epoch": 0.05, "grad_norm": 25.875, "learning_rate": 2.4875000000000003e-06, "loss": 0.3357, "step": 200 }, { "epoch": 0.0525, "grad_norm": 2.703125, "learning_rate": 2.6125e-06, "loss": 0.3479, "step": 210 }, { "epoch": 0.055, "grad_norm": 2.765625, "learning_rate": 2.7375e-06, "loss": 0.3328, "step": 220 }, { "epoch": 0.0575, "grad_norm": 2.0, "learning_rate": 2.8625e-06, "loss": 0.3164, "step": 230 }, { "epoch": 0.06, "grad_norm": 2.34375, "learning_rate": 2.9875e-06, "loss": 0.299, "step": 240 }, { "epoch": 0.0625, "grad_norm": 2.84375, "learning_rate": 3.1125000000000007e-06, "loss": 0.3129, "step": 250 }, { "epoch": 0.065, "grad_norm": 2.671875, "learning_rate": 3.2375e-06, "loss": 0.3131, "step": 260 }, { "epoch": 0.0675, "grad_norm": 4.09375, "learning_rate": 3.3625000000000004e-06, "loss": 0.3053, "step": 270 }, { "epoch": 0.07, "grad_norm": 1.9375, "learning_rate": 3.4875000000000005e-06, "loss": 0.299, "step": 280 }, { "epoch": 0.0725, "grad_norm": 2.6875, "learning_rate": 3.6125000000000006e-06, "loss": 0.3273, "step": 290 }, { "epoch": 0.075, "grad_norm": 1.90625, "learning_rate": 3.7375000000000006e-06, "loss": 0.3041, "step": 300 }, { "epoch": 0.0775, "grad_norm": 1.640625, "learning_rate": 3.8625e-06, "loss": 0.3057, "step": 310 }, { "epoch": 0.08, "grad_norm": 2.375, "learning_rate": 3.9875e-06, "loss": 0.315, "step": 320 }, { "epoch": 0.0825, "grad_norm": 2.015625, "learning_rate": 4.1125e-06, "loss": 0.3063, "step": 330 }, { "epoch": 0.085, "grad_norm": 1.96875, "learning_rate": 4.2375000000000005e-06, "loss": 0.3023, "step": 340 }, { "epoch": 0.0875, "grad_norm": 2.53125, "learning_rate": 4.362500000000001e-06, "loss": 0.2834, "step": 350 }, { "epoch": 0.09, "grad_norm": 2.0, "learning_rate": 4.4875e-06, "loss": 0.2767, "step": 360 }, { "epoch": 0.0925, "grad_norm": 1.6015625, "learning_rate": 4.6125e-06, "loss": 0.2859, "step": 370 }, { "epoch": 0.095, "grad_norm": 2.375, "learning_rate": 4.737500000000001e-06, "loss": 0.2889, "step": 380 }, { "epoch": 0.0975, "grad_norm": 4.75, "learning_rate": 4.8625000000000005e-06, "loss": 0.2916, "step": 390 }, { "epoch": 0.1, "grad_norm": 1.671875, "learning_rate": 4.987500000000001e-06, "loss": 0.2722, "step": 400 }, { "epoch": 0.1025, "grad_norm": 2.46875, "learning_rate": 4.999922894111975e-06, "loss": 0.2822, "step": 410 }, { "epoch": 0.105, "grad_norm": 2.515625, "learning_rate": 4.999656361346094e-06, "loss": 0.269, "step": 420 }, { "epoch": 0.1075, "grad_norm": 1.6875, "learning_rate": 4.999199470070484e-06, "loss": 0.2836, "step": 430 }, { "epoch": 0.11, "grad_norm": 1.671875, "learning_rate": 4.998552255079182e-06, "loss": 0.2577, "step": 440 }, { "epoch": 0.1125, "grad_norm": 1.5234375, "learning_rate": 4.9977147656601196e-06, "loss": 0.266, "step": 450 }, { "epoch": 0.115, "grad_norm": 1.890625, "learning_rate": 4.996687065591355e-06, "loss": 0.2792, "step": 460 }, { "epoch": 0.1175, "grad_norm": 2.484375, "learning_rate": 4.9954692331362295e-06, "loss": 0.2801, "step": 470 }, { "epoch": 0.12, "grad_norm": 1.828125, "learning_rate": 4.9940613610373974e-06, "loss": 0.26, "step": 480 }, { "epoch": 0.1225, "grad_norm": 1.9296875, "learning_rate": 4.992463556509772e-06, "loss": 0.2613, "step": 490 }, { "epoch": 0.125, "grad_norm": 1.828125, "learning_rate": 4.990675941232353e-06, "loss": 0.2385, "step": 500 }, { "epoch": 0.1275, "grad_norm": 1.90625, "learning_rate": 4.988698651338965e-06, "loss": 0.2574, "step": 510 }, { "epoch": 0.13, "grad_norm": 1.5, "learning_rate": 4.986531837407891e-06, "loss": 0.2559, "step": 520 }, { "epoch": 0.1325, "grad_norm": 1.484375, "learning_rate": 4.9841756644503965e-06, "loss": 0.2622, "step": 530 }, { "epoch": 0.135, "grad_norm": 1.15625, "learning_rate": 4.981630311898178e-06, "loss": 0.2537, "step": 540 }, { "epoch": 0.1375, "grad_norm": 1.6484375, "learning_rate": 4.978895973589686e-06, "loss": 0.2832, "step": 550 }, { "epoch": 0.14, "grad_norm": 1.6796875, "learning_rate": 4.975972857755369e-06, "loss": 0.2471, "step": 560 }, { "epoch": 0.1425, "grad_norm": 1.4296875, "learning_rate": 4.972861187001815e-06, "loss": 0.2648, "step": 570 }, { "epoch": 0.145, "grad_norm": 1.6171875, "learning_rate": 4.9695611982947995e-06, "loss": 0.2697, "step": 580 }, { "epoch": 0.1475, "grad_norm": 1.8515625, "learning_rate": 4.966073142941239e-06, "loss": 0.2592, "step": 590 }, { "epoch": 0.15, "grad_norm": 1.4140625, "learning_rate": 4.962397286570053e-06, "loss": 0.267, "step": 600 }, { "epoch": 0.1525, "grad_norm": 1.515625, "learning_rate": 4.958533909111936e-06, "loss": 0.253, "step": 610 }, { "epoch": 0.155, "grad_norm": 1.390625, "learning_rate": 4.95448330477804e-06, "loss": 0.2488, "step": 620 }, { "epoch": 0.1575, "grad_norm": 1.71875, "learning_rate": 4.950245782037566e-06, "loss": 0.2552, "step": 630 }, { "epoch": 0.16, "grad_norm": 1.2890625, "learning_rate": 4.945821663594277e-06, "loss": 0.2485, "step": 640 }, { "epoch": 0.1625, "grad_norm": 1.5078125, "learning_rate": 4.941211286361922e-06, "loss": 0.2612, "step": 650 }, { "epoch": 0.165, "grad_norm": 1.8984375, "learning_rate": 4.936415001438577e-06, "loss": 0.2465, "step": 660 }, { "epoch": 0.1675, "grad_norm": 1.609375, "learning_rate": 4.9314331740799084e-06, "loss": 0.2449, "step": 670 }, { "epoch": 0.17, "grad_norm": 1.234375, "learning_rate": 4.926266183671356e-06, "loss": 0.2431, "step": 680 }, { "epoch": 0.1725, "grad_norm": 1.359375, "learning_rate": 4.920914423699247e-06, "loss": 0.2479, "step": 690 }, { "epoch": 0.175, "grad_norm": 1.28125, "learning_rate": 4.915378301720822e-06, "loss": 0.2327, "step": 700 }, { "epoch": 0.1775, "grad_norm": 1.3984375, "learning_rate": 4.909658239333203e-06, "loss": 0.2434, "step": 710 }, { "epoch": 0.18, "grad_norm": 1.4296875, "learning_rate": 4.903754672141288e-06, "loss": 0.2437, "step": 720 }, { "epoch": 0.1825, "grad_norm": 1.6953125, "learning_rate": 4.897668049724574e-06, "loss": 0.248, "step": 730 }, { "epoch": 0.185, "grad_norm": 1.3515625, "learning_rate": 4.891398835602925e-06, "loss": 0.2502, "step": 740 }, { "epoch": 0.1875, "grad_norm": 1.5546875, "learning_rate": 4.884947507201268e-06, "loss": 0.2368, "step": 750 }, { "epoch": 0.19, "grad_norm": 1.7421875, "learning_rate": 4.878314555813237e-06, "loss": 0.2262, "step": 760 }, { "epoch": 0.1925, "grad_norm": 1.6328125, "learning_rate": 4.8715004865637616e-06, "loss": 0.2392, "step": 770 }, { "epoch": 0.195, "grad_norm": 2.015625, "learning_rate": 4.8645058183705976e-06, "loss": 0.2403, "step": 780 }, { "epoch": 0.1975, "grad_norm": 1.2109375, "learning_rate": 4.8573310839048085e-06, "loss": 0.2466, "step": 790 }, { "epoch": 0.2, "grad_norm": 1.3125, "learning_rate": 4.8499768295502e-06, "loss": 0.2453, "step": 800 }, { "epoch": 0.2025, "grad_norm": 2.03125, "learning_rate": 4.842443615361718e-06, "loss": 0.2566, "step": 810 }, { "epoch": 0.205, "grad_norm": 1.9296875, "learning_rate": 4.834732015022786e-06, "loss": 0.2248, "step": 820 }, { "epoch": 0.2075, "grad_norm": 1.59375, "learning_rate": 4.826842615801628e-06, "loss": 0.2456, "step": 830 }, { "epoch": 0.21, "grad_norm": 1.484375, "learning_rate": 4.818776018506538e-06, "loss": 0.2445, "step": 840 }, { "epoch": 0.2125, "grad_norm": 1.5625, "learning_rate": 4.810532837440134e-06, "loss": 0.2451, "step": 850 }, { "epoch": 0.215, "grad_norm": 1.5390625, "learning_rate": 4.802113700352567e-06, "loss": 0.2605, "step": 860 }, { "epoch": 0.2175, "grad_norm": 1.2734375, "learning_rate": 4.793519248393721e-06, "loss": 0.2394, "step": 870 }, { "epoch": 0.22, "grad_norm": 1.203125, "learning_rate": 4.78475013606439e-06, "loss": 0.238, "step": 880 }, { "epoch": 0.2225, "grad_norm": 1.3671875, "learning_rate": 4.775807031166428e-06, "loss": 0.236, "step": 890 }, { "epoch": 0.225, "grad_norm": 1.7578125, "learning_rate": 4.766690614751897e-06, "loss": 0.2357, "step": 900 }, { "epoch": 0.2275, "grad_norm": 1.3359375, "learning_rate": 4.757401581071203e-06, "loss": 0.2436, "step": 910 }, { "epoch": 0.23, "grad_norm": 3.671875, "learning_rate": 4.747940637520226e-06, "loss": 0.2309, "step": 920 }, { "epoch": 0.2325, "grad_norm": 1.40625, "learning_rate": 4.738308504586445e-06, "loss": 0.2254, "step": 930 }, { "epoch": 0.235, "grad_norm": 1.359375, "learning_rate": 4.7285059157940765e-06, "loss": 0.2221, "step": 940 }, { "epoch": 0.2375, "grad_norm": 1.1640625, "learning_rate": 4.718533617648209e-06, "loss": 0.2246, "step": 950 }, { "epoch": 0.24, "grad_norm": 1.1171875, "learning_rate": 4.7083923695779546e-06, "loss": 0.2228, "step": 960 }, { "epoch": 0.2425, "grad_norm": 1.7109375, "learning_rate": 4.6980829438786176e-06, "loss": 0.2198, "step": 970 }, { "epoch": 0.245, "grad_norm": 1.9453125, "learning_rate": 4.687606125652882e-06, "loss": 0.223, "step": 980 }, { "epoch": 0.2475, "grad_norm": 1.5, "learning_rate": 4.676962712751015e-06, "loss": 0.2337, "step": 990 }, { "epoch": 0.25, "grad_norm": 1.359375, "learning_rate": 4.666153515710118e-06, "loss": 0.2273, "step": 1000 }, { "epoch": 0.2525, "grad_norm": 1.2109375, "learning_rate": 4.655179357692396e-06, "loss": 0.2134, "step": 1010 }, { "epoch": 0.255, "grad_norm": 1.171875, "learning_rate": 4.644041074422469e-06, "loss": 0.2078, "step": 1020 }, { "epoch": 0.2575, "grad_norm": 1.484375, "learning_rate": 4.632739514123733e-06, "loss": 0.2229, "step": 1030 }, { "epoch": 0.26, "grad_norm": 1.21875, "learning_rate": 4.6212755374537596e-06, "loss": 0.2166, "step": 1040 }, { "epoch": 0.2625, "grad_norm": 1.3359375, "learning_rate": 4.609650017438757e-06, "loss": 0.2222, "step": 1050 }, { "epoch": 0.265, "grad_norm": 1.640625, "learning_rate": 4.5978638394070835e-06, "loss": 0.2257, "step": 1060 }, { "epoch": 0.2675, "grad_norm": 1.3125, "learning_rate": 4.58591790092183e-06, "loss": 0.2269, "step": 1070 }, { "epoch": 0.27, "grad_norm": 1.140625, "learning_rate": 4.5738131117124605e-06, "loss": 0.2107, "step": 1080 }, { "epoch": 0.2725, "grad_norm": 1.296875, "learning_rate": 4.561550393605541e-06, "loss": 0.2285, "step": 1090 }, { "epoch": 0.275, "grad_norm": 1.2265625, "learning_rate": 4.549130680454532e-06, "loss": 0.219, "step": 1100 }, { "epoch": 0.2775, "grad_norm": 1.328125, "learning_rate": 4.536554918068673e-06, "loss": 0.2203, "step": 1110 }, { "epoch": 0.28, "grad_norm": 1.359375, "learning_rate": 4.523824064140961e-06, "loss": 0.2313, "step": 1120 }, { "epoch": 0.2825, "grad_norm": 1.515625, "learning_rate": 4.510939088175211e-06, "loss": 0.2209, "step": 1130 }, { "epoch": 0.285, "grad_norm": 1.46875, "learning_rate": 4.49790097141223e-06, "loss": 0.2191, "step": 1140 }, { "epoch": 0.2875, "grad_norm": 1.53125, "learning_rate": 4.484710706755087e-06, "loss": 0.2131, "step": 1150 }, { "epoch": 0.29, "grad_norm": 1.5859375, "learning_rate": 4.471369298693505e-06, "loss": 0.2121, "step": 1160 }, { "epoch": 0.2925, "grad_norm": 1.21875, "learning_rate": 4.457877763227361e-06, "loss": 0.2399, "step": 1170 }, { "epoch": 0.295, "grad_norm": 1.328125, "learning_rate": 4.444237127789315e-06, "loss": 0.2145, "step": 1180 }, { "epoch": 0.2975, "grad_norm": 1.171875, "learning_rate": 4.430448431166567e-06, "loss": 0.2179, "step": 1190 }, { "epoch": 0.3, "grad_norm": 1.3515625, "learning_rate": 4.416512723421752e-06, "loss": 0.2104, "step": 1200 }, { "epoch": 0.3025, "grad_norm": 1.234375, "learning_rate": 4.402431065812968e-06, "loss": 0.2194, "step": 1210 }, { "epoch": 0.305, "grad_norm": 1.0546875, "learning_rate": 4.388204530712959e-06, "loss": 0.2179, "step": 1220 }, { "epoch": 0.3075, "grad_norm": 1.1796875, "learning_rate": 4.373834201527457e-06, "loss": 0.2142, "step": 1230 }, { "epoch": 0.31, "grad_norm": 1.3203125, "learning_rate": 4.359321172612664e-06, "loss": 0.2054, "step": 1240 }, { "epoch": 0.3125, "grad_norm": 1.140625, "learning_rate": 4.344666549191921e-06, "loss": 0.2196, "step": 1250 }, { "epoch": 0.315, "grad_norm": 1.2265625, "learning_rate": 4.329871447271541e-06, "loss": 0.2165, "step": 1260 }, { "epoch": 0.3175, "grad_norm": 1.3515625, "learning_rate": 4.314936993555816e-06, "loss": 0.2021, "step": 1270 }, { "epoch": 0.32, "grad_norm": 1.453125, "learning_rate": 4.299864325361217e-06, "loss": 0.2113, "step": 1280 }, { "epoch": 0.3225, "grad_norm": 1.375, "learning_rate": 4.284654590529784e-06, "loss": 0.1996, "step": 1290 }, { "epoch": 0.325, "grad_norm": 1.4140625, "learning_rate": 4.269308947341711e-06, "loss": 0.2101, "step": 1300 }, { "epoch": 0.3275, "grad_norm": 1.3515625, "learning_rate": 4.25382856442714e-06, "loss": 0.2086, "step": 1310 }, { "epoch": 0.33, "grad_norm": 1.453125, "learning_rate": 4.238214620677164e-06, "loss": 0.2125, "step": 1320 }, { "epoch": 0.3325, "grad_norm": 1.1796875, "learning_rate": 4.222468305154052e-06, "loss": 0.2162, "step": 1330 }, { "epoch": 0.335, "grad_norm": 1.25, "learning_rate": 4.206590817000695e-06, "loss": 0.2127, "step": 1340 }, { "epoch": 0.3375, "grad_norm": 1.1796875, "learning_rate": 4.190583365349289e-06, "loss": 0.2095, "step": 1350 }, { "epoch": 0.34, "grad_norm": 1.2578125, "learning_rate": 4.174447169229252e-06, "loss": 0.217, "step": 1360 }, { "epoch": 0.3425, "grad_norm": 1.4140625, "learning_rate": 4.158183457474392e-06, "loss": 0.212, "step": 1370 }, { "epoch": 0.345, "grad_norm": 1.4453125, "learning_rate": 4.141793468629327e-06, "loss": 0.2221, "step": 1380 }, { "epoch": 0.3475, "grad_norm": 1.125, "learning_rate": 4.125278450855165e-06, "loss": 0.2079, "step": 1390 }, { "epoch": 0.35, "grad_norm": 1.28125, "learning_rate": 4.1086396618344474e-06, "loss": 0.2085, "step": 1400 }, { "epoch": 0.3525, "grad_norm": 1.3828125, "learning_rate": 4.09187836867538e-06, "loss": 0.2123, "step": 1410 }, { "epoch": 0.355, "grad_norm": 1.21875, "learning_rate": 4.074995847815331e-06, "loss": 0.2116, "step": 1420 }, { "epoch": 0.3575, "grad_norm": 1.3671875, "learning_rate": 4.057993384923626e-06, "loss": 0.2288, "step": 1430 }, { "epoch": 0.36, "grad_norm": 1.1484375, "learning_rate": 4.0408722748036426e-06, "loss": 0.2038, "step": 1440 }, { "epoch": 0.3625, "grad_norm": 1.28125, "learning_rate": 4.023633821294203e-06, "loss": 0.2123, "step": 1450 }, { "epoch": 0.365, "grad_norm": 1.03125, "learning_rate": 4.006279337170283e-06, "loss": 0.1979, "step": 1460 }, { "epoch": 0.3675, "grad_norm": 1.2109375, "learning_rate": 3.988810144043041e-06, "loss": 0.2113, "step": 1470 }, { "epoch": 0.37, "grad_norm": 1.4453125, "learning_rate": 3.971227572259167e-06, "loss": 0.2077, "step": 1480 }, { "epoch": 0.3725, "grad_norm": 1.1328125, "learning_rate": 3.953532960799577e-06, "loss": 0.1986, "step": 1490 }, { "epoch": 0.375, "grad_norm": 1.1484375, "learning_rate": 3.935727657177439e-06, "loss": 0.1972, "step": 1500 }, { "epoch": 0.3775, "grad_norm": 1.40625, "learning_rate": 3.917813017335562e-06, "loss": 0.2082, "step": 1510 }, { "epoch": 0.38, "grad_norm": 1.375, "learning_rate": 3.899790405543129e-06, "loss": 0.2086, "step": 1520 }, { "epoch": 0.3825, "grad_norm": 1.2734375, "learning_rate": 3.881661194291805e-06, "loss": 0.1968, "step": 1530 }, { "epoch": 0.385, "grad_norm": 1.1640625, "learning_rate": 3.863426764191216e-06, "loss": 0.2064, "step": 1540 }, { "epoch": 0.3875, "grad_norm": 1.1015625, "learning_rate": 3.845088503863813e-06, "loss": 0.1855, "step": 1550 }, { "epoch": 0.39, "grad_norm": 1.2109375, "learning_rate": 3.826647809839119e-06, "loss": 0.2039, "step": 1560 }, { "epoch": 0.3925, "grad_norm": 1.125, "learning_rate": 3.8081060864473794e-06, "loss": 0.2079, "step": 1570 }, { "epoch": 0.395, "grad_norm": 1.09375, "learning_rate": 3.7894647457126188e-06, "loss": 0.2126, "step": 1580 }, { "epoch": 0.3975, "grad_norm": 1.2734375, "learning_rate": 3.770725207245106e-06, "loss": 0.2156, "step": 1590 }, { "epoch": 0.4, "grad_norm": 1.1015625, "learning_rate": 3.751888898133249e-06, "loss": 0.2122, "step": 1600 }, { "epoch": 0.4025, "grad_norm": 1.3046875, "learning_rate": 3.7329572528349145e-06, "loss": 0.1996, "step": 1610 }, { "epoch": 0.405, "grad_norm": 1.2734375, "learning_rate": 3.7139317130681886e-06, "loss": 0.2136, "step": 1620 }, { "epoch": 0.4075, "grad_norm": 0.98046875, "learning_rate": 3.694813727701584e-06, "loss": 0.2055, "step": 1630 }, { "epoch": 0.41, "grad_norm": 1.421875, "learning_rate": 3.675604752643706e-06, "loss": 0.2184, "step": 1640 }, { "epoch": 0.4125, "grad_norm": 1.2265625, "learning_rate": 3.6563062507323752e-06, "loss": 0.2068, "step": 1650 }, { "epoch": 0.415, "grad_norm": 1.09375, "learning_rate": 3.6369196916232297e-06, "loss": 0.2034, "step": 1660 }, { "epoch": 0.4175, "grad_norm": 1.1640625, "learning_rate": 3.6174465516778032e-06, "loss": 0.1992, "step": 1670 }, { "epoch": 0.42, "grad_norm": 1.1171875, "learning_rate": 3.5978883138510963e-06, "loss": 0.2037, "step": 1680 }, { "epoch": 0.4225, "grad_norm": 1.15625, "learning_rate": 3.578246467578642e-06, "loss": 0.2185, "step": 1690 }, { "epoch": 0.425, "grad_norm": 1.1875, "learning_rate": 3.558522508663081e-06, "loss": 0.2038, "step": 1700 }, { "epoch": 0.4275, "grad_norm": 1.4296875, "learning_rate": 3.538717939160249e-06, "loss": 0.2088, "step": 1710 }, { "epoch": 0.43, "grad_norm": 1.453125, "learning_rate": 3.5188342672647897e-06, "loss": 0.2041, "step": 1720 }, { "epoch": 0.4325, "grad_norm": 1.125, "learning_rate": 3.4988730071953005e-06, "loss": 0.2065, "step": 1730 }, { "epoch": 0.435, "grad_norm": 1.171875, "learning_rate": 3.478835679079019e-06, "loss": 0.2041, "step": 1740 }, { "epoch": 0.4375, "grad_norm": 1.5390625, "learning_rate": 3.4587238088360605e-06, "loss": 0.1996, "step": 1750 }, { "epoch": 0.44, "grad_norm": 1.390625, "learning_rate": 3.438538928063208e-06, "loss": 0.1931, "step": 1760 }, { "epoch": 0.4425, "grad_norm": 1.2265625, "learning_rate": 3.4182825739172826e-06, "loss": 0.2095, "step": 1770 }, { "epoch": 0.445, "grad_norm": 1.140625, "learning_rate": 3.3979562889980777e-06, "loss": 0.2097, "step": 1780 }, { "epoch": 0.4475, "grad_norm": 1.34375, "learning_rate": 3.377561621230887e-06, "loss": 0.1874, "step": 1790 }, { "epoch": 0.45, "grad_norm": 1.171875, "learning_rate": 3.357100123748621e-06, "loss": 0.2025, "step": 1800 }, { "epoch": 0.4525, "grad_norm": 0.98828125, "learning_rate": 3.3365733547735334e-06, "loss": 0.1846, "step": 1810 }, { "epoch": 0.455, "grad_norm": 1.28125, "learning_rate": 3.315982877498555e-06, "loss": 0.2049, "step": 1820 }, { "epoch": 0.4575, "grad_norm": 1.1640625, "learning_rate": 3.2953302599682487e-06, "loss": 0.2021, "step": 1830 }, { "epoch": 0.46, "grad_norm": 1.265625, "learning_rate": 3.2746170749593998e-06, "loss": 0.1979, "step": 1840 }, { "epoch": 0.4625, "grad_norm": 1.3671875, "learning_rate": 3.2538448998612394e-06, "loss": 0.1965, "step": 1850 }, { "epoch": 0.465, "grad_norm": 1.1328125, "learning_rate": 3.233015316555326e-06, "loss": 0.2151, "step": 1860 }, { "epoch": 0.4675, "grad_norm": 1.078125, "learning_rate": 3.212129911295074e-06, "loss": 0.1824, "step": 1870 }, { "epoch": 0.47, "grad_norm": 1.109375, "learning_rate": 3.1911902745849526e-06, "loss": 0.2051, "step": 1880 }, { "epoch": 0.4725, "grad_norm": 1.09375, "learning_rate": 3.17019800105937e-06, "loss": 0.2145, "step": 1890 }, { "epoch": 0.475, "grad_norm": 1.125, "learning_rate": 3.1491546893612296e-06, "loss": 0.2003, "step": 1900 }, { "epoch": 0.4775, "grad_norm": 1.2109375, "learning_rate": 3.128061942020189e-06, "loss": 0.2078, "step": 1910 }, { "epoch": 0.48, "grad_norm": 1.3671875, "learning_rate": 3.1069213653306242e-06, "loss": 0.1979, "step": 1920 }, { "epoch": 0.4825, "grad_norm": 1.109375, "learning_rate": 3.0857345692292968e-06, "loss": 0.1925, "step": 1930 }, { "epoch": 0.485, "grad_norm": 1.171875, "learning_rate": 3.0645031671727598e-06, "loss": 0.2042, "step": 1940 }, { "epoch": 0.4875, "grad_norm": 1.1875, "learning_rate": 3.0432287760144797e-06, "loss": 0.2064, "step": 1950 }, { "epoch": 0.49, "grad_norm": 1.4921875, "learning_rate": 3.0219130158817093e-06, "loss": 0.2035, "step": 1960 }, { "epoch": 0.4925, "grad_norm": 1.125, "learning_rate": 3.0005575100521115e-06, "loss": 0.195, "step": 1970 }, { "epoch": 0.495, "grad_norm": 1.3515625, "learning_rate": 2.979163884830137e-06, "loss": 0.2044, "step": 1980 }, { "epoch": 0.4975, "grad_norm": 1.765625, "learning_rate": 2.957733769423174e-06, "loss": 0.212, "step": 1990 }, { "epoch": 0.5, "grad_norm": 1.28125, "learning_rate": 2.93626879581748e-06, "loss": 0.1985, "step": 2000 }, { "epoch": 0.5025, "grad_norm": 1.359375, "learning_rate": 2.914770598653902e-06, "loss": 0.1872, "step": 2010 }, { "epoch": 0.505, "grad_norm": 1.1328125, "learning_rate": 2.8932408151033868e-06, "loss": 0.1884, "step": 2020 }, { "epoch": 0.5075, "grad_norm": 1.203125, "learning_rate": 2.8716810847423083e-06, "loss": 0.2003, "step": 2030 }, { "epoch": 0.51, "grad_norm": 1.2265625, "learning_rate": 2.8500930494276035e-06, "loss": 0.2057, "step": 2040 }, { "epoch": 0.5125, "grad_norm": 1.2890625, "learning_rate": 2.828478353171745e-06, "loss": 0.197, "step": 2050 }, { "epoch": 0.515, "grad_norm": 1.2890625, "learning_rate": 2.8068386420175376e-06, "loss": 0.1982, "step": 2060 }, { "epoch": 0.5175, "grad_norm": 1.0390625, "learning_rate": 2.785175563912766e-06, "loss": 0.1865, "step": 2070 }, { "epoch": 0.52, "grad_norm": 1.1171875, "learning_rate": 2.7634907685846995e-06, "loss": 0.1963, "step": 2080 }, { "epoch": 0.5225, "grad_norm": 1.3515625, "learning_rate": 2.7417859074144604e-06, "loss": 0.1976, "step": 2090 }, { "epoch": 0.525, "grad_norm": 1.234375, "learning_rate": 2.7200626333112595e-06, "loss": 0.1997, "step": 2100 }, { "epoch": 0.5275, "grad_norm": 1.0703125, "learning_rate": 2.6983226005865236e-06, "loss": 0.1997, "step": 2110 }, { "epoch": 0.53, "grad_norm": 1.046875, "learning_rate": 2.676567464827917e-06, "loss": 0.2114, "step": 2120 }, { "epoch": 0.5325, "grad_norm": 1.1953125, "learning_rate": 2.6547988827732546e-06, "loss": 0.194, "step": 2130 }, { "epoch": 0.535, "grad_norm": 1.1015625, "learning_rate": 2.633018512184341e-06, "loss": 0.198, "step": 2140 }, { "epoch": 0.5375, "grad_norm": 1.2265625, "learning_rate": 2.6112280117207223e-06, "loss": 0.2126, "step": 2150 }, { "epoch": 0.54, "grad_norm": 1.40625, "learning_rate": 2.5894290408133744e-06, "loss": 0.2021, "step": 2160 }, { "epoch": 0.5425, "grad_norm": 1.2890625, "learning_rate": 2.56762325953833e-06, "loss": 0.1997, "step": 2170 }, { "epoch": 0.545, "grad_norm": 1.140625, "learning_rate": 2.5458123284902577e-06, "loss": 0.1984, "step": 2180 }, { "epoch": 0.5475, "grad_norm": 1.0078125, "learning_rate": 2.5239979086560003e-06, "loss": 0.1981, "step": 2190 }, { "epoch": 0.55, "grad_norm": 1.2890625, "learning_rate": 2.5021816612880884e-06, "loss": 0.2044, "step": 2200 }, { "epoch": 0.5525, "grad_norm": 1.1953125, "learning_rate": 2.4803652477782228e-06, "loss": 0.2039, "step": 2210 }, { "epoch": 0.555, "grad_norm": 1.078125, "learning_rate": 2.4585503295307565e-06, "loss": 0.203, "step": 2220 }, { "epoch": 0.5575, "grad_norm": 1.109375, "learning_rate": 2.436738567836176e-06, "loss": 0.1996, "step": 2230 }, { "epoch": 0.56, "grad_norm": 1.09375, "learning_rate": 2.4149316237445813e-06, "loss": 0.1979, "step": 2240 }, { "epoch": 0.5625, "grad_norm": 1.078125, "learning_rate": 2.3931311579391946e-06, "loss": 0.1939, "step": 2250 }, { "epoch": 0.565, "grad_norm": 1.3359375, "learning_rate": 2.37133883060989e-06, "loss": 0.1991, "step": 2260 }, { "epoch": 0.5675, "grad_norm": 1.140625, "learning_rate": 2.3495563013267668e-06, "loss": 0.19, "step": 2270 }, { "epoch": 0.57, "grad_norm": 1.125, "learning_rate": 2.3277852289137636e-06, "loss": 0.1957, "step": 2280 }, { "epoch": 0.5725, "grad_norm": 1.2578125, "learning_rate": 2.306027271322336e-06, "loss": 0.1966, "step": 2290 }, { "epoch": 0.575, "grad_norm": 1.203125, "learning_rate": 2.284284085505192e-06, "loss": 0.2099, "step": 2300 }, { "epoch": 0.5775, "grad_norm": 1.2265625, "learning_rate": 2.2625573272901156e-06, "loss": 0.1997, "step": 2310 }, { "epoch": 0.58, "grad_norm": 1.1796875, "learning_rate": 2.240848651253863e-06, "loss": 0.1897, "step": 2320 }, { "epoch": 0.5825, "grad_norm": 0.9921875, "learning_rate": 2.2191597105961613e-06, "loss": 0.1826, "step": 2330 }, { "epoch": 0.585, "grad_norm": 1.109375, "learning_rate": 2.1974921570138155e-06, "loss": 0.1917, "step": 2340 }, { "epoch": 0.5875, "grad_norm": 1.265625, "learning_rate": 2.1758476405749207e-06, "loss": 0.1963, "step": 2350 }, { "epoch": 0.59, "grad_norm": 1.140625, "learning_rate": 2.154227809593203e-06, "loss": 0.1931, "step": 2360 }, { "epoch": 0.5925, "grad_norm": 1.1953125, "learning_rate": 2.1326343105024962e-06, "loss": 0.202, "step": 2370 }, { "epoch": 0.595, "grad_norm": 1.2265625, "learning_rate": 2.111068787731358e-06, "loss": 0.2061, "step": 2380 }, { "epoch": 0.5975, "grad_norm": 1.578125, "learning_rate": 2.089532883577843e-06, "loss": 0.1867, "step": 2390 }, { "epoch": 0.6, "grad_norm": 1.234375, "learning_rate": 2.068028238084432e-06, "loss": 0.2039, "step": 2400 }, { "epoch": 0.6025, "grad_norm": 1.171875, "learning_rate": 2.046556488913137e-06, "loss": 0.209, "step": 2410 }, { "epoch": 0.605, "grad_norm": 1.234375, "learning_rate": 2.025119271220789e-06, "loss": 0.1894, "step": 2420 }, { "epoch": 0.6075, "grad_norm": 1.125, "learning_rate": 2.0037182175345137e-06, "loss": 0.2098, "step": 2430 }, { "epoch": 0.61, "grad_norm": 1.1171875, "learning_rate": 1.9823549576274048e-06, "loss": 0.1955, "step": 2440 }, { "epoch": 0.6125, "grad_norm": 1.1640625, "learning_rate": 1.961031118394418e-06, "loss": 0.1881, "step": 2450 }, { "epoch": 0.615, "grad_norm": 1.453125, "learning_rate": 1.939748323728468e-06, "loss": 0.1976, "step": 2460 }, { "epoch": 0.6175, "grad_norm": 1.4296875, "learning_rate": 1.918508194396769e-06, "loss": 0.2028, "step": 2470 }, { "epoch": 0.62, "grad_norm": 1.828125, "learning_rate": 1.8973123479174038e-06, "loss": 0.2065, "step": 2480 }, { "epoch": 0.6225, "grad_norm": 1.3359375, "learning_rate": 1.8761623984361444e-06, "loss": 0.198, "step": 2490 }, { "epoch": 0.625, "grad_norm": 1.1171875, "learning_rate": 1.8550599566035299e-06, "loss": 0.192, "step": 2500 }, { "epoch": 0.6275, "grad_norm": 1.2265625, "learning_rate": 1.834006629452207e-06, "loss": 0.1894, "step": 2510 }, { "epoch": 0.63, "grad_norm": 1.15625, "learning_rate": 1.8130040202745488e-06, "loss": 0.203, "step": 2520 }, { "epoch": 0.6325, "grad_norm": 1.171875, "learning_rate": 1.7920537285005607e-06, "loss": 0.1879, "step": 2530 }, { "epoch": 0.635, "grad_norm": 1.1953125, "learning_rate": 1.7711573495760725e-06, "loss": 0.1887, "step": 2540 }, { "epoch": 0.6375, "grad_norm": 1.2578125, "learning_rate": 1.750316474841242e-06, "loss": 0.1894, "step": 2550 }, { "epoch": 0.64, "grad_norm": 1.1015625, "learning_rate": 1.7295326914093713e-06, "loss": 0.1991, "step": 2560 }, { "epoch": 0.6425, "grad_norm": 1.3984375, "learning_rate": 1.7088075820460348e-06, "loss": 0.1982, "step": 2570 }, { "epoch": 0.645, "grad_norm": 1.1171875, "learning_rate": 1.6881427250485516e-06, "loss": 0.1876, "step": 2580 }, { "epoch": 0.6475, "grad_norm": 1.1015625, "learning_rate": 1.6675396941257896e-06, "loss": 0.1853, "step": 2590 }, { "epoch": 0.65, "grad_norm": 1.1171875, "learning_rate": 1.6470000582783205e-06, "loss": 0.1917, "step": 2600 }, { "epoch": 0.6525, "grad_norm": 1.296875, "learning_rate": 1.6265253816789372e-06, "loss": 0.2017, "step": 2610 }, { "epoch": 0.655, "grad_norm": 1.203125, "learning_rate": 1.6061172235535342e-06, "loss": 0.193, "step": 2620 }, { "epoch": 0.6575, "grad_norm": 1.25, "learning_rate": 1.5857771380623643e-06, "loss": 0.2018, "step": 2630 }, { "epoch": 0.66, "grad_norm": 1.4765625, "learning_rate": 1.5655066741816898e-06, "loss": 0.2062, "step": 2640 }, { "epoch": 0.6625, "grad_norm": 1.1484375, "learning_rate": 1.545307375585814e-06, "loss": 0.1961, "step": 2650 }, { "epoch": 0.665, "grad_norm": 1.421875, "learning_rate": 1.5251807805295302e-06, "loss": 0.208, "step": 2660 }, { "epoch": 0.6675, "grad_norm": 1.125, "learning_rate": 1.5051284217309743e-06, "loss": 0.1951, "step": 2670 }, { "epoch": 0.67, "grad_norm": 1.0859375, "learning_rate": 1.4851518262549058e-06, "loss": 0.1936, "step": 2680 }, { "epoch": 0.6725, "grad_norm": 1.25, "learning_rate": 1.465252515396413e-06, "loss": 0.2048, "step": 2690 }, { "epoch": 0.675, "grad_norm": 1.1328125, "learning_rate": 1.4454320045650606e-06, "loss": 0.1979, "step": 2700 }, { "epoch": 0.6775, "grad_norm": 1.265625, "learning_rate": 1.4256918031694866e-06, "loss": 0.198, "step": 2710 }, { "epoch": 0.68, "grad_norm": 1.15625, "learning_rate": 1.4060334145024543e-06, "loss": 0.1902, "step": 2720 }, { "epoch": 0.6825, "grad_norm": 1.046875, "learning_rate": 1.3864583356263706e-06, "loss": 0.1911, "step": 2730 }, { "epoch": 0.685, "grad_norm": 1.15625, "learning_rate": 1.366968057259282e-06, "loss": 0.2109, "step": 2740 }, { "epoch": 0.6875, "grad_norm": 1.140625, "learning_rate": 1.3475640636613447e-06, "loss": 0.2015, "step": 2750 }, { "epoch": 0.69, "grad_norm": 1.265625, "learning_rate": 1.3282478325217961e-06, "loss": 0.2006, "step": 2760 }, { "epoch": 0.6925, "grad_norm": 1.046875, "learning_rate": 1.3090208348464244e-06, "loss": 0.1842, "step": 2770 }, { "epoch": 0.695, "grad_norm": 1.2890625, "learning_rate": 1.289884534845542e-06, "loss": 0.2091, "step": 2780 }, { "epoch": 0.6975, "grad_norm": 0.953125, "learning_rate": 1.2708403898224839e-06, "loss": 0.185, "step": 2790 }, { "epoch": 0.7, "grad_norm": 1.0625, "learning_rate": 1.2518898500626259e-06, "loss": 0.1875, "step": 2800 }, { "epoch": 0.7025, "grad_norm": 1.140625, "learning_rate": 1.2330343587229397e-06, "loss": 0.1926, "step": 2810 }, { "epoch": 0.705, "grad_norm": 1.234375, "learning_rate": 1.2142753517220945e-06, "loss": 0.1988, "step": 2820 }, { "epoch": 0.7075, "grad_norm": 1.046875, "learning_rate": 1.1956142576311011e-06, "loss": 0.1988, "step": 2830 }, { "epoch": 0.71, "grad_norm": 1.1484375, "learning_rate": 1.177052497564524e-06, "loss": 0.1817, "step": 2840 }, { "epoch": 0.7125, "grad_norm": 1.2109375, "learning_rate": 1.1585914850722565e-06, "loss": 0.1887, "step": 2850 }, { "epoch": 0.715, "grad_norm": 1.140625, "learning_rate": 1.1402326260318752e-06, "loss": 0.1821, "step": 2860 }, { "epoch": 0.7175, "grad_norm": 1.234375, "learning_rate": 1.121977318541575e-06, "loss": 0.1948, "step": 2870 }, { "epoch": 0.72, "grad_norm": 1.2265625, "learning_rate": 1.1038269528136989e-06, "loss": 0.2054, "step": 2880 }, { "epoch": 0.7225, "grad_norm": 1.0859375, "learning_rate": 1.0857829110688695e-06, "loss": 0.1956, "step": 2890 }, { "epoch": 0.725, "grad_norm": 1.09375, "learning_rate": 1.0678465674307273e-06, "loss": 0.2033, "step": 2900 }, { "epoch": 0.7275, "grad_norm": 1.046875, "learning_rate": 1.0500192878212826e-06, "loss": 0.1986, "step": 2910 }, { "epoch": 0.73, "grad_norm": 1.1328125, "learning_rate": 1.032302429856899e-06, "loss": 0.1979, "step": 2920 }, { "epoch": 0.7325, "grad_norm": 1.28125, "learning_rate": 1.014697342744904e-06, "loss": 0.1994, "step": 2930 }, { "epoch": 0.735, "grad_norm": 1.0859375, "learning_rate": 9.97205367180842e-07, "loss": 0.1921, "step": 2940 }, { "epoch": 0.7375, "grad_norm": 1.1328125, "learning_rate": 9.798278352463752e-07, "loss": 0.1999, "step": 2950 }, { "epoch": 0.74, "grad_norm": 1.1015625, "learning_rate": 9.625660703078394e-07, "loss": 0.2021, "step": 2960 }, { "epoch": 0.7425, "grad_norm": 1.296875, "learning_rate": 9.45421386915468e-07, "loss": 0.2027, "step": 2970 }, { "epoch": 0.745, "grad_norm": 1.34375, "learning_rate": 9.283950907032788e-07, "loss": 0.1898, "step": 2980 }, { "epoch": 0.7475, "grad_norm": 1.1484375, "learning_rate": 9.114884782896482e-07, "loss": 0.1943, "step": 2990 }, { "epoch": 0.75, "grad_norm": 1.3515625, "learning_rate": 8.947028371785677e-07, "loss": 0.2189, "step": 3000 }, { "epoch": 0.7525, "grad_norm": 1.0703125, "learning_rate": 8.780394456615974e-07, "loss": 0.1889, "step": 3010 }, { "epoch": 0.755, "grad_norm": 1.3359375, "learning_rate": 8.614995727205155e-07, "loss": 0.2013, "step": 3020 }, { "epoch": 0.7575, "grad_norm": 1.0703125, "learning_rate": 8.450844779306827e-07, "loss": 0.1867, "step": 3030 }, { "epoch": 0.76, "grad_norm": 1.0234375, "learning_rate": 8.28795411365122e-07, "loss": 0.2004, "step": 3040 }, { "epoch": 0.7625, "grad_norm": 1.484375, "learning_rate": 8.126336134993176e-07, "loss": 0.1959, "step": 3050 }, { "epoch": 0.765, "grad_norm": 1.1484375, "learning_rate": 7.966003151167498e-07, "loss": 0.1855, "step": 3060 }, { "epoch": 0.7675, "grad_norm": 1.125, "learning_rate": 7.806967372151661e-07, "loss": 0.1901, "step": 3070 }, { "epoch": 0.77, "grad_norm": 1.609375, "learning_rate": 7.649240909135966e-07, "loss": 0.2125, "step": 3080 }, { "epoch": 0.7725, "grad_norm": 1.171875, "learning_rate": 7.492835773601234e-07, "loss": 0.1916, "step": 3090 }, { "epoch": 0.775, "grad_norm": 1.140625, "learning_rate": 7.337763876404078e-07, "loss": 0.1844, "step": 3100 }, { "epoch": 0.7775, "grad_norm": 1.0390625, "learning_rate": 7.184037026869867e-07, "loss": 0.1908, "step": 3110 }, { "epoch": 0.78, "grad_norm": 1.1015625, "learning_rate": 7.031666931893361e-07, "loss": 0.1858, "step": 3120 }, { "epoch": 0.7825, "grad_norm": 1.171875, "learning_rate": 6.880665195047226e-07, "loss": 0.1976, "step": 3130 }, { "epoch": 0.785, "grad_norm": 1.1015625, "learning_rate": 6.731043315698346e-07, "loss": 0.1901, "step": 3140 }, { "epoch": 0.7875, "grad_norm": 1.15625, "learning_rate": 6.58281268813212e-07, "loss": 0.2021, "step": 3150 }, { "epoch": 0.79, "grad_norm": 1.4296875, "learning_rate": 6.435984600684731e-07, "loss": 0.2088, "step": 3160 }, { "epoch": 0.7925, "grad_norm": 1.0390625, "learning_rate": 6.290570234883506e-07, "loss": 0.2004, "step": 3170 }, { "epoch": 0.795, "grad_norm": 4.46875, "learning_rate": 6.146580664595391e-07, "loss": 0.1964, "step": 3180 }, { "epoch": 0.7975, "grad_norm": 1.15625, "learning_rate": 6.004026855183656e-07, "loss": 0.1996, "step": 3190 }, { "epoch": 0.8, "grad_norm": 1.0390625, "learning_rate": 5.862919662672801e-07, "loss": 0.1919, "step": 3200 }, { "epoch": 0.8025, "grad_norm": 1.1796875, "learning_rate": 5.723269832921849e-07, "loss": 0.202, "step": 3210 }, { "epoch": 0.805, "grad_norm": 1.1015625, "learning_rate": 5.585088000806016e-07, "loss": 0.2024, "step": 3220 }, { "epoch": 0.8075, "grad_norm": 1.2734375, "learning_rate": 5.448384689406804e-07, "loss": 0.1962, "step": 3230 }, { "epoch": 0.81, "grad_norm": 1.2890625, "learning_rate": 5.313170309210655e-07, "loss": 0.1879, "step": 3240 }, { "epoch": 0.8125, "grad_norm": 0.99609375, "learning_rate": 5.179455157316124e-07, "loss": 0.1987, "step": 3250 }, { "epoch": 0.815, "grad_norm": 1.15625, "learning_rate": 5.047249416649713e-07, "loss": 0.2078, "step": 3260 }, { "epoch": 0.8175, "grad_norm": 14.25, "learning_rate": 4.916563155190446e-07, "loss": 0.2002, "step": 3270 }, { "epoch": 0.82, "grad_norm": 1.09375, "learning_rate": 4.787406325203101e-07, "loss": 0.1948, "step": 3280 }, { "epoch": 0.8225, "grad_norm": 1.0859375, "learning_rate": 4.6597887624803273e-07, "loss": 0.1981, "step": 3290 }, { "epoch": 0.825, "grad_norm": 1.1796875, "learning_rate": 4.533720185593621e-07, "loss": 0.1965, "step": 3300 }, { "epoch": 0.8275, "grad_norm": 1.1875, "learning_rate": 4.4092101951532076e-07, "loss": 0.1926, "step": 3310 }, { "epoch": 0.83, "grad_norm": 1.2578125, "learning_rate": 4.2862682730769157e-07, "loss": 0.1952, "step": 3320 }, { "epoch": 0.8325, "grad_norm": 1.1484375, "learning_rate": 4.164903781868096e-07, "loss": 0.1853, "step": 3330 }, { "epoch": 0.835, "grad_norm": 1.6015625, "learning_rate": 4.045125963902641e-07, "loss": 0.2008, "step": 3340 }, { "epoch": 0.8375, "grad_norm": 1.1015625, "learning_rate": 3.9269439407251365e-07, "loss": 0.2064, "step": 3350 }, { "epoch": 0.84, "grad_norm": 1.1640625, "learning_rate": 3.810366712354199e-07, "loss": 0.1833, "step": 3360 }, { "epoch": 0.8425, "grad_norm": 1.0546875, "learning_rate": 3.6954031565971187e-07, "loss": 0.1889, "step": 3370 }, { "epoch": 0.845, "grad_norm": 0.984375, "learning_rate": 3.5820620283737615e-07, "loss": 0.1785, "step": 3380 }, { "epoch": 0.8475, "grad_norm": 1.46875, "learning_rate": 3.4703519590498615e-07, "loss": 0.2022, "step": 3390 }, { "epoch": 0.85, "grad_norm": 1.140625, "learning_rate": 3.360281455779704e-07, "loss": 0.1942, "step": 3400 }, { "epoch": 0.8525, "grad_norm": 1.1640625, "learning_rate": 3.2518589008582597e-07, "loss": 0.1933, "step": 3410 }, { "epoch": 0.855, "grad_norm": 1.1875, "learning_rate": 3.1450925510828705e-07, "loss": 0.1902, "step": 3420 }, { "epoch": 0.8575, "grad_norm": 1.1796875, "learning_rate": 3.039990537124432e-07, "loss": 0.2161, "step": 3430 }, { "epoch": 0.86, "grad_norm": 1.078125, "learning_rate": 2.936560862908225e-07, "loss": 0.178, "step": 3440 }, { "epoch": 0.8625, "grad_norm": 1.234375, "learning_rate": 2.8348114050043813e-07, "loss": 0.2021, "step": 3450 }, { "epoch": 0.865, "grad_norm": 1.578125, "learning_rate": 2.7347499120280677e-07, "loss": 0.2043, "step": 3460 }, { "epoch": 0.8675, "grad_norm": 1.328125, "learning_rate": 2.636384004049375e-07, "loss": 0.1921, "step": 3470 }, { "epoch": 0.87, "grad_norm": 1.125, "learning_rate": 2.5397211720130267e-07, "loss": 0.19, "step": 3480 }, { "epoch": 0.8725, "grad_norm": 1.171875, "learning_rate": 2.4447687771679414e-07, "loss": 0.1914, "step": 3490 }, { "epoch": 0.875, "grad_norm": 1.09375, "learning_rate": 2.3515340505066043e-07, "loss": 0.1919, "step": 3500 }, { "epoch": 0.8775, "grad_norm": 1.0859375, "learning_rate": 2.260024092214419e-07, "loss": 0.2006, "step": 3510 }, { "epoch": 0.88, "grad_norm": 1.1640625, "learning_rate": 2.170245871129012e-07, "loss": 0.1982, "step": 3520 }, { "epoch": 0.8825, "grad_norm": 1.9453125, "learning_rate": 2.0822062242095015e-07, "loss": 0.2113, "step": 3530 }, { "epoch": 0.885, "grad_norm": 1.1171875, "learning_rate": 1.995911856015867e-07, "loss": 0.1969, "step": 3540 }, { "epoch": 0.8875, "grad_norm": 1.1796875, "learning_rate": 1.9113693381983405e-07, "loss": 0.1974, "step": 3550 }, { "epoch": 0.89, "grad_norm": 1.203125, "learning_rate": 1.8285851089969802e-07, "loss": 0.1987, "step": 3560 }, { "epoch": 0.8925, "grad_norm": 1.3046875, "learning_rate": 1.7475654727513502e-07, "loss": 0.201, "step": 3570 }, { "epoch": 0.895, "grad_norm": 1.21875, "learning_rate": 1.668316599420433e-07, "loss": 0.185, "step": 3580 }, { "epoch": 0.8975, "grad_norm": 1.0703125, "learning_rate": 1.5908445241127528e-07, "loss": 0.1872, "step": 3590 }, { "epoch": 0.9, "grad_norm": 1.1953125, "learning_rate": 1.5151551466267956e-07, "loss": 0.1985, "step": 3600 }, { "epoch": 0.9025, "grad_norm": 1.1796875, "learning_rate": 1.441254231001696e-07, "loss": 0.2036, "step": 3610 }, { "epoch": 0.905, "grad_norm": 1.125, "learning_rate": 1.3691474050782972e-07, "loss": 0.1902, "step": 3620 }, { "epoch": 0.9075, "grad_norm": 1.078125, "learning_rate": 1.2988401600705635e-07, "loss": 0.1921, "step": 3630 }, { "epoch": 0.91, "grad_norm": 1.1640625, "learning_rate": 1.2303378501474174e-07, "loss": 0.1752, "step": 3640 }, { "epoch": 0.9125, "grad_norm": 1.1953125, "learning_rate": 1.16364569202497e-07, "loss": 0.1893, "step": 3650 }, { "epoch": 0.915, "grad_norm": 1.1953125, "learning_rate": 1.0987687645692746e-07, "loss": 0.2005, "step": 3660 }, { "epoch": 0.9175, "grad_norm": 1.4375, "learning_rate": 1.035712008409534e-07, "loss": 0.1948, "step": 3670 }, { "epoch": 0.92, "grad_norm": 1.125, "learning_rate": 9.744802255618662e-08, "loss": 0.1903, "step": 3680 }, { "epoch": 0.9225, "grad_norm": 1.109375, "learning_rate": 9.150780790636054e-08, "loss": 0.2147, "step": 3690 }, { "epoch": 0.925, "grad_norm": 1.2734375, "learning_rate": 8.575100926181884e-08, "loss": 0.19, "step": 3700 }, { "epoch": 0.9275, "grad_norm": 1.09375, "learning_rate": 8.017806502506692e-08, "loss": 0.2004, "step": 3710 }, { "epoch": 0.93, "grad_norm": 1.2421875, "learning_rate": 7.478939959738502e-08, "loss": 0.2059, "step": 3720 }, { "epoch": 0.9325, "grad_norm": 1.234375, "learning_rate": 6.958542334650847e-08, "loss": 0.1908, "step": 3730 }, { "epoch": 0.935, "grad_norm": 1.6328125, "learning_rate": 6.456653257537665e-08, "loss": 0.1979, "step": 3740 }, { "epoch": 0.9375, "grad_norm": 1.0859375, "learning_rate": 5.973310949195343e-08, "loss": 0.1934, "step": 3750 }, { "epoch": 0.94, "grad_norm": 1.078125, "learning_rate": 5.50855221801197e-08, "loss": 0.1893, "step": 3760 }, { "epoch": 0.9425, "grad_norm": 1.15625, "learning_rate": 5.062412457164323e-08, "loss": 0.1985, "step": 3770 }, { "epoch": 0.945, "grad_norm": 1.1953125, "learning_rate": 4.634925641922472e-08, "loss": 0.2008, "step": 3780 }, { "epoch": 0.9475, "grad_norm": 1.234375, "learning_rate": 4.226124327062514e-08, "loss": 0.2056, "step": 3790 }, { "epoch": 0.95, "grad_norm": 1.0625, "learning_rate": 3.836039644387307e-08, "loss": 0.2036, "step": 3800 }, { "epoch": 0.9525, "grad_norm": 16.25, "learning_rate": 3.4647013003556996e-08, "loss": 0.199, "step": 3810 }, { "epoch": 0.955, "grad_norm": 1.1015625, "learning_rate": 3.112137573820284e-08, "loss": 0.1917, "step": 3820 }, { "epoch": 0.9575, "grad_norm": 1.0625, "learning_rate": 2.7783753138738713e-08, "loss": 0.1946, "step": 3830 }, { "epoch": 0.96, "grad_norm": 1.1171875, "learning_rate": 2.463439937804707e-08, "loss": 0.1816, "step": 3840 }, { "epoch": 0.9625, "grad_norm": 1.2421875, "learning_rate": 2.1673554291610775e-08, "loss": 0.1859, "step": 3850 }, { "epoch": 0.965, "grad_norm": 1.375, "learning_rate": 1.8901443359245765e-08, "loss": 0.1993, "step": 3860 }, { "epoch": 0.9675, "grad_norm": 1.328125, "learning_rate": 1.6318277687932816e-08, "loss": 0.2007, "step": 3870 }, { "epoch": 0.97, "grad_norm": 1.1484375, "learning_rate": 1.3924253995738769e-08, "loss": 0.1925, "step": 3880 }, { "epoch": 0.9725, "grad_norm": 1.15625, "learning_rate": 1.1719554596836546e-08, "loss": 0.1996, "step": 3890 }, { "epoch": 0.975, "grad_norm": 1.125, "learning_rate": 9.704347387620994e-09, "loss": 0.1836, "step": 3900 }, { "epoch": 0.9775, "grad_norm": 1.1640625, "learning_rate": 7.878785833923819e-09, "loss": 0.1928, "step": 3910 }, { "epoch": 0.98, "grad_norm": 1.46875, "learning_rate": 6.243008959324892e-09, "loss": 0.1967, "step": 3920 }, { "epoch": 0.9825, "grad_norm": 1.171875, "learning_rate": 4.797141334566268e-09, "loss": 0.1992, "step": 3930 }, { "epoch": 0.985, "grad_norm": 1.2734375, "learning_rate": 3.5412930680658876e-09, "loss": 0.1993, "step": 3940 }, { "epoch": 0.9875, "grad_norm": 1.125, "learning_rate": 2.475559797531224e-09, "loss": 0.1989, "step": 3950 }, { "epoch": 0.99, "grad_norm": 1.09375, "learning_rate": 1.6000226826770604e-09, "loss": 0.1826, "step": 3960 }, { "epoch": 0.9925, "grad_norm": 1.171875, "learning_rate": 9.147483990443184e-10, "loss": 0.1969, "step": 3970 }, { "epoch": 0.995, "grad_norm": 1.21875, "learning_rate": 4.197891329230097e-10, "loss": 0.2085, "step": 3980 }, { "epoch": 0.9975, "grad_norm": 1.2890625, "learning_rate": 1.1518257737763716e-10, "loss": 0.1928, "step": 3990 }, { "epoch": 1.0, "grad_norm": 1.125, "learning_rate": 9.51929376435956e-13, "loss": 0.1986, "step": 4000 } ], "logging_steps": 10, "max_steps": 4000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.82656621576192e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }