{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025, "grad_norm": 19.25, "learning_rate": 1.1250000000000001e-07, "loss": 1.8141, "step": 10 }, { "epoch": 0.005, "grad_norm": 15.25, "learning_rate": 2.3750000000000003e-07, "loss": 1.7844, "step": 20 }, { "epoch": 0.0075, "grad_norm": 18.25, "learning_rate": 3.625e-07, "loss": 1.7555, "step": 30 }, { "epoch": 0.01, "grad_norm": 14.0625, "learning_rate": 4.875000000000001e-07, "loss": 1.7414, "step": 40 }, { "epoch": 0.0125, "grad_norm": 17.75, "learning_rate": 6.125000000000001e-07, "loss": 1.7531, "step": 50 }, { "epoch": 0.015, "grad_norm": 15.5, "learning_rate": 7.375e-07, "loss": 1.7852, "step": 60 }, { "epoch": 0.0175, "grad_norm": 17.875, "learning_rate": 8.625e-07, "loss": 1.7844, "step": 70 }, { "epoch": 0.02, "grad_norm": 14.9375, "learning_rate": 9.875e-07, "loss": 1.7883, "step": 80 }, { "epoch": 0.0225, "grad_norm": 11.0, "learning_rate": 1.1125000000000001e-06, "loss": 1.7312, "step": 90 }, { "epoch": 0.025, "grad_norm": 12.9375, "learning_rate": 1.2375e-06, "loss": 1.75, "step": 100 }, { "epoch": 0.0275, "grad_norm": 12.0625, "learning_rate": 1.3625000000000003e-06, "loss": 1.7477, "step": 110 }, { "epoch": 0.03, "grad_norm": 14.3125, "learning_rate": 1.4875000000000002e-06, "loss": 1.7312, "step": 120 }, { "epoch": 0.0325, "grad_norm": 12.375, "learning_rate": 1.6125000000000002e-06, "loss": 1.7414, "step": 130 }, { "epoch": 0.035, "grad_norm": 12.3125, "learning_rate": 1.7375e-06, "loss": 1.7367, "step": 140 }, { "epoch": 0.0375, "grad_norm": 15.1875, "learning_rate": 1.8625000000000002e-06, "loss": 1.7656, "step": 150 }, { "epoch": 0.04, "grad_norm": 13.25, "learning_rate": 1.9875000000000005e-06, "loss": 1.7523, "step": 160 }, { "epoch": 0.0425, "grad_norm": 10.125, "learning_rate": 2.1125e-06, "loss": 1.6953, "step": 170 }, { "epoch": 0.045, "grad_norm": 13.375, "learning_rate": 2.2375e-06, "loss": 1.7109, "step": 180 }, { "epoch": 0.0475, "grad_norm": 9.5625, "learning_rate": 2.3625000000000003e-06, "loss": 1.7047, "step": 190 }, { "epoch": 0.05, "grad_norm": 11.875, "learning_rate": 2.4875000000000003e-06, "loss": 1.675, "step": 200 }, { "epoch": 0.0525, "grad_norm": 14.5, "learning_rate": 2.6125e-06, "loss": 1.6836, "step": 210 }, { "epoch": 0.055, "grad_norm": 12.875, "learning_rate": 2.7375e-06, "loss": 1.7133, "step": 220 }, { "epoch": 0.0575, "grad_norm": 11.0625, "learning_rate": 2.8625e-06, "loss": 1.6461, "step": 230 }, { "epoch": 0.06, "grad_norm": 13.625, "learning_rate": 2.9875e-06, "loss": 1.6242, "step": 240 }, { "epoch": 0.0625, "grad_norm": 15.375, "learning_rate": 3.1125000000000007e-06, "loss": 1.6523, "step": 250 }, { "epoch": 0.065, "grad_norm": 10.5, "learning_rate": 3.2375e-06, "loss": 1.6141, "step": 260 }, { "epoch": 0.0675, "grad_norm": 13.0625, "learning_rate": 3.3625000000000004e-06, "loss": 1.5766, "step": 270 }, { "epoch": 0.07, "grad_norm": 13.6875, "learning_rate": 3.4875000000000005e-06, "loss": 1.5625, "step": 280 }, { "epoch": 0.0725, "grad_norm": 19.0, "learning_rate": 3.6125000000000006e-06, "loss": 1.5367, "step": 290 }, { "epoch": 0.075, "grad_norm": 18.75, "learning_rate": 3.7375000000000006e-06, "loss": 1.4852, "step": 300 }, { "epoch": 0.0775, "grad_norm": 11.3125, "learning_rate": 3.8625e-06, "loss": 1.4773, "step": 310 }, { "epoch": 0.08, "grad_norm": 14.9375, "learning_rate": 3.9875e-06, "loss": 1.4672, "step": 320 }, { "epoch": 0.0825, "grad_norm": 12.3125, "learning_rate": 4.1125e-06, "loss": 1.4109, "step": 330 }, { "epoch": 0.085, "grad_norm": 12.625, "learning_rate": 4.2375000000000005e-06, "loss": 1.375, "step": 340 }, { "epoch": 0.0875, "grad_norm": 19.0, "learning_rate": 4.362500000000001e-06, "loss": 1.3438, "step": 350 }, { "epoch": 0.09, "grad_norm": 13.0625, "learning_rate": 4.4875e-06, "loss": 1.2742, "step": 360 }, { "epoch": 0.0925, "grad_norm": 12.625, "learning_rate": 4.6125e-06, "loss": 1.2508, "step": 370 }, { "epoch": 0.095, "grad_norm": 10.0625, "learning_rate": 4.737500000000001e-06, "loss": 1.268, "step": 380 }, { "epoch": 0.0975, "grad_norm": 10.125, "learning_rate": 4.8625000000000005e-06, "loss": 1.2055, "step": 390 }, { "epoch": 0.1, "grad_norm": 12.25, "learning_rate": 4.987500000000001e-06, "loss": 1.1742, "step": 400 }, { "epoch": 0.1025, "grad_norm": 12.4375, "learning_rate": 4.999922894111975e-06, "loss": 1.1199, "step": 410 }, { "epoch": 0.105, "grad_norm": 9.8125, "learning_rate": 4.999656361346094e-06, "loss": 1.1195, "step": 420 }, { "epoch": 0.1075, "grad_norm": 11.1875, "learning_rate": 4.999199470070484e-06, "loss": 1.0727, "step": 430 }, { "epoch": 0.11, "grad_norm": 11.625, "learning_rate": 4.998552255079182e-06, "loss": 1.0711, "step": 440 }, { "epoch": 0.1125, "grad_norm": 9.3125, "learning_rate": 4.9977147656601196e-06, "loss": 1.0414, "step": 450 }, { "epoch": 0.115, "grad_norm": 10.3125, "learning_rate": 4.996687065591355e-06, "loss": 1.0367, "step": 460 }, { "epoch": 0.1175, "grad_norm": 10.5, "learning_rate": 4.9954692331362295e-06, "loss": 1.0125, "step": 470 }, { "epoch": 0.12, "grad_norm": 10.625, "learning_rate": 4.9940613610373974e-06, "loss": 0.9828, "step": 480 }, { "epoch": 0.1225, "grad_norm": 10.125, "learning_rate": 4.992463556509772e-06, "loss": 0.9695, "step": 490 }, { "epoch": 0.125, "grad_norm": 10.75, "learning_rate": 4.990675941232353e-06, "loss": 0.9395, "step": 500 }, { "epoch": 0.1275, "grad_norm": 9.0, "learning_rate": 4.988698651338965e-06, "loss": 0.9137, "step": 510 }, { "epoch": 0.13, "grad_norm": 11.375, "learning_rate": 4.986531837407891e-06, "loss": 0.9082, "step": 520 }, { "epoch": 0.1325, "grad_norm": 12.5, "learning_rate": 4.9841756644503965e-06, "loss": 0.8809, "step": 530 }, { "epoch": 0.135, "grad_norm": 18.875, "learning_rate": 4.981630311898178e-06, "loss": 0.8648, "step": 540 }, { "epoch": 0.1375, "grad_norm": 9.8125, "learning_rate": 4.978895973589686e-06, "loss": 0.8633, "step": 550 }, { "epoch": 0.14, "grad_norm": 9.3125, "learning_rate": 4.975972857755369e-06, "loss": 0.8293, "step": 560 }, { "epoch": 0.1425, "grad_norm": 9.9375, "learning_rate": 4.972861187001815e-06, "loss": 0.8242, "step": 570 }, { "epoch": 0.145, "grad_norm": 9.25, "learning_rate": 4.9695611982947995e-06, "loss": 0.8031, "step": 580 }, { "epoch": 0.1475, "grad_norm": 9.0625, "learning_rate": 4.966073142941239e-06, "loss": 0.775, "step": 590 }, { "epoch": 0.15, "grad_norm": 9.4375, "learning_rate": 4.962397286570053e-06, "loss": 0.7996, "step": 600 }, { "epoch": 0.1525, "grad_norm": 10.5, "learning_rate": 4.958533909111936e-06, "loss": 0.7492, "step": 610 }, { "epoch": 0.155, "grad_norm": 11.1875, "learning_rate": 4.95448330477804e-06, "loss": 0.7184, "step": 620 }, { "epoch": 0.1575, "grad_norm": 9.6875, "learning_rate": 4.950245782037566e-06, "loss": 0.7023, "step": 630 }, { "epoch": 0.16, "grad_norm": 10.0625, "learning_rate": 4.945821663594277e-06, "loss": 0.6949, "step": 640 }, { "epoch": 0.1625, "grad_norm": 9.125, "learning_rate": 4.941211286361922e-06, "loss": 0.7043, "step": 650 }, { "epoch": 0.165, "grad_norm": 9.5625, "learning_rate": 4.936415001438577e-06, "loss": 0.6633, "step": 660 }, { "epoch": 0.1675, "grad_norm": 9.25, "learning_rate": 4.9314331740799084e-06, "loss": 0.652, "step": 670 }, { "epoch": 0.17, "grad_norm": 9.0625, "learning_rate": 4.926266183671356e-06, "loss": 0.6637, "step": 680 }, { "epoch": 0.1725, "grad_norm": 9.75, "learning_rate": 4.920914423699247e-06, "loss": 0.6262, "step": 690 }, { "epoch": 0.175, "grad_norm": 8.9375, "learning_rate": 4.915378301720822e-06, "loss": 0.6398, "step": 700 }, { "epoch": 0.1775, "grad_norm": 9.25, "learning_rate": 4.909658239333203e-06, "loss": 0.6195, "step": 710 }, { "epoch": 0.18, "grad_norm": 9.4375, "learning_rate": 4.903754672141288e-06, "loss": 0.5977, "step": 720 }, { "epoch": 0.1825, "grad_norm": 8.5, "learning_rate": 4.897668049724574e-06, "loss": 0.5766, "step": 730 }, { "epoch": 0.185, "grad_norm": 9.1875, "learning_rate": 4.891398835602925e-06, "loss": 0.5656, "step": 740 }, { "epoch": 0.1875, "grad_norm": 8.4375, "learning_rate": 4.884947507201268e-06, "loss": 0.5781, "step": 750 }, { "epoch": 0.19, "grad_norm": 9.0, "learning_rate": 4.878314555813237e-06, "loss": 0.559, "step": 760 }, { "epoch": 0.1925, "grad_norm": 8.5625, "learning_rate": 4.8715004865637616e-06, "loss": 0.5496, "step": 770 }, { "epoch": 0.195, "grad_norm": 8.5, "learning_rate": 4.8645058183705976e-06, "loss": 0.5264, "step": 780 }, { "epoch": 0.1975, "grad_norm": 9.0625, "learning_rate": 4.8573310839048085e-06, "loss": 0.5404, "step": 790 }, { "epoch": 0.2, "grad_norm": 8.625, "learning_rate": 4.8499768295502e-06, "loss": 0.5117, "step": 800 }, { "epoch": 0.2025, "grad_norm": 8.4375, "learning_rate": 4.842443615361718e-06, "loss": 0.5195, "step": 810 }, { "epoch": 0.205, "grad_norm": 8.4375, "learning_rate": 4.834732015022786e-06, "loss": 0.4818, "step": 820 }, { "epoch": 0.2075, "grad_norm": 8.25, "learning_rate": 4.826842615801628e-06, "loss": 0.4992, "step": 830 }, { "epoch": 0.21, "grad_norm": 8.375, "learning_rate": 4.818776018506538e-06, "loss": 0.4975, "step": 840 }, { "epoch": 0.2125, "grad_norm": 8.3125, "learning_rate": 4.810532837440134e-06, "loss": 0.4752, "step": 850 }, { "epoch": 0.215, "grad_norm": 8.0, "learning_rate": 4.802113700352567e-06, "loss": 0.4924, "step": 860 }, { "epoch": 0.2175, "grad_norm": 7.6875, "learning_rate": 4.793519248393721e-06, "loss": 0.4564, "step": 870 }, { "epoch": 0.22, "grad_norm": 8.125, "learning_rate": 4.78475013606439e-06, "loss": 0.443, "step": 880 }, { "epoch": 0.2225, "grad_norm": 7.65625, "learning_rate": 4.775807031166428e-06, "loss": 0.4428, "step": 890 }, { "epoch": 0.225, "grad_norm": 7.3125, "learning_rate": 4.766690614751897e-06, "loss": 0.4443, "step": 900 }, { "epoch": 0.2275, "grad_norm": 7.625, "learning_rate": 4.757401581071203e-06, "loss": 0.4482, "step": 910 }, { "epoch": 0.23, "grad_norm": 7.09375, "learning_rate": 4.747940637520226e-06, "loss": 0.4332, "step": 920 }, { "epoch": 0.2325, "grad_norm": 7.65625, "learning_rate": 4.738308504586445e-06, "loss": 0.4215, "step": 930 }, { "epoch": 0.235, "grad_norm": 6.53125, "learning_rate": 4.7285059157940765e-06, "loss": 0.4332, "step": 940 }, { "epoch": 0.2375, "grad_norm": 7.03125, "learning_rate": 4.718533617648209e-06, "loss": 0.4092, "step": 950 }, { "epoch": 0.24, "grad_norm": 6.96875, "learning_rate": 4.7083923695779546e-06, "loss": 0.4297, "step": 960 }, { "epoch": 0.2425, "grad_norm": 6.78125, "learning_rate": 4.6980829438786176e-06, "loss": 0.3949, "step": 970 }, { "epoch": 0.245, "grad_norm": 7.46875, "learning_rate": 4.687606125652882e-06, "loss": 0.4023, "step": 980 }, { "epoch": 0.2475, "grad_norm": 6.75, "learning_rate": 4.676962712751015e-06, "loss": 0.3988, "step": 990 }, { "epoch": 0.25, "grad_norm": 6.5, "learning_rate": 4.666153515710118e-06, "loss": 0.3975, "step": 1000 }, { "epoch": 0.2525, "grad_norm": 6.84375, "learning_rate": 4.655179357692396e-06, "loss": 0.4049, "step": 1010 }, { "epoch": 0.255, "grad_norm": 6.375, "learning_rate": 4.644041074422469e-06, "loss": 0.4037, "step": 1020 }, { "epoch": 0.2575, "grad_norm": 7.03125, "learning_rate": 4.632739514123733e-06, "loss": 0.3846, "step": 1030 }, { "epoch": 0.26, "grad_norm": 6.09375, "learning_rate": 4.6212755374537596e-06, "loss": 0.376, "step": 1040 }, { "epoch": 0.2625, "grad_norm": 6.59375, "learning_rate": 4.609650017438757e-06, "loss": 0.4123, "step": 1050 }, { "epoch": 0.265, "grad_norm": 6.84375, "learning_rate": 4.5978638394070835e-06, "loss": 0.3816, "step": 1060 }, { "epoch": 0.2675, "grad_norm": 6.1875, "learning_rate": 4.58591790092183e-06, "loss": 0.3854, "step": 1070 }, { "epoch": 0.27, "grad_norm": 6.75, "learning_rate": 4.5738131117124605e-06, "loss": 0.3783, "step": 1080 }, { "epoch": 0.2725, "grad_norm": 5.84375, "learning_rate": 4.561550393605541e-06, "loss": 0.384, "step": 1090 }, { "epoch": 0.275, "grad_norm": 7.625, "learning_rate": 4.549130680454532e-06, "loss": 0.41, "step": 1100 }, { "epoch": 0.2775, "grad_norm": 5.9375, "learning_rate": 4.536554918068673e-06, "loss": 0.3664, "step": 1110 }, { "epoch": 0.28, "grad_norm": 5.03125, "learning_rate": 4.523824064140961e-06, "loss": 0.3727, "step": 1120 }, { "epoch": 0.2825, "grad_norm": 6.09375, "learning_rate": 4.510939088175211e-06, "loss": 0.3764, "step": 1130 }, { "epoch": 0.285, "grad_norm": 5.875, "learning_rate": 4.49790097141223e-06, "loss": 0.358, "step": 1140 }, { "epoch": 0.2875, "grad_norm": 5.875, "learning_rate": 4.484710706755087e-06, "loss": 0.3549, "step": 1150 }, { "epoch": 0.29, "grad_norm": 5.84375, "learning_rate": 4.471369298693505e-06, "loss": 0.3553, "step": 1160 }, { "epoch": 0.2925, "grad_norm": 5.75, "learning_rate": 4.457877763227361e-06, "loss": 0.3623, "step": 1170 }, { "epoch": 0.295, "grad_norm": 5.5625, "learning_rate": 4.444237127789315e-06, "loss": 0.3629, "step": 1180 }, { "epoch": 0.2975, "grad_norm": 6.96875, "learning_rate": 4.430448431166567e-06, "loss": 0.3434, "step": 1190 }, { "epoch": 0.3, "grad_norm": 5.9375, "learning_rate": 4.416512723421752e-06, "loss": 0.3549, "step": 1200 }, { "epoch": 0.3025, "grad_norm": 5.53125, "learning_rate": 4.402431065812968e-06, "loss": 0.3461, "step": 1210 }, { "epoch": 0.305, "grad_norm": 5.375, "learning_rate": 4.388204530712959e-06, "loss": 0.3547, "step": 1220 }, { "epoch": 0.3075, "grad_norm": 5.90625, "learning_rate": 4.373834201527457e-06, "loss": 0.3383, "step": 1230 }, { "epoch": 0.31, "grad_norm": 5.6875, "learning_rate": 4.359321172612664e-06, "loss": 0.3414, "step": 1240 }, { "epoch": 0.3125, "grad_norm": 5.15625, "learning_rate": 4.344666549191921e-06, "loss": 0.3285, "step": 1250 }, { "epoch": 0.315, "grad_norm": 5.4375, "learning_rate": 4.329871447271541e-06, "loss": 0.3352, "step": 1260 }, { "epoch": 0.3175, "grad_norm": 5.0625, "learning_rate": 4.314936993555816e-06, "loss": 0.3441, "step": 1270 }, { "epoch": 0.32, "grad_norm": 5.09375, "learning_rate": 4.299864325361217e-06, "loss": 0.3523, "step": 1280 }, { "epoch": 0.3225, "grad_norm": 5.71875, "learning_rate": 4.284654590529784e-06, "loss": 0.3348, "step": 1290 }, { "epoch": 0.325, "grad_norm": 5.28125, "learning_rate": 4.269308947341711e-06, "loss": 0.3252, "step": 1300 }, { "epoch": 0.3275, "grad_norm": 4.8125, "learning_rate": 4.25382856442714e-06, "loss": 0.3439, "step": 1310 }, { "epoch": 0.33, "grad_norm": 5.4375, "learning_rate": 4.238214620677164e-06, "loss": 0.3326, "step": 1320 }, { "epoch": 0.3325, "grad_norm": 4.75, "learning_rate": 4.222468305154052e-06, "loss": 0.3332, "step": 1330 }, { "epoch": 0.335, "grad_norm": 5.25, "learning_rate": 4.206590817000695e-06, "loss": 0.3277, "step": 1340 }, { "epoch": 0.3375, "grad_norm": 5.03125, "learning_rate": 4.190583365349289e-06, "loss": 0.3363, "step": 1350 }, { "epoch": 0.34, "grad_norm": 5.0, "learning_rate": 4.174447169229252e-06, "loss": 0.3412, "step": 1360 }, { "epoch": 0.3425, "grad_norm": 5.28125, "learning_rate": 4.158183457474392e-06, "loss": 0.326, "step": 1370 }, { "epoch": 0.345, "grad_norm": 5.3125, "learning_rate": 4.141793468629327e-06, "loss": 0.334, "step": 1380 }, { "epoch": 0.3475, "grad_norm": 5.0, "learning_rate": 4.125278450855165e-06, "loss": 0.3367, "step": 1390 }, { "epoch": 0.35, "grad_norm": 6.28125, "learning_rate": 4.1086396618344474e-06, "loss": 0.3168, "step": 1400 }, { "epoch": 0.3525, "grad_norm": 5.15625, "learning_rate": 4.09187836867538e-06, "loss": 0.3193, "step": 1410 }, { "epoch": 0.355, "grad_norm": 5.15625, "learning_rate": 4.074995847815331e-06, "loss": 0.3225, "step": 1420 }, { "epoch": 0.3575, "grad_norm": 4.96875, "learning_rate": 4.057993384923626e-06, "loss": 0.332, "step": 1430 }, { "epoch": 0.36, "grad_norm": 4.9375, "learning_rate": 4.0408722748036426e-06, "loss": 0.3221, "step": 1440 }, { "epoch": 0.3625, "grad_norm": 4.8125, "learning_rate": 4.023633821294203e-06, "loss": 0.3211, "step": 1450 }, { "epoch": 0.365, "grad_norm": 5.125, "learning_rate": 4.006279337170283e-06, "loss": 0.3195, "step": 1460 }, { "epoch": 0.3675, "grad_norm": 4.96875, "learning_rate": 3.988810144043041e-06, "loss": 0.3225, "step": 1470 }, { "epoch": 0.37, "grad_norm": 4.6875, "learning_rate": 3.971227572259167e-06, "loss": 0.3242, "step": 1480 }, { "epoch": 0.3725, "grad_norm": 5.0625, "learning_rate": 3.953532960799577e-06, "loss": 0.3205, "step": 1490 }, { "epoch": 0.375, "grad_norm": 4.71875, "learning_rate": 3.935727657177439e-06, "loss": 0.3152, "step": 1500 }, { "epoch": 0.3775, "grad_norm": 4.90625, "learning_rate": 3.917813017335562e-06, "loss": 0.2998, "step": 1510 }, { "epoch": 0.38, "grad_norm": 5.15625, "learning_rate": 3.899790405543129e-06, "loss": 0.3229, "step": 1520 }, { "epoch": 0.3825, "grad_norm": 4.46875, "learning_rate": 3.881661194291805e-06, "loss": 0.3088, "step": 1530 }, { "epoch": 0.385, "grad_norm": 4.9375, "learning_rate": 3.863426764191216e-06, "loss": 0.3205, "step": 1540 }, { "epoch": 0.3875, "grad_norm": 5.15625, "learning_rate": 3.845088503863813e-06, "loss": 0.3137, "step": 1550 }, { "epoch": 0.39, "grad_norm": 4.5, "learning_rate": 3.826647809839119e-06, "loss": 0.3055, "step": 1560 }, { "epoch": 0.3925, "grad_norm": 5.15625, "learning_rate": 3.8081060864473794e-06, "loss": 0.316, "step": 1570 }, { "epoch": 0.395, "grad_norm": 5.0625, "learning_rate": 3.7894647457126188e-06, "loss": 0.3215, "step": 1580 }, { "epoch": 0.3975, "grad_norm": 5.03125, "learning_rate": 3.770725207245106e-06, "loss": 0.3125, "step": 1590 }, { "epoch": 0.4, "grad_norm": 4.84375, "learning_rate": 3.751888898133249e-06, "loss": 0.3168, "step": 1600 }, { "epoch": 0.4025, "grad_norm": 5.03125, "learning_rate": 3.7329572528349145e-06, "loss": 0.3074, "step": 1610 }, { "epoch": 0.405, "grad_norm": 4.90625, "learning_rate": 3.7139317130681886e-06, "loss": 0.3248, "step": 1620 }, { "epoch": 0.4075, "grad_norm": 4.59375, "learning_rate": 3.694813727701584e-06, "loss": 0.3041, "step": 1630 }, { "epoch": 0.41, "grad_norm": 5.09375, "learning_rate": 3.675604752643706e-06, "loss": 0.3049, "step": 1640 }, { "epoch": 0.4125, "grad_norm": 4.625, "learning_rate": 3.6563062507323752e-06, "loss": 0.3285, "step": 1650 }, { "epoch": 0.415, "grad_norm": 4.75, "learning_rate": 3.6369196916232297e-06, "loss": 0.2977, "step": 1660 }, { "epoch": 0.4175, "grad_norm": 4.71875, "learning_rate": 3.6174465516778032e-06, "loss": 0.3074, "step": 1670 }, { "epoch": 0.42, "grad_norm": 4.625, "learning_rate": 3.5978883138510963e-06, "loss": 0.3055, "step": 1680 }, { "epoch": 0.4225, "grad_norm": 4.53125, "learning_rate": 3.578246467578642e-06, "loss": 0.3096, "step": 1690 }, { "epoch": 0.425, "grad_norm": 4.84375, "learning_rate": 3.558522508663081e-06, "loss": 0.3102, "step": 1700 }, { "epoch": 0.4275, "grad_norm": 4.6875, "learning_rate": 3.538717939160249e-06, "loss": 0.2945, "step": 1710 }, { "epoch": 0.43, "grad_norm": 4.625, "learning_rate": 3.5188342672647897e-06, "loss": 0.2992, "step": 1720 }, { "epoch": 0.4325, "grad_norm": 4.5, "learning_rate": 3.4988730071953005e-06, "loss": 0.3107, "step": 1730 }, { "epoch": 0.435, "grad_norm": 4.375, "learning_rate": 3.478835679079019e-06, "loss": 0.3031, "step": 1740 }, { "epoch": 0.4375, "grad_norm": 4.8125, "learning_rate": 3.4587238088360605e-06, "loss": 0.317, "step": 1750 }, { "epoch": 0.44, "grad_norm": 4.5625, "learning_rate": 3.438538928063208e-06, "loss": 0.2965, "step": 1760 }, { "epoch": 0.4425, "grad_norm": 4.65625, "learning_rate": 3.4182825739172826e-06, "loss": 0.2899, "step": 1770 }, { "epoch": 0.445, "grad_norm": 4.375, "learning_rate": 3.3979562889980777e-06, "loss": 0.3107, "step": 1780 }, { "epoch": 0.4475, "grad_norm": 4.46875, "learning_rate": 3.377561621230887e-06, "loss": 0.2855, "step": 1790 }, { "epoch": 0.45, "grad_norm": 4.375, "learning_rate": 3.357100123748621e-06, "loss": 0.3061, "step": 1800 }, { "epoch": 0.4525, "grad_norm": 4.53125, "learning_rate": 3.3365733547735334e-06, "loss": 0.3068, "step": 1810 }, { "epoch": 0.455, "grad_norm": 4.5, "learning_rate": 3.315982877498555e-06, "loss": 0.2834, "step": 1820 }, { "epoch": 0.4575, "grad_norm": 4.34375, "learning_rate": 3.2953302599682487e-06, "loss": 0.3029, "step": 1830 }, { "epoch": 0.46, "grad_norm": 4.5625, "learning_rate": 3.2746170749593998e-06, "loss": 0.3121, "step": 1840 }, { "epoch": 0.4625, "grad_norm": 4.46875, "learning_rate": 3.2538448998612394e-06, "loss": 0.293, "step": 1850 }, { "epoch": 0.465, "grad_norm": 4.875, "learning_rate": 3.233015316555326e-06, "loss": 0.2988, "step": 1860 }, { "epoch": 0.4675, "grad_norm": 4.53125, "learning_rate": 3.212129911295074e-06, "loss": 0.3002, "step": 1870 }, { "epoch": 0.47, "grad_norm": 7.6875, "learning_rate": 3.1911902745849526e-06, "loss": 0.3146, "step": 1880 }, { "epoch": 0.4725, "grad_norm": 4.3125, "learning_rate": 3.17019800105937e-06, "loss": 0.2926, "step": 1890 }, { "epoch": 0.475, "grad_norm": 4.46875, "learning_rate": 3.1491546893612296e-06, "loss": 0.2998, "step": 1900 }, { "epoch": 0.4775, "grad_norm": 4.71875, "learning_rate": 3.128061942020189e-06, "loss": 0.3104, "step": 1910 }, { "epoch": 0.48, "grad_norm": 4.75, "learning_rate": 3.1069213653306242e-06, "loss": 0.2955, "step": 1920 }, { "epoch": 0.4825, "grad_norm": 4.4375, "learning_rate": 3.0857345692292968e-06, "loss": 0.2937, "step": 1930 }, { "epoch": 0.485, "grad_norm": 4.25, "learning_rate": 3.0645031671727598e-06, "loss": 0.2878, "step": 1940 }, { "epoch": 0.4875, "grad_norm": 4.53125, "learning_rate": 3.0432287760144797e-06, "loss": 0.2988, "step": 1950 }, { "epoch": 0.49, "grad_norm": 4.21875, "learning_rate": 3.0219130158817093e-06, "loss": 0.3043, "step": 1960 }, { "epoch": 0.4925, "grad_norm": 4.8125, "learning_rate": 3.0005575100521115e-06, "loss": 0.2842, "step": 1970 }, { "epoch": 0.495, "grad_norm": 4.5625, "learning_rate": 2.979163884830137e-06, "loss": 0.2922, "step": 1980 }, { "epoch": 0.4975, "grad_norm": 4.9375, "learning_rate": 2.957733769423174e-06, "loss": 0.2998, "step": 1990 }, { "epoch": 0.5, "grad_norm": 4.5625, "learning_rate": 2.93626879581748e-06, "loss": 0.2908, "step": 2000 }, { "epoch": 0.5025, "grad_norm": 4.71875, "learning_rate": 2.914770598653902e-06, "loss": 0.3014, "step": 2010 }, { "epoch": 0.505, "grad_norm": 4.65625, "learning_rate": 2.8932408151033868e-06, "loss": 0.2891, "step": 2020 }, { "epoch": 0.5075, "grad_norm": 4.65625, "learning_rate": 2.8716810847423083e-06, "loss": 0.2877, "step": 2030 }, { "epoch": 0.51, "grad_norm": 4.5, "learning_rate": 2.8500930494276035e-06, "loss": 0.3008, "step": 2040 }, { "epoch": 0.5125, "grad_norm": 4.75, "learning_rate": 2.828478353171745e-06, "loss": 0.2863, "step": 2050 }, { "epoch": 0.515, "grad_norm": 4.71875, "learning_rate": 2.8068386420175376e-06, "loss": 0.3018, "step": 2060 }, { "epoch": 0.5175, "grad_norm": 4.34375, "learning_rate": 2.785175563912766e-06, "loss": 0.3012, "step": 2070 }, { "epoch": 0.52, "grad_norm": 4.53125, "learning_rate": 2.7634907685846995e-06, "loss": 0.2785, "step": 2080 }, { "epoch": 0.5225, "grad_norm": 4.6875, "learning_rate": 2.7417859074144604e-06, "loss": 0.2887, "step": 2090 }, { "epoch": 0.525, "grad_norm": 4.3125, "learning_rate": 2.7200626333112595e-06, "loss": 0.3049, "step": 2100 }, { "epoch": 0.5275, "grad_norm": 4.4375, "learning_rate": 2.6983226005865236e-06, "loss": 0.2885, "step": 2110 }, { "epoch": 0.53, "grad_norm": 4.65625, "learning_rate": 2.676567464827917e-06, "loss": 0.2982, "step": 2120 }, { "epoch": 0.5325, "grad_norm": 4.40625, "learning_rate": 2.6547988827732546e-06, "loss": 0.2895, "step": 2130 }, { "epoch": 0.535, "grad_norm": 4.4375, "learning_rate": 2.633018512184341e-06, "loss": 0.3061, "step": 2140 }, { "epoch": 0.5375, "grad_norm": 4.34375, "learning_rate": 2.6112280117207223e-06, "loss": 0.2891, "step": 2150 }, { "epoch": 0.54, "grad_norm": 4.375, "learning_rate": 2.5894290408133744e-06, "loss": 0.2841, "step": 2160 }, { "epoch": 0.5425, "grad_norm": 4.46875, "learning_rate": 2.56762325953833e-06, "loss": 0.3105, "step": 2170 }, { "epoch": 0.545, "grad_norm": 4.6875, "learning_rate": 2.5458123284902577e-06, "loss": 0.2842, "step": 2180 }, { "epoch": 0.5475, "grad_norm": 4.5625, "learning_rate": 2.5239979086560003e-06, "loss": 0.2842, "step": 2190 }, { "epoch": 0.55, "grad_norm": 5.03125, "learning_rate": 2.5021816612880884e-06, "loss": 0.2877, "step": 2200 }, { "epoch": 0.5525, "grad_norm": 4.5625, "learning_rate": 2.4803652477782228e-06, "loss": 0.2854, "step": 2210 }, { "epoch": 0.555, "grad_norm": 4.59375, "learning_rate": 2.4585503295307565e-06, "loss": 0.2977, "step": 2220 }, { "epoch": 0.5575, "grad_norm": 4.53125, "learning_rate": 2.436738567836176e-06, "loss": 0.2866, "step": 2230 }, { "epoch": 0.56, "grad_norm": 4.59375, "learning_rate": 2.4149316237445813e-06, "loss": 0.2797, "step": 2240 }, { "epoch": 0.5625, "grad_norm": 4.4375, "learning_rate": 2.3931311579391946e-06, "loss": 0.298, "step": 2250 }, { "epoch": 0.565, "grad_norm": 4.9375, "learning_rate": 2.37133883060989e-06, "loss": 0.2781, "step": 2260 }, { "epoch": 0.5675, "grad_norm": 4.65625, "learning_rate": 2.3495563013267668e-06, "loss": 0.2961, "step": 2270 }, { "epoch": 0.57, "grad_norm": 4.4375, "learning_rate": 2.3277852289137636e-06, "loss": 0.2891, "step": 2280 }, { "epoch": 0.5725, "grad_norm": 4.5625, "learning_rate": 2.306027271322336e-06, "loss": 0.2824, "step": 2290 }, { "epoch": 0.575, "grad_norm": 4.28125, "learning_rate": 2.284284085505192e-06, "loss": 0.2824, "step": 2300 }, { "epoch": 0.5775, "grad_norm": 4.59375, "learning_rate": 2.2625573272901156e-06, "loss": 0.2807, "step": 2310 }, { "epoch": 0.58, "grad_norm": 4.34375, "learning_rate": 2.240848651253863e-06, "loss": 0.2918, "step": 2320 }, { "epoch": 0.5825, "grad_norm": 4.625, "learning_rate": 2.2191597105961613e-06, "loss": 0.292, "step": 2330 }, { "epoch": 0.585, "grad_norm": 4.65625, "learning_rate": 2.1974921570138155e-06, "loss": 0.2791, "step": 2340 }, { "epoch": 0.5875, "grad_norm": 4.28125, "learning_rate": 2.1758476405749207e-06, "loss": 0.2807, "step": 2350 }, { "epoch": 0.59, "grad_norm": 4.0625, "learning_rate": 2.154227809593203e-06, "loss": 0.3051, "step": 2360 }, { "epoch": 0.5925, "grad_norm": 5.03125, "learning_rate": 2.1326343105024962e-06, "loss": 0.2787, "step": 2370 }, { "epoch": 0.595, "grad_norm": 4.4375, "learning_rate": 2.111068787731358e-06, "loss": 0.2942, "step": 2380 }, { "epoch": 0.5975, "grad_norm": 4.34375, "learning_rate": 2.089532883577843e-06, "loss": 0.2822, "step": 2390 }, { "epoch": 0.6, "grad_norm": 4.40625, "learning_rate": 2.068028238084432e-06, "loss": 0.2879, "step": 2400 }, { "epoch": 0.6025, "grad_norm": 4.6875, "learning_rate": 2.046556488913137e-06, "loss": 0.285, "step": 2410 }, { "epoch": 0.605, "grad_norm": 4.65625, "learning_rate": 2.025119271220789e-06, "loss": 0.2838, "step": 2420 }, { "epoch": 0.6075, "grad_norm": 4.75, "learning_rate": 2.0037182175345137e-06, "loss": 0.3084, "step": 2430 }, { "epoch": 0.61, "grad_norm": 4.34375, "learning_rate": 1.9823549576274048e-06, "loss": 0.2877, "step": 2440 }, { "epoch": 0.6125, "grad_norm": 4.8125, "learning_rate": 1.961031118394418e-06, "loss": 0.2707, "step": 2450 }, { "epoch": 0.615, "grad_norm": 4.53125, "learning_rate": 1.939748323728468e-06, "loss": 0.2808, "step": 2460 }, { "epoch": 0.6175, "grad_norm": 4.28125, "learning_rate": 1.918508194396769e-06, "loss": 0.2855, "step": 2470 }, { "epoch": 0.62, "grad_norm": 4.84375, "learning_rate": 1.8973123479174038e-06, "loss": 0.2892, "step": 2480 }, { "epoch": 0.6225, "grad_norm": 4.5, "learning_rate": 1.8761623984361444e-06, "loss": 0.2858, "step": 2490 }, { "epoch": 0.625, "grad_norm": 4.65625, "learning_rate": 1.8550599566035299e-06, "loss": 0.2791, "step": 2500 }, { "epoch": 0.6275, "grad_norm": 4.25, "learning_rate": 1.834006629452207e-06, "loss": 0.2859, "step": 2510 }, { "epoch": 0.63, "grad_norm": 4.34375, "learning_rate": 1.8130040202745488e-06, "loss": 0.2822, "step": 2520 }, { "epoch": 0.6325, "grad_norm": 4.71875, "learning_rate": 1.7920537285005607e-06, "loss": 0.291, "step": 2530 }, { "epoch": 0.635, "grad_norm": 4.625, "learning_rate": 1.7711573495760725e-06, "loss": 0.2809, "step": 2540 }, { "epoch": 0.6375, "grad_norm": 4.84375, "learning_rate": 1.750316474841242e-06, "loss": 0.2828, "step": 2550 }, { "epoch": 0.64, "grad_norm": 4.53125, "learning_rate": 1.7295326914093713e-06, "loss": 0.2826, "step": 2560 }, { "epoch": 0.6425, "grad_norm": 4.71875, "learning_rate": 1.7088075820460348e-06, "loss": 0.2797, "step": 2570 }, { "epoch": 0.645, "grad_norm": 4.46875, "learning_rate": 1.6881427250485516e-06, "loss": 0.2867, "step": 2580 }, { "epoch": 0.6475, "grad_norm": 4.1875, "learning_rate": 1.6675396941257896e-06, "loss": 0.2916, "step": 2590 }, { "epoch": 0.65, "grad_norm": 4.8125, "learning_rate": 1.6470000582783205e-06, "loss": 0.2734, "step": 2600 }, { "epoch": 0.6525, "grad_norm": 4.34375, "learning_rate": 1.6265253816789372e-06, "loss": 0.2771, "step": 2610 }, { "epoch": 0.655, "grad_norm": 4.5625, "learning_rate": 1.6061172235535342e-06, "loss": 0.2916, "step": 2620 }, { "epoch": 0.6575, "grad_norm": 4.5625, "learning_rate": 1.5857771380623643e-06, "loss": 0.2849, "step": 2630 }, { "epoch": 0.66, "grad_norm": 4.34375, "learning_rate": 1.5655066741816898e-06, "loss": 0.2889, "step": 2640 }, { "epoch": 0.6625, "grad_norm": 4.78125, "learning_rate": 1.545307375585814e-06, "loss": 0.28, "step": 2650 }, { "epoch": 0.665, "grad_norm": 4.46875, "learning_rate": 1.5251807805295302e-06, "loss": 0.2896, "step": 2660 }, { "epoch": 0.6675, "grad_norm": 4.375, "learning_rate": 1.5051284217309743e-06, "loss": 0.2957, "step": 2670 }, { "epoch": 0.67, "grad_norm": 4.71875, "learning_rate": 1.4851518262549058e-06, "loss": 0.2791, "step": 2680 }, { "epoch": 0.6725, "grad_norm": 3.875, "learning_rate": 1.465252515396413e-06, "loss": 0.2904, "step": 2690 }, { "epoch": 0.675, "grad_norm": 4.625, "learning_rate": 1.4454320045650606e-06, "loss": 0.291, "step": 2700 }, { "epoch": 0.6775, "grad_norm": 4.3125, "learning_rate": 1.4256918031694866e-06, "loss": 0.2803, "step": 2710 }, { "epoch": 0.68, "grad_norm": 4.34375, "learning_rate": 1.4060334145024543e-06, "loss": 0.2761, "step": 2720 }, { "epoch": 0.6825, "grad_norm": 4.4375, "learning_rate": 1.3864583356263706e-06, "loss": 0.2811, "step": 2730 }, { "epoch": 0.685, "grad_norm": 4.84375, "learning_rate": 1.366968057259282e-06, "loss": 0.2865, "step": 2740 }, { "epoch": 0.6875, "grad_norm": 4.375, "learning_rate": 1.3475640636613447e-06, "loss": 0.2865, "step": 2750 }, { "epoch": 0.69, "grad_norm": 4.5625, "learning_rate": 1.3282478325217961e-06, "loss": 0.2738, "step": 2760 }, { "epoch": 0.6925, "grad_norm": 4.46875, "learning_rate": 1.3090208348464244e-06, "loss": 0.2827, "step": 2770 }, { "epoch": 0.695, "grad_norm": 4.5, "learning_rate": 1.289884534845542e-06, "loss": 0.2831, "step": 2780 }, { "epoch": 0.6975, "grad_norm": 4.53125, "learning_rate": 1.2708403898224839e-06, "loss": 0.2918, "step": 2790 }, { "epoch": 0.7, "grad_norm": 4.25, "learning_rate": 1.2518898500626259e-06, "loss": 0.2802, "step": 2800 }, { "epoch": 0.7025, "grad_norm": 4.875, "learning_rate": 1.2330343587229397e-06, "loss": 0.2855, "step": 2810 }, { "epoch": 0.705, "grad_norm": 3.8125, "learning_rate": 1.2142753517220945e-06, "loss": 0.2723, "step": 2820 }, { "epoch": 0.7075, "grad_norm": 4.46875, "learning_rate": 1.1956142576311011e-06, "loss": 0.2845, "step": 2830 }, { "epoch": 0.71, "grad_norm": 4.5, "learning_rate": 1.177052497564524e-06, "loss": 0.286, "step": 2840 }, { "epoch": 0.7125, "grad_norm": 4.59375, "learning_rate": 1.1585914850722565e-06, "loss": 0.2892, "step": 2850 }, { "epoch": 0.715, "grad_norm": 4.46875, "learning_rate": 1.1402326260318752e-06, "loss": 0.2805, "step": 2860 }, { "epoch": 0.7175, "grad_norm": 4.96875, "learning_rate": 1.121977318541575e-06, "loss": 0.268, "step": 2870 }, { "epoch": 0.72, "grad_norm": 4.53125, "learning_rate": 1.1038269528136989e-06, "loss": 0.3004, "step": 2880 }, { "epoch": 0.7225, "grad_norm": 4.6875, "learning_rate": 1.0857829110688695e-06, "loss": 0.2844, "step": 2890 }, { "epoch": 0.725, "grad_norm": 4.21875, "learning_rate": 1.0678465674307273e-06, "loss": 0.2857, "step": 2900 }, { "epoch": 0.7275, "grad_norm": 4.5625, "learning_rate": 1.0500192878212826e-06, "loss": 0.2766, "step": 2910 }, { "epoch": 0.73, "grad_norm": 4.34375, "learning_rate": 1.032302429856899e-06, "loss": 0.2811, "step": 2920 }, { "epoch": 0.7325, "grad_norm": 4.5625, "learning_rate": 1.014697342744904e-06, "loss": 0.2915, "step": 2930 }, { "epoch": 0.735, "grad_norm": 4.5, "learning_rate": 9.97205367180842e-07, "loss": 0.2818, "step": 2940 }, { "epoch": 0.7375, "grad_norm": 4.40625, "learning_rate": 9.798278352463752e-07, "loss": 0.2786, "step": 2950 }, { "epoch": 0.74, "grad_norm": 4.375, "learning_rate": 9.625660703078394e-07, "loss": 0.2902, "step": 2960 }, { "epoch": 0.7425, "grad_norm": 4.15625, "learning_rate": 9.45421386915468e-07, "loss": 0.275, "step": 2970 }, { "epoch": 0.745, "grad_norm": 4.5, "learning_rate": 9.283950907032788e-07, "loss": 0.2768, "step": 2980 }, { "epoch": 0.7475, "grad_norm": 4.90625, "learning_rate": 9.114884782896482e-07, "loss": 0.2816, "step": 2990 }, { "epoch": 0.75, "grad_norm": 4.65625, "learning_rate": 8.947028371785677e-07, "loss": 0.2854, "step": 3000 }, { "epoch": 0.7525, "grad_norm": 4.5, "learning_rate": 8.780394456615974e-07, "loss": 0.2867, "step": 3010 }, { "epoch": 0.755, "grad_norm": 4.59375, "learning_rate": 8.614995727205155e-07, "loss": 0.2806, "step": 3020 }, { "epoch": 0.7575, "grad_norm": 4.71875, "learning_rate": 8.450844779306827e-07, "loss": 0.2857, "step": 3030 }, { "epoch": 0.76, "grad_norm": 4.84375, "learning_rate": 8.28795411365122e-07, "loss": 0.2798, "step": 3040 }, { "epoch": 0.7625, "grad_norm": 4.09375, "learning_rate": 8.126336134993176e-07, "loss": 0.2917, "step": 3050 }, { "epoch": 0.765, "grad_norm": 4.5, "learning_rate": 7.966003151167498e-07, "loss": 0.2762, "step": 3060 }, { "epoch": 0.7675, "grad_norm": 4.15625, "learning_rate": 7.806967372151661e-07, "loss": 0.2785, "step": 3070 }, { "epoch": 0.77, "grad_norm": 3.953125, "learning_rate": 7.649240909135966e-07, "loss": 0.2707, "step": 3080 }, { "epoch": 0.7725, "grad_norm": 4.75, "learning_rate": 7.492835773601234e-07, "loss": 0.2892, "step": 3090 }, { "epoch": 0.775, "grad_norm": 4.6875, "learning_rate": 7.337763876404078e-07, "loss": 0.2836, "step": 3100 }, { "epoch": 0.7775, "grad_norm": 4.28125, "learning_rate": 7.184037026869867e-07, "loss": 0.2817, "step": 3110 }, { "epoch": 0.78, "grad_norm": 4.65625, "learning_rate": 7.031666931893361e-07, "loss": 0.2774, "step": 3120 }, { "epoch": 0.7825, "grad_norm": 4.53125, "learning_rate": 6.880665195047226e-07, "loss": 0.2728, "step": 3130 }, { "epoch": 0.785, "grad_norm": 4.34375, "learning_rate": 6.731043315698346e-07, "loss": 0.292, "step": 3140 }, { "epoch": 0.7875, "grad_norm": 4.78125, "learning_rate": 6.58281268813212e-07, "loss": 0.2863, "step": 3150 }, { "epoch": 0.79, "grad_norm": 4.3125, "learning_rate": 6.435984600684731e-07, "loss": 0.2785, "step": 3160 }, { "epoch": 0.7925, "grad_norm": 4.71875, "learning_rate": 6.290570234883506e-07, "loss": 0.2809, "step": 3170 }, { "epoch": 0.795, "grad_norm": 4.3125, "learning_rate": 6.146580664595391e-07, "loss": 0.2836, "step": 3180 }, { "epoch": 0.7975, "grad_norm": 4.5625, "learning_rate": 6.004026855183656e-07, "loss": 0.2928, "step": 3190 }, { "epoch": 0.8, "grad_norm": 4.28125, "learning_rate": 5.862919662672801e-07, "loss": 0.2785, "step": 3200 }, { "epoch": 0.8025, "grad_norm": 4.34375, "learning_rate": 5.723269832921849e-07, "loss": 0.2794, "step": 3210 }, { "epoch": 0.805, "grad_norm": 4.65625, "learning_rate": 5.585088000806016e-07, "loss": 0.2951, "step": 3220 }, { "epoch": 0.8075, "grad_norm": 4.5625, "learning_rate": 5.448384689406804e-07, "loss": 0.2764, "step": 3230 }, { "epoch": 0.81, "grad_norm": 4.46875, "learning_rate": 5.313170309210655e-07, "loss": 0.2811, "step": 3240 }, { "epoch": 0.8125, "grad_norm": 4.28125, "learning_rate": 5.179455157316124e-07, "loss": 0.2779, "step": 3250 }, { "epoch": 0.815, "grad_norm": 4.28125, "learning_rate": 5.047249416649713e-07, "loss": 0.2884, "step": 3260 }, { "epoch": 0.8175, "grad_norm": 4.625, "learning_rate": 4.916563155190446e-07, "loss": 0.2847, "step": 3270 }, { "epoch": 0.82, "grad_norm": 4.4375, "learning_rate": 4.787406325203101e-07, "loss": 0.2717, "step": 3280 }, { "epoch": 0.8225, "grad_norm": 4.375, "learning_rate": 4.6597887624803273e-07, "loss": 0.2791, "step": 3290 }, { "epoch": 0.825, "grad_norm": 4.40625, "learning_rate": 4.533720185593621e-07, "loss": 0.285, "step": 3300 }, { "epoch": 0.8275, "grad_norm": 4.0, "learning_rate": 4.4092101951532076e-07, "loss": 0.287, "step": 3310 }, { "epoch": 0.83, "grad_norm": 4.40625, "learning_rate": 4.2862682730769157e-07, "loss": 0.283, "step": 3320 }, { "epoch": 0.8325, "grad_norm": 4.46875, "learning_rate": 4.164903781868096e-07, "loss": 0.274, "step": 3330 }, { "epoch": 0.835, "grad_norm": 5.03125, "learning_rate": 4.045125963902641e-07, "loss": 0.2686, "step": 3340 }, { "epoch": 0.8375, "grad_norm": 4.46875, "learning_rate": 3.9269439407251365e-07, "loss": 0.2926, "step": 3350 }, { "epoch": 0.84, "grad_norm": 4.34375, "learning_rate": 3.810366712354199e-07, "loss": 0.2704, "step": 3360 }, { "epoch": 0.8425, "grad_norm": 4.5, "learning_rate": 3.6954031565971187e-07, "loss": 0.2824, "step": 3370 }, { "epoch": 0.845, "grad_norm": 4.25, "learning_rate": 3.5820620283737615e-07, "loss": 0.2885, "step": 3380 }, { "epoch": 0.8475, "grad_norm": 4.28125, "learning_rate": 3.4703519590498615e-07, "loss": 0.2701, "step": 3390 }, { "epoch": 0.85, "grad_norm": 4.1875, "learning_rate": 3.360281455779704e-07, "loss": 0.2806, "step": 3400 }, { "epoch": 0.8525, "grad_norm": 4.28125, "learning_rate": 3.2518589008582597e-07, "loss": 0.2904, "step": 3410 }, { "epoch": 0.855, "grad_norm": 4.28125, "learning_rate": 3.1450925510828705e-07, "loss": 0.2768, "step": 3420 }, { "epoch": 0.8575, "grad_norm": 4.53125, "learning_rate": 3.039990537124432e-07, "loss": 0.2825, "step": 3430 }, { "epoch": 0.86, "grad_norm": 4.25, "learning_rate": 2.936560862908225e-07, "loss": 0.28, "step": 3440 }, { "epoch": 0.8625, "grad_norm": 4.28125, "learning_rate": 2.8348114050043813e-07, "loss": 0.2953, "step": 3450 }, { "epoch": 0.865, "grad_norm": 4.3125, "learning_rate": 2.7347499120280677e-07, "loss": 0.2745, "step": 3460 }, { "epoch": 0.8675, "grad_norm": 4.53125, "learning_rate": 2.636384004049375e-07, "loss": 0.2791, "step": 3470 }, { "epoch": 0.87, "grad_norm": 4.53125, "learning_rate": 2.5397211720130267e-07, "loss": 0.2949, "step": 3480 }, { "epoch": 0.8725, "grad_norm": 4.4375, "learning_rate": 2.4447687771679414e-07, "loss": 0.2795, "step": 3490 }, { "epoch": 0.875, "grad_norm": 4.25, "learning_rate": 2.3515340505066043e-07, "loss": 0.2737, "step": 3500 }, { "epoch": 0.8775, "grad_norm": 4.21875, "learning_rate": 2.260024092214419e-07, "loss": 0.2788, "step": 3510 }, { "epoch": 0.88, "grad_norm": 4.4375, "learning_rate": 2.170245871129012e-07, "loss": 0.2848, "step": 3520 }, { "epoch": 0.8825, "grad_norm": 4.375, "learning_rate": 2.0822062242095015e-07, "loss": 0.2863, "step": 3530 }, { "epoch": 0.885, "grad_norm": 4.4375, "learning_rate": 1.995911856015867e-07, "loss": 0.2741, "step": 3540 }, { "epoch": 0.8875, "grad_norm": 5.3125, "learning_rate": 1.9113693381983405e-07, "loss": 0.2753, "step": 3550 }, { "epoch": 0.89, "grad_norm": 4.4375, "learning_rate": 1.8285851089969802e-07, "loss": 0.2934, "step": 3560 }, { "epoch": 0.8925, "grad_norm": 4.21875, "learning_rate": 1.7475654727513502e-07, "loss": 0.2773, "step": 3570 }, { "epoch": 0.895, "grad_norm": 4.71875, "learning_rate": 1.668316599420433e-07, "loss": 0.2919, "step": 3580 }, { "epoch": 0.8975, "grad_norm": 4.28125, "learning_rate": 1.5908445241127528e-07, "loss": 0.2774, "step": 3590 }, { "epoch": 0.9, "grad_norm": 4.46875, "learning_rate": 1.5151551466267956e-07, "loss": 0.274, "step": 3600 }, { "epoch": 0.9025, "grad_norm": 4.53125, "learning_rate": 1.441254231001696e-07, "loss": 0.2831, "step": 3610 }, { "epoch": 0.905, "grad_norm": 4.46875, "learning_rate": 1.3691474050782972e-07, "loss": 0.2728, "step": 3620 }, { "epoch": 0.9075, "grad_norm": 4.25, "learning_rate": 1.2988401600705635e-07, "loss": 0.2923, "step": 3630 }, { "epoch": 0.91, "grad_norm": 4.59375, "learning_rate": 1.2303378501474174e-07, "loss": 0.2838, "step": 3640 }, { "epoch": 0.9125, "grad_norm": 4.40625, "learning_rate": 1.16364569202497e-07, "loss": 0.2697, "step": 3650 }, { "epoch": 0.915, "grad_norm": 4.53125, "learning_rate": 1.0987687645692746e-07, "loss": 0.2811, "step": 3660 }, { "epoch": 0.9175, "grad_norm": 4.3125, "learning_rate": 1.035712008409534e-07, "loss": 0.2924, "step": 3670 }, { "epoch": 0.92, "grad_norm": 4.40625, "learning_rate": 9.744802255618662e-08, "loss": 0.2797, "step": 3680 }, { "epoch": 0.9225, "grad_norm": 4.28125, "learning_rate": 9.150780790636054e-08, "loss": 0.2854, "step": 3690 }, { "epoch": 0.925, "grad_norm": 4.15625, "learning_rate": 8.575100926181884e-08, "loss": 0.2755, "step": 3700 }, { "epoch": 0.9275, "grad_norm": 4.5625, "learning_rate": 8.017806502506692e-08, "loss": 0.2926, "step": 3710 }, { "epoch": 0.93, "grad_norm": 4.3125, "learning_rate": 7.478939959738502e-08, "loss": 0.2817, "step": 3720 }, { "epoch": 0.9325, "grad_norm": 4.4375, "learning_rate": 6.958542334650847e-08, "loss": 0.273, "step": 3730 }, { "epoch": 0.935, "grad_norm": 4.3125, "learning_rate": 6.456653257537665e-08, "loss": 0.3051, "step": 3740 }, { "epoch": 0.9375, "grad_norm": 4.125, "learning_rate": 5.973310949195343e-08, "loss": 0.2812, "step": 3750 }, { "epoch": 0.94, "grad_norm": 4.5, "learning_rate": 5.50855221801197e-08, "loss": 0.2746, "step": 3760 }, { "epoch": 0.9425, "grad_norm": 4.375, "learning_rate": 5.062412457164323e-08, "loss": 0.2776, "step": 3770 }, { "epoch": 0.945, "grad_norm": 4.28125, "learning_rate": 4.634925641922472e-08, "loss": 0.2827, "step": 3780 }, { "epoch": 0.9475, "grad_norm": 4.09375, "learning_rate": 4.226124327062514e-08, "loss": 0.2875, "step": 3790 }, { "epoch": 0.95, "grad_norm": 4.375, "learning_rate": 3.836039644387307e-08, "loss": 0.2829, "step": 3800 }, { "epoch": 0.9525, "grad_norm": 4.1875, "learning_rate": 3.4647013003556996e-08, "loss": 0.2669, "step": 3810 }, { "epoch": 0.955, "grad_norm": 4.6875, "learning_rate": 3.112137573820284e-08, "loss": 0.2908, "step": 3820 }, { "epoch": 0.9575, "grad_norm": 4.625, "learning_rate": 2.7783753138738713e-08, "loss": 0.2735, "step": 3830 }, { "epoch": 0.96, "grad_norm": 4.46875, "learning_rate": 2.463439937804707e-08, "loss": 0.2855, "step": 3840 }, { "epoch": 0.9625, "grad_norm": 4.375, "learning_rate": 2.1673554291610775e-08, "loss": 0.2746, "step": 3850 }, { "epoch": 0.965, "grad_norm": 4.6875, "learning_rate": 1.8901443359245765e-08, "loss": 0.2762, "step": 3860 }, { "epoch": 0.9675, "grad_norm": 4.6875, "learning_rate": 1.6318277687932816e-08, "loss": 0.2747, "step": 3870 }, { "epoch": 0.97, "grad_norm": 4.59375, "learning_rate": 1.3924253995738769e-08, "loss": 0.2746, "step": 3880 }, { "epoch": 0.9725, "grad_norm": 4.53125, "learning_rate": 1.1719554596836546e-08, "loss": 0.2838, "step": 3890 }, { "epoch": 0.975, "grad_norm": 4.5, "learning_rate": 9.704347387620994e-09, "loss": 0.2898, "step": 3900 }, { "epoch": 0.9775, "grad_norm": 4.5625, "learning_rate": 7.878785833923819e-09, "loss": 0.2694, "step": 3910 }, { "epoch": 0.98, "grad_norm": 4.40625, "learning_rate": 6.243008959324892e-09, "loss": 0.2751, "step": 3920 }, { "epoch": 0.9825, "grad_norm": 4.3125, "learning_rate": 4.797141334566268e-09, "loss": 0.2928, "step": 3930 }, { "epoch": 0.985, "grad_norm": 4.5, "learning_rate": 3.5412930680658876e-09, "loss": 0.2745, "step": 3940 }, { "epoch": 0.9875, "grad_norm": 4.71875, "learning_rate": 2.475559797531224e-09, "loss": 0.2865, "step": 3950 }, { "epoch": 0.99, "grad_norm": 4.34375, "learning_rate": 1.6000226826770604e-09, "loss": 0.2758, "step": 3960 }, { "epoch": 0.9925, "grad_norm": 4.21875, "learning_rate": 9.147483990443184e-10, "loss": 0.2887, "step": 3970 }, { "epoch": 0.995, "grad_norm": 4.65625, "learning_rate": 4.197891329230097e-10, "loss": 0.2796, "step": 3980 }, { "epoch": 0.9975, "grad_norm": 4.46875, "learning_rate": 1.1518257737763716e-10, "loss": 0.2779, "step": 3990 }, { "epoch": 1.0, "grad_norm": 4.28125, "learning_rate": 9.51929376435956e-13, "loss": 0.2984, "step": 4000 } ], "logging_steps": 10, "max_steps": 4000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.0307891462144e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }