{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025, "grad_norm": 19.25, "learning_rate": 1.1250000000000001e-07, "loss": 1.8141, "step": 10 }, { "epoch": 0.005, "grad_norm": 15.25, "learning_rate": 2.3750000000000003e-07, "loss": 1.7844, "step": 20 }, { "epoch": 0.0075, "grad_norm": 18.25, "learning_rate": 3.625e-07, "loss": 1.7555, "step": 30 }, { "epoch": 0.01, "grad_norm": 14.0625, "learning_rate": 4.875000000000001e-07, "loss": 1.7414, "step": 40 }, { "epoch": 0.0125, "grad_norm": 17.75, "learning_rate": 6.125000000000001e-07, "loss": 1.7531, "step": 50 }, { "epoch": 0.015, "grad_norm": 15.5, "learning_rate": 7.375e-07, "loss": 1.7852, "step": 60 }, { "epoch": 0.0175, "grad_norm": 17.875, "learning_rate": 8.625e-07, "loss": 1.7844, "step": 70 }, { "epoch": 0.02, "grad_norm": 14.9375, "learning_rate": 9.875e-07, "loss": 1.7883, "step": 80 }, { "epoch": 0.0225, "grad_norm": 11.0, "learning_rate": 1.1125000000000001e-06, "loss": 1.7312, "step": 90 }, { "epoch": 0.025, "grad_norm": 12.9375, "learning_rate": 1.2375e-06, "loss": 1.75, "step": 100 }, { "epoch": 0.0275, "grad_norm": 12.0625, "learning_rate": 1.3625000000000003e-06, "loss": 1.7477, "step": 110 }, { "epoch": 0.03, "grad_norm": 14.3125, "learning_rate": 1.4875000000000002e-06, "loss": 1.7312, "step": 120 }, { "epoch": 0.0325, "grad_norm": 12.375, "learning_rate": 1.6125000000000002e-06, "loss": 1.7414, "step": 130 }, { "epoch": 0.035, "grad_norm": 12.3125, "learning_rate": 1.7375e-06, "loss": 1.7367, "step": 140 }, { "epoch": 0.0375, "grad_norm": 15.1875, "learning_rate": 1.8625000000000002e-06, "loss": 1.7656, "step": 150 }, { "epoch": 0.04, "grad_norm": 13.25, "learning_rate": 1.9875000000000005e-06, "loss": 1.7523, "step": 160 }, { "epoch": 0.0425, "grad_norm": 10.125, "learning_rate": 2.1125e-06, "loss": 1.6953, "step": 170 }, { "epoch": 0.045, "grad_norm": 13.375, "learning_rate": 2.2375e-06, "loss": 1.7109, "step": 180 }, { "epoch": 0.0475, "grad_norm": 9.5625, "learning_rate": 2.3625000000000003e-06, "loss": 1.7047, "step": 190 }, { "epoch": 0.05, "grad_norm": 11.875, "learning_rate": 2.4875000000000003e-06, "loss": 1.675, "step": 200 }, { "epoch": 0.0525, "grad_norm": 14.5, "learning_rate": 2.6125e-06, "loss": 1.6836, "step": 210 }, { "epoch": 0.055, "grad_norm": 12.875, "learning_rate": 2.7375e-06, "loss": 1.7133, "step": 220 }, { "epoch": 0.0575, "grad_norm": 11.0625, "learning_rate": 2.8625e-06, "loss": 1.6461, "step": 230 }, { "epoch": 0.06, "grad_norm": 13.625, "learning_rate": 2.9875e-06, "loss": 1.6242, "step": 240 }, { "epoch": 0.0625, "grad_norm": 15.375, "learning_rate": 3.1125000000000007e-06, "loss": 1.6523, "step": 250 }, { "epoch": 0.065, "grad_norm": 10.5, "learning_rate": 3.2375e-06, "loss": 1.6141, "step": 260 }, { "epoch": 0.0675, "grad_norm": 13.0625, "learning_rate": 3.3625000000000004e-06, "loss": 1.5766, "step": 270 }, { "epoch": 0.07, "grad_norm": 13.6875, "learning_rate": 3.4875000000000005e-06, "loss": 1.5625, "step": 280 }, { "epoch": 0.0725, "grad_norm": 19.0, "learning_rate": 3.6125000000000006e-06, "loss": 1.5367, "step": 290 }, { "epoch": 0.075, "grad_norm": 18.75, "learning_rate": 3.7375000000000006e-06, "loss": 1.4852, "step": 300 }, { "epoch": 0.0775, "grad_norm": 11.3125, "learning_rate": 3.8625e-06, "loss": 1.4773, "step": 310 }, { "epoch": 0.08, "grad_norm": 14.9375, "learning_rate": 3.9875e-06, "loss": 1.4672, "step": 320 }, { "epoch": 0.0825, "grad_norm": 12.3125, "learning_rate": 4.1125e-06, "loss": 1.4109, "step": 330 }, { "epoch": 0.085, "grad_norm": 12.625, "learning_rate": 4.2375000000000005e-06, "loss": 1.375, "step": 340 }, { "epoch": 0.0875, "grad_norm": 19.0, "learning_rate": 4.362500000000001e-06, "loss": 1.3438, "step": 350 }, { "epoch": 0.09, "grad_norm": 13.0625, "learning_rate": 4.4875e-06, "loss": 1.2742, "step": 360 }, { "epoch": 0.0925, "grad_norm": 12.625, "learning_rate": 4.6125e-06, "loss": 1.2508, "step": 370 }, { "epoch": 0.095, "grad_norm": 10.0625, "learning_rate": 4.737500000000001e-06, "loss": 1.268, "step": 380 }, { "epoch": 0.0975, "grad_norm": 10.125, "learning_rate": 4.8625000000000005e-06, "loss": 1.2055, "step": 390 }, { "epoch": 0.1, "grad_norm": 12.25, "learning_rate": 4.987500000000001e-06, "loss": 1.1742, "step": 400 }, { "epoch": 0.1025, "grad_norm": 12.4375, "learning_rate": 4.999922894111975e-06, "loss": 1.1199, "step": 410 }, { "epoch": 0.105, "grad_norm": 9.8125, "learning_rate": 4.999656361346094e-06, "loss": 1.1195, "step": 420 }, { "epoch": 0.1075, "grad_norm": 11.1875, "learning_rate": 4.999199470070484e-06, "loss": 1.0727, "step": 430 }, { "epoch": 0.11, "grad_norm": 11.625, "learning_rate": 4.998552255079182e-06, "loss": 1.0711, "step": 440 }, { "epoch": 0.1125, "grad_norm": 9.3125, "learning_rate": 4.9977147656601196e-06, "loss": 1.0414, "step": 450 }, { "epoch": 0.115, "grad_norm": 10.3125, "learning_rate": 4.996687065591355e-06, "loss": 1.0367, "step": 460 }, { "epoch": 0.1175, "grad_norm": 10.5, "learning_rate": 4.9954692331362295e-06, "loss": 1.0125, "step": 470 }, { "epoch": 0.12, "grad_norm": 10.625, "learning_rate": 4.9940613610373974e-06, "loss": 0.9828, "step": 480 }, { "epoch": 0.1225, "grad_norm": 10.125, "learning_rate": 4.992463556509772e-06, "loss": 0.9695, "step": 490 }, { "epoch": 0.125, "grad_norm": 10.75, "learning_rate": 4.990675941232353e-06, "loss": 0.9395, "step": 500 }, { "epoch": 0.1275, "grad_norm": 9.0, "learning_rate": 4.988698651338965e-06, "loss": 0.9137, "step": 510 }, { "epoch": 0.13, "grad_norm": 11.375, "learning_rate": 4.986531837407891e-06, "loss": 0.9082, "step": 520 }, { "epoch": 0.1325, "grad_norm": 12.5, "learning_rate": 4.9841756644503965e-06, "loss": 0.8809, "step": 530 }, { "epoch": 0.135, "grad_norm": 18.875, "learning_rate": 4.981630311898178e-06, "loss": 0.8648, "step": 540 }, { "epoch": 0.1375, "grad_norm": 9.8125, "learning_rate": 4.978895973589686e-06, "loss": 0.8633, "step": 550 }, { "epoch": 0.14, "grad_norm": 9.3125, "learning_rate": 4.975972857755369e-06, "loss": 0.8293, "step": 560 }, { "epoch": 0.1425, "grad_norm": 9.9375, "learning_rate": 4.972861187001815e-06, "loss": 0.8242, "step": 570 }, { "epoch": 0.145, "grad_norm": 9.25, "learning_rate": 4.9695611982947995e-06, "loss": 0.8031, "step": 580 }, { "epoch": 0.1475, "grad_norm": 9.0625, "learning_rate": 4.966073142941239e-06, "loss": 0.775, "step": 590 }, { "epoch": 0.15, "grad_norm": 9.4375, "learning_rate": 4.962397286570053e-06, "loss": 0.7996, "step": 600 }, { "epoch": 0.1525, "grad_norm": 10.5, "learning_rate": 4.958533909111936e-06, "loss": 0.7492, "step": 610 }, { "epoch": 0.155, "grad_norm": 11.1875, "learning_rate": 4.95448330477804e-06, "loss": 0.7184, "step": 620 }, { "epoch": 0.1575, "grad_norm": 9.6875, "learning_rate": 4.950245782037566e-06, "loss": 0.7023, "step": 630 }, { "epoch": 0.16, "grad_norm": 10.0625, "learning_rate": 4.945821663594277e-06, "loss": 0.6949, "step": 640 }, { "epoch": 0.1625, "grad_norm": 9.125, "learning_rate": 4.941211286361922e-06, "loss": 0.7043, "step": 650 }, { "epoch": 0.165, "grad_norm": 9.5625, "learning_rate": 4.936415001438577e-06, "loss": 0.6633, "step": 660 }, { "epoch": 0.1675, "grad_norm": 9.25, "learning_rate": 4.9314331740799084e-06, "loss": 0.652, "step": 670 }, { "epoch": 0.17, "grad_norm": 9.0625, "learning_rate": 4.926266183671356e-06, "loss": 0.6637, "step": 680 }, { "epoch": 0.1725, "grad_norm": 9.75, "learning_rate": 4.920914423699247e-06, "loss": 0.6262, "step": 690 }, { "epoch": 0.175, "grad_norm": 8.9375, "learning_rate": 4.915378301720822e-06, "loss": 0.6398, "step": 700 }, { "epoch": 0.1775, "grad_norm": 9.25, "learning_rate": 4.909658239333203e-06, "loss": 0.6195, "step": 710 }, { "epoch": 0.18, "grad_norm": 9.4375, "learning_rate": 4.903754672141288e-06, "loss": 0.5977, "step": 720 }, { "epoch": 0.1825, "grad_norm": 8.5, "learning_rate": 4.897668049724574e-06, "loss": 0.5766, "step": 730 }, { "epoch": 0.185, "grad_norm": 9.1875, "learning_rate": 4.891398835602925e-06, "loss": 0.5656, "step": 740 }, { "epoch": 0.1875, "grad_norm": 8.4375, "learning_rate": 4.884947507201268e-06, "loss": 0.5781, "step": 750 }, { "epoch": 0.19, "grad_norm": 9.0, "learning_rate": 4.878314555813237e-06, "loss": 0.559, "step": 760 }, { "epoch": 0.1925, "grad_norm": 8.5625, "learning_rate": 4.8715004865637616e-06, "loss": 0.5496, "step": 770 }, { "epoch": 0.195, "grad_norm": 8.5, "learning_rate": 4.8645058183705976e-06, "loss": 0.5264, "step": 780 }, { "epoch": 0.1975, "grad_norm": 9.0625, "learning_rate": 4.8573310839048085e-06, "loss": 0.5404, "step": 790 }, { "epoch": 0.2, "grad_norm": 8.625, "learning_rate": 4.8499768295502e-06, "loss": 0.5117, "step": 800 }, { "epoch": 0.2025, "grad_norm": 8.4375, "learning_rate": 4.842443615361718e-06, "loss": 0.5195, "step": 810 }, { "epoch": 0.205, "grad_norm": 8.4375, "learning_rate": 4.834732015022786e-06, "loss": 0.4818, "step": 820 }, { "epoch": 0.2075, "grad_norm": 8.25, "learning_rate": 4.826842615801628e-06, "loss": 0.4992, "step": 830 }, { "epoch": 0.21, "grad_norm": 8.375, "learning_rate": 4.818776018506538e-06, "loss": 0.4975, "step": 840 }, { "epoch": 0.2125, "grad_norm": 8.3125, "learning_rate": 4.810532837440134e-06, "loss": 0.4752, "step": 850 }, { "epoch": 0.215, "grad_norm": 8.0, "learning_rate": 4.802113700352567e-06, "loss": 0.4924, "step": 860 }, { "epoch": 0.2175, "grad_norm": 7.6875, "learning_rate": 4.793519248393721e-06, "loss": 0.4564, "step": 870 }, { "epoch": 0.22, "grad_norm": 8.125, "learning_rate": 4.78475013606439e-06, "loss": 0.443, "step": 880 }, { "epoch": 0.2225, "grad_norm": 7.65625, "learning_rate": 4.775807031166428e-06, "loss": 0.4428, "step": 890 }, { "epoch": 0.225, "grad_norm": 7.3125, "learning_rate": 4.766690614751897e-06, "loss": 0.4443, "step": 900 }, { "epoch": 0.2275, "grad_norm": 7.625, "learning_rate": 4.757401581071203e-06, "loss": 0.4482, "step": 910 }, { "epoch": 0.23, "grad_norm": 7.09375, "learning_rate": 4.747940637520226e-06, "loss": 0.4332, "step": 920 }, { "epoch": 0.2325, "grad_norm": 7.65625, "learning_rate": 4.738308504586445e-06, "loss": 0.4215, "step": 930 }, { "epoch": 0.235, "grad_norm": 6.53125, "learning_rate": 4.7285059157940765e-06, "loss": 0.4332, "step": 940 }, { "epoch": 0.2375, "grad_norm": 7.03125, "learning_rate": 4.718533617648209e-06, "loss": 0.4092, "step": 950 }, { "epoch": 0.24, "grad_norm": 6.96875, "learning_rate": 4.7083923695779546e-06, "loss": 0.4297, "step": 960 }, { "epoch": 0.2425, "grad_norm": 6.78125, "learning_rate": 4.6980829438786176e-06, "loss": 0.3949, "step": 970 }, { "epoch": 0.245, "grad_norm": 7.46875, "learning_rate": 4.687606125652882e-06, "loss": 0.4023, "step": 980 }, { "epoch": 0.2475, "grad_norm": 6.75, "learning_rate": 4.676962712751015e-06, "loss": 0.3988, "step": 990 }, { "epoch": 0.25, "grad_norm": 6.5, "learning_rate": 4.666153515710118e-06, "loss": 0.3975, "step": 1000 }, { "epoch": 0.2525, "grad_norm": 6.84375, "learning_rate": 4.655179357692396e-06, "loss": 0.4049, "step": 1010 }, { "epoch": 0.255, "grad_norm": 6.375, "learning_rate": 4.644041074422469e-06, "loss": 0.4037, "step": 1020 }, { "epoch": 0.2575, "grad_norm": 7.03125, "learning_rate": 4.632739514123733e-06, "loss": 0.3846, "step": 1030 }, { "epoch": 0.26, "grad_norm": 6.09375, "learning_rate": 4.6212755374537596e-06, "loss": 0.376, "step": 1040 }, { "epoch": 0.2625, "grad_norm": 6.59375, "learning_rate": 4.609650017438757e-06, "loss": 0.4123, "step": 1050 }, { "epoch": 0.265, "grad_norm": 6.84375, "learning_rate": 4.5978638394070835e-06, "loss": 0.3816, "step": 1060 }, { "epoch": 0.2675, "grad_norm": 6.1875, "learning_rate": 4.58591790092183e-06, "loss": 0.3854, "step": 1070 }, { "epoch": 0.27, "grad_norm": 6.75, "learning_rate": 4.5738131117124605e-06, "loss": 0.3783, "step": 1080 }, { "epoch": 0.2725, "grad_norm": 5.84375, "learning_rate": 4.561550393605541e-06, "loss": 0.384, "step": 1090 }, { "epoch": 0.275, "grad_norm": 7.625, "learning_rate": 4.549130680454532e-06, "loss": 0.41, "step": 1100 }, { "epoch": 0.2775, "grad_norm": 5.9375, "learning_rate": 4.536554918068673e-06, "loss": 0.3664, "step": 1110 }, { "epoch": 0.28, "grad_norm": 5.03125, "learning_rate": 4.523824064140961e-06, "loss": 0.3727, "step": 1120 }, { "epoch": 0.2825, "grad_norm": 6.09375, "learning_rate": 4.510939088175211e-06, "loss": 0.3764, "step": 1130 }, { "epoch": 0.285, "grad_norm": 5.875, "learning_rate": 4.49790097141223e-06, "loss": 0.358, "step": 1140 }, { "epoch": 0.2875, "grad_norm": 5.875, "learning_rate": 4.484710706755087e-06, "loss": 0.3549, "step": 1150 }, { "epoch": 0.29, "grad_norm": 5.84375, "learning_rate": 4.471369298693505e-06, "loss": 0.3553, "step": 1160 }, { "epoch": 0.2925, "grad_norm": 5.75, "learning_rate": 4.457877763227361e-06, "loss": 0.3623, "step": 1170 }, { "epoch": 0.295, "grad_norm": 5.5625, "learning_rate": 4.444237127789315e-06, "loss": 0.3629, "step": 1180 }, { "epoch": 0.2975, "grad_norm": 6.96875, "learning_rate": 4.430448431166567e-06, "loss": 0.3434, "step": 1190 }, { "epoch": 0.3, "grad_norm": 5.9375, "learning_rate": 4.416512723421752e-06, "loss": 0.3549, "step": 1200 }, { "epoch": 0.3025, "grad_norm": 5.53125, "learning_rate": 4.402431065812968e-06, "loss": 0.3461, "step": 1210 }, { "epoch": 0.305, "grad_norm": 5.375, "learning_rate": 4.388204530712959e-06, "loss": 0.3547, "step": 1220 }, { "epoch": 0.3075, "grad_norm": 5.90625, "learning_rate": 4.373834201527457e-06, "loss": 0.3383, "step": 1230 }, { "epoch": 0.31, "grad_norm": 5.6875, "learning_rate": 4.359321172612664e-06, "loss": 0.3414, "step": 1240 }, { "epoch": 0.3125, "grad_norm": 5.15625, "learning_rate": 4.344666549191921e-06, "loss": 0.3285, "step": 1250 }, { "epoch": 0.315, "grad_norm": 5.4375, "learning_rate": 4.329871447271541e-06, "loss": 0.3352, "step": 1260 }, { "epoch": 0.3175, "grad_norm": 5.0625, "learning_rate": 4.314936993555816e-06, "loss": 0.3441, "step": 1270 }, { "epoch": 0.32, "grad_norm": 5.09375, "learning_rate": 4.299864325361217e-06, "loss": 0.3523, "step": 1280 }, { "epoch": 0.3225, "grad_norm": 5.71875, "learning_rate": 4.284654590529784e-06, "loss": 0.3348, "step": 1290 }, { "epoch": 0.325, "grad_norm": 5.28125, "learning_rate": 4.269308947341711e-06, "loss": 0.3252, "step": 1300 }, { "epoch": 0.3275, "grad_norm": 4.8125, "learning_rate": 4.25382856442714e-06, "loss": 0.3439, "step": 1310 }, { "epoch": 0.33, "grad_norm": 5.4375, "learning_rate": 4.238214620677164e-06, "loss": 0.3326, "step": 1320 }, { "epoch": 0.3325, "grad_norm": 4.75, "learning_rate": 4.222468305154052e-06, "loss": 0.3332, "step": 1330 }, { "epoch": 0.335, "grad_norm": 5.25, "learning_rate": 4.206590817000695e-06, "loss": 0.3277, "step": 1340 }, { "epoch": 0.3375, "grad_norm": 5.03125, "learning_rate": 4.190583365349289e-06, "loss": 0.3363, "step": 1350 }, { "epoch": 0.34, "grad_norm": 5.0, "learning_rate": 4.174447169229252e-06, "loss": 0.3412, "step": 1360 }, { "epoch": 0.3425, "grad_norm": 5.28125, "learning_rate": 4.158183457474392e-06, "loss": 0.326, "step": 1370 }, { "epoch": 0.345, "grad_norm": 5.3125, "learning_rate": 4.141793468629327e-06, "loss": 0.334, "step": 1380 }, { "epoch": 0.3475, "grad_norm": 5.0, "learning_rate": 4.125278450855165e-06, "loss": 0.3367, "step": 1390 }, { "epoch": 0.35, "grad_norm": 6.28125, "learning_rate": 4.1086396618344474e-06, "loss": 0.3168, "step": 1400 }, { "epoch": 0.3525, "grad_norm": 5.15625, "learning_rate": 4.09187836867538e-06, "loss": 0.3193, "step": 1410 }, { "epoch": 0.355, "grad_norm": 5.15625, "learning_rate": 4.074995847815331e-06, "loss": 0.3225, "step": 1420 }, { "epoch": 0.3575, "grad_norm": 4.96875, "learning_rate": 4.057993384923626e-06, "loss": 0.332, "step": 1430 }, { "epoch": 0.36, "grad_norm": 4.9375, "learning_rate": 4.0408722748036426e-06, "loss": 0.3221, "step": 1440 }, { "epoch": 0.3625, "grad_norm": 4.8125, "learning_rate": 4.023633821294203e-06, "loss": 0.3211, "step": 1450 }, { "epoch": 0.365, "grad_norm": 5.125, "learning_rate": 4.006279337170283e-06, "loss": 0.3195, "step": 1460 }, { "epoch": 0.3675, "grad_norm": 4.96875, "learning_rate": 3.988810144043041e-06, "loss": 0.3225, "step": 1470 }, { "epoch": 0.37, "grad_norm": 4.6875, "learning_rate": 3.971227572259167e-06, "loss": 0.3242, "step": 1480 }, { "epoch": 0.3725, "grad_norm": 5.0625, "learning_rate": 3.953532960799577e-06, "loss": 0.3205, "step": 1490 }, { "epoch": 0.375, "grad_norm": 4.71875, "learning_rate": 3.935727657177439e-06, "loss": 0.3152, "step": 1500 }, { "epoch": 0.3775, "grad_norm": 4.90625, "learning_rate": 3.917813017335562e-06, "loss": 0.2998, "step": 1510 }, { "epoch": 0.38, "grad_norm": 5.15625, "learning_rate": 3.899790405543129e-06, "loss": 0.3229, "step": 1520 }, { "epoch": 0.3825, "grad_norm": 4.46875, "learning_rate": 3.881661194291805e-06, "loss": 0.3088, "step": 1530 }, { "epoch": 0.385, "grad_norm": 4.9375, "learning_rate": 3.863426764191216e-06, "loss": 0.3205, "step": 1540 }, { "epoch": 0.3875, "grad_norm": 5.15625, "learning_rate": 3.845088503863813e-06, "loss": 0.3137, "step": 1550 }, { "epoch": 0.39, "grad_norm": 4.5, "learning_rate": 3.826647809839119e-06, "loss": 0.3055, "step": 1560 }, { "epoch": 0.3925, "grad_norm": 5.15625, "learning_rate": 3.8081060864473794e-06, "loss": 0.316, "step": 1570 }, { "epoch": 0.395, "grad_norm": 5.0625, "learning_rate": 3.7894647457126188e-06, "loss": 0.3215, "step": 1580 }, { "epoch": 0.3975, "grad_norm": 5.03125, "learning_rate": 3.770725207245106e-06, "loss": 0.3125, "step": 1590 }, { "epoch": 0.4, "grad_norm": 4.84375, "learning_rate": 3.751888898133249e-06, "loss": 0.3168, "step": 1600 }, { "epoch": 0.4025, "grad_norm": 5.03125, "learning_rate": 3.7329572528349145e-06, "loss": 0.3074, "step": 1610 }, { "epoch": 0.405, "grad_norm": 4.90625, "learning_rate": 3.7139317130681886e-06, "loss": 0.3248, "step": 1620 }, { "epoch": 0.4075, "grad_norm": 4.59375, "learning_rate": 3.694813727701584e-06, "loss": 0.3041, "step": 1630 }, { "epoch": 0.41, "grad_norm": 5.09375, "learning_rate": 3.675604752643706e-06, "loss": 0.3049, "step": 1640 }, { "epoch": 0.4125, "grad_norm": 4.625, "learning_rate": 3.6563062507323752e-06, "loss": 0.3285, "step": 1650 }, { "epoch": 0.415, "grad_norm": 4.75, "learning_rate": 3.6369196916232297e-06, "loss": 0.2977, "step": 1660 }, { "epoch": 0.4175, "grad_norm": 4.71875, "learning_rate": 3.6174465516778032e-06, "loss": 0.3074, "step": 1670 }, { "epoch": 0.42, "grad_norm": 4.625, "learning_rate": 3.5978883138510963e-06, "loss": 0.3055, "step": 1680 }, { "epoch": 0.4225, "grad_norm": 4.53125, "learning_rate": 3.578246467578642e-06, "loss": 0.3096, "step": 1690 }, { "epoch": 0.425, "grad_norm": 4.84375, "learning_rate": 3.558522508663081e-06, "loss": 0.3102, "step": 1700 }, { "epoch": 0.4275, "grad_norm": 4.6875, "learning_rate": 3.538717939160249e-06, "loss": 0.2945, "step": 1710 }, { "epoch": 0.43, "grad_norm": 4.625, "learning_rate": 3.5188342672647897e-06, "loss": 0.2992, "step": 1720 }, { "epoch": 0.4325, "grad_norm": 4.5, "learning_rate": 3.4988730071953005e-06, "loss": 0.3107, "step": 1730 }, { "epoch": 0.435, "grad_norm": 4.375, "learning_rate": 3.478835679079019e-06, "loss": 0.3031, "step": 1740 }, { "epoch": 0.4375, "grad_norm": 4.8125, "learning_rate": 3.4587238088360605e-06, "loss": 0.317, "step": 1750 }, { "epoch": 0.44, "grad_norm": 4.5625, "learning_rate": 3.438538928063208e-06, "loss": 0.2965, "step": 1760 }, { "epoch": 0.4425, "grad_norm": 4.65625, "learning_rate": 3.4182825739172826e-06, "loss": 0.2899, "step": 1770 }, { "epoch": 0.445, "grad_norm": 4.375, "learning_rate": 3.3979562889980777e-06, "loss": 0.3107, "step": 1780 }, { "epoch": 0.4475, "grad_norm": 4.46875, "learning_rate": 3.377561621230887e-06, "loss": 0.2855, "step": 1790 }, { "epoch": 0.45, "grad_norm": 4.375, "learning_rate": 3.357100123748621e-06, "loss": 0.3061, "step": 1800 }, { "epoch": 0.4525, "grad_norm": 4.53125, "learning_rate": 3.3365733547735334e-06, "loss": 0.3068, "step": 1810 }, { "epoch": 0.455, "grad_norm": 4.5, "learning_rate": 3.315982877498555e-06, "loss": 0.2834, "step": 1820 }, { "epoch": 0.4575, "grad_norm": 4.34375, "learning_rate": 3.2953302599682487e-06, "loss": 0.3029, "step": 1830 }, { "epoch": 0.46, "grad_norm": 4.5625, "learning_rate": 3.2746170749593998e-06, "loss": 0.3121, "step": 1840 }, { "epoch": 0.4625, "grad_norm": 4.46875, "learning_rate": 3.2538448998612394e-06, "loss": 0.293, "step": 1850 }, { "epoch": 0.465, "grad_norm": 4.875, "learning_rate": 3.233015316555326e-06, "loss": 0.2988, "step": 1860 }, { "epoch": 0.4675, "grad_norm": 4.53125, "learning_rate": 3.212129911295074e-06, "loss": 0.3002, "step": 1870 }, { "epoch": 0.47, "grad_norm": 7.6875, "learning_rate": 3.1911902745849526e-06, "loss": 0.3146, "step": 1880 }, { "epoch": 0.4725, "grad_norm": 4.3125, "learning_rate": 3.17019800105937e-06, "loss": 0.2926, "step": 1890 }, { "epoch": 0.475, "grad_norm": 4.46875, "learning_rate": 3.1491546893612296e-06, "loss": 0.2998, "step": 1900 }, { "epoch": 0.4775, "grad_norm": 4.71875, "learning_rate": 3.128061942020189e-06, "loss": 0.3104, "step": 1910 }, { "epoch": 0.48, "grad_norm": 4.75, "learning_rate": 3.1069213653306242e-06, "loss": 0.2955, "step": 1920 }, { "epoch": 0.4825, "grad_norm": 4.4375, "learning_rate": 3.0857345692292968e-06, "loss": 0.2937, "step": 1930 }, { "epoch": 0.485, "grad_norm": 4.25, "learning_rate": 3.0645031671727598e-06, "loss": 0.2878, "step": 1940 }, { "epoch": 0.4875, "grad_norm": 4.53125, "learning_rate": 3.0432287760144797e-06, "loss": 0.2988, "step": 1950 }, { "epoch": 0.49, "grad_norm": 4.21875, "learning_rate": 3.0219130158817093e-06, "loss": 0.3043, "step": 1960 }, { "epoch": 0.4925, "grad_norm": 4.8125, "learning_rate": 3.0005575100521115e-06, "loss": 0.2842, "step": 1970 }, { "epoch": 0.495, "grad_norm": 4.5625, "learning_rate": 2.979163884830137e-06, "loss": 0.2922, "step": 1980 }, { "epoch": 0.4975, "grad_norm": 4.9375, "learning_rate": 2.957733769423174e-06, "loss": 0.2998, "step": 1990 }, { "epoch": 0.5, "grad_norm": 4.5625, "learning_rate": 2.93626879581748e-06, "loss": 0.2908, "step": 2000 } ], "logging_steps": 10, "max_steps": 4000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.5153945731072e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }