|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.5, |
|
"eval_steps": 500, |
|
"global_step": 2000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0025, |
|
"grad_norm": 19.25, |
|
"learning_rate": 1.1250000000000001e-07, |
|
"loss": 1.8141, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.005, |
|
"grad_norm": 15.25, |
|
"learning_rate": 2.3750000000000003e-07, |
|
"loss": 1.7844, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0075, |
|
"grad_norm": 18.25, |
|
"learning_rate": 3.625e-07, |
|
"loss": 1.7555, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 14.0625, |
|
"learning_rate": 4.875000000000001e-07, |
|
"loss": 1.7414, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0125, |
|
"grad_norm": 17.75, |
|
"learning_rate": 6.125000000000001e-07, |
|
"loss": 1.7531, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.015, |
|
"grad_norm": 15.5, |
|
"learning_rate": 7.375e-07, |
|
"loss": 1.7852, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0175, |
|
"grad_norm": 17.875, |
|
"learning_rate": 8.625e-07, |
|
"loss": 1.7844, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 14.9375, |
|
"learning_rate": 9.875e-07, |
|
"loss": 1.7883, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0225, |
|
"grad_norm": 11.0, |
|
"learning_rate": 1.1125000000000001e-06, |
|
"loss": 1.7312, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.025, |
|
"grad_norm": 12.9375, |
|
"learning_rate": 1.2375e-06, |
|
"loss": 1.75, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0275, |
|
"grad_norm": 12.0625, |
|
"learning_rate": 1.3625000000000003e-06, |
|
"loss": 1.7477, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 14.3125, |
|
"learning_rate": 1.4875000000000002e-06, |
|
"loss": 1.7312, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.0325, |
|
"grad_norm": 12.375, |
|
"learning_rate": 1.6125000000000002e-06, |
|
"loss": 1.7414, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.035, |
|
"grad_norm": 12.3125, |
|
"learning_rate": 1.7375e-06, |
|
"loss": 1.7367, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0375, |
|
"grad_norm": 15.1875, |
|
"learning_rate": 1.8625000000000002e-06, |
|
"loss": 1.7656, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 13.25, |
|
"learning_rate": 1.9875000000000005e-06, |
|
"loss": 1.7523, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.0425, |
|
"grad_norm": 10.125, |
|
"learning_rate": 2.1125e-06, |
|
"loss": 1.6953, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.045, |
|
"grad_norm": 13.375, |
|
"learning_rate": 2.2375e-06, |
|
"loss": 1.7109, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.0475, |
|
"grad_norm": 9.5625, |
|
"learning_rate": 2.3625000000000003e-06, |
|
"loss": 1.7047, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 11.875, |
|
"learning_rate": 2.4875000000000003e-06, |
|
"loss": 1.675, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0525, |
|
"grad_norm": 14.5, |
|
"learning_rate": 2.6125e-06, |
|
"loss": 1.6836, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.055, |
|
"grad_norm": 12.875, |
|
"learning_rate": 2.7375e-06, |
|
"loss": 1.7133, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.0575, |
|
"grad_norm": 11.0625, |
|
"learning_rate": 2.8625e-06, |
|
"loss": 1.6461, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 13.625, |
|
"learning_rate": 2.9875e-06, |
|
"loss": 1.6242, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 15.375, |
|
"learning_rate": 3.1125000000000007e-06, |
|
"loss": 1.6523, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.065, |
|
"grad_norm": 10.5, |
|
"learning_rate": 3.2375e-06, |
|
"loss": 1.6141, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.0675, |
|
"grad_norm": 13.0625, |
|
"learning_rate": 3.3625000000000004e-06, |
|
"loss": 1.5766, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 13.6875, |
|
"learning_rate": 3.4875000000000005e-06, |
|
"loss": 1.5625, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.0725, |
|
"grad_norm": 19.0, |
|
"learning_rate": 3.6125000000000006e-06, |
|
"loss": 1.5367, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.075, |
|
"grad_norm": 18.75, |
|
"learning_rate": 3.7375000000000006e-06, |
|
"loss": 1.4852, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0775, |
|
"grad_norm": 11.3125, |
|
"learning_rate": 3.8625e-06, |
|
"loss": 1.4773, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 14.9375, |
|
"learning_rate": 3.9875e-06, |
|
"loss": 1.4672, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.0825, |
|
"grad_norm": 12.3125, |
|
"learning_rate": 4.1125e-06, |
|
"loss": 1.4109, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.085, |
|
"grad_norm": 12.625, |
|
"learning_rate": 4.2375000000000005e-06, |
|
"loss": 1.375, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.0875, |
|
"grad_norm": 19.0, |
|
"learning_rate": 4.362500000000001e-06, |
|
"loss": 1.3438, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 13.0625, |
|
"learning_rate": 4.4875e-06, |
|
"loss": 1.2742, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.0925, |
|
"grad_norm": 12.625, |
|
"learning_rate": 4.6125e-06, |
|
"loss": 1.2508, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.095, |
|
"grad_norm": 10.0625, |
|
"learning_rate": 4.737500000000001e-06, |
|
"loss": 1.268, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.0975, |
|
"grad_norm": 10.125, |
|
"learning_rate": 4.8625000000000005e-06, |
|
"loss": 1.2055, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 12.25, |
|
"learning_rate": 4.987500000000001e-06, |
|
"loss": 1.1742, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.1025, |
|
"grad_norm": 12.4375, |
|
"learning_rate": 4.999922894111975e-06, |
|
"loss": 1.1199, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.105, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 4.999656361346094e-06, |
|
"loss": 1.1195, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.1075, |
|
"grad_norm": 11.1875, |
|
"learning_rate": 4.999199470070484e-06, |
|
"loss": 1.0727, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 11.625, |
|
"learning_rate": 4.998552255079182e-06, |
|
"loss": 1.0711, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.1125, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 4.9977147656601196e-06, |
|
"loss": 1.0414, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.115, |
|
"grad_norm": 10.3125, |
|
"learning_rate": 4.996687065591355e-06, |
|
"loss": 1.0367, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.1175, |
|
"grad_norm": 10.5, |
|
"learning_rate": 4.9954692331362295e-06, |
|
"loss": 1.0125, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 10.625, |
|
"learning_rate": 4.9940613610373974e-06, |
|
"loss": 0.9828, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.1225, |
|
"grad_norm": 10.125, |
|
"learning_rate": 4.992463556509772e-06, |
|
"loss": 0.9695, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 10.75, |
|
"learning_rate": 4.990675941232353e-06, |
|
"loss": 0.9395, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1275, |
|
"grad_norm": 9.0, |
|
"learning_rate": 4.988698651338965e-06, |
|
"loss": 0.9137, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 11.375, |
|
"learning_rate": 4.986531837407891e-06, |
|
"loss": 0.9082, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.1325, |
|
"grad_norm": 12.5, |
|
"learning_rate": 4.9841756644503965e-06, |
|
"loss": 0.8809, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.135, |
|
"grad_norm": 18.875, |
|
"learning_rate": 4.981630311898178e-06, |
|
"loss": 0.8648, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.1375, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 4.978895973589686e-06, |
|
"loss": 0.8633, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 4.975972857755369e-06, |
|
"loss": 0.8293, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.1425, |
|
"grad_norm": 9.9375, |
|
"learning_rate": 4.972861187001815e-06, |
|
"loss": 0.8242, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.145, |
|
"grad_norm": 9.25, |
|
"learning_rate": 4.9695611982947995e-06, |
|
"loss": 0.8031, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.1475, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 4.966073142941239e-06, |
|
"loss": 0.775, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 4.962397286570053e-06, |
|
"loss": 0.7996, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.1525, |
|
"grad_norm": 10.5, |
|
"learning_rate": 4.958533909111936e-06, |
|
"loss": 0.7492, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.155, |
|
"grad_norm": 11.1875, |
|
"learning_rate": 4.95448330477804e-06, |
|
"loss": 0.7184, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.1575, |
|
"grad_norm": 9.6875, |
|
"learning_rate": 4.950245782037566e-06, |
|
"loss": 0.7023, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 10.0625, |
|
"learning_rate": 4.945821663594277e-06, |
|
"loss": 0.6949, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.1625, |
|
"grad_norm": 9.125, |
|
"learning_rate": 4.941211286361922e-06, |
|
"loss": 0.7043, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.165, |
|
"grad_norm": 9.5625, |
|
"learning_rate": 4.936415001438577e-06, |
|
"loss": 0.6633, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.1675, |
|
"grad_norm": 9.25, |
|
"learning_rate": 4.9314331740799084e-06, |
|
"loss": 0.652, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 4.926266183671356e-06, |
|
"loss": 0.6637, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.1725, |
|
"grad_norm": 9.75, |
|
"learning_rate": 4.920914423699247e-06, |
|
"loss": 0.6262, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.175, |
|
"grad_norm": 8.9375, |
|
"learning_rate": 4.915378301720822e-06, |
|
"loss": 0.6398, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.1775, |
|
"grad_norm": 9.25, |
|
"learning_rate": 4.909658239333203e-06, |
|
"loss": 0.6195, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 4.903754672141288e-06, |
|
"loss": 0.5977, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.1825, |
|
"grad_norm": 8.5, |
|
"learning_rate": 4.897668049724574e-06, |
|
"loss": 0.5766, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.185, |
|
"grad_norm": 9.1875, |
|
"learning_rate": 4.891398835602925e-06, |
|
"loss": 0.5656, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 4.884947507201268e-06, |
|
"loss": 0.5781, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 9.0, |
|
"learning_rate": 4.878314555813237e-06, |
|
"loss": 0.559, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.1925, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 4.8715004865637616e-06, |
|
"loss": 0.5496, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.195, |
|
"grad_norm": 8.5, |
|
"learning_rate": 4.8645058183705976e-06, |
|
"loss": 0.5264, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.1975, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 4.8573310839048085e-06, |
|
"loss": 0.5404, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 8.625, |
|
"learning_rate": 4.8499768295502e-06, |
|
"loss": 0.5117, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.2025, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 4.842443615361718e-06, |
|
"loss": 0.5195, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.205, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 4.834732015022786e-06, |
|
"loss": 0.4818, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.2075, |
|
"grad_norm": 8.25, |
|
"learning_rate": 4.826842615801628e-06, |
|
"loss": 0.4992, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 8.375, |
|
"learning_rate": 4.818776018506538e-06, |
|
"loss": 0.4975, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.2125, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 4.810532837440134e-06, |
|
"loss": 0.4752, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.215, |
|
"grad_norm": 8.0, |
|
"learning_rate": 4.802113700352567e-06, |
|
"loss": 0.4924, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.2175, |
|
"grad_norm": 7.6875, |
|
"learning_rate": 4.793519248393721e-06, |
|
"loss": 0.4564, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 8.125, |
|
"learning_rate": 4.78475013606439e-06, |
|
"loss": 0.443, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.2225, |
|
"grad_norm": 7.65625, |
|
"learning_rate": 4.775807031166428e-06, |
|
"loss": 0.4428, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.225, |
|
"grad_norm": 7.3125, |
|
"learning_rate": 4.766690614751897e-06, |
|
"loss": 0.4443, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.2275, |
|
"grad_norm": 7.625, |
|
"learning_rate": 4.757401581071203e-06, |
|
"loss": 0.4482, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 7.09375, |
|
"learning_rate": 4.747940637520226e-06, |
|
"loss": 0.4332, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.2325, |
|
"grad_norm": 7.65625, |
|
"learning_rate": 4.738308504586445e-06, |
|
"loss": 0.4215, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.235, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 4.7285059157940765e-06, |
|
"loss": 0.4332, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.2375, |
|
"grad_norm": 7.03125, |
|
"learning_rate": 4.718533617648209e-06, |
|
"loss": 0.4092, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 6.96875, |
|
"learning_rate": 4.7083923695779546e-06, |
|
"loss": 0.4297, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.2425, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 4.6980829438786176e-06, |
|
"loss": 0.3949, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.245, |
|
"grad_norm": 7.46875, |
|
"learning_rate": 4.687606125652882e-06, |
|
"loss": 0.4023, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.2475, |
|
"grad_norm": 6.75, |
|
"learning_rate": 4.676962712751015e-06, |
|
"loss": 0.3988, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 6.5, |
|
"learning_rate": 4.666153515710118e-06, |
|
"loss": 0.3975, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2525, |
|
"grad_norm": 6.84375, |
|
"learning_rate": 4.655179357692396e-06, |
|
"loss": 0.4049, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.255, |
|
"grad_norm": 6.375, |
|
"learning_rate": 4.644041074422469e-06, |
|
"loss": 0.4037, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.2575, |
|
"grad_norm": 7.03125, |
|
"learning_rate": 4.632739514123733e-06, |
|
"loss": 0.3846, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 4.6212755374537596e-06, |
|
"loss": 0.376, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.2625, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 4.609650017438757e-06, |
|
"loss": 0.4123, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.265, |
|
"grad_norm": 6.84375, |
|
"learning_rate": 4.5978638394070835e-06, |
|
"loss": 0.3816, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.2675, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 4.58591790092183e-06, |
|
"loss": 0.3854, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 6.75, |
|
"learning_rate": 4.5738131117124605e-06, |
|
"loss": 0.3783, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.2725, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 4.561550393605541e-06, |
|
"loss": 0.384, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.275, |
|
"grad_norm": 7.625, |
|
"learning_rate": 4.549130680454532e-06, |
|
"loss": 0.41, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.2775, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 4.536554918068673e-06, |
|
"loss": 0.3664, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 4.523824064140961e-06, |
|
"loss": 0.3727, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.2825, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 4.510939088175211e-06, |
|
"loss": 0.3764, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.285, |
|
"grad_norm": 5.875, |
|
"learning_rate": 4.49790097141223e-06, |
|
"loss": 0.358, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.2875, |
|
"grad_norm": 5.875, |
|
"learning_rate": 4.484710706755087e-06, |
|
"loss": 0.3549, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 4.471369298693505e-06, |
|
"loss": 0.3553, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.2925, |
|
"grad_norm": 5.75, |
|
"learning_rate": 4.457877763227361e-06, |
|
"loss": 0.3623, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.295, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 4.444237127789315e-06, |
|
"loss": 0.3629, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.2975, |
|
"grad_norm": 6.96875, |
|
"learning_rate": 4.430448431166567e-06, |
|
"loss": 0.3434, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 4.416512723421752e-06, |
|
"loss": 0.3549, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.3025, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 4.402431065812968e-06, |
|
"loss": 0.3461, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.305, |
|
"grad_norm": 5.375, |
|
"learning_rate": 4.388204530712959e-06, |
|
"loss": 0.3547, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.3075, |
|
"grad_norm": 5.90625, |
|
"learning_rate": 4.373834201527457e-06, |
|
"loss": 0.3383, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 4.359321172612664e-06, |
|
"loss": 0.3414, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 4.344666549191921e-06, |
|
"loss": 0.3285, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.315, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 4.329871447271541e-06, |
|
"loss": 0.3352, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.3175, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 4.314936993555816e-06, |
|
"loss": 0.3441, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 4.299864325361217e-06, |
|
"loss": 0.3523, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.3225, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 4.284654590529784e-06, |
|
"loss": 0.3348, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.325, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 4.269308947341711e-06, |
|
"loss": 0.3252, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.3275, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 4.25382856442714e-06, |
|
"loss": 0.3439, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 4.238214620677164e-06, |
|
"loss": 0.3326, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.3325, |
|
"grad_norm": 4.75, |
|
"learning_rate": 4.222468305154052e-06, |
|
"loss": 0.3332, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.335, |
|
"grad_norm": 5.25, |
|
"learning_rate": 4.206590817000695e-06, |
|
"loss": 0.3277, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.3375, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 4.190583365349289e-06, |
|
"loss": 0.3363, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 5.0, |
|
"learning_rate": 4.174447169229252e-06, |
|
"loss": 0.3412, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.3425, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 4.158183457474392e-06, |
|
"loss": 0.326, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.345, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 4.141793468629327e-06, |
|
"loss": 0.334, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.3475, |
|
"grad_norm": 5.0, |
|
"learning_rate": 4.125278450855165e-06, |
|
"loss": 0.3367, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 6.28125, |
|
"learning_rate": 4.1086396618344474e-06, |
|
"loss": 0.3168, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.3525, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 4.09187836867538e-06, |
|
"loss": 0.3193, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.355, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 4.074995847815331e-06, |
|
"loss": 0.3225, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.3575, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 4.057993384923626e-06, |
|
"loss": 0.332, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 4.0408722748036426e-06, |
|
"loss": 0.3221, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.3625, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 4.023633821294203e-06, |
|
"loss": 0.3211, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.365, |
|
"grad_norm": 5.125, |
|
"learning_rate": 4.006279337170283e-06, |
|
"loss": 0.3195, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.3675, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 3.988810144043041e-06, |
|
"loss": 0.3225, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 3.971227572259167e-06, |
|
"loss": 0.3242, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.3725, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 3.953532960799577e-06, |
|
"loss": 0.3205, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 3.935727657177439e-06, |
|
"loss": 0.3152, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.3775, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 3.917813017335562e-06, |
|
"loss": 0.2998, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 3.899790405543129e-06, |
|
"loss": 0.3229, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.3825, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 3.881661194291805e-06, |
|
"loss": 0.3088, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.385, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 3.863426764191216e-06, |
|
"loss": 0.3205, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.3875, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 3.845088503863813e-06, |
|
"loss": 0.3137, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 4.5, |
|
"learning_rate": 3.826647809839119e-06, |
|
"loss": 0.3055, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.3925, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 3.8081060864473794e-06, |
|
"loss": 0.316, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.395, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 3.7894647457126188e-06, |
|
"loss": 0.3215, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.3975, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 3.770725207245106e-06, |
|
"loss": 0.3125, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 3.751888898133249e-06, |
|
"loss": 0.3168, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.4025, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 3.7329572528349145e-06, |
|
"loss": 0.3074, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.405, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 3.7139317130681886e-06, |
|
"loss": 0.3248, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.4075, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 3.694813727701584e-06, |
|
"loss": 0.3041, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 3.675604752643706e-06, |
|
"loss": 0.3049, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.4125, |
|
"grad_norm": 4.625, |
|
"learning_rate": 3.6563062507323752e-06, |
|
"loss": 0.3285, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.415, |
|
"grad_norm": 4.75, |
|
"learning_rate": 3.6369196916232297e-06, |
|
"loss": 0.2977, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.4175, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 3.6174465516778032e-06, |
|
"loss": 0.3074, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 4.625, |
|
"learning_rate": 3.5978883138510963e-06, |
|
"loss": 0.3055, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.4225, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 3.578246467578642e-06, |
|
"loss": 0.3096, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.425, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 3.558522508663081e-06, |
|
"loss": 0.3102, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.4275, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 3.538717939160249e-06, |
|
"loss": 0.2945, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 4.625, |
|
"learning_rate": 3.5188342672647897e-06, |
|
"loss": 0.2992, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.4325, |
|
"grad_norm": 4.5, |
|
"learning_rate": 3.4988730071953005e-06, |
|
"loss": 0.3107, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.435, |
|
"grad_norm": 4.375, |
|
"learning_rate": 3.478835679079019e-06, |
|
"loss": 0.3031, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 3.4587238088360605e-06, |
|
"loss": 0.317, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 3.438538928063208e-06, |
|
"loss": 0.2965, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.4425, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 3.4182825739172826e-06, |
|
"loss": 0.2899, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.445, |
|
"grad_norm": 4.375, |
|
"learning_rate": 3.3979562889980777e-06, |
|
"loss": 0.3107, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.4475, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 3.377561621230887e-06, |
|
"loss": 0.2855, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 4.375, |
|
"learning_rate": 3.357100123748621e-06, |
|
"loss": 0.3061, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.4525, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 3.3365733547735334e-06, |
|
"loss": 0.3068, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.455, |
|
"grad_norm": 4.5, |
|
"learning_rate": 3.315982877498555e-06, |
|
"loss": 0.2834, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.4575, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 3.2953302599682487e-06, |
|
"loss": 0.3029, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 3.2746170749593998e-06, |
|
"loss": 0.3121, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.4625, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 3.2538448998612394e-06, |
|
"loss": 0.293, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.465, |
|
"grad_norm": 4.875, |
|
"learning_rate": 3.233015316555326e-06, |
|
"loss": 0.2988, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.4675, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 3.212129911295074e-06, |
|
"loss": 0.3002, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 7.6875, |
|
"learning_rate": 3.1911902745849526e-06, |
|
"loss": 0.3146, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.4725, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 3.17019800105937e-06, |
|
"loss": 0.2926, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.475, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 3.1491546893612296e-06, |
|
"loss": 0.2998, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.4775, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 3.128061942020189e-06, |
|
"loss": 0.3104, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 4.75, |
|
"learning_rate": 3.1069213653306242e-06, |
|
"loss": 0.2955, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.4825, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 3.0857345692292968e-06, |
|
"loss": 0.2937, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.485, |
|
"grad_norm": 4.25, |
|
"learning_rate": 3.0645031671727598e-06, |
|
"loss": 0.2878, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.4875, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 3.0432287760144797e-06, |
|
"loss": 0.2988, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 3.0219130158817093e-06, |
|
"loss": 0.3043, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.4925, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 3.0005575100521115e-06, |
|
"loss": 0.2842, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.495, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 2.979163884830137e-06, |
|
"loss": 0.2922, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.4975, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 2.957733769423174e-06, |
|
"loss": 0.2998, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 2.93626879581748e-06, |
|
"loss": 0.2908, |
|
"step": 2000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 4000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 9223372036854775807, |
|
"save_steps": 2000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.5153945731072e+17, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|