|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 4000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0025, |
|
"grad_norm": 19.25, |
|
"learning_rate": 1.1250000000000001e-07, |
|
"loss": 1.8141, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.005, |
|
"grad_norm": 15.25, |
|
"learning_rate": 2.3750000000000003e-07, |
|
"loss": 1.7844, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0075, |
|
"grad_norm": 18.25, |
|
"learning_rate": 3.625e-07, |
|
"loss": 1.7555, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 14.0625, |
|
"learning_rate": 4.875000000000001e-07, |
|
"loss": 1.7414, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0125, |
|
"grad_norm": 17.75, |
|
"learning_rate": 6.125000000000001e-07, |
|
"loss": 1.7531, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.015, |
|
"grad_norm": 15.5, |
|
"learning_rate": 7.375e-07, |
|
"loss": 1.7852, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0175, |
|
"grad_norm": 17.875, |
|
"learning_rate": 8.625e-07, |
|
"loss": 1.7844, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 14.9375, |
|
"learning_rate": 9.875e-07, |
|
"loss": 1.7883, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0225, |
|
"grad_norm": 11.0, |
|
"learning_rate": 1.1125000000000001e-06, |
|
"loss": 1.7312, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.025, |
|
"grad_norm": 12.9375, |
|
"learning_rate": 1.2375e-06, |
|
"loss": 1.75, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0275, |
|
"grad_norm": 12.0625, |
|
"learning_rate": 1.3625000000000003e-06, |
|
"loss": 1.7477, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 14.3125, |
|
"learning_rate": 1.4875000000000002e-06, |
|
"loss": 1.7312, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.0325, |
|
"grad_norm": 12.375, |
|
"learning_rate": 1.6125000000000002e-06, |
|
"loss": 1.7414, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.035, |
|
"grad_norm": 12.3125, |
|
"learning_rate": 1.7375e-06, |
|
"loss": 1.7367, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0375, |
|
"grad_norm": 15.1875, |
|
"learning_rate": 1.8625000000000002e-06, |
|
"loss": 1.7656, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 13.25, |
|
"learning_rate": 1.9875000000000005e-06, |
|
"loss": 1.7523, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.0425, |
|
"grad_norm": 10.125, |
|
"learning_rate": 2.1125e-06, |
|
"loss": 1.6953, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.045, |
|
"grad_norm": 13.375, |
|
"learning_rate": 2.2375e-06, |
|
"loss": 1.7109, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.0475, |
|
"grad_norm": 9.5625, |
|
"learning_rate": 2.3625000000000003e-06, |
|
"loss": 1.7047, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 11.875, |
|
"learning_rate": 2.4875000000000003e-06, |
|
"loss": 1.675, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0525, |
|
"grad_norm": 14.5, |
|
"learning_rate": 2.6125e-06, |
|
"loss": 1.6836, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.055, |
|
"grad_norm": 12.875, |
|
"learning_rate": 2.7375e-06, |
|
"loss": 1.7133, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.0575, |
|
"grad_norm": 11.0625, |
|
"learning_rate": 2.8625e-06, |
|
"loss": 1.6461, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 13.625, |
|
"learning_rate": 2.9875e-06, |
|
"loss": 1.6242, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 15.375, |
|
"learning_rate": 3.1125000000000007e-06, |
|
"loss": 1.6523, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.065, |
|
"grad_norm": 10.5, |
|
"learning_rate": 3.2375e-06, |
|
"loss": 1.6141, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.0675, |
|
"grad_norm": 13.0625, |
|
"learning_rate": 3.3625000000000004e-06, |
|
"loss": 1.5766, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 13.6875, |
|
"learning_rate": 3.4875000000000005e-06, |
|
"loss": 1.5625, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.0725, |
|
"grad_norm": 19.0, |
|
"learning_rate": 3.6125000000000006e-06, |
|
"loss": 1.5367, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.075, |
|
"grad_norm": 18.75, |
|
"learning_rate": 3.7375000000000006e-06, |
|
"loss": 1.4852, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0775, |
|
"grad_norm": 11.3125, |
|
"learning_rate": 3.8625e-06, |
|
"loss": 1.4773, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 14.9375, |
|
"learning_rate": 3.9875e-06, |
|
"loss": 1.4672, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.0825, |
|
"grad_norm": 12.3125, |
|
"learning_rate": 4.1125e-06, |
|
"loss": 1.4109, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.085, |
|
"grad_norm": 12.625, |
|
"learning_rate": 4.2375000000000005e-06, |
|
"loss": 1.375, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.0875, |
|
"grad_norm": 19.0, |
|
"learning_rate": 4.362500000000001e-06, |
|
"loss": 1.3438, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 13.0625, |
|
"learning_rate": 4.4875e-06, |
|
"loss": 1.2742, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.0925, |
|
"grad_norm": 12.625, |
|
"learning_rate": 4.6125e-06, |
|
"loss": 1.2508, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.095, |
|
"grad_norm": 10.0625, |
|
"learning_rate": 4.737500000000001e-06, |
|
"loss": 1.268, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.0975, |
|
"grad_norm": 10.125, |
|
"learning_rate": 4.8625000000000005e-06, |
|
"loss": 1.2055, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 12.25, |
|
"learning_rate": 4.987500000000001e-06, |
|
"loss": 1.1742, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.1025, |
|
"grad_norm": 12.4375, |
|
"learning_rate": 4.999922894111975e-06, |
|
"loss": 1.1199, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.105, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 4.999656361346094e-06, |
|
"loss": 1.1195, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.1075, |
|
"grad_norm": 11.1875, |
|
"learning_rate": 4.999199470070484e-06, |
|
"loss": 1.0727, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 11.625, |
|
"learning_rate": 4.998552255079182e-06, |
|
"loss": 1.0711, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.1125, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 4.9977147656601196e-06, |
|
"loss": 1.0414, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.115, |
|
"grad_norm": 10.3125, |
|
"learning_rate": 4.996687065591355e-06, |
|
"loss": 1.0367, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.1175, |
|
"grad_norm": 10.5, |
|
"learning_rate": 4.9954692331362295e-06, |
|
"loss": 1.0125, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 10.625, |
|
"learning_rate": 4.9940613610373974e-06, |
|
"loss": 0.9828, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.1225, |
|
"grad_norm": 10.125, |
|
"learning_rate": 4.992463556509772e-06, |
|
"loss": 0.9695, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 10.75, |
|
"learning_rate": 4.990675941232353e-06, |
|
"loss": 0.9395, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1275, |
|
"grad_norm": 9.0, |
|
"learning_rate": 4.988698651338965e-06, |
|
"loss": 0.9137, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 11.375, |
|
"learning_rate": 4.986531837407891e-06, |
|
"loss": 0.9082, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.1325, |
|
"grad_norm": 12.5, |
|
"learning_rate": 4.9841756644503965e-06, |
|
"loss": 0.8809, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.135, |
|
"grad_norm": 18.875, |
|
"learning_rate": 4.981630311898178e-06, |
|
"loss": 0.8648, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.1375, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 4.978895973589686e-06, |
|
"loss": 0.8633, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 4.975972857755369e-06, |
|
"loss": 0.8293, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.1425, |
|
"grad_norm": 9.9375, |
|
"learning_rate": 4.972861187001815e-06, |
|
"loss": 0.8242, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.145, |
|
"grad_norm": 9.25, |
|
"learning_rate": 4.9695611982947995e-06, |
|
"loss": 0.8031, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.1475, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 4.966073142941239e-06, |
|
"loss": 0.775, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 4.962397286570053e-06, |
|
"loss": 0.7996, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.1525, |
|
"grad_norm": 10.5, |
|
"learning_rate": 4.958533909111936e-06, |
|
"loss": 0.7492, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.155, |
|
"grad_norm": 11.1875, |
|
"learning_rate": 4.95448330477804e-06, |
|
"loss": 0.7184, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.1575, |
|
"grad_norm": 9.6875, |
|
"learning_rate": 4.950245782037566e-06, |
|
"loss": 0.7023, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 10.0625, |
|
"learning_rate": 4.945821663594277e-06, |
|
"loss": 0.6949, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.1625, |
|
"grad_norm": 9.125, |
|
"learning_rate": 4.941211286361922e-06, |
|
"loss": 0.7043, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.165, |
|
"grad_norm": 9.5625, |
|
"learning_rate": 4.936415001438577e-06, |
|
"loss": 0.6633, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.1675, |
|
"grad_norm": 9.25, |
|
"learning_rate": 4.9314331740799084e-06, |
|
"loss": 0.652, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 4.926266183671356e-06, |
|
"loss": 0.6637, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.1725, |
|
"grad_norm": 9.75, |
|
"learning_rate": 4.920914423699247e-06, |
|
"loss": 0.6262, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.175, |
|
"grad_norm": 8.9375, |
|
"learning_rate": 4.915378301720822e-06, |
|
"loss": 0.6398, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.1775, |
|
"grad_norm": 9.25, |
|
"learning_rate": 4.909658239333203e-06, |
|
"loss": 0.6195, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 4.903754672141288e-06, |
|
"loss": 0.5977, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.1825, |
|
"grad_norm": 8.5, |
|
"learning_rate": 4.897668049724574e-06, |
|
"loss": 0.5766, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.185, |
|
"grad_norm": 9.1875, |
|
"learning_rate": 4.891398835602925e-06, |
|
"loss": 0.5656, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 4.884947507201268e-06, |
|
"loss": 0.5781, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 9.0, |
|
"learning_rate": 4.878314555813237e-06, |
|
"loss": 0.559, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.1925, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 4.8715004865637616e-06, |
|
"loss": 0.5496, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.195, |
|
"grad_norm": 8.5, |
|
"learning_rate": 4.8645058183705976e-06, |
|
"loss": 0.5264, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.1975, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 4.8573310839048085e-06, |
|
"loss": 0.5404, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 8.625, |
|
"learning_rate": 4.8499768295502e-06, |
|
"loss": 0.5117, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.2025, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 4.842443615361718e-06, |
|
"loss": 0.5195, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.205, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 4.834732015022786e-06, |
|
"loss": 0.4818, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.2075, |
|
"grad_norm": 8.25, |
|
"learning_rate": 4.826842615801628e-06, |
|
"loss": 0.4992, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 8.375, |
|
"learning_rate": 4.818776018506538e-06, |
|
"loss": 0.4975, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.2125, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 4.810532837440134e-06, |
|
"loss": 0.4752, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.215, |
|
"grad_norm": 8.0, |
|
"learning_rate": 4.802113700352567e-06, |
|
"loss": 0.4924, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.2175, |
|
"grad_norm": 7.6875, |
|
"learning_rate": 4.793519248393721e-06, |
|
"loss": 0.4564, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 8.125, |
|
"learning_rate": 4.78475013606439e-06, |
|
"loss": 0.443, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.2225, |
|
"grad_norm": 7.65625, |
|
"learning_rate": 4.775807031166428e-06, |
|
"loss": 0.4428, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.225, |
|
"grad_norm": 7.3125, |
|
"learning_rate": 4.766690614751897e-06, |
|
"loss": 0.4443, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.2275, |
|
"grad_norm": 7.625, |
|
"learning_rate": 4.757401581071203e-06, |
|
"loss": 0.4482, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 7.09375, |
|
"learning_rate": 4.747940637520226e-06, |
|
"loss": 0.4332, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.2325, |
|
"grad_norm": 7.65625, |
|
"learning_rate": 4.738308504586445e-06, |
|
"loss": 0.4215, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.235, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 4.7285059157940765e-06, |
|
"loss": 0.4332, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.2375, |
|
"grad_norm": 7.03125, |
|
"learning_rate": 4.718533617648209e-06, |
|
"loss": 0.4092, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 6.96875, |
|
"learning_rate": 4.7083923695779546e-06, |
|
"loss": 0.4297, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.2425, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 4.6980829438786176e-06, |
|
"loss": 0.3949, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.245, |
|
"grad_norm": 7.46875, |
|
"learning_rate": 4.687606125652882e-06, |
|
"loss": 0.4023, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.2475, |
|
"grad_norm": 6.75, |
|
"learning_rate": 4.676962712751015e-06, |
|
"loss": 0.3988, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 6.5, |
|
"learning_rate": 4.666153515710118e-06, |
|
"loss": 0.3975, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2525, |
|
"grad_norm": 6.84375, |
|
"learning_rate": 4.655179357692396e-06, |
|
"loss": 0.4049, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.255, |
|
"grad_norm": 6.375, |
|
"learning_rate": 4.644041074422469e-06, |
|
"loss": 0.4037, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.2575, |
|
"grad_norm": 7.03125, |
|
"learning_rate": 4.632739514123733e-06, |
|
"loss": 0.3846, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 4.6212755374537596e-06, |
|
"loss": 0.376, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.2625, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 4.609650017438757e-06, |
|
"loss": 0.4123, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.265, |
|
"grad_norm": 6.84375, |
|
"learning_rate": 4.5978638394070835e-06, |
|
"loss": 0.3816, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.2675, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 4.58591790092183e-06, |
|
"loss": 0.3854, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 6.75, |
|
"learning_rate": 4.5738131117124605e-06, |
|
"loss": 0.3783, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.2725, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 4.561550393605541e-06, |
|
"loss": 0.384, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.275, |
|
"grad_norm": 7.625, |
|
"learning_rate": 4.549130680454532e-06, |
|
"loss": 0.41, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.2775, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 4.536554918068673e-06, |
|
"loss": 0.3664, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 4.523824064140961e-06, |
|
"loss": 0.3727, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.2825, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 4.510939088175211e-06, |
|
"loss": 0.3764, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.285, |
|
"grad_norm": 5.875, |
|
"learning_rate": 4.49790097141223e-06, |
|
"loss": 0.358, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.2875, |
|
"grad_norm": 5.875, |
|
"learning_rate": 4.484710706755087e-06, |
|
"loss": 0.3549, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 4.471369298693505e-06, |
|
"loss": 0.3553, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.2925, |
|
"grad_norm": 5.75, |
|
"learning_rate": 4.457877763227361e-06, |
|
"loss": 0.3623, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.295, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 4.444237127789315e-06, |
|
"loss": 0.3629, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.2975, |
|
"grad_norm": 6.96875, |
|
"learning_rate": 4.430448431166567e-06, |
|
"loss": 0.3434, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 4.416512723421752e-06, |
|
"loss": 0.3549, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.3025, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 4.402431065812968e-06, |
|
"loss": 0.3461, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.305, |
|
"grad_norm": 5.375, |
|
"learning_rate": 4.388204530712959e-06, |
|
"loss": 0.3547, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.3075, |
|
"grad_norm": 5.90625, |
|
"learning_rate": 4.373834201527457e-06, |
|
"loss": 0.3383, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 4.359321172612664e-06, |
|
"loss": 0.3414, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 4.344666549191921e-06, |
|
"loss": 0.3285, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.315, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 4.329871447271541e-06, |
|
"loss": 0.3352, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.3175, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 4.314936993555816e-06, |
|
"loss": 0.3441, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 4.299864325361217e-06, |
|
"loss": 0.3523, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.3225, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 4.284654590529784e-06, |
|
"loss": 0.3348, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.325, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 4.269308947341711e-06, |
|
"loss": 0.3252, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.3275, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 4.25382856442714e-06, |
|
"loss": 0.3439, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 4.238214620677164e-06, |
|
"loss": 0.3326, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.3325, |
|
"grad_norm": 4.75, |
|
"learning_rate": 4.222468305154052e-06, |
|
"loss": 0.3332, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.335, |
|
"grad_norm": 5.25, |
|
"learning_rate": 4.206590817000695e-06, |
|
"loss": 0.3277, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.3375, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 4.190583365349289e-06, |
|
"loss": 0.3363, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 5.0, |
|
"learning_rate": 4.174447169229252e-06, |
|
"loss": 0.3412, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.3425, |
|
"grad_norm": 5.28125, |
|
"learning_rate": 4.158183457474392e-06, |
|
"loss": 0.326, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.345, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 4.141793468629327e-06, |
|
"loss": 0.334, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.3475, |
|
"grad_norm": 5.0, |
|
"learning_rate": 4.125278450855165e-06, |
|
"loss": 0.3367, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 6.28125, |
|
"learning_rate": 4.1086396618344474e-06, |
|
"loss": 0.3168, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.3525, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 4.09187836867538e-06, |
|
"loss": 0.3193, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.355, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 4.074995847815331e-06, |
|
"loss": 0.3225, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.3575, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 4.057993384923626e-06, |
|
"loss": 0.332, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 4.0408722748036426e-06, |
|
"loss": 0.3221, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.3625, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 4.023633821294203e-06, |
|
"loss": 0.3211, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.365, |
|
"grad_norm": 5.125, |
|
"learning_rate": 4.006279337170283e-06, |
|
"loss": 0.3195, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.3675, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 3.988810144043041e-06, |
|
"loss": 0.3225, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 3.971227572259167e-06, |
|
"loss": 0.3242, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.3725, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 3.953532960799577e-06, |
|
"loss": 0.3205, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 3.935727657177439e-06, |
|
"loss": 0.3152, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.3775, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 3.917813017335562e-06, |
|
"loss": 0.2998, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 3.899790405543129e-06, |
|
"loss": 0.3229, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.3825, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 3.881661194291805e-06, |
|
"loss": 0.3088, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.385, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 3.863426764191216e-06, |
|
"loss": 0.3205, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.3875, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 3.845088503863813e-06, |
|
"loss": 0.3137, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 4.5, |
|
"learning_rate": 3.826647809839119e-06, |
|
"loss": 0.3055, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.3925, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 3.8081060864473794e-06, |
|
"loss": 0.316, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.395, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 3.7894647457126188e-06, |
|
"loss": 0.3215, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.3975, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 3.770725207245106e-06, |
|
"loss": 0.3125, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 3.751888898133249e-06, |
|
"loss": 0.3168, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.4025, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 3.7329572528349145e-06, |
|
"loss": 0.3074, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.405, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 3.7139317130681886e-06, |
|
"loss": 0.3248, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.4075, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 3.694813727701584e-06, |
|
"loss": 0.3041, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 3.675604752643706e-06, |
|
"loss": 0.3049, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.4125, |
|
"grad_norm": 4.625, |
|
"learning_rate": 3.6563062507323752e-06, |
|
"loss": 0.3285, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.415, |
|
"grad_norm": 4.75, |
|
"learning_rate": 3.6369196916232297e-06, |
|
"loss": 0.2977, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.4175, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 3.6174465516778032e-06, |
|
"loss": 0.3074, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 4.625, |
|
"learning_rate": 3.5978883138510963e-06, |
|
"loss": 0.3055, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.4225, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 3.578246467578642e-06, |
|
"loss": 0.3096, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.425, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 3.558522508663081e-06, |
|
"loss": 0.3102, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.4275, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 3.538717939160249e-06, |
|
"loss": 0.2945, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 4.625, |
|
"learning_rate": 3.5188342672647897e-06, |
|
"loss": 0.2992, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.4325, |
|
"grad_norm": 4.5, |
|
"learning_rate": 3.4988730071953005e-06, |
|
"loss": 0.3107, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.435, |
|
"grad_norm": 4.375, |
|
"learning_rate": 3.478835679079019e-06, |
|
"loss": 0.3031, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 3.4587238088360605e-06, |
|
"loss": 0.317, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 3.438538928063208e-06, |
|
"loss": 0.2965, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.4425, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 3.4182825739172826e-06, |
|
"loss": 0.2899, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.445, |
|
"grad_norm": 4.375, |
|
"learning_rate": 3.3979562889980777e-06, |
|
"loss": 0.3107, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.4475, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 3.377561621230887e-06, |
|
"loss": 0.2855, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 4.375, |
|
"learning_rate": 3.357100123748621e-06, |
|
"loss": 0.3061, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.4525, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 3.3365733547735334e-06, |
|
"loss": 0.3068, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.455, |
|
"grad_norm": 4.5, |
|
"learning_rate": 3.315982877498555e-06, |
|
"loss": 0.2834, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.4575, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 3.2953302599682487e-06, |
|
"loss": 0.3029, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 3.2746170749593998e-06, |
|
"loss": 0.3121, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.4625, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 3.2538448998612394e-06, |
|
"loss": 0.293, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.465, |
|
"grad_norm": 4.875, |
|
"learning_rate": 3.233015316555326e-06, |
|
"loss": 0.2988, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.4675, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 3.212129911295074e-06, |
|
"loss": 0.3002, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 7.6875, |
|
"learning_rate": 3.1911902745849526e-06, |
|
"loss": 0.3146, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.4725, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 3.17019800105937e-06, |
|
"loss": 0.2926, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.475, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 3.1491546893612296e-06, |
|
"loss": 0.2998, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.4775, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 3.128061942020189e-06, |
|
"loss": 0.3104, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 4.75, |
|
"learning_rate": 3.1069213653306242e-06, |
|
"loss": 0.2955, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.4825, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 3.0857345692292968e-06, |
|
"loss": 0.2937, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.485, |
|
"grad_norm": 4.25, |
|
"learning_rate": 3.0645031671727598e-06, |
|
"loss": 0.2878, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.4875, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 3.0432287760144797e-06, |
|
"loss": 0.2988, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 3.0219130158817093e-06, |
|
"loss": 0.3043, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.4925, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 3.0005575100521115e-06, |
|
"loss": 0.2842, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.495, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 2.979163884830137e-06, |
|
"loss": 0.2922, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.4975, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 2.957733769423174e-06, |
|
"loss": 0.2998, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 2.93626879581748e-06, |
|
"loss": 0.2908, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5025, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 2.914770598653902e-06, |
|
"loss": 0.3014, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.505, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 2.8932408151033868e-06, |
|
"loss": 0.2891, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.5075, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 2.8716810847423083e-06, |
|
"loss": 0.2877, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 4.5, |
|
"learning_rate": 2.8500930494276035e-06, |
|
"loss": 0.3008, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.5125, |
|
"grad_norm": 4.75, |
|
"learning_rate": 2.828478353171745e-06, |
|
"loss": 0.2863, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.515, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 2.8068386420175376e-06, |
|
"loss": 0.3018, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.5175, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 2.785175563912766e-06, |
|
"loss": 0.3012, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 2.7634907685846995e-06, |
|
"loss": 0.2785, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.5225, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 2.7417859074144604e-06, |
|
"loss": 0.2887, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.525, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 2.7200626333112595e-06, |
|
"loss": 0.3049, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.5275, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 2.6983226005865236e-06, |
|
"loss": 0.2885, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 2.676567464827917e-06, |
|
"loss": 0.2982, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.5325, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 2.6547988827732546e-06, |
|
"loss": 0.2895, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.535, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 2.633018512184341e-06, |
|
"loss": 0.3061, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.5375, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 2.6112280117207223e-06, |
|
"loss": 0.2891, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 4.375, |
|
"learning_rate": 2.5894290408133744e-06, |
|
"loss": 0.2841, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.5425, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 2.56762325953833e-06, |
|
"loss": 0.3105, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.545, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 2.5458123284902577e-06, |
|
"loss": 0.2842, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.5475, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 2.5239979086560003e-06, |
|
"loss": 0.2842, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 2.5021816612880884e-06, |
|
"loss": 0.2877, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.5525, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 2.4803652477782228e-06, |
|
"loss": 0.2854, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.555, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 2.4585503295307565e-06, |
|
"loss": 0.2977, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.5575, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 2.436738567836176e-06, |
|
"loss": 0.2866, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 2.4149316237445813e-06, |
|
"loss": 0.2797, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 2.3931311579391946e-06, |
|
"loss": 0.298, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.565, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 2.37133883060989e-06, |
|
"loss": 0.2781, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.5675, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 2.3495563013267668e-06, |
|
"loss": 0.2961, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 2.3277852289137636e-06, |
|
"loss": 0.2891, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.5725, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 2.306027271322336e-06, |
|
"loss": 0.2824, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.575, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 2.284284085505192e-06, |
|
"loss": 0.2824, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.5775, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 2.2625573272901156e-06, |
|
"loss": 0.2807, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 2.240848651253863e-06, |
|
"loss": 0.2918, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.5825, |
|
"grad_norm": 4.625, |
|
"learning_rate": 2.2191597105961613e-06, |
|
"loss": 0.292, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.585, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 2.1974921570138155e-06, |
|
"loss": 0.2791, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.5875, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 2.1758476405749207e-06, |
|
"loss": 0.2807, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 2.154227809593203e-06, |
|
"loss": 0.3051, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.5925, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 2.1326343105024962e-06, |
|
"loss": 0.2787, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.595, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 2.111068787731358e-06, |
|
"loss": 0.2942, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.5975, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 2.089532883577843e-06, |
|
"loss": 0.2822, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 2.068028238084432e-06, |
|
"loss": 0.2879, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.6025, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 2.046556488913137e-06, |
|
"loss": 0.285, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.605, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 2.025119271220789e-06, |
|
"loss": 0.2838, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.6075, |
|
"grad_norm": 4.75, |
|
"learning_rate": 2.0037182175345137e-06, |
|
"loss": 0.3084, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.9823549576274048e-06, |
|
"loss": 0.2877, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.6125, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 1.961031118394418e-06, |
|
"loss": 0.2707, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.615, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.939748323728468e-06, |
|
"loss": 0.2808, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.6175, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 1.918508194396769e-06, |
|
"loss": 0.2855, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 1.8973123479174038e-06, |
|
"loss": 0.2892, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.6225, |
|
"grad_norm": 4.5, |
|
"learning_rate": 1.8761623984361444e-06, |
|
"loss": 0.2858, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 1.8550599566035299e-06, |
|
"loss": 0.2791, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.6275, |
|
"grad_norm": 4.25, |
|
"learning_rate": 1.834006629452207e-06, |
|
"loss": 0.2859, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.8130040202745488e-06, |
|
"loss": 0.2822, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.6325, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 1.7920537285005607e-06, |
|
"loss": 0.291, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.635, |
|
"grad_norm": 4.625, |
|
"learning_rate": 1.7711573495760725e-06, |
|
"loss": 0.2809, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.6375, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 1.750316474841242e-06, |
|
"loss": 0.2828, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.7295326914093713e-06, |
|
"loss": 0.2826, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.6425, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 1.7088075820460348e-06, |
|
"loss": 0.2797, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.645, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 1.6881427250485516e-06, |
|
"loss": 0.2867, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.6475, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 1.6675396941257896e-06, |
|
"loss": 0.2916, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 1.6470000582783205e-06, |
|
"loss": 0.2734, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.6525, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.6265253816789372e-06, |
|
"loss": 0.2771, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.655, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.6061172235535342e-06, |
|
"loss": 0.2916, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.6575, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.5857771380623643e-06, |
|
"loss": 0.2849, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.5655066741816898e-06, |
|
"loss": 0.2889, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.6625, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 1.545307375585814e-06, |
|
"loss": 0.28, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.665, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 1.5251807805295302e-06, |
|
"loss": 0.2896, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.6675, |
|
"grad_norm": 4.375, |
|
"learning_rate": 1.5051284217309743e-06, |
|
"loss": 0.2957, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 1.4851518262549058e-06, |
|
"loss": 0.2791, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.6725, |
|
"grad_norm": 3.875, |
|
"learning_rate": 1.465252515396413e-06, |
|
"loss": 0.2904, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.675, |
|
"grad_norm": 4.625, |
|
"learning_rate": 1.4454320045650606e-06, |
|
"loss": 0.291, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.6775, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 1.4256918031694866e-06, |
|
"loss": 0.2803, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.4060334145024543e-06, |
|
"loss": 0.2761, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.6825, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 1.3864583356263706e-06, |
|
"loss": 0.2811, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.685, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 1.366968057259282e-06, |
|
"loss": 0.2865, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"grad_norm": 4.375, |
|
"learning_rate": 1.3475640636613447e-06, |
|
"loss": 0.2865, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.3282478325217961e-06, |
|
"loss": 0.2738, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.6925, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 1.3090208348464244e-06, |
|
"loss": 0.2827, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.695, |
|
"grad_norm": 4.5, |
|
"learning_rate": 1.289884534845542e-06, |
|
"loss": 0.2831, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.6975, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.2708403898224839e-06, |
|
"loss": 0.2918, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 4.25, |
|
"learning_rate": 1.2518898500626259e-06, |
|
"loss": 0.2802, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.7025, |
|
"grad_norm": 4.875, |
|
"learning_rate": 1.2330343587229397e-06, |
|
"loss": 0.2855, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.705, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 1.2142753517220945e-06, |
|
"loss": 0.2723, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.7075, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 1.1956142576311011e-06, |
|
"loss": 0.2845, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 4.5, |
|
"learning_rate": 1.177052497564524e-06, |
|
"loss": 0.286, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.7125, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 1.1585914850722565e-06, |
|
"loss": 0.2892, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.715, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 1.1402326260318752e-06, |
|
"loss": 0.2805, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.7175, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 1.121977318541575e-06, |
|
"loss": 0.268, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.1038269528136989e-06, |
|
"loss": 0.3004, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.7225, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 1.0857829110688695e-06, |
|
"loss": 0.2844, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.725, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 1.0678465674307273e-06, |
|
"loss": 0.2857, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.7275, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.0500192878212826e-06, |
|
"loss": 0.2766, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.032302429856899e-06, |
|
"loss": 0.2811, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.7325, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 1.014697342744904e-06, |
|
"loss": 0.2915, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.735, |
|
"grad_norm": 4.5, |
|
"learning_rate": 9.97205367180842e-07, |
|
"loss": 0.2818, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.7375, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 9.798278352463752e-07, |
|
"loss": 0.2786, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 4.375, |
|
"learning_rate": 9.625660703078394e-07, |
|
"loss": 0.2902, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.7425, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 9.45421386915468e-07, |
|
"loss": 0.275, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.745, |
|
"grad_norm": 4.5, |
|
"learning_rate": 9.283950907032788e-07, |
|
"loss": 0.2768, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.7475, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 9.114884782896482e-07, |
|
"loss": 0.2816, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 8.947028371785677e-07, |
|
"loss": 0.2854, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.7525, |
|
"grad_norm": 4.5, |
|
"learning_rate": 8.780394456615974e-07, |
|
"loss": 0.2867, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.755, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 8.614995727205155e-07, |
|
"loss": 0.2806, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.7575, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 8.450844779306827e-07, |
|
"loss": 0.2857, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 8.28795411365122e-07, |
|
"loss": 0.2798, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.7625, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 8.126336134993176e-07, |
|
"loss": 0.2917, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.765, |
|
"grad_norm": 4.5, |
|
"learning_rate": 7.966003151167498e-07, |
|
"loss": 0.2762, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.7675, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 7.806967372151661e-07, |
|
"loss": 0.2785, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 3.953125, |
|
"learning_rate": 7.649240909135966e-07, |
|
"loss": 0.2707, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.7725, |
|
"grad_norm": 4.75, |
|
"learning_rate": 7.492835773601234e-07, |
|
"loss": 0.2892, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.775, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 7.337763876404078e-07, |
|
"loss": 0.2836, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.7775, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 7.184037026869867e-07, |
|
"loss": 0.2817, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 7.031666931893361e-07, |
|
"loss": 0.2774, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.7825, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 6.880665195047226e-07, |
|
"loss": 0.2728, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.785, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 6.731043315698346e-07, |
|
"loss": 0.292, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.7875, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 6.58281268813212e-07, |
|
"loss": 0.2863, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 6.435984600684731e-07, |
|
"loss": 0.2785, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.7925, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 6.290570234883506e-07, |
|
"loss": 0.2809, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.795, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 6.146580664595391e-07, |
|
"loss": 0.2836, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.7975, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 6.004026855183656e-07, |
|
"loss": 0.2928, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 5.862919662672801e-07, |
|
"loss": 0.2785, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.8025, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 5.723269832921849e-07, |
|
"loss": 0.2794, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.805, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 5.585088000806016e-07, |
|
"loss": 0.2951, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.8075, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 5.448384689406804e-07, |
|
"loss": 0.2764, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 5.313170309210655e-07, |
|
"loss": 0.2811, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 5.179455157316124e-07, |
|
"loss": 0.2779, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.815, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 5.047249416649713e-07, |
|
"loss": 0.2884, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.8175, |
|
"grad_norm": 4.625, |
|
"learning_rate": 4.916563155190446e-07, |
|
"loss": 0.2847, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 4.787406325203101e-07, |
|
"loss": 0.2717, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.8225, |
|
"grad_norm": 4.375, |
|
"learning_rate": 4.6597887624803273e-07, |
|
"loss": 0.2791, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.825, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 4.533720185593621e-07, |
|
"loss": 0.285, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.8275, |
|
"grad_norm": 4.0, |
|
"learning_rate": 4.4092101951532076e-07, |
|
"loss": 0.287, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 4.2862682730769157e-07, |
|
"loss": 0.283, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.8325, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 4.164903781868096e-07, |
|
"loss": 0.274, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.835, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 4.045125963902641e-07, |
|
"loss": 0.2686, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.8375, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 3.9269439407251365e-07, |
|
"loss": 0.2926, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 3.810366712354199e-07, |
|
"loss": 0.2704, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.8425, |
|
"grad_norm": 4.5, |
|
"learning_rate": 3.6954031565971187e-07, |
|
"loss": 0.2824, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.845, |
|
"grad_norm": 4.25, |
|
"learning_rate": 3.5820620283737615e-07, |
|
"loss": 0.2885, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.8475, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 3.4703519590498615e-07, |
|
"loss": 0.2701, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 3.360281455779704e-07, |
|
"loss": 0.2806, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.8525, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 3.2518589008582597e-07, |
|
"loss": 0.2904, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.855, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 3.1450925510828705e-07, |
|
"loss": 0.2768, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.8575, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 3.039990537124432e-07, |
|
"loss": 0.2825, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 4.25, |
|
"learning_rate": 2.936560862908225e-07, |
|
"loss": 0.28, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.8625, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 2.8348114050043813e-07, |
|
"loss": 0.2953, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.865, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 2.7347499120280677e-07, |
|
"loss": 0.2745, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.8675, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 2.636384004049375e-07, |
|
"loss": 0.2791, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 2.5397211720130267e-07, |
|
"loss": 0.2949, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.8725, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 2.4447687771679414e-07, |
|
"loss": 0.2795, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 4.25, |
|
"learning_rate": 2.3515340505066043e-07, |
|
"loss": 0.2737, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.8775, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 2.260024092214419e-07, |
|
"loss": 0.2788, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 2.170245871129012e-07, |
|
"loss": 0.2848, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.8825, |
|
"grad_norm": 4.375, |
|
"learning_rate": 2.0822062242095015e-07, |
|
"loss": 0.2863, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.885, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 1.995911856015867e-07, |
|
"loss": 0.2741, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.8875, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 1.9113693381983405e-07, |
|
"loss": 0.2753, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 1.8285851089969802e-07, |
|
"loss": 0.2934, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.8925, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 1.7475654727513502e-07, |
|
"loss": 0.2773, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.895, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 1.668316599420433e-07, |
|
"loss": 0.2919, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.8975, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 1.5908445241127528e-07, |
|
"loss": 0.2774, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 1.5151551466267956e-07, |
|
"loss": 0.274, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.9025, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.441254231001696e-07, |
|
"loss": 0.2831, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.905, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 1.3691474050782972e-07, |
|
"loss": 0.2728, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.9075, |
|
"grad_norm": 4.25, |
|
"learning_rate": 1.2988401600705635e-07, |
|
"loss": 0.2923, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 1.2303378501474174e-07, |
|
"loss": 0.2838, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.9125, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 1.16364569202497e-07, |
|
"loss": 0.2697, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.915, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.0987687645692746e-07, |
|
"loss": 0.2811, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.9175, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 1.035712008409534e-07, |
|
"loss": 0.2924, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 9.744802255618662e-08, |
|
"loss": 0.2797, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.9225, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 9.150780790636054e-08, |
|
"loss": 0.2854, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.925, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 8.575100926181884e-08, |
|
"loss": 0.2755, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.9275, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 8.017806502506692e-08, |
|
"loss": 0.2926, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 7.478939959738502e-08, |
|
"loss": 0.2817, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.9325, |
|
"grad_norm": 4.4375, |
|
"learning_rate": 6.958542334650847e-08, |
|
"loss": 0.273, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.935, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 6.456653257537665e-08, |
|
"loss": 0.3051, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 4.125, |
|
"learning_rate": 5.973310949195343e-08, |
|
"loss": 0.2812, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 4.5, |
|
"learning_rate": 5.50855221801197e-08, |
|
"loss": 0.2746, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.9425, |
|
"grad_norm": 4.375, |
|
"learning_rate": 5.062412457164323e-08, |
|
"loss": 0.2776, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.945, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 4.634925641922472e-08, |
|
"loss": 0.2827, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.9475, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 4.226124327062514e-08, |
|
"loss": 0.2875, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 4.375, |
|
"learning_rate": 3.836039644387307e-08, |
|
"loss": 0.2829, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.9525, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 3.4647013003556996e-08, |
|
"loss": 0.2669, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.955, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 3.112137573820284e-08, |
|
"loss": 0.2908, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.9575, |
|
"grad_norm": 4.625, |
|
"learning_rate": 2.7783753138738713e-08, |
|
"loss": 0.2735, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 2.463439937804707e-08, |
|
"loss": 0.2855, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.9625, |
|
"grad_norm": 4.375, |
|
"learning_rate": 2.1673554291610775e-08, |
|
"loss": 0.2746, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.965, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 1.8901443359245765e-08, |
|
"loss": 0.2762, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.9675, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 1.6318277687932816e-08, |
|
"loss": 0.2747, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 1.3924253995738769e-08, |
|
"loss": 0.2746, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.9725, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 1.1719554596836546e-08, |
|
"loss": 0.2838, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.975, |
|
"grad_norm": 4.5, |
|
"learning_rate": 9.704347387620994e-09, |
|
"loss": 0.2898, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.9775, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 7.878785833923819e-09, |
|
"loss": 0.2694, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 6.243008959324892e-09, |
|
"loss": 0.2751, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.9825, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 4.797141334566268e-09, |
|
"loss": 0.2928, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.985, |
|
"grad_norm": 4.5, |
|
"learning_rate": 3.5412930680658876e-09, |
|
"loss": 0.2745, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.9875, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 2.475559797531224e-09, |
|
"loss": 0.2865, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.6000226826770604e-09, |
|
"loss": 0.2758, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.9925, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 9.147483990443184e-10, |
|
"loss": 0.2887, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.995, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 4.197891329230097e-10, |
|
"loss": 0.2796, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.9975, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 1.1518257737763716e-10, |
|
"loss": 0.2779, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 9.51929376435956e-13, |
|
"loss": 0.2984, |
|
"step": 4000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 4000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 9223372036854775807, |
|
"save_steps": 2000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.0307891462144e+17, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|