|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 4000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0025, |
|
"grad_norm": 276.0, |
|
"learning_rate": 1.1250000000000001e-07, |
|
"loss": 0.4619, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.005, |
|
"grad_norm": 236.0, |
|
"learning_rate": 2.3750000000000003e-07, |
|
"loss": 0.457, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0075, |
|
"grad_norm": 276.0, |
|
"learning_rate": 3.625e-07, |
|
"loss": 0.51, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 194.0, |
|
"learning_rate": 4.875000000000001e-07, |
|
"loss": 0.4631, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0125, |
|
"grad_norm": 264.0, |
|
"learning_rate": 6.125000000000001e-07, |
|
"loss": 0.5266, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.015, |
|
"grad_norm": 21.75, |
|
"learning_rate": 7.375e-07, |
|
"loss": 0.5422, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0175, |
|
"grad_norm": 21.875, |
|
"learning_rate": 8.625e-07, |
|
"loss": 0.5314, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 170.0, |
|
"learning_rate": 9.875e-07, |
|
"loss": 0.4754, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0225, |
|
"grad_norm": 5.625, |
|
"learning_rate": 1.1125000000000001e-06, |
|
"loss": 0.5029, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.025, |
|
"grad_norm": 278.0, |
|
"learning_rate": 1.2375e-06, |
|
"loss": 0.4709, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0275, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.3625000000000003e-06, |
|
"loss": 0.3525, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 74.0, |
|
"learning_rate": 1.4875000000000002e-06, |
|
"loss": 0.3973, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.0325, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.6125000000000002e-06, |
|
"loss": 0.3656, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.035, |
|
"grad_norm": 232.0, |
|
"learning_rate": 1.7375e-06, |
|
"loss": 0.3996, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0375, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 1.8625000000000002e-06, |
|
"loss": 0.3652, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 12.875, |
|
"learning_rate": 1.9875000000000005e-06, |
|
"loss": 0.4156, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.0425, |
|
"grad_norm": 22.625, |
|
"learning_rate": 2.1125e-06, |
|
"loss": 0.3318, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.045, |
|
"grad_norm": 12.75, |
|
"learning_rate": 2.2375e-06, |
|
"loss": 0.3256, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.0475, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 2.3625000000000003e-06, |
|
"loss": 0.3281, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 25.875, |
|
"learning_rate": 2.4875000000000003e-06, |
|
"loss": 0.3357, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0525, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 2.6125e-06, |
|
"loss": 0.3479, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.055, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 2.7375e-06, |
|
"loss": 0.3328, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.0575, |
|
"grad_norm": 2.0, |
|
"learning_rate": 2.8625e-06, |
|
"loss": 0.3164, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 2.9875e-06, |
|
"loss": 0.299, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 3.1125000000000007e-06, |
|
"loss": 0.3129, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.065, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 3.2375e-06, |
|
"loss": 0.3131, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.0675, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 3.3625000000000004e-06, |
|
"loss": 0.3053, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 3.4875000000000005e-06, |
|
"loss": 0.299, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.0725, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 3.6125000000000006e-06, |
|
"loss": 0.3273, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.075, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 3.7375000000000006e-06, |
|
"loss": 0.3041, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0775, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 3.8625e-06, |
|
"loss": 0.3057, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.375, |
|
"learning_rate": 3.9875e-06, |
|
"loss": 0.315, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.0825, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 4.1125e-06, |
|
"loss": 0.3063, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.085, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 4.2375000000000005e-06, |
|
"loss": 0.3023, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.0875, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 4.362500000000001e-06, |
|
"loss": 0.2834, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.0, |
|
"learning_rate": 4.4875e-06, |
|
"loss": 0.2767, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.0925, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 4.6125e-06, |
|
"loss": 0.2859, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.095, |
|
"grad_norm": 2.375, |
|
"learning_rate": 4.737500000000001e-06, |
|
"loss": 0.2889, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.0975, |
|
"grad_norm": 4.75, |
|
"learning_rate": 4.8625000000000005e-06, |
|
"loss": 0.2916, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 4.987500000000001e-06, |
|
"loss": 0.2722, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.1025, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 4.999922894111975e-06, |
|
"loss": 0.2822, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.105, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 4.999656361346094e-06, |
|
"loss": 0.269, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.1075, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 4.999199470070484e-06, |
|
"loss": 0.2836, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 4.998552255079182e-06, |
|
"loss": 0.2577, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.1125, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 4.9977147656601196e-06, |
|
"loss": 0.266, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.115, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 4.996687065591355e-06, |
|
"loss": 0.2792, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.1175, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 4.9954692331362295e-06, |
|
"loss": 0.2801, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 4.9940613610373974e-06, |
|
"loss": 0.26, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.1225, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 4.992463556509772e-06, |
|
"loss": 0.2613, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 4.990675941232353e-06, |
|
"loss": 0.2385, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1275, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 4.988698651338965e-06, |
|
"loss": 0.2574, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.5, |
|
"learning_rate": 4.986531837407891e-06, |
|
"loss": 0.2559, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.1325, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 4.9841756644503965e-06, |
|
"loss": 0.2622, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.135, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 4.981630311898178e-06, |
|
"loss": 0.2537, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.1375, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 4.978895973589686e-06, |
|
"loss": 0.2832, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 4.975972857755369e-06, |
|
"loss": 0.2471, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.1425, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 4.972861187001815e-06, |
|
"loss": 0.2648, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.145, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 4.9695611982947995e-06, |
|
"loss": 0.2697, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.1475, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 4.966073142941239e-06, |
|
"loss": 0.2592, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 4.962397286570053e-06, |
|
"loss": 0.267, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.1525, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 4.958533909111936e-06, |
|
"loss": 0.253, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.155, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 4.95448330477804e-06, |
|
"loss": 0.2488, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.1575, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 4.950245782037566e-06, |
|
"loss": 0.2552, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 4.945821663594277e-06, |
|
"loss": 0.2485, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.1625, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 4.941211286361922e-06, |
|
"loss": 0.2612, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.165, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 4.936415001438577e-06, |
|
"loss": 0.2465, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.1675, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 4.9314331740799084e-06, |
|
"loss": 0.2449, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 4.926266183671356e-06, |
|
"loss": 0.2431, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.1725, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 4.920914423699247e-06, |
|
"loss": 0.2479, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.175, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 4.915378301720822e-06, |
|
"loss": 0.2327, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.1775, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 4.909658239333203e-06, |
|
"loss": 0.2434, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 4.903754672141288e-06, |
|
"loss": 0.2437, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.1825, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 4.897668049724574e-06, |
|
"loss": 0.248, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.185, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 4.891398835602925e-06, |
|
"loss": 0.2502, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 4.884947507201268e-06, |
|
"loss": 0.2368, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 4.878314555813237e-06, |
|
"loss": 0.2262, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.1925, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 4.8715004865637616e-06, |
|
"loss": 0.2392, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.195, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 4.8645058183705976e-06, |
|
"loss": 0.2403, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.1975, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 4.8573310839048085e-06, |
|
"loss": 0.2466, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 4.8499768295502e-06, |
|
"loss": 0.2453, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.2025, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 4.842443615361718e-06, |
|
"loss": 0.2566, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.205, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 4.834732015022786e-06, |
|
"loss": 0.2248, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.2075, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 4.826842615801628e-06, |
|
"loss": 0.2456, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 4.818776018506538e-06, |
|
"loss": 0.2445, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.2125, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 4.810532837440134e-06, |
|
"loss": 0.2451, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.215, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 4.802113700352567e-06, |
|
"loss": 0.2605, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.2175, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 4.793519248393721e-06, |
|
"loss": 0.2394, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 4.78475013606439e-06, |
|
"loss": 0.238, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.2225, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 4.775807031166428e-06, |
|
"loss": 0.236, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.225, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 4.766690614751897e-06, |
|
"loss": 0.2357, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.2275, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 4.757401581071203e-06, |
|
"loss": 0.2436, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 4.747940637520226e-06, |
|
"loss": 0.2309, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.2325, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 4.738308504586445e-06, |
|
"loss": 0.2254, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.235, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 4.7285059157940765e-06, |
|
"loss": 0.2221, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.2375, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 4.718533617648209e-06, |
|
"loss": 0.2246, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 4.7083923695779546e-06, |
|
"loss": 0.2228, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.2425, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 4.6980829438786176e-06, |
|
"loss": 0.2198, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.245, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 4.687606125652882e-06, |
|
"loss": 0.223, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.2475, |
|
"grad_norm": 1.5, |
|
"learning_rate": 4.676962712751015e-06, |
|
"loss": 0.2337, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 4.666153515710118e-06, |
|
"loss": 0.2273, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2525, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 4.655179357692396e-06, |
|
"loss": 0.2134, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.255, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 4.644041074422469e-06, |
|
"loss": 0.2078, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.2575, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 4.632739514123733e-06, |
|
"loss": 0.2229, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 4.6212755374537596e-06, |
|
"loss": 0.2166, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.2625, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 4.609650017438757e-06, |
|
"loss": 0.2222, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.265, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 4.5978638394070835e-06, |
|
"loss": 0.2257, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.2675, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 4.58591790092183e-06, |
|
"loss": 0.2269, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 4.5738131117124605e-06, |
|
"loss": 0.2107, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.2725, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 4.561550393605541e-06, |
|
"loss": 0.2285, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.275, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 4.549130680454532e-06, |
|
"loss": 0.219, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.2775, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 4.536554918068673e-06, |
|
"loss": 0.2203, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 4.523824064140961e-06, |
|
"loss": 0.2313, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.2825, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 4.510939088175211e-06, |
|
"loss": 0.2209, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.285, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 4.49790097141223e-06, |
|
"loss": 0.2191, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.2875, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 4.484710706755087e-06, |
|
"loss": 0.2131, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 4.471369298693505e-06, |
|
"loss": 0.2121, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.2925, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 4.457877763227361e-06, |
|
"loss": 0.2399, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.295, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 4.444237127789315e-06, |
|
"loss": 0.2145, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.2975, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 4.430448431166567e-06, |
|
"loss": 0.2179, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 4.416512723421752e-06, |
|
"loss": 0.2104, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.3025, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 4.402431065812968e-06, |
|
"loss": 0.2194, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.305, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 4.388204530712959e-06, |
|
"loss": 0.2179, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.3075, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 4.373834201527457e-06, |
|
"loss": 0.2142, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 4.359321172612664e-06, |
|
"loss": 0.2054, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 4.344666549191921e-06, |
|
"loss": 0.2196, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.315, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 4.329871447271541e-06, |
|
"loss": 0.2165, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.3175, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 4.314936993555816e-06, |
|
"loss": 0.2021, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 4.299864325361217e-06, |
|
"loss": 0.2113, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.3225, |
|
"grad_norm": 1.375, |
|
"learning_rate": 4.284654590529784e-06, |
|
"loss": 0.1996, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.325, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 4.269308947341711e-06, |
|
"loss": 0.2101, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.3275, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 4.25382856442714e-06, |
|
"loss": 0.2086, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 4.238214620677164e-06, |
|
"loss": 0.2125, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.3325, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 4.222468305154052e-06, |
|
"loss": 0.2162, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.335, |
|
"grad_norm": 1.25, |
|
"learning_rate": 4.206590817000695e-06, |
|
"loss": 0.2127, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.3375, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 4.190583365349289e-06, |
|
"loss": 0.2095, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 4.174447169229252e-06, |
|
"loss": 0.217, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.3425, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 4.158183457474392e-06, |
|
"loss": 0.212, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.345, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 4.141793468629327e-06, |
|
"loss": 0.2221, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.3475, |
|
"grad_norm": 1.125, |
|
"learning_rate": 4.125278450855165e-06, |
|
"loss": 0.2079, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 4.1086396618344474e-06, |
|
"loss": 0.2085, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.3525, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 4.09187836867538e-06, |
|
"loss": 0.2123, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.355, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 4.074995847815331e-06, |
|
"loss": 0.2116, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.3575, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 4.057993384923626e-06, |
|
"loss": 0.2288, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 4.0408722748036426e-06, |
|
"loss": 0.2038, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.3625, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 4.023633821294203e-06, |
|
"loss": 0.2123, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.365, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 4.006279337170283e-06, |
|
"loss": 0.1979, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.3675, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 3.988810144043041e-06, |
|
"loss": 0.2113, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 3.971227572259167e-06, |
|
"loss": 0.2077, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.3725, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 3.953532960799577e-06, |
|
"loss": 0.1986, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 3.935727657177439e-06, |
|
"loss": 0.1972, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.3775, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 3.917813017335562e-06, |
|
"loss": 0.2082, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.375, |
|
"learning_rate": 3.899790405543129e-06, |
|
"loss": 0.2086, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.3825, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 3.881661194291805e-06, |
|
"loss": 0.1968, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.385, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 3.863426764191216e-06, |
|
"loss": 0.2064, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.3875, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 3.845088503863813e-06, |
|
"loss": 0.1855, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 3.826647809839119e-06, |
|
"loss": 0.2039, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.3925, |
|
"grad_norm": 1.125, |
|
"learning_rate": 3.8081060864473794e-06, |
|
"loss": 0.2079, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.395, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 3.7894647457126188e-06, |
|
"loss": 0.2126, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.3975, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 3.770725207245106e-06, |
|
"loss": 0.2156, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 3.751888898133249e-06, |
|
"loss": 0.2122, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.4025, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 3.7329572528349145e-06, |
|
"loss": 0.1996, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.405, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 3.7139317130681886e-06, |
|
"loss": 0.2136, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.4075, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 3.694813727701584e-06, |
|
"loss": 0.2055, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 3.675604752643706e-06, |
|
"loss": 0.2184, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.4125, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 3.6563062507323752e-06, |
|
"loss": 0.2068, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.415, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 3.6369196916232297e-06, |
|
"loss": 0.2034, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.4175, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 3.6174465516778032e-06, |
|
"loss": 0.1992, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 3.5978883138510963e-06, |
|
"loss": 0.2037, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.4225, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 3.578246467578642e-06, |
|
"loss": 0.2185, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.425, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 3.558522508663081e-06, |
|
"loss": 0.2038, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.4275, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 3.538717939160249e-06, |
|
"loss": 0.2088, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 3.5188342672647897e-06, |
|
"loss": 0.2041, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.4325, |
|
"grad_norm": 1.125, |
|
"learning_rate": 3.4988730071953005e-06, |
|
"loss": 0.2065, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.435, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 3.478835679079019e-06, |
|
"loss": 0.2041, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 3.4587238088360605e-06, |
|
"loss": 0.1996, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 3.438538928063208e-06, |
|
"loss": 0.1931, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.4425, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 3.4182825739172826e-06, |
|
"loss": 0.2095, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.445, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 3.3979562889980777e-06, |
|
"loss": 0.2097, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.4475, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 3.377561621230887e-06, |
|
"loss": 0.1874, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 3.357100123748621e-06, |
|
"loss": 0.2025, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.4525, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 3.3365733547735334e-06, |
|
"loss": 0.1846, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.455, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 3.315982877498555e-06, |
|
"loss": 0.2049, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.4575, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 3.2953302599682487e-06, |
|
"loss": 0.2021, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 3.2746170749593998e-06, |
|
"loss": 0.1979, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.4625, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 3.2538448998612394e-06, |
|
"loss": 0.1965, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.465, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 3.233015316555326e-06, |
|
"loss": 0.2151, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.4675, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 3.212129911295074e-06, |
|
"loss": 0.1824, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 3.1911902745849526e-06, |
|
"loss": 0.2051, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.4725, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 3.17019800105937e-06, |
|
"loss": 0.2145, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.475, |
|
"grad_norm": 1.125, |
|
"learning_rate": 3.1491546893612296e-06, |
|
"loss": 0.2003, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.4775, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 3.128061942020189e-06, |
|
"loss": 0.2078, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 3.1069213653306242e-06, |
|
"loss": 0.1979, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.4825, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 3.0857345692292968e-06, |
|
"loss": 0.1925, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.485, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 3.0645031671727598e-06, |
|
"loss": 0.2042, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.4875, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 3.0432287760144797e-06, |
|
"loss": 0.2064, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 3.0219130158817093e-06, |
|
"loss": 0.2035, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.4925, |
|
"grad_norm": 1.125, |
|
"learning_rate": 3.0005575100521115e-06, |
|
"loss": 0.195, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.495, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 2.979163884830137e-06, |
|
"loss": 0.2044, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.4975, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 2.957733769423174e-06, |
|
"loss": 0.212, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 2.93626879581748e-06, |
|
"loss": 0.1985, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5025, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 2.914770598653902e-06, |
|
"loss": 0.1872, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.505, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 2.8932408151033868e-06, |
|
"loss": 0.1884, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.5075, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 2.8716810847423083e-06, |
|
"loss": 0.2003, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 2.8500930494276035e-06, |
|
"loss": 0.2057, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.5125, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 2.828478353171745e-06, |
|
"loss": 0.197, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.515, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 2.8068386420175376e-06, |
|
"loss": 0.1982, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.5175, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 2.785175563912766e-06, |
|
"loss": 0.1865, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 2.7634907685846995e-06, |
|
"loss": 0.1963, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.5225, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 2.7417859074144604e-06, |
|
"loss": 0.1976, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.525, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 2.7200626333112595e-06, |
|
"loss": 0.1997, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.5275, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 2.6983226005865236e-06, |
|
"loss": 0.1997, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 2.676567464827917e-06, |
|
"loss": 0.2114, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.5325, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 2.6547988827732546e-06, |
|
"loss": 0.194, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.535, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 2.633018512184341e-06, |
|
"loss": 0.198, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.5375, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 2.6112280117207223e-06, |
|
"loss": 0.2126, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 2.5894290408133744e-06, |
|
"loss": 0.2021, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.5425, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 2.56762325953833e-06, |
|
"loss": 0.1997, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.545, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 2.5458123284902577e-06, |
|
"loss": 0.1984, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.5475, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 2.5239979086560003e-06, |
|
"loss": 0.1981, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 2.5021816612880884e-06, |
|
"loss": 0.2044, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.5525, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 2.4803652477782228e-06, |
|
"loss": 0.2039, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.555, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 2.4585503295307565e-06, |
|
"loss": 0.203, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.5575, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 2.436738567836176e-06, |
|
"loss": 0.1996, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 2.4149316237445813e-06, |
|
"loss": 0.1979, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 2.3931311579391946e-06, |
|
"loss": 0.1939, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.565, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 2.37133883060989e-06, |
|
"loss": 0.1991, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.5675, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 2.3495563013267668e-06, |
|
"loss": 0.19, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.125, |
|
"learning_rate": 2.3277852289137636e-06, |
|
"loss": 0.1957, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.5725, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 2.306027271322336e-06, |
|
"loss": 0.1966, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.575, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 2.284284085505192e-06, |
|
"loss": 0.2099, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.5775, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 2.2625573272901156e-06, |
|
"loss": 0.1997, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 2.240848651253863e-06, |
|
"loss": 0.1897, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.5825, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 2.2191597105961613e-06, |
|
"loss": 0.1826, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.585, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 2.1974921570138155e-06, |
|
"loss": 0.1917, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.5875, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 2.1758476405749207e-06, |
|
"loss": 0.1963, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 2.154227809593203e-06, |
|
"loss": 0.1931, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.5925, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 2.1326343105024962e-06, |
|
"loss": 0.202, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.595, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 2.111068787731358e-06, |
|
"loss": 0.2061, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.5975, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 2.089532883577843e-06, |
|
"loss": 0.1867, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 2.068028238084432e-06, |
|
"loss": 0.2039, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.6025, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 2.046556488913137e-06, |
|
"loss": 0.209, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.605, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 2.025119271220789e-06, |
|
"loss": 0.1894, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.6075, |
|
"grad_norm": 1.125, |
|
"learning_rate": 2.0037182175345137e-06, |
|
"loss": 0.2098, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 1.9823549576274048e-06, |
|
"loss": 0.1955, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.6125, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 1.961031118394418e-06, |
|
"loss": 0.1881, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.615, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 1.939748323728468e-06, |
|
"loss": 0.1976, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.6175, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 1.918508194396769e-06, |
|
"loss": 0.2028, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 1.8973123479174038e-06, |
|
"loss": 0.2065, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.6225, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 1.8761623984361444e-06, |
|
"loss": 0.198, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 1.8550599566035299e-06, |
|
"loss": 0.192, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.6275, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 1.834006629452207e-06, |
|
"loss": 0.1894, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 1.8130040202745488e-06, |
|
"loss": 0.203, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.6325, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 1.7920537285005607e-06, |
|
"loss": 0.1879, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.635, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 1.7711573495760725e-06, |
|
"loss": 0.1887, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.6375, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 1.750316474841242e-06, |
|
"loss": 0.1894, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 1.7295326914093713e-06, |
|
"loss": 0.1991, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.6425, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 1.7088075820460348e-06, |
|
"loss": 0.1982, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.645, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 1.6881427250485516e-06, |
|
"loss": 0.1876, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.6475, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 1.6675396941257896e-06, |
|
"loss": 0.1853, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 1.6470000582783205e-06, |
|
"loss": 0.1917, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.6525, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 1.6265253816789372e-06, |
|
"loss": 0.2017, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.655, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 1.6061172235535342e-06, |
|
"loss": 0.193, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.6575, |
|
"grad_norm": 1.25, |
|
"learning_rate": 1.5857771380623643e-06, |
|
"loss": 0.2018, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 1.5655066741816898e-06, |
|
"loss": 0.2062, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.6625, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 1.545307375585814e-06, |
|
"loss": 0.1961, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.665, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 1.5251807805295302e-06, |
|
"loss": 0.208, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.6675, |
|
"grad_norm": 1.125, |
|
"learning_rate": 1.5051284217309743e-06, |
|
"loss": 0.1951, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 1.4851518262549058e-06, |
|
"loss": 0.1936, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.6725, |
|
"grad_norm": 1.25, |
|
"learning_rate": 1.465252515396413e-06, |
|
"loss": 0.2048, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.675, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 1.4454320045650606e-06, |
|
"loss": 0.1979, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.6775, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 1.4256918031694866e-06, |
|
"loss": 0.198, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 1.4060334145024543e-06, |
|
"loss": 0.1902, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.6825, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 1.3864583356263706e-06, |
|
"loss": 0.1911, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.685, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 1.366968057259282e-06, |
|
"loss": 0.2109, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 1.3475640636613447e-06, |
|
"loss": 0.2015, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 1.3282478325217961e-06, |
|
"loss": 0.2006, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.6925, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 1.3090208348464244e-06, |
|
"loss": 0.1842, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.695, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 1.289884534845542e-06, |
|
"loss": 0.2091, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.6975, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 1.2708403898224839e-06, |
|
"loss": 0.185, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 1.2518898500626259e-06, |
|
"loss": 0.1875, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.7025, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 1.2330343587229397e-06, |
|
"loss": 0.1926, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.705, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 1.2142753517220945e-06, |
|
"loss": 0.1988, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.7075, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 1.1956142576311011e-06, |
|
"loss": 0.1988, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 1.177052497564524e-06, |
|
"loss": 0.1817, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.7125, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 1.1585914850722565e-06, |
|
"loss": 0.1887, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.715, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 1.1402326260318752e-06, |
|
"loss": 0.1821, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.7175, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 1.121977318541575e-06, |
|
"loss": 0.1948, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 1.1038269528136989e-06, |
|
"loss": 0.2054, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.7225, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 1.0857829110688695e-06, |
|
"loss": 0.1956, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.725, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 1.0678465674307273e-06, |
|
"loss": 0.2033, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.7275, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 1.0500192878212826e-06, |
|
"loss": 0.1986, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 1.032302429856899e-06, |
|
"loss": 0.1979, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.7325, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 1.014697342744904e-06, |
|
"loss": 0.1994, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.735, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 9.97205367180842e-07, |
|
"loss": 0.1921, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.7375, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 9.798278352463752e-07, |
|
"loss": 0.1999, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 9.625660703078394e-07, |
|
"loss": 0.2021, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.7425, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 9.45421386915468e-07, |
|
"loss": 0.2027, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.745, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 9.283950907032788e-07, |
|
"loss": 0.1898, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.7475, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 9.114884782896482e-07, |
|
"loss": 0.1943, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 8.947028371785677e-07, |
|
"loss": 0.2189, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.7525, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 8.780394456615974e-07, |
|
"loss": 0.1889, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.755, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 8.614995727205155e-07, |
|
"loss": 0.2013, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.7575, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 8.450844779306827e-07, |
|
"loss": 0.1867, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 8.28795411365122e-07, |
|
"loss": 0.2004, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.7625, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 8.126336134993176e-07, |
|
"loss": 0.1959, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.765, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 7.966003151167498e-07, |
|
"loss": 0.1855, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.7675, |
|
"grad_norm": 1.125, |
|
"learning_rate": 7.806967372151661e-07, |
|
"loss": 0.1901, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 7.649240909135966e-07, |
|
"loss": 0.2125, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.7725, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 7.492835773601234e-07, |
|
"loss": 0.1916, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.775, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 7.337763876404078e-07, |
|
"loss": 0.1844, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.7775, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 7.184037026869867e-07, |
|
"loss": 0.1908, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 7.031666931893361e-07, |
|
"loss": 0.1858, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.7825, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 6.880665195047226e-07, |
|
"loss": 0.1976, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.785, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 6.731043315698346e-07, |
|
"loss": 0.1901, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.7875, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 6.58281268813212e-07, |
|
"loss": 0.2021, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 6.435984600684731e-07, |
|
"loss": 0.2088, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.7925, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 6.290570234883506e-07, |
|
"loss": 0.2004, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.795, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 6.146580664595391e-07, |
|
"loss": 0.1964, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.7975, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 6.004026855183656e-07, |
|
"loss": 0.1996, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 5.862919662672801e-07, |
|
"loss": 0.1919, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.8025, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 5.723269832921849e-07, |
|
"loss": 0.202, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.805, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 5.585088000806016e-07, |
|
"loss": 0.2024, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.8075, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 5.448384689406804e-07, |
|
"loss": 0.1962, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 5.313170309210655e-07, |
|
"loss": 0.1879, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 5.179455157316124e-07, |
|
"loss": 0.1987, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.815, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 5.047249416649713e-07, |
|
"loss": 0.2078, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.8175, |
|
"grad_norm": 14.25, |
|
"learning_rate": 4.916563155190446e-07, |
|
"loss": 0.2002, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 4.787406325203101e-07, |
|
"loss": 0.1948, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.8225, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 4.6597887624803273e-07, |
|
"loss": 0.1981, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.825, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 4.533720185593621e-07, |
|
"loss": 0.1965, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.8275, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 4.4092101951532076e-07, |
|
"loss": 0.1926, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 4.2862682730769157e-07, |
|
"loss": 0.1952, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.8325, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 4.164903781868096e-07, |
|
"loss": 0.1853, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.835, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 4.045125963902641e-07, |
|
"loss": 0.2008, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.8375, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 3.9269439407251365e-07, |
|
"loss": 0.2064, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 3.810366712354199e-07, |
|
"loss": 0.1833, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.8425, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 3.6954031565971187e-07, |
|
"loss": 0.1889, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.845, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 3.5820620283737615e-07, |
|
"loss": 0.1785, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.8475, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 3.4703519590498615e-07, |
|
"loss": 0.2022, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 3.360281455779704e-07, |
|
"loss": 0.1942, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.8525, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 3.2518589008582597e-07, |
|
"loss": 0.1933, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.855, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 3.1450925510828705e-07, |
|
"loss": 0.1902, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.8575, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 3.039990537124432e-07, |
|
"loss": 0.2161, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 2.936560862908225e-07, |
|
"loss": 0.178, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.8625, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 2.8348114050043813e-07, |
|
"loss": 0.2021, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.865, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 2.7347499120280677e-07, |
|
"loss": 0.2043, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.8675, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 2.636384004049375e-07, |
|
"loss": 0.1921, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.125, |
|
"learning_rate": 2.5397211720130267e-07, |
|
"loss": 0.19, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.8725, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 2.4447687771679414e-07, |
|
"loss": 0.1914, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 2.3515340505066043e-07, |
|
"loss": 0.1919, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.8775, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 2.260024092214419e-07, |
|
"loss": 0.2006, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 2.170245871129012e-07, |
|
"loss": 0.1982, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.8825, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 2.0822062242095015e-07, |
|
"loss": 0.2113, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.885, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 1.995911856015867e-07, |
|
"loss": 0.1969, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.8875, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 1.9113693381983405e-07, |
|
"loss": 0.1974, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 1.8285851089969802e-07, |
|
"loss": 0.1987, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.8925, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 1.7475654727513502e-07, |
|
"loss": 0.201, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.895, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 1.668316599420433e-07, |
|
"loss": 0.185, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.8975, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 1.5908445241127528e-07, |
|
"loss": 0.1872, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 1.5151551466267956e-07, |
|
"loss": 0.1985, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.9025, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 1.441254231001696e-07, |
|
"loss": 0.2036, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.905, |
|
"grad_norm": 1.125, |
|
"learning_rate": 1.3691474050782972e-07, |
|
"loss": 0.1902, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.9075, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 1.2988401600705635e-07, |
|
"loss": 0.1921, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 1.2303378501474174e-07, |
|
"loss": 0.1752, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.9125, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 1.16364569202497e-07, |
|
"loss": 0.1893, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.915, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 1.0987687645692746e-07, |
|
"loss": 0.2005, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.9175, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 1.035712008409534e-07, |
|
"loss": 0.1948, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.125, |
|
"learning_rate": 9.744802255618662e-08, |
|
"loss": 0.1903, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.9225, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 9.150780790636054e-08, |
|
"loss": 0.2147, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.925, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 8.575100926181884e-08, |
|
"loss": 0.19, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.9275, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 8.017806502506692e-08, |
|
"loss": 0.2004, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 7.478939959738502e-08, |
|
"loss": 0.2059, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.9325, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 6.958542334650847e-08, |
|
"loss": 0.1908, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.935, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 6.456653257537665e-08, |
|
"loss": 0.1979, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 5.973310949195343e-08, |
|
"loss": 0.1934, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 5.50855221801197e-08, |
|
"loss": 0.1893, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.9425, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 5.062412457164323e-08, |
|
"loss": 0.1985, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.945, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 4.634925641922472e-08, |
|
"loss": 0.2008, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.9475, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 4.226124327062514e-08, |
|
"loss": 0.2056, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 3.836039644387307e-08, |
|
"loss": 0.2036, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.9525, |
|
"grad_norm": 16.25, |
|
"learning_rate": 3.4647013003556996e-08, |
|
"loss": 0.199, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.955, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 3.112137573820284e-08, |
|
"loss": 0.1917, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.9575, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 2.7783753138738713e-08, |
|
"loss": 0.1946, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 2.463439937804707e-08, |
|
"loss": 0.1816, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.9625, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 2.1673554291610775e-08, |
|
"loss": 0.1859, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.965, |
|
"grad_norm": 1.375, |
|
"learning_rate": 1.8901443359245765e-08, |
|
"loss": 0.1993, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.9675, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 1.6318277687932816e-08, |
|
"loss": 0.2007, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 1.3924253995738769e-08, |
|
"loss": 0.1925, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.9725, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 1.1719554596836546e-08, |
|
"loss": 0.1996, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.975, |
|
"grad_norm": 1.125, |
|
"learning_rate": 9.704347387620994e-09, |
|
"loss": 0.1836, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.9775, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 7.878785833923819e-09, |
|
"loss": 0.1928, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 6.243008959324892e-09, |
|
"loss": 0.1967, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.9825, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 4.797141334566268e-09, |
|
"loss": 0.1992, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.985, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 3.5412930680658876e-09, |
|
"loss": 0.1993, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.9875, |
|
"grad_norm": 1.125, |
|
"learning_rate": 2.475559797531224e-09, |
|
"loss": 0.1989, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 1.6000226826770604e-09, |
|
"loss": 0.1826, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.9925, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 9.147483990443184e-10, |
|
"loss": 0.1969, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.995, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 4.197891329230097e-10, |
|
"loss": 0.2085, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.9975, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 1.1518257737763716e-10, |
|
"loss": 0.1928, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.125, |
|
"learning_rate": 9.51929376435956e-13, |
|
"loss": 0.1986, |
|
"step": 4000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 4000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 9223372036854775807, |
|
"save_steps": 2000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.82656621576192e+17, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|