{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 2500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004,
      "grad_norm": 54.75,
      "learning_rate": 7.2e-07,
      "loss": 13.3473,
      "step": 10
    },
    {
      "epoch": 0.008,
      "grad_norm": 65.0,
      "learning_rate": 1.52e-06,
      "loss": 13.3436,
      "step": 20
    },
    {
      "epoch": 0.012,
      "grad_norm": 57.0,
      "learning_rate": 2.3200000000000002e-06,
      "loss": 13.2209,
      "step": 30
    },
    {
      "epoch": 0.016,
      "grad_norm": 58.0,
      "learning_rate": 3.12e-06,
      "loss": 13.1524,
      "step": 40
    },
    {
      "epoch": 0.02,
      "grad_norm": 59.0,
      "learning_rate": 3.920000000000001e-06,
      "loss": 12.7597,
      "step": 50
    },
    {
      "epoch": 0.024,
      "grad_norm": 80.5,
      "learning_rate": 4.7200000000000005e-06,
      "loss": 12.78,
      "step": 60
    },
    {
      "epoch": 0.028,
      "grad_norm": 64.5,
      "learning_rate": 5.5200000000000005e-06,
      "loss": 12.5047,
      "step": 70
    },
    {
      "epoch": 0.032,
      "grad_norm": 72.5,
      "learning_rate": 6.3200000000000005e-06,
      "loss": 12.1863,
      "step": 80
    },
    {
      "epoch": 0.036,
      "grad_norm": 58.0,
      "learning_rate": 7.1200000000000004e-06,
      "loss": 12.0318,
      "step": 90
    },
    {
      "epoch": 0.04,
      "grad_norm": 57.25,
      "learning_rate": 7.92e-06,
      "loss": 11.5564,
      "step": 100
    },
    {
      "epoch": 0.044,
      "grad_norm": 47.0,
      "learning_rate": 8.720000000000001e-06,
      "loss": 11.5071,
      "step": 110
    },
    {
      "epoch": 0.048,
      "grad_norm": 40.5,
      "learning_rate": 9.52e-06,
      "loss": 10.9574,
      "step": 120
    },
    {
      "epoch": 0.052,
      "grad_norm": 41.5,
      "learning_rate": 1.0320000000000001e-05,
      "loss": 10.7984,
      "step": 130
    },
    {
      "epoch": 0.056,
      "grad_norm": 39.0,
      "learning_rate": 1.1120000000000002e-05,
      "loss": 10.665,
      "step": 140
    },
    {
      "epoch": 0.06,
      "grad_norm": 57.25,
      "learning_rate": 1.1920000000000001e-05,
      "loss": 10.3269,
      "step": 150
    },
    {
      "epoch": 0.064,
      "grad_norm": 21.5,
      "learning_rate": 1.2720000000000002e-05,
      "loss": 10.2163,
      "step": 160
    },
    {
      "epoch": 0.068,
      "grad_norm": 58.25,
      "learning_rate": 1.3520000000000003e-05,
      "loss": 10.1014,
      "step": 170
    },
    {
      "epoch": 0.072,
      "grad_norm": 27.875,
      "learning_rate": 1.432e-05,
      "loss": 9.5063,
      "step": 180
    },
    {
      "epoch": 0.076,
      "grad_norm": 30.5,
      "learning_rate": 1.5120000000000001e-05,
      "loss": 9.6283,
      "step": 190
    },
    {
      "epoch": 0.08,
      "grad_norm": 27.5,
      "learning_rate": 1.5920000000000003e-05,
      "loss": 9.6242,
      "step": 200
    },
    {
      "epoch": 0.084,
      "grad_norm": 28.0,
      "learning_rate": 1.672e-05,
      "loss": 9.5881,
      "step": 210
    },
    {
      "epoch": 0.088,
      "grad_norm": 56.75,
      "learning_rate": 1.752e-05,
      "loss": 9.3942,
      "step": 220
    },
    {
      "epoch": 0.092,
      "grad_norm": 28.75,
      "learning_rate": 1.832e-05,
      "loss": 9.0517,
      "step": 230
    },
    {
      "epoch": 0.096,
      "grad_norm": 42.75,
      "learning_rate": 1.912e-05,
      "loss": 9.1673,
      "step": 240
    },
    {
      "epoch": 0.1,
      "grad_norm": 43.25,
      "learning_rate": 1.9920000000000002e-05,
      "loss": 8.6786,
      "step": 250
    },
    {
      "epoch": 0.104,
      "grad_norm": 35.75,
      "learning_rate": 1.9999210442038164e-05,
      "loss": 8.981,
      "step": 260
    },
    {
      "epoch": 0.108,
      "grad_norm": 19.125,
      "learning_rate": 1.9996481265944146e-05,
      "loss": 8.7596,
      "step": 270
    },
    {
      "epoch": 0.112,
      "grad_norm": 28.0,
      "learning_rate": 1.9991803256020393e-05,
      "loss": 8.6905,
      "step": 280
    },
    {
      "epoch": 0.116,
      "grad_norm": 37.25,
      "learning_rate": 1.99851773242542e-05,
      "loss": 8.5464,
      "step": 290
    },
    {
      "epoch": 0.12,
      "grad_norm": 26.5,
      "learning_rate": 1.99766047623841e-05,
      "loss": 8.5164,
      "step": 300
    },
    {
      "epoch": 0.124,
      "grad_norm": 29.875,
      "learning_rate": 1.996608724164801e-05,
      "loss": 8.4347,
      "step": 310
    },
    {
      "epoch": 0.128,
      "grad_norm": 30.375,
      "learning_rate": 1.995362681245744e-05,
      "loss": 8.1998,
      "step": 320
    },
    {
      "epoch": 0.132,
      "grad_norm": 27.0,
      "learning_rate": 1.9939225903997748e-05,
      "loss": 8.3628,
      "step": 330
    },
    {
      "epoch": 0.136,
      "grad_norm": 22.75,
      "learning_rate": 1.992288732375458e-05,
      "loss": 8.0026,
      "step": 340
    },
    {
      "epoch": 0.14,
      "grad_norm": 28.0,
      "learning_rate": 1.9904614256966514e-05,
      "loss": 8.1026,
      "step": 350
    },
    {
      "epoch": 0.144,
      "grad_norm": 25.125,
      "learning_rate": 1.9884410266004134e-05,
      "loss": 7.7991,
      "step": 360
    },
    {
      "epoch": 0.148,
      "grad_norm": 33.0,
      "learning_rate": 1.986227928967551e-05,
      "loss": 7.9716,
      "step": 370
    },
    {
      "epoch": 0.152,
      "grad_norm": 29.375,
      "learning_rate": 1.983822564245833e-05,
      "loss": 7.7902,
      "step": 380
    },
    {
      "epoch": 0.156,
      "grad_norm": 21.875,
      "learning_rate": 1.981225401365877e-05,
      "loss": 7.6093,
      "step": 390
    },
    {
      "epoch": 0.16,
      "grad_norm": 28.0,
      "learning_rate": 1.9784369466497333e-05,
      "loss": 7.6624,
      "step": 400
    },
    {
      "epoch": 0.164,
      "grad_norm": 23.25,
      "learning_rate": 1.9754577437121733e-05,
      "loss": 7.8131,
      "step": 410
    },
    {
      "epoch": 0.168,
      "grad_norm": 27.75,
      "learning_rate": 1.9722883733547128e-05,
      "loss": 7.7623,
      "step": 420
    },
    {
      "epoch": 0.172,
      "grad_norm": 20.375,
      "learning_rate": 1.968929453452383e-05,
      "loss": 7.6954,
      "step": 430
    },
    {
      "epoch": 0.176,
      "grad_norm": 25.125,
      "learning_rate": 1.965381638833274e-05,
      "loss": 7.6341,
      "step": 440
    },
    {
      "epoch": 0.18,
      "grad_norm": 31.25,
      "learning_rate": 1.9616456211508756e-05,
      "loss": 7.759,
      "step": 450
    },
    {
      "epoch": 0.184,
      "grad_norm": 29.125,
      "learning_rate": 1.9577221287492368e-05,
      "loss": 7.6304,
      "step": 460
    },
    {
      "epoch": 0.188,
      "grad_norm": 29.625,
      "learning_rate": 1.9536119265209763e-05,
      "loss": 7.3878,
      "step": 470
    },
    {
      "epoch": 0.192,
      "grad_norm": 19.875,
      "learning_rate": 1.9493158157581617e-05,
      "loss": 7.175,
      "step": 480
    },
    {
      "epoch": 0.196,
      "grad_norm": 21.625,
      "learning_rate": 1.9448346339960984e-05,
      "loss": 7.3158,
      "step": 490
    },
    {
      "epoch": 0.2,
      "grad_norm": 21.375,
      "learning_rate": 1.9401692548500504e-05,
      "loss": 7.1866,
      "step": 500
    },
    {
      "epoch": 0.204,
      "grad_norm": 22.875,
      "learning_rate": 1.935320587844926e-05,
      "loss": 7.2426,
      "step": 510
    },
    {
      "epoch": 0.208,
      "grad_norm": 23.0,
      "learning_rate": 1.9302895782379648e-05,
      "loss": 7.2151,
      "step": 520
    },
    {
      "epoch": 0.212,
      "grad_norm": 18.125,
      "learning_rate": 1.925077206834458e-05,
      "loss": 7.1319,
      "step": 530
    },
    {
      "epoch": 0.216,
      "grad_norm": 26.0,
      "learning_rate": 1.9196844897965393e-05,
      "loss": 7.2889,
      "step": 540
    },
    {
      "epoch": 0.22,
      "grad_norm": 17.875,
      "learning_rate": 1.914112478445079e-05,
      "loss": 7.1092,
      "step": 550
    },
    {
      "epoch": 0.224,
      "grad_norm": 28.5,
      "learning_rate": 1.9083622590547313e-05,
      "loss": 7.3018,
      "step": 560
    },
    {
      "epoch": 0.228,
      "grad_norm": 27.375,
      "learning_rate": 1.9024349526421596e-05,
      "loss": 7.1044,
      "step": 570
    },
    {
      "epoch": 0.232,
      "grad_norm": 23.5,
      "learning_rate": 1.896331714747493e-05,
      "loss": 7.2433,
      "step": 580
    },
    {
      "epoch": 0.236,
      "grad_norm": 22.625,
      "learning_rate": 1.8900537352090523e-05,
      "loss": 6.9127,
      "step": 590
    },
    {
      "epoch": 0.24,
      "grad_norm": 29.75,
      "learning_rate": 1.8836022379313884e-05,
      "loss": 7.0044,
      "step": 600
    },
    {
      "epoch": 0.244,
      "grad_norm": 23.625,
      "learning_rate": 1.8769784806466768e-05,
      "loss": 6.7093,
      "step": 610
    },
    {
      "epoch": 0.248,
      "grad_norm": 18.875,
      "learning_rate": 1.870183754669526e-05,
      "loss": 6.7646,
      "step": 620
    },
    {
      "epoch": 0.252,
      "grad_norm": 20.5,
      "learning_rate": 1.863219384645227e-05,
      "loss": 6.8328,
      "step": 630
    },
    {
      "epoch": 0.256,
      "grad_norm": 27.75,
      "learning_rate": 1.8560867282915164e-05,
      "loss": 6.7888,
      "step": 640
    },
    {
      "epoch": 0.26,
      "grad_norm": 20.25,
      "learning_rate": 1.848787176133882e-05,
      "loss": 6.8189,
      "step": 650
    },
    {
      "epoch": 0.264,
      "grad_norm": 28.125,
      "learning_rate": 1.8413221512344805e-05,
      "loss": 6.8333,
      "step": 660
    },
    {
      "epoch": 0.268,
      "grad_norm": 20.0,
      "learning_rate": 1.8336931089147076e-05,
      "loss": 6.6808,
      "step": 670
    },
    {
      "epoch": 0.272,
      "grad_norm": 18.625,
      "learning_rate": 1.8259015364714786e-05,
      "loss": 6.6521,
      "step": 680
    },
    {
      "epoch": 0.276,
      "grad_norm": 22.75,
      "learning_rate": 1.8179489528872808e-05,
      "loss": 6.7383,
      "step": 690
    },
    {
      "epoch": 0.28,
      "grad_norm": 25.125,
      "learning_rate": 1.80983690853404e-05,
      "loss": 6.77,
      "step": 700
    },
    {
      "epoch": 0.284,
      "grad_norm": 26.875,
      "learning_rate": 1.8015669848708768e-05,
      "loss": 6.7929,
      "step": 710
    },
    {
      "epoch": 0.288,
      "grad_norm": 23.375,
      "learning_rate": 1.793140794135795e-05,
      "loss": 6.7116,
      "step": 720
    },
    {
      "epoch": 0.292,
      "grad_norm": 23.125,
      "learning_rate": 1.7845599790313735e-05,
      "loss": 6.5916,
      "step": 730
    },
    {
      "epoch": 0.296,
      "grad_norm": 21.125,
      "learning_rate": 1.7758262124045195e-05,
      "loss": 6.6243,
      "step": 740
    },
    {
      "epoch": 0.3,
      "grad_norm": 19.625,
      "learning_rate": 1.7669411969203417e-05,
      "loss": 6.6006,
      "step": 750
    },
    {
      "epoch": 0.304,
      "grad_norm": 28.25,
      "learning_rate": 1.7579066647302134e-05,
      "loss": 6.5745,
      "step": 760
    },
    {
      "epoch": 0.308,
      "grad_norm": 16.0,
      "learning_rate": 1.7487243771340862e-05,
      "loss": 6.4531,
      "step": 770
    },
    {
      "epoch": 0.312,
      "grad_norm": 20.875,
      "learning_rate": 1.7393961242371203e-05,
      "loss": 6.5237,
      "step": 780
    },
    {
      "epoch": 0.316,
      "grad_norm": 15.9375,
      "learning_rate": 1.7299237246007018e-05,
      "loss": 6.3753,
      "step": 790
    },
    {
      "epoch": 0.32,
      "grad_norm": 18.875,
      "learning_rate": 1.720309024887907e-05,
      "loss": 6.419,
      "step": 800
    },
    {
      "epoch": 0.324,
      "grad_norm": 20.875,
      "learning_rate": 1.710553899503496e-05,
      "loss": 6.3493,
      "step": 810
    },
    {
      "epoch": 0.328,
      "grad_norm": 20.625,
      "learning_rate": 1.700660250228492e-05,
      "loss": 6.376,
      "step": 820
    },
    {
      "epoch": 0.332,
      "grad_norm": 18.375,
      "learning_rate": 1.690630005849423e-05,
      "loss": 6.471,
      "step": 830
    },
    {
      "epoch": 0.336,
      "grad_norm": 15.1875,
      "learning_rate": 1.6804651217823055e-05,
      "loss": 6.43,
      "step": 840
    },
    {
      "epoch": 0.34,
      "grad_norm": 15.875,
      "learning_rate": 1.6701675796914284e-05,
      "loss": 6.2644,
      "step": 850
    },
    {
      "epoch": 0.344,
      "grad_norm": 18.125,
      "learning_rate": 1.6597393871030264e-05,
      "loss": 6.2126,
      "step": 860
    },
    {
      "epoch": 0.348,
      "grad_norm": 19.75,
      "learning_rate": 1.649182577013906e-05,
      "loss": 6.3967,
      "step": 870
    },
    {
      "epoch": 0.352,
      "grad_norm": 25.5,
      "learning_rate": 1.6384992074951124e-05,
      "loss": 6.2047,
      "step": 880
    },
    {
      "epoch": 0.356,
      "grad_norm": 23.75,
      "learning_rate": 1.6276913612907005e-05,
      "loss": 6.4128,
      "step": 890
    },
    {
      "epoch": 0.36,
      "grad_norm": 43.75,
      "learning_rate": 1.6167611454117027e-05,
      "loss": 6.1328,
      "step": 900
    },
    {
      "epoch": 0.364,
      "grad_norm": 19.625,
      "learning_rate": 1.6057106907253617e-05,
      "loss": 6.2045,
      "step": 910
    },
    {
      "epoch": 0.368,
      "grad_norm": 18.75,
      "learning_rate": 1.5945421515397135e-05,
      "loss": 6.1914,
      "step": 920
    },
    {
      "epoch": 0.372,
      "grad_norm": 26.125,
      "learning_rate": 1.5832577051836016e-05,
      "loss": 6.3516,
      "step": 930
    },
    {
      "epoch": 0.376,
      "grad_norm": 20.375,
      "learning_rate": 1.5718595515822027e-05,
      "loss": 6.2091,
      "step": 940
    },
    {
      "epoch": 0.38,
      "grad_norm": 19.75,
      "learning_rate": 1.5603499128281447e-05,
      "loss": 6.148,
      "step": 950
    },
    {
      "epoch": 0.384,
      "grad_norm": 18.0,
      "learning_rate": 1.5487310327483087e-05,
      "loss": 6.0491,
      "step": 960
    },
    {
      "epoch": 0.388,
      "grad_norm": 14.1875,
      "learning_rate": 1.5370051764663872e-05,
      "loss": 6.2801,
      "step": 970
    },
    {
      "epoch": 0.392,
      "grad_norm": 14.0,
      "learning_rate": 1.5251746299612959e-05,
      "loss": 6.1509,
      "step": 980
    },
    {
      "epoch": 0.396,
      "grad_norm": 17.25,
      "learning_rate": 1.5132416996215171e-05,
      "loss": 6.1506,
      "step": 990
    },
    {
      "epoch": 0.4,
      "grad_norm": 21.625,
      "learning_rate": 1.5012087117954643e-05,
      "loss": 6.1863,
      "step": 1000
    },
    {
      "epoch": 0.404,
      "grad_norm": 22.875,
      "learning_rate": 1.4890780123379565e-05,
      "loss": 6.1573,
      "step": 1010
    },
    {
      "epoch": 0.408,
      "grad_norm": 18.375,
      "learning_rate": 1.4768519661528879e-05,
      "loss": 6.0607,
      "step": 1020
    },
    {
      "epoch": 0.412,
      "grad_norm": 20.0,
      "learning_rate": 1.464532956732188e-05,
      "loss": 6.1736,
      "step": 1030
    },
    {
      "epoch": 0.416,
      "grad_norm": 14.375,
      "learning_rate": 1.4521233856911507e-05,
      "loss": 6.0672,
      "step": 1040
    },
    {
      "epoch": 0.42,
      "grad_norm": 13.9375,
      "learning_rate": 1.43962567230024e-05,
      "loss": 5.7487,
      "step": 1050
    },
    {
      "epoch": 0.424,
      "grad_norm": 14.375,
      "learning_rate": 1.4270422530134433e-05,
      "loss": 6.0476,
      "step": 1060
    },
    {
      "epoch": 0.428,
      "grad_norm": 15.0625,
      "learning_rate": 1.4143755809932843e-05,
      "loss": 6.1103,
      "step": 1070
    },
    {
      "epoch": 0.432,
      "grad_norm": 13.5,
      "learning_rate": 1.4016281256325702e-05,
      "loss": 5.907,
      "step": 1080
    },
    {
      "epoch": 0.436,
      "grad_norm": 14.875,
      "learning_rate": 1.388802372072981e-05,
      "loss": 5.9089,
      "step": 1090
    },
    {
      "epoch": 0.44,
      "grad_norm": 15.625,
      "learning_rate": 1.3759008207205869e-05,
      "loss": 5.9699,
      "step": 1100
    },
    {
      "epoch": 0.444,
      "grad_norm": 20.0,
      "learning_rate": 1.3629259867583864e-05,
      "loss": 5.7375,
      "step": 1110
    },
    {
      "epoch": 0.448,
      "grad_norm": 14.6875,
      "learning_rate": 1.349880399655969e-05,
      "loss": 5.9555,
      "step": 1120
    },
    {
      "epoch": 0.452,
      "grad_norm": 18.125,
      "learning_rate": 1.3367666026763884e-05,
      "loss": 5.986,
      "step": 1130
    },
    {
      "epoch": 0.456,
      "grad_norm": 19.125,
      "learning_rate": 1.3235871523803496e-05,
      "loss": 5.896,
      "step": 1140
    },
    {
      "epoch": 0.46,
      "grad_norm": 14.125,
      "learning_rate": 1.3103446181278015e-05,
      "loss": 5.8373,
      "step": 1150
    },
    {
      "epoch": 0.464,
      "grad_norm": 14.125,
      "learning_rate": 1.297041581577035e-05,
      "loss": 5.8646,
      "step": 1160
    },
    {
      "epoch": 0.468,
      "grad_norm": 17.25,
      "learning_rate": 1.2836806361813846e-05,
      "loss": 5.8566,
      "step": 1170
    },
    {
      "epoch": 0.472,
      "grad_norm": 21.75,
      "learning_rate": 1.270264386683628e-05,
      "loss": 5.6326,
      "step": 1180
    },
    {
      "epoch": 0.476,
      "grad_norm": 14.375,
      "learning_rate": 1.256795448608188e-05,
      "loss": 5.7247,
      "step": 1190
    },
    {
      "epoch": 0.48,
      "grad_norm": 15.875,
      "learning_rate": 1.2432764477512294e-05,
      "loss": 5.6694,
      "step": 1200
    },
    {
      "epoch": 0.484,
      "grad_norm": 16.25,
      "learning_rate": 1.2297100196687557e-05,
      "loss": 5.8965,
      "step": 1210
    },
    {
      "epoch": 0.488,
      "grad_norm": 14.3125,
      "learning_rate": 1.2160988091628023e-05,
      "loss": 5.6092,
      "step": 1220
    },
    {
      "epoch": 0.492,
      "grad_norm": 15.25,
      "learning_rate": 1.202445469765826e-05,
      "loss": 5.6918,
      "step": 1230
    },
    {
      "epoch": 0.496,
      "grad_norm": 14.0,
      "learning_rate": 1.1887526632233954e-05,
      "loss": 5.8349,
      "step": 1240
    },
    {
      "epoch": 0.5,
      "grad_norm": 15.25,
      "learning_rate": 1.1750230589752763e-05,
      "loss": 5.8349,
      "step": 1250
    },
    {
      "epoch": 0.504,
      "grad_norm": 15.25,
      "learning_rate": 1.1612593336350209e-05,
      "loss": 5.6769,
      "step": 1260
    },
    {
      "epoch": 0.508,
      "grad_norm": 16.875,
      "learning_rate": 1.1474641704681551e-05,
      "loss": 5.6439,
      "step": 1270
    },
    {
      "epoch": 0.512,
      "grad_norm": 12.6875,
      "learning_rate": 1.1336402588690727e-05,
      "loss": 5.653,
      "step": 1280
    },
    {
      "epoch": 0.516,
      "grad_norm": 13.5,
      "learning_rate": 1.1197902938367297e-05,
      "loss": 5.5384,
      "step": 1290
    },
    {
      "epoch": 0.52,
      "grad_norm": 17.25,
      "learning_rate": 1.105916975449252e-05,
      "loss": 5.5844,
      "step": 1300
    },
    {
      "epoch": 0.524,
      "grad_norm": 15.5625,
      "learning_rate": 1.0920230083375474e-05,
      "loss": 5.7039,
      "step": 1310
    },
    {
      "epoch": 0.528,
      "grad_norm": 14.75,
      "learning_rate": 1.0781111011580336e-05,
      "loss": 5.8199,
      "step": 1320
    },
    {
      "epoch": 0.532,
      "grad_norm": 13.875,
      "learning_rate": 1.0641839660645806e-05,
      "loss": 5.6763,
      "step": 1330
    },
    {
      "epoch": 0.536,
      "grad_norm": 14.125,
      "learning_rate": 1.0502443181797696e-05,
      "loss": 5.6979,
      "step": 1340
    },
    {
      "epoch": 0.54,
      "grad_norm": 15.5625,
      "learning_rate": 1.036294875065576e-05,
      "loss": 5.6108,
      "step": 1350
    },
    {
      "epoch": 0.544,
      "grad_norm": 14.5,
      "learning_rate": 1.0223383561935738e-05,
      "loss": 5.6434,
      "step": 1360
    },
    {
      "epoch": 0.548,
      "grad_norm": 12.3125,
      "learning_rate": 1.0083774824147707e-05,
      "loss": 5.5022,
      "step": 1370
    },
    {
      "epoch": 0.552,
      "grad_norm": 22.875,
      "learning_rate": 9.944149754291719e-06,
      "loss": 5.6792,
      "step": 1380
    },
    {
      "epoch": 0.556,
      "grad_norm": 15.1875,
      "learning_rate": 9.80453557255179e-06,
      "loss": 5.6344,
      "step": 1390
    },
    {
      "epoch": 0.56,
      "grad_norm": 13.5,
      "learning_rate": 9.664959496989286e-06,
      "loss": 5.5327,
      "step": 1400
    },
    {
      "epoch": 0.564,
      "grad_norm": 13.125,
      "learning_rate": 9.525448738236691e-06,
      "loss": 5.477,
      "step": 1410
    },
    {
      "epoch": 0.568,
      "grad_norm": 12.625,
      "learning_rate": 9.386030494192847e-06,
      "loss": 5.5999,
      "step": 1420
    },
    {
      "epoch": 0.572,
      "grad_norm": 15.875,
      "learning_rate": 9.246731944720675e-06,
      "loss": 5.6199,
      "step": 1430
    },
    {
      "epoch": 0.576,
      "grad_norm": 15.25,
      "learning_rate": 9.107580246348395e-06,
      "loss": 5.5676,
      "step": 1440
    },
    {
      "epoch": 0.58,
      "grad_norm": 12.25,
      "learning_rate": 8.968602526975329e-06,
      "loss": 5.4793,
      "step": 1450
    },
    {
      "epoch": 0.584,
      "grad_norm": 14.0,
      "learning_rate": 8.829825880583228e-06,
      "loss": 5.5789,
      "step": 1460
    },
    {
      "epoch": 0.588,
      "grad_norm": 14.125,
      "learning_rate": 8.69127736195428e-06,
      "loss": 5.5571,
      "step": 1470
    },
    {
      "epoch": 0.592,
      "grad_norm": 14.1875,
      "learning_rate": 8.552983981396709e-06,
      "loss": 5.5194,
      "step": 1480
    },
    {
      "epoch": 0.596,
      "grad_norm": 14.0,
      "learning_rate": 8.414972699479076e-06,
      "loss": 5.5605,
      "step": 1490
    },
    {
      "epoch": 0.6,
      "grad_norm": 13.375,
      "learning_rate": 8.277270421774234e-06,
      "loss": 5.4973,
      "step": 1500
    },
    {
      "epoch": 0.604,
      "grad_norm": 15.6875,
      "learning_rate": 8.139903993614069e-06,
      "loss": 5.6634,
      "step": 1510
    },
    {
      "epoch": 0.608,
      "grad_norm": 15.625,
      "learning_rate": 8.00290019485593e-06,
      "loss": 5.6461,
      "step": 1520
    },
    {
      "epoch": 0.612,
      "grad_norm": 13.6875,
      "learning_rate": 7.866285734661842e-06,
      "loss": 5.7029,
      "step": 1530
    },
    {
      "epoch": 0.616,
      "grad_norm": 15.1875,
      "learning_rate": 7.730087246291503e-06,
      "loss": 5.6841,
      "step": 1540
    },
    {
      "epoch": 0.62,
      "grad_norm": 13.8125,
      "learning_rate": 7.594331281910082e-06,
      "loss": 5.6831,
      "step": 1550
    },
    {
      "epoch": 0.624,
      "grad_norm": 13.3125,
      "learning_rate": 7.4590443074118325e-06,
      "loss": 5.6429,
      "step": 1560
    },
    {
      "epoch": 0.628,
      "grad_norm": 12.1875,
      "learning_rate": 7.324252697260475e-06,
      "loss": 5.6396,
      "step": 1570
    },
    {
      "epoch": 0.632,
      "grad_norm": 14.125,
      "learning_rate": 7.189982729347491e-06,
      "loss": 5.7249,
      "step": 1580
    },
    {
      "epoch": 0.636,
      "grad_norm": 14.25,
      "learning_rate": 7.056260579869165e-06,
      "loss": 5.6366,
      "step": 1590
    },
    {
      "epoch": 0.64,
      "grad_norm": 12.3125,
      "learning_rate": 6.923112318223497e-06,
      "loss": 5.5417,
      "step": 1600
    },
    {
      "epoch": 0.644,
      "grad_norm": 14.625,
      "learning_rate": 6.790563901927907e-06,
      "loss": 5.5896,
      "step": 1610
    },
    {
      "epoch": 0.648,
      "grad_norm": 14.75,
      "learning_rate": 6.658641171558785e-06,
      "loss": 5.8033,
      "step": 1620
    },
    {
      "epoch": 0.652,
      "grad_norm": 13.5,
      "learning_rate": 6.52736984571381e-06,
      "loss": 5.6251,
      "step": 1630
    },
    {
      "epoch": 0.656,
      "grad_norm": 15.375,
      "learning_rate": 6.396775515998055e-06,
      "loss": 5.5814,
      "step": 1640
    },
    {
      "epoch": 0.66,
      "grad_norm": 16.125,
      "learning_rate": 6.2668836420348535e-06,
      "loss": 5.5373,
      "step": 1650
    },
    {
      "epoch": 0.664,
      "grad_norm": 14.5,
      "learning_rate": 6.137719546502401e-06,
      "loss": 5.4405,
      "step": 1660
    },
    {
      "epoch": 0.668,
      "grad_norm": 13.5625,
      "learning_rate": 6.009308410197048e-06,
      "loss": 5.5314,
      "step": 1670
    },
    {
      "epoch": 0.672,
      "grad_norm": 13.625,
      "learning_rate": 5.881675267124254e-06,
      "loss": 5.5082,
      "step": 1680
    },
    {
      "epoch": 0.676,
      "grad_norm": 12.3125,
      "learning_rate": 5.754844999618144e-06,
      "loss": 5.5669,
      "step": 1690
    },
    {
      "epoch": 0.68,
      "grad_norm": 12.5,
      "learning_rate": 5.628842333490674e-06,
      "loss": 5.4605,
      "step": 1700
    },
    {
      "epoch": 0.684,
      "grad_norm": 13.375,
      "learning_rate": 5.50369183321126e-06,
      "loss": 5.5224,
      "step": 1710
    },
    {
      "epoch": 0.688,
      "grad_norm": 12.5625,
      "learning_rate": 5.379417897117917e-06,
      "loss": 5.5977,
      "step": 1720
    },
    {
      "epoch": 0.692,
      "grad_norm": 11.75,
      "learning_rate": 5.256044752660709e-06,
      "loss": 5.6513,
      "step": 1730
    },
    {
      "epoch": 0.696,
      "grad_norm": 16.375,
      "learning_rate": 5.133596451678603e-06,
      "loss": 5.6258,
      "step": 1740
    },
    {
      "epoch": 0.7,
      "grad_norm": 13.375,
      "learning_rate": 5.012096865710494e-06,
      "loss": 5.6574,
      "step": 1750
    },
    {
      "epoch": 0.704,
      "grad_norm": 12.9375,
      "learning_rate": 4.891569681341403e-06,
      "loss": 5.5567,
      "step": 1760
    },
    {
      "epoch": 0.708,
      "grad_norm": 10.8125,
      "learning_rate": 4.772038395584735e-06,
      "loss": 5.438,
      "step": 1770
    },
    {
      "epoch": 0.712,
      "grad_norm": 13.0625,
      "learning_rate": 4.6535263113014885e-06,
      "loss": 5.3993,
      "step": 1780
    },
    {
      "epoch": 0.716,
      "grad_norm": 15.25,
      "learning_rate": 4.53605653265731e-06,
      "loss": 5.7265,
      "step": 1790
    },
    {
      "epoch": 0.72,
      "grad_norm": 18.0,
      "learning_rate": 4.419651960618302e-06,
      "loss": 5.5694,
      "step": 1800
    },
    {
      "epoch": 0.724,
      "grad_norm": 13.0,
      "learning_rate": 4.304335288486426e-06,
      "loss": 5.3722,
      "step": 1810
    },
    {
      "epoch": 0.728,
      "grad_norm": 12.1875,
      "learning_rate": 4.190128997475402e-06,
      "loss": 5.555,
      "step": 1820
    },
    {
      "epoch": 0.732,
      "grad_norm": 12.0,
      "learning_rate": 4.0770553523279535e-06,
      "loss": 5.5382,
      "step": 1830
    },
    {
      "epoch": 0.736,
      "grad_norm": 13.8125,
      "learning_rate": 3.965136396975235e-06,
      "loss": 5.6195,
      "step": 1840
    },
    {
      "epoch": 0.74,
      "grad_norm": 12.5625,
      "learning_rate": 3.854393950239356e-06,
      "loss": 5.5777,
      "step": 1850
    },
    {
      "epoch": 0.744,
      "grad_norm": 14.375,
      "learning_rate": 3.7448496015797296e-06,
      "loss": 5.5992,
      "step": 1860
    },
    {
      "epoch": 0.748,
      "grad_norm": 14.9375,
      "learning_rate": 3.636524706884181e-06,
      "loss": 5.5852,
      "step": 1870
    },
    {
      "epoch": 0.752,
      "grad_norm": 13.0625,
      "learning_rate": 3.5294403843055604e-06,
      "loss": 5.5306,
      "step": 1880
    },
    {
      "epoch": 0.756,
      "grad_norm": 19.375,
      "learning_rate": 3.4236175101447265e-06,
      "loss": 5.6253,
      "step": 1890
    },
    {
      "epoch": 0.76,
      "grad_norm": 13.25,
      "learning_rate": 3.3190767147806825e-06,
      "loss": 5.6067,
      "step": 1900
    },
    {
      "epoch": 0.764,
      "grad_norm": 12.5625,
      "learning_rate": 3.2158383786486204e-06,
      "loss": 5.4635,
      "step": 1910
    },
    {
      "epoch": 0.768,
      "grad_norm": 13.5625,
      "learning_rate": 3.113922628266718e-06,
      "loss": 5.5343,
      "step": 1920
    },
    {
      "epoch": 0.772,
      "grad_norm": 13.375,
      "learning_rate": 3.013349332312451e-06,
      "loss": 5.5415,
      "step": 1930
    },
    {
      "epoch": 0.776,
      "grad_norm": 12.9375,
      "learning_rate": 2.9141380977491373e-06,
      "loss": 5.4654,
      "step": 1940
    },
    {
      "epoch": 0.78,
      "grad_norm": 15.1875,
      "learning_rate": 2.816308266003541e-06,
      "loss": 5.7947,
      "step": 1950
    },
    {
      "epoch": 0.784,
      "grad_norm": 12.1875,
      "learning_rate": 2.7198789091951903e-06,
      "loss": 5.5924,
      "step": 1960
    },
    {
      "epoch": 0.788,
      "grad_norm": 16.5,
      "learning_rate": 2.624868826418262e-06,
      "loss": 5.5665,
      "step": 1970
    },
    {
      "epoch": 0.792,
      "grad_norm": 11.75,
      "learning_rate": 2.5312965400766475e-06,
      "loss": 5.6325,
      "step": 1980
    },
    {
      "epoch": 0.796,
      "grad_norm": 15.375,
      "learning_rate": 2.4391802922729703e-06,
      "loss": 5.5853,
      "step": 1990
    },
    {
      "epoch": 0.8,
      "grad_norm": 13.9375,
      "learning_rate": 2.3485380412522586e-06,
      "loss": 5.529,
      "step": 2000
    },
    {
      "epoch": 0.804,
      "grad_norm": 11.3125,
      "learning_rate": 2.259387457900948e-06,
      "loss": 5.3686,
      "step": 2010
    },
    {
      "epoch": 0.808,
      "grad_norm": 12.3125,
      "learning_rate": 2.171745922301903e-06,
      "loss": 5.6398,
      "step": 2020
    },
    {
      "epoch": 0.812,
      "grad_norm": 12.5625,
      "learning_rate": 2.0856305203461436e-06,
      "loss": 5.6724,
      "step": 2030
    },
    {
      "epoch": 0.816,
      "grad_norm": 12.0,
      "learning_rate": 2.0010580404019066e-06,
      "loss": 5.4096,
      "step": 2040
    },
    {
      "epoch": 0.82,
      "grad_norm": 14.5,
      "learning_rate": 1.918044970041729e-06,
      "loss": 5.5648,
      "step": 2050
    },
    {
      "epoch": 0.824,
      "grad_norm": 11.875,
      "learning_rate": 1.8366074928281608e-06,
      "loss": 5.4581,
      "step": 2060
    },
    {
      "epoch": 0.828,
      "grad_norm": 13.3125,
      "learning_rate": 1.7567614851587444e-06,
      "loss": 5.5112,
      "step": 2070
    },
    {
      "epoch": 0.832,
      "grad_norm": 14.125,
      "learning_rate": 1.6785225131708749e-06,
      "loss": 5.5498,
      "step": 2080
    },
    {
      "epoch": 0.836,
      "grad_norm": 12.375,
      "learning_rate": 1.601905829707171e-06,
      "loss": 5.6644,
      "step": 2090
    },
    {
      "epoch": 0.84,
      "grad_norm": 11.9375,
      "learning_rate": 1.526926371341878e-06,
      "loss": 5.6666,
      "step": 2100
    },
    {
      "epoch": 0.844,
      "grad_norm": 13.25,
      "learning_rate": 1.4535987554689712e-06,
      "loss": 5.6801,
      "step": 2110
    },
    {
      "epoch": 0.848,
      "grad_norm": 13.8125,
      "learning_rate": 1.381937277452451e-06,
      "loss": 5.4535,
      "step": 2120
    },
    {
      "epoch": 0.852,
      "grad_norm": 12.25,
      "learning_rate": 1.3119559078394462e-06,
      "loss": 5.5883,
      "step": 2130
    },
    {
      "epoch": 0.856,
      "grad_norm": 15.0625,
      "learning_rate": 1.2436682896366282e-06,
      "loss": 5.48,
      "step": 2140
    },
    {
      "epoch": 0.86,
      "grad_norm": 16.75,
      "learning_rate": 1.1770877356504684e-06,
      "loss": 5.6772,
      "step": 2150
    },
    {
      "epoch": 0.864,
      "grad_norm": 11.6875,
      "learning_rate": 1.1122272258918864e-06,
      "loss": 5.5312,
      "step": 2160
    },
    {
      "epoch": 0.868,
      "grad_norm": 14.375,
      "learning_rate": 1.0490994050457748e-06,
      "loss": 5.5698,
      "step": 2170
    },
    {
      "epoch": 0.872,
      "grad_norm": 15.9375,
      "learning_rate": 9.877165800058874e-07,
      "loss": 5.5503,
      "step": 2180
    },
    {
      "epoch": 0.876,
      "grad_norm": 14.5625,
      "learning_rate": 9.280907174755916e-07,
      "loss": 5.7086,
      "step": 2190
    },
    {
      "epoch": 0.88,
      "grad_norm": 12.4375,
      "learning_rate": 8.702334416349279e-07,
      "loss": 5.7326,
      "step": 2200
    },
    {
      "epoch": 0.884,
      "grad_norm": 11.9375,
      "learning_rate": 8.141560318744601e-07,
      "loss": 5.6012,
      "step": 2210
    },
    {
      "epoch": 0.888,
      "grad_norm": 12.25,
      "learning_rate": 7.598694205963331e-07,
      "loss": 5.4755,
      "step": 2220
    },
    {
      "epoch": 0.892,
      "grad_norm": 12.8125,
      "learning_rate": 7.073841910829771e-07,
      "loss": 5.6836,
      "step": 2230
    },
    {
      "epoch": 0.896,
      "grad_norm": 15.8125,
      "learning_rate": 6.567105754338798e-07,
      "loss": 5.6414,
      "step": 2240
    },
    {
      "epoch": 0.9,
      "grad_norm": 17.0,
      "learning_rate": 6.078584525708175e-07,
      "loss": 5.402,
      "step": 2250
    },
    {
      "epoch": 0.904,
      "grad_norm": 13.25,
      "learning_rate": 5.608373463119354e-07,
      "loss": 5.6034,
      "step": 2260
    },
    {
      "epoch": 0.908,
      "grad_norm": 13.8125,
      "learning_rate": 5.156564235150686e-07,
      "loss": 5.5214,
      "step": 2270
    },
    {
      "epoch": 0.912,
      "grad_norm": 15.4375,
      "learning_rate": 4.723244922906356e-07,
      "loss": 5.7135,
      "step": 2280
    },
    {
      "epoch": 0.916,
      "grad_norm": 14.1875,
      "learning_rate": 4.308500002844862e-07,
      "loss": 5.5808,
      "step": 2290
    },
    {
      "epoch": 0.92,
      "grad_norm": 13.6875,
      "learning_rate": 3.912410330310157e-07,
      "loss": 5.6566,
      "step": 2300
    },
    {
      "epoch": 0.924,
      "grad_norm": 13.9375,
      "learning_rate": 3.5350531237686723e-07,
      "loss": 5.6426,
      "step": 2310
    },
    {
      "epoch": 0.928,
      "grad_norm": 12.8125,
      "learning_rate": 3.1765019497555617e-07,
      "loss": 5.5883,
      "step": 2320
    },
    {
      "epoch": 0.932,
      "grad_norm": 12.9375,
      "learning_rate": 2.836826708532603e-07,
      "loss": 5.5928,
      "step": 2330
    },
    {
      "epoch": 0.936,
      "grad_norm": 12.1875,
      "learning_rate": 2.516093620461124e-07,
      "loss": 5.5457,
      "step": 2340
    },
    {
      "epoch": 0.94,
      "grad_norm": 13.6875,
      "learning_rate": 2.214365213092118e-07,
      "loss": 5.4391,
      "step": 2350
    },
    {
      "epoch": 0.944,
      "grad_norm": 13.0625,
      "learning_rate": 1.9317003089764365e-07,
      "loss": 5.4379,
      "step": 2360
    },
    {
      "epoch": 0.948,
      "grad_norm": 11.875,
      "learning_rate": 1.668154014197243e-07,
      "loss": 5.441,
      "step": 2370
    },
    {
      "epoch": 0.952,
      "grad_norm": 11.25,
      "learning_rate": 1.4237777076268723e-07,
      "loss": 5.473,
      "step": 2380
    },
    {
      "epoch": 0.956,
      "grad_norm": 12.6875,
      "learning_rate": 1.1986190309104861e-07,
      "loss": 5.571,
      "step": 2390
    },
    {
      "epoch": 0.96,
      "grad_norm": 14.25,
      "learning_rate": 9.9272187917826e-08,
      "loss": 5.6245,
      "step": 2400
    },
    {
      "epoch": 0.964,
      "grad_norm": 11.875,
      "learning_rate": 8.061263924878604e-08,
      "loss": 5.4666,
      "step": 2410
    },
    {
      "epoch": 0.968,
      "grad_norm": 12.1875,
      "learning_rate": 6.388689479991606e-08,
      "loss": 5.4966,
      "step": 2420
    },
    {
      "epoch": 0.972,
      "grad_norm": 11.1875,
      "learning_rate": 4.9098215288235776e-08,
      "loss": 5.6736,
      "step": 2430
    },
    {
      "epoch": 0.976,
      "grad_norm": 12.25,
      "learning_rate": 3.6249483796116924e-08,
      "loss": 5.5797,
      "step": 2440
    },
    {
      "epoch": 0.98,
      "grad_norm": 12.375,
      "learning_rate": 2.5343205209225062e-08,
      "loss": 5.564,
      "step": 2450
    },
    {
      "epoch": 0.984,
      "grad_norm": 13.0,
      "learning_rate": 1.6381505728176872e-08,
      "loss": 5.5886,
      "step": 2460
    },
    {
      "epoch": 0.988,
      "grad_norm": 13.0625,
      "learning_rate": 9.366132454046162e-09,
      "loss": 5.6782,
      "step": 2470
    },
    {
      "epoch": 0.992,
      "grad_norm": 14.9375,
      "learning_rate": 4.298453047749674e-09,
      "loss": 5.5379,
      "step": 2480
    },
    {
      "epoch": 0.996,
      "grad_norm": 11.75,
      "learning_rate": 1.1794554634314558e-09,
      "loss": 5.5883,
      "step": 2490
    },
    {
      "epoch": 1.0,
      "grad_norm": 11.125,
      "learning_rate": 9.74775584916543e-12,
      "loss": 5.4554,
      "step": 2500
    }
  ],
  "logging_steps": 10,
  "max_steps": 2500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 9223372036854775807,
  "save_steps": 2500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.3638567985152e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}