|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 19.91111111111111, |
|
"eval_steps": 500, |
|
"global_step": 896, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.2222222222222222, |
|
"grad_norm": 3.4277842044830322, |
|
"learning_rate": 4.4444444444444447e-05, |
|
"loss": 1.3797, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 2.029878854751587, |
|
"learning_rate": 8.888888888888889e-05, |
|
"loss": 0.424, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 1.6295349597930908, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 0.2302, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 0.916685163974762, |
|
"learning_rate": 0.00017777777777777779, |
|
"loss": 0.1664, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.1111111111111112, |
|
"grad_norm": 0.7659821510314941, |
|
"learning_rate": 0.000199982965150241, |
|
"loss": 0.1398, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 1.0613561868667603, |
|
"learning_rate": 0.00019984672117252423, |
|
"loss": 0.1207, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.5555555555555556, |
|
"grad_norm": 0.7991588711738586, |
|
"learning_rate": 0.00019957441887293156, |
|
"loss": 0.1119, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.7777777777777777, |
|
"grad_norm": 0.6806069016456604, |
|
"learning_rate": 0.00019916642931015662, |
|
"loss": 0.0885, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.6726401448249817, |
|
"learning_rate": 0.00019862330844011466, |
|
"loss": 0.0849, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"grad_norm": 0.44822949171066284, |
|
"learning_rate": 0.00019794579635835704, |
|
"loss": 0.0721, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.4444444444444446, |
|
"grad_norm": 0.40653809905052185, |
|
"learning_rate": 0.0001971348162915637, |
|
"loss": 0.0729, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 0.40992143750190735, |
|
"learning_rate": 0.00019619147333948823, |
|
"loss": 0.0666, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.888888888888889, |
|
"grad_norm": 0.6051583290100098, |
|
"learning_rate": 0.00019511705296906945, |
|
"loss": 0.0606, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 3.111111111111111, |
|
"grad_norm": 0.46464696526527405, |
|
"learning_rate": 0.00019391301926276156, |
|
"loss": 0.0631, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 0.5700137615203857, |
|
"learning_rate": 0.00019258101292347042, |
|
"loss": 0.0594, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 3.5555555555555554, |
|
"grad_norm": 0.5737802386283875, |
|
"learning_rate": 0.0001911228490388136, |
|
"loss": 0.0609, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.7777777777777777, |
|
"grad_norm": 0.4904007315635681, |
|
"learning_rate": 0.0001895405146077514, |
|
"loss": 0.0539, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.4444526731967926, |
|
"learning_rate": 0.00018783616583295943, |
|
"loss": 0.0514, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 4.222222222222222, |
|
"grad_norm": 0.3706776201725006, |
|
"learning_rate": 0.00018601212518263156, |
|
"loss": 0.0495, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 4.444444444444445, |
|
"grad_norm": 0.29509109258651733, |
|
"learning_rate": 0.00018407087822571794, |
|
"loss": 0.0477, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 4.666666666666667, |
|
"grad_norm": 0.4063428044319153, |
|
"learning_rate": 0.00018201507024490988, |
|
"loss": 0.0485, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 4.888888888888889, |
|
"grad_norm": 0.4438524544239044, |
|
"learning_rate": 0.0001798475026319875, |
|
"loss": 0.046, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 5.111111111111111, |
|
"grad_norm": 0.26472207903862, |
|
"learning_rate": 0.000177571129070442, |
|
"loss": 0.0452, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 5.333333333333333, |
|
"grad_norm": 0.3637102246284485, |
|
"learning_rate": 0.0001751890515105738, |
|
"loss": 0.0471, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 5.555555555555555, |
|
"grad_norm": 0.2891571819782257, |
|
"learning_rate": 0.00017270451594255233, |
|
"loss": 0.0436, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 5.777777777777778, |
|
"grad_norm": 0.380941778421402, |
|
"learning_rate": 0.00017012090797319628, |
|
"loss": 0.0439, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.573648989200592, |
|
"learning_rate": 0.00016744174821250237, |
|
"loss": 0.0434, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 6.222222222222222, |
|
"grad_norm": 0.34533384442329407, |
|
"learning_rate": 0.0001646706874762089, |
|
"loss": 0.0369, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 6.444444444444445, |
|
"grad_norm": 0.5272351503372192, |
|
"learning_rate": 0.0001618115018109318, |
|
"loss": 0.0403, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 6.666666666666667, |
|
"grad_norm": 0.2849768400192261, |
|
"learning_rate": 0.00015886808734865202, |
|
"loss": 0.0388, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 6.888888888888889, |
|
"grad_norm": 0.301411896944046, |
|
"learning_rate": 0.00015584445499756578, |
|
"loss": 0.0405, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 7.111111111111111, |
|
"grad_norm": 0.32372748851776123, |
|
"learning_rate": 0.0001527447249765329, |
|
"loss": 0.0362, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 7.333333333333333, |
|
"grad_norm": 0.26995155215263367, |
|
"learning_rate": 0.00014957312120057005, |
|
"loss": 0.0335, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 7.555555555555555, |
|
"grad_norm": 0.35481250286102295, |
|
"learning_rate": 0.00014633396552504063, |
|
"loss": 0.0424, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 7.777777777777778, |
|
"grad_norm": 0.3813558518886566, |
|
"learning_rate": 0.00014303167185638366, |
|
"loss": 0.0378, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.3790789842605591, |
|
"learning_rate": 0.0001396707401374078, |
|
"loss": 0.0403, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 8.222222222222221, |
|
"grad_norm": 0.28233516216278076, |
|
"learning_rate": 0.00013625575021534536, |
|
"loss": 0.0304, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 8.444444444444445, |
|
"grad_norm": 0.4000020921230316, |
|
"learning_rate": 0.00013279135560102337, |
|
"loss": 0.033, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 8.666666666666666, |
|
"grad_norm": 0.29253950715065, |
|
"learning_rate": 0.00012928227712765504, |
|
"loss": 0.033, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 8.88888888888889, |
|
"grad_norm": 0.33955538272857666, |
|
"learning_rate": 0.00012573329651789297, |
|
"loss": 0.0338, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 9.11111111111111, |
|
"grad_norm": 0.3720948398113251, |
|
"learning_rate": 0.00012214924986791003, |
|
"loss": 0.0353, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 9.333333333333334, |
|
"grad_norm": 0.2159433513879776, |
|
"learning_rate": 0.00011853502105738692, |
|
"loss": 0.0328, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 9.555555555555555, |
|
"grad_norm": 0.20489104092121124, |
|
"learning_rate": 0.00011489553509438657, |
|
"loss": 0.0351, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 9.777777777777779, |
|
"grad_norm": 0.19427303969860077, |
|
"learning_rate": 0.00011123575140418414, |
|
"loss": 0.0302, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.2984829545021057, |
|
"learning_rate": 0.00010756065707119729, |
|
"loss": 0.0285, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 10.222222222222221, |
|
"grad_norm": 0.19880931079387665, |
|
"learning_rate": 0.0001038752600432265, |
|
"loss": 0.0283, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 10.444444444444445, |
|
"grad_norm": 0.30339422821998596, |
|
"learning_rate": 0.00010018458230726523, |
|
"loss": 0.0268, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 10.666666666666666, |
|
"grad_norm": 0.20127274096012115, |
|
"learning_rate": 9.649365304617952e-05, |
|
"loss": 0.031, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 10.88888888888889, |
|
"grad_norm": 0.2175353467464447, |
|
"learning_rate": 9.280750178558138e-05, |
|
"loss": 0.0259, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 11.11111111111111, |
|
"grad_norm": 0.26974818110466003, |
|
"learning_rate": 8.913115154023605e-05, |
|
"loss": 0.0306, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 11.333333333333334, |
|
"grad_norm": 0.4030226767063141, |
|
"learning_rate": 8.546961196934043e-05, |
|
"loss": 0.0248, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 11.555555555555555, |
|
"grad_norm": 0.23470337688922882, |
|
"learning_rate": 8.182787255000155e-05, |
|
"loss": 0.0262, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 11.777777777777779, |
|
"grad_norm": 0.40099358558654785, |
|
"learning_rate": 7.82108957782161e-05, |
|
"loss": 0.0259, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 0.17941424250602722, |
|
"learning_rate": 7.462361040661667e-05, |
|
"loss": 0.0299, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 12.222222222222221, |
|
"grad_norm": 0.24401065707206726, |
|
"learning_rate": 7.107090472819896e-05, |
|
"loss": 0.0231, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 12.444444444444445, |
|
"grad_norm": 0.26819756627082825, |
|
"learning_rate": 6.755761991518219e-05, |
|
"loss": 0.0262, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 12.666666666666666, |
|
"grad_norm": 0.300659716129303, |
|
"learning_rate": 6.408854342207982e-05, |
|
"loss": 0.0289, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 12.88888888888889, |
|
"grad_norm": 0.2212854027748108, |
|
"learning_rate": 6.0668402461969807e-05, |
|
"loss": 0.0243, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 13.11111111111111, |
|
"grad_norm": 0.28721633553504944, |
|
"learning_rate": 5.730185756485395e-05, |
|
"loss": 0.0256, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 13.333333333333334, |
|
"grad_norm": 0.2350863218307495, |
|
"learning_rate": 5.399349622688479e-05, |
|
"loss": 0.0261, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 13.555555555555555, |
|
"grad_norm": 0.31170952320098877, |
|
"learning_rate": 5.074782665911341e-05, |
|
"loss": 0.0247, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 13.777777777777779, |
|
"grad_norm": 0.18656305968761444, |
|
"learning_rate": 4.756927164427685e-05, |
|
"loss": 0.0233, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 0.2578571140766144, |
|
"learning_rate": 4.446216250999641e-05, |
|
"loss": 0.0257, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 14.222222222222221, |
|
"grad_norm": 0.16936838626861572, |
|
"learning_rate": 4.1430733226599114e-05, |
|
"loss": 0.0212, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 14.444444444444445, |
|
"grad_norm": 0.15445557236671448, |
|
"learning_rate": 3.8479114637605285e-05, |
|
"loss": 0.0246, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 14.666666666666666, |
|
"grad_norm": 0.1869278848171234, |
|
"learning_rate": 3.561132883074427e-05, |
|
"loss": 0.0232, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 14.88888888888889, |
|
"grad_norm": 0.20103834569454193, |
|
"learning_rate": 3.2831283657168275e-05, |
|
"loss": 0.0226, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 15.11111111111111, |
|
"grad_norm": 0.2504471242427826, |
|
"learning_rate": 3.0142767406333518e-05, |
|
"loss": 0.0184, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 15.333333333333334, |
|
"grad_norm": 0.18746283650398254, |
|
"learning_rate": 2.7549443643804585e-05, |
|
"loss": 0.019, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 15.555555555555555, |
|
"grad_norm": 0.25808605551719666, |
|
"learning_rate": 2.505484621901655e-05, |
|
"loss": 0.0181, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 15.777777777777779, |
|
"grad_norm": 0.2831310033798218, |
|
"learning_rate": 2.2662374449797664e-05, |
|
"loss": 0.022, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 0.18062320351600647, |
|
"learning_rate": 2.0375288490214404e-05, |
|
"loss": 0.0205, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 16.22222222222222, |
|
"grad_norm": 0.15133315324783325, |
|
"learning_rate": 1.819670488805111e-05, |
|
"loss": 0.0198, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 16.444444444444443, |
|
"grad_norm": 0.20342513918876648, |
|
"learning_rate": 1.6129592337977995e-05, |
|
"loss": 0.02, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 16.666666666666668, |
|
"grad_norm": 0.21008381247520447, |
|
"learning_rate": 1.4176767636194122e-05, |
|
"loss": 0.0232, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 16.88888888888889, |
|
"grad_norm": 0.25737613439559937, |
|
"learning_rate": 1.234089184205851e-05, |
|
"loss": 0.0227, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 17.11111111111111, |
|
"grad_norm": 0.15780089795589447, |
|
"learning_rate": 1.0624466651939247e-05, |
|
"loss": 0.0203, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 17.333333333333332, |
|
"grad_norm": 0.16062302887439728, |
|
"learning_rate": 9.029830990222132e-06, |
|
"loss": 0.0244, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 17.555555555555557, |
|
"grad_norm": 0.18100588023662567, |
|
"learning_rate": 7.55915782212413e-06, |
|
"loss": 0.0173, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 17.77777777777778, |
|
"grad_norm": 0.20718730986118317, |
|
"learning_rate": 6.214451192654747e-06, |
|
"loss": 0.0186, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"grad_norm": 0.18695631623268127, |
|
"learning_rate": 4.9975434957601264e-06, |
|
"loss": 0.0177, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 18.22222222222222, |
|
"grad_norm": 0.1640794426202774, |
|
"learning_rate": 3.910092977371394e-06, |
|
"loss": 0.0169, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 18.444444444444443, |
|
"grad_norm": 0.18873189389705658, |
|
"learning_rate": 2.953581475759404e-06, |
|
"loss": 0.0214, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 18.666666666666668, |
|
"grad_norm": 0.1896572709083557, |
|
"learning_rate": 2.1293124022754407e-06, |
|
"loss": 0.0173, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 18.88888888888889, |
|
"grad_norm": 0.09888505190610886, |
|
"learning_rate": 1.4384089652291543e-06, |
|
"loss": 0.0185, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 19.11111111111111, |
|
"grad_norm": 0.17348803579807281, |
|
"learning_rate": 8.818126393241643e-07, |
|
"loss": 0.0182, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 19.333333333333332, |
|
"grad_norm": 0.20844842493534088, |
|
"learning_rate": 4.602818827369126e-07, |
|
"loss": 0.0217, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 19.555555555555557, |
|
"grad_norm": 0.16763387620449066, |
|
"learning_rate": 1.7439110358704602e-07, |
|
"loss": 0.0175, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 19.77777777777778, |
|
"grad_norm": 0.13927114009857178, |
|
"learning_rate": 2.4529877207557505e-08, |
|
"loss": 0.0221, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 19.91111111111111, |
|
"step": 896, |
|
"total_flos": 6.431855929189344e+16, |
|
"train_loss": 0.060006462794262916, |
|
"train_runtime": 634.4726, |
|
"train_samples_per_second": 90.381, |
|
"train_steps_per_second": 1.412 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 896, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.431855929189344e+16, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|