|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 870, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.11494252873563218, |
|
"grad_norm": 4.644741535186768, |
|
"learning_rate": 2.272727272727273e-05, |
|
"loss": 1.1688, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.22988505747126436, |
|
"grad_norm": 2.3340258598327637, |
|
"learning_rate": 4.545454545454546e-05, |
|
"loss": 0.5977, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.3448275862068966, |
|
"grad_norm": 1.2856472730636597, |
|
"learning_rate": 6.818181818181818e-05, |
|
"loss": 0.4087, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.45977011494252873, |
|
"grad_norm": 0.7791835069656372, |
|
"learning_rate": 9.090909090909092e-05, |
|
"loss": 0.3196, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.5747126436781609, |
|
"grad_norm": 1.9682774543762207, |
|
"learning_rate": 9.998698142908953e-05, |
|
"loss": 0.2605, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.6896551724137931, |
|
"grad_norm": 1.491060495376587, |
|
"learning_rate": 9.990744804507315e-05, |
|
"loss": 0.2392, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.8045977011494253, |
|
"grad_norm": 1.1608132123947144, |
|
"learning_rate": 9.975572871372513e-05, |
|
"loss": 0.2292, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.9195402298850575, |
|
"grad_norm": 0.7938467264175415, |
|
"learning_rate": 9.953204288132234e-05, |
|
"loss": 0.208, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.0344827586206897, |
|
"grad_norm": 1.011293888092041, |
|
"learning_rate": 9.923671408622129e-05, |
|
"loss": 0.1914, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.1494252873563218, |
|
"grad_norm": 0.6203814148902893, |
|
"learning_rate": 9.887016949089333e-05, |
|
"loss": 0.1906, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.264367816091954, |
|
"grad_norm": 0.8494653701782227, |
|
"learning_rate": 9.843293926407866e-05, |
|
"loss": 0.1684, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.3793103448275863, |
|
"grad_norm": 2.562549114227295, |
|
"learning_rate": 9.7925655813952e-05, |
|
"loss": 0.1627, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.4942528735632183, |
|
"grad_norm": 0.9271527528762817, |
|
"learning_rate": 9.734905287340985e-05, |
|
"loss": 0.16, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.6091954022988506, |
|
"grad_norm": 0.7429759502410889, |
|
"learning_rate": 9.670396443880208e-05, |
|
"loss": 0.1459, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.7241379310344827, |
|
"grad_norm": 2.3561415672302246, |
|
"learning_rate": 9.599132356364247e-05, |
|
"loss": 0.1455, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.839080459770115, |
|
"grad_norm": 0.823359489440918, |
|
"learning_rate": 9.521216100904378e-05, |
|
"loss": 0.1416, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.9540229885057472, |
|
"grad_norm": 0.8451648950576782, |
|
"learning_rate": 9.436760375282859e-05, |
|
"loss": 0.1354, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.0689655172413794, |
|
"grad_norm": 0.40135863423347473, |
|
"learning_rate": 9.345887335947281e-05, |
|
"loss": 0.1297, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.1839080459770113, |
|
"grad_norm": 0.5253118276596069, |
|
"learning_rate": 9.248728421323941e-05, |
|
"loss": 0.1357, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.2988505747126435, |
|
"grad_norm": 0.6746734976768494, |
|
"learning_rate": 9.145424161705776e-05, |
|
"loss": 0.1215, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.413793103448276, |
|
"grad_norm": 0.567961573600769, |
|
"learning_rate": 9.036123975989892e-05, |
|
"loss": 0.1223, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.528735632183908, |
|
"grad_norm": 0.8848130702972412, |
|
"learning_rate": 8.9209859555586e-05, |
|
"loss": 0.1035, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.6436781609195403, |
|
"grad_norm": 0.39595693349838257, |
|
"learning_rate": 8.800176635616657e-05, |
|
"loss": 0.1059, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.7586206896551726, |
|
"grad_norm": 0.8946174383163452, |
|
"learning_rate": 8.673870754315336e-05, |
|
"loss": 0.1128, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.873563218390805, |
|
"grad_norm": 0.3929500877857208, |
|
"learning_rate": 8.54225100001184e-05, |
|
"loss": 0.0929, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.9885057471264367, |
|
"grad_norm": 0.45673438906669617, |
|
"learning_rate": 8.405507747029523e-05, |
|
"loss": 0.0947, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 3.103448275862069, |
|
"grad_norm": 0.6119157671928406, |
|
"learning_rate": 8.263838780301182e-05, |
|
"loss": 0.0906, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 3.218390804597701, |
|
"grad_norm": 0.39985448122024536, |
|
"learning_rate": 8.117449009293668e-05, |
|
"loss": 0.0943, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 0.5258167386054993, |
|
"learning_rate": 7.966550171627592e-05, |
|
"loss": 0.0979, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 3.4482758620689653, |
|
"grad_norm": 0.37616410851478577, |
|
"learning_rate": 7.81136052682082e-05, |
|
"loss": 0.0948, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 3.5632183908045976, |
|
"grad_norm": 0.5887285470962524, |
|
"learning_rate": 7.652104540598712e-05, |
|
"loss": 0.0996, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 3.67816091954023, |
|
"grad_norm": 0.6064506769180298, |
|
"learning_rate": 7.489012560227742e-05, |
|
"loss": 0.0915, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 3.793103448275862, |
|
"grad_norm": 3.263113021850586, |
|
"learning_rate": 7.322320481342054e-05, |
|
"loss": 0.0906, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 3.9080459770114944, |
|
"grad_norm": 0.49090155959129333, |
|
"learning_rate": 7.152269406744903e-05, |
|
"loss": 0.0784, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 4.022988505747127, |
|
"grad_norm": 0.6040999889373779, |
|
"learning_rate": 6.979105297678462e-05, |
|
"loss": 0.0871, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 4.137931034482759, |
|
"grad_norm": 0.4672296941280365, |
|
"learning_rate": 6.803078618066378e-05, |
|
"loss": 0.0751, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 4.252873563218391, |
|
"grad_norm": 2.379974365234375, |
|
"learning_rate": 6.624443972243698e-05, |
|
"loss": 0.0778, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 4.3678160919540225, |
|
"grad_norm": 1.2806555032730103, |
|
"learning_rate": 6.443459736698105e-05, |
|
"loss": 0.0763, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 4.482758620689655, |
|
"grad_norm": 0.937697172164917, |
|
"learning_rate": 6.260387686355121e-05, |
|
"loss": 0.0752, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 4.597701149425287, |
|
"grad_norm": 0.45729660987854004, |
|
"learning_rate": 6.075492615947823e-05, |
|
"loss": 0.0844, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 4.712643678160919, |
|
"grad_norm": 0.9074535369873047, |
|
"learning_rate": 5.889041957018745e-05, |
|
"loss": 0.0743, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 4.827586206896552, |
|
"grad_norm": 0.4833395779132843, |
|
"learning_rate": 5.7013053911078677e-05, |
|
"loss": 0.0693, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 4.942528735632184, |
|
"grad_norm": 0.7885887026786804, |
|
"learning_rate": 5.51255445968625e-05, |
|
"loss": 0.0785, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 5.057471264367816, |
|
"grad_norm": 0.4227044880390167, |
|
"learning_rate": 5.32306217139946e-05, |
|
"loss": 0.1054, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 5.172413793103448, |
|
"grad_norm": 0.6370951533317566, |
|
"learning_rate": 5.133102607188874e-05, |
|
"loss": 0.0729, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 5.287356321839081, |
|
"grad_norm": 0.6800243258476257, |
|
"learning_rate": 4.942950523862033e-05, |
|
"loss": 0.0723, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 5.402298850574713, |
|
"grad_norm": 0.4485301375389099, |
|
"learning_rate": 4.752880956685407e-05, |
|
"loss": 0.0817, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 5.517241379310345, |
|
"grad_norm": 0.583954393863678, |
|
"learning_rate": 4.56316882157442e-05, |
|
"loss": 0.067, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 5.6321839080459775, |
|
"grad_norm": 0.4566665589809418, |
|
"learning_rate": 4.3740885174560736e-05, |
|
"loss": 0.0609, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 5.747126436781609, |
|
"grad_norm": 0.6292296051979065, |
|
"learning_rate": 4.185913529379381e-05, |
|
"loss": 0.0684, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 5.862068965517241, |
|
"grad_norm": 6.967523097991943, |
|
"learning_rate": 3.998916032947594e-05, |
|
"loss": 0.0744, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 5.977011494252873, |
|
"grad_norm": 0.7225087881088257, |
|
"learning_rate": 3.8133665006444255e-05, |
|
"loss": 0.0623, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 6.091954022988506, |
|
"grad_norm": 0.35148298740386963, |
|
"learning_rate": 3.629533310623658e-05, |
|
"loss": 0.0744, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 6.206896551724138, |
|
"grad_norm": 0.35271674394607544, |
|
"learning_rate": 3.447682358527974e-05, |
|
"loss": 0.0615, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 6.32183908045977, |
|
"grad_norm": 0.47165122628211975, |
|
"learning_rate": 3.268076672898492e-05, |
|
"loss": 0.0657, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 6.436781609195402, |
|
"grad_norm": 0.7261269092559814, |
|
"learning_rate": 3.090976034731257e-05, |
|
"loss": 0.0666, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 6.551724137931035, |
|
"grad_norm": 0.30101433396339417, |
|
"learning_rate": 2.91663660173098e-05, |
|
"loss": 0.0618, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 6.666666666666667, |
|
"grad_norm": 1.2878450155258179, |
|
"learning_rate": 2.745310537805479e-05, |
|
"loss": 0.0549, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 6.781609195402299, |
|
"grad_norm": 0.7477713227272034, |
|
"learning_rate": 2.5772456483367497e-05, |
|
"loss": 0.0673, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 6.896551724137931, |
|
"grad_norm": 0.9708021283149719, |
|
"learning_rate": 2.4126850217561698e-05, |
|
"loss": 0.0519, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 7.011494252873563, |
|
"grad_norm": 0.3879060447216034, |
|
"learning_rate": 2.2518666779423074e-05, |
|
"loss": 0.0559, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 7.126436781609195, |
|
"grad_norm": 0.7332598567008972, |
|
"learning_rate": 2.0950232239498446e-05, |
|
"loss": 0.0557, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 7.241379310344827, |
|
"grad_norm": 0.5789719223976135, |
|
"learning_rate": 1.9423815175676025e-05, |
|
"loss": 0.0547, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 7.35632183908046, |
|
"grad_norm": 0.7040994763374329, |
|
"learning_rate": 1.7941623391922772e-05, |
|
"loss": 0.0562, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 7.471264367816092, |
|
"grad_norm": 0.37133750319480896, |
|
"learning_rate": 1.650580072492496e-05, |
|
"loss": 0.0604, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 7.586206896551724, |
|
"grad_norm": 0.3775554895401001, |
|
"learning_rate": 1.5118423943250771e-05, |
|
"loss": 0.0526, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 7.7011494252873565, |
|
"grad_norm": 0.46704328060150146, |
|
"learning_rate": 1.378149974351991e-05, |
|
"loss": 0.0507, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 7.816091954022989, |
|
"grad_norm": 0.26738837361335754, |
|
"learning_rate": 1.2496961847925153e-05, |
|
"loss": 0.0534, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 7.931034482758621, |
|
"grad_norm": 1.4386327266693115, |
|
"learning_rate": 1.126666820730366e-05, |
|
"loss": 0.0513, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 8.045977011494253, |
|
"grad_norm": 0.4603922367095947, |
|
"learning_rate": 1.0092398313803863e-05, |
|
"loss": 0.0626, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 8.160919540229886, |
|
"grad_norm": 0.49872225522994995, |
|
"learning_rate": 8.975850627034604e-06, |
|
"loss": 0.0553, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 8.275862068965518, |
|
"grad_norm": 0.8791252374649048, |
|
"learning_rate": 7.918640117419507e-06, |
|
"loss": 0.0438, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 8.39080459770115, |
|
"grad_norm": 0.621422290802002, |
|
"learning_rate": 6.922295930309691e-06, |
|
"loss": 0.0539, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 8.505747126436782, |
|
"grad_norm": 1.4802906513214111, |
|
"learning_rate": 5.988259174233713e-06, |
|
"loss": 0.0514, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 8.620689655172415, |
|
"grad_norm": 1.4139326810836792, |
|
"learning_rate": 5.117880836483452e-06, |
|
"loss": 0.0593, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 8.735632183908045, |
|
"grad_norm": 0.7812165021896362, |
|
"learning_rate": 4.312419829051173e-06, |
|
"loss": 0.0468, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 8.850574712643677, |
|
"grad_norm": 0.4393046200275421, |
|
"learning_rate": 3.5730411677439125e-06, |
|
"loss": 0.0553, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 8.96551724137931, |
|
"grad_norm": 1.7484021186828613, |
|
"learning_rate": 2.9008142871088663e-06, |
|
"loss": 0.0504, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 9.080459770114942, |
|
"grad_norm": 0.6218745708465576, |
|
"learning_rate": 2.296711493607334e-06, |
|
"loss": 0.0542, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 9.195402298850574, |
|
"grad_norm": 0.4588974118232727, |
|
"learning_rate": 1.7616065592742038e-06, |
|
"loss": 0.0554, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 9.310344827586206, |
|
"grad_norm": 1.559602975845337, |
|
"learning_rate": 1.2962734578973568e-06, |
|
"loss": 0.0484, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 9.425287356321839, |
|
"grad_norm": 0.2676626145839691, |
|
"learning_rate": 9.013852455448335e-07, |
|
"loss": 0.0485, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 9.540229885057471, |
|
"grad_norm": 0.4192800223827362, |
|
"learning_rate": 5.775130870590783e-07, |
|
"loss": 0.0488, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 9.655172413793103, |
|
"grad_norm": 0.7301256656646729, |
|
"learning_rate": 3.251254299261874e-07, |
|
"loss": 0.0531, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 9.770114942528735, |
|
"grad_norm": 0.2674349844455719, |
|
"learning_rate": 1.4458732671523977e-07, |
|
"loss": 0.0504, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 9.885057471264368, |
|
"grad_norm": 0.4260900318622589, |
|
"learning_rate": 3.6159907067601085e-08, |
|
"loss": 0.0504, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 1.2096983194351196, |
|
"learning_rate": 0.0, |
|
"loss": 0.0608, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 870, |
|
"total_flos": 9.390719321209728e+16, |
|
"train_loss": 0.11421718011642325, |
|
"train_runtime": 878.5225, |
|
"train_samples_per_second": 48.217, |
|
"train_steps_per_second": 0.99 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 870, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.390719321209728e+16, |
|
"train_batch_size": 49, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|