|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 566, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0088339222614841, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 8.620689655172414e-06, |
|
"loss": 0.145, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0176678445229682, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 1.7241379310344828e-05, |
|
"loss": 0.1351, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.026501766784452298, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 2.5862068965517244e-05, |
|
"loss": 0.1249, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0353356890459364, |
|
"grad_norm": 0.12109375, |
|
"learning_rate": 3.4482758620689657e-05, |
|
"loss": 0.1189, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.044169611307420496, |
|
"grad_norm": 0.09619140625, |
|
"learning_rate": 4.3103448275862066e-05, |
|
"loss": 0.1181, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.053003533568904596, |
|
"grad_norm": 0.0908203125, |
|
"learning_rate": 4.999961496300632e-05, |
|
"loss": 0.1127, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.061837455830388695, |
|
"grad_norm": 0.0830078125, |
|
"learning_rate": 4.9986140051876094e-05, |
|
"loss": 0.1102, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0706713780918728, |
|
"grad_norm": 0.08447265625, |
|
"learning_rate": 4.995342646712217e-05, |
|
"loss": 0.1093, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.07950530035335689, |
|
"grad_norm": 0.07958984375, |
|
"learning_rate": 4.9901502197807084e-05, |
|
"loss": 0.109, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.08833922261484099, |
|
"grad_norm": 0.07958984375, |
|
"learning_rate": 4.9830411669255416e-05, |
|
"loss": 0.107, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.09717314487632508, |
|
"grad_norm": 0.07763671875, |
|
"learning_rate": 4.974021570504443e-05, |
|
"loss": 0.1041, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.10600706713780919, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 4.963099147496465e-05, |
|
"loss": 0.1029, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.11484098939929328, |
|
"grad_norm": 0.0810546875, |
|
"learning_rate": 4.9502832428995005e-05, |
|
"loss": 0.1047, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.12367491166077739, |
|
"grad_norm": 0.0771484375, |
|
"learning_rate": 4.935584821734901e-05, |
|
"loss": 0.1044, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.13250883392226148, |
|
"grad_norm": 0.07568359375, |
|
"learning_rate": 4.919016459666026e-05, |
|
"loss": 0.1063, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.1413427561837456, |
|
"grad_norm": 0.0830078125, |
|
"learning_rate": 4.9005923322387706e-05, |
|
"loss": 0.1025, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.1501766784452297, |
|
"grad_norm": 0.0859375, |
|
"learning_rate": 4.880328202753264e-05, |
|
"loss": 0.1022, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.15901060070671377, |
|
"grad_norm": 0.0791015625, |
|
"learning_rate": 4.858241408777117e-05, |
|
"loss": 0.1031, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.16784452296819788, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 4.834350847311758e-05, |
|
"loss": 0.1016, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.17667844522968199, |
|
"grad_norm": 0.076171875, |
|
"learning_rate": 4.8086769586245554e-05, |
|
"loss": 0.1008, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1855123674911661, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 4.7812417087605456e-05, |
|
"loss": 0.1043, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.19434628975265017, |
|
"grad_norm": 0.07275390625, |
|
"learning_rate": 4.752068570748746e-05, |
|
"loss": 0.1038, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.20318021201413428, |
|
"grad_norm": 0.078125, |
|
"learning_rate": 4.721182504519118e-05, |
|
"loss": 0.1014, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.21201413427561838, |
|
"grad_norm": 0.0712890625, |
|
"learning_rate": 4.688609935547371e-05, |
|
"loss": 0.1004, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.22084805653710246, |
|
"grad_norm": 0.07177734375, |
|
"learning_rate": 4.654378732245869e-05, |
|
"loss": 0.1017, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.22968197879858657, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 4.618518182120011e-05, |
|
"loss": 0.1006, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.23851590106007067, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 4.5810589667104347e-05, |
|
"loss": 0.1008, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.24734982332155478, |
|
"grad_norm": 0.08447265625, |
|
"learning_rate": 4.542033135342537e-05, |
|
"loss": 0.1012, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.25618374558303886, |
|
"grad_norm": 0.07275390625, |
|
"learning_rate": 4.5014740777057405e-05, |
|
"loss": 0.1026, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.26501766784452296, |
|
"grad_norm": 0.07861328125, |
|
"learning_rate": 4.45941649528596e-05, |
|
"loss": 0.1017, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.27385159010600707, |
|
"grad_norm": 0.0771484375, |
|
"learning_rate": 4.4158963716757444e-05, |
|
"loss": 0.099, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.2826855123674912, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 4.370950941787456e-05, |
|
"loss": 0.1021, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.2915194346289753, |
|
"grad_norm": 0.0712890625, |
|
"learning_rate": 4.324618659995855e-05, |
|
"loss": 0.1006, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.3003533568904594, |
|
"grad_norm": 0.076171875, |
|
"learning_rate": 4.27693916723734e-05, |
|
"loss": 0.1006, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.30918727915194344, |
|
"grad_norm": 0.07275390625, |
|
"learning_rate": 4.227953257093985e-05, |
|
"loss": 0.0995, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.31802120141342755, |
|
"grad_norm": 0.07763671875, |
|
"learning_rate": 4.1777028408913985e-05, |
|
"loss": 0.1005, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.32685512367491165, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 4.126230911840269e-05, |
|
"loss": 0.0995, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.33568904593639576, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 4.07358150825226e-05, |
|
"loss": 0.0984, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.34452296819787986, |
|
"grad_norm": 0.07568359375, |
|
"learning_rate": 4.0197996758617594e-05, |
|
"loss": 0.0979, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.35335689045936397, |
|
"grad_norm": 0.076171875, |
|
"learning_rate": 3.964931429285675e-05, |
|
"loss": 0.0998, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3621908127208481, |
|
"grad_norm": 0.07958984375, |
|
"learning_rate": 3.909023712654291e-05, |
|
"loss": 0.1012, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.3710247349823322, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 3.852124359446845e-05, |
|
"loss": 0.0987, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.37985865724381623, |
|
"grad_norm": 0.0732421875, |
|
"learning_rate": 3.794282051566199e-05, |
|
"loss": 0.0982, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.38869257950530034, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 3.7355462776876184e-05, |
|
"loss": 0.0984, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.39752650176678445, |
|
"grad_norm": 0.06982421875, |
|
"learning_rate": 3.6759672909172846e-05, |
|
"loss": 0.0973, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.40636042402826855, |
|
"grad_norm": 0.07080078125, |
|
"learning_rate": 3.615596065796791e-05, |
|
"loss": 0.1007, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.41519434628975266, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 3.554484254690379e-05, |
|
"loss": 0.0972, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.42402826855123676, |
|
"grad_norm": 0.07080078125, |
|
"learning_rate": 3.492684143592252e-05, |
|
"loss": 0.0974, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.43286219081272087, |
|
"grad_norm": 0.076171875, |
|
"learning_rate": 3.4302486073917686e-05, |
|
"loss": 0.0991, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.4416961130742049, |
|
"grad_norm": 0.076171875, |
|
"learning_rate": 3.3672310646347844e-05, |
|
"loss": 0.0979, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.450530035335689, |
|
"grad_norm": 0.07275390625, |
|
"learning_rate": 3.3036854318198575e-05, |
|
"loss": 0.0987, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.45936395759717313, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 3.2396660772684114e-05, |
|
"loss": 0.0999, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.46819787985865724, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 3.1752277746083325e-05, |
|
"loss": 0.0979, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.47703180212014135, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 3.110425655910795e-05, |
|
"loss": 0.0983, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.48586572438162545, |
|
"grad_norm": 0.07080078125, |
|
"learning_rate": 3.045315164520405e-05, |
|
"loss": 0.0981, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.49469964664310956, |
|
"grad_norm": 0.07568359375, |
|
"learning_rate": 2.9799520076190268e-05, |
|
"loss": 0.0987, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5035335689045937, |
|
"grad_norm": 0.07080078125, |
|
"learning_rate": 2.914392108563883e-05, |
|
"loss": 0.0963, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.5123674911660777, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 2.848691559040687e-05, |
|
"loss": 0.0977, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5212014134275619, |
|
"grad_norm": 0.072265625, |
|
"learning_rate": 2.7829065710727682e-05, |
|
"loss": 0.0959, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.5300353356890459, |
|
"grad_norm": 0.07275390625, |
|
"learning_rate": 2.7170934289272327e-05, |
|
"loss": 0.0983, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5388692579505301, |
|
"grad_norm": 0.07177734375, |
|
"learning_rate": 2.6513084409593137e-05, |
|
"loss": 0.0981, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.5477031802120141, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 2.585607891436118e-05, |
|
"loss": 0.0972, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.5565371024734982, |
|
"grad_norm": 0.078125, |
|
"learning_rate": 2.5200479923809738e-05, |
|
"loss": 0.0964, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.5653710247349824, |
|
"grad_norm": 0.06787109375, |
|
"learning_rate": 2.4546848354795954e-05, |
|
"loss": 0.0966, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.5742049469964664, |
|
"grad_norm": 0.068359375, |
|
"learning_rate": 2.3895743440892053e-05, |
|
"loss": 0.0983, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.5830388692579506, |
|
"grad_norm": 0.0712890625, |
|
"learning_rate": 2.3247722253916677e-05, |
|
"loss": 0.0983, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.5918727915194346, |
|
"grad_norm": 0.07177734375, |
|
"learning_rate": 2.2603339227315902e-05, |
|
"loss": 0.0982, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.6007067137809188, |
|
"grad_norm": 0.06787109375, |
|
"learning_rate": 2.1963145681801434e-05, |
|
"loss": 0.0968, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.6095406360424028, |
|
"grad_norm": 0.0712890625, |
|
"learning_rate": 2.132768935365215e-05, |
|
"loss": 0.0976, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.6183745583038869, |
|
"grad_norm": 0.0693359375, |
|
"learning_rate": 2.069751392608232e-05, |
|
"loss": 0.0974, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.627208480565371, |
|
"grad_norm": 0.0712890625, |
|
"learning_rate": 2.0073158564077483e-05, |
|
"loss": 0.0992, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.6360424028268551, |
|
"grad_norm": 0.0703125, |
|
"learning_rate": 1.9455157453096225e-05, |
|
"loss": 0.0992, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.6448763250883393, |
|
"grad_norm": 0.06884765625, |
|
"learning_rate": 1.8844039342032095e-05, |
|
"loss": 0.0961, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.6537102473498233, |
|
"grad_norm": 0.06884765625, |
|
"learning_rate": 1.8240327090827153e-05, |
|
"loss": 0.097, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.6625441696113075, |
|
"grad_norm": 0.068359375, |
|
"learning_rate": 1.764453722312383e-05, |
|
"loss": 0.0979, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.6713780918727915, |
|
"grad_norm": 0.0693359375, |
|
"learning_rate": 1.705717948433801e-05, |
|
"loss": 0.0963, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.6802120141342756, |
|
"grad_norm": 0.06787109375, |
|
"learning_rate": 1.6478756405531564e-05, |
|
"loss": 0.0969, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.6890459363957597, |
|
"grad_norm": 0.0673828125, |
|
"learning_rate": 1.5909762873457096e-05, |
|
"loss": 0.0963, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.6978798586572438, |
|
"grad_norm": 0.06787109375, |
|
"learning_rate": 1.5350685707143258e-05, |
|
"loss": 0.0973, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.7067137809187279, |
|
"grad_norm": 0.06591796875, |
|
"learning_rate": 1.4802003241382406e-05, |
|
"loss": 0.0963, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.715547703180212, |
|
"grad_norm": 0.0673828125, |
|
"learning_rate": 1.4264184917477397e-05, |
|
"loss": 0.0964, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.7243816254416962, |
|
"grad_norm": 0.0673828125, |
|
"learning_rate": 1.3737690881597321e-05, |
|
"loss": 0.0981, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.7332155477031802, |
|
"grad_norm": 0.068359375, |
|
"learning_rate": 1.3222971591086014e-05, |
|
"loss": 0.0977, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.7420494699646644, |
|
"grad_norm": 0.06689453125, |
|
"learning_rate": 1.2720467429060156e-05, |
|
"loss": 0.0975, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.7508833922261484, |
|
"grad_norm": 0.06787109375, |
|
"learning_rate": 1.2230608327626608e-05, |
|
"loss": 0.0978, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.7597173144876325, |
|
"grad_norm": 0.0712890625, |
|
"learning_rate": 1.1753813400041453e-05, |
|
"loss": 0.0954, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.7685512367491166, |
|
"grad_norm": 0.06640625, |
|
"learning_rate": 1.1290490582125454e-05, |
|
"loss": 0.0952, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.7773851590106007, |
|
"grad_norm": 0.068359375, |
|
"learning_rate": 1.0841036283242558e-05, |
|
"loss": 0.0971, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.7862190812720848, |
|
"grad_norm": 0.068359375, |
|
"learning_rate": 1.0405835047140401e-05, |
|
"loss": 0.0982, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.7950530035335689, |
|
"grad_norm": 0.06689453125, |
|
"learning_rate": 9.985259222942602e-06, |
|
"loss": 0.0952, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.803886925795053, |
|
"grad_norm": 0.0703125, |
|
"learning_rate": 9.57966864657463e-06, |
|
"loss": 0.0978, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.8127208480565371, |
|
"grad_norm": 0.06982421875, |
|
"learning_rate": 9.189410332895662e-06, |
|
"loss": 0.0989, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.8215547703180212, |
|
"grad_norm": 0.06787109375, |
|
"learning_rate": 8.814818178799892e-06, |
|
"loss": 0.0981, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.8303886925795053, |
|
"grad_norm": 0.06591796875, |
|
"learning_rate": 8.456212677541312e-06, |
|
"loss": 0.0945, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.8392226148409894, |
|
"grad_norm": 0.0673828125, |
|
"learning_rate": 8.113900644526301e-06, |
|
"loss": 0.0988, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.8480565371024735, |
|
"grad_norm": 0.06689453125, |
|
"learning_rate": 7.788174954808826e-06, |
|
"loss": 0.0973, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.8568904593639576, |
|
"grad_norm": 0.06689453125, |
|
"learning_rate": 7.479314292512542e-06, |
|
"loss": 0.0972, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.8657243816254417, |
|
"grad_norm": 0.06689453125, |
|
"learning_rate": 7.187582912394548e-06, |
|
"loss": 0.0977, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.8745583038869258, |
|
"grad_norm": 0.06591796875, |
|
"learning_rate": 6.913230413754452e-06, |
|
"loss": 0.0962, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.8833922261484098, |
|
"grad_norm": 0.06982421875, |
|
"learning_rate": 6.656491526882422e-06, |
|
"loss": 0.0958, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.892226148409894, |
|
"grad_norm": 0.06494140625, |
|
"learning_rate": 6.417585912228833e-06, |
|
"loss": 0.0959, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.901060070671378, |
|
"grad_norm": 0.0703125, |
|
"learning_rate": 6.196717972467361e-06, |
|
"loss": 0.0978, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.9098939929328622, |
|
"grad_norm": 0.0673828125, |
|
"learning_rate": 5.994076677612297e-06, |
|
"loss": 0.0982, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.9187279151943463, |
|
"grad_norm": 0.06640625, |
|
"learning_rate": 5.809835403339747e-06, |
|
"loss": 0.0971, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.9275618374558304, |
|
"grad_norm": 0.06787109375, |
|
"learning_rate": 5.644151782650993e-06, |
|
"loss": 0.0953, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.9363957597173145, |
|
"grad_norm": 0.06982421875, |
|
"learning_rate": 5.497167571004998e-06, |
|
"loss": 0.0956, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.9452296819787986, |
|
"grad_norm": 0.06689453125, |
|
"learning_rate": 5.36900852503536e-06, |
|
"loss": 0.0963, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.9540636042402827, |
|
"grad_norm": 0.06494140625, |
|
"learning_rate": 5.259784294955576e-06, |
|
"loss": 0.0985, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.9628975265017667, |
|
"grad_norm": 0.06884765625, |
|
"learning_rate": 5.169588330744585e-06, |
|
"loss": 0.0982, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.9717314487632509, |
|
"grad_norm": 0.068359375, |
|
"learning_rate": 5.098497802192923e-06, |
|
"loss": 0.0979, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.980565371024735, |
|
"grad_norm": 0.06591796875, |
|
"learning_rate": 5.046573532877835e-06, |
|
"loss": 0.0977, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.9893992932862191, |
|
"grad_norm": 0.06689453125, |
|
"learning_rate": 5.013859948123909e-06, |
|
"loss": 0.0976, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.9982332155477032, |
|
"grad_norm": 0.06591796875, |
|
"learning_rate": 5.000385036993684e-06, |
|
"loss": 0.0977, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 566, |
|
"total_flos": 5.828408442271826e+17, |
|
"train_loss": 0.10050818478102819, |
|
"train_runtime": 2227.4528, |
|
"train_samples_per_second": 32.502, |
|
"train_steps_per_second": 0.254 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 566, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.828408442271826e+17, |
|
"train_batch_size": 128, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|