{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 12.269938650306749,
  "eval_steps": 10000000,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.12269938650306748,
      "grad_norm": 30.28219704365505,
      "learning_rate": 2.4691358024691355e-08,
      "loss": 2.9799,
      "step": 10
    },
    {
      "epoch": 0.24539877300613497,
      "grad_norm": 30.66075380374518,
      "learning_rate": 4.938271604938271e-08,
      "loss": 2.9842,
      "step": 20
    },
    {
      "epoch": 0.36809815950920244,
      "grad_norm": 29.626899456192994,
      "learning_rate": 7.407407407407407e-08,
      "loss": 3.015,
      "step": 30
    },
    {
      "epoch": 0.49079754601226994,
      "grad_norm": 29.470505364249597,
      "learning_rate": 9.876543209876542e-08,
      "loss": 2.9547,
      "step": 40
    },
    {
      "epoch": 0.6134969325153374,
      "grad_norm": 27.407219480683995,
      "learning_rate": 1.2345679012345677e-07,
      "loss": 2.8784,
      "step": 50
    },
    {
      "epoch": 0.7361963190184049,
      "grad_norm": 20.68736366691469,
      "learning_rate": 1.4814814814814815e-07,
      "loss": 2.7798,
      "step": 60
    },
    {
      "epoch": 0.8588957055214724,
      "grad_norm": 17.11747188448127,
      "learning_rate": 1.728395061728395e-07,
      "loss": 2.5723,
      "step": 70
    },
    {
      "epoch": 0.9815950920245399,
      "grad_norm": 7.782003292603646,
      "learning_rate": 1.9753086419753084e-07,
      "loss": 2.4467,
      "step": 80
    },
    {
      "epoch": 1.1042944785276074,
      "grad_norm": 4.365497818859387,
      "learning_rate": 2.222222222222222e-07,
      "loss": 2.2687,
      "step": 90
    },
    {
      "epoch": 1.2269938650306749,
      "grad_norm": 3.3711421712732283,
      "learning_rate": 2.4691358024691354e-07,
      "loss": 2.1767,
      "step": 100
    },
    {
      "epoch": 1.3496932515337423,
      "grad_norm": 2.956591715084401,
      "learning_rate": 2.716049382716049e-07,
      "loss": 2.1332,
      "step": 110
    },
    {
      "epoch": 1.4723926380368098,
      "grad_norm": 2.635796811630019,
      "learning_rate": 2.962962962962963e-07,
      "loss": 2.1135,
      "step": 120
    },
    {
      "epoch": 1.5950920245398774,
      "grad_norm": 2.3273823610234876,
      "learning_rate": 3.209876543209876e-07,
      "loss": 2.1097,
      "step": 130
    },
    {
      "epoch": 1.7177914110429446,
      "grad_norm": 2.2617900128343833,
      "learning_rate": 3.45679012345679e-07,
      "loss": 2.1161,
      "step": 140
    },
    {
      "epoch": 1.8404907975460123,
      "grad_norm": 2.1638223937937076,
      "learning_rate": 3.703703703703703e-07,
      "loss": 2.0616,
      "step": 150
    },
    {
      "epoch": 1.9631901840490797,
      "grad_norm": 2.1437544811484854,
      "learning_rate": 3.950617283950617e-07,
      "loss": 2.0815,
      "step": 160
    },
    {
      "epoch": 2.085889570552147,
      "grad_norm": 2.1420606825819197,
      "learning_rate": 4.1975308641975306e-07,
      "loss": 2.063,
      "step": 170
    },
    {
      "epoch": 2.208588957055215,
      "grad_norm": 2.144143735036332,
      "learning_rate": 4.444444444444444e-07,
      "loss": 2.0513,
      "step": 180
    },
    {
      "epoch": 2.331288343558282,
      "grad_norm": 2.09031994519009,
      "learning_rate": 4.6913580246913576e-07,
      "loss": 2.0583,
      "step": 190
    },
    {
      "epoch": 2.4539877300613497,
      "grad_norm": 2.074953413736776,
      "learning_rate": 4.938271604938271e-07,
      "loss": 2.0446,
      "step": 200
    },
    {
      "epoch": 2.5766871165644174,
      "grad_norm": 2.0364457610758473,
      "learning_rate": 5.185185185185185e-07,
      "loss": 2.0336,
      "step": 210
    },
    {
      "epoch": 2.6993865030674846,
      "grad_norm": 2.09662270077011,
      "learning_rate": 5.432098765432098e-07,
      "loss": 2.03,
      "step": 220
    },
    {
      "epoch": 2.8220858895705523,
      "grad_norm": 2.0184952150772273,
      "learning_rate": 5.679012345679012e-07,
      "loss": 2.0197,
      "step": 230
    },
    {
      "epoch": 2.9447852760736195,
      "grad_norm": 2.2026915181725535,
      "learning_rate": 5.925925925925926e-07,
      "loss": 2.007,
      "step": 240
    },
    {
      "epoch": 3.067484662576687,
      "grad_norm": 1.9452709028668997,
      "learning_rate": 6.172839506172839e-07,
      "loss": 2.0014,
      "step": 250
    },
    {
      "epoch": 3.190184049079755,
      "grad_norm": 1.933109075200724,
      "learning_rate": 6.419753086419752e-07,
      "loss": 1.977,
      "step": 260
    },
    {
      "epoch": 3.312883435582822,
      "grad_norm": 1.9739789693701668,
      "learning_rate": 6.666666666666666e-07,
      "loss": 1.9816,
      "step": 270
    },
    {
      "epoch": 3.4355828220858897,
      "grad_norm": 2.0255729355536625,
      "learning_rate": 6.91358024691358e-07,
      "loss": 2.0138,
      "step": 280
    },
    {
      "epoch": 3.558282208588957,
      "grad_norm": 2.109260451722745,
      "learning_rate": 7.160493827160494e-07,
      "loss": 1.9852,
      "step": 290
    },
    {
      "epoch": 3.6809815950920246,
      "grad_norm": 2.02230955779347,
      "learning_rate": 7.407407407407406e-07,
      "loss": 1.989,
      "step": 300
    },
    {
      "epoch": 3.8036809815950923,
      "grad_norm": 2.114117608740877,
      "learning_rate": 7.65432098765432e-07,
      "loss": 1.9656,
      "step": 310
    },
    {
      "epoch": 3.9263803680981595,
      "grad_norm": 1.9608507920028475,
      "learning_rate": 7.901234567901234e-07,
      "loss": 1.9819,
      "step": 320
    },
    {
      "epoch": 4.049079754601227,
      "grad_norm": 1.992359665195648,
      "learning_rate": 8.148148148148147e-07,
      "loss": 1.9517,
      "step": 330
    },
    {
      "epoch": 4.171779141104294,
      "grad_norm": 2.0184004724781524,
      "learning_rate": 8.395061728395061e-07,
      "loss": 1.8995,
      "step": 340
    },
    {
      "epoch": 4.294478527607362,
      "grad_norm": 1.9438721415658846,
      "learning_rate": 8.641975308641974e-07,
      "loss": 1.9493,
      "step": 350
    },
    {
      "epoch": 4.41717791411043,
      "grad_norm": 2.037329633517169,
      "learning_rate": 8.888888888888888e-07,
      "loss": 1.9592,
      "step": 360
    },
    {
      "epoch": 4.539877300613497,
      "grad_norm": 2.045589243795842,
      "learning_rate": 9.135802469135801e-07,
      "loss": 1.9269,
      "step": 370
    },
    {
      "epoch": 4.662576687116564,
      "grad_norm": 2.0376567384336814,
      "learning_rate": 9.382716049382715e-07,
      "loss": 1.9091,
      "step": 380
    },
    {
      "epoch": 4.785276073619632,
      "grad_norm": 2.151886931601921,
      "learning_rate": 9.629629629629628e-07,
      "loss": 1.9338,
      "step": 390
    },
    {
      "epoch": 4.9079754601226995,
      "grad_norm": 2.1016930186631955,
      "learning_rate": 9.876543209876542e-07,
      "loss": 1.9361,
      "step": 400
    },
    {
      "epoch": 5.030674846625767,
      "grad_norm": 1.8425197677510754,
      "learning_rate": 9.999953571567085e-07,
      "loss": 1.8946,
      "step": 410
    },
    {
      "epoch": 5.153374233128835,
      "grad_norm": 2.1108335201344826,
      "learning_rate": 9.999582149277185e-07,
      "loss": 1.8636,
      "step": 420
    },
    {
      "epoch": 5.276073619631902,
      "grad_norm": 1.965491417959117,
      "learning_rate": 9.99883933228855e-07,
      "loss": 1.8863,
      "step": 430
    },
    {
      "epoch": 5.398773006134969,
      "grad_norm": 2.0748380468153442,
      "learning_rate": 9.997725175781443e-07,
      "loss": 1.8708,
      "step": 440
    },
    {
      "epoch": 5.521472392638037,
      "grad_norm": 2.0061874641389994,
      "learning_rate": 9.99623976252115e-07,
      "loss": 1.8629,
      "step": 450
    },
    {
      "epoch": 5.644171779141105,
      "grad_norm": 1.9227878063794117,
      "learning_rate": 9.994383202851812e-07,
      "loss": 1.8574,
      "step": 460
    },
    {
      "epoch": 5.766871165644172,
      "grad_norm": 2.022882706029264,
      "learning_rate": 9.992155634688238e-07,
      "loss": 1.8489,
      "step": 470
    },
    {
      "epoch": 5.889570552147239,
      "grad_norm": 1.9151165000656245,
      "learning_rate": 9.98955722350566e-07,
      "loss": 1.8702,
      "step": 480
    },
    {
      "epoch": 6.012269938650307,
      "grad_norm": 1.8932452511409288,
      "learning_rate": 9.986588162327434e-07,
      "loss": 1.8569,
      "step": 490
    },
    {
      "epoch": 6.134969325153374,
      "grad_norm": 2.0203097592372035,
      "learning_rate": 9.983248671710714e-07,
      "loss": 1.7873,
      "step": 500
    },
    {
      "epoch": 6.257668711656442,
      "grad_norm": 1.9081758147724377,
      "learning_rate": 9.979538999730047e-07,
      "loss": 1.779,
      "step": 510
    },
    {
      "epoch": 6.38036809815951,
      "grad_norm": 2.072946207781536,
      "learning_rate": 9.975459421958967e-07,
      "loss": 1.8006,
      "step": 520
    },
    {
      "epoch": 6.5030674846625764,
      "grad_norm": 2.0035469674735893,
      "learning_rate": 9.971010241449513e-07,
      "loss": 1.8018,
      "step": 530
    },
    {
      "epoch": 6.625766871165644,
      "grad_norm": 2.0835946373439143,
      "learning_rate": 9.966191788709714e-07,
      "loss": 1.8,
      "step": 540
    },
    {
      "epoch": 6.748466257668712,
      "grad_norm": 2.051477600119681,
      "learning_rate": 9.961004421679046e-07,
      "loss": 1.7869,
      "step": 550
    },
    {
      "epoch": 6.871165644171779,
      "grad_norm": 1.9111605839024262,
      "learning_rate": 9.955448525701835e-07,
      "loss": 1.7929,
      "step": 560
    },
    {
      "epoch": 6.993865030674847,
      "grad_norm": 2.036265101667604,
      "learning_rate": 9.949524513498636e-07,
      "loss": 1.8098,
      "step": 570
    },
    {
      "epoch": 7.116564417177914,
      "grad_norm": 1.9440590850722006,
      "learning_rate": 9.943232825135566e-07,
      "loss": 1.7158,
      "step": 580
    },
    {
      "epoch": 7.2392638036809815,
      "grad_norm": 1.9745053511466075,
      "learning_rate": 9.93657392799163e-07,
      "loss": 1.7269,
      "step": 590
    },
    {
      "epoch": 7.361963190184049,
      "grad_norm": 2.0172262033751522,
      "learning_rate": 9.92954831672398e-07,
      "loss": 1.7173,
      "step": 600
    },
    {
      "epoch": 7.484662576687117,
      "grad_norm": 2.017376130474714,
      "learning_rate": 9.922156513231197e-07,
      "loss": 1.723,
      "step": 610
    },
    {
      "epoch": 7.6073619631901845,
      "grad_norm": 2.129118497262339,
      "learning_rate": 9.914399066614487e-07,
      "loss": 1.7109,
      "step": 620
    },
    {
      "epoch": 7.730061349693251,
      "grad_norm": 2.0003358633590986,
      "learning_rate": 9.906276553136922e-07,
      "loss": 1.7287,
      "step": 630
    },
    {
      "epoch": 7.852760736196319,
      "grad_norm": 2.0380074509433554,
      "learning_rate": 9.897789576180616e-07,
      "loss": 1.7128,
      "step": 640
    },
    {
      "epoch": 7.975460122699387,
      "grad_norm": 1.972210672677589,
      "learning_rate": 9.888938766201907e-07,
      "loss": 1.7209,
      "step": 650
    },
    {
      "epoch": 8.098159509202453,
      "grad_norm": 2.0353896296888263,
      "learning_rate": 9.879724780684517e-07,
      "loss": 1.6636,
      "step": 660
    },
    {
      "epoch": 8.220858895705522,
      "grad_norm": 2.1575258154001067,
      "learning_rate": 9.87014830409073e-07,
      "loss": 1.6493,
      "step": 670
    },
    {
      "epoch": 8.343558282208589,
      "grad_norm": 2.0731699660667156,
      "learning_rate": 9.860210047810515e-07,
      "loss": 1.627,
      "step": 680
    },
    {
      "epoch": 8.466257668711656,
      "grad_norm": 2.1169883616519476,
      "learning_rate": 9.849910750108717e-07,
      "loss": 1.6516,
      "step": 690
    },
    {
      "epoch": 8.588957055214724,
      "grad_norm": 2.0906272175984917,
      "learning_rate": 9.839251176070183e-07,
      "loss": 1.6619,
      "step": 700
    },
    {
      "epoch": 8.71165644171779,
      "grad_norm": 2.059390905577669,
      "learning_rate": 9.828232117542947e-07,
      "loss": 1.6457,
      "step": 710
    },
    {
      "epoch": 8.83435582822086,
      "grad_norm": 2.111680195842936,
      "learning_rate": 9.816854393079402e-07,
      "loss": 1.6178,
      "step": 720
    },
    {
      "epoch": 8.957055214723926,
      "grad_norm": 2.144260583400968,
      "learning_rate": 9.805118847875487e-07,
      "loss": 1.6382,
      "step": 730
    },
    {
      "epoch": 9.079754601226995,
      "grad_norm": 2.285829860466323,
      "learning_rate": 9.793026353707914e-07,
      "loss": 1.5706,
      "step": 740
    },
    {
      "epoch": 9.202453987730062,
      "grad_norm": 2.153794951665122,
      "learning_rate": 9.780577808869398e-07,
      "loss": 1.5595,
      "step": 750
    },
    {
      "epoch": 9.325153374233128,
      "grad_norm": 2.138270605360391,
      "learning_rate": 9.767774138101934e-07,
      "loss": 1.5649,
      "step": 760
    },
    {
      "epoch": 9.447852760736197,
      "grad_norm": 2.140711587703231,
      "learning_rate": 9.754616292528093e-07,
      "loss": 1.5466,
      "step": 770
    },
    {
      "epoch": 9.570552147239264,
      "grad_norm": 2.221273893753507,
      "learning_rate": 9.74110524958038e-07,
      "loss": 1.5284,
      "step": 780
    },
    {
      "epoch": 9.69325153374233,
      "grad_norm": 2.3103597560152545,
      "learning_rate": 9.72724201292862e-07,
      "loss": 1.5592,
      "step": 790
    },
    {
      "epoch": 9.815950920245399,
      "grad_norm": 2.2066393219378373,
      "learning_rate": 9.713027612405394e-07,
      "loss": 1.546,
      "step": 800
    },
    {
      "epoch": 9.938650306748466,
      "grad_norm": 2.274268167957973,
      "learning_rate": 9.698463103929541e-07,
      "loss": 1.5556,
      "step": 810
    },
    {
      "epoch": 10.061349693251534,
      "grad_norm": 2.4288759947142666,
      "learning_rate": 9.68354956942773e-07,
      "loss": 1.5192,
      "step": 820
    },
    {
      "epoch": 10.184049079754601,
      "grad_norm": 2.56196295256849,
      "learning_rate": 9.668288116754076e-07,
      "loss": 1.4731,
      "step": 830
    },
    {
      "epoch": 10.30674846625767,
      "grad_norm": 2.351647479927511,
      "learning_rate": 9.652679879607843e-07,
      "loss": 1.4728,
      "step": 840
    },
    {
      "epoch": 10.429447852760736,
      "grad_norm": 2.3330020594718897,
      "learning_rate": 9.636726017449236e-07,
      "loss": 1.4558,
      "step": 850
    },
    {
      "epoch": 10.552147239263803,
      "grad_norm": 2.420606264665063,
      "learning_rate": 9.62042771541326e-07,
      "loss": 1.4287,
      "step": 860
    },
    {
      "epoch": 10.674846625766872,
      "grad_norm": 2.3947612477487867,
      "learning_rate": 9.603786184221692e-07,
      "loss": 1.4469,
      "step": 870
    },
    {
      "epoch": 10.797546012269938,
      "grad_norm": 2.3399004198935303,
      "learning_rate": 9.586802660093136e-07,
      "loss": 1.4396,
      "step": 880
    },
    {
      "epoch": 10.920245398773005,
      "grad_norm": 2.387016507283165,
      "learning_rate": 9.56947840465119e-07,
      "loss": 1.4595,
      "step": 890
    },
    {
      "epoch": 11.042944785276074,
      "grad_norm": 2.533848992609839,
      "learning_rate": 9.551814704830734e-07,
      "loss": 1.417,
      "step": 900
    },
    {
      "epoch": 11.16564417177914,
      "grad_norm": 2.5734232080415254,
      "learning_rate": 9.533812872782313e-07,
      "loss": 1.348,
      "step": 910
    },
    {
      "epoch": 11.28834355828221,
      "grad_norm": 2.6920925651578087,
      "learning_rate": 9.515474245774684e-07,
      "loss": 1.3556,
      "step": 920
    },
    {
      "epoch": 11.411042944785276,
      "grad_norm": 2.7013002533947574,
      "learning_rate": 9.496800186095465e-07,
      "loss": 1.3539,
      "step": 930
    },
    {
      "epoch": 11.533742331288344,
      "grad_norm": 2.731007254322984,
      "learning_rate": 9.477792080949938e-07,
      "loss": 1.3436,
      "step": 940
    },
    {
      "epoch": 11.656441717791411,
      "grad_norm": 2.8652743906067375,
      "learning_rate": 9.458451342358e-07,
      "loss": 1.3521,
      "step": 950
    },
    {
      "epoch": 11.779141104294478,
      "grad_norm": 2.5726368131527075,
      "learning_rate": 9.43877940704928e-07,
      "loss": 1.3426,
      "step": 960
    },
    {
      "epoch": 11.901840490797547,
      "grad_norm": 2.6814604487668463,
      "learning_rate": 9.418777736356393e-07,
      "loss": 1.3419,
      "step": 970
    },
    {
      "epoch": 12.024539877300613,
      "grad_norm": 3.1868357701860774,
      "learning_rate": 9.39844781610641e-07,
      "loss": 1.3155,
      "step": 980
    },
    {
      "epoch": 12.14723926380368,
      "grad_norm": 3.082069159033982,
      "learning_rate": 9.377791156510454e-07,
      "loss": 1.2428,
      "step": 990
    },
    {
      "epoch": 12.269938650306749,
      "grad_norm": 3.075730479893534,
      "learning_rate": 9.356809292051539e-07,
      "loss": 1.224,
      "step": 1000
    }
  ],
  "logging_steps": 10,
  "max_steps": 4050,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 92928324206592.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}