|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 1370, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.072992700729927, |
|
"grad_norm": 5.749124526977539, |
|
"learning_rate": 1.4492753623188407e-05, |
|
"loss": 0.9214, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.145985401459854, |
|
"grad_norm": 2.569692611694336, |
|
"learning_rate": 2.8985507246376814e-05, |
|
"loss": 0.506, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.21897810218978103, |
|
"grad_norm": 1.676221489906311, |
|
"learning_rate": 4.347826086956522e-05, |
|
"loss": 0.3337, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.291970802919708, |
|
"grad_norm": 1.1421884298324585, |
|
"learning_rate": 5.797101449275363e-05, |
|
"loss": 0.2416, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.36496350364963503, |
|
"grad_norm": 0.7032585144042969, |
|
"learning_rate": 7.246376811594203e-05, |
|
"loss": 0.1709, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.43795620437956206, |
|
"grad_norm": 0.8790648579597473, |
|
"learning_rate": 8.695652173913044e-05, |
|
"loss": 0.1541, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.5109489051094891, |
|
"grad_norm": 0.9456900358200073, |
|
"learning_rate": 9.999985422436231e-05, |
|
"loss": 0.1368, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.583941605839416, |
|
"grad_norm": 0.8083729147911072, |
|
"learning_rate": 9.998236217634196e-05, |
|
"loss": 0.1253, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.656934306569343, |
|
"grad_norm": 0.5535995960235596, |
|
"learning_rate": 9.993572668745786e-05, |
|
"loss": 0.1117, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.7299270072992701, |
|
"grad_norm": 0.7210536599159241, |
|
"learning_rate": 9.985997494967441e-05, |
|
"loss": 0.0905, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.8029197080291971, |
|
"grad_norm": 0.5236138105392456, |
|
"learning_rate": 9.975515113189827e-05, |
|
"loss": 0.0917, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.8759124087591241, |
|
"grad_norm": 0.4034636318683624, |
|
"learning_rate": 9.962131635422462e-05, |
|
"loss": 0.0802, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.948905109489051, |
|
"grad_norm": 0.6598319411277771, |
|
"learning_rate": 9.945854865229965e-05, |
|
"loss": 0.0744, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.0218978102189782, |
|
"grad_norm": 0.6162580847740173, |
|
"learning_rate": 9.926694293181986e-05, |
|
"loss": 0.0808, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.094890510948905, |
|
"grad_norm": 0.5971336960792542, |
|
"learning_rate": 9.904661091319503e-05, |
|
"loss": 0.0765, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.167883211678832, |
|
"grad_norm": 0.7087730765342712, |
|
"learning_rate": 9.879768106640687e-05, |
|
"loss": 0.0713, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.2408759124087592, |
|
"grad_norm": 0.6969349980354309, |
|
"learning_rate": 9.852029853610148e-05, |
|
"loss": 0.0665, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.313868613138686, |
|
"grad_norm": 0.6811894178390503, |
|
"learning_rate": 9.821462505695917e-05, |
|
"loss": 0.0631, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.3868613138686132, |
|
"grad_norm": 0.34767240285873413, |
|
"learning_rate": 9.788083885939116e-05, |
|
"loss": 0.064, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.4598540145985401, |
|
"grad_norm": 0.3470080494880676, |
|
"learning_rate": 9.75191345656179e-05, |
|
"loss": 0.0557, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.5328467153284673, |
|
"grad_norm": 0.39414817094802856, |
|
"learning_rate": 9.712972307618981e-05, |
|
"loss": 0.0627, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.6058394160583942, |
|
"grad_norm": 0.5173900723457336, |
|
"learning_rate": 9.671283144701663e-05, |
|
"loss": 0.0573, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.6788321167883211, |
|
"grad_norm": 0.34683912992477417, |
|
"learning_rate": 9.626870275697682e-05, |
|
"loss": 0.0603, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.7518248175182483, |
|
"grad_norm": 0.5500655174255371, |
|
"learning_rate": 9.579759596618454e-05, |
|
"loss": 0.062, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.8248175182481752, |
|
"grad_norm": 0.3647141456604004, |
|
"learning_rate": 9.529978576499652e-05, |
|
"loss": 0.0504, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.897810218978102, |
|
"grad_norm": 0.3856804072856903, |
|
"learning_rate": 9.477556241384724e-05, |
|
"loss": 0.0543, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.9708029197080292, |
|
"grad_norm": 0.35589075088500977, |
|
"learning_rate": 9.422523157400533e-05, |
|
"loss": 0.0496, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.0437956204379564, |
|
"grad_norm": 0.37966451048851013, |
|
"learning_rate": 9.36491141293504e-05, |
|
"loss": 0.0475, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.116788321167883, |
|
"grad_norm": 0.4046926200389862, |
|
"learning_rate": 9.304754599927377e-05, |
|
"loss": 0.0466, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.18978102189781, |
|
"grad_norm": 0.5988451242446899, |
|
"learning_rate": 9.242087794281243e-05, |
|
"loss": 0.0529, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.2627737226277373, |
|
"grad_norm": 0.3989510238170624, |
|
"learning_rate": 9.176947535413046e-05, |
|
"loss": 0.0509, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.335766423357664, |
|
"grad_norm": 0.4888112246990204, |
|
"learning_rate": 9.10937180494669e-05, |
|
"loss": 0.0405, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.408759124087591, |
|
"grad_norm": 0.5207938551902771, |
|
"learning_rate": 9.039400004567469e-05, |
|
"loss": 0.0472, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.4817518248175183, |
|
"grad_norm": 0.41739702224731445, |
|
"learning_rate": 8.967072933047945e-05, |
|
"loss": 0.0523, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.554744525547445, |
|
"grad_norm": 0.5469692349433899, |
|
"learning_rate": 8.892432762459221e-05, |
|
"loss": 0.0466, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.627737226277372, |
|
"grad_norm": 0.44405660033226013, |
|
"learning_rate": 8.815523013581488e-05, |
|
"loss": 0.0475, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.7007299270072993, |
|
"grad_norm": 0.42182251811027527, |
|
"learning_rate": 8.736388530528162e-05, |
|
"loss": 0.0423, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.7737226277372264, |
|
"grad_norm": 0.323812335729599, |
|
"learning_rate": 8.655075454598426e-05, |
|
"loss": 0.0421, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.846715328467153, |
|
"grad_norm": 0.44694793224334717, |
|
"learning_rate": 8.571631197373422e-05, |
|
"loss": 0.0447, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.9197080291970803, |
|
"grad_norm": 0.2432471662759781, |
|
"learning_rate": 8.486104413071755e-05, |
|
"loss": 0.0438, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.9927007299270074, |
|
"grad_norm": 0.42758315801620483, |
|
"learning_rate": 8.398544970180469e-05, |
|
"loss": 0.0466, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.065693430656934, |
|
"grad_norm": 0.3887377679347992, |
|
"learning_rate": 8.309003922377996e-05, |
|
"loss": 0.04, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.1386861313868613, |
|
"grad_norm": 0.33644479513168335, |
|
"learning_rate": 8.217533478766068e-05, |
|
"loss": 0.0455, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.2116788321167884, |
|
"grad_norm": 0.29482313990592957, |
|
"learning_rate": 8.124186973427911e-05, |
|
"loss": 0.0398, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.2846715328467155, |
|
"grad_norm": 0.46486881375312805, |
|
"learning_rate": 8.029018834330506e-05, |
|
"loss": 0.0432, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.3576642335766422, |
|
"grad_norm": 0.4706019163131714, |
|
"learning_rate": 7.932084551589027e-05, |
|
"loss": 0.036, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.4306569343065694, |
|
"grad_norm": 0.502895176410675, |
|
"learning_rate": 7.833440645111975e-05, |
|
"loss": 0.0372, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.5036496350364965, |
|
"grad_norm": 0.37358635663986206, |
|
"learning_rate": 7.733144631645852e-05, |
|
"loss": 0.0415, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.576642335766423, |
|
"grad_norm": 0.38203373551368713, |
|
"learning_rate": 7.631254991238621e-05, |
|
"loss": 0.0362, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.6496350364963503, |
|
"grad_norm": 0.38717857003211975, |
|
"learning_rate": 7.527831133141476e-05, |
|
"loss": 0.0416, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.7226277372262775, |
|
"grad_norm": 0.32451218366622925, |
|
"learning_rate": 7.422933361168825e-05, |
|
"loss": 0.0378, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 3.795620437956204, |
|
"grad_norm": 0.3965657651424408, |
|
"learning_rate": 7.316622838536673e-05, |
|
"loss": 0.0374, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 3.8686131386861313, |
|
"grad_norm": 0.5710194110870361, |
|
"learning_rate": 7.208961552199913e-05, |
|
"loss": 0.0372, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 3.9416058394160585, |
|
"grad_norm": 0.3017335832118988, |
|
"learning_rate": 7.100012276709302e-05, |
|
"loss": 0.0411, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 4.014598540145985, |
|
"grad_norm": 0.42834600806236267, |
|
"learning_rate": 6.98983853760924e-05, |
|
"loss": 0.0324, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 4.087591240875913, |
|
"grad_norm": 0.2624529302120209, |
|
"learning_rate": 6.878504574397626e-05, |
|
"loss": 0.0362, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 4.160583941605839, |
|
"grad_norm": 0.3475762605667114, |
|
"learning_rate": 6.766075303069459e-05, |
|
"loss": 0.0398, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 4.233576642335766, |
|
"grad_norm": 0.5206601023674011, |
|
"learning_rate": 6.65261627826597e-05, |
|
"loss": 0.0367, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 4.306569343065694, |
|
"grad_norm": 0.4349361062049866, |
|
"learning_rate": 6.538193655051381e-05, |
|
"loss": 0.0365, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 4.37956204379562, |
|
"grad_norm": 0.5204529166221619, |
|
"learning_rate": 6.422874150339579e-05, |
|
"loss": 0.0358, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 4.452554744525547, |
|
"grad_norm": 0.27471956610679626, |
|
"learning_rate": 6.30672500399318e-05, |
|
"loss": 0.0336, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 4.525547445255475, |
|
"grad_norm": 0.3909718990325928, |
|
"learning_rate": 6.189813939617682e-05, |
|
"loss": 0.0363, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 4.598540145985401, |
|
"grad_norm": 0.24090121686458588, |
|
"learning_rate": 6.072209125073561e-05, |
|
"loss": 0.0363, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 4.671532846715328, |
|
"grad_norm": 0.3554544448852539, |
|
"learning_rate": 5.95397913272932e-05, |
|
"loss": 0.0359, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 4.744525547445256, |
|
"grad_norm": 0.31269019842147827, |
|
"learning_rate": 5.8351928994787006e-05, |
|
"loss": 0.0405, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 4.817518248175182, |
|
"grad_norm": 0.40918299555778503, |
|
"learning_rate": 5.7159196865453294e-05, |
|
"loss": 0.0361, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 4.89051094890511, |
|
"grad_norm": 0.522287130355835, |
|
"learning_rate": 5.596229039098271e-05, |
|
"loss": 0.0336, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 4.963503649635037, |
|
"grad_norm": 0.48848339915275574, |
|
"learning_rate": 5.4761907457020077e-05, |
|
"loss": 0.0317, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 5.036496350364963, |
|
"grad_norm": 0.4906352162361145, |
|
"learning_rate": 5.355874797624515e-05, |
|
"loss": 0.0295, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 5.109489051094891, |
|
"grad_norm": 0.24832546710968018, |
|
"learning_rate": 5.235351348027129e-05, |
|
"loss": 0.0299, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 5.182481751824818, |
|
"grad_norm": 0.3912707567214966, |
|
"learning_rate": 5.1146906710600306e-05, |
|
"loss": 0.0306, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 5.255474452554744, |
|
"grad_norm": 0.23428483307361603, |
|
"learning_rate": 4.993963120887183e-05, |
|
"loss": 0.0312, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 5.328467153284672, |
|
"grad_norm": 0.39608365297317505, |
|
"learning_rate": 4.8732390906646097e-05, |
|
"loss": 0.0323, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 5.401459854014599, |
|
"grad_norm": 0.24625132977962494, |
|
"learning_rate": 4.75258897149594e-05, |
|
"loss": 0.0346, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 5.474452554744525, |
|
"grad_norm": 0.4109422564506531, |
|
"learning_rate": 4.632083111389153e-05, |
|
"loss": 0.0305, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 5.547445255474453, |
|
"grad_norm": 0.4575289487838745, |
|
"learning_rate": 4.5117917742384456e-05, |
|
"loss": 0.0335, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 5.62043795620438, |
|
"grad_norm": 0.3231428265571594, |
|
"learning_rate": 4.391785098855156e-05, |
|
"loss": 0.032, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 5.693430656934306, |
|
"grad_norm": 0.22695614397525787, |
|
"learning_rate": 4.272133058071595e-05, |
|
"loss": 0.0341, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 5.766423357664234, |
|
"grad_norm": 0.3635699450969696, |
|
"learning_rate": 4.1529054179416875e-05, |
|
"loss": 0.0303, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 5.839416058394161, |
|
"grad_norm": 0.44931599497795105, |
|
"learning_rate": 4.034171697062157e-05, |
|
"loss": 0.0312, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 5.912408759124087, |
|
"grad_norm": 0.38062986731529236, |
|
"learning_rate": 3.916001126038008e-05, |
|
"loss": 0.0298, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 5.985401459854015, |
|
"grad_norm": 0.44949817657470703, |
|
"learning_rate": 3.7984626071159224e-05, |
|
"loss": 0.0301, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 6.0583941605839415, |
|
"grad_norm": 0.2569969892501831, |
|
"learning_rate": 3.681624674009121e-05, |
|
"loss": 0.03, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 6.131386861313868, |
|
"grad_norm": 0.3157390356063843, |
|
"learning_rate": 3.5655554519370956e-05, |
|
"loss": 0.0303, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 6.204379562043796, |
|
"grad_norm": 0.27934950590133667, |
|
"learning_rate": 3.450322617903543e-05, |
|
"loss": 0.0281, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 6.2773722627737225, |
|
"grad_norm": 0.2728220224380493, |
|
"learning_rate": 3.3359933612356156e-05, |
|
"loss": 0.0287, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 6.350364963503649, |
|
"grad_norm": 0.27907413244247437, |
|
"learning_rate": 3.2226343444075465e-05, |
|
"loss": 0.0268, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 6.423357664233577, |
|
"grad_norm": 0.2804538309574127, |
|
"learning_rate": 3.110311664171458e-05, |
|
"loss": 0.03, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 6.4963503649635035, |
|
"grad_norm": 0.5569368600845337, |
|
"learning_rate": 2.999090813018035e-05, |
|
"loss": 0.028, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 6.569343065693431, |
|
"grad_norm": 0.7178159952163696, |
|
"learning_rate": 2.8890366409895148e-05, |
|
"loss": 0.0325, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 6.642335766423358, |
|
"grad_norm": 0.19943037629127502, |
|
"learning_rate": 2.780213317867292e-05, |
|
"loss": 0.025, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 6.7153284671532845, |
|
"grad_norm": 0.1796846240758896, |
|
"learning_rate": 2.672684295756147e-05, |
|
"loss": 0.0257, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 6.788321167883212, |
|
"grad_norm": 0.23872807621955872, |
|
"learning_rate": 2.566512272086945e-05, |
|
"loss": 0.0304, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 6.861313868613139, |
|
"grad_norm": 0.2793905735015869, |
|
"learning_rate": 2.4617591530593613e-05, |
|
"loss": 0.0245, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 6.934306569343065, |
|
"grad_norm": 0.2569703161716461, |
|
"learning_rate": 2.3584860175459584e-05, |
|
"loss": 0.0269, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 7.007299270072993, |
|
"grad_norm": 0.27794763445854187, |
|
"learning_rate": 2.2567530814786463e-05, |
|
"loss": 0.0287, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 7.08029197080292, |
|
"grad_norm": 0.237870454788208, |
|
"learning_rate": 2.156619662738319e-05, |
|
"loss": 0.0261, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 7.153284671532846, |
|
"grad_norm": 0.3608488142490387, |
|
"learning_rate": 2.0581441465680986e-05, |
|
"loss": 0.0259, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 7.226277372262774, |
|
"grad_norm": 0.3045063316822052, |
|
"learning_rate": 1.961383951530394e-05, |
|
"loss": 0.0286, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 7.299270072992701, |
|
"grad_norm": 0.2131444364786148, |
|
"learning_rate": 1.866395496027602e-05, |
|
"loss": 0.0254, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 7.372262773722627, |
|
"grad_norm": 0.25216737389564514, |
|
"learning_rate": 1.7732341654059785e-05, |
|
"loss": 0.0222, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 7.445255474452555, |
|
"grad_norm": 0.2963661849498749, |
|
"learning_rate": 1.6819542796618487e-05, |
|
"loss": 0.0292, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 7.518248175182482, |
|
"grad_norm": 0.4280974864959717, |
|
"learning_rate": 1.592609061769004e-05, |
|
"loss": 0.0263, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 7.591240875912408, |
|
"grad_norm": 0.23306907713413239, |
|
"learning_rate": 1.5052506066457461e-05, |
|
"loss": 0.0211, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 7.664233576642336, |
|
"grad_norm": 0.2898852825164795, |
|
"learning_rate": 1.4199298507796698e-05, |
|
"loss": 0.0209, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 7.737226277372263, |
|
"grad_norm": 0.22745266556739807, |
|
"learning_rate": 1.3366965425278899e-05, |
|
"loss": 0.0218, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 7.81021897810219, |
|
"grad_norm": 0.244761660695076, |
|
"learning_rate": 1.2555992131100457e-05, |
|
"loss": 0.0213, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 7.883211678832117, |
|
"grad_norm": 0.3487051725387573, |
|
"learning_rate": 1.1766851483109858e-05, |
|
"loss": 0.0251, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 7.956204379562044, |
|
"grad_norm": 0.38351985812187195, |
|
"learning_rate": 1.1000003609096337e-05, |
|
"loss": 0.0233, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 8.02919708029197, |
|
"grad_norm": 0.2925453782081604, |
|
"learning_rate": 1.0255895638501045e-05, |
|
"loss": 0.0258, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 8.102189781021897, |
|
"grad_norm": 0.2964305281639099, |
|
"learning_rate": 9.534961441707307e-06, |
|
"loss": 0.0193, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 8.175182481751825, |
|
"grad_norm": 0.4270758628845215, |
|
"learning_rate": 8.837621377061877e-06, |
|
"loss": 0.0223, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 8.248175182481752, |
|
"grad_norm": 0.22104066610336304, |
|
"learning_rate": 8.16428204577468e-06, |
|
"loss": 0.0217, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 8.321167883211679, |
|
"grad_norm": 0.24666981399059296, |
|
"learning_rate": 7.515336054840022e-06, |
|
"loss": 0.0254, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 8.394160583941606, |
|
"grad_norm": 0.16614589095115662, |
|
"learning_rate": 6.8911617881174725e-06, |
|
"loss": 0.02, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 8.467153284671532, |
|
"grad_norm": 0.2146356999874115, |
|
"learning_rate": 6.292123185705867e-06, |
|
"loss": 0.024, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 8.540145985401459, |
|
"grad_norm": 0.1602909117937088, |
|
"learning_rate": 5.718569531739154e-06, |
|
"loss": 0.0267, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 8.613138686131387, |
|
"grad_norm": 0.4400380849838257, |
|
"learning_rate": 5.170835250727663e-06, |
|
"loss": 0.0231, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 8.686131386861314, |
|
"grad_norm": 0.18121108412742615, |
|
"learning_rate": 4.6492397125637525e-06, |
|
"loss": 0.0192, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 8.75912408759124, |
|
"grad_norm": 0.26767614483833313, |
|
"learning_rate": 4.154087046305322e-06, |
|
"loss": 0.0249, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 8.832116788321168, |
|
"grad_norm": 0.2408941090106964, |
|
"learning_rate": 3.6856659628459912e-06, |
|
"loss": 0.0214, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 8.905109489051094, |
|
"grad_norm": 0.2535434663295746, |
|
"learning_rate": 3.244249586575038e-06, |
|
"loss": 0.0203, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 8.978102189781023, |
|
"grad_norm": 0.6523481011390686, |
|
"learning_rate": 2.830095296125612e-06, |
|
"loss": 0.0222, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 9.05109489051095, |
|
"grad_norm": 0.16764762997627258, |
|
"learning_rate": 2.4434445743037713e-06, |
|
"loss": 0.0225, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 9.124087591240876, |
|
"grad_norm": 0.35821276903152466, |
|
"learning_rate": 2.0845228672860538e-06, |
|
"loss": 0.0206, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 9.197080291970803, |
|
"grad_norm": 0.38857728242874146, |
|
"learning_rate": 1.7535394531675187e-06, |
|
"loss": 0.0222, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 9.27007299270073, |
|
"grad_norm": 0.2351900190114975, |
|
"learning_rate": 1.4506873199370497e-06, |
|
"loss": 0.021, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 9.343065693430656, |
|
"grad_norm": 0.33225876092910767, |
|
"learning_rate": 1.1761430529509899e-06, |
|
"loss": 0.0258, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 9.416058394160585, |
|
"grad_norm": 0.2640361785888672, |
|
"learning_rate": 9.300667319706857e-07, |
|
"loss": 0.0202, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 9.489051094890511, |
|
"grad_norm": 0.15353083610534668, |
|
"learning_rate": 7.126018378241062e-07, |
|
"loss": 0.0215, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 9.562043795620438, |
|
"grad_norm": 0.33280253410339355, |
|
"learning_rate": 5.238751687458021e-07, |
|
"loss": 0.0217, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 9.635036496350365, |
|
"grad_norm": 0.18690501153469086, |
|
"learning_rate": 3.639967664440802e-07, |
|
"loss": 0.0194, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 9.708029197080291, |
|
"grad_norm": 0.6559237837791443, |
|
"learning_rate": 2.3305985193852742e-07, |
|
"loss": 0.0242, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 9.78102189781022, |
|
"grad_norm": 0.3035587966442108, |
|
"learning_rate": 1.3114077120517376e-07, |
|
"loss": 0.0271, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 9.854014598540147, |
|
"grad_norm": 0.30467328429222107, |
|
"learning_rate": 5.8298950661112017e-08, |
|
"loss": 0.0187, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 9.927007299270073, |
|
"grad_norm": 0.22451713681221008, |
|
"learning_rate": 1.4576862514487089e-08, |
|
"loss": 0.022, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.33209139108657837, |
|
"learning_rate": 0.0, |
|
"loss": 0.0227, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 1370, |
|
"total_flos": 1.9634851190645568e+17, |
|
"train_loss": 0.05370959477485531, |
|
"train_runtime": 1800.9203, |
|
"train_samples_per_second": 48.631, |
|
"train_steps_per_second": 0.761 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1370, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.9634851190645568e+17, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|