|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9998570816064027,
  "eval_steps": 80,
  "global_step": 1166,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004287551807917679,
      "grad_norm": 49.0459476401418,
      "learning_rate": 5.000000000000001e-07,
      "loss": 1.2482,
      "step": 5
    },
    {
      "epoch": 0.008575103615835357,
      "grad_norm": 46.49636895863856,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 1.2111,
      "step": 10
    },
    {
      "epoch": 0.012862655423753037,
      "grad_norm": 75.02893655779658,
      "learning_rate": 1.5e-06,
      "loss": 1.2375,
      "step": 15
    },
    {
      "epoch": 0.017150207231670715,
      "grad_norm": 3.8741623300934016,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.0833,
      "step": 20
    },
    {
      "epoch": 0.021437759039588396,
      "grad_norm": 8.80211103072482,
      "learning_rate": 2.5e-06,
      "loss": 1.0052,
      "step": 25
    },
    {
      "epoch": 0.025725310847506073,
      "grad_norm": 11.672065444786128,
      "learning_rate": 3e-06,
      "loss": 1.0283,
      "step": 30
    },
    {
      "epoch": 0.030012862655423755,
      "grad_norm": 4.987083560214949,
      "learning_rate": 3.5e-06,
      "loss": 1.0801,
      "step": 35
    },
    {
      "epoch": 0.03430041446334143,
      "grad_norm": 2.966299005953653,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.0753,
      "step": 40
    },
    {
      "epoch": 0.038587966271259114,
      "grad_norm": 3.0065164197825074,
      "learning_rate": 4.5e-06,
      "loss": 0.9409,
      "step": 45
    },
    {
      "epoch": 0.04287551807917679,
      "grad_norm": 3.381394216826962,
      "learning_rate": 5e-06,
      "loss": 1.0198,
      "step": 50
    },
    {
      "epoch": 0.04716306988709447,
      "grad_norm": 3.6309559490350654,
      "learning_rate": 4.926711109482401e-06,
      "loss": 1.0699,
      "step": 55
    },
    {
      "epoch": 0.05145062169501215,
      "grad_norm": 4.134626791396976,
      "learning_rate": 4.854242179361151e-06,
      "loss": 1.0268,
      "step": 60
    },
    {
      "epoch": 0.055738173502929825,
      "grad_norm": 3.3445998120937954,
      "learning_rate": 4.782587680591894e-06,
      "loss": 1.2315,
      "step": 65
    },
    {
      "epoch": 0.06002572531084751,
      "grad_norm": 2.4730316039429288,
      "learning_rate": 4.711742096614044e-06,
      "loss": 0.9914,
      "step": 70
    },
    {
      "epoch": 0.06431327711876518,
      "grad_norm": 3.063463397168402,
      "learning_rate": 4.641699923379107e-06,
      "loss": 0.9354,
      "step": 75
    },
    {
      "epoch": 0.06860082892668286,
      "grad_norm": 3.72217017018073,
      "learning_rate": 4.5724556693791996e-06,
      "loss": 1.0356,
      "step": 80
    },
    {
      "epoch": 0.06860082892668286,
      "eval_loss": 0.9710695743560791,
      "eval_runtime": 20.9077,
      "eval_samples_per_second": 9.566,
      "eval_steps_per_second": 2.391,
      "step": 80
    },
    {
      "epoch": 0.07288838073460055,
      "grad_norm": 2.8296964090488466,
      "learning_rate": 4.504003855675734e-06,
      "loss": 1.0218,
      "step": 85
    },
    {
      "epoch": 0.07717593254251823,
      "grad_norm": 3.9492533196858313,
      "learning_rate": 4.436339015928335e-06,
      "loss": 0.9059,
      "step": 90
    },
    {
      "epoch": 0.0814634843504359,
      "grad_norm": 2.945729589644999,
      "learning_rate": 4.369455696423936e-06,
      "loss": 0.9862,
      "step": 95
    },
    {
      "epoch": 0.08575103615835358,
      "grad_norm": 2.8934728265120553,
      "learning_rate": 4.303348456106082e-06,
      "loss": 0.9869,
      "step": 100
    },
    {
      "epoch": 0.09003858796627126,
      "grad_norm": 3.6370231743108636,
      "learning_rate": 4.238011866604447e-06,
      "loss": 1.0137,
      "step": 105
    },
    {
      "epoch": 0.09432613977418894,
      "grad_norm": 2.8095030003011976,
      "learning_rate": 4.173440512264544e-06,
      "loss": 0.9834,
      "step": 110
    },
    {
      "epoch": 0.09861369158210662,
      "grad_norm": 2.5605385368574205,
      "learning_rate": 4.109628990177651e-06,
      "loss": 0.9828,
      "step": 115
    },
    {
      "epoch": 0.1029012433900243,
      "grad_norm": 3.328262187145733,
      "learning_rate": 4.04657191021096e-06,
      "loss": 0.9892,
      "step": 120
    },
    {
      "epoch": 0.10718879519794197,
      "grad_norm": 3.4906689890189835,
      "learning_rate": 3.984263895037921e-06,
      "loss": 0.9903,
      "step": 125
    },
    {
      "epoch": 0.11147634700585965,
      "grad_norm": 2.845690441710927,
      "learning_rate": 3.9226995801688165e-06,
      "loss": 1.1259,
      "step": 130
    },
    {
      "epoch": 0.11576389881377733,
      "grad_norm": 2.7267526391731334,
      "learning_rate": 3.8618736139815605e-06,
      "loss": 0.9734,
      "step": 135
    },
    {
      "epoch": 0.12005145062169502,
      "grad_norm": 2.062870391280079,
      "learning_rate": 3.8017806577526982e-06,
      "loss": 1.0515,
      "step": 140
    },
    {
      "epoch": 0.1243390024296127,
      "grad_norm": 2.814034294378353,
      "learning_rate": 3.7424153856886596e-06,
      "loss": 0.8449,
      "step": 145
    },
    {
      "epoch": 0.12862655423753036,
      "grad_norm": 3.0711652275263357,
      "learning_rate": 3.6837724849572175e-06,
      "loss": 0.8388,
      "step": 150
    },
    {
      "epoch": 0.13291410604544804,
      "grad_norm": 3.437400624778036,
      "learning_rate": 3.6258466557191907e-06,
      "loss": 1.1761,
      "step": 155
    },
    {
      "epoch": 0.13720165785336572,
      "grad_norm": 3.081997188340036,
      "learning_rate": 3.5686326111603775e-06,
      "loss": 1.0348,
      "step": 160
    },
    {
      "epoch": 0.13720165785336572,
      "eval_loss": 0.9355081915855408,
      "eval_runtime": 19.1453,
      "eval_samples_per_second": 10.446,
      "eval_steps_per_second": 2.612,
      "step": 160
    },
    {
      "epoch": 0.1414892096612834,
      "grad_norm": 2.8114117836276513,
      "learning_rate": 3.512125077523717e-06,
      "loss": 1.0048,
      "step": 165
    },
    {
      "epoch": 0.1457767614692011,
      "grad_norm": 2.707619981442126,
      "learning_rate": 3.456318794141709e-06,
      "loss": 0.8126,
      "step": 170
    },
    {
      "epoch": 0.15006431327711878,
      "grad_norm": 2.6357116905007856,
      "learning_rate": 3.4012085134690555e-06,
      "loss": 0.8684,
      "step": 175
    },
    {
      "epoch": 0.15435186508503645,
      "grad_norm": 2.902285405452677,
      "learning_rate": 3.346789001115556e-06,
      "loss": 1.023,
      "step": 180
    },
    {
      "epoch": 0.15863941689295413,
      "grad_norm": 3.1470604950987546,
      "learning_rate": 3.293055035879266e-06,
      "loss": 0.979,
      "step": 185
    },
    {
      "epoch": 0.1629269687008718,
      "grad_norm": 2.429084645315808,
      "learning_rate": 3.2400014097798766e-06,
      "loss": 0.9414,
      "step": 190
    },
    {
      "epoch": 0.1672145205087895,
      "grad_norm": 2.6607376091673283,
      "learning_rate": 3.187622928092377e-06,
      "loss": 1.0719,
      "step": 195
    },
    {
      "epoch": 0.17150207231670717,
      "grad_norm": 3.064466109382867,
      "learning_rate": 3.1359144093809653e-06,
      "loss": 0.9007,
      "step": 200
    },
    {
      "epoch": 0.17578962412462484,
      "grad_norm": 3.3202976410761815,
      "learning_rate": 3.084870685533213e-06,
      "loss": 1.0225,
      "step": 205
    },
    {
      "epoch": 0.18007717593254252,
      "grad_norm": 9.376729410524437,
      "learning_rate": 3.034486601794506e-06,
      "loss": 0.8927,
      "step": 210
    },
    {
      "epoch": 0.1843647277404602,
      "grad_norm": 4.24264134885146,
      "learning_rate": 2.984757016802754e-06,
      "loss": 0.8904,
      "step": 215
    },
    {
      "epoch": 0.18865227954837788,
      "grad_norm": 3.401996163001482,
      "learning_rate": 2.935676802623359e-06,
      "loss": 0.89,
      "step": 220
    },
    {
      "epoch": 0.19293983135629555,
      "grad_norm": 2.811911382062161,
      "learning_rate": 2.887240844784472e-06,
      "loss": 1.0528,
      "step": 225
    },
    {
      "epoch": 0.19722738316421323,
      "grad_norm": 6.117694360086105,
      "learning_rate": 2.8394440423125295e-06,
      "loss": 0.957,
      "step": 230
    },
    {
      "epoch": 0.2015149349721309,
      "grad_norm": 2.5418193456876024,
      "learning_rate": 2.7922813077680564e-06,
      "loss": 0.8757,
      "step": 235
    },
    {
      "epoch": 0.2058024867800486,
      "grad_norm": 2.980094551138022,
      "learning_rate": 2.7457475672817817e-06,
      "loss": 0.7212,
      "step": 240
    },
    {
      "epoch": 0.2058024867800486,
      "eval_loss": 0.9225141406059265,
      "eval_runtime": 19.142,
      "eval_samples_per_second": 10.448,
      "eval_steps_per_second": 2.612,
      "step": 240
    },
    {
      "epoch": 0.21009003858796627,
      "grad_norm": 2.8229472248870113,
      "learning_rate": 2.6998377605910153e-06,
      "loss": 0.7847,
      "step": 245
    },
    {
      "epoch": 0.21437759039588394,
      "grad_norm": 3.313576806878507,
      "learning_rate": 2.6545468410763466e-06,
      "loss": 0.7885,
      "step": 250
    },
    {
      "epoch": 0.21866514220380162,
      "grad_norm": 2.7546057206480863,
      "learning_rate": 2.60986977579862e-06,
      "loss": 0.9853,
      "step": 255
    },
    {
      "epoch": 0.2229526940117193,
      "grad_norm": 2.62325989569962,
      "learning_rate": 2.565801545536224e-06,
      "loss": 0.8693,
      "step": 260
    },
    {
      "epoch": 0.22724024581963698,
      "grad_norm": 2.494330977302947,
      "learning_rate": 2.52233714482269e-06,
      "loss": 0.8483,
      "step": 265
    },
    {
      "epoch": 0.23152779762755465,
      "grad_norm": 3.0508796153780815,
      "learning_rate": 2.479471581984588e-06,
      "loss": 0.9198,
      "step": 270
    },
    {
      "epoch": 0.23581534943547233,
      "grad_norm": 2.8596047579477077,
      "learning_rate": 2.437199879179748e-06,
      "loss": 0.8111,
      "step": 275
    },
    {
      "epoch": 0.24010290124339004,
      "grad_norm": 2.9634105907172676,
      "learning_rate": 2.395517072435804e-06,
      "loss": 0.9249,
      "step": 280
    },
    {
      "epoch": 0.24439045305130772,
      "grad_norm": 2.876075717907315,
      "learning_rate": 2.35441821168905e-06,
      "loss": 0.8466,
      "step": 285
    },
    {
      "epoch": 0.2486780048592254,
      "grad_norm": 2.707429407550077,
      "learning_rate": 2.313898360823632e-06,
      "loss": 0.8335,
      "step": 290
    },
    {
      "epoch": 0.25296555666714304,
      "grad_norm": 2.424659422914543,
      "learning_rate": 2.2739525977110787e-06,
      "loss": 0.9591,
      "step": 295
    },
    {
      "epoch": 0.2572531084750607,
      "grad_norm": 2.3503862613323054,
      "learning_rate": 2.234576014250154e-06,
      "loss": 0.8867,
      "step": 300
    },
    {
      "epoch": 0.2615406602829784,
      "grad_norm": 2.487930933633346,
      "learning_rate": 2.195763716407068e-06,
      "loss": 0.9301,
      "step": 305
    },
    {
      "epoch": 0.2658282120908961,
      "grad_norm": 2.9951853980801206,
      "learning_rate": 2.15751082425603e-06,
      "loss": 0.8933,
      "step": 310
    },
    {
      "epoch": 0.27011576389881375,
      "grad_norm": 2.6479548343780044,
      "learning_rate": 2.119812472020151e-06,
      "loss": 1.1902,
      "step": 315
    },
    {
      "epoch": 0.27440331570673143,
      "grad_norm": 3.031373376327646,
      "learning_rate": 2.082663808112706e-06,
      "loss": 0.8077,
      "step": 320
    },
    {
      "epoch": 0.27440331570673143,
      "eval_loss": 0.9141628742218018,
      "eval_runtime": 19.1329,
      "eval_samples_per_second": 10.453,
      "eval_steps_per_second": 2.613,
      "step": 320
    },
    {
      "epoch": 0.2786908675146491,
      "grad_norm": 3.8925204318472777,
      "learning_rate": 2.0460599951787676e-06,
      "loss": 1.003,
      "step": 325
    },
    {
      "epoch": 0.2829784193225668,
      "grad_norm": 2.8374998009671213,
      "learning_rate": 2.0099962101371912e-06,
      "loss": 0.9517,
      "step": 330
    },
    {
      "epoch": 0.2872659711304845,
      "grad_norm": 2.7039222330267725,
      "learning_rate": 1.9744676442229893e-06,
      "loss": 0.8129,
      "step": 335
    },
    {
      "epoch": 0.2915535229384022,
      "grad_norm": 3.5945398321446014,
      "learning_rate": 1.9394695030300814e-06,
      "loss": 0.9616,
      "step": 340
    },
    {
      "epoch": 0.2958410747463199,
      "grad_norm": 4.022714585976749,
      "learning_rate": 1.904997006554424e-06,
      "loss": 0.9347,
      "step": 345
    },
    {
      "epoch": 0.30012862655423755,
      "grad_norm": 2.449025495886818,
      "learning_rate": 1.8710453892375372e-06,
      "loss": 0.9365,
      "step": 350
    },
    {
      "epoch": 0.30441617836215523,
      "grad_norm": 1.97102566822208,
      "learning_rate": 1.837609900010423e-06,
      "loss": 1.0678,
      "step": 355
    },
    {
      "epoch": 0.3087037301700729,
      "grad_norm": 2.9794081240205292,
      "learning_rate": 1.8046858023378982e-06,
      "loss": 0.8233,
      "step": 360
    },
    {
      "epoch": 0.3129912819779906,
      "grad_norm": 2.623718879014175,
      "learning_rate": 1.7722683742633203e-06,
      "loss": 1.089,
      "step": 365
    },
    {
      "epoch": 0.31727883378590827,
      "grad_norm": 3.1393160530750808,
      "learning_rate": 1.7403529084537394e-06,
      "loss": 0.8982,
      "step": 370
    },
    {
      "epoch": 0.32156638559382594,
      "grad_norm": 3.5048967956402506,
      "learning_rate": 1.708934712245474e-06,
      "loss": 0.8602,
      "step": 375
    },
    {
      "epoch": 0.3258539374017436,
      "grad_norm": 2.731824579589462,
      "learning_rate": 1.6780091076901074e-06,
      "loss": 0.9206,
      "step": 380
    },
    {
      "epoch": 0.3301414892096613,
      "grad_norm": 2.6508962987997164,
      "learning_rate": 1.6475714316009228e-06,
      "loss": 0.9,
      "step": 385
    },
    {
      "epoch": 0.334429041017579,
      "grad_norm": 3.3563303629070194,
      "learning_rate": 1.6176170355997885e-06,
      "loss": 0.9077,
      "step": 390
    },
    {
      "epoch": 0.33871659282549665,
      "grad_norm": 2.510811889533464,
      "learning_rate": 1.5881412861644783e-06,
      "loss": 0.8796,
      "step": 395
    },
    {
      "epoch": 0.34300414463341433,
      "grad_norm": 2.7025767408039423,
      "learning_rate": 1.559139564676459e-06,
      "loss": 0.8442,
      "step": 400
    },
    {
      "epoch": 0.34300414463341433,
      "eval_loss": 0.9068123698234558,
      "eval_runtime": 19.1503,
      "eval_samples_per_second": 10.444,
      "eval_steps_per_second": 2.611,
      "step": 400
    },
    {
      "epoch": 0.347291696441332,
      "grad_norm": 2.3723461526691687,
      "learning_rate": 1.530607267469142e-06,
      "loss": 0.8776,
      "step": 405
    },
    {
      "epoch": 0.3515792482492497,
      "grad_norm": 2.326426360163379,
      "learning_rate": 1.502539805876598e-06,
      "loss": 0.8368,
      "step": 410
    },
    {
      "epoch": 0.35586680005716737,
      "grad_norm": 2.609797589938526,
      "learning_rate": 1.4749326062827624e-06,
      "loss": 0.8314,
      "step": 415
    },
    {
      "epoch": 0.36015435186508504,
      "grad_norm": 3.018034878541378,
      "learning_rate": 1.447781110171118e-06,
      "loss": 0.8639,
      "step": 420
    },
    {
      "epoch": 0.3644419036730027,
      "grad_norm": 2.9665919577118394,
      "learning_rate": 1.421080774174884e-06,
      "loss": 0.8738,
      "step": 425
    },
    {
      "epoch": 0.3687294554809204,
      "grad_norm": 2.7395052466656082,
      "learning_rate": 1.3948270701276901e-06,
      "loss": 0.8124,
      "step": 430
    },
    {
      "epoch": 0.3730170072888381,
      "grad_norm": 2.4814426410417547,
      "learning_rate": 1.3690154851147892e-06,
      "loss": 0.8018,
      "step": 435
    },
    {
      "epoch": 0.37730455909675575,
      "grad_norm": 2.9039262323143697,
      "learning_rate": 1.3436415215247681e-06,
      "loss": 0.7621,
      "step": 440
    },
    {
      "epoch": 0.38159211090467343,
      "grad_norm": 2.776860383566422,
      "learning_rate": 1.3187006971017973e-06,
      "loss": 0.9082,
      "step": 445
    },
    {
      "epoch": 0.3858796627125911,
      "grad_norm": 2.1181970553123257,
      "learning_rate": 1.2941885449984206e-06,
      "loss": 0.8821,
      "step": 450
    },
    {
      "epoch": 0.3901672145205088,
      "grad_norm": 2.5660032110684767,
      "learning_rate": 1.2701006138288915e-06,
      "loss": 0.9639,
      "step": 455
    },
    {
      "epoch": 0.39445476632842646,
      "grad_norm": 4.632813592554,
      "learning_rate": 1.2464324677230652e-06,
      "loss": 0.9052,
      "step": 460
    },
    {
      "epoch": 0.39874231813634414,
      "grad_norm": 3.394837987828098,
      "learning_rate": 1.22317968638086e-06,
      "loss": 0.8723,
      "step": 465
    },
    {
      "epoch": 0.4030298699442618,
      "grad_norm": 2.716611388027132,
      "learning_rate": 1.2003378651272997e-06,
      "loss": 0.8457,
      "step": 470
    },
    {
      "epoch": 0.4073174217521795,
      "grad_norm": 2.1520375204146625,
      "learning_rate": 1.1779026149681347e-06,
      "loss": 0.9502,
      "step": 475
    },
    {
      "epoch": 0.4116049735600972,
      "grad_norm": 2.7552080232688696,
      "learning_rate": 1.155869562646071e-06,
      "loss": 0.831,
      "step": 480
    },
    {
      "epoch": 0.4116049735600972,
      "eval_loss": 0.9032189846038818,
      "eval_runtime": 19.1348,
      "eval_samples_per_second": 10.452,
      "eval_steps_per_second": 2.613,
      "step": 480
    },
    {
      "epoch": 0.41589252536801485,
      "grad_norm": 2.0689491545152663,
      "learning_rate": 1.1342343506976044e-06,
      "loss": 0.8338,
      "step": 485
    },
    {
      "epoch": 0.42018007717593253,
      "grad_norm": 3.2134570774473494,
      "learning_rate": 1.1129926375104734e-06,
      "loss": 0.8289,
      "step": 490
    },
    {
      "epoch": 0.4244676289838502,
      "grad_norm": 2.822597617038844,
      "learning_rate": 1.0921400973817452e-06,
      "loss": 0.9218,
      "step": 495
    },
    {
      "epoch": 0.4287551807917679,
      "grad_norm": 3.1918997161652003,
      "learning_rate": 1.071672420576549e-06,
      "loss": 0.8467,
      "step": 500
    },
    {
      "epoch": 0.43304273259968556,
      "grad_norm": 3.0230981418285885,
      "learning_rate": 1.051585313387455e-06,
      "loss": 0.8871,
      "step": 505
    },
    {
      "epoch": 0.43733028440760324,
      "grad_norm": 2.345223930068171,
      "learning_rate": 1.0318744981945303e-06,
      "loss": 1.1172,
      "step": 510
    },
    {
      "epoch": 0.4416178362155209,
      "grad_norm": 2.23873439529607,
      "learning_rate": 1.0125357135260713e-06,
      "loss": 1.0281,
      "step": 515
    },
    {
      "epoch": 0.4459053880234386,
      "grad_norm": 2.17521388870601,
      "learning_rate": 9.93564714120029e-07,
      "loss": 0.9027,
      "step": 520
    },
    {
      "epoch": 0.4501929398313563,
      "grad_norm": 2.805982076338286,
      "learning_rate": 9.749572709861371e-07,
      "loss": 0.8424,
      "step": 525
    },
    {
      "epoch": 0.45448049163927395,
      "grad_norm": 3.076865463356093,
      "learning_rate": 9.5670917146877e-07,
      "loss": 1.0317,
      "step": 530
    },
    {
      "epoch": 0.45876804344719163,
      "grad_norm": 2.351075063586948,
      "learning_rate": 9.38816219310525e-07,
      "loss": 0.9455,
      "step": 535
    },
    {
      "epoch": 0.4630555952551093,
      "grad_norm": 3.042494238619368,
      "learning_rate": 9.212742347165562e-07,
      "loss": 0.914,
      "step": 540
    },
    {
      "epoch": 0.467343147063027,
      "grad_norm": 2.51152797954818,
      "learning_rate": 9.040790544196722e-07,
      "loss": 0.8507,
      "step": 545
    },
    {
      "epoch": 0.47163069887094466,
      "grad_norm": 2.2530854319732647,
      "learning_rate": 8.872265317462145e-07,
      "loss": 0.7923,
      "step": 550
    },
    {
      "epoch": 0.47591825067886234,
      "grad_norm": 2.4496883783363876,
      "learning_rate": 8.707125366827236e-07,
      "loss": 0.8219,
      "step": 555
    },
    {
      "epoch": 0.4802058024867801,
      "grad_norm": 2.85322554374369,
      "learning_rate": 8.54532955943421e-07,
      "loss": 0.9696,
      "step": 560
    },
    {
      "epoch": 0.4802058024867801,
      "eval_loss": 0.8974508047103882,
      "eval_runtime": 19.0314,
      "eval_samples_per_second": 10.509,
      "eval_steps_per_second": 2.627,
      "step": 560
    },
    {
      "epoch": 0.48449335429469775,
      "grad_norm": 2.657892610245309,
      "learning_rate": 8.386836930385165e-07,
      "loss": 0.8421,
      "step": 565
    },
    {
      "epoch": 0.48878090610261543,
      "grad_norm": 2.602711920187755,
      "learning_rate": 8.231606683433569e-07,
      "loss": 0.8295,
      "step": 570
    },
    {
      "epoch": 0.4930684579105331,
      "grad_norm": 2.1213399589470616,
      "learning_rate": 8.079598191684382e-07,
      "loss": 0.8803,
      "step": 575
    },
    {
      "epoch": 0.4973560097184508,
      "grad_norm": 2.244791995121278,
      "learning_rate": 7.930770998302978e-07,
      "loss": 0.8343,
      "step": 580
    },
    {
      "epoch": 0.5016435615263685,
      "grad_norm": 2.566423079642009,
      "learning_rate": 7.785084817233009e-07,
      "loss": 0.8736,
      "step": 585
    },
    {
      "epoch": 0.5059311133342861,
      "grad_norm": 2.3039665634657487,
      "learning_rate": 7.642499533923466e-07,
      "loss": 0.8449,
      "step": 590
    },
    {
      "epoch": 0.5102186651422038,
      "grad_norm": 2.6413196407680095,
      "learning_rate": 7.502975206065118e-07,
      "loss": 0.9765,
      "step": 595
    },
    {
      "epoch": 0.5145062169501214,
      "grad_norm": 2.6494694608389806,
      "learning_rate": 7.366472064336485e-07,
      "loss": 1.041,
      "step": 600
    },
    {
      "epoch": 0.5187937687580392,
      "grad_norm": 2.777084503723626,
      "learning_rate": 7.232950513159591e-07,
      "loss": 0.7932,
      "step": 605
    },
    {
      "epoch": 0.5230813205659568,
      "grad_norm": 2.487662762486022,
      "learning_rate": 7.102371131465766e-07,
      "loss": 0.8704,
      "step": 610
    },
    {
      "epoch": 0.5273688723738745,
      "grad_norm": 2.55250344248028,
      "learning_rate": 6.974694673471581e-07,
      "loss": 0.8656,
      "step": 615
    },
    {
      "epoch": 0.5316564241817922,
      "grad_norm": 3.215607059371808,
      "learning_rate": 6.849882069465297e-07,
      "loss": 0.8775,
      "step": 620
    },
    {
      "epoch": 0.5359439759897099,
      "grad_norm": 2.663679299659993,
      "learning_rate": 6.727894426603947e-07,
      "loss": 0.9558,
      "step": 625
    },
    {
      "epoch": 0.5402315277976275,
      "grad_norm": 1.8863771094416066,
      "learning_rate": 6.608693029721375e-07,
      "loss": 1.0649,
      "step": 630
    },
    {
      "epoch": 0.5445190796055452,
      "grad_norm": 2.3166038184540345,
      "learning_rate": 6.492239342147439e-07,
      "loss": 0.9116,
      "step": 635
    },
    {
      "epoch": 0.5488066314134629,
      "grad_norm": 2.451365227453816,
      "learning_rate": 6.378495006538639e-07,
      "loss": 0.9949,
      "step": 640
    },
    {
      "epoch": 0.5488066314134629,
      "eval_loss": 0.8972450494766235,
      "eval_runtime": 19.0746,
      "eval_samples_per_second": 10.485,
      "eval_steps_per_second": 2.621,
      "step": 640
    },
    {
      "epoch": 0.5530941832213806,
      "grad_norm": 2.431525558235171,
      "learning_rate": 6.267421845720492e-07,
      "loss": 0.9208,
      "step": 645
    },
    {
      "epoch": 0.5573817350292982,
      "grad_norm": 3.025842320604373,
      "learning_rate": 6.158981863541822e-07,
      "loss": 0.8782,
      "step": 650
    },
    {
      "epoch": 0.561669286837216,
      "grad_norm": 2.713593393993989,
      "learning_rate": 6.053137245741388e-07,
      "loss": 0.8848,
      "step": 655
    },
    {
      "epoch": 0.5659568386451336,
      "grad_norm": 2.364400143996411,
      "learning_rate": 5.949850360827048e-07,
      "loss": 0.8214,
      "step": 660
    },
    {
      "epoch": 0.5702443904530513,
      "grad_norm": 3.1249075781445654,
      "learning_rate": 5.849083760967786e-07,
      "loss": 0.9232,
      "step": 665
    },
    {
      "epoch": 0.574531942260969,
      "grad_norm": 2.371939438179526,
      "learning_rate": 5.750800182898949e-07,
      "loss": 0.9599,
      "step": 670
    },
    {
      "epoch": 0.5788194940688867,
      "grad_norm": 2.657320971034355,
      "learning_rate": 5.654962548840998e-07,
      "loss": 0.957,
      "step": 675
    },
    {
      "epoch": 0.5831070458768044,
      "grad_norm": 1.9610510581627134,
      "learning_rate": 5.561533967432111e-07,
      "loss": 0.8544,
      "step": 680
    },
    {
      "epoch": 0.587394597684722,
      "grad_norm": 1.9590021785129759,
      "learning_rate": 5.470477734675006e-07,
      "loss": 0.9022,
      "step": 685
    },
    {
      "epoch": 0.5916821494926398,
      "grad_norm": 3.0946081747543532,
      "learning_rate": 5.381757334898326e-07,
      "loss": 0.8607,
      "step": 690
    },
    {
      "epoch": 0.5959697013005574,
      "grad_norm": 2.6523600890631447,
      "learning_rate": 5.295336441732995e-07,
      "loss": 0.9483,
      "step": 695
    },
    {
      "epoch": 0.6002572531084751,
      "grad_norm": 2.6162593545931943,
      "learning_rate": 5.211178919103908e-07,
      "loss": 0.8484,
      "step": 700
    },
    {
      "epoch": 0.6045448049163927,
      "grad_norm": 2.7584028741566935,
      "learning_rate": 5.129248822237369e-07,
      "loss": 0.8153,
      "step": 705
    },
    {
      "epoch": 0.6088323567243105,
      "grad_norm": 2.2177368688014263,
      "learning_rate": 5.049510398684706e-07,
      "loss": 0.8865,
      "step": 710
    },
    {
      "epoch": 0.6131199085322281,
      "grad_norm": 2.7385320493457175,
      "learning_rate": 4.97192808936251e-07,
      "loss": 0.8288,
      "step": 715
    },
    {
      "epoch": 0.6174074603401458,
      "grad_norm": 2.969022490845044,
      "learning_rate": 4.896466529609901e-07,
      "loss": 0.8154,
      "step": 720
    },
    {
      "epoch": 0.6174074603401458,
      "eval_loss": 0.8948369026184082,
      "eval_runtime": 19.0474,
      "eval_samples_per_second": 10.5,
      "eval_steps_per_second": 2.625,
      "step": 720
    },
    {
      "epoch": 0.6216950121480634,
      "grad_norm": 2.4080414960336576,
      "learning_rate": 4.823090550263373e-07,
      "loss": 0.8132,
      "step": 725
    },
    {
      "epoch": 0.6259825639559812,
      "grad_norm": 2.4062548632108305,
      "learning_rate": 4.7517651787496393e-07,
      "loss": 0.8563,
      "step": 730
    },
    {
      "epoch": 0.6302701157638988,
      "grad_norm": 2.4974176263678145,
      "learning_rate": 4.682455640197012e-07,
      "loss": 0.83,
      "step": 735
    },
    {
      "epoch": 0.6345576675718165,
      "grad_norm": 2.782934403189113,
      "learning_rate": 4.6151273585658703e-07,
      "loss": 0.7722,
      "step": 740
    },
    {
      "epoch": 0.6388452193797342,
      "grad_norm": 3.589995905013926,
      "learning_rate": 4.549745957798709e-07,
      "loss": 0.9013,
      "step": 745
    },
    {
      "epoch": 0.6431327711876519,
      "grad_norm": 1.5164412373226497,
      "learning_rate": 4.486277262990414e-07,
      "loss": 0.8061,
      "step": 750
    },
    {
      "epoch": 0.6474203229955695,
      "grad_norm": 2.4822941257221496,
      "learning_rate": 4.424687301579312e-07,
      "loss": 0.8018,
      "step": 755
    },
    {
      "epoch": 0.6517078748034872,
      "grad_norm": 2.3163460863010044,
      "learning_rate": 4.364942304559619e-07,
      "loss": 0.97,
      "step": 760
    },
    {
      "epoch": 0.6559954266114049,
      "grad_norm": 2.5709552598683048,
      "learning_rate": 4.307008707715972e-07,
      "loss": 0.8424,
      "step": 765
    },
    {
      "epoch": 0.6602829784193226,
      "grad_norm": 2.97580695190474,
      "learning_rate": 4.250853152880694e-07,
      "loss": 0.9495,
      "step": 770
    },
    {
      "epoch": 0.6645705302272402,
      "grad_norm": 2.780816600415984,
      "learning_rate": 4.1964424892144836e-07,
      "loss": 0.9217,
      "step": 775
    },
    {
      "epoch": 0.668858082035158,
      "grad_norm": 3.106789897489022,
      "learning_rate": 4.143743774511321e-07,
      "loss": 0.7872,
      "step": 780
    },
    {
      "epoch": 0.6731456338430756,
      "grad_norm": 2.6180834213520425,
      "learning_rate": 4.0927242765282987e-07,
      "loss": 0.9516,
      "step": 785
    },
    {
      "epoch": 0.6774331856509933,
      "grad_norm": 3.3693090699327675,
      "learning_rate": 4.043351474341226e-07,
      "loss": 0.918,
      "step": 790
    },
    {
      "epoch": 0.6817207374589109,
      "grad_norm": 2.6455267685583292,
      "learning_rate": 3.995593059726841e-07,
      "loss": 0.8869,
      "step": 795
    },
    {
      "epoch": 0.6860082892668287,
      "grad_norm": 2.6976779093683985,
      "learning_rate": 3.9494169385724855e-07,
      "loss": 0.9682,
      "step": 800
    },
    {
      "epoch": 0.6860082892668287,
      "eval_loss": 0.8931262493133545,
      "eval_runtime": 19.039,
      "eval_samples_per_second": 10.505,
      "eval_steps_per_second": 2.626,
      "step": 800
    },
    {
      "epoch": 0.6902958410747463,
      "grad_norm": 2.73935107332009,
      "learning_rate": 3.904791232314207e-07,
      "loss": 0.9497,
      "step": 805
    },
    {
      "epoch": 0.694583392882664,
      "grad_norm": 2.594513781497614,
      "learning_rate": 3.861684279404229e-07,
      "loss": 0.9616,
      "step": 810
    },
    {
      "epoch": 0.6988709446905816,
      "grad_norm": 2.2189306808012694,
      "learning_rate": 3.820064636808802e-07,
      "loss": 1.0817,
      "step": 815
    },
    {
      "epoch": 0.7031584964984994,
      "grad_norm": 2.636673641067638,
      "learning_rate": 3.7799010815375107e-07,
      "loss": 0.9003,
      "step": 820
    },
    {
      "epoch": 0.707446048306417,
      "grad_norm": 2.074697655719305,
      "learning_rate": 3.741162612205157e-07,
      "loss": 1.1193,
      "step": 825
    },
    {
      "epoch": 0.7117336001143347,
      "grad_norm": 2.4627913841496425,
      "learning_rate": 3.703818450627364e-07,
      "loss": 0.8603,
      "step": 830
    },
    {
      "epoch": 0.7160211519222524,
      "grad_norm": 2.4468198599625044,
      "learning_rate": 3.6678380434511737e-07,
      "loss": 1.012,
      "step": 835
    },
    {
      "epoch": 0.7203087037301701,
      "grad_norm": 2.231882643368985,
      "learning_rate": 3.6331910638219e-07,
      "loss": 1.0174,
      "step": 840
    },
    {
      "epoch": 0.7245962555380877,
      "grad_norm": 3.0961788514759374,
      "learning_rate": 3.5998474130876195e-07,
      "loss": 0.982,
      "step": 845
    },
    {
      "epoch": 0.7288838073460054,
      "grad_norm": 2.699948117104775,
      "learning_rate": 3.567777222542744e-07,
      "loss": 0.9954,
      "step": 850
    },
    {
      "epoch": 0.7331713591539231,
      "grad_norm": 2.0352300295309815,
      "learning_rate": 3.536950855212165e-07,
      "loss": 0.9349,
      "step": 855
    },
    {
      "epoch": 0.7374589109618408,
      "grad_norm": 2.3439290267355664,
      "learning_rate": 3.5073389076775976e-07,
      "loss": 1.1525,
      "step": 860
    },
    {
      "epoch": 0.7417464627697584,
      "grad_norm": 1.8013900413258555,
      "learning_rate": 3.4789122119478064e-07,
      "loss": 1.0404,
      "step": 865
    },
    {
      "epoch": 0.7460340145776762,
      "grad_norm": 2.432177798756092,
      "learning_rate": 3.451641837374477e-07,
      "loss": 1.1018,
      "step": 870
    },
    {
      "epoch": 0.7503215663855938,
      "grad_norm": 2.339931147492053,
      "learning_rate": 3.425499092615662e-07,
      "loss": 1.0088,
      "step": 875
    },
    {
      "epoch": 0.7546091181935115,
      "grad_norm": 2.865649768086174,
      "learning_rate": 3.4004555276487614e-07,
      "loss": 0.9491,
      "step": 880
    },
    {
      "epoch": 0.7546091181935115,
      "eval_loss": 0.9030270576477051,
      "eval_runtime": 19.0612,
      "eval_samples_per_second": 10.493,
      "eval_steps_per_second": 2.623,
      "step": 880
    },
    {
      "epoch": 0.7588966700014291,
      "grad_norm": 1.9608521922901478,
      "learning_rate": 3.376482935835186e-07,
      "loss": 1.0566,
      "step": 885
    },
    {
      "epoch": 0.7631842218093469,
      "grad_norm": 2.001405740517041,
      "learning_rate": 3.3535533560389486e-07,
      "loss": 1.0203,
      "step": 890
    },
    {
      "epoch": 0.7674717736172646,
      "grad_norm": 3.114574828020116,
      "learning_rate": 3.331639074801548e-07,
      "loss": 0.9489,
      "step": 895
    },
    {
      "epoch": 0.7717593254251822,
      "grad_norm": 2.3092110155282985,
      "learning_rate": 3.3107126285757256e-07,
      "loss": 0.9057,
      "step": 900
    },
    {
      "epoch": 0.7760468772331,
      "grad_norm": 2.714773467015402,
      "learning_rate": 3.290746806020766e-07,
      "loss": 0.9441,
      "step": 905
    },
    {
      "epoch": 0.7803344290410176,
      "grad_norm": 3.186486155591397,
      "learning_rate": 3.271714650362233e-07,
      "loss": 1.0213,
      "step": 910
    },
    {
      "epoch": 0.7846219808489353,
      "grad_norm": 2.5861267874594174,
      "learning_rate": 3.2535894618192186e-07,
      "loss": 0.9689,
      "step": 915
    },
    {
      "epoch": 0.7889095326568529,
      "grad_norm": 3.826794117846477,
      "learning_rate": 3.236344800102375e-07,
      "loss": 0.8944,
      "step": 920
    },
    {
      "epoch": 0.7931970844647707,
      "grad_norm": 2.7462154972645814,
      "learning_rate": 3.2199544869862547e-07,
      "loss": 1.0624,
      "step": 925
    },
    {
      "epoch": 0.7974846362726883,
      "grad_norm": 2.8358584012860777,
      "learning_rate": 3.204392608959703e-07,
      "loss": 0.9327,
      "step": 930
    },
    {
      "epoch": 0.801772188080606,
      "grad_norm": 2.101011305773841,
      "learning_rate": 3.1896335199583363e-07,
      "loss": 0.9159,
      "step": 935
    },
    {
      "epoch": 0.8060597398885236,
      "grad_norm": 2.5833966453921198,
      "learning_rate": 3.1756518441834397e-07,
      "loss": 1.0248,
      "step": 940
    },
    {
      "epoch": 0.8103472916964414,
      "grad_norm": 6.2684841674997775,
      "learning_rate": 3.151927901941027e-07,
      "loss": 1.0413,
      "step": 945
    },
    {
      "epoch": 0.814634843504359,
      "grad_norm": 2.276372461235212,
      "learning_rate": 3.1032417503134016e-07,
      "loss": 1.1134,
      "step": 950
    },
    {
      "epoch": 0.8189223953122767,
      "grad_norm": 2.6812797606926555,
      "learning_rate": 3.054555598685777e-07,
      "loss": 0.9336,
      "step": 955
    },
    {
      "epoch": 0.8232099471201944,
      "grad_norm": 2.417389224635538,
      "learning_rate": 3.0058694470581514e-07,
      "loss": 1.068,
      "step": 960
    },
    {
      "epoch": 0.8232099471201944,
      "eval_loss": 0.9022178053855896,
      "eval_runtime": 19.0566,
      "eval_samples_per_second": 10.495,
      "eval_steps_per_second": 2.624,
      "step": 960
    },
    {
      "epoch": 0.8274974989281121,
      "grad_norm": 2.503558384043619,
      "learning_rate": 2.9571832954305265e-07,
      "loss": 1.0307,
      "step": 965
    },
    {
      "epoch": 0.8317850507360297,
      "grad_norm": 2.249581998288759,
      "learning_rate": 2.9084971438029017e-07,
      "loss": 0.9962,
      "step": 970
    },
    {
      "epoch": 0.8360726025439474,
      "grad_norm": 3.875121879524354,
      "learning_rate": 2.8598109921752763e-07,
      "loss": 1.0143,
      "step": 975
    },
    {
      "epoch": 0.8403601543518651,
      "grad_norm": 1.730446489555693,
      "learning_rate": 2.8111248405476515e-07,
      "loss": 0.9869,
      "step": 980
    },
    {
      "epoch": 0.8446477061597828,
      "grad_norm": 3.3232425457372745,
      "learning_rate": 2.7624386889200266e-07,
      "loss": 1.0683,
      "step": 985
    },
    {
      "epoch": 0.8489352579677004,
      "grad_norm": 2.4807364391304225,
      "learning_rate": 2.713752537292401e-07,
      "loss": 0.9797,
      "step": 990
    },
    {
      "epoch": 0.8532228097756182,
      "grad_norm": 2.3616731714440724,
      "learning_rate": 2.6650663856647764e-07,
      "loss": 1.0389,
      "step": 995
    },
    {
      "epoch": 0.8575103615835358,
      "grad_norm": 2.6062865385957514,
      "learning_rate": 2.616380234037151e-07,
      "loss": 1.0267,
      "step": 1000
    },
    {
      "epoch": 0.8617979133914535,
      "grad_norm": 2.907979106229624,
      "learning_rate": 2.567694082409526e-07,
      "loss": 0.9329,
      "step": 1005
    },
    {
      "epoch": 0.8660854651993711,
      "grad_norm": 2.6968962865268407,
      "learning_rate": 2.5190079307819013e-07,
      "loss": 0.9386,
      "step": 1010
    },
    {
      "epoch": 0.8703730170072889,
      "grad_norm": 2.21037904641155,
      "learning_rate": 2.4703217791542765e-07,
      "loss": 1.0709,
      "step": 1015
    },
    {
      "epoch": 0.8746605688152065,
      "grad_norm": 2.8255592337377915,
      "learning_rate": 2.421635627526651e-07,
      "loss": 0.8629,
      "step": 1020
    },
    {
      "epoch": 0.8789481206231242,
      "grad_norm": 2.6598508915895516,
      "learning_rate": 2.3729494758990262e-07,
      "loss": 0.9564,
      "step": 1025
    },
    {
      "epoch": 0.8832356724310418,
      "grad_norm": 2.5349227338549043,
      "learning_rate": 2.3242633242714014e-07,
      "loss": 0.9884,
      "step": 1030
    },
    {
      "epoch": 0.8875232242389596,
      "grad_norm": 1.8000674399648018,
      "learning_rate": 2.275577172643776e-07,
      "loss": 1.1607,
      "step": 1035
    },
    {
      "epoch": 0.8918107760468772,
      "grad_norm": 2.0150522007599654,
      "learning_rate": 2.2268910210161506e-07,
      "loss": 1.06,
      "step": 1040
    },
    {
      "epoch": 0.8918107760468772,
      "eval_loss": 0.9013447761535645,
      "eval_runtime": 19.0476,
      "eval_samples_per_second": 10.5,
      "eval_steps_per_second": 2.625,
      "step": 1040
    },
    {
      "epoch": 0.8960983278547949,
      "grad_norm": 2.714964092055051,
      "learning_rate": 2.1782048693885258e-07,
      "loss": 0.8408,
      "step": 1045
    },
    {
      "epoch": 0.9003858796627126,
      "grad_norm": 2.328572268378804,
      "learning_rate": 2.129518717760901e-07,
      "loss": 0.9585,
      "step": 1050
    },
    {
      "epoch": 0.9046734314706303,
      "grad_norm": 3.3294058341520847,
      "learning_rate": 2.0808325661332756e-07,
      "loss": 1.0169,
      "step": 1055
    },
    {
      "epoch": 0.9089609832785479,
      "grad_norm": 2.3567378214154453,
      "learning_rate": 2.0321464145056507e-07,
      "loss": 1.0411,
      "step": 1060
    },
    {
      "epoch": 0.9132485350864656,
      "grad_norm": 2.3516481223731702,
      "learning_rate": 1.9834602628780256e-07,
      "loss": 1.0731,
      "step": 1065
    },
    {
      "epoch": 0.9175360868943833,
      "grad_norm": 2.3183322496467484,
      "learning_rate": 1.934774111250401e-07,
      "loss": 0.9489,
      "step": 1070
    },
    {
      "epoch": 0.921823638702301,
      "grad_norm": 1.97960646392379,
      "learning_rate": 1.8860879596227757e-07,
      "loss": 1.0331,
      "step": 1075
    },
    {
      "epoch": 0.9261111905102186,
      "grad_norm": 2.3395284448987086,
      "learning_rate": 1.8374018079951505e-07,
      "loss": 0.7678,
      "step": 1080
    },
    {
      "epoch": 0.9303987423181364,
      "grad_norm": 1.945316009597557,
      "learning_rate": 1.788715656367526e-07,
      "loss": 1.0268,
      "step": 1085
    },
    {
      "epoch": 0.934686294126054,
      "grad_norm": 2.5392514762785603,
      "learning_rate": 1.7400295047399006e-07,
      "loss": 1.0499,
      "step": 1090
    },
    {
      "epoch": 0.9389738459339717,
      "grad_norm": 2.147497075881358,
      "learning_rate": 1.6913433531122752e-07,
      "loss": 0.8984,
      "step": 1095
    },
    {
      "epoch": 0.9432613977418893,
      "grad_norm": 2.3338579963123043,
      "learning_rate": 1.6426572014846504e-07,
      "loss": 1.0244,
      "step": 1100
    },
    {
      "epoch": 0.9475489495498071,
      "grad_norm": 2.272071646300021,
      "learning_rate": 1.5939710498570255e-07,
      "loss": 0.8251,
      "step": 1105
    },
    {
      "epoch": 0.9518365013577247,
      "grad_norm": 1.6609121532926487,
      "learning_rate": 1.5452848982294004e-07,
      "loss": 0.9794,
      "step": 1110
    },
    {
      "epoch": 0.9561240531656424,
      "grad_norm": 2.2563188492644493,
      "learning_rate": 1.4965987466017753e-07,
      "loss": 0.972,
      "step": 1115
    },
    {
      "epoch": 0.9604116049735602,
      "grad_norm": 2.667525192015302,
      "learning_rate": 1.4479125949741505e-07,
      "loss": 0.8302,
      "step": 1120
    },
    {
      "epoch": 0.9604116049735602,
      "eval_loss": 0.9002277851104736,
      "eval_runtime": 19.0592,
      "eval_samples_per_second": 10.494,
      "eval_steps_per_second": 2.623,
      "step": 1120
    },
    {
      "epoch": 0.9646991567814778,
      "grad_norm": 1.8891294277340498,
      "learning_rate": 1.3992264433465253e-07,
      "loss": 0.8849,
      "step": 1125
    },
    {
      "epoch": 0.9689867085893955,
      "grad_norm": 2.5578344667960127,
      "learning_rate": 1.3505402917189002e-07,
      "loss": 0.9591,
      "step": 1130
    },
    {
      "epoch": 0.9732742603973131,
      "grad_norm": 2.001959937555614,
      "learning_rate": 1.301854140091275e-07,
      "loss": 1.0343,
      "step": 1135
    },
    {
      "epoch": 0.9775618122052309,
      "grad_norm": 1.578569313456066,
      "learning_rate": 1.2531679884636503e-07,
      "loss": 1.0543,
      "step": 1140
    },
    {
      "epoch": 0.9818493640131485,
      "grad_norm": 2.0720069005801953,
      "learning_rate": 1.2044818368360252e-07,
      "loss": 0.8641,
      "step": 1145
    },
    {
      "epoch": 0.9861369158210662,
      "grad_norm": 1.8810386084158826,
      "learning_rate": 1.1557956852084e-07,
      "loss": 1.0326,
      "step": 1150
    },
    {
      "epoch": 0.9904244676289838,
      "grad_norm": 2.0266012589600613,
      "learning_rate": 1.1071095335807752e-07,
      "loss": 1.0289,
      "step": 1155
    },
    {
      "epoch": 0.9947120194369016,
      "grad_norm": 2.2518373465247357,
      "learning_rate": 1.0584233819531501e-07,
      "loss": 0.939,
      "step": 1160
    },
    {
      "epoch": 0.9989995712448192,
      "grad_norm": 2.7326466173600337,
      "learning_rate": 1.009737230325525e-07,
      "loss": 1.0486,
      "step": 1165
    },
    {
      "epoch": 0.9998570816064027,
      "step": 1166,
      "total_flos": 1.7681949461682586e+17,
      "train_loss": 0.944941889564946,
      "train_runtime": 15099.9882,
      "train_samples_per_second": 2.78,
      "train_steps_per_second": 0.077
    }
  ],
  "logging_steps": 5,
  "max_steps": 1166,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 1166,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.7681949461682586e+17,
  "train_batch_size": 3,
  "trial_name": null,
  "trial_params": null
}
|
|