|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 500, |
|
"global_step": 1682, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0011890606420927466, |
|
"grad_norm": 128.86566162109375, |
|
"learning_rate": 4.9970273483947685e-05, |
|
"loss": 9.559, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.034482758620689655, |
|
"grad_norm": 23.844635009765625, |
|
"learning_rate": 4.913793103448276e-05, |
|
"loss": 2.0043, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.06896551724137931, |
|
"grad_norm": 43.76908493041992, |
|
"learning_rate": 4.827586206896552e-05, |
|
"loss": 1.6394, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.10344827586206896, |
|
"grad_norm": 112.72496032714844, |
|
"learning_rate": 4.741379310344828e-05, |
|
"loss": 1.1869, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.13793103448275862, |
|
"grad_norm": 57.330135345458984, |
|
"learning_rate": 4.655172413793104e-05, |
|
"loss": 1.4015, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.1724137931034483, |
|
"grad_norm": 12.192914962768555, |
|
"learning_rate": 4.5689655172413794e-05, |
|
"loss": 1.1619, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.20689655172413793, |
|
"grad_norm": 49.972900390625, |
|
"learning_rate": 4.482758620689655e-05, |
|
"loss": 1.4928, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.2413793103448276, |
|
"grad_norm": 28.061901092529297, |
|
"learning_rate": 4.396551724137931e-05, |
|
"loss": 1.1286, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.27586206896551724, |
|
"grad_norm": 13.678406715393066, |
|
"learning_rate": 4.3103448275862066e-05, |
|
"loss": 0.9936, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.3103448275862069, |
|
"grad_norm": 56.66800308227539, |
|
"learning_rate": 4.224137931034483e-05, |
|
"loss": 1.1352, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.3448275862068966, |
|
"grad_norm": 18.401317596435547, |
|
"learning_rate": 4.1379310344827587e-05, |
|
"loss": 1.0754, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.3793103448275862, |
|
"grad_norm": 28.412200927734375, |
|
"learning_rate": 4.0517241379310344e-05, |
|
"loss": 1.0104, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.41379310344827586, |
|
"grad_norm": 62.137596130371094, |
|
"learning_rate": 3.965517241379311e-05, |
|
"loss": 0.9393, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.4482758620689655, |
|
"grad_norm": 44.91804504394531, |
|
"learning_rate": 3.8793103448275865e-05, |
|
"loss": 0.727, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.4827586206896552, |
|
"grad_norm": 15.308109283447266, |
|
"learning_rate": 3.793103448275862e-05, |
|
"loss": 0.8675, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.5172413793103449, |
|
"grad_norm": 11.947402000427246, |
|
"learning_rate": 3.7068965517241385e-05, |
|
"loss": 0.7525, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.5517241379310345, |
|
"grad_norm": 22.51788902282715, |
|
"learning_rate": 3.620689655172414e-05, |
|
"loss": 0.7872, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.5862068965517241, |
|
"grad_norm": 39.137386322021484, |
|
"learning_rate": 3.53448275862069e-05, |
|
"loss": 0.7889, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.6206896551724138, |
|
"grad_norm": 38.08049774169922, |
|
"learning_rate": 3.4482758620689657e-05, |
|
"loss": 0.7347, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.6551724137931034, |
|
"grad_norm": 10.072871208190918, |
|
"learning_rate": 3.3620689655172414e-05, |
|
"loss": 0.7422, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.6896551724137931, |
|
"grad_norm": 24.6478328704834, |
|
"learning_rate": 3.275862068965517e-05, |
|
"loss": 0.7217, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.7241379310344828, |
|
"grad_norm": 8.815550804138184, |
|
"learning_rate": 3.1896551724137935e-05, |
|
"loss": 0.767, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.7586206896551724, |
|
"grad_norm": 7.418780326843262, |
|
"learning_rate": 3.103448275862069e-05, |
|
"loss": 0.7365, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.7931034482758621, |
|
"grad_norm": 16.163270950317383, |
|
"learning_rate": 3.017241379310345e-05, |
|
"loss": 0.6203, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 0.8275862068965517, |
|
"grad_norm": 47.155818939208984, |
|
"learning_rate": 2.9310344827586206e-05, |
|
"loss": 0.7505, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.8620689655172413, |
|
"grad_norm": 17.693836212158203, |
|
"learning_rate": 2.844827586206897e-05, |
|
"loss": 0.6014, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.896551724137931, |
|
"grad_norm": 15.081289291381836, |
|
"learning_rate": 2.7586206896551727e-05, |
|
"loss": 0.5907, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.9310344827586207, |
|
"grad_norm": 235.15663146972656, |
|
"learning_rate": 2.672413793103448e-05, |
|
"loss": 0.5196, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 0.9655172413793104, |
|
"grad_norm": 13.673110961914062, |
|
"learning_rate": 2.5862068965517244e-05, |
|
"loss": 0.5441, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 22.076805114746094, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.5455, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_cer": 0.020121099208197483, |
|
"eval_loss": 0.46177592873573303, |
|
"eval_runtime": 644.2587, |
|
"eval_samples_per_second": 2.611, |
|
"eval_steps_per_second": 0.328, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 1.0344827586206897, |
|
"grad_norm": 6.016767501831055, |
|
"learning_rate": 2.413793103448276e-05, |
|
"loss": 0.416, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.0689655172413792, |
|
"grad_norm": 5.592987060546875, |
|
"learning_rate": 2.327586206896552e-05, |
|
"loss": 0.4161, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 1.103448275862069, |
|
"grad_norm": 4.175529479980469, |
|
"learning_rate": 2.2413793103448276e-05, |
|
"loss": 0.4516, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 1.1379310344827587, |
|
"grad_norm": 7.126400470733643, |
|
"learning_rate": 2.1551724137931033e-05, |
|
"loss": 0.4583, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 1.1724137931034484, |
|
"grad_norm": 5.696765899658203, |
|
"learning_rate": 2.0689655172413793e-05, |
|
"loss": 0.3918, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 1.206896551724138, |
|
"grad_norm": 11.261072158813477, |
|
"learning_rate": 1.9827586206896554e-05, |
|
"loss": 0.4423, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 1.2413793103448276, |
|
"grad_norm": 3.23542857170105, |
|
"learning_rate": 1.896551724137931e-05, |
|
"loss": 0.3769, |
|
"step": 1044 |
|
}, |
|
{ |
|
"epoch": 1.2758620689655173, |
|
"grad_norm": 4.922264099121094, |
|
"learning_rate": 1.810344827586207e-05, |
|
"loss": 0.4311, |
|
"step": 1073 |
|
}, |
|
{ |
|
"epoch": 1.3103448275862069, |
|
"grad_norm": 3.692586898803711, |
|
"learning_rate": 1.7241379310344828e-05, |
|
"loss": 0.3667, |
|
"step": 1102 |
|
}, |
|
{ |
|
"epoch": 1.3448275862068966, |
|
"grad_norm": 2.88181471824646, |
|
"learning_rate": 1.6379310344827585e-05, |
|
"loss": 0.3167, |
|
"step": 1131 |
|
}, |
|
{ |
|
"epoch": 1.3793103448275863, |
|
"grad_norm": 3.277984142303467, |
|
"learning_rate": 1.5517241379310346e-05, |
|
"loss": 0.3331, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.4137931034482758, |
|
"grad_norm": 7.566446304321289, |
|
"learning_rate": 1.4655172413793103e-05, |
|
"loss": 0.3046, |
|
"step": 1189 |
|
}, |
|
{ |
|
"epoch": 1.4482758620689655, |
|
"grad_norm": 17.953258514404297, |
|
"learning_rate": 1.3793103448275863e-05, |
|
"loss": 0.3332, |
|
"step": 1218 |
|
}, |
|
{ |
|
"epoch": 1.4827586206896552, |
|
"grad_norm": 11.560026168823242, |
|
"learning_rate": 1.2931034482758622e-05, |
|
"loss": 0.3299, |
|
"step": 1247 |
|
}, |
|
{ |
|
"epoch": 1.5172413793103448, |
|
"grad_norm": 5.917276859283447, |
|
"learning_rate": 1.206896551724138e-05, |
|
"loss": 0.2961, |
|
"step": 1276 |
|
}, |
|
{ |
|
"epoch": 1.5517241379310345, |
|
"grad_norm": 3.665133476257324, |
|
"learning_rate": 1.1206896551724138e-05, |
|
"loss": 0.3142, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 1.5862068965517242, |
|
"grad_norm": 2.3258779048919678, |
|
"learning_rate": 1.0344827586206897e-05, |
|
"loss": 0.3005, |
|
"step": 1334 |
|
}, |
|
{ |
|
"epoch": 1.6206896551724137, |
|
"grad_norm": 2.856088638305664, |
|
"learning_rate": 9.482758620689655e-06, |
|
"loss": 0.2652, |
|
"step": 1363 |
|
}, |
|
{ |
|
"epoch": 1.6551724137931034, |
|
"grad_norm": 8.568778991699219, |
|
"learning_rate": 8.620689655172414e-06, |
|
"loss": 0.2652, |
|
"step": 1392 |
|
}, |
|
{ |
|
"epoch": 1.6896551724137931, |
|
"grad_norm": 4.4803667068481445, |
|
"learning_rate": 7.758620689655173e-06, |
|
"loss": 0.2541, |
|
"step": 1421 |
|
}, |
|
{ |
|
"epoch": 1.7241379310344827, |
|
"grad_norm": 13.121492385864258, |
|
"learning_rate": 6.896551724137932e-06, |
|
"loss": 0.2754, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.7586206896551724, |
|
"grad_norm": 2.48468279838562, |
|
"learning_rate": 6.03448275862069e-06, |
|
"loss": 0.2379, |
|
"step": 1479 |
|
}, |
|
{ |
|
"epoch": 1.793103448275862, |
|
"grad_norm": 1.497287631034851, |
|
"learning_rate": 5.172413793103448e-06, |
|
"loss": 0.2273, |
|
"step": 1508 |
|
}, |
|
{ |
|
"epoch": 1.8275862068965516, |
|
"grad_norm": 2.972078800201416, |
|
"learning_rate": 4.310344827586207e-06, |
|
"loss": 0.2254, |
|
"step": 1537 |
|
}, |
|
{ |
|
"epoch": 1.8620689655172413, |
|
"grad_norm": 12.911340713500977, |
|
"learning_rate": 3.448275862068966e-06, |
|
"loss": 0.2448, |
|
"step": 1566 |
|
}, |
|
{ |
|
"epoch": 1.896551724137931, |
|
"grad_norm": 1.3689017295837402, |
|
"learning_rate": 2.586206896551724e-06, |
|
"loss": 0.2089, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 1.9310344827586206, |
|
"grad_norm": 4.04969596862793, |
|
"learning_rate": 1.724137931034483e-06, |
|
"loss": 0.2174, |
|
"step": 1624 |
|
}, |
|
{ |
|
"epoch": 1.9655172413793105, |
|
"grad_norm": 2.9180474281311035, |
|
"learning_rate": 8.620689655172415e-07, |
|
"loss": 0.2381, |
|
"step": 1653 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 4.1190409660339355, |
|
"learning_rate": 0.0, |
|
"loss": 0.2068, |
|
"step": 1682 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_cer": 0.0032914143766495886, |
|
"eval_loss": 0.23910197615623474, |
|
"eval_runtime": 636.093, |
|
"eval_samples_per_second": 2.644, |
|
"eval_steps_per_second": 0.332, |
|
"step": 1682 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 1682, |
|
"total_flos": 1.9906356553640313e+19, |
|
"train_loss": 0.6271170827069549, |
|
"train_runtime": 2733.4434, |
|
"train_samples_per_second": 4.92, |
|
"train_steps_per_second": 0.615 |
|
} |
|
], |
|
"logging_steps": 29, |
|
"max_steps": 1682, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.9906356553640313e+19, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|