|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 300, |
|
"global_step": 273, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.003663003663003663, |
|
"grad_norm": 165.0, |
|
"learning_rate": 2e-06, |
|
"loss": 9.1083, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.018315018315018316, |
|
"grad_norm": 64.0, |
|
"learning_rate": 1.9998820020169668e-06, |
|
"loss": 8.3175, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.03663003663003663, |
|
"grad_norm": 26.875, |
|
"learning_rate": 1.999402682936637e-06, |
|
"loss": 7.3293, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.054945054945054944, |
|
"grad_norm": 16.75, |
|
"learning_rate": 1.998554844493029e-06, |
|
"loss": 6.8602, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.07326007326007326, |
|
"grad_norm": 11.0, |
|
"learning_rate": 1.997338799317767e-06, |
|
"loss": 6.6054, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.09157509157509157, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 1.995754995814884e-06, |
|
"loss": 6.3957, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.10989010989010989, |
|
"grad_norm": 6.9375, |
|
"learning_rate": 1.9938040179954784e-06, |
|
"loss": 6.256, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.1282051282051282, |
|
"grad_norm": 6.15625, |
|
"learning_rate": 1.991486585262365e-06, |
|
"loss": 6.1321, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.14652014652014653, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 1.988803552144804e-06, |
|
"loss": 6.018, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.16483516483516483, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 1.9857559079834022e-06, |
|
"loss": 5.9438, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.18315018315018314, |
|
"grad_norm": 6.5, |
|
"learning_rate": 1.982344776565302e-06, |
|
"loss": 5.8733, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.20146520146520147, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 1.978571415709799e-06, |
|
"loss": 5.8341, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.21978021978021978, |
|
"grad_norm": 4.125, |
|
"learning_rate": 1.9744372168045322e-06, |
|
"loss": 5.793, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.23809523809523808, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 1.9699437042924264e-06, |
|
"loss": 5.7305, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.2564102564102564, |
|
"grad_norm": 4.0, |
|
"learning_rate": 1.965092535109567e-06, |
|
"loss": 5.671, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.27472527472527475, |
|
"grad_norm": 4.125, |
|
"learning_rate": 1.959885498074224e-06, |
|
"loss": 5.6563, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.29304029304029305, |
|
"grad_norm": 3.765625, |
|
"learning_rate": 1.954324513227244e-06, |
|
"loss": 5.5778, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.31135531135531136, |
|
"grad_norm": 3.75, |
|
"learning_rate": 1.948411631124053e-06, |
|
"loss": 5.5838, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.32967032967032966, |
|
"grad_norm": 6.71875, |
|
"learning_rate": 1.942149032078538e-06, |
|
"loss": 5.5305, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.34798534798534797, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 1.935539025359077e-06, |
|
"loss": 5.5108, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.3663003663003663, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 1.928584048337022e-06, |
|
"loss": 5.494, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.38461538461538464, |
|
"grad_norm": 3.75, |
|
"learning_rate": 1.9212866655879395e-06, |
|
"loss": 5.4749, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.40293040293040294, |
|
"grad_norm": 4.5, |
|
"learning_rate": 1.9136495679459563e-06, |
|
"loss": 5.4473, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.42124542124542125, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.9056755715115372e-06, |
|
"loss": 5.4071, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.43956043956043955, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 1.8973676166130791e-06, |
|
"loss": 5.3994, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.45787545787545786, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 1.8887287667226963e-06, |
|
"loss": 5.3773, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.47619047619047616, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 1.8797622073265943e-06, |
|
"loss": 5.3734, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.4945054945054945, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 1.8704712447504579e-06, |
|
"loss": 5.323, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.5128205128205128, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 1.8608593049402752e-06, |
|
"loss": 5.2964, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.5311355311355311, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 1.850929932199058e-06, |
|
"loss": 5.2951, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.5494505494505495, |
|
"grad_norm": 3.375, |
|
"learning_rate": 1.8406867878799152e-06, |
|
"loss": 5.2498, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5677655677655677, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 1.8301336490359678e-06, |
|
"loss": 5.2432, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.5860805860805861, |
|
"grad_norm": 3.90625, |
|
"learning_rate": 1.819274407027599e-06, |
|
"loss": 5.2436, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.6043956043956044, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 1.8081130660875555e-06, |
|
"loss": 5.2218, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.6227106227106227, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.79665374184443e-06, |
|
"loss": 5.2194, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.6410256410256411, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 1.7849006598050625e-06, |
|
"loss": 5.1988, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.6593406593406593, |
|
"grad_norm": 3.9375, |
|
"learning_rate": 1.7728581537964318e-06, |
|
"loss": 5.1814, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.6776556776556777, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 1.7605306643676006e-06, |
|
"loss": 5.1318, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.6959706959706959, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 1.747922737152308e-06, |
|
"loss": 5.1609, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 4.125, |
|
"learning_rate": 1.7350390211928166e-06, |
|
"loss": 5.1395, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.7326007326007326, |
|
"grad_norm": 3.375, |
|
"learning_rate": 1.721884267225624e-06, |
|
"loss": 5.0794, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.7509157509157509, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 1.7084633259296795e-06, |
|
"loss": 5.0974, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 1.6947811461377467e-06, |
|
"loss": 5.1007, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.7875457875457875, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 1.6808427730115712e-06, |
|
"loss": 5.0777, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.8058608058608059, |
|
"grad_norm": 3.828125, |
|
"learning_rate": 1.6666533461815323e-06, |
|
"loss": 5.0517, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.8241758241758241, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 1.6522180978514552e-06, |
|
"loss": 5.0462, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.8424908424908425, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 1.6375423508692912e-06, |
|
"loss": 5.0538, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.8608058608058609, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 1.622631516764372e-06, |
|
"loss": 5.0335, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.8791208791208791, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 1.607491093751966e-06, |
|
"loss": 5.0177, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.8974358974358975, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 1.592126664705868e-06, |
|
"loss": 4.998, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.9157509157509157, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 1.5765438950997703e-06, |
|
"loss": 4.992, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.9340659340659341, |
|
"grad_norm": 3.828125, |
|
"learning_rate": 1.5607485309181812e-06, |
|
"loss": 4.9831, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.9523809523809523, |
|
"grad_norm": 3.90625, |
|
"learning_rate": 1.544746396537651e-06, |
|
"loss": 4.9835, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.9706959706959707, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 1.5285433925790945e-06, |
|
"loss": 4.962, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.989010989010989, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 1.5121454937319975e-06, |
|
"loss": 4.9708, |
|
"step": 270 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 819, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 300, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.7871723276837847e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|