|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 3000, |
|
"global_step": 88686, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 3.382721060821325e-05, |
|
"grad_norm": 1760.0, |
|
"learning_rate": 0.00029999661727893914, |
|
"loss": 57.5, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.10148163182463973, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.000289851836817536, |
|
"loss": 4.5083, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.10148163182463973, |
|
"eval_loss": 2.8375253677368164, |
|
"eval_runtime": 88.2347, |
|
"eval_samples_per_second": 1069.931, |
|
"eval_steps_per_second": 8.364, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.20296326364927947, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.000279703673635072, |
|
"loss": 3.6752, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.20296326364927947, |
|
"eval_loss": 2.74216628074646, |
|
"eval_runtime": 88.2236, |
|
"eval_samples_per_second": 1070.065, |
|
"eval_steps_per_second": 8.365, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.30444489547391923, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.00026955551045260807, |
|
"loss": 3.598, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.30444489547391923, |
|
"eval_loss": 2.70210862159729, |
|
"eval_runtime": 88.2673, |
|
"eval_samples_per_second": 1069.535, |
|
"eval_steps_per_second": 8.361, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.40592652729855894, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.0002594073472701441, |
|
"loss": 3.5651, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.40592652729855894, |
|
"eval_loss": 2.686271905899048, |
|
"eval_runtime": 88.2934, |
|
"eval_samples_per_second": 1069.219, |
|
"eval_steps_per_second": 8.358, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.5074081591231987, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.0002492591840876801, |
|
"loss": 3.5409, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.5074081591231987, |
|
"eval_loss": 2.672954797744751, |
|
"eval_runtime": 88.2673, |
|
"eval_samples_per_second": 1069.535, |
|
"eval_steps_per_second": 8.361, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.6088897909478385, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.0002391110209052161, |
|
"loss": 3.5353, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.6088897909478385, |
|
"eval_loss": 2.6632580757141113, |
|
"eval_runtime": 88.3039, |
|
"eval_samples_per_second": 1069.092, |
|
"eval_steps_per_second": 8.358, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.7103714227724782, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.00022896285772275215, |
|
"loss": 3.5268, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.7103714227724782, |
|
"eval_loss": 2.6613523960113525, |
|
"eval_runtime": 88.3108, |
|
"eval_samples_per_second": 1069.009, |
|
"eval_steps_per_second": 8.357, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.8118530545971179, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.0002188146945402882, |
|
"loss": 3.5223, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.8118530545971179, |
|
"eval_loss": 2.6542599201202393, |
|
"eval_runtime": 88.2938, |
|
"eval_samples_per_second": 1069.214, |
|
"eval_steps_per_second": 8.358, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.9133346864217576, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.00020866653135782423, |
|
"loss": 3.5185, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.9133346864217576, |
|
"eval_loss": 2.6546196937561035, |
|
"eval_runtime": 88.3801, |
|
"eval_samples_per_second": 1068.17, |
|
"eval_steps_per_second": 8.35, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.0148163182463974, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.00019851836817536025, |
|
"loss": 3.515, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.0148163182463974, |
|
"eval_loss": 2.6509780883789062, |
|
"eval_runtime": 88.3972, |
|
"eval_samples_per_second": 1067.963, |
|
"eval_steps_per_second": 8.349, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.116297950071037, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 0.0001883702049928963, |
|
"loss": 3.5138, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.116297950071037, |
|
"eval_loss": 2.6480565071105957, |
|
"eval_runtime": 88.3803, |
|
"eval_samples_per_second": 1068.168, |
|
"eval_steps_per_second": 8.35, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.217779581895677, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.0001782220418104323, |
|
"loss": 3.508, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.217779581895677, |
|
"eval_loss": 2.6456427574157715, |
|
"eval_runtime": 88.3938, |
|
"eval_samples_per_second": 1068.005, |
|
"eval_steps_per_second": 8.349, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.3192612137203166, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 0.00016807387862796832, |
|
"loss": 3.5071, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 1.3192612137203166, |
|
"eval_loss": 2.642657518386841, |
|
"eval_runtime": 88.3875, |
|
"eval_samples_per_second": 1068.081, |
|
"eval_steps_per_second": 8.35, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 1.4207428455449564, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.00015792571544550436, |
|
"loss": 3.5058, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 1.4207428455449564, |
|
"eval_loss": 2.6432926654815674, |
|
"eval_runtime": 88.4239, |
|
"eval_samples_per_second": 1067.641, |
|
"eval_steps_per_second": 8.346, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 1.522224477369596, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.00014777755226304037, |
|
"loss": 3.5063, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 1.522224477369596, |
|
"eval_loss": 2.644076108932495, |
|
"eval_runtime": 88.4016, |
|
"eval_samples_per_second": 1067.91, |
|
"eval_steps_per_second": 8.348, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 1.6237061091942357, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.0001376293890805764, |
|
"loss": 3.5003, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 1.6237061091942357, |
|
"eval_loss": 2.6422553062438965, |
|
"eval_runtime": 88.447, |
|
"eval_samples_per_second": 1067.362, |
|
"eval_steps_per_second": 8.344, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 1.7251877410188756, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.00012748122589811243, |
|
"loss": 3.5018, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 1.7251877410188756, |
|
"eval_loss": 2.643885612487793, |
|
"eval_runtime": 88.2483, |
|
"eval_samples_per_second": 1069.765, |
|
"eval_steps_per_second": 8.363, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 1.8266693728435153, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.00011733306271564845, |
|
"loss": 3.4992, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 1.8266693728435153, |
|
"eval_loss": 2.6422975063323975, |
|
"eval_runtime": 88.2892, |
|
"eval_samples_per_second": 1069.27, |
|
"eval_steps_per_second": 8.359, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 1.928151004668155, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 0.00010718489953318448, |
|
"loss": 3.5011, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 1.928151004668155, |
|
"eval_loss": 2.6424670219421387, |
|
"eval_runtime": 88.2616, |
|
"eval_samples_per_second": 1069.604, |
|
"eval_steps_per_second": 8.362, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 2.029632636492795, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 9.703673635072052e-05, |
|
"loss": 3.5017, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 2.029632636492795, |
|
"eval_loss": 2.6414718627929688, |
|
"eval_runtime": 88.2423, |
|
"eval_samples_per_second": 1069.838, |
|
"eval_steps_per_second": 8.363, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 2.1311142683174347, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 8.688857316825655e-05, |
|
"loss": 3.5014, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 2.1311142683174347, |
|
"eval_loss": 2.6421916484832764, |
|
"eval_runtime": 88.2249, |
|
"eval_samples_per_second": 1070.049, |
|
"eval_steps_per_second": 8.365, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 2.232595900142074, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 7.674040998579256e-05, |
|
"loss": 3.4996, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 2.232595900142074, |
|
"eval_loss": 2.6411330699920654, |
|
"eval_runtime": 88.2534, |
|
"eval_samples_per_second": 1069.703, |
|
"eval_steps_per_second": 8.362, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 2.334077531966714, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 6.659224680332859e-05, |
|
"loss": 3.4998, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 2.334077531966714, |
|
"eval_loss": 2.6422553062438965, |
|
"eval_runtime": 88.2778, |
|
"eval_samples_per_second": 1069.408, |
|
"eval_steps_per_second": 8.36, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 2.435559163791354, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 5.644408362086462e-05, |
|
"loss": 3.5002, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 2.435559163791354, |
|
"eval_loss": 2.6412177085876465, |
|
"eval_runtime": 88.2555, |
|
"eval_samples_per_second": 1069.678, |
|
"eval_steps_per_second": 8.362, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 2.5370407956159937, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 4.629592043840065e-05, |
|
"loss": 3.5011, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 2.5370407956159937, |
|
"eval_loss": 2.641514301300049, |
|
"eval_runtime": 88.2215, |
|
"eval_samples_per_second": 1070.09, |
|
"eval_steps_per_second": 8.365, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 2.638522427440633, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 3.614775725593667e-05, |
|
"loss": 3.5005, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 2.638522427440633, |
|
"eval_loss": 2.6411118507385254, |
|
"eval_runtime": 88.2883, |
|
"eval_samples_per_second": 1069.281, |
|
"eval_steps_per_second": 8.359, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 2.740004059265273, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 2.59995940734727e-05, |
|
"loss": 3.4975, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 2.740004059265273, |
|
"eval_loss": 2.6413235664367676, |
|
"eval_runtime": 88.2689, |
|
"eval_samples_per_second": 1069.516, |
|
"eval_steps_per_second": 8.361, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 2.841485691089913, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 1.5851430891008727e-05, |
|
"loss": 3.5013, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 2.841485691089913, |
|
"eval_loss": 2.6411330699920654, |
|
"eval_runtime": 88.2658, |
|
"eval_samples_per_second": 1069.554, |
|
"eval_steps_per_second": 8.361, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 2.9429673229145523, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 5.703267708544753e-06, |
|
"loss": 3.5006, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 2.9429673229145523, |
|
"eval_loss": 2.6412177085876465, |
|
"eval_runtime": 88.2303, |
|
"eval_samples_per_second": 1069.984, |
|
"eval_steps_per_second": 8.364, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 88686, |
|
"total_flos": 1.2409318398044897e+18, |
|
"train_loss": 3.5531258104435874, |
|
"train_runtime": 34812.4565, |
|
"train_samples_per_second": 326.075, |
|
"train_steps_per_second": 2.548 |
|
} |
|
], |
|
"logging_steps": 3000, |
|
"max_steps": 88686, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 3000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.2409318398044897e+18, |
|
"train_batch_size": 128, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|