|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 3000, |
|
"global_step": 88686, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 3.382721060821325e-05, |
|
"grad_norm": 1896.0, |
|
"learning_rate": 0.00029999661727893914, |
|
"loss": 32.25, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.10148163182463973, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.000289851836817536, |
|
"loss": 3.8074, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.10148163182463973, |
|
"eval_loss": 2.4561102390289307, |
|
"eval_runtime": 92.9065, |
|
"eval_samples_per_second": 1016.13, |
|
"eval_steps_per_second": 7.943, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.20296326364927947, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.000279703673635072, |
|
"loss": 3.182, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.20296326364927947, |
|
"eval_loss": 2.3882644176483154, |
|
"eval_runtime": 92.9373, |
|
"eval_samples_per_second": 1015.792, |
|
"eval_steps_per_second": 7.941, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.30444489547391923, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 0.00026955551045260807, |
|
"loss": 3.1161, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.30444489547391923, |
|
"eval_loss": 2.365419626235962, |
|
"eval_runtime": 92.9113, |
|
"eval_samples_per_second": 1016.077, |
|
"eval_steps_per_second": 7.943, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.40592652729855894, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.0002594073472701441, |
|
"loss": 3.0863, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.40592652729855894, |
|
"eval_loss": 2.339674472808838, |
|
"eval_runtime": 92.9289, |
|
"eval_samples_per_second": 1015.884, |
|
"eval_steps_per_second": 7.942, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.5074081591231987, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.0002492591840876801, |
|
"loss": 3.0643, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.5074081591231987, |
|
"eval_loss": 2.3348894119262695, |
|
"eval_runtime": 92.9352, |
|
"eval_samples_per_second": 1015.815, |
|
"eval_steps_per_second": 7.941, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.6088897909478385, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.0002391110209052161, |
|
"loss": 3.0593, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.6088897909478385, |
|
"eval_loss": 2.32326602935791, |
|
"eval_runtime": 92.9449, |
|
"eval_samples_per_second": 1015.709, |
|
"eval_steps_per_second": 7.94, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.7103714227724782, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.00022896285772275215, |
|
"loss": 3.0519, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.7103714227724782, |
|
"eval_loss": 2.325340986251831, |
|
"eval_runtime": 92.952, |
|
"eval_samples_per_second": 1015.632, |
|
"eval_steps_per_second": 7.94, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.8118530545971179, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.0002188146945402882, |
|
"loss": 3.0464, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.8118530545971179, |
|
"eval_loss": 2.320810079574585, |
|
"eval_runtime": 92.965, |
|
"eval_samples_per_second": 1015.489, |
|
"eval_steps_per_second": 7.938, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.9133346864217576, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.00020866653135782423, |
|
"loss": 3.0434, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.9133346864217576, |
|
"eval_loss": 2.3228955268859863, |
|
"eval_runtime": 93.1104, |
|
"eval_samples_per_second": 1013.904, |
|
"eval_steps_per_second": 7.926, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.0148163182463974, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.00019851836817536025, |
|
"loss": 3.0389, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.0148163182463974, |
|
"eval_loss": 2.3193914890289307, |
|
"eval_runtime": 93.0867, |
|
"eval_samples_per_second": 1014.162, |
|
"eval_steps_per_second": 7.928, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.116297950071037, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.0001883702049928963, |
|
"loss": 3.0377, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.116297950071037, |
|
"eval_loss": 2.3154852390289307, |
|
"eval_runtime": 93.1029, |
|
"eval_samples_per_second": 1013.985, |
|
"eval_steps_per_second": 7.927, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.217779581895677, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.0001782220418104323, |
|
"loss": 3.0319, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.217779581895677, |
|
"eval_loss": 2.3132622241973877, |
|
"eval_runtime": 93.1111, |
|
"eval_samples_per_second": 1013.897, |
|
"eval_steps_per_second": 7.926, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.3192612137203166, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.00016807387862796832, |
|
"loss": 3.0328, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 1.3192612137203166, |
|
"eval_loss": 2.310361623764038, |
|
"eval_runtime": 93.0989, |
|
"eval_samples_per_second": 1014.03, |
|
"eval_steps_per_second": 7.927, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 1.4207428455449564, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.00015792571544550436, |
|
"loss": 3.0314, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 1.4207428455449564, |
|
"eval_loss": 2.309779405593872, |
|
"eval_runtime": 93.116, |
|
"eval_samples_per_second": 1013.843, |
|
"eval_steps_per_second": 7.926, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 1.522224477369596, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 0.00014777755226304037, |
|
"loss": 3.0306, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 1.522224477369596, |
|
"eval_loss": 2.3095569610595703, |
|
"eval_runtime": 93.0822, |
|
"eval_samples_per_second": 1014.211, |
|
"eval_steps_per_second": 7.928, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 1.6237061091942357, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.0001376293890805764, |
|
"loss": 3.0251, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 1.6237061091942357, |
|
"eval_loss": 2.314659595489502, |
|
"eval_runtime": 93.0784, |
|
"eval_samples_per_second": 1014.253, |
|
"eval_steps_per_second": 7.929, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 1.7251877410188756, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.00012748122589811243, |
|
"loss": 3.027, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 1.7251877410188756, |
|
"eval_loss": 2.3094723224639893, |
|
"eval_runtime": 92.9414, |
|
"eval_samples_per_second": 1015.748, |
|
"eval_steps_per_second": 7.94, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 1.8266693728435153, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.00011733306271564845, |
|
"loss": 3.0238, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 1.8266693728435153, |
|
"eval_loss": 2.3098957538604736, |
|
"eval_runtime": 92.9753, |
|
"eval_samples_per_second": 1015.377, |
|
"eval_steps_per_second": 7.938, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 1.928151004668155, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.00010718489953318448, |
|
"loss": 3.0246, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 1.928151004668155, |
|
"eval_loss": 2.308159828186035, |
|
"eval_runtime": 92.9437, |
|
"eval_samples_per_second": 1015.722, |
|
"eval_steps_per_second": 7.94, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 2.029632636492795, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 9.703673635072052e-05, |
|
"loss": 3.0253, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 2.029632636492795, |
|
"eval_loss": 2.30761981010437, |
|
"eval_runtime": 92.9512, |
|
"eval_samples_per_second": 1015.641, |
|
"eval_steps_per_second": 7.94, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 2.1311142683174347, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 8.688857316825655e-05, |
|
"loss": 3.0261, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 2.1311142683174347, |
|
"eval_loss": 2.310319185256958, |
|
"eval_runtime": 92.926, |
|
"eval_samples_per_second": 1015.916, |
|
"eval_steps_per_second": 7.942, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 2.232595900142074, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 7.674040998579256e-05, |
|
"loss": 3.0242, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 2.232595900142074, |
|
"eval_loss": 2.3085408210754395, |
|
"eval_runtime": 92.9709, |
|
"eval_samples_per_second": 1015.425, |
|
"eval_steps_per_second": 7.938, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 2.334077531966714, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 6.659224680332859e-05, |
|
"loss": 3.0245, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 2.334077531966714, |
|
"eval_loss": 2.3078315258026123, |
|
"eval_runtime": 92.9373, |
|
"eval_samples_per_second": 1015.793, |
|
"eval_steps_per_second": 7.941, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 2.435559163791354, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 5.644408362086462e-05, |
|
"loss": 3.024, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 2.435559163791354, |
|
"eval_loss": 2.3096206188201904, |
|
"eval_runtime": 92.9124, |
|
"eval_samples_per_second": 1016.065, |
|
"eval_steps_per_second": 7.943, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 2.5370407956159937, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 4.629592043840065e-05, |
|
"loss": 3.0251, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 2.5370407956159937, |
|
"eval_loss": 2.30861496925354, |
|
"eval_runtime": 92.9585, |
|
"eval_samples_per_second": 1015.561, |
|
"eval_steps_per_second": 7.939, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 2.638522427440633, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 3.614775725593667e-05, |
|
"loss": 3.0257, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 2.638522427440633, |
|
"eval_loss": 2.308117389678955, |
|
"eval_runtime": 92.9722, |
|
"eval_samples_per_second": 1015.411, |
|
"eval_steps_per_second": 7.938, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 2.740004059265273, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 2.59995940734727e-05, |
|
"loss": 3.0224, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 2.740004059265273, |
|
"eval_loss": 2.308805465698242, |
|
"eval_runtime": 92.931, |
|
"eval_samples_per_second": 1015.861, |
|
"eval_steps_per_second": 7.941, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 2.841485691089913, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 1.5851430891008727e-05, |
|
"loss": 3.0267, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 2.841485691089913, |
|
"eval_loss": 2.3082549571990967, |
|
"eval_runtime": 92.9136, |
|
"eval_samples_per_second": 1016.051, |
|
"eval_steps_per_second": 7.943, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 2.9429673229145523, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 5.703267708544753e-06, |
|
"loss": 3.0243, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 2.9429673229145523, |
|
"eval_loss": 2.3083608150482178, |
|
"eval_runtime": 92.9434, |
|
"eval_samples_per_second": 1015.726, |
|
"eval_steps_per_second": 7.94, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 88686, |
|
"total_flos": 1.2889476521431695e+18, |
|
"train_loss": 3.0687490486097015, |
|
"train_runtime": 36823.0101, |
|
"train_samples_per_second": 308.271, |
|
"train_steps_per_second": 2.408 |
|
} |
|
], |
|
"logging_steps": 3000, |
|
"max_steps": 88686, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 3000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.2889476521431695e+18, |
|
"train_batch_size": 128, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|