|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.0, |
|
"eval_steps": 500, |
|
"global_step": 315, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.15873015873015872, |
|
"grad_norm": 5.117466449737549, |
|
"learning_rate": 0.000125, |
|
"loss": 1.112, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.31746031746031744, |
|
"grad_norm": 3.9644668102264404, |
|
"learning_rate": 0.00019991169537951468, |
|
"loss": 0.4916, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.47619047619047616, |
|
"grad_norm": 2.397569417953491, |
|
"learning_rate": 0.00019892005856487878, |
|
"loss": 0.3606, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.6349206349206349, |
|
"grad_norm": 1.5157995223999023, |
|
"learning_rate": 0.00019683737802329244, |
|
"loss": 0.2953, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.7936507936507936, |
|
"grad_norm": 2.1924238204956055, |
|
"learning_rate": 0.00019368662478381799, |
|
"loss": 0.2249, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.9523809523809523, |
|
"grad_norm": 2.078732967376709, |
|
"learning_rate": 0.00018950255023668876, |
|
"loss": 0.2383, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.1111111111111112, |
|
"grad_norm": 1.4331333637237549, |
|
"learning_rate": 0.0001843313028411149, |
|
"loss": 0.1957, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.2698412698412698, |
|
"grad_norm": 0.9676660299301147, |
|
"learning_rate": 0.00017822991912854713, |
|
"loss": 0.1727, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 1.7661018371582031, |
|
"learning_rate": 0.00017126569461540443, |
|
"loss": 0.1687, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.5873015873015874, |
|
"grad_norm": 1.013567328453064, |
|
"learning_rate": 0.00016351544156381414, |
|
"loss": 0.1581, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.746031746031746, |
|
"grad_norm": 1.0763999223709106, |
|
"learning_rate": 0.0001550646417769301, |
|
"loss": 0.1281, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.9047619047619047, |
|
"grad_norm": 0.9065868258476257, |
|
"learning_rate": 0.00014600650377311522, |
|
"loss": 0.1301, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.0634920634920633, |
|
"grad_norm": 0.8975210785865784, |
|
"learning_rate": 0.00013644093473793215, |
|
"loss": 0.1131, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"grad_norm": 0.7319889664649963, |
|
"learning_rate": 0.00012647343859284997, |
|
"loss": 0.0953, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.380952380952381, |
|
"grad_norm": 0.849612832069397, |
|
"learning_rate": 0.00011621395233447248, |
|
"loss": 0.1131, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.5396825396825395, |
|
"grad_norm": 0.8651995062828064, |
|
"learning_rate": 0.00010577563347894285, |
|
"loss": 0.1016, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.6984126984126986, |
|
"grad_norm": 1.222275972366333, |
|
"learning_rate": 9.527361198546714e-05, |
|
"loss": 0.0894, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 1.0224725008010864, |
|
"learning_rate": 8.48237204246785e-05, |
|
"loss": 0.092, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.015873015873016, |
|
"grad_norm": 0.9545143246650696, |
|
"learning_rate": 7.454121639751371e-05, |
|
"loss": 0.0983, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 3.1746031746031744, |
|
"grad_norm": 0.6206408143043518, |
|
"learning_rate": 6.453951129574644e-05, |
|
"loss": 0.0956, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 0.5555378198623657, |
|
"learning_rate": 5.492891942537703e-05, |
|
"loss": 0.0828, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.492063492063492, |
|
"grad_norm": 0.5887261629104614, |
|
"learning_rate": 4.581544128948413e-05, |
|
"loss": 0.083, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.6507936507936507, |
|
"grad_norm": 0.4162333011627197, |
|
"learning_rate": 3.729959445038136e-05, |
|
"loss": 0.0714, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.8095238095238093, |
|
"grad_norm": 0.4157885015010834, |
|
"learning_rate": 2.9475304866143027e-05, |
|
"loss": 0.0688, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.9682539682539684, |
|
"grad_norm": 0.4919394850730896, |
|
"learning_rate": 2.242887092955801e-05, |
|
"loss": 0.057, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 4.1269841269841265, |
|
"grad_norm": 0.5343856811523438, |
|
"learning_rate": 1.6238011635695848e-05, |
|
"loss": 0.0651, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 4.285714285714286, |
|
"grad_norm": 0.42693275213241577, |
|
"learning_rate": 1.0971009376368612e-05, |
|
"loss": 0.0531, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 4.444444444444445, |
|
"grad_norm": 0.4508126676082611, |
|
"learning_rate": 6.6859568160797525e-06, |
|
"loss": 0.0644, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 4.603174603174603, |
|
"grad_norm": 0.4287712872028351, |
|
"learning_rate": 3.4301161560792774e-06, |
|
"loss": 0.0467, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 4.761904761904762, |
|
"grad_norm": 0.39040446281433105, |
|
"learning_rate": 1.239397853554336e-06, |
|
"loss": 0.0587, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.920634920634921, |
|
"grad_norm": 0.39611175656318665, |
|
"learning_rate": 1.379645454479661e-07, |
|
"loss": 0.056, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 315, |
|
"total_flos": 5801722395569280.0, |
|
"train_loss": 0.16537500326595608, |
|
"train_runtime": 123.2447, |
|
"train_samples_per_second": 40.894, |
|
"train_steps_per_second": 2.556 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 315, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5801722395569280.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|