{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 6.0,
  "eval_steps": 500,
  "global_step": 750,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08,
      "grad_norm": 3.1039228439331055,
      "learning_rate": 1.3333333333333334e-06,
      "loss": 1.2535,
      "step": 10
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.9736298322677612,
      "learning_rate": 2.666666666666667e-06,
      "loss": 1.1389,
      "step": 20
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.9868311285972595,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.1137,
      "step": 30
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.7625114321708679,
      "learning_rate": 5.333333333333334e-06,
      "loss": 1.0243,
      "step": 40
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.8334870338439941,
      "learning_rate": 6.666666666666667e-06,
      "loss": 1.0505,
      "step": 50
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.7253231406211853,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.9779,
      "step": 60
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.6524099707603455,
      "learning_rate": 9.333333333333334e-06,
      "loss": 0.9797,
      "step": 70
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.68406742811203,
      "learning_rate": 9.99864620589731e-06,
      "loss": 0.9878,
      "step": 80
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.6620015501976013,
      "learning_rate": 9.987820251299121e-06,
      "loss": 0.9352,
      "step": 90
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.7781227231025696,
      "learning_rate": 9.966191788709716e-06,
      "loss": 0.9205,
      "step": 100
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.6981615424156189,
      "learning_rate": 9.933807660562898e-06,
      "loss": 0.9168,
      "step": 110
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.5809632539749146,
      "learning_rate": 9.890738003669029e-06,
      "loss": 0.9345,
      "step": 120
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.6306660771369934,
      "learning_rate": 9.83707609731432e-06,
      "loss": 0.8842,
      "step": 130
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.645531952381134,
      "learning_rate": 9.77293816123866e-06,
      "loss": 0.8496,
      "step": 140
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.7515069842338562,
      "learning_rate": 9.698463103929542e-06,
      "loss": 0.8134,
      "step": 150
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.8647974133491516,
      "learning_rate": 9.613812221777212e-06,
      "loss": 0.8645,
      "step": 160
    },
    {
      "epoch": 1.3599999999999999,
      "grad_norm": 0.5082105398178101,
      "learning_rate": 9.519168849742603e-06,
      "loss": 0.8339,
      "step": 170
    },
    {
      "epoch": 1.44,
      "grad_norm": 0.6905492544174194,
      "learning_rate": 9.414737964294636e-06,
      "loss": 0.8524,
      "step": 180
    },
    {
      "epoch": 1.52,
      "grad_norm": 0.7012932896614075,
      "learning_rate": 9.30074573947683e-06,
      "loss": 0.8566,
      "step": 190
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.6087114810943604,
      "learning_rate": 9.177439057064684e-06,
      "loss": 0.8107,
      "step": 200
    },
    {
      "epoch": 1.6800000000000002,
      "grad_norm": 0.7151578664779663,
      "learning_rate": 9.045084971874738e-06,
      "loss": 0.9001,
      "step": 210
    },
    {
      "epoch": 1.76,
      "grad_norm": 0.746478796005249,
      "learning_rate": 8.903970133383297e-06,
      "loss": 0.8522,
      "step": 220
    },
    {
      "epoch": 1.8399999999999999,
      "grad_norm": 0.5843214392662048,
      "learning_rate": 8.754400164907496e-06,
      "loss": 0.8164,
      "step": 230
    },
    {
      "epoch": 1.92,
      "grad_norm": 0.6262736916542053,
      "learning_rate": 8.596699001693257e-06,
      "loss": 0.8739,
      "step": 240
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.6539023518562317,
      "learning_rate": 8.43120818934367e-06,
      "loss": 0.8353,
      "step": 250
    },
    {
      "epoch": 2.08,
      "grad_norm": 0.7221904397010803,
      "learning_rate": 8.258286144107277e-06,
      "loss": 0.7376,
      "step": 260
    },
    {
      "epoch": 2.16,
      "grad_norm": 0.6120831966400146,
      "learning_rate": 8.078307376628292e-06,
      "loss": 0.7247,
      "step": 270
    },
    {
      "epoch": 2.24,
      "grad_norm": 0.5770366787910461,
      "learning_rate": 7.891661680839932e-06,
      "loss": 0.6993,
      "step": 280
    },
    {
      "epoch": 2.32,
      "grad_norm": 0.4881764352321625,
      "learning_rate": 7.698753289757565e-06,
      "loss": 0.6871,
      "step": 290
    },
    {
      "epoch": 2.4,
      "grad_norm": 0.6157920360565186,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.7247,
      "step": 300
    },
    {
      "epoch": 2.48,
      "grad_norm": 0.5322688817977905,
      "learning_rate": 7.295832266935059e-06,
      "loss": 0.7549,
      "step": 310
    },
    {
      "epoch": 2.56,
      "grad_norm": 0.7124238014221191,
      "learning_rate": 7.08669227240909e-06,
      "loss": 0.6858,
      "step": 320
    },
    {
      "epoch": 2.64,
      "grad_norm": 0.5642361044883728,
      "learning_rate": 6.873032967079562e-06,
      "loss": 0.6737,
      "step": 330
    },
    {
      "epoch": 2.7199999999999998,
      "grad_norm": 0.7349158525466919,
      "learning_rate": 6.655317089424791e-06,
      "loss": 0.7092,
      "step": 340
    },
    {
      "epoch": 2.8,
      "grad_norm": 0.6204760670661926,
      "learning_rate": 6.434016163555452e-06,
      "loss": 0.6878,
      "step": 350
    },
    {
      "epoch": 2.88,
      "grad_norm": 0.4934922754764557,
      "learning_rate": 6.209609477998339e-06,
      "loss": 0.6883,
      "step": 360
    },
    {
      "epoch": 2.96,
      "grad_norm": 0.5973473787307739,
      "learning_rate": 5.982583047664151e-06,
      "loss": 0.7312,
      "step": 370
    },
    {
      "epoch": 3.04,
      "grad_norm": 0.9578109979629517,
      "learning_rate": 5.753428561247416e-06,
      "loss": 0.6466,
      "step": 380
    },
    {
      "epoch": 3.12,
      "grad_norm": 0.6420013904571533,
      "learning_rate": 5.522642316338268e-06,
      "loss": 0.5745,
      "step": 390
    },
    {
      "epoch": 3.2,
      "grad_norm": 0.6083827614784241,
      "learning_rate": 5.290724144552379e-06,
      "loss": 0.6074,
      "step": 400
    },
    {
      "epoch": 3.2800000000000002,
      "grad_norm": 0.5411021709442139,
      "learning_rate": 5.0581763290069865e-06,
      "loss": 0.5958,
      "step": 410
    },
    {
      "epoch": 3.36,
      "grad_norm": 0.542936384677887,
      "learning_rate": 4.825502516487497e-06,
      "loss": 0.517,
      "step": 420
    },
    {
      "epoch": 3.44,
      "grad_norm": 0.643763542175293,
      "learning_rate": 4.59320662666071e-06,
      "loss": 0.6125,
      "step": 430
    },
    {
      "epoch": 3.52,
      "grad_norm": 0.5902148485183716,
      "learning_rate": 4.361791760697027e-06,
      "loss": 0.5574,
      "step": 440
    },
    {
      "epoch": 3.6,
      "grad_norm": 0.5409561991691589,
      "learning_rate": 4.131759111665349e-06,
      "loss": 0.5619,
      "step": 450
    },
    {
      "epoch": 3.68,
      "grad_norm": 0.5303182601928711,
      "learning_rate": 3.903606879060483e-06,
      "loss": 0.5723,
      "step": 460
    },
    {
      "epoch": 3.76,
      "grad_norm": 0.5597081184387207,
      "learning_rate": 3.6778291898139907e-06,
      "loss": 0.57,
      "step": 470
    },
    {
      "epoch": 3.84,
      "grad_norm": 0.557049572467804,
      "learning_rate": 3.4549150281252635e-06,
      "loss": 0.558,
      "step": 480
    },
    {
      "epoch": 3.92,
      "grad_norm": 0.5532881021499634,
      "learning_rate": 3.2353471764306567e-06,
      "loss": 0.5642,
      "step": 490
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.6130328178405762,
      "learning_rate": 3.019601169804216e-06,
      "loss": 0.582,
      "step": 500
    },
    {
      "epoch": 4.08,
      "grad_norm": 0.6685742139816284,
      "learning_rate": 2.8081442660546126e-06,
      "loss": 0.4826,
      "step": 510
    },
    {
      "epoch": 4.16,
      "grad_norm": 0.7372143268585205,
      "learning_rate": 2.601434433748771e-06,
      "loss": 0.4799,
      "step": 520
    },
    {
      "epoch": 4.24,
      "grad_norm": 0.5380725860595703,
      "learning_rate": 2.3999193603539234e-06,
      "loss": 0.4595,
      "step": 530
    },
    {
      "epoch": 4.32,
      "grad_norm": 0.611395537853241,
      "learning_rate": 2.204035482646267e-06,
      "loss": 0.4414,
      "step": 540
    },
    {
      "epoch": 4.4,
      "grad_norm": 0.6131682991981506,
      "learning_rate": 2.0142070414860704e-06,
      "loss": 0.42,
      "step": 550
    },
    {
      "epoch": 4.48,
      "grad_norm": 0.5948655605316162,
      "learning_rate": 1.8308451630064484e-06,
      "loss": 0.4228,
      "step": 560
    },
    {
      "epoch": 4.5600000000000005,
      "grad_norm": 0.5260858535766602,
      "learning_rate": 1.6543469682057105e-06,
      "loss": 0.4486,
      "step": 570
    },
    {
      "epoch": 4.64,
      "grad_norm": 0.6490086913108826,
      "learning_rate": 1.4850947128716914e-06,
      "loss": 0.4064,
      "step": 580
    },
    {
      "epoch": 4.72,
      "grad_norm": 0.512779951095581,
      "learning_rate": 1.3234549597008572e-06,
      "loss": 0.4806,
      "step": 590
    },
    {
      "epoch": 4.8,
      "grad_norm": 0.6591514945030212,
      "learning_rate": 1.1697777844051105e-06,
      "loss": 0.4537,
      "step": 600
    },
    {
      "epoch": 4.88,
      "grad_norm": 0.5841071605682373,
      "learning_rate": 1.0243960175257605e-06,
      "loss": 0.425,
      "step": 610
    },
    {
      "epoch": 4.96,
      "grad_norm": 0.5767258405685425,
      "learning_rate": 8.876245235966884e-07,
      "loss": 0.4903,
      "step": 620
    },
    {
      "epoch": 5.04,
      "grad_norm": 0.681662380695343,
      "learning_rate": 7.597595192178702e-07,
      "loss": 0.4259,
      "step": 630
    },
    {
      "epoch": 5.12,
      "grad_norm": 0.6221340894699097,
      "learning_rate": 6.410779315161885e-07,
      "loss": 0.4243,
      "step": 640
    },
    {
      "epoch": 5.2,
      "grad_norm": 0.5294339656829834,
      "learning_rate": 5.318367983829393e-07,
      "loss": 0.3387,
      "step": 650
    },
    {
      "epoch": 5.28,
      "grad_norm": 0.5295357704162598,
      "learning_rate": 4.322727117869951e-07,
      "loss": 0.3378,
      "step": 660
    },
    {
      "epoch": 5.36,
      "grad_norm": 0.5497756004333496,
      "learning_rate": 3.426013053692878e-07,
      "loss": 0.3857,
      "step": 670
    },
    {
      "epoch": 5.44,
      "grad_norm": 0.5334697961807251,
      "learning_rate": 2.63016787428354e-07,
      "loss": 0.3996,
      "step": 680
    },
    {
      "epoch": 5.52,
      "grad_norm": 0.48104792833328247,
      "learning_rate": 1.9369152030840553e-07,
      "loss": 0.3743,
      "step": 690
    },
    {
      "epoch": 5.6,
      "grad_norm": 0.53028404712677,
      "learning_rate": 1.3477564710088097e-07,
      "loss": 0.3542,
      "step": 700
    },
    {
      "epoch": 5.68,
      "grad_norm": 0.5596076846122742,
      "learning_rate": 8.639676646793382e-08,
      "loss": 0.3587,
      "step": 710
    },
    {
      "epoch": 5.76,
      "grad_norm": 0.5660611391067505,
      "learning_rate": 4.865965629214819e-08,
      "loss": 0.414,
      "step": 720
    },
    {
      "epoch": 5.84,
      "grad_norm": 0.4768368899822235,
      "learning_rate": 2.1646046750978255e-08,
      "loss": 0.3887,
      "step": 730
    },
    {
      "epoch": 5.92,
      "grad_norm": 0.5675653219223022,
      "learning_rate": 5.414443307377171e-09,
      "loss": 0.3811,
      "step": 740
    },
    {
      "epoch": 6.0,
      "grad_norm": 0.5585565567016602,
      "learning_rate": 0.0,
      "loss": 0.3763,
      "step": 750
    },
    {
      "epoch": 6.0,
      "step": 750,
      "total_flos": 118427603697664.0,
      "train_loss": 0.662273271560669,
      "train_runtime": 26488.1763,
      "train_samples_per_second": 0.227,
      "train_steps_per_second": 0.028
    }
  ],
  "logging_steps": 10,
  "max_steps": 750,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 6,
  "save_steps": 100000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 118427603697664.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}