|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.5, |
|
"eval_steps": 500, |
|
"global_step": 280, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.08928571428571429, |
|
"grad_norm": 3.3681201934814453, |
|
"learning_rate": 0.00014285714285714287, |
|
"loss": 1.2703, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.17857142857142858, |
|
"grad_norm": 1.8422191143035889, |
|
"learning_rate": 0.00019974902686642558, |
|
"loss": 0.2993, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.26785714285714285, |
|
"grad_norm": 1.02767014503479, |
|
"learning_rate": 0.00019821986184473755, |
|
"loss": 0.187, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.35714285714285715, |
|
"grad_norm": 1.2515795230865479, |
|
"learning_rate": 0.00019532224059997692, |
|
"loss": 0.119, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.44642857142857145, |
|
"grad_norm": 1.3811193704605103, |
|
"learning_rate": 0.00019109653447608606, |
|
"loss": 0.0941, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.5357142857142857, |
|
"grad_norm": 0.8252667784690857, |
|
"learning_rate": 0.00018560161846773002, |
|
"loss": 0.0928, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 0.9444553256034851, |
|
"learning_rate": 0.00017891405093963938, |
|
"loss": 0.0764, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 0.889909029006958, |
|
"learning_rate": 0.00017112700697095954, |
|
"loss": 0.0707, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.8035714285714286, |
|
"grad_norm": 0.8613348007202148, |
|
"learning_rate": 0.00016234898018587337, |
|
"loss": 0.068, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.8928571428571429, |
|
"grad_norm": 0.9881114363670349, |
|
"learning_rate": 0.0001527022711573479, |
|
"loss": 0.071, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.9821428571428571, |
|
"grad_norm": 0.8167979121208191, |
|
"learning_rate": 0.0001423212834444425, |
|
"loss": 0.0581, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.0714285714285714, |
|
"grad_norm": 0.6212513446807861, |
|
"learning_rate": 0.00013135065100377814, |
|
"loss": 0.0697, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.1607142857142858, |
|
"grad_norm": 0.9011110067367554, |
|
"learning_rate": 0.00011994322306515927, |
|
"loss": 0.0447, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.7870438098907471, |
|
"learning_rate": 0.00010825793454723325, |
|
"loss": 0.0413, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.3392857142857144, |
|
"grad_norm": 0.3967013359069824, |
|
"learning_rate": 9.645759168379463e-05, |
|
"loss": 0.0534, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 0.4754908084869385, |
|
"learning_rate": 8.47066037126754e-05, |
|
"loss": 0.0415, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.5178571428571428, |
|
"grad_norm": 0.564179539680481, |
|
"learning_rate": 7.316869223065156e-05, |
|
"loss": 0.0364, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.6071428571428572, |
|
"grad_norm": 0.38168346881866455, |
|
"learning_rate": 6.200461012896402e-05, |
|
"loss": 0.0363, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.6964285714285714, |
|
"grad_norm": 0.2824791371822357, |
|
"learning_rate": 5.136990189057187e-05, |
|
"loss": 0.0392, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.7857142857142856, |
|
"grad_norm": 0.5657776594161987, |
|
"learning_rate": 4.141273645397754e-05, |
|
"loss": 0.0387, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 0.4336404800415039, |
|
"learning_rate": 3.227184283742591e-05, |
|
"loss": 0.0418, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.9642857142857144, |
|
"grad_norm": 0.32696864008903503, |
|
"learning_rate": 2.407457728556115e-05, |
|
"loss": 0.0334, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.0535714285714284, |
|
"grad_norm": 0.6701914072036743, |
|
"learning_rate": 1.693514886817772e-05, |
|
"loss": 0.0298, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.142857142857143, |
|
"grad_norm": 0.35550254583358765, |
|
"learning_rate": 1.0953028253055542e-05, |
|
"loss": 0.031, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.232142857142857, |
|
"grad_norm": 0.26464036107063293, |
|
"learning_rate": 6.211561822781476e-06, |
|
"loss": 0.0325, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.3214285714285716, |
|
"grad_norm": 0.349539190530777, |
|
"learning_rate": 2.7768104444869436e-06, |
|
"loss": 0.0235, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.4107142857142856, |
|
"grad_norm": 0.255789190530777, |
|
"learning_rate": 6.966290714375933e-07, |
|
"loss": 0.0342, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.19940122961997986, |
|
"learning_rate": 0.0, |
|
"loss": 0.0338, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"step": 280, |
|
"total_flos": 5489054121567744.0, |
|
"train_loss": 0.10958067295806749, |
|
"train_runtime": 107.8356, |
|
"train_samples_per_second": 41.545, |
|
"train_steps_per_second": 2.597 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 280, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5489054121567744.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|