|
{ |
|
"best_global_step": 2500, |
|
"best_metric": 0.2815941572189331, |
|
"best_model_checkpoint": "output/checkpoint-2500", |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 2607, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.11514104778353483, |
|
"grad_norm": 1.027288794517517, |
|
"learning_rate": 7.586206896551724e-05, |
|
"loss": 1.2679, |
|
"mean_token_accuracy": 0.7215237715840339, |
|
"num_tokens": 1099167.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.23028209556706966, |
|
"grad_norm": 0.695963978767395, |
|
"learning_rate": 0.0001524904214559387, |
|
"loss": 1.0582, |
|
"mean_token_accuracy": 0.751626193523407, |
|
"num_tokens": 2216195.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3454231433506045, |
|
"grad_norm": 0.6828327775001526, |
|
"learning_rate": 0.0001998705544249015, |
|
"loss": 0.9402, |
|
"mean_token_accuracy": 0.7751538950204849, |
|
"num_tokens": 3351028.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.4605641911341393, |
|
"grad_norm": 0.6992027759552002, |
|
"learning_rate": 0.0001982973099683902, |
|
"loss": 0.8234, |
|
"mean_token_accuracy": 0.8007429817318916, |
|
"num_tokens": 4434795.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5757052389176741, |
|
"grad_norm": 0.696792721748352, |
|
"learning_rate": 0.00019496396989003193, |
|
"loss": 0.7318, |
|
"mean_token_accuracy": 0.8213237491250038, |
|
"num_tokens": 5539056.0, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5757052389176741, |
|
"eval_loss": 0.7196161150932312, |
|
"eval_mean_token_accuracy": 0.8238662799405311, |
|
"eval_num_tokens": 5539056.0, |
|
"eval_runtime": 262.5506, |
|
"eval_samples_per_second": 5.881, |
|
"eval_steps_per_second": 0.735, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.690846286701209, |
|
"grad_norm": 0.7375836968421936, |
|
"learning_rate": 0.0001899302204343428, |
|
"loss": 0.6422, |
|
"mean_token_accuracy": 0.8431682422757149, |
|
"num_tokens": 6644904.0, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.8059873344847438, |
|
"grad_norm": 0.8660125136375427, |
|
"learning_rate": 0.00018328619509919044, |
|
"loss": 0.6101, |
|
"mean_token_accuracy": 0.8511215424537659, |
|
"num_tokens": 7735586.0, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.9211283822682786, |
|
"grad_norm": 0.6886998414993286, |
|
"learning_rate": 0.00017515086072006204, |
|
"loss": 0.5523, |
|
"mean_token_accuracy": 0.8642850863933563, |
|
"num_tokens": 8832415.0, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.035693724812896, |
|
"grad_norm": 0.4650283753871918, |
|
"learning_rate": 0.00016566988726928513, |
|
"loss": 0.4643, |
|
"mean_token_accuracy": 0.8853124245327322, |
|
"num_tokens": 10069372.0, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.1508347725964305, |
|
"grad_norm": 0.45214158296585083, |
|
"learning_rate": 0.00015501303951322943, |
|
"loss": 0.3563, |
|
"mean_token_accuracy": 0.9097330266237259, |
|
"num_tokens": 11175820.0, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.1508347725964305, |
|
"eval_loss": 0.4442647695541382, |
|
"eval_mean_token_accuracy": 0.8929883834611566, |
|
"eval_num_tokens": 11175820.0, |
|
"eval_runtime": 261.982, |
|
"eval_samples_per_second": 5.894, |
|
"eval_steps_per_second": 0.737, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.2659758203799654, |
|
"grad_norm": 0.4694233536720276, |
|
"learning_rate": 0.00014337113723205126, |
|
"loss": 0.3286, |
|
"mean_token_accuracy": 0.9172343072295189, |
|
"num_tokens": 12296528.0, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.3811168681635002, |
|
"grad_norm": 0.41704440116882324, |
|
"learning_rate": 0.00013095263843179028, |
|
"loss": 0.3329, |
|
"mean_token_accuracy": 0.9168813681602478, |
|
"num_tokens": 13402337.0, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.496257915947035, |
|
"grad_norm": 0.503028392791748, |
|
"learning_rate": 0.00011797990672926652, |
|
"loss": 0.303, |
|
"mean_token_accuracy": 0.9241538748145104, |
|
"num_tokens": 14515324.0, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.61139896373057, |
|
"grad_norm": 0.5045246481895447, |
|
"learning_rate": 0.00010468522974537567, |
|
"loss": 0.2799, |
|
"mean_token_accuracy": 0.9298068460822105, |
|
"num_tokens": 15613579.0, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.7265400115141047, |
|
"grad_norm": 0.38614559173583984, |
|
"learning_rate": 9.130665980078394e-05, |
|
"loss": 0.2729, |
|
"mean_token_accuracy": 0.9326073843240738, |
|
"num_tokens": 16703824.0, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.7265400115141047, |
|
"eval_loss": 0.33792445063591003, |
|
"eval_mean_token_accuracy": 0.9197617784682951, |
|
"eval_num_tokens": 16703824.0, |
|
"eval_runtime": 263.1103, |
|
"eval_samples_per_second": 5.868, |
|
"eval_steps_per_second": 0.734, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.8416810592976396, |
|
"grad_norm": 0.4411994218826294, |
|
"learning_rate": 7.808375138984745e-05, |
|
"loss": 0.2643, |
|
"mean_token_accuracy": 0.9346091681718827, |
|
"num_tokens": 17799886.0, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.9568221070811744, |
|
"grad_norm": 0.4209880828857422, |
|
"learning_rate": 6.525327175685459e-05, |
|
"loss": 0.25, |
|
"mean_token_accuracy": 0.9377276867628097, |
|
"num_tokens": 18905695.0, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.071387449625792, |
|
"grad_norm": 0.24632178246974945, |
|
"learning_rate": 5.304496138031373e-05, |
|
"loss": 0.1888, |
|
"mean_token_accuracy": 0.9527464275384069, |
|
"num_tokens": 19951625.0, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.186528497409326, |
|
"grad_norm": 0.26484355330467224, |
|
"learning_rate": 4.167742027736482e-05, |
|
"loss": 0.1298, |
|
"mean_token_accuracy": 0.9669929701089859, |
|
"num_tokens": 21086937.0, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.301669545192861, |
|
"grad_norm": 0.3589678704738617, |
|
"learning_rate": 3.135419378747742e-05, |
|
"loss": 0.1374, |
|
"mean_token_accuracy": 0.9652480971813202, |
|
"num_tokens": 22171874.0, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.301669545192861, |
|
"eval_loss": 0.2958945631980896, |
|
"eval_mean_token_accuracy": 0.933752671424589, |
|
"eval_num_tokens": 22171874.0, |
|
"eval_runtime": 262.4289, |
|
"eval_samples_per_second": 5.883, |
|
"eval_steps_per_second": 0.735, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.416810592976396, |
|
"grad_norm": 0.11929790675640106, |
|
"learning_rate": 2.226012792275538e-05, |
|
"loss": 0.1233, |
|
"mean_token_accuracy": 0.9680935338139534, |
|
"num_tokens": 23289276.0, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.5319516407599307, |
|
"grad_norm": 0.2738422155380249, |
|
"learning_rate": 1.4558059545351143e-05, |
|
"loss": 0.1222, |
|
"mean_token_accuracy": 0.9688387343287468, |
|
"num_tokens": 24386257.0, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.6470926885434656, |
|
"grad_norm": 0.10122287273406982, |
|
"learning_rate": 8.385900637134792e-06, |
|
"loss": 0.117, |
|
"mean_token_accuracy": 0.9703225392103195, |
|
"num_tokens": 25488867.0, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.7622337363270004, |
|
"grad_norm": 0.1763971745967865, |
|
"learning_rate": 3.85416887020934e-06, |
|
"loss": 0.1182, |
|
"mean_token_accuracy": 0.9700831747055054, |
|
"num_tokens": 26588923.0, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.8773747841105353, |
|
"grad_norm": 0.1859463006258011, |
|
"learning_rate": 1.0440086954749517e-06, |
|
"loss": 0.1201, |
|
"mean_token_accuracy": 0.9696193218231202, |
|
"num_tokens": 27694888.0, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.8773747841105353, |
|
"eval_loss": 0.2815941572189331, |
|
"eval_mean_token_accuracy": 0.938341686453844, |
|
"eval_num_tokens": 27694888.0, |
|
"eval_runtime": 262.6157, |
|
"eval_samples_per_second": 5.879, |
|
"eval_steps_per_second": 0.735, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.99251583189407, |
|
"grad_norm": 0.3561207056045532, |
|
"learning_rate": 5.738383307818396e-09, |
|
"loss": 0.1175, |
|
"mean_token_accuracy": 0.9702259311079979, |
|
"num_tokens": 28776204.0, |
|
"step": 2600 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 2607, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3468058668043796e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|