|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 549, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0546448087431694, |
|
"grad_norm": 9.216311123646195, |
|
"learning_rate": 5e-06, |
|
"loss": 1.0197, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.1092896174863388, |
|
"grad_norm": 2.84822970052699, |
|
"learning_rate": 5e-06, |
|
"loss": 0.9201, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.16393442622950818, |
|
"grad_norm": 1.1976026595489564, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8893, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.2185792349726776, |
|
"grad_norm": 0.7151750657967473, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8686, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.273224043715847, |
|
"grad_norm": 0.7178785706016054, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8566, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.32786885245901637, |
|
"grad_norm": 1.3738089444275707, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8464, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3825136612021858, |
|
"grad_norm": 0.7307703509770203, |
|
"learning_rate": 5e-06, |
|
"loss": 0.837, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4371584699453552, |
|
"grad_norm": 0.9793900236646151, |
|
"learning_rate": 5e-06, |
|
"loss": 0.834, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.4918032786885246, |
|
"grad_norm": 0.6074469784310406, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8239, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.546448087431694, |
|
"grad_norm": 0.7581415259721198, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8203, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6010928961748634, |
|
"grad_norm": 0.6207562995736402, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8191, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.6557377049180327, |
|
"grad_norm": 0.712118501956403, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8148, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7103825136612022, |
|
"grad_norm": 0.5406181591483699, |
|
"learning_rate": 5e-06, |
|
"loss": 0.816, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.7650273224043715, |
|
"grad_norm": 0.801157400991226, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8084, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.819672131147541, |
|
"grad_norm": 1.3907122538264838, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8101, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.8743169398907104, |
|
"grad_norm": 0.9374210060236701, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8095, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.9289617486338798, |
|
"grad_norm": 0.7357026277429077, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8056, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.9836065573770492, |
|
"grad_norm": 0.6439903150890136, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8067, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.8027574419975281, |
|
"eval_runtime": 18.4703, |
|
"eval_samples_per_second": 266.536, |
|
"eval_steps_per_second": 1.083, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.0382513661202186, |
|
"grad_norm": 0.9901943553002889, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7796, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.092896174863388, |
|
"grad_norm": 0.6647757287607619, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7669, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.1475409836065573, |
|
"grad_norm": 0.9580968284481025, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7664, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.2021857923497268, |
|
"grad_norm": 0.8889371423217748, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7675, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.2568306010928962, |
|
"grad_norm": 0.9247757328115808, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7625, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.3114754098360657, |
|
"grad_norm": 0.7030804123659421, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7669, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.366120218579235, |
|
"grad_norm": 0.6820492241392998, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7679, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.4207650273224044, |
|
"grad_norm": 0.5636718084741676, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7606, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.4754098360655736, |
|
"grad_norm": 0.639581027268557, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7593, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.530054644808743, |
|
"grad_norm": 0.6987258742060128, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7612, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.5846994535519126, |
|
"grad_norm": 0.6936443486720386, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7582, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.639344262295082, |
|
"grad_norm": 0.5505691855153351, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7625, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.6939890710382515, |
|
"grad_norm": 0.5640162883535096, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7626, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.748633879781421, |
|
"grad_norm": 0.6321872003984678, |
|
"learning_rate": 5e-06, |
|
"loss": 0.764, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.8032786885245902, |
|
"grad_norm": 0.6338738498810821, |
|
"learning_rate": 5e-06, |
|
"loss": 0.762, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.8579234972677594, |
|
"grad_norm": 0.5229579892525594, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7672, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.9125683060109289, |
|
"grad_norm": 0.7287452929104943, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7615, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.9672131147540983, |
|
"grad_norm": 0.7034416943673282, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7596, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.7888949513435364, |
|
"eval_runtime": 18.3314, |
|
"eval_samples_per_second": 268.556, |
|
"eval_steps_per_second": 1.091, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 2.021857923497268, |
|
"grad_norm": 1.0144238515169914, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7482, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.0765027322404372, |
|
"grad_norm": 0.8603434912719397, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7216, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.1311475409836067, |
|
"grad_norm": 0.7563179754120521, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7175, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.185792349726776, |
|
"grad_norm": 0.9559125673842068, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7222, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.240437158469945, |
|
"grad_norm": 0.769800549346291, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7184, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.2950819672131146, |
|
"grad_norm": 0.6419920496502396, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7222, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.349726775956284, |
|
"grad_norm": 0.7532650987823766, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7246, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.4043715846994536, |
|
"grad_norm": 0.5920199594002061, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7228, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.459016393442623, |
|
"grad_norm": 0.6161056536206126, |
|
"learning_rate": 5e-06, |
|
"loss": 0.722, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.5136612021857925, |
|
"grad_norm": 0.7438641211945972, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7215, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.5683060109289615, |
|
"grad_norm": 0.6857540384247945, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7255, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.6229508196721314, |
|
"grad_norm": 0.5697372445993161, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7231, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.6775956284153004, |
|
"grad_norm": 0.868972289179613, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7238, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.73224043715847, |
|
"grad_norm": 0.6159867401214144, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7208, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.7868852459016393, |
|
"grad_norm": 0.584105759494615, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7263, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.841530054644809, |
|
"grad_norm": 0.5906676150264403, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7221, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.8961748633879782, |
|
"grad_norm": 0.5994810510441251, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7266, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.9508196721311473, |
|
"grad_norm": 0.7788656644195687, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7194, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.7868959903717041, |
|
"eval_runtime": 18.0047, |
|
"eval_samples_per_second": 273.428, |
|
"eval_steps_per_second": 1.111, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 549, |
|
"total_flos": 919595447746560.0, |
|
"train_loss": 0.7765402672286459, |
|
"train_runtime": 3626.8952, |
|
"train_samples_per_second": 77.367, |
|
"train_steps_per_second": 0.151 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 549, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 919595447746560.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|