{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 100, "global_step": 154, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06493506493506493, "grad_norm": 6.185503005981445, "learning_rate": 4.991681456235483e-05, "loss": 4.5691, "num_input_tokens_seen": 200, "step": 5 }, { "epoch": 0.12987012987012986, "grad_norm": 4.274949550628662, "learning_rate": 4.9579822269987574e-05, "loss": 5.3026, "num_input_tokens_seen": 376, "step": 10 }, { "epoch": 0.19480519480519481, "grad_norm": 3.6588423252105713, "learning_rate": 4.898732434036244e-05, "loss": 3.9166, "num_input_tokens_seen": 592, "step": 15 }, { "epoch": 0.2597402597402597, "grad_norm": 7.039975643157959, "learning_rate": 4.814547975052245e-05, "loss": 4.0923, "num_input_tokens_seen": 800, "step": 20 }, { "epoch": 0.3246753246753247, "grad_norm": 7.7588348388671875, "learning_rate": 4.7063039419658035e-05, "loss": 3.5885, "num_input_tokens_seen": 992, "step": 25 }, { "epoch": 0.38961038961038963, "grad_norm": 6.176125526428223, "learning_rate": 4.5751255243877015e-05, "loss": 2.6237, "num_input_tokens_seen": 1160, "step": 30 }, { "epoch": 0.45454545454545453, "grad_norm": 5.279819488525391, "learning_rate": 4.422376313348405e-05, "loss": 2.1141, "num_input_tokens_seen": 1384, "step": 35 }, { "epoch": 0.5194805194805194, "grad_norm": 6.731473922729492, "learning_rate": 4.2496441268589046e-05, "loss": 1.6475, "num_input_tokens_seen": 1584, "step": 40 }, { "epoch": 0.5844155844155844, "grad_norm": 4.555207252502441, "learning_rate": 4.058724504646834e-05, "loss": 2.3441, "num_input_tokens_seen": 1784, "step": 45 }, { "epoch": 0.6493506493506493, "grad_norm": 4.005583763122559, "learning_rate": 3.851602043638994e-05, "loss": 1.6557, "num_input_tokens_seen": 2008, "step": 50 }, { "epoch": 0.7142857142857143, "grad_norm": 4.235661029815674, "learning_rate": 3.6304297682067144e-05, "loss": 1.4454, "num_input_tokens_seen": 2200, "step": 55 }, { "epoch": 0.7792207792207793, "grad_norm": 4.245209693908691, "learning_rate": 3.3975067496189965e-05, "loss": 1.6826, "num_input_tokens_seen": 2448, "step": 60 }, { "epoch": 0.8441558441558441, "grad_norm": 3.096132278442383, "learning_rate": 3.1552542073477555e-05, "loss": 1.4454, "num_input_tokens_seen": 2656, "step": 65 }, { "epoch": 0.9090909090909091, "grad_norm": 3.5163967609405518, "learning_rate": 2.9061903406505154e-05, "loss": 1.4228, "num_input_tokens_seen": 2888, "step": 70 }, { "epoch": 0.974025974025974, "grad_norm": 8.242713928222656, "learning_rate": 2.652904152054607e-05, "loss": 1.3693, "num_input_tokens_seen": 3080, "step": 75 }, { "epoch": 1.0389610389610389, "grad_norm": 3.690077543258667, "learning_rate": 2.3980285348460363e-05, "loss": 1.9372, "num_input_tokens_seen": 3272, "step": 80 }, { "epoch": 1.103896103896104, "grad_norm": 3.8814001083374023, "learning_rate": 2.1442129043167874e-05, "loss": 1.0038, "num_input_tokens_seen": 3456, "step": 85 }, { "epoch": 1.1688311688311688, "grad_norm": 6.378023624420166, "learning_rate": 1.8940956572669692e-05, "loss": 1.0141, "num_input_tokens_seen": 3648, "step": 90 }, { "epoch": 1.2337662337662338, "grad_norm": 2.539525032043457, "learning_rate": 1.6502767460434588e-05, "loss": 0.9873, "num_input_tokens_seen": 3856, "step": 95 }, { "epoch": 1.2987012987012987, "grad_norm": 2.735564708709717, "learning_rate": 1.4152906522061048e-05, "loss": 1.4175, "num_input_tokens_seen": 4144, "step": 100 }, { "epoch": 1.2987012987012987, "eval_loss": 1.2581640481948853, "eval_runtime": 1.1348, "eval_samples_per_second": 12.337, "eval_steps_per_second": 12.337, "num_input_tokens_seen": 4144, "step": 100 }, { "epoch": 1.3636363636363638, "grad_norm": 3.563999891281128, "learning_rate": 1.1915800407584704e-05, "loss": 1.7128, "num_input_tokens_seen": 4368, "step": 105 }, { "epoch": 1.4285714285714286, "grad_norm": 5.1972432136535645, "learning_rate": 9.814703688056321e-06, "loss": 0.8593, "num_input_tokens_seen": 4560, "step": 110 }, { "epoch": 1.4935064935064934, "grad_norm": 5.918083667755127, "learning_rate": 7.871457125803896e-06, "loss": 1.2189, "num_input_tokens_seen": 4736, "step": 115 }, { "epoch": 1.5584415584415585, "grad_norm": 3.3397321701049805, "learning_rate": 6.106260641143546e-06, "loss": 1.0924, "num_input_tokens_seen": 4928, "step": 120 }, { "epoch": 1.6233766233766234, "grad_norm": 4.0677900314331055, "learning_rate": 4.537463335535161e-06, "loss": 1.422, "num_input_tokens_seen": 5152, "step": 125 }, { "epoch": 1.6883116883116882, "grad_norm": 3.7872872352600098, "learning_rate": 3.181372753878595e-06, "loss": 1.0191, "num_input_tokens_seen": 5360, "step": 130 }, { "epoch": 1.7532467532467533, "grad_norm": 3.1329193115234375, "learning_rate": 2.0520853686560178e-06, "loss": 1.0357, "num_input_tokens_seen": 5560, "step": 135 }, { "epoch": 1.8181818181818183, "grad_norm": 3.5212812423706055, "learning_rate": 1.1613400480268099e-06, "loss": 0.9097, "num_input_tokens_seen": 5728, "step": 140 }, { "epoch": 1.883116883116883, "grad_norm": 3.1972975730895996, "learning_rate": 5.183960310644748e-07, "loss": 0.8963, "num_input_tokens_seen": 5928, "step": 145 }, { "epoch": 1.948051948051948, "grad_norm": 3.8994648456573486, "learning_rate": 1.29936678574899e-07, "loss": 1.2464, "num_input_tokens_seen": 6136, "step": 150 }, { "epoch": 2.0, "num_input_tokens_seen": 6288, "step": 154, "total_flos": 267521861763072.0, "train_loss": 1.872008863981668, "train_runtime": 58.2813, "train_samples_per_second": 2.642, "train_steps_per_second": 2.642 } ], "logging_steps": 5, "max_steps": 154, "num_input_tokens_seen": 6288, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 267521861763072.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }