{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 25872, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01932591218305504, "grad_norm": 135.7185821533203, "learning_rate": 1.9088098918083462e-06, "loss": 2.7419, "step": 500 }, { "epoch": 0.03865182436611008, "grad_norm": 71.08179473876953, "learning_rate": 3.840803709428131e-06, "loss": 1.6608, "step": 1000 }, { "epoch": 0.05797773654916512, "grad_norm": 102.95220184326172, "learning_rate": 5.772797527047914e-06, "loss": 1.5107, "step": 1500 }, { "epoch": 0.07730364873222016, "grad_norm": 72.49132537841797, "learning_rate": 7.704791344667698e-06, "loss": 1.3103, "step": 2000 }, { "epoch": 0.0966295609152752, "grad_norm": 78.74112701416016, "learning_rate": 9.636785162287482e-06, "loss": 1.3606, "step": 2500 }, { "epoch": 0.11595547309833024, "grad_norm": 132.5709991455078, "learning_rate": 9.825631334822196e-06, "loss": 1.2151, "step": 3000 }, { "epoch": 0.13528138528138528, "grad_norm": 89.91647338867188, "learning_rate": 9.61089159938155e-06, "loss": 1.1923, "step": 3500 }, { "epoch": 0.15460729746444032, "grad_norm": 31.426151275634766, "learning_rate": 9.396581343411786e-06, "loss": 1.1765, "step": 4000 }, { "epoch": 0.17393320964749537, "grad_norm": 93.264404296875, "learning_rate": 9.18184160797114e-06, "loss": 1.1631, "step": 4500 }, { "epoch": 0.1932591218305504, "grad_norm": 94.13616943359375, "learning_rate": 8.967101872530494e-06, "loss": 1.1176, "step": 5000 }, { "epoch": 0.21258503401360543, "grad_norm": 67.35560607910156, "learning_rate": 8.752362137089849e-06, "loss": 1.1277, "step": 5500 }, { "epoch": 0.23191094619666047, "grad_norm": 34.71173858642578, "learning_rate": 8.537622401649201e-06, "loss": 1.0943, "step": 6000 }, { "epoch": 0.25123685837971554, "grad_norm": 134.21011352539062, "learning_rate": 8.322882666208556e-06, "loss": 1.052, "step": 6500 }, { "epoch": 0.27056277056277056, "grad_norm": 100.96661376953125, "learning_rate": 8.10857241023879e-06, "loss": 1.0651, "step": 7000 }, { "epoch": 0.2898886827458256, "grad_norm": 95.49282836914062, "learning_rate": 7.893832674798145e-06, "loss": 1.0536, "step": 7500 }, { "epoch": 0.30921459492888065, "grad_norm": 94.79146575927734, "learning_rate": 7.6790929393575e-06, "loss": 1.0546, "step": 8000 }, { "epoch": 0.32854050711193566, "grad_norm": 67.1232681274414, "learning_rate": 7.464353203916853e-06, "loss": 1.0366, "step": 8500 }, { "epoch": 0.34786641929499074, "grad_norm": 48.50230026245117, "learning_rate": 7.250042947947089e-06, "loss": 1.0106, "step": 9000 }, { "epoch": 0.36719233147804575, "grad_norm": 28.64300537109375, "learning_rate": 7.035303212506442e-06, "loss": 1.0843, "step": 9500 }, { "epoch": 0.3865182436611008, "grad_norm": 48.05903625488281, "learning_rate": 6.820563477065797e-06, "loss": 1.0454, "step": 10000 }, { "epoch": 0.40584415584415584, "grad_norm": 53.515464782714844, "learning_rate": 6.605823741625152e-06, "loss": 0.9767, "step": 10500 }, { "epoch": 0.42517006802721086, "grad_norm": 2.856135606765747, "learning_rate": 6.391084006184504e-06, "loss": 0.9433, "step": 11000 }, { "epoch": 0.44449598021026593, "grad_norm": 125.483154296875, "learning_rate": 6.17677375021474e-06, "loss": 0.9314, "step": 11500 }, { "epoch": 0.46382189239332094, "grad_norm": 45.01868438720703, "learning_rate": 5.962034014774094e-06, "loss": 0.9568, "step": 12000 }, { "epoch": 0.483147804576376, "grad_norm": 
15.953567504882812, "learning_rate": 5.747294279333448e-06, "loss": 0.8972, "step": 12500 }, { "epoch": 0.5024737167594311, "grad_norm": 85.09109497070312, "learning_rate": 5.532554543892802e-06, "loss": 0.9156, "step": 13000 }, { "epoch": 0.5217996289424861, "grad_norm": 53.42656707763672, "learning_rate": 5.317814808452156e-06, "loss": 0.9581, "step": 13500 }, { "epoch": 0.5411255411255411, "grad_norm": 72.82543182373047, "learning_rate": 5.103504552482392e-06, "loss": 0.9065, "step": 14000 }, { "epoch": 0.5604514533085961, "grad_norm": 88.66947174072266, "learning_rate": 4.8887648170417454e-06, "loss": 0.9134, "step": 14500 }, { "epoch": 0.5797773654916512, "grad_norm": 86.12175750732422, "learning_rate": 4.6740250816011e-06, "loss": 0.8819, "step": 15000 }, { "epoch": 0.5991032776747063, "grad_norm": 61.66287612915039, "learning_rate": 4.459285346160454e-06, "loss": 0.8798, "step": 15500 }, { "epoch": 0.6184291898577613, "grad_norm": 51.16645431518555, "learning_rate": 4.244975090190689e-06, "loss": 0.8437, "step": 16000 }, { "epoch": 0.6377551020408163, "grad_norm": 135.60870361328125, "learning_rate": 4.0302353547500435e-06, "loss": 0.8831, "step": 16500 }, { "epoch": 0.6570810142238713, "grad_norm": 89.30845642089844, "learning_rate": 3.815495619309397e-06, "loss": 0.8872, "step": 17000 }, { "epoch": 0.6764069264069265, "grad_norm": 54.0753288269043, "learning_rate": 3.6007558838687514e-06, "loss": 0.8717, "step": 17500 }, { "epoch": 0.6957328385899815, "grad_norm": 28.510011672973633, "learning_rate": 3.3860161484281057e-06, "loss": 0.8599, "step": 18000 }, { "epoch": 0.7150587507730365, "grad_norm": 42.08302307128906, "learning_rate": 3.1712764129874596e-06, "loss": 0.7934, "step": 18500 }, { "epoch": 0.7343846629560915, "grad_norm": 54.24616622924805, "learning_rate": 2.956966157017695e-06, "loss": 0.7987, "step": 19000 }, { "epoch": 0.7537105751391465, "grad_norm": 66.43406677246094, "learning_rate": 2.742226421577049e-06, "loss": 0.8707, "step": 19500 }, { "epoch": 0.7730364873222016, "grad_norm": 47.240234375, "learning_rate": 2.527486686136403e-06, "loss": 0.8444, "step": 20000 }, { "epoch": 0.7923623995052567, "grad_norm": 50.62797164916992, "learning_rate": 2.312746950695757e-06, "loss": 0.8591, "step": 20500 }, { "epoch": 0.8116883116883117, "grad_norm": 41.326271057128906, "learning_rate": 2.0980072152551107e-06, "loss": 0.7583, "step": 21000 }, { "epoch": 0.8310142238713667, "grad_norm": 101.81986236572266, "learning_rate": 1.883267479814465e-06, "loss": 0.9015, "step": 21500 }, { "epoch": 0.8503401360544217, "grad_norm": 109.75096893310547, "learning_rate": 1.6689572238447004e-06, "loss": 0.798, "step": 22000 }, { "epoch": 0.8696660482374768, "grad_norm": 74.9537582397461, "learning_rate": 1.4542174884040545e-06, "loss": 0.8254, "step": 22500 }, { "epoch": 0.8889919604205319, "grad_norm": 53.3721923828125, "learning_rate": 1.2394777529634084e-06, "loss": 0.8392, "step": 23000 }, { "epoch": 0.9083178726035869, "grad_norm": 74.43936157226562, "learning_rate": 1.0247380175227625e-06, "loss": 0.7987, "step": 23500 }, { "epoch": 0.9276437847866419, "grad_norm": 22.659399032592773, "learning_rate": 8.099982820821166e-07, "loss": 0.8334, "step": 24000 }, { "epoch": 0.946969696969697, "grad_norm": 57.379234313964844, "learning_rate": 5.952585466414706e-07, "loss": 0.7609, "step": 24500 }, { "epoch": 0.966295609152752, "grad_norm": 48.525848388671875, "learning_rate": 3.8051881120082463e-07, "loss": 0.8165, "step": 25000 }, { "epoch": 0.985621521335807, "grad_norm": 
29.62494468688965, "learning_rate": 1.6577907576017866e-07, "loss": 0.8133, "step": 25500 } ], "logging_steps": 500, "max_steps": 25872, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }