{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 3000, "global_step": 88686, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 3.382721060821325e-05, "grad_norm": 1896.0, "learning_rate": 0.00029999661727893914, "loss": 32.25, "step": 1 }, { "epoch": 0.10148163182463973, "grad_norm": 1.2734375, "learning_rate": 0.000289851836817536, "loss": 3.8074, "step": 3000 }, { "epoch": 0.10148163182463973, "eval_loss": 2.4561102390289307, "eval_runtime": 92.9065, "eval_samples_per_second": 1016.13, "eval_steps_per_second": 7.943, "step": 3000 }, { "epoch": 0.20296326364927947, "grad_norm": 1.15625, "learning_rate": 0.000279703673635072, "loss": 3.182, "step": 6000 }, { "epoch": 0.20296326364927947, "eval_loss": 2.3882644176483154, "eval_runtime": 92.9373, "eval_samples_per_second": 1015.792, "eval_steps_per_second": 7.941, "step": 6000 }, { "epoch": 0.30444489547391923, "grad_norm": 1.390625, "learning_rate": 0.00026955551045260807, "loss": 3.1161, "step": 9000 }, { "epoch": 0.30444489547391923, "eval_loss": 2.365419626235962, "eval_runtime": 92.9113, "eval_samples_per_second": 1016.077, "eval_steps_per_second": 7.943, "step": 9000 }, { "epoch": 0.40592652729855894, "grad_norm": 1.1640625, "learning_rate": 0.0002594073472701441, "loss": 3.0863, "step": 12000 }, { "epoch": 0.40592652729855894, "eval_loss": 2.339674472808838, "eval_runtime": 92.9289, "eval_samples_per_second": 1015.884, "eval_steps_per_second": 7.942, "step": 12000 }, { "epoch": 0.5074081591231987, "grad_norm": 1.3359375, "learning_rate": 0.0002492591840876801, "loss": 3.0643, "step": 15000 }, { "epoch": 0.5074081591231987, "eval_loss": 2.3348894119262695, "eval_runtime": 92.9352, "eval_samples_per_second": 1015.815, "eval_steps_per_second": 7.941, "step": 15000 }, { "epoch": 0.6088897909478385, "grad_norm": 1.25, "learning_rate": 0.0002391110209052161, "loss": 3.0593, "step": 18000 }, { "epoch": 0.6088897909478385, "eval_loss": 2.32326602935791, "eval_runtime": 92.9449, "eval_samples_per_second": 1015.709, "eval_steps_per_second": 7.94, "step": 18000 }, { "epoch": 0.7103714227724782, "grad_norm": 1.46875, "learning_rate": 0.00022896285772275215, "loss": 3.0519, "step": 21000 }, { "epoch": 0.7103714227724782, "eval_loss": 2.325340986251831, "eval_runtime": 92.952, "eval_samples_per_second": 1015.632, "eval_steps_per_second": 7.94, "step": 21000 }, { "epoch": 0.8118530545971179, "grad_norm": 1.3515625, "learning_rate": 0.0002188146945402882, "loss": 3.0464, "step": 24000 }, { "epoch": 0.8118530545971179, "eval_loss": 2.320810079574585, "eval_runtime": 92.965, "eval_samples_per_second": 1015.489, "eval_steps_per_second": 7.938, "step": 24000 }, { "epoch": 0.9133346864217576, "grad_norm": 1.234375, "learning_rate": 0.00020866653135782423, "loss": 3.0434, "step": 27000 }, { "epoch": 0.9133346864217576, "eval_loss": 2.3228955268859863, "eval_runtime": 93.1104, "eval_samples_per_second": 1013.904, "eval_steps_per_second": 7.926, "step": 27000 }, { "epoch": 1.0148163182463974, "grad_norm": 1.21875, "learning_rate": 0.00019851836817536025, "loss": 3.0389, "step": 30000 }, { "epoch": 1.0148163182463974, "eval_loss": 2.3193914890289307, "eval_runtime": 93.0867, "eval_samples_per_second": 1014.162, "eval_steps_per_second": 7.928, "step": 30000 }, { "epoch": 1.116297950071037, "grad_norm": 1.2734375, "learning_rate": 0.0001883702049928963, "loss": 3.0377, "step": 33000 }, { "epoch": 1.116297950071037, "eval_loss": 2.3154852390289307, "eval_runtime": 93.1029, "eval_samples_per_second": 1013.985, "eval_steps_per_second": 7.927, "step": 33000 }, { "epoch": 1.217779581895677, "grad_norm": 1.296875, "learning_rate": 0.0001782220418104323, "loss": 3.0319, "step": 36000 }, { "epoch": 1.217779581895677, "eval_loss": 2.3132622241973877, "eval_runtime": 93.1111, "eval_samples_per_second": 1013.897, "eval_steps_per_second": 7.926, "step": 36000 }, { "epoch": 1.3192612137203166, "grad_norm": 1.1796875, "learning_rate": 0.00016807387862796832, "loss": 3.0328, "step": 39000 }, { "epoch": 1.3192612137203166, "eval_loss": 2.310361623764038, "eval_runtime": 93.0989, "eval_samples_per_second": 1014.03, "eval_steps_per_second": 7.927, "step": 39000 }, { "epoch": 1.4207428455449564, "grad_norm": 1.265625, "learning_rate": 0.00015792571544550436, "loss": 3.0314, "step": 42000 }, { "epoch": 1.4207428455449564, "eval_loss": 2.309779405593872, "eval_runtime": 93.116, "eval_samples_per_second": 1013.843, "eval_steps_per_second": 7.926, "step": 42000 }, { "epoch": 1.522224477369596, "grad_norm": 1.4140625, "learning_rate": 0.00014777755226304037, "loss": 3.0306, "step": 45000 }, { "epoch": 1.522224477369596, "eval_loss": 2.3095569610595703, "eval_runtime": 93.0822, "eval_samples_per_second": 1014.211, "eval_steps_per_second": 7.928, "step": 45000 }, { "epoch": 1.6237061091942357, "grad_norm": 1.3359375, "learning_rate": 0.0001376293890805764, "loss": 3.0251, "step": 48000 }, { "epoch": 1.6237061091942357, "eval_loss": 2.314659595489502, "eval_runtime": 93.0784, "eval_samples_per_second": 1014.253, "eval_steps_per_second": 7.929, "step": 48000 }, { "epoch": 1.7251877410188756, "grad_norm": 1.3515625, "learning_rate": 0.00012748122589811243, "loss": 3.027, "step": 51000 }, { "epoch": 1.7251877410188756, "eval_loss": 2.3094723224639893, "eval_runtime": 92.9414, "eval_samples_per_second": 1015.748, "eval_steps_per_second": 7.94, "step": 51000 }, { "epoch": 1.8266693728435153, "grad_norm": 1.2109375, "learning_rate": 0.00011733306271564845, "loss": 3.0238, "step": 54000 }, { "epoch": 1.8266693728435153, "eval_loss": 2.3098957538604736, "eval_runtime": 92.9753, "eval_samples_per_second": 1015.377, "eval_steps_per_second": 7.938, "step": 54000 }, { "epoch": 1.928151004668155, "grad_norm": 1.3125, "learning_rate": 0.00010718489953318448, "loss": 3.0246, "step": 57000 }, { "epoch": 1.928151004668155, "eval_loss": 2.308159828186035, "eval_runtime": 92.9437, "eval_samples_per_second": 1015.722, "eval_steps_per_second": 7.94, "step": 57000 }, { "epoch": 2.029632636492795, "grad_norm": 1.328125, "learning_rate": 9.703673635072052e-05, "loss": 3.0253, "step": 60000 }, { "epoch": 2.029632636492795, "eval_loss": 2.30761981010437, "eval_runtime": 92.9512, "eval_samples_per_second": 1015.641, "eval_steps_per_second": 7.94, "step": 60000 }, { "epoch": 2.1311142683174347, "grad_norm": 1.296875, "learning_rate": 8.688857316825655e-05, "loss": 3.0261, "step": 63000 }, { "epoch": 2.1311142683174347, "eval_loss": 2.310319185256958, "eval_runtime": 92.926, "eval_samples_per_second": 1015.916, "eval_steps_per_second": 7.942, "step": 63000 }, { "epoch": 2.232595900142074, "grad_norm": 1.2265625, "learning_rate": 7.674040998579256e-05, "loss": 3.0242, "step": 66000 }, { "epoch": 2.232595900142074, "eval_loss": 2.3085408210754395, "eval_runtime": 92.9709, "eval_samples_per_second": 1015.425, "eval_steps_per_second": 7.938, "step": 66000 }, { "epoch": 2.334077531966714, "grad_norm": 1.2890625, "learning_rate": 6.659224680332859e-05, "loss": 3.0245, "step": 69000 }, { "epoch": 2.334077531966714, "eval_loss": 2.3078315258026123, "eval_runtime": 92.9373, "eval_samples_per_second": 1015.793, "eval_steps_per_second": 7.941, "step": 69000 }, { "epoch": 2.435559163791354, "grad_norm": 1.234375, "learning_rate": 5.644408362086462e-05, "loss": 3.024, "step": 72000 }, { "epoch": 2.435559163791354, "eval_loss": 2.3096206188201904, "eval_runtime": 92.9124, "eval_samples_per_second": 1016.065, "eval_steps_per_second": 7.943, "step": 72000 }, { "epoch": 2.5370407956159937, "grad_norm": 1.3828125, "learning_rate": 4.629592043840065e-05, "loss": 3.0251, "step": 75000 }, { "epoch": 2.5370407956159937, "eval_loss": 2.30861496925354, "eval_runtime": 92.9585, "eval_samples_per_second": 1015.561, "eval_steps_per_second": 7.939, "step": 75000 }, { "epoch": 2.638522427440633, "grad_norm": 1.265625, "learning_rate": 3.614775725593667e-05, "loss": 3.0257, "step": 78000 }, { "epoch": 2.638522427440633, "eval_loss": 2.308117389678955, "eval_runtime": 92.9722, "eval_samples_per_second": 1015.411, "eval_steps_per_second": 7.938, "step": 78000 }, { "epoch": 2.740004059265273, "grad_norm": 1.2421875, "learning_rate": 2.59995940734727e-05, "loss": 3.0224, "step": 81000 }, { "epoch": 2.740004059265273, "eval_loss": 2.308805465698242, "eval_runtime": 92.931, "eval_samples_per_second": 1015.861, "eval_steps_per_second": 7.941, "step": 81000 }, { "epoch": 2.841485691089913, "grad_norm": 1.2890625, "learning_rate": 1.5851430891008727e-05, "loss": 3.0267, "step": 84000 }, { "epoch": 2.841485691089913, "eval_loss": 2.3082549571990967, "eval_runtime": 92.9136, "eval_samples_per_second": 1016.051, "eval_steps_per_second": 7.943, "step": 84000 }, { "epoch": 2.9429673229145523, "grad_norm": 1.2578125, "learning_rate": 5.703267708544753e-06, "loss": 3.0243, "step": 87000 }, { "epoch": 2.9429673229145523, "eval_loss": 2.3083608150482178, "eval_runtime": 92.9434, "eval_samples_per_second": 1015.726, "eval_steps_per_second": 7.94, "step": 87000 }, { "epoch": 3.0, "step": 88686, "total_flos": 1.2889476521431695e+18, "train_loss": 3.0687490486097015, "train_runtime": 36823.0101, "train_samples_per_second": 308.271, "train_steps_per_second": 2.408 } ], "logging_steps": 3000, "max_steps": 88686, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2889476521431695e+18, "train_batch_size": 128, "trial_name": null, "trial_params": null }