{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 3000, "global_step": 88686, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 3.382721060821325e-05, "grad_norm": 1760.0, "learning_rate": 0.00029999661727893914, "loss": 57.5, "step": 1 }, { "epoch": 0.10148163182463973, "grad_norm": 1.3515625, "learning_rate": 0.000289851836817536, "loss": 4.5083, "step": 3000 }, { "epoch": 0.10148163182463973, "eval_loss": 2.8375253677368164, "eval_runtime": 88.2347, "eval_samples_per_second": 1069.931, "eval_steps_per_second": 8.364, "step": 3000 }, { "epoch": 0.20296326364927947, "grad_norm": 1.1875, "learning_rate": 0.000279703673635072, "loss": 3.6752, "step": 6000 }, { "epoch": 0.20296326364927947, "eval_loss": 2.74216628074646, "eval_runtime": 88.2236, "eval_samples_per_second": 1070.065, "eval_steps_per_second": 8.365, "step": 6000 }, { "epoch": 0.30444489547391923, "grad_norm": 1.203125, "learning_rate": 0.00026955551045260807, "loss": 3.598, "step": 9000 }, { "epoch": 0.30444489547391923, "eval_loss": 2.70210862159729, "eval_runtime": 88.2673, "eval_samples_per_second": 1069.535, "eval_steps_per_second": 8.361, "step": 9000 }, { "epoch": 0.40592652729855894, "grad_norm": 1.3203125, "learning_rate": 0.0002594073472701441, "loss": 3.5651, "step": 12000 }, { "epoch": 0.40592652729855894, "eval_loss": 2.686271905899048, "eval_runtime": 88.2934, "eval_samples_per_second": 1069.219, "eval_steps_per_second": 8.358, "step": 12000 }, { "epoch": 0.5074081591231987, "grad_norm": 1.3359375, "learning_rate": 0.0002492591840876801, "loss": 3.5409, "step": 15000 }, { "epoch": 0.5074081591231987, "eval_loss": 2.672954797744751, "eval_runtime": 88.2673, "eval_samples_per_second": 1069.535, "eval_steps_per_second": 8.361, "step": 15000 }, { "epoch": 0.6088897909478385, "grad_norm": 1.1796875, "learning_rate": 0.0002391110209052161, "loss": 3.5353, "step": 18000 }, { "epoch": 0.6088897909478385, "eval_loss": 2.6632580757141113, "eval_runtime": 88.3039, "eval_samples_per_second": 1069.092, "eval_steps_per_second": 8.358, "step": 18000 }, { "epoch": 0.7103714227724782, "grad_norm": 1.2265625, "learning_rate": 0.00022896285772275215, "loss": 3.5268, "step": 21000 }, { "epoch": 0.7103714227724782, "eval_loss": 2.6613523960113525, "eval_runtime": 88.3108, "eval_samples_per_second": 1069.009, "eval_steps_per_second": 8.357, "step": 21000 }, { "epoch": 0.8118530545971179, "grad_norm": 1.328125, "learning_rate": 0.0002188146945402882, "loss": 3.5223, "step": 24000 }, { "epoch": 0.8118530545971179, "eval_loss": 2.6542599201202393, "eval_runtime": 88.2938, "eval_samples_per_second": 1069.214, "eval_steps_per_second": 8.358, "step": 24000 }, { "epoch": 0.9133346864217576, "grad_norm": 1.28125, "learning_rate": 0.00020866653135782423, "loss": 3.5185, "step": 27000 }, { "epoch": 0.9133346864217576, "eval_loss": 2.6546196937561035, "eval_runtime": 88.3801, "eval_samples_per_second": 1068.17, "eval_steps_per_second": 8.35, "step": 27000 }, { "epoch": 1.0148163182463974, "grad_norm": 1.2578125, "learning_rate": 0.00019851836817536025, "loss": 3.515, "step": 30000 }, { "epoch": 1.0148163182463974, "eval_loss": 2.6509780883789062, "eval_runtime": 88.3972, "eval_samples_per_second": 1067.963, "eval_steps_per_second": 8.349, "step": 30000 }, { "epoch": 1.116297950071037, "grad_norm": 1.3828125, "learning_rate": 0.0001883702049928963, "loss": 3.5138, "step": 33000 }, { "epoch": 1.116297950071037, "eval_loss": 2.6480565071105957, "eval_runtime": 88.3803, "eval_samples_per_second": 1068.168, "eval_steps_per_second": 8.35, "step": 33000 }, { "epoch": 1.217779581895677, "grad_norm": 1.328125, "learning_rate": 0.0001782220418104323, "loss": 3.508, "step": 36000 }, { "epoch": 1.217779581895677, "eval_loss": 2.6456427574157715, "eval_runtime": 88.3938, "eval_samples_per_second": 1068.005, "eval_steps_per_second": 8.349, "step": 36000 }, { "epoch": 1.3192612137203166, "grad_norm": 1.453125, "learning_rate": 0.00016807387862796832, "loss": 3.5071, "step": 39000 }, { "epoch": 1.3192612137203166, "eval_loss": 2.642657518386841, "eval_runtime": 88.3875, "eval_samples_per_second": 1068.081, "eval_steps_per_second": 8.35, "step": 39000 }, { "epoch": 1.4207428455449564, "grad_norm": 1.2265625, "learning_rate": 0.00015792571544550436, "loss": 3.5058, "step": 42000 }, { "epoch": 1.4207428455449564, "eval_loss": 2.6432926654815674, "eval_runtime": 88.4239, "eval_samples_per_second": 1067.641, "eval_steps_per_second": 8.346, "step": 42000 }, { "epoch": 1.522224477369596, "grad_norm": 1.265625, "learning_rate": 0.00014777755226304037, "loss": 3.5063, "step": 45000 }, { "epoch": 1.522224477369596, "eval_loss": 2.644076108932495, "eval_runtime": 88.4016, "eval_samples_per_second": 1067.91, "eval_steps_per_second": 8.348, "step": 45000 }, { "epoch": 1.6237061091942357, "grad_norm": 1.3046875, "learning_rate": 0.0001376293890805764, "loss": 3.5003, "step": 48000 }, { "epoch": 1.6237061091942357, "eval_loss": 2.6422553062438965, "eval_runtime": 88.447, "eval_samples_per_second": 1067.362, "eval_steps_per_second": 8.344, "step": 48000 }, { "epoch": 1.7251877410188756, "grad_norm": 1.2734375, "learning_rate": 0.00012748122589811243, "loss": 3.5018, "step": 51000 }, { "epoch": 1.7251877410188756, "eval_loss": 2.643885612487793, "eval_runtime": 88.2483, "eval_samples_per_second": 1069.765, "eval_steps_per_second": 8.363, "step": 51000 }, { "epoch": 1.8266693728435153, "grad_norm": 1.2734375, "learning_rate": 0.00011733306271564845, "loss": 3.4992, "step": 54000 }, { "epoch": 1.8266693728435153, "eval_loss": 2.6422975063323975, "eval_runtime": 88.2892, "eval_samples_per_second": 1069.27, "eval_steps_per_second": 8.359, "step": 54000 }, { "epoch": 1.928151004668155, "grad_norm": 1.4765625, "learning_rate": 0.00010718489953318448, "loss": 3.5011, "step": 57000 }, { "epoch": 1.928151004668155, "eval_loss": 2.6424670219421387, "eval_runtime": 88.2616, "eval_samples_per_second": 1069.604, "eval_steps_per_second": 8.362, "step": 57000 }, { "epoch": 2.029632636492795, "grad_norm": 1.46875, "learning_rate": 9.703673635072052e-05, "loss": 3.5017, "step": 60000 }, { "epoch": 2.029632636492795, "eval_loss": 2.6414718627929688, "eval_runtime": 88.2423, "eval_samples_per_second": 1069.838, "eval_steps_per_second": 8.363, "step": 60000 }, { "epoch": 2.1311142683174347, "grad_norm": 1.2734375, "learning_rate": 8.688857316825655e-05, "loss": 3.5014, "step": 63000 }, { "epoch": 2.1311142683174347, "eval_loss": 2.6421916484832764, "eval_runtime": 88.2249, "eval_samples_per_second": 1070.049, "eval_steps_per_second": 8.365, "step": 63000 }, { "epoch": 2.232595900142074, "grad_norm": 1.3046875, "learning_rate": 7.674040998579256e-05, "loss": 3.4996, "step": 66000 }, { "epoch": 2.232595900142074, "eval_loss": 2.6411330699920654, "eval_runtime": 88.2534, "eval_samples_per_second": 1069.703, "eval_steps_per_second": 8.362, "step": 66000 }, { "epoch": 2.334077531966714, "grad_norm": 1.21875, "learning_rate": 6.659224680332859e-05, "loss": 3.4998, "step": 69000 }, { "epoch": 2.334077531966714, "eval_loss": 2.6422553062438965, "eval_runtime": 88.2778, "eval_samples_per_second": 1069.408, "eval_steps_per_second": 8.36, "step": 69000 }, { "epoch": 2.435559163791354, "grad_norm": 1.515625, "learning_rate": 5.644408362086462e-05, "loss": 3.5002, "step": 72000 }, { "epoch": 2.435559163791354, "eval_loss": 2.6412177085876465, "eval_runtime": 88.2555, "eval_samples_per_second": 1069.678, "eval_steps_per_second": 8.362, "step": 72000 }, { "epoch": 2.5370407956159937, "grad_norm": 1.28125, "learning_rate": 4.629592043840065e-05, "loss": 3.5011, "step": 75000 }, { "epoch": 2.5370407956159937, "eval_loss": 2.641514301300049, "eval_runtime": 88.2215, "eval_samples_per_second": 1070.09, "eval_steps_per_second": 8.365, "step": 75000 }, { "epoch": 2.638522427440633, "grad_norm": 1.484375, "learning_rate": 3.614775725593667e-05, "loss": 3.5005, "step": 78000 }, { "epoch": 2.638522427440633, "eval_loss": 2.6411118507385254, "eval_runtime": 88.2883, "eval_samples_per_second": 1069.281, "eval_steps_per_second": 8.359, "step": 78000 }, { "epoch": 2.740004059265273, "grad_norm": 1.3203125, "learning_rate": 2.59995940734727e-05, "loss": 3.4975, "step": 81000 }, { "epoch": 2.740004059265273, "eval_loss": 2.6413235664367676, "eval_runtime": 88.2689, "eval_samples_per_second": 1069.516, "eval_steps_per_second": 8.361, "step": 81000 }, { "epoch": 2.841485691089913, "grad_norm": 1.3046875, "learning_rate": 1.5851430891008727e-05, "loss": 3.5013, "step": 84000 }, { "epoch": 2.841485691089913, "eval_loss": 2.6411330699920654, "eval_runtime": 88.2658, "eval_samples_per_second": 1069.554, "eval_steps_per_second": 8.361, "step": 84000 }, { "epoch": 2.9429673229145523, "grad_norm": 1.171875, "learning_rate": 5.703267708544753e-06, "loss": 3.5006, "step": 87000 }, { "epoch": 2.9429673229145523, "eval_loss": 2.6412177085876465, "eval_runtime": 88.2303, "eval_samples_per_second": 1069.984, "eval_steps_per_second": 8.364, "step": 87000 }, { "epoch": 3.0, "step": 88686, "total_flos": 1.2409318398044897e+18, "train_loss": 3.5531258104435874, "train_runtime": 34812.4565, "train_samples_per_second": 326.075, "train_steps_per_second": 2.548 } ], "logging_steps": 3000, "max_steps": 88686, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2409318398044897e+18, "train_batch_size": 128, "trial_name": null, "trial_params": null }