{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 3000, "global_step": 88686, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 3.382721060821325e-05, "grad_norm": 146.0, "learning_rate": 0.00029999661727893914, "loss": 28.0, "step": 1 }, { "epoch": 0.10148163182463973, "grad_norm": 1.0625, "learning_rate": 0.000289851836817536, "loss": 4.8045, "step": 3000 }, { "epoch": 0.10148163182463973, "eval_loss": 3.3920648097991943, "eval_runtime": 84.2066, "eval_samples_per_second": 1121.111, "eval_steps_per_second": 8.764, "step": 3000 }, { "epoch": 0.20296326364927947, "grad_norm": 1.1875, "learning_rate": 0.000279703673635072, "loss": 4.1932, "step": 6000 }, { "epoch": 0.20296326364927947, "eval_loss": 3.254700183868408, "eval_runtime": 83.4018, "eval_samples_per_second": 1131.93, "eval_steps_per_second": 8.849, "step": 6000 }, { "epoch": 0.30444489547391923, "grad_norm": 1.15625, "learning_rate": 0.00026955551045260807, "loss": 4.1098, "step": 9000 }, { "epoch": 0.30444489547391923, "eval_loss": 3.204374074935913, "eval_runtime": 83.3884, "eval_samples_per_second": 1132.112, "eval_steps_per_second": 8.85, "step": 9000 }, { "epoch": 0.40592652729855894, "grad_norm": 1.15625, "learning_rate": 0.0002594073472701441, "loss": 4.0752, "step": 12000 }, { "epoch": 0.40592652729855894, "eval_loss": 3.173970937728882, "eval_runtime": 83.3988, "eval_samples_per_second": 1131.971, "eval_steps_per_second": 8.849, "step": 12000 }, { "epoch": 0.5074081591231987, "grad_norm": 1.21875, "learning_rate": 0.0002492591840876801, "loss": 4.0515, "step": 15000 }, { "epoch": 0.5074081591231987, "eval_loss": 3.157351016998291, "eval_runtime": 83.395, "eval_samples_per_second": 1132.022, "eval_steps_per_second": 8.849, "step": 15000 }, { "epoch": 0.6088897909478385, "grad_norm": 1.1328125, "learning_rate": 0.0002391110209052161, "loss": 4.0446, "step": 18000 }, { "epoch": 0.6088897909478385, "eval_loss": 3.1490302085876465, "eval_runtime": 83.4067, "eval_samples_per_second": 1131.863, "eval_steps_per_second": 8.848, "step": 18000 }, { "epoch": 0.7103714227724782, "grad_norm": 1.1796875, "learning_rate": 0.00022896285772275215, "loss": 4.036, "step": 21000 }, { "epoch": 0.7103714227724782, "eval_loss": 3.1390371322631836, "eval_runtime": 83.3672, "eval_samples_per_second": 1132.399, "eval_steps_per_second": 8.852, "step": 21000 }, { "epoch": 0.8118530545971179, "grad_norm": 1.1484375, "learning_rate": 0.0002188146945402882, "loss": 4.0298, "step": 24000 }, { "epoch": 0.8118530545971179, "eval_loss": 3.1364541053771973, "eval_runtime": 83.5197, "eval_samples_per_second": 1130.332, "eval_steps_per_second": 8.836, "step": 24000 }, { "epoch": 0.9133346864217576, "grad_norm": 1.171875, "learning_rate": 0.00020866653135782423, "loss": 4.0273, "step": 27000 }, { "epoch": 0.9133346864217576, "eval_loss": 3.1308434009552, "eval_runtime": 83.5474, "eval_samples_per_second": 1129.957, "eval_steps_per_second": 8.833, "step": 27000 }, { "epoch": 1.0148163182463974, "grad_norm": 1.171875, "learning_rate": 0.00019851836817536025, "loss": 4.023, "step": 30000 }, { "epoch": 1.0148163182463974, "eval_loss": 3.1285781860351562, "eval_runtime": 83.5362, "eval_samples_per_second": 1130.109, "eval_steps_per_second": 8.834, "step": 30000 }, { "epoch": 1.116297950071037, "grad_norm": 1.1640625, "learning_rate": 0.0001883702049928963, "loss": 4.0229, "step": 33000 }, { "epoch": 1.116297950071037, "eval_loss": 3.1244707107543945, "eval_runtime": 83.5446, "eval_samples_per_second": 1129.996, "eval_steps_per_second": 8.834, "step": 33000 }, { "epoch": 1.217779581895677, "grad_norm": 1.21875, "learning_rate": 0.0001782220418104323, "loss": 4.0158, "step": 36000 }, { "epoch": 1.217779581895677, "eval_loss": 3.1234333515167236, "eval_runtime": 83.5361, "eval_samples_per_second": 1130.11, "eval_steps_per_second": 8.835, "step": 36000 }, { "epoch": 1.3192612137203166, "grad_norm": 1.1796875, "learning_rate": 0.00016807387862796832, "loss": 4.0163, "step": 39000 }, { "epoch": 1.3192612137203166, "eval_loss": 3.121718406677246, "eval_runtime": 83.5263, "eval_samples_per_second": 1130.242, "eval_steps_per_second": 8.836, "step": 39000 }, { "epoch": 1.4207428455449564, "grad_norm": 1.2421875, "learning_rate": 0.00015792571544550436, "loss": 4.0156, "step": 42000 }, { "epoch": 1.4207428455449564, "eval_loss": 3.1207656860351562, "eval_runtime": 83.531, "eval_samples_per_second": 1130.179, "eval_steps_per_second": 8.835, "step": 42000 }, { "epoch": 1.522224477369596, "grad_norm": 1.328125, "learning_rate": 0.00014777755226304037, "loss": 4.0147, "step": 45000 }, { "epoch": 1.522224477369596, "eval_loss": 3.119389295578003, "eval_runtime": 83.5355, "eval_samples_per_second": 1130.119, "eval_steps_per_second": 8.835, "step": 45000 }, { "epoch": 1.6237061091942357, "grad_norm": 1.28125, "learning_rate": 0.0001376293890805764, "loss": 4.0114, "step": 48000 }, { "epoch": 1.6237061091942357, "eval_loss": 3.11716628074646, "eval_runtime": 83.398, "eval_samples_per_second": 1131.981, "eval_steps_per_second": 8.849, "step": 48000 }, { "epoch": 1.7251877410188756, "grad_norm": 1.3515625, "learning_rate": 0.00012748122589811243, "loss": 4.0113, "step": 51000 }, { "epoch": 1.7251877410188756, "eval_loss": 3.1180977821350098, "eval_runtime": 83.3949, "eval_samples_per_second": 1132.024, "eval_steps_per_second": 8.849, "step": 51000 }, { "epoch": 1.8266693728435153, "grad_norm": 1.40625, "learning_rate": 0.00011733306271564845, "loss": 4.0097, "step": 54000 }, { "epoch": 1.8266693728435153, "eval_loss": 3.116975784301758, "eval_runtime": 83.3757, "eval_samples_per_second": 1132.284, "eval_steps_per_second": 8.851, "step": 54000 }, { "epoch": 1.928151004668155, "grad_norm": 1.140625, "learning_rate": 0.00010718489953318448, "loss": 4.0105, "step": 57000 }, { "epoch": 1.928151004668155, "eval_loss": 3.116128921508789, "eval_runtime": 83.3931, "eval_samples_per_second": 1132.049, "eval_steps_per_second": 8.85, "step": 57000 }, { "epoch": 2.029632636492795, "grad_norm": 1.1953125, "learning_rate": 9.703673635072052e-05, "loss": 4.0115, "step": 60000 }, { "epoch": 2.029632636492795, "eval_loss": 3.1166157722473145, "eval_runtime": 83.3926, "eval_samples_per_second": 1132.055, "eval_steps_per_second": 8.85, "step": 60000 }, { "epoch": 2.1311142683174347, "grad_norm": 1.4609375, "learning_rate": 8.688857316825655e-05, "loss": 4.0117, "step": 63000 }, { "epoch": 2.1311142683174347, "eval_loss": 3.1167428493499756, "eval_runtime": 83.4107, "eval_samples_per_second": 1131.809, "eval_steps_per_second": 8.848, "step": 63000 }, { "epoch": 2.232595900142074, "grad_norm": 1.2109375, "learning_rate": 7.674040998579256e-05, "loss": 4.0082, "step": 66000 }, { "epoch": 2.232595900142074, "eval_loss": 3.116044282913208, "eval_runtime": 83.4033, "eval_samples_per_second": 1131.909, "eval_steps_per_second": 8.849, "step": 66000 }, { "epoch": 2.334077531966714, "grad_norm": 1.265625, "learning_rate": 6.659224680332859e-05, "loss": 4.0089, "step": 69000 }, { "epoch": 2.334077531966714, "eval_loss": 3.1157476902008057, "eval_runtime": 83.3805, "eval_samples_per_second": 1132.219, "eval_steps_per_second": 8.851, "step": 69000 }, { "epoch": 2.435559163791354, "grad_norm": 1.2421875, "learning_rate": 5.644408362086462e-05, "loss": 4.0087, "step": 72000 }, { "epoch": 2.435559163791354, "eval_loss": 3.1155996322631836, "eval_runtime": 83.4329, "eval_samples_per_second": 1131.508, "eval_steps_per_second": 8.845, "step": 72000 }, { "epoch": 2.5370407956159937, "grad_norm": 1.34375, "learning_rate": 4.629592043840065e-05, "loss": 4.0104, "step": 75000 }, { "epoch": 2.5370407956159937, "eval_loss": 3.115832567214966, "eval_runtime": 83.4175, "eval_samples_per_second": 1131.716, "eval_steps_per_second": 8.847, "step": 75000 }, { "epoch": 2.638522427440633, "grad_norm": 1.3203125, "learning_rate": 3.614775725593667e-05, "loss": 4.0088, "step": 78000 }, { "epoch": 2.638522427440633, "eval_loss": 3.115938425064087, "eval_runtime": 83.3984, "eval_samples_per_second": 1131.977, "eval_steps_per_second": 8.849, "step": 78000 }, { "epoch": 2.740004059265273, "grad_norm": 1.234375, "learning_rate": 2.59995940734727e-05, "loss": 4.007, "step": 81000 }, { "epoch": 2.740004059265273, "eval_loss": 3.115938425064087, "eval_runtime": 83.4084, "eval_samples_per_second": 1131.841, "eval_steps_per_second": 8.848, "step": 81000 }, { "epoch": 2.841485691089913, "grad_norm": 1.3984375, "learning_rate": 1.5851430891008727e-05, "loss": 4.0116, "step": 84000 }, { "epoch": 2.841485691089913, "eval_loss": 3.116044282913208, "eval_runtime": 83.4021, "eval_samples_per_second": 1131.925, "eval_steps_per_second": 8.849, "step": 84000 }, { "epoch": 2.9429673229145523, "grad_norm": 1.2109375, "learning_rate": 5.703267708544753e-06, "loss": 4.0099, "step": 87000 }, { "epoch": 2.9429673229145523, "eval_loss": 3.115874767303467, "eval_runtime": 83.5591, "eval_samples_per_second": 1129.799, "eval_steps_per_second": 8.832, "step": 87000 }, { "epoch": 3.0, "step": 88686, "total_flos": 1.19291602746581e+18, "train_loss": 4.055220102383691, "train_runtime": 32766.006, "train_samples_per_second": 346.44, "train_steps_per_second": 2.707 } ], "logging_steps": 3000, "max_steps": 88686, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.19291602746581e+18, "train_batch_size": 128, "trial_name": null, "trial_params": null }