{ "best_metric": 0.014762421138584614, "best_model_checkpoint": "saves/chess/both/checkpoint-1000", "epoch": 8.0, "eval_steps": 1000, "global_step": 4920, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.16260162601626016, "grad_norm": 1.6060084762973008, "learning_rate": 1.0162601626016261e-06, "loss": 1.0073, "step": 100 }, { "epoch": 0.3252032520325203, "grad_norm": 3.0440875723802594, "learning_rate": 2.0325203252032523e-06, "loss": 0.0302, "step": 200 }, { "epoch": 0.4878048780487805, "grad_norm": 0.9455772942098933, "learning_rate": 3.0487804878048782e-06, "loss": 0.0229, "step": 300 }, { "epoch": 0.6504065040650406, "grad_norm": 0.3831361794753519, "learning_rate": 4.0650406504065046e-06, "loss": 0.0197, "step": 400 }, { "epoch": 0.8130081300813008, "grad_norm": 0.5841201203132319, "learning_rate": 4.999959730768458e-06, "loss": 0.0183, "step": 500 }, { "epoch": 0.975609756097561, "grad_norm": 0.36973884839665605, "learning_rate": 4.992664502959351e-06, "loss": 0.0172, "step": 600 }, { "epoch": 1.1382113821138211, "grad_norm": 0.5308905808120226, "learning_rate": 4.9728272933003704e-06, "loss": 0.0149, "step": 700 }, { "epoch": 1.3008130081300813, "grad_norm": 0.3199255336490814, "learning_rate": 4.940547913829274e-06, "loss": 0.015, "step": 800 }, { "epoch": 1.4634146341463414, "grad_norm": 0.4781960951795096, "learning_rate": 4.89598878006206e-06, "loss": 0.0147, "step": 900 }, { "epoch": 1.6260162601626016, "grad_norm": 0.5141788877979915, "learning_rate": 4.839374093790139e-06, "loss": 0.0144, "step": 1000 }, { "epoch": 1.6260162601626016, "eval_loss": 0.014762421138584614, "eval_runtime": 232.6693, "eval_samples_per_second": 150.265, "eval_steps_per_second": 0.589, "step": 1000 }, { "epoch": 1.7886178861788617, "grad_norm": 0.46618935598211575, "learning_rate": 4.770988714996401e-06, "loss": 0.0135, "step": 1100 }, { "epoch": 1.951219512195122, "grad_norm": 0.5151587169163754, "learning_rate": 4.691176728566159e-06, "loss": 0.0138, "step": 1200 }, { "epoch": 2.113821138211382, "grad_norm": 0.31817021965315584, "learning_rate": 4.600339713004673e-06, "loss": 0.0109, "step": 1300 }, { "epoch": 2.2764227642276422, "grad_norm": 0.455194899760826, "learning_rate": 4.498934719872278e-06, "loss": 0.011, "step": 1400 }, { "epoch": 2.4390243902439024, "grad_norm": 0.3846115904845964, "learning_rate": 4.387471974103713e-06, "loss": 0.0109, "step": 1500 }, { "epoch": 2.6016260162601625, "grad_norm": 0.45607541160743026, "learning_rate": 4.266512306782628e-06, "loss": 0.0113, "step": 1600 }, { "epoch": 2.7642276422764227, "grad_norm": 0.25989554289266725, "learning_rate": 4.136664333288392e-06, "loss": 0.0111, "step": 1700 }, { "epoch": 2.926829268292683, "grad_norm": 0.2511070728030906, "learning_rate": 3.998581391013531e-06, "loss": 0.0104, "step": 1800 }, { "epoch": 3.089430894308943, "grad_norm": 0.24155318360631117, "learning_rate": 3.8529582520598395e-06, "loss": 0.0082, "step": 1900 }, { "epoch": 3.252032520325203, "grad_norm": 0.36145185236007943, "learning_rate": 3.7005276274534145e-06, "loss": 0.0068, "step": 2000 }, { "epoch": 3.252032520325203, "eval_loss": 0.01827162876725197, "eval_runtime": 223.5741, "eval_samples_per_second": 156.378, "eval_steps_per_second": 0.613, "step": 2000 }, { "epoch": 3.4146341463414633, "grad_norm": 0.34297060011843655, "learning_rate": 3.5420564804678583e-06, "loss": 0.0077, "step": 2100 }, { "epoch": 3.5772357723577235, "grad_norm": 0.6840105339133411, "learning_rate": 3.378342167605362e-06, "loss": 0.0073, "step": 2200 }, { "epoch": 3.7398373983739837, "grad_norm": 0.30055235437579364, "learning_rate": 3.21020842665256e-06, "loss": 0.0074, "step": 2300 }, { "epoch": 3.902439024390244, "grad_norm": 0.4192737034811711, "learning_rate": 3.038501231997454e-06, "loss": 0.0069, "step": 2400 }, { "epoch": 4.065040650406504, "grad_norm": 0.4192262784149686, "learning_rate": 2.8640845380616257e-06, "loss": 0.0052, "step": 2500 }, { "epoch": 4.227642276422764, "grad_norm": 0.6960471269583507, "learning_rate": 2.6878359322649085e-06, "loss": 0.0029, "step": 2600 }, { "epoch": 4.390243902439025, "grad_norm": 0.3999816795268035, "learning_rate": 2.510642219394847e-06, "loss": 0.0034, "step": 2700 }, { "epoch": 4.5528455284552845, "grad_norm": 0.5318936146765854, "learning_rate": 2.3333949595984614e-06, "loss": 0.0032, "step": 2800 }, { "epoch": 4.715447154471545, "grad_norm": 0.551874865947909, "learning_rate": 2.1569859824471445e-06, "loss": 0.0033, "step": 2900 }, { "epoch": 4.878048780487805, "grad_norm": 0.3248081788920533, "learning_rate": 1.9823028996459487e-06, "loss": 0.003, "step": 3000 }, { "epoch": 4.878048780487805, "eval_loss": 0.02072063274681568, "eval_runtime": 223.6378, "eval_samples_per_second": 156.333, "eval_steps_per_second": 0.613, "step": 3000 }, { "epoch": 5.040650406504065, "grad_norm": 0.024739170373672136, "learning_rate": 1.8102246389652839e-06, "loss": 0.0021, "step": 3100 }, { "epoch": 5.203252032520325, "grad_norm": 0.2258552274726826, "learning_rate": 1.6416170218663446e-06, "loss": 0.0007, "step": 3200 }, { "epoch": 5.365853658536586, "grad_norm": 0.3296293455992763, "learning_rate": 1.4773284070716504e-06, "loss": 0.0008, "step": 3300 }, { "epoch": 5.528455284552845, "grad_norm": 0.4671709634217062, "learning_rate": 1.3181854220003568e-06, "loss": 0.0007, "step": 3400 }, { "epoch": 5.691056910569106, "grad_norm": 0.8788212030675407, "learning_rate": 1.164988803545826e-06, "loss": 0.0007, "step": 3500 }, { "epoch": 5.853658536585366, "grad_norm": 0.18192782552523573, "learning_rate": 1.0185093691228535e-06, "loss": 0.0008, "step": 3600 }, { "epoch": 6.016260162601626, "grad_norm": 0.33475268532885594, "learning_rate": 8.794841382564212e-07, "loss": 0.0006, "step": 3700 }, { "epoch": 6.178861788617886, "grad_norm": 0.35558413708310876, "learning_rate": 7.486126242264469e-07, "loss": 0.0001, "step": 3800 }, { "epoch": 6.341463414634147, "grad_norm": 0.6415217440841863, "learning_rate": 6.265533144273175e-07, "loss": 0.0001, "step": 3900 }, { "epoch": 6.504065040650406, "grad_norm": 0.4057619591042224, "learning_rate": 5.139203571514673e-07, "loss": 0.0001, "step": 4000 }, { "epoch": 6.504065040650406, "eval_loss": 0.03588717430830002, "eval_runtime": 224.5359, "eval_samples_per_second": 155.708, "eval_steps_per_second": 0.61, "step": 4000 }, { "epoch": 6.666666666666667, "grad_norm": 0.015517979645165311, "learning_rate": 4.1128047146765936e-07, "loss": 0.0001, "step": 4100 }, { "epoch": 6.829268292682927, "grad_norm": 0.21375939504530958, "learning_rate": 3.191500957420626e-07, "loss": 0.0001, "step": 4200 }, { "epoch": 6.991869918699187, "grad_norm": 0.5625757372660679, "learning_rate": 2.3799278914952957e-07, "loss": 0.0001, "step": 4300 }, { "epoch": 7.154471544715447, "grad_norm": 0.001893242486182687, "learning_rate": 1.682168992494923e-07, "loss": 0.0, "step": 4400 }, { "epoch": 7.317073170731708, "grad_norm": 0.041593607854158006, "learning_rate": 1.1017350736221927e-07, "loss": 0.0, "step": 4500 }, { "epoch": 7.479674796747967, "grad_norm": 0.0678219109325394, "learning_rate": 6.415466208337662e-08, "loss": 0.0, "step": 4600 }, { "epoch": 7.642276422764228, "grad_norm": 0.0002078547810165964, "learning_rate": 3.039190982506823e-08, "loss": 0.0, "step": 4700 }, { "epoch": 7.804878048780488, "grad_norm": 0.002895716916894551, "learning_rate": 9.055129777021664e-09, "loss": 0.0, "step": 4800 }, { "epoch": 7.967479674796748, "grad_norm": 0.004024341830230839, "learning_rate": 2.5167914987633825e-10, "loss": 0.0, "step": 4900 }, { "epoch": 8.0, "step": 4920, "total_flos": 971375200174080.0, "train_loss": 0.02723053244252349, "train_runtime": 62097.9085, "train_samples_per_second": 40.537, "train_steps_per_second": 0.079 } ], "logging_steps": 100, "max_steps": 4920, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 971375200174080.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }