{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6812375816065853, "eval_steps": 50, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022707919386886176, "grad_norm": 0.5527037382125854, "learning_rate": 0.00019770114942528738, "loss": 0.0749, "step": 10 }, { "epoch": 0.04541583877377235, "grad_norm": 0.18355610966682434, "learning_rate": 0.0001931034482758621, "loss": 0.0973, "step": 20 }, { "epoch": 0.06812375816065853, "grad_norm": 0.15812426805496216, "learning_rate": 0.00018850574712643678, "loss": 0.089, "step": 30 }, { "epoch": 0.0908316775475447, "grad_norm": 0.19676333665847778, "learning_rate": 0.0001839080459770115, "loss": 0.0885, "step": 40 }, { "epoch": 0.11353959693443089, "grad_norm": 0.23726195096969604, "learning_rate": 0.0001793103448275862, "loss": 0.0865, "step": 50 }, { "epoch": 0.11353959693443089, "eval_loss": 0.09350696206092834, "eval_runtime": 246.8339, "eval_samples_per_second": 0.81, "eval_steps_per_second": 0.81, "step": 50 }, { "epoch": 0.13624751632131707, "grad_norm": 0.18502545356750488, "learning_rate": 0.00017471264367816095, "loss": 0.0823, "step": 60 }, { "epoch": 0.15895543570820322, "grad_norm": 0.38577064871788025, "learning_rate": 0.00017011494252873563, "loss": 0.0813, "step": 70 }, { "epoch": 0.1816633550950894, "grad_norm": 0.27274832129478455, "learning_rate": 0.00016551724137931035, "loss": 0.0887, "step": 80 }, { "epoch": 0.2043712744819756, "grad_norm": 0.18888869881629944, "learning_rate": 0.00016091954022988506, "loss": 0.088, "step": 90 }, { "epoch": 0.22707919386886177, "grad_norm": 0.19606022536754608, "learning_rate": 0.0001563218390804598, "loss": 0.0896, "step": 100 }, { "epoch": 0.22707919386886177, "eval_loss": 0.09240507334470749, "eval_runtime": 239.9776, "eval_samples_per_second": 0.833, "eval_steps_per_second": 0.833, "step": 100 }, { "epoch": 0.24978711325574796, "grad_norm": 0.21395204961299896, "learning_rate": 0.00015172413793103449, "loss": 0.0782, "step": 110 }, { "epoch": 0.27249503264263414, "grad_norm": 0.22302134335041046, "learning_rate": 0.0001471264367816092, "loss": 0.0794, "step": 120 }, { "epoch": 0.2952029520295203, "grad_norm": 0.27699050307273865, "learning_rate": 0.0001425287356321839, "loss": 0.0802, "step": 130 }, { "epoch": 0.31791087141640645, "grad_norm": 0.24119599163532257, "learning_rate": 0.00013793103448275863, "loss": 0.0829, "step": 140 }, { "epoch": 0.34061879080329266, "grad_norm": 0.23131313920021057, "learning_rate": 0.00013333333333333334, "loss": 0.0796, "step": 150 }, { "epoch": 0.34061879080329266, "eval_loss": 0.09314567595720291, "eval_runtime": 243.4064, "eval_samples_per_second": 0.822, "eval_steps_per_second": 0.822, "step": 150 }, { "epoch": 0.3633267101901788, "grad_norm": 0.311929851770401, "learning_rate": 0.00012873563218390805, "loss": 0.0783, "step": 160 }, { "epoch": 0.386034629577065, "grad_norm": 0.2242027074098587, "learning_rate": 0.00012413793103448277, "loss": 0.0818, "step": 170 }, { "epoch": 0.4087425489639512, "grad_norm": 0.2559347450733185, "learning_rate": 0.00011954022988505748, "loss": 0.0789, "step": 180 }, { "epoch": 0.43145046835083734, "grad_norm": 0.23956073820590973, "learning_rate": 0.00011494252873563218, "loss": 0.0734, "step": 190 }, { "epoch": 0.45415838773772355, "grad_norm": 0.290977418422699, "learning_rate": 0.0001103448275862069, "loss": 0.0753, "step": 200 }, { "epoch": 0.45415838773772355, "eval_loss": 0.09412585943937302, "eval_runtime": 243.3084, "eval_samples_per_second": 0.822, "eval_steps_per_second": 0.822, "step": 200 }, { "epoch": 0.4768663071246097, "grad_norm": 0.37943148612976074, "learning_rate": 0.00010574712643678162, "loss": 0.0742, "step": 210 }, { "epoch": 0.4995742265114959, "grad_norm": 0.25092101097106934, "learning_rate": 0.00010114942528735633, "loss": 0.0745, "step": 220 }, { "epoch": 0.5222821458983821, "grad_norm": 0.27841272950172424, "learning_rate": 9.655172413793105e-05, "loss": 0.0732, "step": 230 }, { "epoch": 0.5449900652852683, "grad_norm": 0.24122974276542664, "learning_rate": 9.195402298850575e-05, "loss": 0.0738, "step": 240 }, { "epoch": 0.5676979846721544, "grad_norm": 0.33553385734558105, "learning_rate": 8.735632183908047e-05, "loss": 0.075, "step": 250 }, { "epoch": 0.5676979846721544, "eval_loss": 0.08785887062549591, "eval_runtime": 243.3288, "eval_samples_per_second": 0.822, "eval_steps_per_second": 0.822, "step": 250 }, { "epoch": 0.5904059040590406, "grad_norm": 0.228590190410614, "learning_rate": 8.275862068965517e-05, "loss": 0.0743, "step": 260 }, { "epoch": 0.6131138234459268, "grad_norm": 0.24921129643917084, "learning_rate": 7.81609195402299e-05, "loss": 0.075, "step": 270 }, { "epoch": 0.6358217428328129, "grad_norm": 0.27115267515182495, "learning_rate": 7.35632183908046e-05, "loss": 0.0749, "step": 280 }, { "epoch": 0.6585296622196991, "grad_norm": 0.1899242103099823, "learning_rate": 6.896551724137931e-05, "loss": 0.0705, "step": 290 }, { "epoch": 0.6812375816065853, "grad_norm": 0.2068161964416504, "learning_rate": 6.436781609195403e-05, "loss": 0.0726, "step": 300 }, { "epoch": 0.6812375816065853, "eval_loss": 0.08430539816617966, "eval_runtime": 243.0981, "eval_samples_per_second": 0.823, "eval_steps_per_second": 0.823, "step": 300 } ], "logging_steps": 10, "max_steps": 440, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.261504929129062e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }