{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1469, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013614703880190605, "grad_norm": 46457.98828125, "learning_rate": 8.163265306122449e-07, "loss": 2.0792, "step": 20 }, { "epoch": 0.02722940776038121, "grad_norm": 35946.40234375, "learning_rate": 1.6326530612244897e-06, "loss": 2.0791, "step": 40 }, { "epoch": 0.04084411164057182, "grad_norm": 23938.412109375, "learning_rate": 2.4489795918367347e-06, "loss": 2.0782, "step": 60 }, { "epoch": 0.05445881552076242, "grad_norm": 32852.3359375, "learning_rate": 3.2653061224489794e-06, "loss": 2.0771, "step": 80 }, { "epoch": 0.06807351940095303, "grad_norm": 32399.712890625, "learning_rate": 4.081632653061225e-06, "loss": 2.0752, "step": 100 }, { "epoch": 0.08168822328114364, "grad_norm": 30975.478515625, "learning_rate": 4.897959183673469e-06, "loss": 2.0722, "step": 120 }, { "epoch": 0.09530292716133425, "grad_norm": 35666.22265625, "learning_rate": 5.7142857142857145e-06, "loss": 2.0674, "step": 140 }, { "epoch": 0.10891763104152484, "grad_norm": 36289.2578125, "learning_rate": 6.530612244897959e-06, "loss": 2.0555, "step": 160 }, { "epoch": 0.12253233492171545, "grad_norm": 34320.37890625, "learning_rate": 7.346938775510204e-06, "loss": 2.0414, "step": 180 }, { "epoch": 0.13614703880190607, "grad_norm": 49061.66796875, "learning_rate": 8.16326530612245e-06, "loss": 2.0297, "step": 200 }, { "epoch": 0.14976174268209666, "grad_norm": 60060.45703125, "learning_rate": 8.979591836734694e-06, "loss": 2.0094, "step": 220 }, { "epoch": 0.16337644656228728, "grad_norm": 80072.6015625, "learning_rate": 9.795918367346939e-06, "loss": 1.9789, "step": 240 }, { "epoch": 0.17699115044247787, "grad_norm": 95368.8828125, "learning_rate": 1.0612244897959184e-05, "loss": 1.9378, "step": 260 }, { "epoch": 0.1906058543226685, "grad_norm": 70028.4765625, "learning_rate": 1.1428571428571429e-05, "loss": 1.9166, "step": 280 }, { "epoch": 0.2042205582028591, "grad_norm": 90220.5859375, "learning_rate": 1.2244897959183674e-05, "loss": 1.8492, "step": 300 }, { "epoch": 0.21783526208304968, "grad_norm": 59154.35546875, "learning_rate": 1.3061224489795918e-05, "loss": 1.8556, "step": 320 }, { "epoch": 0.2314499659632403, "grad_norm": 129406.875, "learning_rate": 1.3877551020408165e-05, "loss": 1.8048, "step": 340 }, { "epoch": 0.2450646698434309, "grad_norm": 176714.859375, "learning_rate": 1.4693877551020408e-05, "loss": 1.743, "step": 360 }, { "epoch": 0.2586793737236215, "grad_norm": 225369.375, "learning_rate": 1.5510204081632655e-05, "loss": 1.708, "step": 380 }, { "epoch": 0.27229407760381213, "grad_norm": 324853.53125, "learning_rate": 1.63265306122449e-05, "loss": 1.7032, "step": 400 }, { "epoch": 0.2859087814840027, "grad_norm": 77990.5, "learning_rate": 1.7142857142857142e-05, "loss": 1.7964, "step": 420 }, { "epoch": 0.2995234853641933, "grad_norm": 243999.53125, "learning_rate": 1.7959183673469387e-05, "loss": 1.7472, "step": 440 }, { "epoch": 0.3131381892443839, "grad_norm": 402316.78125, "learning_rate": 1.8775510204081636e-05, "loss": 1.6196, "step": 460 }, { "epoch": 0.32675289312457456, "grad_norm": 173016.109375, "learning_rate": 1.9591836734693877e-05, "loss": 1.6543, "step": 480 }, { "epoch": 0.34036759700476515, "grad_norm": 166789.75, "learning_rate": 2.0408163265306123e-05, "loss": 1.7865, "step": 500 }, { "epoch": 0.34036759700476515, "eval_accuracy": 0.1981891348088531, "eval_loss": 1.6593838930130005, "eval_runtime": 296.9022, "eval_samples_per_second": 10.044, "eval_steps_per_second": 1.256, "step": 500 }, { "epoch": 0.35398230088495575, "grad_norm": 160284.609375, "learning_rate": 2.1224489795918368e-05, "loss": 1.6528, "step": 520 }, { "epoch": 0.36759700476514634, "grad_norm": 195971.28125, "learning_rate": 2.2040816326530613e-05, "loss": 1.7909, "step": 540 }, { "epoch": 0.381211708645337, "grad_norm": 152683.9375, "learning_rate": 2.2857142857142858e-05, "loss": 1.6901, "step": 560 }, { "epoch": 0.3948264125255276, "grad_norm": 188117.421875, "learning_rate": 2.3673469387755103e-05, "loss": 1.7382, "step": 580 }, { "epoch": 0.4084411164057182, "grad_norm": 221595.171875, "learning_rate": 2.448979591836735e-05, "loss": 1.7579, "step": 600 }, { "epoch": 0.42205582028590877, "grad_norm": 250219.484375, "learning_rate": 2.5306122448979594e-05, "loss": 1.6842, "step": 620 }, { "epoch": 0.43567052416609936, "grad_norm": 362297.6875, "learning_rate": 2.6122448979591835e-05, "loss": 1.6874, "step": 640 }, { "epoch": 0.44928522804629, "grad_norm": 268312.65625, "learning_rate": 2.6938775510204084e-05, "loss": 1.6339, "step": 660 }, { "epoch": 0.4628999319264806, "grad_norm": 301583.75, "learning_rate": 2.775510204081633e-05, "loss": 1.6268, "step": 680 }, { "epoch": 0.4765146358066712, "grad_norm": 168292.046875, "learning_rate": 2.857142857142857e-05, "loss": 1.7985, "step": 700 }, { "epoch": 0.4901293396868618, "grad_norm": 274193.40625, "learning_rate": 2.9387755102040816e-05, "loss": 1.7689, "step": 720 }, { "epoch": 0.5037440435670524, "grad_norm": 142488.421875, "learning_rate": 2.9977307110438728e-05, "loss": 1.7201, "step": 740 }, { "epoch": 0.517358747447243, "grad_norm": 273465.96875, "learning_rate": 2.9886535552193645e-05, "loss": 1.6105, "step": 760 }, { "epoch": 0.5309734513274337, "grad_norm": 408773.3125, "learning_rate": 2.9795763993948565e-05, "loss": 1.6843, "step": 780 }, { "epoch": 0.5445881552076243, "grad_norm": 187610.140625, "learning_rate": 2.970499243570348e-05, "loss": 1.7259, "step": 800 }, { "epoch": 0.5582028590878149, "grad_norm": 141979.921875, "learning_rate": 2.9614220877458398e-05, "loss": 1.7572, "step": 820 }, { "epoch": 0.5718175629680055, "grad_norm": 183132.390625, "learning_rate": 2.9523449319213314e-05, "loss": 1.7094, "step": 840 }, { "epoch": 0.585432266848196, "grad_norm": 299271.59375, "learning_rate": 2.943267776096823e-05, "loss": 1.7294, "step": 860 }, { "epoch": 0.5990469707283866, "grad_norm": 207611.09375, "learning_rate": 2.9341906202723147e-05, "loss": 1.7864, "step": 880 }, { "epoch": 0.6126616746085772, "grad_norm": 262759.0625, "learning_rate": 2.9251134644478064e-05, "loss": 1.7288, "step": 900 }, { "epoch": 0.6262763784887678, "grad_norm": 291027.125, "learning_rate": 2.916036308623298e-05, "loss": 1.6042, "step": 920 }, { "epoch": 0.6398910823689585, "grad_norm": 355390.71875, "learning_rate": 2.90695915279879e-05, "loss": 1.7426, "step": 940 }, { "epoch": 0.6535057862491491, "grad_norm": 87989.2578125, "learning_rate": 2.8978819969742813e-05, "loss": 1.6938, "step": 960 }, { "epoch": 0.6671204901293397, "grad_norm": 302982.09375, "learning_rate": 2.888804841149773e-05, "loss": 1.8035, "step": 980 }, { "epoch": 0.6807351940095303, "grad_norm": 197021.3125, "learning_rate": 2.879727685325265e-05, "loss": 1.7716, "step": 1000 }, { "epoch": 0.6807351940095303, "eval_accuracy": 0.1981891348088531, "eval_loss": 1.6633707284927368, "eval_runtime": 291.4938, "eval_samples_per_second": 10.23, "eval_steps_per_second": 1.28, "step": 1000 }, { "epoch": 0.6943498978897209, "grad_norm": 348515.0625, "learning_rate": 2.8706505295007566e-05, "loss": 1.7593, "step": 1020 }, { "epoch": 0.7079646017699115, "grad_norm": 252458.609375, "learning_rate": 2.8615733736762483e-05, "loss": 1.7406, "step": 1040 }, { "epoch": 0.7215793056501021, "grad_norm": 179026.3125, "learning_rate": 2.8524962178517396e-05, "loss": 1.806, "step": 1060 }, { "epoch": 0.7351940095302927, "grad_norm": 230475.78125, "learning_rate": 2.8434190620272316e-05, "loss": 1.7327, "step": 1080 }, { "epoch": 0.7488087134104833, "grad_norm": 155040.359375, "learning_rate": 2.8343419062027232e-05, "loss": 1.6378, "step": 1100 }, { "epoch": 0.762423417290674, "grad_norm": 200565.359375, "learning_rate": 2.825264750378215e-05, "loss": 1.7561, "step": 1120 }, { "epoch": 0.7760381211708646, "grad_norm": 160959.6875, "learning_rate": 2.8161875945537065e-05, "loss": 1.7322, "step": 1140 }, { "epoch": 0.7896528250510552, "grad_norm": 277028.65625, "learning_rate": 2.8071104387291985e-05, "loss": 1.6283, "step": 1160 }, { "epoch": 0.8032675289312458, "grad_norm": 145514.578125, "learning_rate": 2.79803328290469e-05, "loss": 1.696, "step": 1180 }, { "epoch": 0.8168822328114363, "grad_norm": 274652.6875, "learning_rate": 2.7889561270801815e-05, "loss": 1.7019, "step": 1200 }, { "epoch": 0.8304969366916269, "grad_norm": 108995.4453125, "learning_rate": 2.779878971255673e-05, "loss": 1.7189, "step": 1220 }, { "epoch": 0.8441116405718175, "grad_norm": 84566.8046875, "learning_rate": 2.770801815431165e-05, "loss": 1.8068, "step": 1240 }, { "epoch": 0.8577263444520081, "grad_norm": 239363.8125, "learning_rate": 2.7617246596066568e-05, "loss": 1.8275, "step": 1260 }, { "epoch": 0.8713410483321987, "grad_norm": 213879.875, "learning_rate": 2.752647503782148e-05, "loss": 1.7145, "step": 1280 }, { "epoch": 0.8849557522123894, "grad_norm": 201732.859375, "learning_rate": 2.74357034795764e-05, "loss": 1.648, "step": 1300 }, { "epoch": 0.89857045609258, "grad_norm": 262597.78125, "learning_rate": 2.7344931921331318e-05, "loss": 1.6619, "step": 1320 }, { "epoch": 0.9121851599727706, "grad_norm": 343452.15625, "learning_rate": 2.7254160363086234e-05, "loss": 1.6732, "step": 1340 }, { "epoch": 0.9257998638529612, "grad_norm": 164955.390625, "learning_rate": 2.716338880484115e-05, "loss": 1.837, "step": 1360 }, { "epoch": 0.9394145677331518, "grad_norm": 243246.953125, "learning_rate": 2.7072617246596067e-05, "loss": 1.7863, "step": 1380 }, { "epoch": 0.9530292716133424, "grad_norm": 133524.296875, "learning_rate": 2.6981845688350984e-05, "loss": 1.671, "step": 1400 }, { "epoch": 0.966643975493533, "grad_norm": 187500.75, "learning_rate": 2.68910741301059e-05, "loss": 1.6882, "step": 1420 }, { "epoch": 0.9802586793737236, "grad_norm": 311251.9375, "learning_rate": 2.6800302571860817e-05, "loss": 1.6963, "step": 1440 }, { "epoch": 0.9938733832539143, "grad_norm": 451302.75, "learning_rate": 2.6709531013615737e-05, "loss": 1.7184, "step": 1460 } ], "logging_steps": 20, "max_steps": 7345, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.24301109962752e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }