{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 950, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.10526315789473684, "grad_norm": 6.923758029937744, "learning_rate": 2.0833333333333336e-05, "loss": 1.3079, "step": 10 }, { "epoch": 0.21052631578947367, "grad_norm": 2.75594425201416, "learning_rate": 4.166666666666667e-05, "loss": 0.5705, "step": 20 }, { "epoch": 0.3157894736842105, "grad_norm": 1.7995272874832153, "learning_rate": 6.25e-05, "loss": 0.3302, "step": 30 }, { "epoch": 0.42105263157894735, "grad_norm": 1.2510942220687866, "learning_rate": 8.333333333333334e-05, "loss": 0.2171, "step": 40 }, { "epoch": 0.5263157894736842, "grad_norm": 0.9268723130226135, "learning_rate": 9.999878693264872e-05, "loss": 0.1775, "step": 50 }, { "epoch": 0.631578947368421, "grad_norm": 1.2328051328659058, "learning_rate": 9.995633575547026e-05, "loss": 0.1635, "step": 60 }, { "epoch": 0.7368421052631579, "grad_norm": 0.896913468837738, "learning_rate": 9.985329005918702e-05, "loss": 0.1443, "step": 70 }, { "epoch": 0.8421052631578947, "grad_norm": 0.9884301424026489, "learning_rate": 9.968977483303847e-05, "loss": 0.1296, "step": 80 }, { "epoch": 0.9473684210526315, "grad_norm": 0.9594321250915527, "learning_rate": 9.946598841275812e-05, "loss": 0.1197, "step": 90 }, { "epoch": 1.0526315789473684, "grad_norm": 1.0888608694076538, "learning_rate": 9.91822022400022e-05, "loss": 0.1163, "step": 100 }, { "epoch": 1.1578947368421053, "grad_norm": 1.4427963495254517, "learning_rate": 9.883876053310475e-05, "loss": 0.104, "step": 110 }, { "epoch": 1.263157894736842, "grad_norm": 0.4913580119609833, "learning_rate": 9.843607986955799e-05, "loss": 0.0981, "step": 120 }, { "epoch": 1.368421052631579, "grad_norm": 0.7194355130195618, "learning_rate": 9.797464868072488e-05, "loss": 0.0868, "step": 130 }, { "epoch": 1.4736842105263157, "grad_norm": 0.8133114576339722, "learning_rate": 9.745502665939639e-05, "loss": 0.0955, "step": 140 }, { "epoch": 1.5789473684210527, "grad_norm": 0.8647440671920776, "learning_rate": 9.687784408091243e-05, "loss": 0.0835, "step": 150 }, { "epoch": 1.6842105263157894, "grad_norm": 0.8574523329734802, "learning_rate": 9.624380103866959e-05, "loss": 0.083, "step": 160 }, { "epoch": 1.7894736842105263, "grad_norm": 0.83241868019104, "learning_rate": 9.555366659494302e-05, "loss": 0.076, "step": 170 }, { "epoch": 1.8947368421052633, "grad_norm": 1.2344025373458862, "learning_rate": 9.480827784805278e-05, "loss": 0.0702, "step": 180 }, { "epoch": 2.0, "grad_norm": 2.3658246994018555, "learning_rate": 9.400853891700561e-05, "loss": 0.0745, "step": 190 }, { "epoch": 2.1052631578947367, "grad_norm": 0.6137243509292603, "learning_rate": 9.315541984484414e-05, "loss": 0.0653, "step": 200 }, { "epoch": 2.2105263157894735, "grad_norm": 0.44892561435699463, "learning_rate": 9.22499554220336e-05, "loss": 0.0731, "step": 210 }, { "epoch": 2.3157894736842106, "grad_norm": 0.6144806146621704, "learning_rate": 9.129324393131305e-05, "loss": 0.0641, "step": 220 }, { "epoch": 2.4210526315789473, "grad_norm": 0.47163206338882446, "learning_rate": 9.028644581553374e-05, "loss": 0.0658, "step": 230 }, { "epoch": 2.526315789473684, "grad_norm": 0.5073703527450562, "learning_rate": 8.923078227010038e-05, "loss": 0.0694, "step": 240 }, { "epoch": 2.6315789473684212, "grad_norm": 0.4855697751045227, "learning_rate": 8.812753376172265e-05, "loss": 0.0649, "step": 250 }, { "epoch": 2.736842105263158, "grad_norm": 0.7437257766723633, "learning_rate": 8.697803847527355e-05, "loss": 0.0651, "step": 260 }, { "epoch": 2.8421052631578947, "grad_norm": 0.6212872862815857, "learning_rate": 8.578369069063853e-05, "loss": 0.0602, "step": 270 }, { "epoch": 2.9473684210526314, "grad_norm": 0.36850064992904663, "learning_rate": 8.454593909152427e-05, "loss": 0.0559, "step": 280 }, { "epoch": 3.0526315789473686, "grad_norm": 0.5061463117599487, "learning_rate": 8.326628500827826e-05, "loss": 0.0528, "step": 290 }, { "epoch": 3.1578947368421053, "grad_norm": 0.5094528198242188, "learning_rate": 8.194628059685077e-05, "loss": 0.0543, "step": 300 }, { "epoch": 3.263157894736842, "grad_norm": 0.8378345966339111, "learning_rate": 8.058752695610772e-05, "loss": 0.0636, "step": 310 }, { "epoch": 3.3684210526315788, "grad_norm": 0.9896312355995178, "learning_rate": 7.91916721857786e-05, "loss": 0.0507, "step": 320 }, { "epoch": 3.473684210526316, "grad_norm": 0.36987894773483276, "learning_rate": 7.776040938739435e-05, "loss": 0.0561, "step": 330 }, { "epoch": 3.5789473684210527, "grad_norm": 0.32678183913230896, "learning_rate": 7.629547461064054e-05, "loss": 0.0562, "step": 340 }, { "epoch": 3.6842105263157894, "grad_norm": 0.35407042503356934, "learning_rate": 7.479864474761653e-05, "loss": 0.0521, "step": 350 }, { "epoch": 3.7894736842105265, "grad_norm": 0.5267050266265869, "learning_rate": 7.327173537755487e-05, "loss": 0.0549, "step": 360 }, { "epoch": 3.8947368421052633, "grad_norm": 0.46137216687202454, "learning_rate": 7.171659856461513e-05, "loss": 0.0445, "step": 370 }, { "epoch": 4.0, "grad_norm": 1.071692705154419, "learning_rate": 7.01351206114233e-05, "loss": 0.0495, "step": 380 }, { "epoch": 4.105263157894737, "grad_norm": 0.46627840399742126, "learning_rate": 6.852921977108183e-05, "loss": 0.0478, "step": 390 }, { "epoch": 4.2105263157894735, "grad_norm": 0.3222876191139221, "learning_rate": 6.690084392042513e-05, "loss": 0.0442, "step": 400 }, { "epoch": 4.315789473684211, "grad_norm": 0.5170292854309082, "learning_rate": 6.525196819734314e-05, "loss": 0.0548, "step": 410 }, { "epoch": 4.421052631578947, "grad_norm": 0.29512906074523926, "learning_rate": 6.358459260503848e-05, "loss": 0.0453, "step": 420 }, { "epoch": 4.526315789473684, "grad_norm": 0.37867993116378784, "learning_rate": 6.19007395861234e-05, "loss": 0.0481, "step": 430 }, { "epoch": 4.631578947368421, "grad_norm": 0.36428308486938477, "learning_rate": 6.0202451569498676e-05, "loss": 0.0473, "step": 440 }, { "epoch": 4.7368421052631575, "grad_norm": 0.606037437915802, "learning_rate": 5.849178849299026e-05, "loss": 0.0415, "step": 450 }, { "epoch": 4.842105263157895, "grad_norm": 0.4544004499912262, "learning_rate": 5.677082530474845e-05, "loss": 0.0437, "step": 460 }, { "epoch": 4.947368421052632, "grad_norm": 0.23344260454177856, "learning_rate": 5.50416494464403e-05, "loss": 0.0447, "step": 470 }, { "epoch": 5.052631578947368, "grad_norm": 0.6657449007034302, "learning_rate": 5.3306358321287966e-05, "loss": 0.0445, "step": 480 }, { "epoch": 5.157894736842105, "grad_norm": 0.26515915989875793, "learning_rate": 5.156705675002431e-05, "loss": 0.0378, "step": 490 }, { "epoch": 5.2631578947368425, "grad_norm": 0.6301982998847961, "learning_rate": 4.982585441785133e-05, "loss": 0.0426, "step": 500 }, { "epoch": 5.368421052631579, "grad_norm": 0.22558610141277313, "learning_rate": 4.8084863315498234e-05, "loss": 0.038, "step": 510 }, { "epoch": 5.473684210526316, "grad_norm": 0.3168763816356659, "learning_rate": 4.634619517748315e-05, "loss": 0.0408, "step": 520 }, { "epoch": 5.578947368421053, "grad_norm": 0.6440910696983337, "learning_rate": 4.461195892068543e-05, "loss": 0.0363, "step": 530 }, { "epoch": 5.684210526315789, "grad_norm": 0.3828079104423523, "learning_rate": 4.288425808633575e-05, "loss": 0.0445, "step": 540 }, { "epoch": 5.7894736842105265, "grad_norm": 0.34126198291778564, "learning_rate": 4.116518828852651e-05, "loss": 0.0358, "step": 550 }, { "epoch": 5.894736842105263, "grad_norm": 0.5263332724571228, "learning_rate": 3.9456834672337516e-05, "loss": 0.0392, "step": 560 }, { "epoch": 6.0, "grad_norm": 1.0714961290359497, "learning_rate": 3.776126938466003e-05, "loss": 0.0388, "step": 570 }, { "epoch": 6.105263157894737, "grad_norm": 0.38259050250053406, "learning_rate": 3.608054906078691e-05, "loss": 0.0402, "step": 580 }, { "epoch": 6.2105263157894735, "grad_norm": 0.42619675397872925, "learning_rate": 3.441671232981769e-05, "loss": 0.0379, "step": 590 }, { "epoch": 6.315789473684211, "grad_norm": 0.26903611421585083, "learning_rate": 3.2771777341903976e-05, "loss": 0.0356, "step": 600 }, { "epoch": 6.421052631578947, "grad_norm": 0.33861419558525085, "learning_rate": 3.114773932033517e-05, "loss": 0.0382, "step": 610 }, { "epoch": 6.526315789473684, "grad_norm": 0.2860042452812195, "learning_rate": 2.9546568141433006e-05, "loss": 0.0334, "step": 620 }, { "epoch": 6.631578947368421, "grad_norm": 0.29263758659362793, "learning_rate": 2.7970205945190787e-05, "loss": 0.0368, "step": 630 }, { "epoch": 6.7368421052631575, "grad_norm": 0.27042821049690247, "learning_rate": 2.6420564779555447e-05, "loss": 0.0374, "step": 640 }, { "epoch": 6.842105263157895, "grad_norm": 0.3030681014060974, "learning_rate": 2.4899524281209602e-05, "loss": 0.0351, "step": 650 }, { "epoch": 6.947368421052632, "grad_norm": 0.31766682863235474, "learning_rate": 2.340892939566701e-05, "loss": 0.0332, "step": 660 }, { "epoch": 7.052631578947368, "grad_norm": 0.3257026970386505, "learning_rate": 2.1950588139446597e-05, "loss": 0.0361, "step": 670 }, { "epoch": 7.157894736842105, "grad_norm": 0.5466029047966003, "learning_rate": 2.0526269407039395e-05, "loss": 0.0331, "step": 680 }, { "epoch": 7.2631578947368425, "grad_norm": 0.16269290447235107, "learning_rate": 1.913770082532873e-05, "loss": 0.0283, "step": 690 }, { "epoch": 7.368421052631579, "grad_norm": 0.20265574753284454, "learning_rate": 1.7786566658065725e-05, "loss": 0.0295, "step": 700 }, { "epoch": 7.473684210526316, "grad_norm": 0.3146597146987915, "learning_rate": 1.647450576294225e-05, "loss": 0.0324, "step": 710 }, { "epoch": 7.578947368421053, "grad_norm": 0.46915167570114136, "learning_rate": 1.5203109603739135e-05, "loss": 0.032, "step": 720 }, { "epoch": 7.684210526315789, "grad_norm": 0.20551757514476776, "learning_rate": 1.3973920319960655e-05, "loss": 0.0328, "step": 730 }, { "epoch": 7.7894736842105265, "grad_norm": 0.33174726366996765, "learning_rate": 1.278842885629707e-05, "loss": 0.0331, "step": 740 }, { "epoch": 7.894736842105263, "grad_norm": 0.4016113579273224, "learning_rate": 1.1648073154183797e-05, "loss": 0.0339, "step": 750 }, { "epoch": 8.0, "grad_norm": 0.6715713739395142, "learning_rate": 1.0554236407650837e-05, "loss": 0.0329, "step": 760 }, { "epoch": 8.105263157894736, "grad_norm": 0.3500043749809265, "learning_rate": 9.508245385578084e-06, "loss": 0.0347, "step": 770 }, { "epoch": 8.210526315789474, "grad_norm": 0.24927648901939392, "learning_rate": 8.51136882239159e-06, "loss": 0.0333, "step": 780 }, { "epoch": 8.31578947368421, "grad_norm": 0.3222803771495819, "learning_rate": 7.564815879152471e-06, "loss": 0.0298, "step": 790 }, { "epoch": 8.421052631578947, "grad_norm": 0.3161856532096863, "learning_rate": 6.66973467690557e-06, "loss": 0.0288, "step": 800 }, { "epoch": 8.526315789473685, "grad_norm": 0.28306710720062256, "learning_rate": 5.8272109040663435e-06, "loss": 0.0291, "step": 810 }, { "epoch": 8.631578947368421, "grad_norm": 0.19094383716583252, "learning_rate": 5.03826649953561e-06, "loss": 0.0279, "step": 820 }, { "epoch": 8.736842105263158, "grad_norm": 0.2410457879304886, "learning_rate": 4.303858413139045e-06, "loss": 0.0266, "step": 830 }, { "epoch": 8.842105263157894, "grad_norm": 0.3597564101219177, "learning_rate": 3.6248774448952695e-06, "loss": 0.0271, "step": 840 }, { "epoch": 8.947368421052632, "grad_norm": 0.309693306684494, "learning_rate": 3.0021471645203192e-06, "loss": 0.0318, "step": 850 }, { "epoch": 9.052631578947368, "grad_norm": 0.196163609623909, "learning_rate": 2.436422912479053e-06, "loss": 0.0263, "step": 860 }, { "epoch": 9.157894736842104, "grad_norm": 0.2737872302532196, "learning_rate": 1.928390883795167e-06, "loss": 0.0299, "step": 870 }, { "epoch": 9.263157894736842, "grad_norm": 0.19878268241882324, "learning_rate": 1.4786672957312397e-06, "loss": 0.028, "step": 880 }, { "epoch": 9.368421052631579, "grad_norm": 0.3096800744533539, "learning_rate": 1.0877976403482314e-06, "loss": 0.0304, "step": 890 }, { "epoch": 9.473684210526315, "grad_norm": 0.2538807690143585, "learning_rate": 7.562560228510651e-07, "loss": 0.0276, "step": 900 }, { "epoch": 9.578947368421053, "grad_norm": 0.24415156245231628, "learning_rate": 4.84444586522903e-07, "loss": 0.0335, "step": 910 }, { "epoch": 9.68421052631579, "grad_norm": 0.20988790690898895, "learning_rate": 2.7269302494559725e-07, "loss": 0.0263, "step": 920 }, { "epoch": 9.789473684210526, "grad_norm": 0.21873541176319122, "learning_rate": 1.21258182097983e-07, "loss": 0.0295, "step": 930 }, { "epoch": 9.894736842105264, "grad_norm": 0.19810830056667328, "learning_rate": 3.032374081706757e-08, "loss": 0.0273, "step": 940 }, { "epoch": 10.0, "grad_norm": 0.30767157673835754, "learning_rate": 0.0, "loss": 0.026, "step": 950 }, { "epoch": 10.0, "step": 950, "total_flos": 1.3123801162092672e+17, "train_loss": 0.07566471291215796, "train_runtime": 1176.7966, "train_samples_per_second": 51.173, "train_steps_per_second": 0.807 } ], "logging_steps": 10, "max_steps": 950, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3123801162092672e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }