{ "best_metric": 0.05326759070158005, "best_model_checkpoint": "./output/checkpoint-750", "epoch": 0.7560483870967742, "eval_steps": 150, "global_step": 750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010080645161290322, "grad_norm": 4.718156814575195, "learning_rate": 1.25e-05, "loss": 1.4603, "step": 10 }, { "epoch": 0.020161290322580645, "grad_norm": 2.660947561264038, "learning_rate": 2.5e-05, "loss": 0.9242, "step": 20 }, { "epoch": 0.03024193548387097, "grad_norm": 1.879041075706482, "learning_rate": 3.75e-05, "loss": 0.448, "step": 30 }, { "epoch": 0.04032258064516129, "grad_norm": 1.869635820388794, "learning_rate": 5e-05, "loss": 0.3737, "step": 40 }, { "epoch": 0.05040322580645161, "grad_norm": 1.2459924221038818, "learning_rate": 6.25e-05, "loss": 0.2163, "step": 50 }, { "epoch": 0.06048387096774194, "grad_norm": 1.4758652448654175, "learning_rate": 7.5e-05, "loss": 0.2292, "step": 60 }, { "epoch": 0.07056451612903226, "grad_norm": 2.0161190032958984, "learning_rate": 8.75e-05, "loss": 0.1981, "step": 70 }, { "epoch": 0.08064516129032258, "grad_norm": 1.2931406497955322, "learning_rate": 0.0001, "loss": 0.1827, "step": 80 }, { "epoch": 0.0907258064516129, "grad_norm": 1.0741955041885376, "learning_rate": 0.00011250000000000001, "loss": 0.1407, "step": 90 }, { "epoch": 0.10080645161290322, "grad_norm": 1.0442911386489868, "learning_rate": 0.000125, "loss": 0.1546, "step": 100 }, { "epoch": 0.11088709677419355, "grad_norm": 1.0597628355026245, "learning_rate": 0.00012499871543489787, "loss": 0.102, "step": 110 }, { "epoch": 0.12096774193548387, "grad_norm": 0.9368671774864197, "learning_rate": 0.00012499486179239495, "loss": 0.1541, "step": 120 }, { "epoch": 0.1310483870967742, "grad_norm": 0.7519365549087524, "learning_rate": 0.00012498843923089938, "loss": 0.1643, "step": 130 }, { "epoch": 0.14112903225806453, "grad_norm": 0.4844861328601837, "learning_rate": 0.0001249794480144175, "loss": 0.1243, "step": 140 }, { "epoch": 0.15120967741935484, "grad_norm": 0.7643828392028809, "learning_rate": 0.000124967888512543, "loss": 0.1229, "step": 150 }, { "epoch": 0.15120967741935484, "eval_loss": 0.08507546782493591, "eval_runtime": 57.5316, "eval_samples_per_second": 8.691, "eval_steps_per_second": 8.691, "step": 150 }, { "epoch": 0.16129032258064516, "grad_norm": 0.47043290734291077, "learning_rate": 0.00012495376120044173, "loss": 0.1681, "step": 160 }, { "epoch": 0.17137096774193547, "grad_norm": 0.6022606492042542, "learning_rate": 0.00012493706665883217, "loss": 0.1328, "step": 170 }, { "epoch": 0.1814516129032258, "grad_norm": 0.5551249980926514, "learning_rate": 0.00012491780557396154, "loss": 0.1344, "step": 180 }, { "epoch": 0.19153225806451613, "grad_norm": 0.8580924868583679, "learning_rate": 0.00012489597873757756, "loss": 0.1418, "step": 190 }, { "epoch": 0.20161290322580644, "grad_norm": 0.5060119032859802, "learning_rate": 0.00012487158704689602, "loss": 0.1207, "step": 200 }, { "epoch": 0.21169354838709678, "grad_norm": 0.5322176218032837, "learning_rate": 0.0001248446315045638, "loss": 0.0723, "step": 210 }, { "epoch": 0.2217741935483871, "grad_norm": 1.1198161840438843, "learning_rate": 0.00012481511321861763, "loss": 0.1267, "step": 220 }, { "epoch": 0.2318548387096774, "grad_norm": 0.317910760641098, "learning_rate": 0.00012478303340243864, "loss": 0.0905, "step": 230 }, { "epoch": 0.24193548387096775, "grad_norm": 0.5275834202766418, "learning_rate": 0.00012474839337470246, "loss": 0.1228, "step": 240 }, { "epoch": 0.25201612903225806, "grad_norm": 0.2738340497016907, "learning_rate": 0.0001247111945593249, "loss": 0.1433, "step": 250 }, { "epoch": 0.2620967741935484, "grad_norm": 0.718310534954071, "learning_rate": 0.00012467143848540359, "loss": 0.1075, "step": 260 }, { "epoch": 0.2721774193548387, "grad_norm": 0.18305560946464539, "learning_rate": 0.000124629126787155, "loss": 0.1053, "step": 270 }, { "epoch": 0.28225806451612906, "grad_norm": 0.48630911111831665, "learning_rate": 0.00012458426120384738, "loss": 0.107, "step": 280 }, { "epoch": 0.2923387096774194, "grad_norm": 0.24452829360961914, "learning_rate": 0.00012453684357972906, "loss": 0.117, "step": 290 }, { "epoch": 0.3024193548387097, "grad_norm": 0.7329663634300232, "learning_rate": 0.00012448687586395289, "loss": 0.0766, "step": 300 }, { "epoch": 0.3024193548387097, "eval_loss": 0.061199020594358444, "eval_runtime": 60.0112, "eval_samples_per_second": 8.332, "eval_steps_per_second": 8.332, "step": 300 }, { "epoch": 0.3125, "grad_norm": 0.9389004111289978, "learning_rate": 0.00012443436011049593, "loss": 0.129, "step": 310 }, { "epoch": 0.3225806451612903, "grad_norm": 0.8787228465080261, "learning_rate": 0.0001243792984780751, "loss": 0.1333, "step": 320 }, { "epoch": 0.3326612903225806, "grad_norm": 0.3159072697162628, "learning_rate": 0.00012432169323005853, "loss": 0.0931, "step": 330 }, { "epoch": 0.34274193548387094, "grad_norm": 0.7588217258453369, "learning_rate": 0.00012426154673437223, "loss": 0.1053, "step": 340 }, { "epoch": 0.3528225806451613, "grad_norm": 1.1616908311843872, "learning_rate": 0.00012419886146340314, "loss": 0.1468, "step": 350 }, { "epoch": 0.3629032258064516, "grad_norm": 0.8137270212173462, "learning_rate": 0.0001241336399938972, "loss": 0.1196, "step": 360 }, { "epoch": 0.37298387096774194, "grad_norm": 0.27941054105758667, "learning_rate": 0.00012406588500685355, "loss": 0.0915, "step": 370 }, { "epoch": 0.38306451612903225, "grad_norm": 0.22469285130500793, "learning_rate": 0.00012399559928741435, "loss": 0.0607, "step": 380 }, { "epoch": 0.39314516129032256, "grad_norm": 0.20622070133686066, "learning_rate": 0.00012392278572475023, "loss": 0.0657, "step": 390 }, { "epoch": 0.4032258064516129, "grad_norm": 0.1868823766708374, "learning_rate": 0.0001238474473119416, "loss": 0.0873, "step": 400 }, { "epoch": 0.41330645161290325, "grad_norm": 0.262215793132782, "learning_rate": 0.00012376958714585545, "loss": 0.0899, "step": 410 }, { "epoch": 0.42338709677419356, "grad_norm": 0.8614699840545654, "learning_rate": 0.0001236892084270183, "loss": 0.0724, "step": 420 }, { "epoch": 0.4334677419354839, "grad_norm": 0.917412281036377, "learning_rate": 0.00012360631445948448, "loss": 0.1351, "step": 430 }, { "epoch": 0.4435483870967742, "grad_norm": 0.8552457094192505, "learning_rate": 0.00012352090865070026, "loss": 0.1108, "step": 440 }, { "epoch": 0.4536290322580645, "grad_norm": 0.6661000847816467, "learning_rate": 0.00012343299451136397, "loss": 0.0681, "step": 450 }, { "epoch": 0.4536290322580645, "eval_loss": 0.06194188818335533, "eval_runtime": 57.5815, "eval_samples_per_second": 8.683, "eval_steps_per_second": 8.683, "step": 450 }, { "epoch": 0.4637096774193548, "grad_norm": 0.17224998772144318, "learning_rate": 0.00012334257565528155, "loss": 0.0752, "step": 460 }, { "epoch": 0.4737903225806452, "grad_norm": 0.6695602536201477, "learning_rate": 0.000123249655799218, "loss": 0.1084, "step": 470 }, { "epoch": 0.4838709677419355, "grad_norm": 0.256228506565094, "learning_rate": 0.00012315423876274468, "loss": 0.0635, "step": 480 }, { "epoch": 0.4939516129032258, "grad_norm": 0.25890034437179565, "learning_rate": 0.0001230563284680822, "loss": 0.0857, "step": 490 }, { "epoch": 0.5040322580645161, "grad_norm": 0.20878875255584717, "learning_rate": 0.00012295592893993935, "loss": 0.0967, "step": 500 }, { "epoch": 0.5141129032258065, "grad_norm": 0.23766882717609406, "learning_rate": 0.00012285304430534745, "loss": 0.1212, "step": 510 }, { "epoch": 0.5241935483870968, "grad_norm": 0.18952979147434235, "learning_rate": 0.00012274767879349083, "loss": 0.0889, "step": 520 }, { "epoch": 0.5342741935483871, "grad_norm": 0.4890676736831665, "learning_rate": 0.00012263983673553306, "loss": 0.09, "step": 530 }, { "epoch": 0.5443548387096774, "grad_norm": 0.6612870097160339, "learning_rate": 0.0001225295225644387, "loss": 0.1209, "step": 540 }, { "epoch": 0.5544354838709677, "grad_norm": 0.3861521780490875, "learning_rate": 0.0001224167408147913, "loss": 0.085, "step": 550 }, { "epoch": 0.5645161290322581, "grad_norm": 0.22604888677597046, "learning_rate": 0.0001223014961226068, "loss": 0.0877, "step": 560 }, { "epoch": 0.5745967741935484, "grad_norm": 0.4841513931751251, "learning_rate": 0.00012218379322514317, "loss": 0.0861, "step": 570 }, { "epoch": 0.5846774193548387, "grad_norm": 0.16400082409381866, "learning_rate": 0.00012206363696070545, "loss": 0.1509, "step": 580 }, { "epoch": 0.594758064516129, "grad_norm": 0.14709672331809998, "learning_rate": 0.0001219410322684471, "loss": 0.0619, "step": 590 }, { "epoch": 0.6048387096774194, "grad_norm": 0.17841552197933197, "learning_rate": 0.0001218159841881668, "loss": 0.0782, "step": 600 }, { "epoch": 0.6048387096774194, "eval_loss": 0.053511977195739746, "eval_runtime": 58.4751, "eval_samples_per_second": 8.551, "eval_steps_per_second": 8.551, "step": 600 }, { "epoch": 0.6149193548387096, "grad_norm": 0.6630131006240845, "learning_rate": 0.00012168849786010133, "loss": 0.077, "step": 610 }, { "epoch": 0.625, "grad_norm": 0.16494181752204895, "learning_rate": 0.00012155857852471433, "loss": 0.1101, "step": 620 }, { "epoch": 0.6350806451612904, "grad_norm": 0.9052111506462097, "learning_rate": 0.0001214262315224808, "loss": 0.1151, "step": 630 }, { "epoch": 0.6451612903225806, "grad_norm": 0.7651068568229675, "learning_rate": 0.00012129146229366766, "loss": 0.1093, "step": 640 }, { "epoch": 0.655241935483871, "grad_norm": 0.16763581335544586, "learning_rate": 0.00012115427637811003, "loss": 0.0711, "step": 650 }, { "epoch": 0.6653225806451613, "grad_norm": 0.6529646515846252, "learning_rate": 0.00012101467941498357, "loss": 0.1042, "step": 660 }, { "epoch": 0.6754032258064516, "grad_norm": 0.21442104876041412, "learning_rate": 0.0001208726771425727, "loss": 0.0624, "step": 670 }, { "epoch": 0.6854838709677419, "grad_norm": 0.27069932222366333, "learning_rate": 0.00012072827539803463, "loss": 0.0808, "step": 680 }, { "epoch": 0.6955645161290323, "grad_norm": 0.203572615981102, "learning_rate": 0.00012058148011715949, "loss": 0.0861, "step": 690 }, { "epoch": 0.7056451612903226, "grad_norm": 0.18072012066841125, "learning_rate": 0.00012043229733412636, "loss": 0.053, "step": 700 }, { "epoch": 0.7157258064516129, "grad_norm": 0.62689608335495, "learning_rate": 0.0001202807331812551, "loss": 0.0998, "step": 710 }, { "epoch": 0.7258064516129032, "grad_norm": 0.14409402012825012, "learning_rate": 0.00012012679388875441, "loss": 0.0709, "step": 720 }, { "epoch": 0.7358870967741935, "grad_norm": 0.17387409508228302, "learning_rate": 0.00011997048578446568, "loss": 0.1087, "step": 730 }, { "epoch": 0.7459677419354839, "grad_norm": 0.6958820223808289, "learning_rate": 0.00011981181529360282, "loss": 0.1266, "step": 740 }, { "epoch": 0.7560483870967742, "grad_norm": 0.6624193787574768, "learning_rate": 0.00011965078893848828, "loss": 0.1003, "step": 750 }, { "epoch": 0.7560483870967742, "eval_loss": 0.05326759070158005, "eval_runtime": 57.7375, "eval_samples_per_second": 8.66, "eval_steps_per_second": 8.66, "step": 750 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 150, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.769083236135731e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }