{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9993935718617344, "eval_steps": 100, "global_step": 412, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01212856276531231, "grad_norm": 14.8125, "learning_rate": 2.380952380952381e-06, "loss": 0.8047, "mean_token_accuracy": 0.8034533846529813, "step": 5 }, { "epoch": 0.02425712553062462, "grad_norm": 6.96875, "learning_rate": 4.761904761904762e-06, "loss": 0.7637, "mean_token_accuracy": 0.8090282869012707, "step": 10 }, { "epoch": 0.036385688295936934, "grad_norm": 3.984375, "learning_rate": 7.1428571428571436e-06, "loss": 0.721, "mean_token_accuracy": 0.8129444648093843, "step": 15 }, { "epoch": 0.04851425106124924, "grad_norm": 2.90625, "learning_rate": 9.523809523809525e-06, "loss": 0.6684, "mean_token_accuracy": 0.8205217497556208, "step": 20 }, { "epoch": 0.06064281382656155, "grad_norm": 2.0625, "learning_rate": 1.1904761904761905e-05, "loss": 0.6118, "mean_token_accuracy": 0.8317586143695014, "step": 25 }, { "epoch": 0.07277137659187387, "grad_norm": 1.296875, "learning_rate": 1.4285714285714287e-05, "loss": 0.6032, "mean_token_accuracy": 0.8310453323558162, "step": 30 }, { "epoch": 0.08489993935718618, "grad_norm": 1.0703125, "learning_rate": 1.6666666666666667e-05, "loss": 0.5922, "mean_token_accuracy": 0.8324016373411535, "step": 35 }, { "epoch": 0.09702850212249849, "grad_norm": 0.99609375, "learning_rate": 1.904761904761905e-05, "loss": 0.5777, "mean_token_accuracy": 0.8366324535679374, "step": 40 }, { "epoch": 0.1091570648878108, "grad_norm": 0.93359375, "learning_rate": 1.9996755966513702e-05, "loss": 0.584, "mean_token_accuracy": 0.8340573069403714, "step": 45 }, { "epoch": 0.1212856276531231, "grad_norm": 0.94921875, "learning_rate": 1.9976938939060172e-05, "loss": 0.5828, "mean_token_accuracy": 0.8344712243401758, "step": 50 }, { "epoch": 0.1334141904184354, "grad_norm": 0.91796875, "learning_rate": 1.99391427932908e-05, "loss": 0.5625, "mean_token_accuracy": 0.8387799364613879, "step": 55 }, { "epoch": 0.14554275318374774, "grad_norm": 1.5625, "learning_rate": 1.9883435640414922e-05, "loss": 0.565, "mean_token_accuracy": 0.8390288978494626, "step": 60 }, { "epoch": 0.15767131594906003, "grad_norm": 0.91015625, "learning_rate": 1.980991786848731e-05, "loss": 0.5705, "mean_token_accuracy": 0.8378207478005866, "step": 65 }, { "epoch": 0.16979987871437235, "grad_norm": 0.953125, "learning_rate": 1.971872196150208e-05, "loss": 0.5674, "mean_token_accuracy": 0.8382407746823068, "step": 70 }, { "epoch": 0.18192844147968465, "grad_norm": 0.8671875, "learning_rate": 1.961001226064762e-05, "loss": 0.5744, "mean_token_accuracy": 0.8358473851417398, "step": 75 }, { "epoch": 0.19405700424499697, "grad_norm": 0.8828125, "learning_rate": 1.9483984668152618e-05, "loss": 0.5809, "mean_token_accuracy": 0.8347812805474095, "step": 80 }, { "epoch": 0.20618556701030927, "grad_norm": 1.0078125, "learning_rate": 1.9340866294257044e-05, "loss": 0.5687, "mean_token_accuracy": 0.8378635141739978, "step": 85 }, { "epoch": 0.2183141297756216, "grad_norm": 6.25, "learning_rate": 1.918091504794411e-05, "loss": 0.565, "mean_token_accuracy": 0.8387218963831866, "step": 90 }, { "epoch": 0.23044269254093389, "grad_norm": 0.9296875, "learning_rate": 1.9004419172170887e-05, "loss": 0.5746, "mean_token_accuracy": 0.8359634652981427, "step": 95 }, { "epoch": 0.2425712553062462, "grad_norm": 0.921875, "learning_rate": 1.881169672443498e-05, "loss": 0.5633, "mean_token_accuracy": 0.8383049242424242, "step": 100 }, { "epoch": 0.2425712553062462, "eval_loss": 0.5801094770431519, "eval_mean_token_accuracy": 0.8435259856630825, "eval_runtime": 0.5662, "eval_samples_per_second": 128.931, "eval_steps_per_second": 5.299, "step": 100 }, { "epoch": 0.2546998180715585, "grad_norm": 0.8828125, "learning_rate": 1.860309500361345e-05, "loss": 0.5686, "mean_token_accuracy": 0.8384546065493647, "step": 105 }, { "epoch": 0.2668283808368708, "grad_norm": 0.921875, "learning_rate": 1.837898992410674e-05, "loss": 0.5634, "mean_token_accuracy": 0.8386195625610948, "step": 110 }, { "epoch": 0.27895694360218315, "grad_norm": 0.9375, "learning_rate": 1.8139785338415515e-05, "loss": 0.5706, "mean_token_accuracy": 0.8389448924731184, "step": 115 }, { "epoch": 0.2910855063674955, "grad_norm": 0.890625, "learning_rate": 1.788591230937119e-05, "loss": 0.547, "mean_token_accuracy": 0.8426228005865102, "step": 120 }, { "epoch": 0.30321406913280774, "grad_norm": 0.91015625, "learning_rate": 1.7617828333331547e-05, "loss": 0.5609, "mean_token_accuracy": 0.8385706867057674, "step": 125 }, { "epoch": 0.31534263189812006, "grad_norm": 0.90625, "learning_rate": 1.733601651574137e-05, "loss": 0.5632, "mean_token_accuracy": 0.8384118401759533, "step": 130 }, { "epoch": 0.3274711946634324, "grad_norm": 0.8828125, "learning_rate": 1.704098470054379e-05, "loss": 0.5547, "mean_token_accuracy": 0.8413352272727275, "step": 135 }, { "epoch": 0.3395997574287447, "grad_norm": 2.921875, "learning_rate": 1.6733264555011196e-05, "loss": 0.5655, "mean_token_accuracy": 0.8374816715542522, "step": 140 }, { "epoch": 0.35172832019405703, "grad_norm": 0.91796875, "learning_rate": 1.6413410611644827e-05, "loss": 0.5737, "mean_token_accuracy": 0.8358046187683283, "step": 145 }, { "epoch": 0.3638568829593693, "grad_norm": 0.89453125, "learning_rate": 1.6081999268869765e-05, "loss": 0.5612, "mean_token_accuracy": 0.8389051808406647, "step": 150 }, { "epoch": 0.3759854457246816, "grad_norm": 0.859375, "learning_rate": 1.5739627752325995e-05, "loss": 0.5586, "mean_token_accuracy": 0.8389387829912023, "step": 155 }, { "epoch": 0.38811400848999394, "grad_norm": 0.8828125, "learning_rate": 1.538691303862744e-05, "loss": 0.5651, "mean_token_accuracy": 0.8381796798631476, "step": 160 }, { "epoch": 0.40024257125530627, "grad_norm": 0.8515625, "learning_rate": 1.5024490743528393e-05, "loss": 0.5635, "mean_token_accuracy": 0.8385920698924731, "step": 165 }, { "epoch": 0.41237113402061853, "grad_norm": 0.890625, "learning_rate": 1.4653013976500977e-05, "loss": 0.5644, "mean_token_accuracy": 0.8381750977517107, "step": 170 }, { "epoch": 0.42449969678593086, "grad_norm": 0.88671875, "learning_rate": 1.4273152163787726e-05, "loss": 0.5645, "mean_token_accuracy": 0.8384500244379277, "step": 175 }, { "epoch": 0.4366282595512432, "grad_norm": 0.90234375, "learning_rate": 1.3885589842050253e-05, "loss": 0.562, "mean_token_accuracy": 0.8376863391984362, "step": 180 }, { "epoch": 0.4487568223165555, "grad_norm": 0.96875, "learning_rate": 1.3491025424787916e-05, "loss": 0.5687, "mean_token_accuracy": 0.8375504032258064, "step": 185 }, { "epoch": 0.46088538508186777, "grad_norm": 0.8984375, "learning_rate": 1.3090169943749475e-05, "loss": 0.571, "mean_token_accuracy": 0.8365408113391984, "step": 190 }, { "epoch": 0.4730139478471801, "grad_norm": 0.84375, "learning_rate": 1.2683745767605846e-05, "loss": 0.5591, "mean_token_accuracy": 0.83859665200391, "step": 195 }, { "epoch": 0.4851425106124924, "grad_norm": 0.90234375, "learning_rate": 1.2272485300192902e-05, "loss": 0.5463, "mean_token_accuracy": 0.8424104960899316, "step": 200 }, { "epoch": 0.4851425106124924, "eval_loss": 0.5737597942352295, "eval_mean_token_accuracy": 0.8452422975272437, "eval_runtime": 0.5658, "eval_samples_per_second": 129.014, "eval_steps_per_second": 5.302, "step": 200 }, { "epoch": 0.49727107337780474, "grad_norm": 0.90234375, "learning_rate": 1.1857129660670281e-05, "loss": 0.5524, "mean_token_accuracy": 0.8409075635386118, "step": 205 }, { "epoch": 0.509399636143117, "grad_norm": 0.8671875, "learning_rate": 1.1438427347974554e-05, "loss": 0.5547, "mean_token_accuracy": 0.8406127810361681, "step": 210 }, { "epoch": 0.5215281989084294, "grad_norm": 0.921875, "learning_rate": 1.101713289197356e-05, "loss": 0.5697, "mean_token_accuracy": 0.8376405180840665, "step": 215 }, { "epoch": 0.5336567616737417, "grad_norm": 0.87890625, "learning_rate": 1.0594005493752568e-05, "loss": 0.5496, "mean_token_accuracy": 0.8417033235581621, "step": 220 }, { "epoch": 0.5457853244390539, "grad_norm": 0.8515625, "learning_rate": 1.0169807657482625e-05, "loss": 0.5546, "mean_token_accuracy": 0.8411183406647116, "step": 225 }, { "epoch": 0.5579138872043663, "grad_norm": 0.89453125, "learning_rate": 9.745303816336488e-06, "loss": 0.5588, "mean_token_accuracy": 0.8404768450635386, "step": 230 }, { "epoch": 0.5700424499696786, "grad_norm": 0.875, "learning_rate": 9.321258954928394e-06, "loss": 0.548, "mean_token_accuracy": 0.8420546187683285, "step": 235 }, { "epoch": 0.582171012734991, "grad_norm": 0.88671875, "learning_rate": 8.898437230760058e-06, "loss": 0.5577, "mean_token_accuracy": 0.8416300097751712, "step": 240 }, { "epoch": 0.5942995755003032, "grad_norm": 0.87109375, "learning_rate": 8.47760059715722e-06, "loss": 0.5555, "mean_token_accuracy": 0.840709005376344, "step": 245 }, { "epoch": 0.6064281382656155, "grad_norm": 0.87109375, "learning_rate": 8.059507430178248e-06, "loss": 0.5689, "mean_token_accuracy": 0.8373686461388076, "step": 250 }, { "epoch": 0.6185567010309279, "grad_norm": 0.875, "learning_rate": 7.644911161969225e-06, "loss": 0.5592, "mean_token_accuracy": 0.8399468475073313, "step": 255 }, { "epoch": 0.6306852637962401, "grad_norm": 0.8984375, "learning_rate": 7.2345589230282895e-06, "loss": 0.5643, "mean_token_accuracy": 0.8382331378299119, "step": 260 }, { "epoch": 0.6428138265615525, "grad_norm": 0.875, "learning_rate": 6.829190195825983e-06, "loss": 0.5619, "mean_token_accuracy": 0.8390609726295211, "step": 265 }, { "epoch": 0.6549423893268648, "grad_norm": 0.86328125, "learning_rate": 6.429535482207847e-06, "loss": 0.5524, "mean_token_accuracy": 0.8410572458455523, "step": 270 }, { "epoch": 0.667070952092177, "grad_norm": 0.88671875, "learning_rate": 6.036314986980749e-06, "loss": 0.5562, "mean_token_accuracy": 0.8407471896383187, "step": 275 }, { "epoch": 0.6791995148574894, "grad_norm": 0.921875, "learning_rate": 5.650237320055107e-06, "loss": 0.5642, "mean_token_accuracy": 0.8386149804496578, "step": 280 }, { "epoch": 0.6913280776228017, "grad_norm": 0.89453125, "learning_rate": 5.271998219481953e-06, "loss": 0.5696, "mean_token_accuracy": 0.8364155669599217, "step": 285 }, { "epoch": 0.7034566403881141, "grad_norm": 0.8671875, "learning_rate": 4.902279297685945e-06, "loss": 0.5426, "mean_token_accuracy": 0.844026454056696, "step": 290 }, { "epoch": 0.7155852031534263, "grad_norm": 0.8828125, "learning_rate": 4.541746813153698e-06, "loss": 0.5547, "mean_token_accuracy": 0.8409136730205278, "step": 295 }, { "epoch": 0.7277137659187386, "grad_norm": 0.8671875, "learning_rate": 4.191050469790961e-06, "loss": 0.5606, "mean_token_accuracy": 0.8398429863147605, "step": 300 }, { "epoch": 0.7277137659187386, "eval_loss": 0.5722371339797974, "eval_mean_token_accuracy": 0.8450929546359655, "eval_runtime": 0.5672, "eval_samples_per_second": 128.703, "eval_steps_per_second": 5.289, "step": 300 }, { "epoch": 0.739842328684051, "grad_norm": 0.85546875, "learning_rate": 3.8508222461122565e-06, "loss": 0.5574, "mean_token_accuracy": 0.840337854349951, "step": 305 }, { "epoch": 0.7519708914493632, "grad_norm": 0.87109375, "learning_rate": 3.5216752563729004e-06, "loss": 0.5512, "mean_token_accuracy": 0.8418026026392962, "step": 310 }, { "epoch": 0.7640994542146755, "grad_norm": 0.8515625, "learning_rate": 3.2042026456956554e-06, "loss": 0.5604, "mean_token_accuracy": 0.8396490102639296, "step": 315 }, { "epoch": 0.7762280169799879, "grad_norm": 0.88671875, "learning_rate": 2.8989765211831433e-06, "loss": 0.5566, "mean_token_accuracy": 0.8406647116324534, "step": 320 }, { "epoch": 0.7883565797453002, "grad_norm": 0.90625, "learning_rate": 2.6065469209421568e-06, "loss": 0.5445, "mean_token_accuracy": 0.8426793132942327, "step": 325 }, { "epoch": 0.8004851425106125, "grad_norm": 0.88671875, "learning_rate": 2.3274408228778357e-06, "loss": 0.5578, "mean_token_accuracy": 0.8400522360703813, "step": 330 }, { "epoch": 0.8126137052759248, "grad_norm": 0.859375, "learning_rate": 2.062161195043851e-06, "loss": 0.5553, "mean_token_accuracy": 0.8400812561094819, "step": 335 }, { "epoch": 0.8247422680412371, "grad_norm": 0.8828125, "learning_rate": 1.8111860892599832e-06, "loss": 0.5528, "mean_token_accuracy": 0.841089320625611, "step": 340 }, { "epoch": 0.8368708308065494, "grad_norm": 0.8984375, "learning_rate": 1.574967779630414e-06, "loss": 0.5516, "mean_token_accuracy": 0.8412099828934508, "step": 345 }, { "epoch": 0.8489993935718617, "grad_norm": 0.84765625, "learning_rate": 1.3539319475152457e-06, "loss": 0.5465, "mean_token_accuracy": 0.8418774437927663, "step": 350 }, { "epoch": 0.8611279563371741, "grad_norm": 0.8828125, "learning_rate": 1.1484769144239038e-06, "loss": 0.5579, "mean_token_accuracy": 0.8393939393939392, "step": 355 }, { "epoch": 0.8732565191024864, "grad_norm": 0.83984375, "learning_rate": 9.589729242128531e-07, "loss": 0.5568, "mean_token_accuracy": 0.8397498167155426, "step": 360 }, { "epoch": 0.8853850818677986, "grad_norm": 0.85546875, "learning_rate": 7.857614758811527e-07, "loss": 0.5573, "mean_token_accuracy": 0.8403760386119259, "step": 365 }, { "epoch": 0.897513644633111, "grad_norm": 0.875, "learning_rate": 6.291547081661631e-07, "loss": 0.5459, "mean_token_accuracy": 0.8424960288367547, "step": 370 }, { "epoch": 0.9096422073984233, "grad_norm": 0.86328125, "learning_rate": 4.894348370484648e-07, "loss": 0.5585, "mean_token_accuracy": 0.8394794721407625, "step": 375 }, { "epoch": 0.9217707701637355, "grad_norm": 0.8828125, "learning_rate": 3.668536471795614e-07, "loss": 0.5671, "mean_token_accuracy": 0.8379566837732162, "step": 380 }, { "epoch": 0.9338993329290479, "grad_norm": 0.8984375, "learning_rate": 2.6163203814894013e-07, "loss": 0.5508, "mean_token_accuracy": 0.8413230083088953, "step": 385 }, { "epoch": 0.9460278956943602, "grad_norm": 0.8828125, "learning_rate": 1.7395962640806675e-07, "loss": 0.5474, "mean_token_accuracy": 0.8426228005865102, "step": 390 }, { "epoch": 0.9581564584596726, "grad_norm": 0.89453125, "learning_rate": 1.039944035687368e-07, "loss": 0.558, "mean_token_accuracy": 0.8407914833822092, "step": 395 }, { "epoch": 0.9702850212249848, "grad_norm": 0.90234375, "learning_rate": 5.186245169149784e-08, "loss": 0.5661, "mean_token_accuracy": 0.8385386119257087, "step": 400 }, { "epoch": 0.9702850212249848, "eval_loss": 0.5721316337585449, "eval_mean_token_accuracy": 0.8450646699974657, "eval_runtime": 0.5673, "eval_samples_per_second": 128.69, "eval_steps_per_second": 5.289, "step": 400 }, { "epoch": 0.9824135839902971, "grad_norm": 0.90234375, "learning_rate": 1.7657716077265075e-08, "loss": 0.562, "mean_token_accuracy": 0.8391877443792767, "step": 405 }, { "epoch": 0.9945421467556095, "grad_norm": 0.90234375, "learning_rate": 1.4418359715351327e-09, "loss": 0.558, "mean_token_accuracy": 0.8401652614858259, "step": 410 }, { "epoch": 0.9993935718617344, "mean_token_accuracy": 0.8415696786412512, "step": 412, "total_flos": 1.1596255879128678e+17, "train_loss": 0.571030998982272, "train_runtime": 553.8515, "train_samples_per_second": 95.222, "train_steps_per_second": 0.744 } ], "logging_steps": 5, "max_steps": 412, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1596255879128678e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }