{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 512, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00390625, "grad_norm": 13.52961750257624, "learning_rate": 3.846153846153847e-07, "loss": 1.2001, "step": 1 }, { "epoch": 0.01953125, "grad_norm": 14.719058439285638, "learning_rate": 1.9230769230769234e-06, "loss": 1.2458, "step": 5 }, { "epoch": 0.0390625, "grad_norm": 4.77060856046236, "learning_rate": 3.846153846153847e-06, "loss": 1.1124, "step": 10 }, { "epoch": 0.05859375, "grad_norm": 2.4602749084741933, "learning_rate": 5.769230769230769e-06, "loss": 1.0517, "step": 15 }, { "epoch": 0.078125, "grad_norm": 1.7463079572164537, "learning_rate": 7.692307692307694e-06, "loss": 0.9195, "step": 20 }, { "epoch": 0.09765625, "grad_norm": 1.3968704201975504, "learning_rate": 9.615384615384616e-06, "loss": 0.9425, "step": 25 }, { "epoch": 0.1171875, "grad_norm": 1.427533324978979, "learning_rate": 1.1538461538461538e-05, "loss": 0.93, "step": 30 }, { "epoch": 0.13671875, "grad_norm": 1.3118669922360116, "learning_rate": 1.3461538461538463e-05, "loss": 0.8991, "step": 35 }, { "epoch": 0.15625, "grad_norm": 1.2907394066527953, "learning_rate": 1.5384615384615387e-05, "loss": 0.8919, "step": 40 }, { "epoch": 0.17578125, "grad_norm": 1.4159243735379308, "learning_rate": 1.730769230769231e-05, "loss": 0.8803, "step": 45 }, { "epoch": 0.1953125, "grad_norm": 1.299112560801352, "learning_rate": 1.923076923076923e-05, "loss": 0.8959, "step": 50 }, { "epoch": 0.21484375, "grad_norm": 1.2702838813576585, "learning_rate": 1.99979011499924e-05, "loss": 0.8755, "step": 55 }, { "epoch": 0.234375, "grad_norm": 1.367440629877651, "learning_rate": 1.998507803482828e-05, "loss": 0.9024, "step": 60 }, { "epoch": 0.25390625, "grad_norm": 1.2280977464389735, "learning_rate": 1.996061276533154e-05, "loss": 0.8664, "step": 65 }, { "epoch": 0.2734375, "grad_norm": 1.3022228088323475, "learning_rate": 1.9924533866912017e-05, "loss": 0.8665, "step": 70 }, { "epoch": 0.29296875, "grad_norm": 1.2078818317069273, "learning_rate": 1.9876883405951378e-05, "loss": 0.8624, "step": 75 }, { "epoch": 0.3125, "grad_norm": 1.5644048727951139, "learning_rate": 1.9817716940755586e-05, "loss": 0.8761, "step": 80 }, { "epoch": 0.33203125, "grad_norm": 1.3636980168093404, "learning_rate": 1.9747103456776406e-05, "loss": 0.8376, "step": 85 }, { "epoch": 0.3515625, "grad_norm": 1.2165350413614253, "learning_rate": 1.9665125286177448e-05, "loss": 0.8769, "step": 90 }, { "epoch": 0.37109375, "grad_norm": 1.2457077882152825, "learning_rate": 1.9571878011838557e-05, "loss": 0.8728, "step": 95 }, { "epoch": 0.390625, "grad_norm": 2.4186928992413006, "learning_rate": 1.9467470355910438e-05, "loss": 0.8834, "step": 100 }, { "epoch": 0.41015625, "grad_norm": 1.2020347163023293, "learning_rate": 1.935202405304951e-05, "loss": 0.8585, "step": 105 }, { "epoch": 0.4296875, "grad_norm": 1.1135509900758418, "learning_rate": 1.922567370848072e-05, "loss": 0.8443, "step": 110 }, { "epoch": 0.44921875, "grad_norm": 1.190845364147945, "learning_rate": 1.9088566641053887e-05, "loss": 0.868, "step": 115 }, { "epoch": 0.46875, "grad_norm": 1.107956734795049, "learning_rate": 1.8940862711476515e-05, "loss": 0.8316, "step": 120 }, { "epoch": 0.48828125, "grad_norm": 1.2765334463268576, "learning_rate": 1.878273413592334e-05, "loss": 0.8578, "step": 125 }, { "epoch": 0.5078125, "grad_norm": 1.1648480884158616, "learning_rate": 1.8614365285240002e-05, "loss": 0.846, "step": 130 }, { "epoch": 0.52734375, "grad_norm": 1.135005532726839, "learning_rate": 1.8435952469974858e-05, "loss": 0.8615, "step": 135 }, { "epoch": 0.546875, "grad_norm": 1.2325035965433688, "learning_rate": 1.8247703711489684e-05, "loss": 0.8349, "step": 140 }, { "epoch": 0.56640625, "grad_norm": 1.0925509469327004, "learning_rate": 1.804983849941607e-05, "loss": 0.8546, "step": 145 }, { "epoch": 0.5859375, "grad_norm": 1.2488571066795477, "learning_rate": 1.7842587535740315e-05, "loss": 0.9118, "step": 150 }, { "epoch": 0.60546875, "grad_norm": 1.1937624775603797, "learning_rate": 1.762619246581524e-05, "loss": 0.8505, "step": 155 }, { "epoch": 0.625, "grad_norm": 1.1137502231777754, "learning_rate": 1.740090559661252e-05, "loss": 0.8352, "step": 160 }, { "epoch": 0.64453125, "grad_norm": 1.1828666444300104, "learning_rate": 1.7166989602544036e-05, "loss": 0.8359, "step": 165 }, { "epoch": 0.6640625, "grad_norm": 3.65203815585326, "learning_rate": 1.6924717219195258e-05, "loss": 0.8689, "step": 170 }, { "epoch": 0.68359375, "grad_norm": 1.1416859572228402, "learning_rate": 1.667437092532776e-05, "loss": 0.8472, "step": 175 }, { "epoch": 0.703125, "grad_norm": 1.1051850039812487, "learning_rate": 1.6416242613521612e-05, "loss": 0.828, "step": 180 }, { "epoch": 0.72265625, "grad_norm": 1.1362330964244478, "learning_rate": 1.6150633249841696e-05, "loss": 0.8316, "step": 185 }, { "epoch": 0.7421875, "grad_norm": 1.1663508920755614, "learning_rate": 1.5877852522924733e-05, "loss": 0.8199, "step": 190 }, { "epoch": 0.76171875, "grad_norm": 1.069723196727746, "learning_rate": 1.5598218482896182e-05, "loss": 0.8322, "step": 195 }, { "epoch": 0.78125, "grad_norm": 1.1519350666106931, "learning_rate": 1.5312057170538033e-05, "loss": 0.8329, "step": 200 }, { "epoch": 0.80078125, "grad_norm": 1.113639432195006, "learning_rate": 1.501970223713983e-05, "loss": 0.8501, "step": 205 }, { "epoch": 0.8203125, "grad_norm": 6.880127196389451, "learning_rate": 1.4721494555476189e-05, "loss": 0.8796, "step": 210 }, { "epoch": 0.83984375, "grad_norm": 1.8661887735033835, "learning_rate": 1.4417781822364396e-05, "loss": 0.8453, "step": 215 }, { "epoch": 0.859375, "grad_norm": 1.1561250264726648, "learning_rate": 1.4108918153265485e-05, "loss": 0.8785, "step": 220 }, { "epoch": 0.87890625, "grad_norm": 1.0352123961524995, "learning_rate": 1.379526366940142e-05, "loss": 0.7975, "step": 225 }, { "epoch": 0.8984375, "grad_norm": 1.1431323585880604, "learning_rate": 1.3477184077869892e-05, "loss": 0.9143, "step": 230 }, { "epoch": 0.91796875, "grad_norm": 1.0873880063235835, "learning_rate": 1.3155050245246171e-05, "loss": 0.822, "step": 235 }, { "epoch": 0.9375, "grad_norm": 1.0523607160783444, "learning_rate": 1.28292377651693e-05, "loss": 0.8323, "step": 240 }, { "epoch": 0.95703125, "grad_norm": 1.0157998886067603, "learning_rate": 1.250012652041669e-05, "loss": 0.8663, "step": 245 }, { "epoch": 0.9765625, "grad_norm": 1.050004770812735, "learning_rate": 1.216810023997781e-05, "loss": 0.795, "step": 250 }, { "epoch": 0.99609375, "grad_norm": 0.9901708666135668, "learning_rate": 1.1833546051643325e-05, "loss": 0.809, "step": 255 }, { "epoch": 1.0, "eval_loss": 0.8800936341285706, "eval_runtime": 4.2726, "eval_samples_per_second": 38.384, "eval_steps_per_second": 0.702, "step": 256 }, { "epoch": 1.015625, "grad_norm": 1.6464925548656197, "learning_rate": 1.1496854030631443e-05, "loss": 0.6149, "step": 260 }, { "epoch": 1.03515625, "grad_norm": 1.072559199121631, "learning_rate": 1.1158416744777644e-05, "loss": 0.5546, "step": 265 }, { "epoch": 1.0546875, "grad_norm": 1.108994610542106, "learning_rate": 1.0818628796818134e-05, "loss": 0.5328, "step": 270 }, { "epoch": 1.07421875, "grad_norm": 1.0468024771181579, "learning_rate": 1.0477886364300722e-05, "loss": 0.5665, "step": 275 }, { "epoch": 1.09375, "grad_norm": 1.1024876273097197, "learning_rate": 1.013658673765951e-05, "loss": 0.5031, "step": 280 }, { "epoch": 1.11328125, "grad_norm": 1.0010594169346207, "learning_rate": 9.79512785699204e-06, "loss": 0.5128, "step": 285 }, { "epoch": 1.1328125, "grad_norm": 0.9826520234931142, "learning_rate": 9.453907848078901e-06, "loss": 0.5214, "step": 290 }, { "epoch": 1.15234375, "grad_norm": 1.06456378745709, "learning_rate": 9.113324558186922e-06, "loss": 0.4919, "step": 295 }, { "epoch": 1.171875, "grad_norm": 1.0193727837903817, "learning_rate": 8.773775092197018e-06, "loss": 0.5426, "step": 300 }, { "epoch": 1.19140625, "grad_norm": 1.1463010979822952, "learning_rate": 8.43565534959769e-06, "loss": 0.5146, "step": 305 }, { "epoch": 1.2109375, "grad_norm": 1.0520157917383735, "learning_rate": 8.099359562883931e-06, "loss": 0.5131, "step": 310 }, { "epoch": 1.23046875, "grad_norm": 1.1402388191576989, "learning_rate": 7.76527983789973e-06, "loss": 0.5388, "step": 315 }, { "epoch": 1.25, "grad_norm": 1.1994848233232585, "learning_rate": 7.433805696660267e-06, "loss": 0.5114, "step": 320 }, { "epoch": 1.26953125, "grad_norm": 1.1554670397128242, "learning_rate": 7.105323623186595e-06, "loss": 0.5415, "step": 325 }, { "epoch": 1.2890625, "grad_norm": 1.0601143896880463, "learning_rate": 6.78021661288262e-06, "loss": 0.5271, "step": 330 }, { "epoch": 1.30859375, "grad_norm": 0.9948697176939006, "learning_rate": 6.458863725979549e-06, "loss": 0.5093, "step": 335 }, { "epoch": 1.328125, "grad_norm": 1.2614924364675497, "learning_rate": 6.141639645568646e-06, "loss": 0.5263, "step": 340 }, { "epoch": 1.34765625, "grad_norm": 1.0090836036415356, "learning_rate": 5.828914240737496e-06, "loss": 0.5241, "step": 345 }, { "epoch": 1.3671875, "grad_norm": 0.9710310992423488, "learning_rate": 5.521052135319182e-06, "loss": 0.5223, "step": 350 }, { "epoch": 1.38671875, "grad_norm": 0.9666264916437364, "learning_rate": 5.2184122827572315e-06, "loss": 0.522, "step": 355 }, { "epoch": 1.40625, "grad_norm": 0.9913680270139593, "learning_rate": 4.921347547581939e-06, "loss": 0.5026, "step": 360 }, { "epoch": 1.42578125, "grad_norm": 1.1019424690864423, "learning_rate": 4.630204293986122e-06, "loss": 0.5055, "step": 365 }, { "epoch": 1.4453125, "grad_norm": 1.1364708272450168, "learning_rate": 4.345321981979942e-06, "loss": 0.5126, "step": 370 }, { "epoch": 1.46484375, "grad_norm": 0.944966666142233, "learning_rate": 4.067032771595749e-06, "loss": 0.5155, "step": 375 }, { "epoch": 1.484375, "grad_norm": 0.9992299626369062, "learning_rate": 3.7956611356043196e-06, "loss": 0.5262, "step": 380 }, { "epoch": 1.50390625, "grad_norm": 2.2555525600089146, "learning_rate": 3.53152348119413e-06, "loss": 0.5274, "step": 385 }, { "epoch": 1.5234375, "grad_norm": 2.6543631759205293, "learning_rate": 3.2749277810547286e-06, "loss": 0.5407, "step": 390 }, { "epoch": 1.54296875, "grad_norm": 0.9454239654950802, "learning_rate": 3.0261732142943435e-06, "loss": 0.5088, "step": 395 }, { "epoch": 1.5625, "grad_norm": 0.9651848810153458, "learning_rate": 2.7855498176104435e-06, "loss": 0.5156, "step": 400 }, { "epoch": 1.58203125, "grad_norm": 0.9488391689639479, "learning_rate": 2.5533381471199138e-06, "loss": 0.5027, "step": 405 }, { "epoch": 1.6015625, "grad_norm": 1.0344338188963158, "learning_rate": 2.3298089512431744e-06, "loss": 0.516, "step": 410 }, { "epoch": 1.62109375, "grad_norm": 1.0757146443275445, "learning_rate": 2.1152228550236264e-06, "loss": 0.4998, "step": 415 }, { "epoch": 1.640625, "grad_norm": 1.0916629078435063, "learning_rate": 1.9098300562505266e-06, "loss": 0.5618, "step": 420 }, { "epoch": 1.66015625, "grad_norm": 0.9001642552665725, "learning_rate": 1.713870033739541e-06, "loss": 0.5004, "step": 425 }, { "epoch": 1.6796875, "grad_norm": 0.9887925721175237, "learning_rate": 1.5275712681111643e-06, "loss": 0.5118, "step": 430 }, { "epoch": 1.69921875, "grad_norm": 0.9521081586764941, "learning_rate": 1.3511509753925422e-06, "loss": 0.5313, "step": 435 }, { "epoch": 1.71875, "grad_norm": 0.9103965187812971, "learning_rate": 1.1848148537532845e-06, "loss": 0.5009, "step": 440 }, { "epoch": 1.73828125, "grad_norm": 0.9494855728059224, "learning_rate": 1.0287568436706208e-06, "loss": 0.4873, "step": 445 }, { "epoch": 1.7578125, "grad_norm": 1.0147241592824456, "learning_rate": 8.831589018034659e-07, "loss": 0.5237, "step": 450 }, { "epoch": 1.77734375, "grad_norm": 0.9625772091753605, "learning_rate": 7.481907888390994e-07, "loss": 0.5034, "step": 455 }, { "epoch": 1.796875, "grad_norm": 0.9458519608918025, "learning_rate": 6.240098715597975e-07, "loss": 0.5066, "step": 460 }, { "epoch": 1.81640625, "grad_norm": 0.9480774154256925, "learning_rate": 5.107609393602019e-07, "loss": 0.5054, "step": 465 }, { "epoch": 1.8359375, "grad_norm": 0.9585344451470228, "learning_rate": 4.0857603542936776e-07, "loss": 0.5494, "step": 470 }, { "epoch": 1.85546875, "grad_norm": 0.9741379097553984, "learning_rate": 3.175743027943079e-07, "loss": 0.5096, "step": 475 }, { "epoch": 1.875, "grad_norm": 0.9143220140212833, "learning_rate": 2.3786184540455449e-07, "loss": 0.5118, "step": 480 }, { "epoch": 1.89453125, "grad_norm": 0.9419358410206972, "learning_rate": 1.6953160441969707e-07, "loss": 0.5156, "step": 485 }, { "epoch": 1.9140625, "grad_norm": 0.9508691680531732, "learning_rate": 1.1266324984415266e-07, "loss": 0.5034, "step": 490 }, { "epoch": 1.93359375, "grad_norm": 0.9252556940433395, "learning_rate": 6.732308763550022e-08, "loss": 0.5109, "step": 495 }, { "epoch": 1.953125, "grad_norm": 0.9221461974805585, "learning_rate": 3.356398239470427e-08, "loss": 0.5186, "step": 500 }, { "epoch": 1.97265625, "grad_norm": 0.9366413757174445, "learning_rate": 1.142529572835227e-08, "loss": 0.4819, "step": 505 }, { "epoch": 1.9921875, "grad_norm": 0.896293999569403, "learning_rate": 9.328403547792518e-10, "loss": 0.5409, "step": 510 }, { "epoch": 2.0, "eval_loss": 0.8888566493988037, "eval_runtime": 5.0992, "eval_samples_per_second": 32.162, "eval_steps_per_second": 0.588, "step": 512 }, { "epoch": 2.0, "step": 512, "total_flos": 107202383708160.0, "train_loss": 0.697848245035857, "train_runtime": 3596.6385, "train_samples_per_second": 9.078, "train_steps_per_second": 0.142 } ], "logging_steps": 5, "max_steps": 512, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 107202383708160.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }