{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 444, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.033783783783783786, "grad_norm": 7.693039168386045, "learning_rate": 1.0869565217391305e-05, "loss": 0.5133, "step": 5 }, { "epoch": 0.06756756756756757, "grad_norm": 4.296719547848672, "learning_rate": 2.173913043478261e-05, "loss": 0.3581, "step": 10 }, { "epoch": 0.10135135135135136, "grad_norm": 3.2417658261755915, "learning_rate": 3.260869565217392e-05, "loss": 0.3547, "step": 15 }, { "epoch": 0.13513513513513514, "grad_norm": 2.8412337980778553, "learning_rate": 4.347826086956522e-05, "loss": 0.2666, "step": 20 }, { "epoch": 0.16891891891891891, "grad_norm": 4.90451510502916, "learning_rate": 4.9997494236918504e-05, "loss": 0.3582, "step": 25 }, { "epoch": 0.20270270270270271, "grad_norm": 22.780898735414855, "learning_rate": 4.996931081151707e-05, "loss": 0.3768, "step": 30 }, { "epoch": 0.23648648648648649, "grad_norm": 3.320519938493738, "learning_rate": 4.990985111773183e-05, "loss": 0.4268, "step": 35 }, { "epoch": 0.2702702702702703, "grad_norm": 2.5294359467709095, "learning_rate": 4.981919792077782e-05, "loss": 0.2873, "step": 40 }, { "epoch": 0.30405405405405406, "grad_norm": 2.3280683716637873, "learning_rate": 4.969747740582118e-05, "loss": 0.2544, "step": 45 }, { "epoch": 0.33783783783783783, "grad_norm": 2.8289139949817335, "learning_rate": 4.95448590023351e-05, "loss": 0.2697, "step": 50 }, { "epoch": 0.3716216216216216, "grad_norm": 2.1629521504260993, "learning_rate": 4.936155514826161e-05, "loss": 0.3052, "step": 55 }, { "epoch": 0.40540540540540543, "grad_norm": 1.9564591527643984, "learning_rate": 4.914782099430755e-05, "loss": 0.2319, "step": 60 }, { "epoch": 0.4391891891891892, "grad_norm": 2.6620448846059714, "learning_rate": 4.890395404878627e-05, "loss": 0.2613, "step": 65 }, { "epoch": 0.47297297297297297, "grad_norm": 2.333611941701714, "learning_rate": 4.863029376349949e-05, "loss": 0.2595, "step": 70 }, { "epoch": 0.5067567567567568, "grad_norm": 2.5677617585089147, "learning_rate": 4.8327221061235635e-05, "loss": 0.2733, "step": 75 }, { "epoch": 0.5405405405405406, "grad_norm": 1.8945353614021259, "learning_rate": 4.799515780554253e-05, "loss": 0.2298, "step": 80 }, { "epoch": 0.5743243243243243, "grad_norm": 1.9687212656745774, "learning_rate": 4.763456621351229e-05, "loss": 0.2657, "step": 85 }, { "epoch": 0.6081081081081081, "grad_norm": 2.2237600068784653, "learning_rate": 4.724594821239601e-05, "loss": 0.244, "step": 90 }, { "epoch": 0.6418918918918919, "grad_norm": 2.1737571541215295, "learning_rate": 4.6829844740943586e-05, "loss": 0.2409, "step": 95 }, { "epoch": 0.6756756756756757, "grad_norm": 1.7570800060495113, "learning_rate": 4.6386834996441395e-05, "loss": 0.2054, "step": 100 }, { "epoch": 0.7094594594594594, "grad_norm": 1.5936369087529354, "learning_rate": 4.5917535628495714e-05, "loss": 0.2658, "step": 105 }, { "epoch": 0.7432432432432432, "grad_norm": 1.687954605448381, "learning_rate": 4.542259988068434e-05, "loss": 0.2306, "step": 110 }, { "epoch": 0.777027027027027, "grad_norm": 1.6798383407257704, "learning_rate": 4.4902716681270805e-05, "loss": 0.2051, "step": 115 }, { "epoch": 0.8108108108108109, "grad_norm": 1.9894543465428474, "learning_rate": 4.435860968424745e-05, "loss": 0.2885, "step": 120 }, { "epoch": 0.8445945945945946, "grad_norm": 1.5663367660943097, "learning_rate": 4.379103626204153e-05, "loss": 0.2412, "step": 125 }, { "epoch": 0.8783783783783784, "grad_norm": 1.4832323634362194, "learning_rate": 4.320078645128699e-05, "loss": 0.2374, "step": 130 }, { "epoch": 0.9121621621621622, "grad_norm": 1.999609063337418, "learning_rate": 4.258868185312901e-05, "loss": 0.21, "step": 135 }, { "epoch": 0.9459459459459459, "grad_norm": 1.449925989442346, "learning_rate": 4.195557448959231e-05, "loss": 0.2533, "step": 140 }, { "epoch": 0.9797297297297297, "grad_norm": 1.278422978606916, "learning_rate": 4.130234561760477e-05, "loss": 0.235, "step": 145 }, { "epoch": 1.0135135135135136, "grad_norm": 1.1102776403418126, "learning_rate": 4.0629904502327556e-05, "loss": 0.2101, "step": 150 }, { "epoch": 1.0472972972972974, "grad_norm": 1.452689044129832, "learning_rate": 3.993918715149896e-05, "loss": 0.1268, "step": 155 }, { "epoch": 1.0810810810810811, "grad_norm": 2.564080697825406, "learning_rate": 3.923115501255381e-05, "loss": 0.1504, "step": 160 }, { "epoch": 1.114864864864865, "grad_norm": 1.4284081142136367, "learning_rate": 3.8506793634331925e-05, "loss": 0.1501, "step": 165 }, { "epoch": 1.1486486486486487, "grad_norm": 1.3921371445923525, "learning_rate": 3.7767111295238555e-05, "loss": 0.1517, "step": 170 }, { "epoch": 1.1824324324324325, "grad_norm": 1.0189911635230715, "learning_rate": 3.701313759976626e-05, "loss": 0.1482, "step": 175 }, { "epoch": 1.2162162162162162, "grad_norm": 1.4942069614270297, "learning_rate": 3.624592204533184e-05, "loss": 0.1297, "step": 180 }, { "epoch": 1.25, "grad_norm": 1.3920628246297202, "learning_rate": 3.546653256142321e-05, "loss": 0.1325, "step": 185 }, { "epoch": 1.2837837837837838, "grad_norm": 1.0187887226878, "learning_rate": 3.467605402308966e-05, "loss": 0.142, "step": 190 }, { "epoch": 1.3175675675675675, "grad_norm": 1.1984277342370044, "learning_rate": 3.3875586740844675e-05, "loss": 0.1347, "step": 195 }, { "epoch": 1.3513513513513513, "grad_norm": 1.1170028929426064, "learning_rate": 3.3066244929083246e-05, "loss": 0.1304, "step": 200 }, { "epoch": 1.385135135135135, "grad_norm": 1.0186976243308743, "learning_rate": 3.2249155155145665e-05, "loss": 0.141, "step": 205 }, { "epoch": 1.4189189189189189, "grad_norm": 1.2898941169402762, "learning_rate": 3.142545477118649e-05, "loss": 0.1354, "step": 210 }, { "epoch": 1.4527027027027026, "grad_norm": 0.9961209346671913, "learning_rate": 3.059629033103166e-05, "loss": 0.1302, "step": 215 }, { "epoch": 1.4864864864864864, "grad_norm": 0.9875250619369351, "learning_rate": 2.9762815994227135e-05, "loss": 0.1403, "step": 220 }, { "epoch": 1.5202702702702702, "grad_norm": 1.0982139109346354, "learning_rate": 2.8926191919500854e-05, "loss": 0.1235, "step": 225 }, { "epoch": 1.554054054054054, "grad_norm": 0.9668045928793142, "learning_rate": 2.808758264987406e-05, "loss": 0.1257, "step": 230 }, { "epoch": 1.5878378378378377, "grad_norm": 0.8581177979692445, "learning_rate": 2.7248155491669854e-05, "loss": 0.1174, "step": 235 }, { "epoch": 1.6216216216216215, "grad_norm": 1.1046256528106388, "learning_rate": 2.6409078889675382e-05, "loss": 0.1214, "step": 240 }, { "epoch": 1.6554054054054053, "grad_norm": 0.880781318496101, "learning_rate": 2.5571520800719363e-05, "loss": 0.1376, "step": 245 }, { "epoch": 1.689189189189189, "grad_norm": 0.8458236531048872, "learning_rate": 2.473664706792873e-05, "loss": 0.1026, "step": 250 }, { "epoch": 1.722972972972973, "grad_norm": 3.38684923966137, "learning_rate": 2.390561979792763e-05, "loss": 0.1335, "step": 255 }, { "epoch": 1.7567567567567568, "grad_norm": 0.8747529589638384, "learning_rate": 2.3079595743237243e-05, "loss": 0.1089, "step": 260 }, { "epoch": 1.7905405405405406, "grad_norm": 1.061409594201986, "learning_rate": 2.2259724692128448e-05, "loss": 0.1211, "step": 265 }, { "epoch": 1.8243243243243243, "grad_norm": 0.7187465056471518, "learning_rate": 2.144714786816836e-05, "loss": 0.093, "step": 270 }, { "epoch": 1.8581081081081081, "grad_norm": 0.7427271054292853, "learning_rate": 2.0642996341688498e-05, "loss": 0.1061, "step": 275 }, { "epoch": 1.8918918918918919, "grad_norm": 0.9102492961843579, "learning_rate": 1.9848389455385845e-05, "loss": 0.1117, "step": 280 }, { "epoch": 1.9256756756756757, "grad_norm": 0.6300885493957281, "learning_rate": 1.9064433266248287e-05, "loss": 0.1089, "step": 285 }, { "epoch": 1.9594594594594594, "grad_norm": 0.8142160296904185, "learning_rate": 1.829221900597305e-05, "loss": 0.1039, "step": 290 }, { "epoch": 1.9932432432432432, "grad_norm": 2.6048425820022088, "learning_rate": 1.7532821562021373e-05, "loss": 0.1028, "step": 295 }, { "epoch": 2.027027027027027, "grad_norm": 0.7413049752502284, "learning_rate": 1.6787297981423618e-05, "loss": 0.0681, "step": 300 }, { "epoch": 2.060810810810811, "grad_norm": 1.1954219728615871, "learning_rate": 1.6056685999417336e-05, "loss": 0.0589, "step": 305 }, { "epoch": 2.0945945945945947, "grad_norm": 0.5252454560354707, "learning_rate": 1.5342002594966657e-05, "loss": 0.0606, "step": 310 }, { "epoch": 2.1283783783783785, "grad_norm": 0.6859669085189071, "learning_rate": 1.4644242575173363e-05, "loss": 0.056, "step": 315 }, { "epoch": 2.1621621621621623, "grad_norm": 0.7090167332441359, "learning_rate": 1.3964377190550165e-05, "loss": 0.0608, "step": 320 }, { "epoch": 2.195945945945946, "grad_norm": 0.47669323187220364, "learning_rate": 1.330335278308384e-05, "loss": 0.0462, "step": 325 }, { "epoch": 2.22972972972973, "grad_norm": 0.3967741627628627, "learning_rate": 1.2662089468969717e-05, "loss": 0.0506, "step": 330 }, { "epoch": 2.2635135135135136, "grad_norm": 0.7073240580865503, "learning_rate": 1.2041479857851485e-05, "loss": 0.0584, "step": 335 }, { "epoch": 2.2972972972972974, "grad_norm": 0.534095245997379, "learning_rate": 1.14423878103487e-05, "loss": 0.0448, "step": 340 }, { "epoch": 2.331081081081081, "grad_norm": 0.5592832360622514, "learning_rate": 1.086564723560177e-05, "loss": 0.0515, "step": 345 }, { "epoch": 2.364864864864865, "grad_norm": 0.5012383371857895, "learning_rate": 1.031206093050798e-05, "loss": 0.0468, "step": 350 }, { "epoch": 2.3986486486486487, "grad_norm": 0.5873919693251372, "learning_rate": 9.78239946226439e-06, "loss": 0.0481, "step": 355 }, { "epoch": 2.4324324324324325, "grad_norm": 0.7663749270398796, "learning_rate": 9.277400095772979e-06, "loss": 0.051, "step": 360 }, { "epoch": 2.4662162162162162, "grad_norm": 0.593558971064878, "learning_rate": 8.797765767401159e-06, "loss": 0.0554, "step": 365 }, { "epoch": 2.5, "grad_norm": 0.5159284273389936, "learning_rate": 8.34416410652601e-06, "loss": 0.0448, "step": 370 }, { "epoch": 2.5337837837837838, "grad_norm": 0.6366287598455895, "learning_rate": 7.917226506224227e-06, "loss": 0.0472, "step": 375 }, { "epoch": 2.5675675675675675, "grad_norm": 0.9410891314952453, "learning_rate": 7.51754724440146e-06, "loss": 0.0502, "step": 380 }, { "epoch": 2.6013513513513513, "grad_norm": 0.47104911363203844, "learning_rate": 7.145682656584196e-06, "loss": 0.0494, "step": 385 }, { "epoch": 2.635135135135135, "grad_norm": 0.48833508181607244, "learning_rate": 6.802150361525786e-06, "loss": 0.0454, "step": 390 }, { "epoch": 2.668918918918919, "grad_norm": 0.5274580172661194, "learning_rate": 6.487428540704467e-06, "loss": 0.0405, "step": 395 }, { "epoch": 2.7027027027027026, "grad_norm": 0.4401735004442807, "learning_rate": 6.201955272716275e-06, "loss": 0.0387, "step": 400 }, { "epoch": 2.7364864864864864, "grad_norm": 0.5222808193914125, "learning_rate": 5.946127923489382e-06, "loss": 0.0501, "step": 405 }, { "epoch": 2.77027027027027, "grad_norm": 0.885902051673082, "learning_rate": 5.720302593168628e-06, "loss": 0.05, "step": 410 }, { "epoch": 2.804054054054054, "grad_norm": 0.4765687214481478, "learning_rate": 5.524793620440148e-06, "loss": 0.038, "step": 415 }, { "epoch": 2.8378378378378377, "grad_norm": 0.5896068685310154, "learning_rate": 5.3598731449861e-06, "loss": 0.0466, "step": 420 }, { "epoch": 2.8716216216216215, "grad_norm": 0.4787445681998641, "learning_rate": 5.225770728678475e-06, "loss": 0.0486, "step": 425 }, { "epoch": 2.9054054054054053, "grad_norm": 0.7298521032646316, "learning_rate": 5.122673036039321e-06, "loss": 0.0534, "step": 430 }, { "epoch": 2.939189189189189, "grad_norm": 0.5563651328125397, "learning_rate": 5.050723574412132e-06, "loss": 0.0399, "step": 435 }, { "epoch": 2.972972972972973, "grad_norm": 0.42205363383969957, "learning_rate": 5.010022494206098e-06, "loss": 0.0405, "step": 440 }, { "epoch": 3.0, "step": 444, "total_flos": 311859968540672.0, "train_loss": 0.15194854576576938, "train_runtime": 8344.2681, "train_samples_per_second": 3.401, "train_steps_per_second": 0.053 } ], "logging_steps": 5, "max_steps": 444, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 311859968540672.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }