{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 444,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.033783783783783786,
      "grad_norm": 7.693039168386045,
      "learning_rate": 1.0869565217391305e-05,
      "loss": 0.5133,
      "step": 5
    },
    {
      "epoch": 0.06756756756756757,
      "grad_norm": 4.296719547848672,
      "learning_rate": 2.173913043478261e-05,
      "loss": 0.3581,
      "step": 10
    },
    {
      "epoch": 0.10135135135135136,
      "grad_norm": 3.2417658261755915,
      "learning_rate": 3.260869565217392e-05,
      "loss": 0.3547,
      "step": 15
    },
    {
      "epoch": 0.13513513513513514,
      "grad_norm": 2.8412337980778553,
      "learning_rate": 4.347826086956522e-05,
      "loss": 0.2666,
      "step": 20
    },
    {
      "epoch": 0.16891891891891891,
      "grad_norm": 4.90451510502916,
      "learning_rate": 4.9997494236918504e-05,
      "loss": 0.3582,
      "step": 25
    },
    {
      "epoch": 0.20270270270270271,
      "grad_norm": 22.780898735414855,
      "learning_rate": 4.996931081151707e-05,
      "loss": 0.3768,
      "step": 30
    },
    {
      "epoch": 0.23648648648648649,
      "grad_norm": 3.320519938493738,
      "learning_rate": 4.990985111773183e-05,
      "loss": 0.4268,
      "step": 35
    },
    {
      "epoch": 0.2702702702702703,
      "grad_norm": 2.5294359467709095,
      "learning_rate": 4.981919792077782e-05,
      "loss": 0.2873,
      "step": 40
    },
    {
      "epoch": 0.30405405405405406,
      "grad_norm": 2.3280683716637873,
      "learning_rate": 4.969747740582118e-05,
      "loss": 0.2544,
      "step": 45
    },
    {
      "epoch": 0.33783783783783783,
      "grad_norm": 2.8289139949817335,
      "learning_rate": 4.95448590023351e-05,
      "loss": 0.2697,
      "step": 50
    },
    {
      "epoch": 0.3716216216216216,
      "grad_norm": 2.1629521504260993,
      "learning_rate": 4.936155514826161e-05,
      "loss": 0.3052,
      "step": 55
    },
    {
      "epoch": 0.40540540540540543,
      "grad_norm": 1.9564591527643984,
      "learning_rate": 4.914782099430755e-05,
      "loss": 0.2319,
      "step": 60
    },
    {
      "epoch": 0.4391891891891892,
      "grad_norm": 2.6620448846059714,
      "learning_rate": 4.890395404878627e-05,
      "loss": 0.2613,
      "step": 65
    },
    {
      "epoch": 0.47297297297297297,
      "grad_norm": 2.333611941701714,
      "learning_rate": 4.863029376349949e-05,
      "loss": 0.2595,
      "step": 70
    },
    {
      "epoch": 0.5067567567567568,
      "grad_norm": 2.5677617585089147,
      "learning_rate": 4.8327221061235635e-05,
      "loss": 0.2733,
      "step": 75
    },
    {
      "epoch": 0.5405405405405406,
      "grad_norm": 1.8945353614021259,
      "learning_rate": 4.799515780554253e-05,
      "loss": 0.2298,
      "step": 80
    },
    {
      "epoch": 0.5743243243243243,
      "grad_norm": 1.9687212656745774,
      "learning_rate": 4.763456621351229e-05,
      "loss": 0.2657,
      "step": 85
    },
    {
      "epoch": 0.6081081081081081,
      "grad_norm": 2.2237600068784653,
      "learning_rate": 4.724594821239601e-05,
      "loss": 0.244,
      "step": 90
    },
    {
      "epoch": 0.6418918918918919,
      "grad_norm": 2.1737571541215295,
      "learning_rate": 4.6829844740943586e-05,
      "loss": 0.2409,
      "step": 95
    },
    {
      "epoch": 0.6756756756756757,
      "grad_norm": 1.7570800060495113,
      "learning_rate": 4.6386834996441395e-05,
      "loss": 0.2054,
      "step": 100
    },
    {
      "epoch": 0.7094594594594594,
      "grad_norm": 1.5936369087529354,
      "learning_rate": 4.5917535628495714e-05,
      "loss": 0.2658,
      "step": 105
    },
    {
      "epoch": 0.7432432432432432,
      "grad_norm": 1.687954605448381,
      "learning_rate": 4.542259988068434e-05,
      "loss": 0.2306,
      "step": 110
    },
    {
      "epoch": 0.777027027027027,
      "grad_norm": 1.6798383407257704,
      "learning_rate": 4.4902716681270805e-05,
      "loss": 0.2051,
      "step": 115
    },
    {
      "epoch": 0.8108108108108109,
      "grad_norm": 1.9894543465428474,
      "learning_rate": 4.435860968424745e-05,
      "loss": 0.2885,
      "step": 120
    },
    {
      "epoch": 0.8445945945945946,
      "grad_norm": 1.5663367660943097,
      "learning_rate": 4.379103626204153e-05,
      "loss": 0.2412,
      "step": 125
    },
    {
      "epoch": 0.8783783783783784,
      "grad_norm": 1.4832323634362194,
      "learning_rate": 4.320078645128699e-05,
      "loss": 0.2374,
      "step": 130
    },
    {
      "epoch": 0.9121621621621622,
      "grad_norm": 1.999609063337418,
      "learning_rate": 4.258868185312901e-05,
      "loss": 0.21,
      "step": 135
    },
    {
      "epoch": 0.9459459459459459,
      "grad_norm": 1.449925989442346,
      "learning_rate": 4.195557448959231e-05,
      "loss": 0.2533,
      "step": 140
    },
    {
      "epoch": 0.9797297297297297,
      "grad_norm": 1.278422978606916,
      "learning_rate": 4.130234561760477e-05,
      "loss": 0.235,
      "step": 145
    },
    {
      "epoch": 1.0135135135135136,
      "grad_norm": 1.1102776403418126,
      "learning_rate": 4.0629904502327556e-05,
      "loss": 0.2101,
      "step": 150
    },
    {
      "epoch": 1.0472972972972974,
      "grad_norm": 1.452689044129832,
      "learning_rate": 3.993918715149896e-05,
      "loss": 0.1268,
      "step": 155
    },
    {
      "epoch": 1.0810810810810811,
      "grad_norm": 2.564080697825406,
      "learning_rate": 3.923115501255381e-05,
      "loss": 0.1504,
      "step": 160
    },
    {
      "epoch": 1.114864864864865,
      "grad_norm": 1.4284081142136367,
      "learning_rate": 3.8506793634331925e-05,
      "loss": 0.1501,
      "step": 165
    },
    {
      "epoch": 1.1486486486486487,
      "grad_norm": 1.3921371445923525,
      "learning_rate": 3.7767111295238555e-05,
      "loss": 0.1517,
      "step": 170
    },
    {
      "epoch": 1.1824324324324325,
      "grad_norm": 1.0189911635230715,
      "learning_rate": 3.701313759976626e-05,
      "loss": 0.1482,
      "step": 175
    },
    {
      "epoch": 1.2162162162162162,
      "grad_norm": 1.4942069614270297,
      "learning_rate": 3.624592204533184e-05,
      "loss": 0.1297,
      "step": 180
    },
    {
      "epoch": 1.25,
      "grad_norm": 1.3920628246297202,
      "learning_rate": 3.546653256142321e-05,
      "loss": 0.1325,
      "step": 185
    },
    {
      "epoch": 1.2837837837837838,
      "grad_norm": 1.0187887226878,
      "learning_rate": 3.467605402308966e-05,
      "loss": 0.142,
      "step": 190
    },
    {
      "epoch": 1.3175675675675675,
      "grad_norm": 1.1984277342370044,
      "learning_rate": 3.3875586740844675e-05,
      "loss": 0.1347,
      "step": 195
    },
    {
      "epoch": 1.3513513513513513,
      "grad_norm": 1.1170028929426064,
      "learning_rate": 3.3066244929083246e-05,
      "loss": 0.1304,
      "step": 200
    },
    {
      "epoch": 1.385135135135135,
      "grad_norm": 1.0186976243308743,
      "learning_rate": 3.2249155155145665e-05,
      "loss": 0.141,
      "step": 205
    },
    {
      "epoch": 1.4189189189189189,
      "grad_norm": 1.2898941169402762,
      "learning_rate": 3.142545477118649e-05,
      "loss": 0.1354,
      "step": 210
    },
    {
      "epoch": 1.4527027027027026,
      "grad_norm": 0.9961209346671913,
      "learning_rate": 3.059629033103166e-05,
      "loss": 0.1302,
      "step": 215
    },
    {
      "epoch": 1.4864864864864864,
      "grad_norm": 0.9875250619369351,
      "learning_rate": 2.9762815994227135e-05,
      "loss": 0.1403,
      "step": 220
    },
    {
      "epoch": 1.5202702702702702,
      "grad_norm": 1.0982139109346354,
      "learning_rate": 2.8926191919500854e-05,
      "loss": 0.1235,
      "step": 225
    },
    {
      "epoch": 1.554054054054054,
      "grad_norm": 0.9668045928793142,
      "learning_rate": 2.808758264987406e-05,
      "loss": 0.1257,
      "step": 230
    },
    {
      "epoch": 1.5878378378378377,
      "grad_norm": 0.8581177979692445,
      "learning_rate": 2.7248155491669854e-05,
      "loss": 0.1174,
      "step": 235
    },
    {
      "epoch": 1.6216216216216215,
      "grad_norm": 1.1046256528106388,
      "learning_rate": 2.6409078889675382e-05,
      "loss": 0.1214,
      "step": 240
    },
    {
      "epoch": 1.6554054054054053,
      "grad_norm": 0.880781318496101,
      "learning_rate": 2.5571520800719363e-05,
      "loss": 0.1376,
      "step": 245
    },
    {
      "epoch": 1.689189189189189,
      "grad_norm": 0.8458236531048872,
      "learning_rate": 2.473664706792873e-05,
      "loss": 0.1026,
      "step": 250
    },
    {
      "epoch": 1.722972972972973,
      "grad_norm": 3.38684923966137,
      "learning_rate": 2.390561979792763e-05,
      "loss": 0.1335,
      "step": 255
    },
    {
      "epoch": 1.7567567567567568,
      "grad_norm": 0.8747529589638384,
      "learning_rate": 2.3079595743237243e-05,
      "loss": 0.1089,
      "step": 260
    },
    {
      "epoch": 1.7905405405405406,
      "grad_norm": 1.061409594201986,
      "learning_rate": 2.2259724692128448e-05,
      "loss": 0.1211,
      "step": 265
    },
    {
      "epoch": 1.8243243243243243,
      "grad_norm": 0.7187465056471518,
      "learning_rate": 2.144714786816836e-05,
      "loss": 0.093,
      "step": 270
    },
    {
      "epoch": 1.8581081081081081,
      "grad_norm": 0.7427271054292853,
      "learning_rate": 2.0642996341688498e-05,
      "loss": 0.1061,
      "step": 275
    },
    {
      "epoch": 1.8918918918918919,
      "grad_norm": 0.9102492961843579,
      "learning_rate": 1.9848389455385845e-05,
      "loss": 0.1117,
      "step": 280
    },
    {
      "epoch": 1.9256756756756757,
      "grad_norm": 0.6300885493957281,
      "learning_rate": 1.9064433266248287e-05,
      "loss": 0.1089,
      "step": 285
    },
    {
      "epoch": 1.9594594594594594,
      "grad_norm": 0.8142160296904185,
      "learning_rate": 1.829221900597305e-05,
      "loss": 0.1039,
      "step": 290
    },
    {
      "epoch": 1.9932432432432432,
      "grad_norm": 2.6048425820022088,
      "learning_rate": 1.7532821562021373e-05,
      "loss": 0.1028,
      "step": 295
    },
    {
      "epoch": 2.027027027027027,
      "grad_norm": 0.7413049752502284,
      "learning_rate": 1.6787297981423618e-05,
      "loss": 0.0681,
      "step": 300
    },
    {
      "epoch": 2.060810810810811,
      "grad_norm": 1.1954219728615871,
      "learning_rate": 1.6056685999417336e-05,
      "loss": 0.0589,
      "step": 305
    },
    {
      "epoch": 2.0945945945945947,
      "grad_norm": 0.5252454560354707,
      "learning_rate": 1.5342002594966657e-05,
      "loss": 0.0606,
      "step": 310
    },
    {
      "epoch": 2.1283783783783785,
      "grad_norm": 0.6859669085189071,
      "learning_rate": 1.4644242575173363e-05,
      "loss": 0.056,
      "step": 315
    },
    {
      "epoch": 2.1621621621621623,
      "grad_norm": 0.7090167332441359,
      "learning_rate": 1.3964377190550165e-05,
      "loss": 0.0608,
      "step": 320
    },
    {
      "epoch": 2.195945945945946,
      "grad_norm": 0.47669323187220364,
      "learning_rate": 1.330335278308384e-05,
      "loss": 0.0462,
      "step": 325
    },
    {
      "epoch": 2.22972972972973,
      "grad_norm": 0.3967741627628627,
      "learning_rate": 1.2662089468969717e-05,
      "loss": 0.0506,
      "step": 330
    },
    {
      "epoch": 2.2635135135135136,
      "grad_norm": 0.7073240580865503,
      "learning_rate": 1.2041479857851485e-05,
      "loss": 0.0584,
      "step": 335
    },
    {
      "epoch": 2.2972972972972974,
      "grad_norm": 0.534095245997379,
      "learning_rate": 1.14423878103487e-05,
      "loss": 0.0448,
      "step": 340
    },
    {
      "epoch": 2.331081081081081,
      "grad_norm": 0.5592832360622514,
      "learning_rate": 1.086564723560177e-05,
      "loss": 0.0515,
      "step": 345
    },
    {
      "epoch": 2.364864864864865,
      "grad_norm": 0.5012383371857895,
      "learning_rate": 1.031206093050798e-05,
      "loss": 0.0468,
      "step": 350
    },
    {
      "epoch": 2.3986486486486487,
      "grad_norm": 0.5873919693251372,
      "learning_rate": 9.78239946226439e-06,
      "loss": 0.0481,
      "step": 355
    },
    {
      "epoch": 2.4324324324324325,
      "grad_norm": 0.7663749270398796,
      "learning_rate": 9.277400095772979e-06,
      "loss": 0.051,
      "step": 360
    },
    {
      "epoch": 2.4662162162162162,
      "grad_norm": 0.593558971064878,
      "learning_rate": 8.797765767401159e-06,
      "loss": 0.0554,
      "step": 365
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.5159284273389936,
      "learning_rate": 8.34416410652601e-06,
      "loss": 0.0448,
      "step": 370
    },
    {
      "epoch": 2.5337837837837838,
      "grad_norm": 0.6366287598455895,
      "learning_rate": 7.917226506224227e-06,
      "loss": 0.0472,
      "step": 375
    },
    {
      "epoch": 2.5675675675675675,
      "grad_norm": 0.9410891314952453,
      "learning_rate": 7.51754724440146e-06,
      "loss": 0.0502,
      "step": 380
    },
    {
      "epoch": 2.6013513513513513,
      "grad_norm": 0.47104911363203844,
      "learning_rate": 7.145682656584196e-06,
      "loss": 0.0494,
      "step": 385
    },
    {
      "epoch": 2.635135135135135,
      "grad_norm": 0.48833508181607244,
      "learning_rate": 6.802150361525786e-06,
      "loss": 0.0454,
      "step": 390
    },
    {
      "epoch": 2.668918918918919,
      "grad_norm": 0.5274580172661194,
      "learning_rate": 6.487428540704467e-06,
      "loss": 0.0405,
      "step": 395
    },
    {
      "epoch": 2.7027027027027026,
      "grad_norm": 0.4401735004442807,
      "learning_rate": 6.201955272716275e-06,
      "loss": 0.0387,
      "step": 400
    },
    {
      "epoch": 2.7364864864864864,
      "grad_norm": 0.5222808193914125,
      "learning_rate": 5.946127923489382e-06,
      "loss": 0.0501,
      "step": 405
    },
    {
      "epoch": 2.77027027027027,
      "grad_norm": 0.885902051673082,
      "learning_rate": 5.720302593168628e-06,
      "loss": 0.05,
      "step": 410
    },
    {
      "epoch": 2.804054054054054,
      "grad_norm": 0.4765687214481478,
      "learning_rate": 5.524793620440148e-06,
      "loss": 0.038,
      "step": 415
    },
    {
      "epoch": 2.8378378378378377,
      "grad_norm": 0.5896068685310154,
      "learning_rate": 5.3598731449861e-06,
      "loss": 0.0466,
      "step": 420
    },
    {
      "epoch": 2.8716216216216215,
      "grad_norm": 0.4787445681998641,
      "learning_rate": 5.225770728678475e-06,
      "loss": 0.0486,
      "step": 425
    },
    {
      "epoch": 2.9054054054054053,
      "grad_norm": 0.7298521032646316,
      "learning_rate": 5.122673036039321e-06,
      "loss": 0.0534,
      "step": 430
    },
    {
      "epoch": 2.939189189189189,
      "grad_norm": 0.5563651328125397,
      "learning_rate": 5.050723574412132e-06,
      "loss": 0.0399,
      "step": 435
    },
    {
      "epoch": 2.972972972972973,
      "grad_norm": 0.42205363383969957,
      "learning_rate": 5.010022494206098e-06,
      "loss": 0.0405,
      "step": 440
    },
    {
      "epoch": 3.0,
      "step": 444,
      "total_flos": 311859968540672.0,
      "train_loss": 0.15194854576576938,
      "train_runtime": 8344.2681,
      "train_samples_per_second": 3.401,
      "train_steps_per_second": 0.053
    }
  ],
  "logging_steps": 5,
  "max_steps": 444,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 311859968540672.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}