|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9936575052854124,
  "eval_steps": 500,
  "global_step": 354,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.042283298097251586,
      "grad_norm": 1.2008394002914429,
      "learning_rate": 4.9975392245612254e-05,
      "loss": 4.1473,
      "num_input_tokens_seen": 68384,
      "step": 5
    },
    {
      "epoch": 0.08456659619450317,
      "grad_norm": 1.1565380096435547,
      "learning_rate": 4.9901617425775067e-05,
      "loss": 4.021,
      "num_input_tokens_seen": 128224,
      "step": 10
    },
    {
      "epoch": 0.12684989429175475,
      "grad_norm": 0.972896158695221,
      "learning_rate": 4.9778820775100185e-05,
      "loss": 3.8212,
      "num_input_tokens_seen": 197760,
      "step": 15
    },
    {
      "epoch": 0.16913319238900634,
      "grad_norm": 1.0641047954559326,
      "learning_rate": 4.9607244033573156e-05,
      "loss": 3.7653,
      "num_input_tokens_seen": 262832,
      "step": 20
    },
    {
      "epoch": 0.21141649048625794,
      "grad_norm": 1.0994701385498047,
      "learning_rate": 4.93872249706591e-05,
      "loss": 3.7434,
      "num_input_tokens_seen": 333472,
      "step": 25
    },
    {
      "epoch": 0.2536997885835095,
      "grad_norm": 1.193864107131958,
      "learning_rate": 4.91191967203629e-05,
      "loss": 3.5488,
      "num_input_tokens_seen": 393616,
      "step": 30
    },
    {
      "epoch": 0.2959830866807611,
      "grad_norm": 1.0215297937393188,
      "learning_rate": 4.8803686928552736e-05,
      "loss": 3.5732,
      "num_input_tokens_seen": 458240,
      "step": 35
    },
    {
      "epoch": 0.3382663847780127,
      "grad_norm": 1.312198519706726,
      "learning_rate": 4.84413167142257e-05,
      "loss": 3.7108,
      "num_input_tokens_seen": 524576,
      "step": 40
    },
    {
      "epoch": 0.38054968287526425,
      "grad_norm": 1.3579617738723755,
      "learning_rate": 4.803279944676032e-05,
      "loss": 3.6871,
      "num_input_tokens_seen": 591856,
      "step": 45
    },
    {
      "epoch": 0.42283298097251587,
      "grad_norm": 1.4524191617965698,
      "learning_rate": 4.7578939341563095e-05,
      "loss": 3.286,
      "num_input_tokens_seen": 655648,
      "step": 50
    },
    {
      "epoch": 0.46511627906976744,
      "grad_norm": 1.248831033706665,
      "learning_rate": 4.70806298768736e-05,
      "loss": 3.5377,
      "num_input_tokens_seen": 721280,
      "step": 55
    },
    {
      "epoch": 0.507399577167019,
      "grad_norm": 1.2427473068237305,
      "learning_rate": 4.653885203484515e-05,
      "loss": 3.56,
      "num_input_tokens_seen": 784688,
      "step": 60
    },
    {
      "epoch": 0.5496828752642706,
      "grad_norm": 1.323653221130371,
      "learning_rate": 4.595467237036329e-05,
      "loss": 3.4937,
      "num_input_tokens_seen": 850848,
      "step": 65
    },
    {
      "epoch": 0.5919661733615222,
      "grad_norm": 1.3730229139328003,
      "learning_rate": 4.532924091140417e-05,
      "loss": 3.3823,
      "num_input_tokens_seen": 912480,
      "step": 70
    },
    {
      "epoch": 0.6342494714587738,
      "grad_norm": 1.5328903198242188,
      "learning_rate": 4.466378889506607e-05,
      "loss": 3.3798,
      "num_input_tokens_seen": 978448,
      "step": 75
    },
    {
      "epoch": 0.6765327695560254,
      "grad_norm": 1.4153543710708618,
      "learning_rate": 4.395962634373097e-05,
      "loss": 3.3044,
      "num_input_tokens_seen": 1041280,
      "step": 80
    },
    {
      "epoch": 0.718816067653277,
      "grad_norm": 1.6301600933074951,
      "learning_rate": 4.3218139486127854e-05,
      "loss": 3.3661,
      "num_input_tokens_seen": 1102224,
      "step": 85
    },
    {
      "epoch": 0.7610993657505285,
      "grad_norm": 1.6634522676467896,
      "learning_rate": 4.2440788028374624e-05,
      "loss": 3.3829,
      "num_input_tokens_seen": 1166576,
      "step": 90
    },
    {
      "epoch": 0.8033826638477801,
      "grad_norm": 1.4539167881011963,
      "learning_rate": 4.1629102280370904e-05,
      "loss": 3.2241,
      "num_input_tokens_seen": 1230096,
      "step": 95
    },
    {
      "epoch": 0.8456659619450317,
      "grad_norm": 1.453364372253418,
      "learning_rate": 4.0784680143198836e-05,
      "loss": 3.0931,
      "num_input_tokens_seen": 1297968,
      "step": 100
    },
    {
      "epoch": 0.8879492600422833,
      "grad_norm": 2.0559964179992676,
      "learning_rate": 3.990918396346254e-05,
      "loss": 3.3581,
      "num_input_tokens_seen": 1361760,
      "step": 105
    },
    {
      "epoch": 0.9302325581395349,
      "grad_norm": 2.0426218509674072,
      "learning_rate": 3.900433726075865e-05,
      "loss": 3.2308,
      "num_input_tokens_seen": 1422096,
      "step": 110
    },
    {
      "epoch": 0.9725158562367865,
      "grad_norm": 1.7696195840835571,
      "learning_rate": 3.8071921334720696e-05,
      "loss": 3.1427,
      "num_input_tokens_seen": 1491120,
      "step": 115
    },
    {
      "epoch": 1.014799154334038,
      "grad_norm": 2.3205981254577637,
      "learning_rate": 3.711377175831626e-05,
      "loss": 3.0988,
      "num_input_tokens_seen": 1556440,
      "step": 120
    },
    {
      "epoch": 1.0570824524312896,
      "grad_norm": 1.7631757259368896,
      "learning_rate": 3.613177476430079e-05,
      "loss": 3.0695,
      "num_input_tokens_seen": 1624200,
      "step": 125
    },
    {
      "epoch": 1.0993657505285412,
      "grad_norm": 2.32861328125,
      "learning_rate": 3.512786353194134e-05,
      "loss": 3.1109,
      "num_input_tokens_seen": 1686600,
      "step": 130
    },
    {
      "epoch": 1.1416490486257929,
      "grad_norm": 2.1291067600250244,
      "learning_rate": 3.410401438132056e-05,
      "loss": 3.116,
      "num_input_tokens_seen": 1755144,
      "step": 135
    },
    {
      "epoch": 1.1839323467230445,
      "grad_norm": 1.8158444166183472,
      "learning_rate": 3.3062242882712724e-05,
      "loss": 2.9528,
      "num_input_tokens_seen": 1819672,
      "step": 140
    },
    {
      "epoch": 1.226215644820296,
      "grad_norm": 2.161515951156616,
      "learning_rate": 3.200459988869111e-05,
      "loss": 2.8494,
      "num_input_tokens_seen": 1886136,
      "step": 145
    },
    {
      "epoch": 1.2684989429175475,
      "grad_norm": 2.321112632751465,
      "learning_rate": 3.093316749677788e-05,
      "loss": 3.1525,
      "num_input_tokens_seen": 1947656,
      "step": 150
    },
    {
      "epoch": 1.3107822410147991,
      "grad_norm": 2.4329445362091064,
      "learning_rate": 2.985005495058446e-05,
      "loss": 2.8991,
      "num_input_tokens_seen": 2013272,
      "step": 155
    },
    {
      "epoch": 1.3530655391120507,
      "grad_norm": 2.7929720878601074,
      "learning_rate": 2.875739448751176e-05,
      "loss": 3.026,
      "num_input_tokens_seen": 2079816,
      "step": 160
    },
    {
      "epoch": 1.3953488372093024,
      "grad_norm": 2.6059908866882324,
      "learning_rate": 2.7657337141184138e-05,
      "loss": 2.8813,
      "num_input_tokens_seen": 2146008,
      "step": 165
    },
    {
      "epoch": 1.437632135306554,
      "grad_norm": 2.831233501434326,
      "learning_rate": 2.655204850688085e-05,
      "loss": 2.7721,
      "num_input_tokens_seen": 2212840,
      "step": 170
    },
    {
      "epoch": 1.4799154334038054,
      "grad_norm": 2.757953643798828,
      "learning_rate": 2.5443704478301154e-05,
      "loss": 2.737,
      "num_input_tokens_seen": 2278824,
      "step": 175
    },
    {
      "epoch": 1.522198731501057,
      "grad_norm": 2.8655178546905518,
      "learning_rate": 2.433448696405563e-05,
      "loss": 2.6645,
      "num_input_tokens_seen": 2339768,
      "step": 180
    },
    {
      "epoch": 1.5644820295983086,
      "grad_norm": 2.8346216678619385,
      "learning_rate": 2.3226579592316538e-05,
      "loss": 2.7233,
      "num_input_tokens_seen": 2407976,
      "step": 185
    },
    {
      "epoch": 1.6067653276955602,
      "grad_norm": 2.432692766189575,
      "learning_rate": 2.2122163412082927e-05,
      "loss": 2.7463,
      "num_input_tokens_seen": 2469176,
      "step": 190
    },
    {
      "epoch": 1.6490486257928119,
      "grad_norm": 3.0815346240997314,
      "learning_rate": 2.1023412599523204e-05,
      "loss": 2.8348,
      "num_input_tokens_seen": 2528152,
      "step": 195
    },
    {
      "epoch": 1.6913319238900635,
      "grad_norm": 3.362581968307495,
      "learning_rate": 1.993249017784766e-05,
      "loss": 2.7732,
      "num_input_tokens_seen": 2588984,
      "step": 200
    },
    {
      "epoch": 1.733615221987315,
      "grad_norm": 3.2196719646453857,
      "learning_rate": 1.8851543759137007e-05,
      "loss": 2.7693,
      "num_input_tokens_seen": 2657016,
      "step": 205
    },
    {
      "epoch": 1.7758985200845667,
      "grad_norm": 2.6926162242889404,
      "learning_rate": 1.778270131650948e-05,
      "loss": 2.6524,
      "num_input_tokens_seen": 2719656,
      "step": 210
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 2.9597244262695312,
      "learning_rate": 1.672806699494966e-05,
      "loss": 2.7338,
      "num_input_tokens_seen": 2786168,
      "step": 215
    },
    {
      "epoch": 1.8604651162790697,
      "grad_norm": 3.4195263385772705,
      "learning_rate": 1.5689716969045848e-05,
      "loss": 2.7467,
      "num_input_tokens_seen": 2849400,
      "step": 220
    },
    {
      "epoch": 1.9027484143763214,
      "grad_norm": 2.8715097904205322,
      "learning_rate": 1.4669695355790552e-05,
      "loss": 2.5769,
      "num_input_tokens_seen": 2917000,
      "step": 225
    },
    {
      "epoch": 1.945031712473573,
      "grad_norm": 3.514866352081299,
      "learning_rate": 1.3670010190490073e-05,
      "loss": 2.784,
      "num_input_tokens_seen": 2980712,
      "step": 230
    },
    {
      "epoch": 1.9873150105708244,
      "grad_norm": 3.751493215560913,
      "learning_rate": 1.2692629473705453e-05,
      "loss": 2.7729,
      "num_input_tokens_seen": 3045848,
      "step": 235
    },
    {
      "epoch": 2.029598308668076,
      "grad_norm": 2.950978994369507,
      "learning_rate": 1.173947729700644e-05,
      "loss": 2.8171,
      "num_input_tokens_seen": 3105128,
      "step": 240
    },
    {
      "epoch": 2.0718816067653276,
      "grad_norm": 3.061615467071533,
      "learning_rate": 1.081243005516571e-05,
      "loss": 2.5436,
      "num_input_tokens_seen": 3175016,
      "step": 245
    },
    {
      "epoch": 2.1141649048625792,
      "grad_norm": 3.8741908073425293,
      "learning_rate": 9.913312752249903e-06,
      "loss": 2.4681,
      "num_input_tokens_seen": 3245608,
      "step": 250
    },
    {
      "epoch": 2.156448202959831,
      "grad_norm": 4.65464448928833,
      "learning_rate": 9.043895408879505e-06,
      "loss": 2.4036,
      "num_input_tokens_seen": 3308040,
      "step": 255
    },
    {
      "epoch": 2.1987315010570825,
      "grad_norm": 3.1693899631500244,
      "learning_rate": 8.20588957773018e-06,
      "loss": 2.4283,
      "num_input_tokens_seen": 3376152,
      "step": 260
    },
    {
      "epoch": 2.241014799154334,
      "grad_norm": 3.5537617206573486,
      "learning_rate": 7.400944974135427e-06,
      "loss": 2.6161,
      "num_input_tokens_seen": 3438504,
      "step": 265
    },
    {
      "epoch": 2.2832980972515857,
      "grad_norm": 3.5316078662872314,
      "learning_rate": 6.6306462284233234e-06,
      "loss": 2.6151,
      "num_input_tokens_seen": 3508488,
      "step": 270
    },
    {
      "epoch": 2.3255813953488373,
      "grad_norm": 3.0121572017669678,
      "learning_rate": 5.896509766381028e-06,
      "loss": 2.5051,
      "num_input_tokens_seen": 3572248,
      "step": 275
    },
    {
      "epoch": 2.367864693446089,
      "grad_norm": 3.4623148441314697,
      "learning_rate": 5.199980823988157e-06,
      "loss": 2.535,
      "num_input_tokens_seen": 3627304,
      "step": 280
    },
    {
      "epoch": 2.41014799154334,
      "grad_norm": 2.7531158924102783,
      "learning_rate": 4.542430602295774e-06,
      "loss": 2.4052,
      "num_input_tokens_seen": 3696504,
      "step": 285
    },
    {
      "epoch": 2.452431289640592,
      "grad_norm": 3.238701105117798,
      "learning_rate": 3.925153568052123e-06,
      "loss": 2.6136,
      "num_input_tokens_seen": 3757976,
      "step": 290
    },
    {
      "epoch": 2.4947145877378434,
      "grad_norm": 3.5991199016571045,
      "learning_rate": 3.3493649053890326e-06,
      "loss": 2.2962,
      "num_input_tokens_seen": 3829192,
      "step": 295
    },
    {
      "epoch": 2.536997885835095,
      "grad_norm": 3.9785468578338623,
      "learning_rate": 2.8161981235857143e-06,
      "loss": 2.6799,
      "num_input_tokens_seen": 3900648,
      "step": 300
    },
    {
      "epoch": 2.5792811839323466,
      "grad_norm": 3.7737724781036377,
      "learning_rate": 2.3267028256193036e-06,
      "loss": 2.3375,
      "num_input_tokens_seen": 3970936,
      "step": 305
    },
    {
      "epoch": 2.6215644820295982,
      "grad_norm": 3.0493130683898926,
      "learning_rate": 1.881842641895104e-06,
      "loss": 2.5113,
      "num_input_tokens_seen": 4033192,
      "step": 310
    },
    {
      "epoch": 2.66384778012685,
      "grad_norm": 4.20770788192749,
      "learning_rate": 1.4824933332241692e-06,
      "loss": 2.5114,
      "num_input_tokens_seen": 4095144,
      "step": 315
    },
    {
      "epoch": 2.7061310782241015,
      "grad_norm": 3.411532402038574,
      "learning_rate": 1.129441066782702e-06,
      "loss": 2.5342,
      "num_input_tokens_seen": 4159320,
      "step": 320
    },
    {
      "epoch": 2.748414376321353,
      "grad_norm": 3.43635630607605,
      "learning_rate": 8.233808684473959e-07,
      "loss": 2.4773,
      "num_input_tokens_seen": 4227128,
      "step": 325
    },
    {
      "epoch": 2.7906976744186047,
      "grad_norm": 3.7875025272369385,
      "learning_rate": 5.649152545533332e-07,
      "loss": 2.4826,
      "num_input_tokens_seen": 4293848,
      "step": 330
    },
    {
      "epoch": 2.8329809725158563,
      "grad_norm": 3.2557740211486816,
      "learning_rate": 3.5455304576806347e-07,
      "loss": 2.5766,
      "num_input_tokens_seen": 4358088,
      "step": 335
    },
    {
      "epoch": 2.875264270613108,
      "grad_norm": 3.3667635917663574,
      "learning_rate": 1.927083654168854e-07,
      "loss": 2.5021,
      "num_input_tokens_seen": 4421432,
      "step": 340
    },
    {
      "epoch": 2.9175475687103596,
      "grad_norm": 3.3414418697357178,
      "learning_rate": 7.969982423124689e-08,
      "loss": 2.6718,
      "num_input_tokens_seen": 4482632,
      "step": 345
    },
    {
      "epoch": 2.9598308668076108,
      "grad_norm": 3.738370895385742,
      "learning_rate": 1.5749893125160954e-08,
      "loss": 2.3902,
      "num_input_tokens_seen": 4547096,
      "step": 350
    }
  ],
  "logging_steps": 5,
  "max_steps": 354,
  "num_input_tokens_seen": 4597544,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.475245368337203e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
|
|