{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 200,
  "global_step": 762,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.003940886699507389,
      "grad_norm": 1.078125,
      "learning_rate": 0.0,
      "loss": 1.7215,
      "step": 1
    },
    {
      "epoch": 0.019704433497536946,
      "grad_norm": 1.078125,
      "learning_rate": 1.038961038961039e-05,
      "loss": 1.7672,
      "step": 5
    },
    {
      "epoch": 0.03940886699507389,
      "grad_norm": 0.99609375,
      "learning_rate": 2.3376623376623376e-05,
      "loss": 1.7749,
      "step": 10
    },
    {
      "epoch": 0.059113300492610835,
      "grad_norm": 0.8828125,
      "learning_rate": 3.6363636363636364e-05,
      "loss": 1.7334,
      "step": 15
    },
    {
      "epoch": 0.07881773399014778,
      "grad_norm": 0.796875,
      "learning_rate": 4.9350649350649355e-05,
      "loss": 1.7012,
      "step": 20
    },
    {
      "epoch": 0.09852216748768473,
      "grad_norm": 0.59765625,
      "learning_rate": 6.233766233766233e-05,
      "loss": 1.6683,
      "step": 25
    },
    {
      "epoch": 0.11822660098522167,
      "grad_norm": 0.4765625,
      "learning_rate": 7.532467532467533e-05,
      "loss": 1.6283,
      "step": 30
    },
    {
      "epoch": 0.13793103448275862,
      "grad_norm": 0.341796875,
      "learning_rate": 8.831168831168831e-05,
      "loss": 1.5815,
      "step": 35
    },
    {
      "epoch": 0.15763546798029557,
      "grad_norm": 0.2578125,
      "learning_rate": 0.0001012987012987013,
      "loss": 1.5755,
      "step": 40
    },
    {
      "epoch": 0.17733990147783252,
      "grad_norm": 0.2177734375,
      "learning_rate": 0.00011428571428571428,
      "loss": 1.5314,
      "step": 45
    },
    {
      "epoch": 0.19704433497536947,
      "grad_norm": 0.185546875,
      "learning_rate": 0.00012727272727272728,
      "loss": 1.5258,
      "step": 50
    },
    {
      "epoch": 0.21674876847290642,
      "grad_norm": 0.1611328125,
      "learning_rate": 0.00014025974025974028,
      "loss": 1.5131,
      "step": 55
    },
    {
      "epoch": 0.23645320197044334,
      "grad_norm": 0.1484375,
      "learning_rate": 0.00015324675324675325,
      "loss": 1.498,
      "step": 60
    },
    {
      "epoch": 0.2561576354679803,
      "grad_norm": 0.1455078125,
      "learning_rate": 0.00016623376623376625,
      "loss": 1.4953,
      "step": 65
    },
    {
      "epoch": 0.27586206896551724,
      "grad_norm": 0.1318359375,
      "learning_rate": 0.00017922077922077922,
      "loss": 1.4889,
      "step": 70
    },
    {
      "epoch": 0.2955665024630542,
      "grad_norm": 0.1279296875,
      "learning_rate": 0.00019220779220779222,
      "loss": 1.4768,
      "step": 75
    },
    {
      "epoch": 0.31527093596059114,
      "grad_norm": 0.12158203125,
      "learning_rate": 0.00019999579326114222,
      "loss": 1.4826,
      "step": 80
    },
    {
      "epoch": 0.33497536945812806,
      "grad_norm": 0.1259765625,
      "learning_rate": 0.00019994847151359784,
      "loss": 1.4514,
      "step": 85
    },
    {
      "epoch": 0.35467980295566504,
      "grad_norm": 0.115234375,
      "learning_rate": 0.0001998485945607536,
      "loss": 1.469,
      "step": 90
    },
    {
      "epoch": 0.37438423645320196,
      "grad_norm": 0.11865234375,
      "learning_rate": 0.00019969621492020869,
      "loss": 1.4381,
      "step": 95
    },
    {
      "epoch": 0.39408866995073893,
      "grad_norm": 0.12890625,
      "learning_rate": 0.00019949141271668306,
      "loss": 1.4513,
      "step": 100
    },
    {
      "epoch": 0.41379310344827586,
      "grad_norm": 0.107421875,
      "learning_rate": 0.00019923429563988614,
      "loss": 1.4403,
      "step": 105
    },
    {
      "epoch": 0.43349753694581283,
      "grad_norm": 0.1083984375,
      "learning_rate": 0.00019892499888789098,
      "loss": 1.4521,
      "step": 110
    },
    {
      "epoch": 0.45320197044334976,
      "grad_norm": 0.12158203125,
      "learning_rate": 0.00019856368509604412,
      "loss": 1.4494,
      "step": 115
    },
    {
      "epoch": 0.4729064039408867,
      "grad_norm": 0.115234375,
      "learning_rate": 0.00019815054425144815,
      "loss": 1.4289,
      "step": 120
    },
    {
      "epoch": 0.49261083743842365,
      "grad_norm": 0.1103515625,
      "learning_rate": 0.00019768579359306205,
      "loss": 1.4261,
      "step": 125
    },
    {
      "epoch": 0.5123152709359606,
      "grad_norm": 0.1123046875,
      "learning_rate": 0.00019716967749747207,
      "loss": 1.4212,
      "step": 130
    },
    {
      "epoch": 0.5320197044334976,
      "grad_norm": 0.1044921875,
      "learning_rate": 0.00019660246735039266,
      "loss": 1.4517,
      "step": 135
    },
    {
      "epoch": 0.5517241379310345,
      "grad_norm": 0.1171875,
      "learning_rate": 0.00019598446140396605,
      "loss": 1.4208,
      "step": 140
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 0.1103515625,
      "learning_rate": 0.00019531598461993392,
      "loss": 1.4357,
      "step": 145
    },
    {
      "epoch": 0.5911330049261084,
      "grad_norm": 0.1015625,
      "learning_rate": 0.00019459738849876543,
      "loss": 1.4108,
      "step": 150
    },
    {
      "epoch": 0.6108374384236454,
      "grad_norm": 0.11376953125,
      "learning_rate": 0.00019382905089482995,
      "loss": 1.4396,
      "step": 155
    },
    {
      "epoch": 0.6305418719211823,
      "grad_norm": 0.0986328125,
      "learning_rate": 0.00019301137581771266,
      "loss": 1.4114,
      "step": 160
    },
    {
      "epoch": 0.6502463054187192,
      "grad_norm": 0.10302734375,
      "learning_rate": 0.00019214479321977697,
      "loss": 1.4221,
      "step": 165
    },
    {
      "epoch": 0.6699507389162561,
      "grad_norm": 0.10400390625,
      "learning_rate": 0.00019122975877008567,
      "loss": 1.4234,
      "step": 170
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 0.1044921875,
      "learning_rate": 0.00019026675361479969,
      "loss": 1.4378,
      "step": 175
    },
    {
      "epoch": 0.7093596059113301,
      "grad_norm": 0.10400390625,
      "learning_rate": 0.0001892562841241804,
      "loss": 1.4178,
      "step": 180
    },
    {
      "epoch": 0.729064039408867,
      "grad_norm": 0.10693359375,
      "learning_rate": 0.00018819888162632838,
      "loss": 1.4221,
      "step": 185
    },
    {
      "epoch": 0.7487684729064039,
      "grad_norm": 0.1005859375,
      "learning_rate": 0.00018709510212779903,
      "loss": 1.4267,
      "step": 190
    },
    {
      "epoch": 0.7684729064039408,
      "grad_norm": 0.107421875,
      "learning_rate": 0.0001859455260212414,
      "loss": 1.4226,
      "step": 195
    },
    {
      "epoch": 0.7881773399014779,
      "grad_norm": 0.10986328125,
      "learning_rate": 0.00018475075778021438,
      "loss": 1.4328,
      "step": 200
    },
    {
      "epoch": 0.7881773399014779,
      "eval_loss": 1.4307746887207031,
      "eval_runtime": 143.9646,
      "eval_samples_per_second": 50.116,
      "eval_steps_per_second": 3.133,
      "step": 200
    },
    {
      "epoch": 0.8078817733990148,
      "grad_norm": 0.10009765625,
      "learning_rate": 0.00018351142564134078,
      "loss": 1.4341,
      "step": 205
    },
    {
      "epoch": 0.8275862068965517,
      "grad_norm": 0.0966796875,
      "learning_rate": 0.0001822281812739659,
      "loss": 1.4172,
      "step": 210
    },
    {
      "epoch": 0.8472906403940886,
      "grad_norm": 0.10009765625,
      "learning_rate": 0.00018090169943749476,
      "loss": 1.41,
      "step": 215
    },
    {
      "epoch": 0.8669950738916257,
      "grad_norm": 0.10400390625,
      "learning_rate": 0.00017953267762658827,
      "loss": 1.4099,
      "step": 220
    },
    {
      "epoch": 0.8866995073891626,
      "grad_norm": 0.099609375,
      "learning_rate": 0.00017812183570440428,
      "loss": 1.4176,
      "step": 225
    },
    {
      "epoch": 0.9064039408866995,
      "grad_norm": 0.10546875,
      "learning_rate": 0.00017666991552407724,
      "loss": 1.4101,
      "step": 230
    },
    {
      "epoch": 0.9261083743842364,
      "grad_norm": 0.1044921875,
      "learning_rate": 0.0001751776805386344,
      "loss": 1.4029,
      "step": 235
    },
    {
      "epoch": 0.9458128078817734,
      "grad_norm": 0.10009765625,
      "learning_rate": 0.000173645915399555,
      "loss": 1.3944,
      "step": 240
    },
    {
      "epoch": 0.9655172413793104,
      "grad_norm": 0.0986328125,
      "learning_rate": 0.00017207542554418227,
      "loss": 1.4001,
      "step": 245
    },
    {
      "epoch": 0.9852216748768473,
      "grad_norm": 0.103515625,
      "learning_rate": 0.000170467036772206,
      "loss": 1.4076,
      "step": 250
    },
    {
      "epoch": 1.0039408866995074,
      "grad_norm": 0.10009765625,
      "learning_rate": 0.00016882159481143802,
      "loss": 1.4025,
      "step": 255
    },
    {
      "epoch": 1.0236453201970444,
      "grad_norm": 0.09423828125,
      "learning_rate": 0.00016713996487310916,
      "loss": 1.405,
      "step": 260
    },
    {
      "epoch": 1.0433497536945813,
      "grad_norm": 0.09619140625,
      "learning_rate": 0.00016542303119692129,
      "loss": 1.4017,
      "step": 265
    },
    {
      "epoch": 1.0630541871921182,
      "grad_norm": 0.10107421875,
      "learning_rate": 0.00016367169658609355,
      "loss": 1.4038,
      "step": 270
    },
    {
      "epoch": 1.0827586206896551,
      "grad_norm": 0.0966796875,
      "learning_rate": 0.0001618868819326479,
      "loss": 1.3942,
      "step": 275
    },
    {
      "epoch": 1.102463054187192,
      "grad_norm": 0.10595703125,
      "learning_rate": 0.00016006952573318278,
      "loss": 1.3961,
      "step": 280
    },
    {
      "epoch": 1.1221674876847292,
      "grad_norm": 0.10107421875,
      "learning_rate": 0.00015822058359539002,
      "loss": 1.4007,
      "step": 285
    },
    {
      "epoch": 1.141871921182266,
      "grad_norm": 0.10009765625,
      "learning_rate": 0.0001563410277355743,
      "loss": 1.4071,
      "step": 290
    },
    {
      "epoch": 1.161576354679803,
      "grad_norm": 0.09814453125,
      "learning_rate": 0.0001544318464674397,
      "loss": 1.385,
      "step": 295
    },
    {
      "epoch": 1.18128078817734,
      "grad_norm": 0.0947265625,
      "learning_rate": 0.00015249404368241116,
      "loss": 1.3933,
      "step": 300
    },
    {
      "epoch": 1.2009852216748769,
      "grad_norm": 0.09423828125,
      "learning_rate": 0.0001505286383217657,
      "loss": 1.3886,
      "step": 305
    },
    {
      "epoch": 1.2206896551724138,
      "grad_norm": 0.10302734375,
      "learning_rate": 0.0001485366638408496,
      "loss": 1.387,
      "step": 310
    },
    {
      "epoch": 1.2403940886699507,
      "grad_norm": 0.107421875,
      "learning_rate": 0.0001465191676656634,
      "loss": 1.3949,
      "step": 315
    },
    {
      "epoch": 1.2600985221674876,
      "grad_norm": 0.09423828125,
      "learning_rate": 0.00014447721064210186,
      "loss": 1.3927,
      "step": 320
    },
    {
      "epoch": 1.2798029556650246,
      "grad_norm": 0.09619140625,
      "learning_rate": 0.00014241186647813626,
      "loss": 1.4009,
      "step": 325
    },
    {
      "epoch": 1.2995073891625615,
      "grad_norm": 0.091796875,
      "learning_rate": 0.00014032422117923426,
      "loss": 1.3838,
      "step": 330
    },
    {
      "epoch": 1.3192118226600984,
      "grad_norm": 0.0947265625,
      "learning_rate": 0.00013821537247731336,
      "loss": 1.3958,
      "step": 335
    },
    {
      "epoch": 1.3389162561576355,
      "grad_norm": 0.0947265625,
      "learning_rate": 0.00013608642925352793,
      "loss": 1.4111,
      "step": 340
    },
    {
      "epoch": 1.3586206896551725,
      "grad_norm": 0.0966796875,
      "learning_rate": 0.00013393851095519423,
      "loss": 1.4007,
      "step": 345
    },
    {
      "epoch": 1.3783251231527094,
      "grad_norm": 0.09521484375,
      "learning_rate": 0.00013177274700715914,
      "loss": 1.3933,
      "step": 350
    },
    {
      "epoch": 1.3980295566502463,
      "grad_norm": 0.09814453125,
      "learning_rate": 0.00012959027621792265,
      "loss": 1.3997,
      "step": 355
    },
    {
      "epoch": 1.4177339901477832,
      "grad_norm": 0.09716796875,
      "learning_rate": 0.00012739224618082612,
      "loss": 1.4075,
      "step": 360
    },
    {
      "epoch": 1.4374384236453202,
      "grad_norm": 0.1064453125,
      "learning_rate": 0.00012517981267062134,
      "loss": 1.3927,
      "step": 365
    },
    {
      "epoch": 1.457142857142857,
      "grad_norm": 0.095703125,
      "learning_rate": 0.00012295413903573756,
      "loss": 1.3956,
      "step": 370
    },
    {
      "epoch": 1.4768472906403942,
      "grad_norm": 0.09423828125,
      "learning_rate": 0.00012071639558656614,
      "loss": 1.4016,
      "step": 375
    },
    {
      "epoch": 1.4965517241379311,
      "grad_norm": 0.0947265625,
      "learning_rate": 0.00011846775898008438,
      "loss": 1.3889,
      "step": 380
    },
    {
      "epoch": 1.516256157635468,
      "grad_norm": 0.0986328125,
      "learning_rate": 0.00011620941160114229,
      "loss": 1.3962,
      "step": 385
    },
    {
      "epoch": 1.535960591133005,
      "grad_norm": 0.095703125,
      "learning_rate": 0.0001139425409407374,
      "loss": 1.3996,
      "step": 390
    },
    {
      "epoch": 1.555665024630542,
      "grad_norm": 0.09228515625,
      "learning_rate": 0.00011166833897160465,
      "loss": 1.4013,
      "step": 395
    },
    {
      "epoch": 1.5753694581280788,
      "grad_norm": 0.09814453125,
      "learning_rate": 0.00010938800152144984,
      "loss": 1.3788,
      "step": 400
    },
    {
      "epoch": 1.5753694581280788,
      "eval_loss": 1.4156588315963745,
      "eval_runtime": 144.1156,
      "eval_samples_per_second": 50.064,
      "eval_steps_per_second": 3.129,
      "step": 400
    },
    {
      "epoch": 1.5950738916256157,
      "grad_norm": 0.09521484375,
      "learning_rate": 0.00010710272764415566,
      "loss": 1.3997,
      "step": 405
    },
    {
      "epoch": 1.6147783251231527,
      "grad_norm": 0.09375,
      "learning_rate": 0.00010481371898929186,
      "loss": 1.3844,
      "step": 410
    },
    {
      "epoch": 1.6344827586206896,
      "grad_norm": 0.095703125,
      "learning_rate": 0.0001025221791702601,
      "loss": 1.4087,
      "step": 415
    },
    {
      "epoch": 1.6541871921182265,
      "grad_norm": 0.09765625,
      "learning_rate": 0.00010022931313140638,
      "loss": 1.3909,
      "step": 420
    },
    {
      "epoch": 1.6738916256157634,
      "grad_norm": 0.091796875,
      "learning_rate": 9.793632651443357e-05,
      "loss": 1.3925,
      "step": 425
    },
    {
      "epoch": 1.6935960591133004,
      "grad_norm": 0.09375,
      "learning_rate": 9.564442502444735e-05,
      "loss": 1.3906,
      "step": 430
    },
    {
      "epoch": 1.7133004926108373,
      "grad_norm": 0.09521484375,
      "learning_rate": 9.33548137959686e-05,
      "loss": 1.4122,
      "step": 435
    },
    {
      "epoch": 1.7330049261083744,
      "grad_norm": 0.1025390625,
      "learning_rate": 9.106869675924605e-05,
      "loss": 1.3932,
      "step": 440
    },
    {
      "epoch": 1.7527093596059113,
      "grad_norm": 0.0947265625,
      "learning_rate": 8.878727600720207e-05,
      "loss": 1.3974,
      "step": 445
    },
    {
      "epoch": 1.7724137931034483,
      "grad_norm": 0.0927734375,
      "learning_rate": 8.651175116334443e-05,
      "loss": 1.3872,
      "step": 450
    },
    {
      "epoch": 1.7921182266009852,
      "grad_norm": 0.09619140625,
      "learning_rate": 8.424331875097688e-05,
      "loss": 1.4025,
      "step": 455
    },
    {
      "epoch": 1.8118226600985223,
      "grad_norm": 0.0966796875,
      "learning_rate": 8.19831715640394e-05,
      "loss": 1.4054,
      "step": 460
    },
    {
      "epoch": 1.8315270935960593,
      "grad_norm": 0.09326171875,
      "learning_rate": 7.973249803991006e-05,
      "loss": 1.3878,
      "step": 465
    },
    {
      "epoch": 1.8512315270935962,
      "grad_norm": 0.0986328125,
      "learning_rate": 7.749248163449693e-05,
      "loss": 1.3862,
      "step": 470
    },
    {
      "epoch": 1.870935960591133,
      "grad_norm": 0.099609375,
      "learning_rate": 7.526430019995001e-05,
      "loss": 1.4118,
      "step": 475
    },
    {
      "epoch": 1.89064039408867,
      "grad_norm": 0.09326171875,
      "learning_rate": 7.304912536531944e-05,
      "loss": 1.3954,
      "step": 480
    },
    {
      "epoch": 1.910344827586207,
      "grad_norm": 0.09130859375,
      "learning_rate": 7.084812192048594e-05,
      "loss": 1.4003,
      "step": 485
    },
    {
      "epoch": 1.9300492610837439,
      "grad_norm": 0.1005859375,
      "learning_rate": 6.866244720368737e-05,
      "loss": 1.3998,
      "step": 490
    },
    {
      "epoch": 1.9497536945812808,
      "grad_norm": 0.09423828125,
      "learning_rate": 6.6493250492964e-05,
      "loss": 1.3837,
      "step": 495
    },
    {
      "epoch": 1.9694581280788177,
      "grad_norm": 0.09326171875,
      "learning_rate": 6.434167240184135e-05,
      "loss": 1.411,
      "step": 500
    },
    {
      "epoch": 1.9891625615763546,
      "grad_norm": 0.09228515625,
      "learning_rate": 6.220884427956953e-05,
      "loss": 1.3949,
      "step": 505
    },
    {
      "epoch": 2.007881773399015,
      "grad_norm": 0.0927734375,
      "learning_rate": 6.0095887616233796e-05,
      "loss": 1.3871,
      "step": 510
    },
    {
      "epoch": 2.027586206896552,
      "grad_norm": 0.09228515625,
      "learning_rate": 5.800391345304914e-05,
      "loss": 1.3871,
      "step": 515
    },
    {
      "epoch": 2.0472906403940887,
      "grad_norm": 0.09033203125,
      "learning_rate": 5.593402179814944e-05,
      "loss": 1.3887,
      "step": 520
    },
    {
      "epoch": 2.0669950738916256,
      "grad_norm": 0.0927734375,
      "learning_rate": 5.388730104817769e-05,
      "loss": 1.3913,
      "step": 525
    },
    {
      "epoch": 2.0866995073891625,
      "grad_norm": 0.095703125,
      "learning_rate": 5.18648274159821e-05,
      "loss": 1.3854,
      "step": 530
    },
    {
      "epoch": 2.1064039408866995,
      "grad_norm": 0.09326171875,
      "learning_rate": 4.9867664364718725e-05,
      "loss": 1.3915,
      "step": 535
    },
    {
      "epoch": 2.1261083743842364,
      "grad_norm": 0.09033203125,
      "learning_rate": 4.7896862048657965e-05,
      "loss": 1.3917,
      "step": 540
    },
    {
      "epoch": 2.1458128078817733,
      "grad_norm": 0.095703125,
      "learning_rate": 4.595345676098923e-05,
      "loss": 1.3855,
      "step": 545
    },
    {
      "epoch": 2.1655172413793102,
      "grad_norm": 0.09130859375,
      "learning_rate": 4.403847038891424e-05,
      "loss": 1.3838,
      "step": 550
    },
    {
      "epoch": 2.185221674876847,
      "grad_norm": 0.09033203125,
      "learning_rate": 4.2152909876315316e-05,
      "loss": 1.3919,
      "step": 555
    },
    {
      "epoch": 2.204926108374384,
      "grad_norm": 0.0966796875,
      "learning_rate": 4.0297766694280915e-05,
      "loss": 1.3979,
      "step": 560
    },
    {
      "epoch": 2.224630541871921,
      "grad_norm": 0.091796875,
      "learning_rate": 3.8474016319767435e-05,
      "loss": 1.3865,
      "step": 565
    },
    {
      "epoch": 2.2443349753694584,
      "grad_norm": 0.0986328125,
      "learning_rate": 3.6682617722671096e-05,
      "loss": 1.3903,
      "step": 570
    },
    {
      "epoch": 2.264039408866995,
      "grad_norm": 0.09033203125,
      "learning_rate": 3.4924512861579315e-05,
      "loss": 1.3841,
      "step": 575
    },
    {
      "epoch": 2.283743842364532,
      "grad_norm": 0.09375,
      "learning_rate": 3.3200626188467344e-05,
      "loss": 1.3965,
      "step": 580
    },
    {
      "epoch": 2.303448275862069,
      "grad_norm": 0.08984375,
      "learning_rate": 3.151186416260006e-05,
      "loss": 1.4112,
      "step": 585
    },
    {
      "epoch": 2.323152709359606,
      "grad_norm": 0.09521484375,
      "learning_rate": 2.9859114773895025e-05,
      "loss": 1.385,
      "step": 590
    },
    {
      "epoch": 2.342857142857143,
      "grad_norm": 0.0986328125,
      "learning_rate": 2.8243247075996693e-05,
      "loss": 1.3838,
      "step": 595
    },
    {
      "epoch": 2.36256157635468,
      "grad_norm": 0.0927734375,
      "learning_rate": 2.6665110729308263e-05,
      "loss": 1.3938,
      "step": 600
    },
    {
      "epoch": 2.36256157635468,
      "eval_loss": 1.4132238626480103,
      "eval_runtime": 144.0354,
      "eval_samples_per_second": 50.092,
      "eval_steps_per_second": 3.131,
      "step": 600
    },
    {
      "epoch": 2.382266009852217,
      "grad_norm": 0.09423828125,
      "learning_rate": 2.5125535554220482e-05,
      "loss": 1.3974,
      "step": 605
    },
    {
      "epoch": 2.4019704433497537,
      "grad_norm": 0.09375,
      "learning_rate": 2.3625331094773206e-05,
      "loss": 1.3814,
      "step": 610
    },
    {
      "epoch": 2.4216748768472907,
      "grad_norm": 0.091796875,
      "learning_rate": 2.2165286192978342e-05,
      "loss": 1.3858,
      "step": 615
    },
    {
      "epoch": 2.4413793103448276,
      "grad_norm": 0.0947265625,
      "learning_rate": 2.074616857402867e-05,
      "loss": 1.3804,
      "step": 620
    },
    {
      "epoch": 2.4610837438423645,
      "grad_norm": 0.0927734375,
      "learning_rate": 1.936872444261022e-05,
      "loss": 1.3868,
      "step": 625
    },
    {
      "epoch": 2.4807881773399014,
      "grad_norm": 0.09228515625,
      "learning_rate": 1.8033678090530813e-05,
      "loss": 1.3923,
      "step": 630
    },
    {
      "epoch": 2.5004926108374383,
      "grad_norm": 0.091796875,
      "learning_rate": 1.6741731515870594e-05,
      "loss": 1.3889,
      "step": 635
    },
    {
      "epoch": 2.5201970443349753,
      "grad_norm": 0.09814453125,
      "learning_rate": 1.549356405385538e-05,
      "loss": 1.3736,
      "step": 640
    },
    {
      "epoch": 2.539901477832512,
      "grad_norm": 0.0947265625,
      "learning_rate": 1.428983201964662e-05,
      "loss": 1.3955,
      "step": 645
    },
    {
      "epoch": 2.559605911330049,
      "grad_norm": 0.09375,
      "learning_rate": 1.313116836323568e-05,
      "loss": 1.3946,
      "step": 650
    },
    {
      "epoch": 2.5793103448275865,
      "grad_norm": 0.09521484375,
      "learning_rate": 1.2018182336624273e-05,
      "loss": 1.3907,
      "step": 655
    },
    {
      "epoch": 2.599014778325123,
      "grad_norm": 0.09326171875,
      "learning_rate": 1.0951459173465629e-05,
      "loss": 1.4041,
      "step": 660
    },
    {
      "epoch": 2.6187192118226603,
      "grad_norm": 0.09423828125,
      "learning_rate": 9.93155978133541e-06,
      "loss": 1.391,
      "step": 665
    },
    {
      "epoch": 2.638423645320197,
      "grad_norm": 0.09033203125,
      "learning_rate": 8.959020446793288e-06,
      "loss": 1.3882,
      "step": 670
    },
    {
      "epoch": 2.658128078817734,
      "grad_norm": 0.09375,
      "learning_rate": 8.034352553391367e-06,
      "loss": 1.4001,
      "step": 675
    },
    {
      "epoch": 2.677832512315271,
      "grad_norm": 0.0927734375,
      "learning_rate": 7.158042312776847e-06,
      "loss": 1.3824,
      "step": 680
    },
    {
      "epoch": 2.697536945812808,
      "grad_norm": 0.0908203125,
      "learning_rate": 6.330550509030852e-06,
      "loss": 1.379,
      "step": 685
    },
    {
      "epoch": 2.717241379310345,
      "grad_norm": 0.08984375,
      "learning_rate": 5.552312256377423e-06,
      "loss": 1.3787,
      "step": 690
    },
    {
      "epoch": 2.736945812807882,
      "grad_norm": 0.09228515625,
      "learning_rate": 4.823736770390552e-06,
      "loss": 1.3902,
      "step": 695
    },
    {
      "epoch": 2.7566502463054188,
      "grad_norm": 0.0927734375,
      "learning_rate": 4.14520715281923e-06,
      "loss": 1.3991,
      "step": 700
    },
    {
      "epoch": 2.7763546798029557,
      "grad_norm": 0.09375,
      "learning_rate": 3.517080190143629e-06,
      "loss": 1.3866,
      "step": 705
    },
    {
      "epoch": 2.7960591133004926,
      "grad_norm": 0.09033203125,
      "learning_rate": 2.9396861659686915e-06,
      "loss": 1.3864,
      "step": 710
    },
    {
      "epoch": 2.8157635467980295,
      "grad_norm": 0.0927734375,
      "learning_rate": 2.4133286873533112e-06,
      "loss": 1.373,
      "step": 715
    },
    {
      "epoch": 2.8354679802955665,
      "grad_norm": 0.0966796875,
      "learning_rate": 1.9382845251668335e-06,
      "loss": 1.384,
      "step": 720
    },
    {
      "epoch": 2.8551724137931034,
      "grad_norm": 0.091796875,
      "learning_rate": 1.514803468556547e-06,
      "loss": 1.3768,
      "step": 725
    },
    {
      "epoch": 2.8748768472906403,
      "grad_norm": 0.0908203125,
      "learning_rate": 1.14310819360276e-06,
      "loss": 1.3968,
      "step": 730
    },
    {
      "epoch": 2.8945812807881772,
      "grad_norm": 0.09130859375,
      "learning_rate": 8.233941462306271e-07,
      "loss": 1.3811,
      "step": 735
    },
    {
      "epoch": 2.914285714285714,
      "grad_norm": 0.0908203125,
      "learning_rate": 5.558294394402253e-07,
      "loss": 1.4115,
      "step": 740
    },
    {
      "epoch": 2.933990147783251,
      "grad_norm": 0.08984375,
      "learning_rate": 3.405547649087959e-07,
      "loss": 1.4119,
      "step": 745
    },
    {
      "epoch": 2.9536945812807884,
      "grad_norm": 0.09130859375,
      "learning_rate": 1.7768331901187875e-07,
      "loss": 1.395,
      "step": 750
    },
    {
      "epoch": 2.973399014778325,
      "grad_norm": 0.0947265625,
      "learning_rate": 6.730074330203451e-08,
      "loss": 1.3966,
      "step": 755
    },
    {
      "epoch": 2.9931034482758623,
      "grad_norm": 0.09228515625,
      "learning_rate": 9.46507947655606e-09,
      "loss": 1.3686,
      "step": 760
    },
    {
      "epoch": 3.0,
      "step": 762,
      "total_flos": 7.533943292711404e+17,
      "train_loss": 1.4212341141200129,
      "train_runtime": 13229.3679,
      "train_samples_per_second": 14.73,
      "train_steps_per_second": 0.058
    }
  ],
  "logging_steps": 5,
  "max_steps": 762,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 1000000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7.533943292711404e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}