{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 1446,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.0013831258644536654, "grad_norm": 133.08836364746094, "learning_rate": 0.0, "loss": 1.6747, "step": 1},
    {"epoch": 0.006915629322268326, "grad_norm": 10.979754447937012, "learning_rate": 5.517241379310345e-06, "loss": 1.5287, "step": 5},
    {"epoch": 0.013831258644536652, "grad_norm": 5.598673343658447, "learning_rate": 1.2413793103448277e-05, "loss": 1.1371, "step": 10},
    {"epoch": 0.02074688796680498, "grad_norm": 4.170475959777832, "learning_rate": 1.9310344827586207e-05, "loss": 1.0279, "step": 15},
    {"epoch": 0.027662517289073305, "grad_norm": 3.909233331680298, "learning_rate": 2.620689655172414e-05, "loss": 0.9947, "step": 20},
    {"epoch": 0.034578146611341634, "grad_norm": 3.94997501373291, "learning_rate": 3.310344827586207e-05, "loss": 0.9337, "step": 25},
    {"epoch": 0.04149377593360996, "grad_norm": 3.896550178527832, "learning_rate": 4e-05, "loss": 0.8761, "step": 30},
    {"epoch": 0.048409405255878286, "grad_norm": 3.581073522567749, "learning_rate": 4.689655172413793e-05, "loss": 0.8054, "step": 35},
    {"epoch": 0.05532503457814661, "grad_norm": 2.294735908508301, "learning_rate": 5.379310344827586e-05, "loss": 0.7511, "step": 40},
    {"epoch": 0.06224066390041494, "grad_norm": 1.9514353275299072, "learning_rate": 6.068965517241379e-05, "loss": 0.7493, "step": 45},
    {"epoch": 0.06915629322268327, "grad_norm": 2.1982648372650146, "learning_rate": 6.758620689655173e-05, "loss": 0.8087, "step": 50},
    {"epoch": 0.07607192254495158, "grad_norm": 2.2902512550354004, "learning_rate": 7.448275862068966e-05, "loss": 0.7906, "step": 55},
    {"epoch": 0.08298755186721991, "grad_norm": 1.9472178220748901, "learning_rate": 8.137931034482759e-05, "loss": 0.7872, "step": 60},
    {"epoch": 0.08990318118948824, "grad_norm": 1.7574316263198853, "learning_rate": 8.827586206896552e-05, "loss": 0.7898, "step": 65},
    {"epoch": 0.09681881051175657, "grad_norm": 1.8785020112991333, "learning_rate": 9.517241379310345e-05, "loss": 0.8166, "step": 70},
    {"epoch": 0.1037344398340249, "grad_norm": 1.5703641176223755, "learning_rate": 0.0001020689655172414, "loss": 0.7987, "step": 75},
    {"epoch": 0.11065006915629322, "grad_norm": 1.4895148277282715, "learning_rate": 0.00010896551724137931, "loss": 0.8054, "step": 80},
    {"epoch": 0.11756569847856155, "grad_norm": 1.7236794233322144, "learning_rate": 0.00011586206896551725, "loss": 0.8111, "step": 85},
    {"epoch": 0.12448132780082988, "grad_norm": 1.788547396659851, "learning_rate": 0.00012275862068965518, "loss": 0.8067, "step": 90},
    {"epoch": 0.1313969571230982, "grad_norm": 1.6218682527542114, "learning_rate": 0.0001296551724137931, "loss": 0.8175, "step": 95},
    {"epoch": 0.13831258644536654, "grad_norm": 1.438413381576538, "learning_rate": 0.00013655172413793104, "loss": 0.8485, "step": 100},
    {"epoch": 0.14522821576763487, "grad_norm": 1.6937086582183838, "learning_rate": 0.00014344827586206896, "loss": 0.8562, "step": 105},
    {"epoch": 0.15214384508990317, "grad_norm": 1.4097000360488892, "learning_rate": 0.0001503448275862069, "loss": 0.851, "step": 110},
    {"epoch": 0.1590594744121715, "grad_norm": 2.706479072570801, "learning_rate": 0.00015724137931034485, "loss": 0.8461, "step": 115},
    {"epoch": 0.16597510373443983, "grad_norm": 2.0034353733062744, "learning_rate": 0.00016413793103448276, "loss": 0.8548, "step": 120},
    {"epoch": 0.17289073305670816, "grad_norm": 1.3242019414901733, "learning_rate": 0.0001710344827586207, "loss": 0.8553, "step": 125},
    {"epoch": 0.1798063623789765, "grad_norm": 2.6091411113739014, "learning_rate": 0.00017793103448275862, "loss": 0.848, "step": 130},
    {"epoch": 0.18672199170124482, "grad_norm": 1.3679989576339722, "learning_rate": 0.00018482758620689654, "loss": 0.8716, "step": 135},
    {"epoch": 0.19363762102351315, "grad_norm": 1.323454737663269, "learning_rate": 0.0001917241379310345, "loss": 0.8488, "step": 140},
    {"epoch": 0.20055325034578148, "grad_norm": 2.7075045108795166, "learning_rate": 0.00019862068965517243, "loss": 0.9631, "step": 145},
    {"epoch": 0.2074688796680498, "grad_norm": 2.2683520317077637, "learning_rate": 0.0001999953352135947, "loss": 0.9498, "step": 150},
    {"epoch": 0.2143845089903181, "grad_norm": 8.38696002960205, "learning_rate": 0.0001999763852647035, "loss": 0.891, "step": 155},
    {"epoch": 0.22130013831258644, "grad_norm": 24.307083129882812, "learning_rate": 0.00019994286136445976, "loss": 0.9308, "step": 160},
    {"epoch": 0.22821576763485477, "grad_norm": 2.0212178230285645, "learning_rate": 0.0001998947683997744, "loss": 0.8901, "step": 165},
    {"epoch": 0.2351313969571231, "grad_norm": 1.2989600896835327, "learning_rate": 0.00019983211338134828, "loss": 0.8546, "step": 170},
    {"epoch": 0.24204702627939143, "grad_norm": 1.2097629308700562, "learning_rate": 0.00019975490544265012, "loss": 0.8543, "step": 175},
    {"epoch": 0.24896265560165975, "grad_norm": 30.946725845336914, "learning_rate": 0.00019966315583858516, "loss": 1.6915, "step": 180},
    {"epoch": 0.25587828492392806, "grad_norm": 2.118316173553467, "learning_rate": 0.0001995568779438545, "loss": 1.0665, "step": 185},
    {"epoch": 0.2627939142461964, "grad_norm": 3.086249589920044, "learning_rate": 0.00019943608725100532, "loss": 0.9988, "step": 190},
    {"epoch": 0.2697095435684647, "grad_norm": 1.1766245365142822, "learning_rate": 0.00019930080136817255, "loss": 0.9133, "step": 195},
    {"epoch": 0.2766251728907331, "grad_norm": 1.33064603805542, "learning_rate": 0.00019915104001651203, "loss": 0.8809, "step": 200},
    {"epoch": 0.2835408022130014, "grad_norm": 1.6439151763916016, "learning_rate": 0.00019898682502732568, "loss": 0.8967, "step": 205},
    {"epoch": 0.29045643153526973, "grad_norm": 1.3271840810775757, "learning_rate": 0.00019880818033887916, "loss": 0.8866, "step": 210},
    {"epoch": 0.29737206085753803, "grad_norm": 1.1745598316192627, "learning_rate": 0.0001986151319929121, "loss": 0.8598, "step": 215},
    {"epoch": 0.30428769017980634, "grad_norm": 1.0390416383743286, "learning_rate": 0.00019840770813084205, "loss": 0.8434, "step": 220},
    {"epoch": 0.3112033195020747, "grad_norm": 1.0289931297302246, "learning_rate": 0.00019818593898966212, "loss": 0.8765, "step": 225},
    {"epoch": 0.318118948824343, "grad_norm": 1.1883145570755005, "learning_rate": 0.00019794985689753337, "loss": 0.8859, "step": 230},
    {"epoch": 0.32503457814661135, "grad_norm": 1.2715511322021484, "learning_rate": 0.00019769949626907186, "loss": 0.8626, "step": 235},
    {"epoch": 0.33195020746887965, "grad_norm": 1.287985920906067, "learning_rate": 0.00019743489360033231, "loss": 0.8805, "step": 240},
    {"epoch": 0.338865836791148, "grad_norm": 1.1524231433868408, "learning_rate": 0.00019715608746348763, "loss": 0.8588, "step": 245},
    {"epoch": 0.3457814661134163, "grad_norm": 1.0479854345321655, "learning_rate": 0.00019686311850120625, "loss": 0.858, "step": 250},
    {"epoch": 0.35269709543568467, "grad_norm": 0.9751342535018921, "learning_rate": 0.0001965560294207274, "loss": 0.8558, "step": 255},
    {"epoch": 0.359612724757953, "grad_norm": 0.9835847020149231, "learning_rate": 0.00019623486498763555, "loss": 0.8755, "step": 260},
    {"epoch": 0.3665283540802213, "grad_norm": 3.7135465145111084, "learning_rate": 0.00019589967201933471, "loss": 0.8584, "step": 265},
    {"epoch": 0.37344398340248963, "grad_norm": 1.0380736589431763, "learning_rate": 0.00019555049937822384, "loss": 0.8544, "step": 270},
    {"epoch": 0.38035961272475793, "grad_norm": 1.0324746370315552, "learning_rate": 0.00019518739796457366, "loss": 0.8673, "step": 275},
    {"epoch": 0.3872752420470263, "grad_norm": 1.0108332633972168, "learning_rate": 0.00019481042070910705, "loss": 0.8443, "step": 280},
    {"epoch": 0.3941908713692946, "grad_norm": 0.8845415711402893, "learning_rate": 0.00019441962256528292, "loss": 0.857, "step": 285},
    {"epoch": 0.40110650069156295, "grad_norm": 0.8270080089569092, "learning_rate": 0.00019401506050128556, "loss": 0.8583, "step": 290},
    {"epoch": 0.40802213001383125, "grad_norm": 0.8553124070167542, "learning_rate": 0.00019359679349172004, "loss": 0.8291, "step": 295},
    {"epoch": 0.4149377593360996, "grad_norm": 0.9521012306213379, "learning_rate": 0.00019316488250901534, "loss": 0.8486, "step": 300},
    {"epoch": 0.4218533886583679, "grad_norm": 0.8840038776397705, "learning_rate": 0.00019271939051453612, "loss": 0.8258, "step": 305},
    {"epoch": 0.4287690179806362, "grad_norm": 1.223691463470459, "learning_rate": 0.00019226038244940464, "loss": 0.8142, "step": 310},
    {"epoch": 0.43568464730290457, "grad_norm": 0.8988921642303467, "learning_rate": 0.00019178792522503394, "loss": 0.8611, "step": 315},
    {"epoch": 0.4426002766251729, "grad_norm": 0.813004732131958, "learning_rate": 0.000191302087713374, "loss": 0.8208, "step": 320},
    {"epoch": 0.44951590594744123, "grad_norm": 0.8524804711341858, "learning_rate": 0.00019080294073687193, "loss": 0.8393, "step": 325},
    {"epoch": 0.45643153526970953, "grad_norm": 0.9394051432609558, "learning_rate": 0.000190290557058148, "loss": 0.8567, "step": 330},
    {"epoch": 0.4633471645919779, "grad_norm": 0.9042031168937683, "learning_rate": 0.00018976501136938864, "loss": 0.8387, "step": 335},
    {"epoch": 0.4702627939142462, "grad_norm": 0.7205588221549988, "learning_rate": 0.00018922638028145828, "loss": 0.8231, "step": 340},
    {"epoch": 0.47717842323651455, "grad_norm": 0.798876941204071, "learning_rate": 0.0001886747423127316, "loss": 0.8254, "step": 345},
    {"epoch": 0.48409405255878285, "grad_norm": 0.7704493403434753, "learning_rate": 0.00018811017787764747, "loss": 0.8244, "step": 350},
    {"epoch": 0.49100968188105115, "grad_norm": 0.8552532196044922, "learning_rate": 0.00018753276927498659, "loss": 0.8347, "step": 355},
    {"epoch": 0.4979253112033195, "grad_norm": 0.7718790173530579, "learning_rate": 0.00018694260067587463, "loss": 0.7962, "step": 360},
    {"epoch": 0.5048409405255878, "grad_norm": 0.888421356678009, "learning_rate": 0.00018633975811151223, "loss": 0.8284, "step": 365},
    {"epoch": 0.5117565698478561, "grad_norm": 0.8525059819221497, "learning_rate": 0.00018572432946063367, "loss": 0.8241, "step": 370},
    {"epoch": 0.5186721991701245, "grad_norm": 0.6882660388946533, "learning_rate": 0.00018509640443669682, "loss": 0.8001, "step": 375},
    {"epoch": 0.5255878284923928, "grad_norm": 0.7356276512145996, "learning_rate": 0.00018445607457480493, "loss": 0.8177, "step": 380},
    {"epoch": 0.5325034578146611, "grad_norm": 0.7074828147888184, "learning_rate": 0.0001838034332183634, "loss": 0.837, "step": 385},
    {"epoch": 0.5394190871369294, "grad_norm": 0.7650839686393738, "learning_rate": 0.0001831385755054726, "loss": 0.8219, "step": 390},
    {"epoch": 0.5463347164591977, "grad_norm": 0.6335296630859375, "learning_rate": 0.00018246159835505932, "loss": 0.8146, "step": 395},
    {"epoch": 0.5532503457814661, "grad_norm": 0.8082166314125061, "learning_rate": 0.0001817726004527485, "loss": 0.8083, "step": 400},
    {"epoch": 0.5601659751037344, "grad_norm": 0.6119678616523743, "learning_rate": 0.0001810716822364774, "loss": 0.7853, "step": 405},
    {"epoch": 0.5670816044260027, "grad_norm": 0.7896009683609009, "learning_rate": 0.00018035894588185438, "loss": 0.7868, "step": 410},
    {"epoch": 0.573997233748271, "grad_norm": 0.7086299657821655, "learning_rate": 0.0001796344952872643, "loss": 0.8234, "step": 415},
    {"epoch": 0.5809128630705395, "grad_norm": 0.689249575138092, "learning_rate": 0.00017889843605872305, "loss": 0.7917, "step": 420},
    {"epoch": 0.5878284923928078, "grad_norm": 0.6687580943107605, "learning_rate": 0.0001781508754944827, "loss": 0.7956, "step": 425},
    {"epoch": 0.5947441217150761, "grad_norm": 0.7054949998855591, "learning_rate": 0.0001773919225693903, "loss": 0.7979, "step": 430},
    {"epoch": 0.6016597510373444, "grad_norm": 0.8244301676750183, "learning_rate": 0.00017662168791900232, "loss": 0.7949, "step": 435},
    {"epoch": 0.6085753803596127, "grad_norm": 0.7373748421669006, "learning_rate": 0.00017584028382345654, "loss": 0.7742, "step": 440},
    {"epoch": 0.6154910096818811, "grad_norm": 0.716790497303009, "learning_rate": 0.00017504782419110497, "loss": 0.8082, "step": 445},
    {"epoch": 0.6224066390041494, "grad_norm": 0.7063632011413574, "learning_rate": 0.00017424442454190862, "loss": 0.7859, "step": 450},
    {"epoch": 0.6293222683264177, "grad_norm": 0.8756738901138306, "learning_rate": 0.00017343020199059783, "loss": 0.791, "step": 455},
    {"epoch": 0.636237897648686, "grad_norm": 0.667121946811676, "learning_rate": 0.0001726052752296001, "loss": 0.8044, "step": 460},
    {"epoch": 0.6431535269709544, "grad_norm": 0.7018240690231323, "learning_rate": 0.00017176976451173758, "loss": 0.7829, "step": 465},
    {"epoch": 0.6500691562932227, "grad_norm": 0.6935915350914001, "learning_rate": 0.00017092379163269764, "loss": 0.7975, "step": 470},
    {"epoch": 0.656984785615491, "grad_norm": 0.6710845232009888, "learning_rate": 0.00017006747991327796, "loss": 0.7777, "step": 475},
    {"epoch": 0.6639004149377593, "grad_norm": 0.7099134922027588, "learning_rate": 0.00016920095418140977, "loss": 0.7755, "step": 480},
    {"epoch": 0.6708160442600276, "grad_norm": 0.652553915977478, "learning_rate": 0.00016832434075396101, "loss": 0.7802, "step": 485},
    {"epoch": 0.677731673582296, "grad_norm": 0.6347463130950928, "learning_rate": 0.00016743776741832292, "loss": 0.7814, "step": 490},
    {"epoch": 0.6846473029045643, "grad_norm": 0.5948991179466248, "learning_rate": 0.00016654136341378157, "loss": 0.7704, "step": 495},
    {"epoch": 0.6915629322268326, "grad_norm": 0.6586953401565552, "learning_rate": 0.00016563525941267845, "loss": 0.7781, "step": 500},
    {"epoch": 0.6984785615491009, "grad_norm": 0.6398725509643555, "learning_rate": 0.00016471958750136176, "loss": 0.7707, "step": 505},
    {"epoch": 0.7053941908713693, "grad_norm": 0.6144907474517822, "learning_rate": 0.00016379448116093156, "loss": 0.7714, "step": 510},
    {"epoch": 0.7123098201936376, "grad_norm": 0.6330462694168091, "learning_rate": 0.00016286007524778185, "loss": 0.7653, "step": 515},
    {"epoch": 0.719225449515906, "grad_norm": 0.5762836933135986, "learning_rate": 0.00016191650597394198, "loss": 0.7715, "step": 520},
    {"epoch": 0.7261410788381742, "grad_norm": 0.578916609287262, "learning_rate": 0.00016096391088722047, "loss": 0.785, "step": 525},
    {"epoch": 0.7330567081604425, "grad_norm": 0.5828131437301636, "learning_rate": 0.0001600024288511541, "loss": 0.7424, "step": 530},
    {"epoch": 0.739972337482711, "grad_norm": 0.5606003999710083, "learning_rate": 0.00015903220002476515, "loss": 0.7782, "step": 535},
    {"epoch": 0.7468879668049793, "grad_norm": 0.5754717588424683, "learning_rate": 0.0001580533658421302, "loss": 0.7865, "step": 540},
    {"epoch": 0.7538035961272476, "grad_norm": 0.680016279220581, "learning_rate": 0.0001570660689917623, "loss": 0.7637, "step": 545},
    {"epoch": 0.7607192254495159, "grad_norm": 0.6306326985359192, "learning_rate": 0.00015607045339581096, "loss": 0.7528, "step": 550},
    {"epoch": 0.7676348547717843, "grad_norm": 0.5506445169448853, "learning_rate": 0.00015506666418908203, "loss": 0.767, "step": 555},
    {"epoch": 0.7745504840940526, "grad_norm": 0.6151379346847534, "learning_rate": 0.00015405484769788073, "loss": 0.7511, "step": 560},
    {"epoch": 0.7814661134163209, "grad_norm": 0.6520763635635376, "learning_rate": 0.00015303515141868116, "loss": 0.7529, "step": 565},
    {"epoch": 0.7883817427385892, "grad_norm": 0.5401438474655151, "learning_rate": 0.00015200772399662514, "loss": 0.7754, "step": 570},
    {"epoch": 0.7952973720608575, "grad_norm": 0.6025466322898865, "learning_rate": 0.00015097271520385366, "loss": 0.7577, "step": 575},
    {"epoch": 0.8022130013831259, "grad_norm": 0.6601846218109131, "learning_rate": 0.00014993027591767396, "loss": 0.7406, "step": 580},
    {"epoch": 0.8091286307053942, "grad_norm": 0.6068835854530334, "learning_rate": 0.0001488805580985655, "loss": 0.764, "step": 585},
    {"epoch": 0.8160442600276625, "grad_norm": 0.586649477481842, "learning_rate": 0.00014782371476802824, "loss": 0.7547, "step": 590},
    {"epoch": 0.8229598893499308, "grad_norm": 0.4942811131477356, "learning_rate": 0.00014675989998627598, "loss": 0.7539, "step": 595},
    {"epoch": 0.8298755186721992, "grad_norm": 0.5191232562065125, "learning_rate": 0.00014568926882977832, "loss": 0.75, "step": 600},
    {"epoch": 0.8367911479944675, "grad_norm": 0.5124319195747375, "learning_rate": 0.00014461197736865481, "loss": 0.7207, "step": 605},
    {"epoch": 0.8437067773167358, "grad_norm": 0.5561709403991699, "learning_rate": 0.00014352818264392364, "loss": 0.7504, "step": 610},
    {"epoch": 0.8506224066390041, "grad_norm": 0.5596902370452881, "learning_rate": 0.00014243804264460957, "loss": 0.7634, "step": 615},
    {"epoch": 0.8575380359612724, "grad_norm": 0.5537763237953186, "learning_rate": 0.00014134171628471276, "loss": 0.7293, "step": 620},
    {"epoch": 0.8644536652835408, "grad_norm": 0.6073982119560242, "learning_rate": 0.00014023936338004373, "loss": 0.7358, "step": 625},
    {"epoch": 0.8713692946058091, "grad_norm": 0.5265364050865173, "learning_rate": 0.00013913114462492601, "loss": 0.7415, "step": 630},
    {"epoch": 0.8782849239280774, "grad_norm": 0.5465463399887085, "learning_rate": 0.00013801722156877143, "loss": 0.7351, "step": 635},
    {"epoch": 0.8852005532503457, "grad_norm": 0.5381340384483337, "learning_rate": 0.00013689775659253006, "loss": 0.7403, "step": 640},
    {"epoch": 0.8921161825726142, "grad_norm": 0.5682520270347595, "learning_rate": 0.00013577291288501952, "loss": 0.7299, "step": 645},
    {"epoch": 0.8990318118948825, "grad_norm": 0.538758397102356, "learning_rate": 0.00013464285441913636, "loss": 0.7501, "step": 650},
    {"epoch": 0.9059474412171508, "grad_norm": 0.6069577932357788, "learning_rate": 0.00013350774592795292, "loss": 0.7373, "step": 655},
    {"epoch": 0.9128630705394191, "grad_norm": 0.5356409549713135, "learning_rate": 0.0001323677528807036, "loss": 0.7343, "step": 660},
    {"epoch": 0.9197786998616874, "grad_norm": 0.5176509618759155, "learning_rate": 0.00013122304145866381, "loss": 0.7298, "step": 665},
    {"epoch": 0.9266943291839558, "grad_norm": 0.6070849895477295, "learning_rate": 0.00013007377853092503, "loss": 0.7352, "step": 670},
    {"epoch": 0.9336099585062241, "grad_norm": 0.5752330422401428, "learning_rate": 0.00012892013163006962, "loss": 0.7323, "step": 675},
    {"epoch": 0.9405255878284924, "grad_norm": 0.550092339515686, "learning_rate": 0.00012776226892774903, "loss": 0.7437, "step": 680},
    {"epoch": 0.9474412171507607, "grad_norm": 0.5363165736198425, "learning_rate": 0.00012660035921016854, "loss": 0.7199, "step": 685},
    {"epoch": 0.9543568464730291, "grad_norm": 0.5486807823181152, "learning_rate": 0.00012543457185348298, "loss": 0.7159, "step": 690},
    {"epoch": 0.9612724757952974, "grad_norm": 0.6245793104171753, "learning_rate": 0.00012426507679910576, "loss": 0.7295, "step": 695},
    {"epoch": 0.9681881051175657, "grad_norm": 0.5306342244148254, "learning_rate": 0.00012309204452893606, "loss": 0.7239, "step": 700},
    {"epoch": 0.975103734439834, "grad_norm": 0.5488132238388062, "learning_rate": 0.00012191564604050683, "loss": 0.7027, "step": 705},
    {"epoch": 0.9820193637621023, "grad_norm": 0.5506658554077148, "learning_rate": 0.00012073605282205802, "loss": 0.7397, "step": 710},
    {"epoch": 0.9889349930843707, "grad_norm": 0.5438380241394043, "learning_rate": 0.00011955343682753794, "loss": 0.7241, "step": 715},
    {"epoch": 0.995850622406639, "grad_norm": 0.5309740900993347, "learning_rate": 0.0001183679704515368, "loss": 0.7209, "step": 720},
    {"epoch": 1.0027662517289073, "grad_norm": 0.6508172750473022, "learning_rate": 0.00011717982650415624, "loss": 0.6672, "step": 725},
    {"epoch": 1.0096818810511756, "grad_norm": 0.6566136479377747, "learning_rate": 0.00011598917818581791, "loss": 0.5846, "step": 730},
    {"epoch": 1.016597510373444, "grad_norm": 0.5107012987136841, "learning_rate": 0.00011479619906201557, "loss": 0.5685, "step": 735},
    {"epoch": 1.0235131396957122, "grad_norm": 0.48931655287742615, "learning_rate": 0.00011360106303801364, "loss": 0.5846, "step": 740},
    {"epoch": 1.0304287690179805, "grad_norm": 0.5219966769218445, "learning_rate": 0.00011240394433349637, "loss": 0.5588, "step": 745},
    {"epoch": 1.037344398340249, "grad_norm": 0.5277289152145386, "learning_rate": 0.00011120501745717112, "loss": 0.5727, "step": 750},
    {"epoch": 1.0442600276625174, "grad_norm": 0.5424395799636841, "learning_rate": 0.00011000445718132966, "loss": 0.566, "step": 755},
    {"epoch": 1.0511756569847857, "grad_norm": 0.5607298612594604, "learning_rate": 0.00010880243851637078, "loss": 0.57, "step": 760},
    {"epoch": 1.058091286307054, "grad_norm": 0.5789066553115845, "learning_rate": 0.00010759913668528841, "loss": 0.5659, "step": 765},
    {"epoch": 1.0650069156293223, "grad_norm": 0.5418556928634644, "learning_rate": 0.00010639472709812861, "loss": 0.573, "step": 770},
    {"epoch": 1.0719225449515906, "grad_norm": 0.49530017375946045, "learning_rate": 0.0001051893853264195, "loss": 0.5695, "step": 775},
    {"epoch": 1.0788381742738589, "grad_norm": 0.5037497878074646, "learning_rate": 0.00010398328707757738, "loss": 0.5651, "step": 780},
    {"epoch": 1.0857538035961272, "grad_norm": 0.5208268761634827, "learning_rate": 0.00010277660816929313, "loss": 0.5883, "step": 785},
    {"epoch": 1.0926694329183957, "grad_norm": 0.5543485283851624, "learning_rate": 0.00010156952450390269, "loss": 0.5537, "step": 790},
    {"epoch": 1.099585062240664, "grad_norm": 0.6337606310844421, "learning_rate": 0.00010036221204274512, "loss": 0.5649, "step": 795},
    {"epoch": 1.1065006915629323, "grad_norm": 0.4707985818386078, "learning_rate": 9.915484678051175e-05, "loss": 0.5471, "step": 800},
    {"epoch": 1.1134163208852006, "grad_norm": 0.5109753608703613, "learning_rate": 9.794760471959116e-05, "loss": 0.5663, "step": 805},
    {"epoch": 1.120331950207469, "grad_norm": 0.5100296139717102, "learning_rate": 9.674066184441221e-05, "loss": 0.5713, "step": 810},
    {"epoch": 1.1272475795297372, "grad_norm": 0.5414464473724365, "learning_rate": 9.553419409579035e-05, "loss": 0.5611, "step": 815},
    {"epoch": 1.1341632088520055, "grad_norm": 0.5717695355415344, "learning_rate": 9.432837734527995e-05, "loss": 0.5817, "step": 820},
    {"epoch": 1.1410788381742738, "grad_norm": 0.5524587035179138, "learning_rate": 9.312338736953683e-05, "loss": 0.5757, "step": 825},
    {"epoch": 1.147994467496542, "grad_norm": 0.5250436663627625, "learning_rate": 9.191939982469458e-05, "loss": 0.569, "step": 830},
    {"epoch": 1.1549100968188104, "grad_norm": 0.4710783064365387, "learning_rate": 9.071659022075849e-05, "loss": 0.565, "step": 835},
    {"epoch": 1.161825726141079, "grad_norm": 0.4436459541320801, "learning_rate": 8.951513389602076e-05, "loss": 0.5666, "step": 840},
    {"epoch": 1.1687413554633472, "grad_norm": 0.5180997252464294, "learning_rate": 8.831520599150083e-05, "loss": 0.5595, "step": 845},
    {"epoch": 1.1756569847856155, "grad_norm": 0.49515190720558167, "learning_rate": 8.71169814254142e-05, "loss": 0.584, "step": 850},
    {"epoch": 1.1825726141078838, "grad_norm": 0.5082345008850098, "learning_rate": 8.592063486767406e-05, "loss": 0.5687, "step": 855},
    {"epoch": 1.1894882434301521, "grad_norm": 0.5054699182510376, "learning_rate": 8.472634071442896e-05, "loss": 0.5796, "step": 860},
    {"epoch": 1.1964038727524204, "grad_norm": 0.4742473065853119, "learning_rate": 8.353427306264032e-05, "loss": 0.5575, "step": 865},
    {"epoch": 1.2033195020746887, "grad_norm": 0.46908038854599, "learning_rate": 8.23446056847037e-05, "loss": 0.5654, "step": 870},
    {"epoch": 1.210235131396957, "grad_norm": 0.4599679708480835, "learning_rate": 8.115751200311725e-05, "loss": 0.5452, "step": 875},
    {"epoch": 1.2171507607192256, "grad_norm": 0.48915913701057434, "learning_rate": 7.99731650652013e-05, "loss": 0.5667, "step": 880},
    {"epoch": 1.2240663900414939, "grad_norm": 0.5099295377731323, "learning_rate": 7.879173751787259e-05, "loss": 0.5673, "step": 885},
    {"epoch": 1.2309820193637622, "grad_norm": 0.49911564588546753, "learning_rate": 7.761340158247674e-05, "loss": 0.5695, "step": 890},
    {"epoch": 1.2378976486860305, "grad_norm": 0.4832319915294647, "learning_rate": 7.64383290296829e-05, "loss": 0.5477, "step": 895},
    {"epoch": 1.2448132780082988, "grad_norm": 0.4508809447288513, "learning_rate": 7.526669115444414e-05, "loss": 0.5738, "step": 900},
    {"epoch": 1.251728907330567, "grad_norm": 0.5219544768333435, "learning_rate": 7.409865875102704e-05, "loss": 0.5581, "step": 905},
    {"epoch": 1.2586445366528354, "grad_norm": 0.4636399745941162, "learning_rate": 7.293440208811435e-05, "loss": 0.5593, "step": 910},
    {"epoch": 1.2655601659751037, "grad_norm": 0.4862774908542633, "learning_rate": 7.177409088398425e-05, "loss": 0.5481, "step": 915},
    {"epoch": 1.272475795297372, "grad_norm": 0.5330405235290527, "learning_rate": 7.06178942817699e-05, "loss": 0.5595, "step": 920},
    {"epoch": 1.2793914246196403, "grad_norm": 0.49150362610816956, "learning_rate": 6.946598082480268e-05, "loss": 0.5429, "step": 925},
    {"epoch": 1.2863070539419086, "grad_norm": 0.5385686755180359, "learning_rate": 6.831851843204308e-05, "loss": 0.5692, "step": 930},
    {"epoch": 1.293222683264177, "grad_norm": 0.487090528011322, "learning_rate": 6.71756743736024e-05, "loss": 0.5484, "step": 935},
    {"epoch": 1.3001383125864454, "grad_norm": 0.5285480618476868, "learning_rate": 6.603761524635914e-05, "loss": 0.5563, "step": 940},
    {"epoch": 1.3070539419087137, "grad_norm": 0.48829564452171326, "learning_rate": 6.490450694967358e-05, "loss": 0.5606, "step": 945},
    {"epoch": 1.313969571230982, "grad_norm": 0.4818781912326813, "learning_rate": 6.377651466120391e-05, "loss": 0.5538, "step": 950},
    {"epoch": 1.3208852005532503, "grad_norm": 0.47652438282966614, "learning_rate": 6.265380281282762e-05, "loss": 0.5584, "step": 955},
    {"epoch": 1.3278008298755186, "grad_norm": 0.47311243414878845, "learning_rate": 6.15365350666718e-05, "loss": 0.5409, "step": 960},
    {"epoch": 1.334716459197787, "grad_norm": 0.46478375792503357, "learning_rate": 6.042487429125516e-05, "loss": 0.5554, "step": 965},
    {"epoch": 1.3416320885200554, "grad_norm": 0.4783475399017334, "learning_rate": 5.931898253774628e-05, "loss": 0.5465, "step": 970},
    {"epoch": 1.3485477178423237, "grad_norm": 0.48976966738700867, "learning_rate": 5.821902101634069e-05, "loss": 0.5563, "step": 975},
    {"epoch": 1.355463347164592, "grad_norm": 0.4838036298751831, "learning_rate": 5.7125150072760635e-05, "loss": 0.5628, "step": 980},
    {"epoch": 1.3623789764868603, "grad_norm": 0.46755191683769226, "learning_rate": 5.603752916488085e-05, "loss": 0.5472, "step": 985},
    {"epoch": 1.3692946058091287, "grad_norm": 0.50596022605896, "learning_rate": 5.4956316839483734e-05, "loss": 0.5632, "step": 990},
    {"epoch": 1.376210235131397, "grad_norm": 0.49392423033714294, "learning_rate": 5.388167070914738e-05, "loss": 0.5363, "step": 995},
    {"epoch": 1.3831258644536653, "grad_norm": 0.4692226052284241, "learning_rate": 5.281374742926987e-05, "loss": 0.536, "step": 1000},
    {"epoch": 1.3900414937759336, "grad_norm": 0.5283923745155334, "learning_rate": 5.175270267523278e-05, "loss": 0.5553, "step": 1005},
    {"epoch": 1.3969571230982019, "grad_norm": 0.47653043270111084, "learning_rate": 5.069869111970793e-05, "loss": 0.5492, "step": 1010},
    {"epoch": 1.4038727524204702, "grad_norm": 0.45839253067970276, "learning_rate": 4.965186641011013e-05, "loss": 0.5297, "step": 1015},
    {"epoch": 1.4107883817427385, "grad_norm": 0.4624306261539459, "learning_rate": 4.861238114619929e-05, "loss": 0.5384, "step": 1020},
    {"epoch": 1.417704011065007, "grad_norm": 0.48706522583961487, "learning_rate": 4.75803868578355e-05, "loss": 0.5369, "step": 1025},
    {"epoch": 1.4246196403872753, "grad_norm": 0.4311310350894928, "learning_rate": 4.655603398288979e-05, "loss": 0.5276, "step": 1030},
    {"epoch": 1.4315352697095436, "grad_norm": 0.47203701734542847, "learning_rate": 4.5539471845314304e-05, "loss": 0.5347, "step": 1035},
    {"epoch": 1.438450899031812, "grad_norm": 0.46674981713294983, "learning_rate": 4.453084863337471e-05, "loss": 0.5299, "step": 1040},
    {"epoch": 1.4453665283540802, "grad_norm": 0.42725497484207153, "learning_rate": 4.353031137804821e-05, "loss": 0.5369, "step": 1045},
    {"epoch": 1.4522821576763485, "grad_norm": 0.4412887990474701, "learning_rate": 4.253800593159029e-05, "loss": 0.5418, "step": 1050},
    {"epoch": 1.4591977869986168, "grad_norm": 0.45071831345558167, "learning_rate": 4.155407694627322e-05, "loss": 0.5551, "step": 1055},
    {"epoch": 1.4661134163208853, "grad_norm": 0.44687119126319885, "learning_rate": 4.057866785329959e-05, "loss": 0.5424, "step": 1060},
    {"epoch": 1.4730290456431536, "grad_norm": 0.45058682560920715, "learning_rate": 3.96119208418937e-05, "loss": 0.5524, "step": 1065},
    {"epoch": 1.479944674965422, "grad_norm": 0.5045320391654968, "learning_rate": 3.8653976838574104e-05, "loss": 0.5403, "step": 1070},
    {"epoch": 1.4868603042876902, "grad_norm": 0.4632035791873932, "learning_rate": 3.770497548661021e-05, "loss": 0.5346, "step": 1075},
    {"epoch": 1.4937759336099585, "grad_norm": 0.4648893177509308, "learning_rate": 3.676505512566597e-05, "loss": 0.5423, "step": 1080},
    {"epoch": 1.5006915629322268, "grad_norm": 0.4647049605846405, "learning_rate": 3.5834352771633475e-05, "loss": 0.5319, "step": 1085},
    {"epoch": 1.5076071922544951, "grad_norm": 0.45878130197525024, "learning_rate": 3.491300409665963e-05, "loss": 0.533, "step": 1090},
    {"epoch": 1.5145228215767634, "grad_norm": 0.4348316192626953, "learning_rate": 3.4001143409368773e-05, "loss": 0.5111, "step": 1095},
    {"epoch": 1.5214384508990317, "grad_norm": 0.47867557406425476, "learning_rate": 3.309890363528386e-05, "loss": 0.527, "step": 1100},
    {"epoch": 1.5283540802213, "grad_norm": 0.47667166590690613, "learning_rate": 3.220641629744947e-05, "loss": 0.5184, "step": 1105},
    {"epoch": 1.5352697095435683, "grad_norm": 0.4480649530887604, "learning_rate": 3.132381149725916e-05, "loss": 0.5218, "step": 1110},
    {"epoch": 1.5421853388658366, "grad_norm": 0.4330606460571289, "learning_rate": 3.0451217895489992e-05, "loss": 0.5317, "step": 1115},
    {"epoch": 1.5491009681881052, "grad_norm": 0.45356714725494385, "learning_rate": 2.9588762693547355e-05, "loss": 0.5307, "step": 1120},
    {"epoch": 1.5560165975103735, "grad_norm": 0.46869832277297974, "learning_rate": 2.8736571614922046e-05, "loss": 0.5231, "step": 1125},
    {"epoch": 1.5629322268326418, "grad_norm": 0.5013293623924255, "learning_rate": 2.7894768886863233e-05, "loss": 0.5272, "step": 1130},
    {"epoch": 1.56984785615491, "grad_norm": 0.44263824820518494, "learning_rate": 2.7063477222269306e-05, "loss": 0.53, "step": 1135},
    {"epoch": 1.5767634854771784, "grad_norm": 0.45062586665153503, "learning_rate": 2.6242817801799557e-05, "loss": 0.5116, "step": 1140},
    {"epoch": 1.583679114799447, "grad_norm": 0.4699437618255615, "learning_rate": 2.5432910256209187e-05, "loss": 0.5314, "step": 1145},
    {"epoch": 1.5905947441217152, "grad_norm": 0.45320945978164673, "learning_rate": 2.4633872648910252e-05, "loss": 0.5244, "step": 1150},
    {"epoch": 1.5975103734439835, "grad_norm": 0.45921820402145386, "learning_rate": 2.3845821458761063e-05, "loss": 0.5265, "step": 1155},
    {"epoch": 1.6044260027662518, "grad_norm": 0.4836861789226532, "learning_rate": 2.3068871563086757e-05, "loss": 0.5397, "step": 1160},
    {"epoch": 1.61134163208852, "grad_norm": 0.4829198122024536, "learning_rate": 2.230313622093295e-05, "loss": 0.5247, "step": 1165},
    {"epoch": 1.6182572614107884, "grad_norm": 0.42497992515563965, "learning_rate": 2.154872705655566e-05, "loss": 0.4998, "step": 1170},
    {"epoch": 1.6251728907330567, "grad_norm": 0.4427035450935364, "learning_rate": 2.0805754043149394e-05, "loss": 0.52, "step": 1175},
    {"epoch": 1.632088520055325, "grad_norm": 0.4457587003707886, "learning_rate": 2.0074325486815883e-05, "loss": 0.4931, "step": 1180},
    {"epoch": 1.6390041493775933, "grad_norm": 0.46790486574172974, "learning_rate": 1.9354548010775896e-05, "loss": 0.529, "step": 1185},
    {"epoch": 1.6459197786998616, "grad_norm": 0.4801090955734253, "learning_rate": 1.864652653982636e-05, "loss": 0.5188, "step": 1190},
    {"epoch": 1.65283540802213, "grad_norm": 0.4337711036205292, "learning_rate": 1.7950364285044996e-05, "loss": 0.5105, "step": 1195},
    {"epoch": 1.6597510373443982, "grad_norm": 0.45500195026397705, "learning_rate": 1.7266162728744993e-05, "loss": 0.5049, "step": 1200},
    {"epoch": 1.6666666666666665, "grad_norm": 0.4910541772842407, "learning_rate": 1.6594021609681344e-05, "loss": 0.5079, "step": 1205},
    {"epoch": 1.673582295988935, "grad_norm": 0.4468669295310974, "learning_rate": 1.5934038908511616e-05, "loss": 0.5135, "step": 1210},
    {"epoch": 1.6804979253112033, "grad_norm": 0.4427061975002289, "learning_rate": 1.5286310833512963e-05, "loss": 0.5019, "step": 1215},
    {"epoch": 1.6874135546334716, "grad_norm": 0.44807228446006775, "learning_rate": 1.4650931806557389e-05, "loss": 0.5121, "step": 1220},
    {"epoch": 1.69432918395574, "grad_norm": 0.45542779564857483, "learning_rate": 1.402799444934757e-05, "loss": 0.5183, "step": 1225},
    {"epoch": 1.7012448132780082, "grad_norm": 0.44287553429603577, "learning_rate": 1.3417589569914978e-05, "loss": 0.4962, "step": 1230},
    {"epoch": 1.7081604426002768, "grad_norm": 0.43979722261428833, "learning_rate": 1.2819806149382441e-05, "loss": 0.5065, "step": 1235},
    {"epoch": 1.715076071922545, "grad_norm": 0.4374731481075287, "learning_rate": 1.2234731328993055e-05, "loss": 0.5144, "step": 1240},
    {"epoch": 1.7219917012448134, "grad_norm": 0.4652646481990814, "learning_rate": 1.1662450397407188e-05, "loss": 0.5017, "step": 1245},
    {"epoch": 1.7289073305670817, "grad_norm": 0.4413568377494812, "learning_rate": 1.1103046778269687e-05, "loss": 0.5109, "step": 1250},
    {"epoch": 1.73582295988935, "grad_norm": 0.4495951235294342, "learning_rate": 1.0556602018048866e-05, "loss": 0.502, "step": 1255},
    {"epoch": 1.7427385892116183, "grad_norm": 0.4346306025981903, "learning_rate": 1.0023195774149119e-05, "loss": 0.4963, "step": 1260},
    {"epoch": 1.7496542185338866, "grad_norm": 0.4612555503845215, "learning_rate": 9.502905803299e-06, "loss": 0.5006, "step": 1265},
    {"epoch": 1.7565698478561549, "grad_norm": 0.46448782086372375, "learning_rate": 8.995807950216262e-06, "loss": 0.5195, "step": 1270},
    {"epoch": 1.7634854771784232, "grad_norm": 0.44382619857788086, "learning_rate": 8.501976136551749e-06, "loss": 0.5061, "step": 1275},
    {"epoch": 1.7704011065006915, "grad_norm": 0.45043641328811646, "learning_rate": 8.021482350113474e-06, "loss": 0.5254, "step": 1280},
    {"epoch": 1.7773167358229598, "grad_norm": 0.44958069920539856, "learning_rate": 7.554396634372707e-06, "loss": 0.5311, "step": 1285},
    {"epoch": 1.784232365145228, "grad_norm": 0.433776319026947, "learning_rate": 7.100787078253446e-06, "loss": 0.486, "step": 1290},
    {"epoch": 1.7911479944674964, "grad_norm": 0.41777148842811584, "learning_rate": 6.660719806206839e-06, "loss": 0.5108, "step": 1295},
    {"epoch": 1.798063623789765, "grad_norm": 0.4487934112548828, "learning_rate": 6.234258968571971e-06, "loss": 0.5006, "step": 1300},
    {"epoch": 1.8049792531120332, "grad_norm": 0.4544650614261627, "learning_rate": 5.821466732224412e-06, "loss": 0.5037, "step": 1305},
    {"epoch": 1.8118948824343015, "grad_norm": 0.41929152607917786, "learning_rate": 5.422403271513854e-06, "loss": 0.4978, "step": 1310},
    {"epoch": 1.8188105117565698, "grad_norm": 0.4613235294818878, "learning_rate": 5.0371267594923834e-06, "loss": 0.5209, "step": 1315},
    {"epoch": 1.8257261410788381, "grad_norm": 0.44448205828666687, "learning_rate": 4.66569335943422e-06, "loss": 0.5014, "step": 1320},
    {"epoch": 1.8326417704011067, "grad_norm": 0.43924444913864136, "learning_rate": 4.3081572166486675e-06, "loss": 0.5247, "step": 1325},
    {"epoch": 1.839557399723375, "grad_norm": 0.44769856333732605, "learning_rate": 3.964570450587113e-06, "loss": 0.5005, "step": 1330},
    {"epoch": 1.8464730290456433, "grad_norm": 0.4331699311733246, "learning_rate": 3.6349831472453743e-06, "loss": 0.5041, "step": 1335},
    {"epoch": 1.8533886583679116, "grad_norm": 0.43336230516433716, "learning_rate": 3.3194433518624614e-06, "loss": 0.5162, "step": 1340},
    {"epoch": 1.8603042876901799, "grad_norm": 0.4407755434513092, "learning_rate": 3.017997061916833e-06, "loss": 0.501, "step": 1345},
    {"epoch": 1.8672199170124482, "grad_norm": 0.43583956360816956, "learning_rate": 2.7306882204211626e-06, "loss": 0.5009, "step": 1350},
    {"epoch": 1.8741355463347165, "grad_norm": 0.42576801776885986, "learning_rate": 2.4575587095166054e-06, "loss": 0.5128, "step": 1355},
    {"epoch": 1.8810511756569848, "grad_norm": 0.42050647735595703, "learning_rate": 2.198648344367449e-06, "loss": 0.5069, "step": 1360},
    {"epoch": 1.887966804979253, "grad_norm": 0.4313651919364929, "learning_rate": 1.953994867357134e-06, "loss": 0.5055, "step": 1365},
    {"epoch": 1.8948824343015214, "grad_norm": 0.43365243077278137, "learning_rate": 1.7236339425863446e-06, "loss": 0.5145, "step": 1370},
    {"epoch": 1.9017980636237897, "grad_norm": 0.4598560035228729, "learning_rate": 1.507599150674177e-06, "loss": 0.5191, "step": 1375},
    {"epoch": 1.908713692946058, "grad_norm": 0.4304969310760498, "learning_rate": 1.3059219838629234e-06, "loss": 0.501, "step": 1380},
    {"epoch": 1.9156293222683263, "grad_norm": 0.44647660851478577, "learning_rate": 1.11863184142732e-06, "loss": 0.5063, "step": 1385},
    {"epoch": 1.9225449515905948, "grad_norm": 0.40728816390037537, "learning_rate": 9.457560253889219e-07, "loss": 0.5009, "step": 1390},
    {"epoch": 1.929460580912863, "grad_norm": 0.4395773410797119, "learning_rate": 7.873197365361407e-07, "loss": 0.4993, "step": 1395},
    {"epoch": 1.9363762102351314, "grad_norm": 0.44724011421203613, "learning_rate": 6.433460707506722e-07, "loss": 0.5065, "step": 1400},
    {"epoch": 1.9432918395573997, "grad_norm": 0.41976112127304077, "learning_rate": 5.138560156407124e-07, "loss": 0.5058, "step": 1405},
    {"epoch": 1.950207468879668, "grad_norm": 0.43243739008903503, "learning_rate": 3.988684474814819e-07, "loss": 0.4919, "step": 1410},
    {"epoch": 1.9571230982019365, "grad_norm": 0.4788164496421814, "learning_rate": 2.984001284635496e-07, "loss": 0.5128, "step": 1415},
    {"epoch": 1.9640387275242048, "grad_norm": 0.42695632576942444, "learning_rate": 2.1246570424940936e-07, "loss": 0.5116, "step": 1420},
    {"epoch": 1.9709543568464731, "grad_norm": 0.40510523319244385, "learning_rate": 1.4107770183845458e-07, "loss": 0.5058, "step": 1425},
    {"epoch": 1.9778699861687414, "grad_norm": 0.4062838554382324, "learning_rate": 8.424652774089436e-08, "loss": 0.5128, "step": 1430},
    {"epoch": 1.9847856154910097, "grad_norm": 0.4485253393650055, "learning_rate": 4.198046646075593e-08, "loss": 0.5062, "step": 1435},
    {"epoch": 1.991701244813278, "grad_norm": 0.43608593940734863, "learning_rate": 1.4285679288228437e-08, "loss": 0.4919, "step": 1440},
    {"epoch": 1.9986168741355463, "grad_norm": 0.4265099763870239, "learning_rate": 1.166203401481436e-09, "loss": 0.5001, "step": 1445},
    {"epoch": 2.0, "step": 1446, "total_flos": 6.011530730704732e+18, "train_loss": 0.6795949197244182, "train_runtime": 13742.7497, "train_samples_per_second": 6.734, "train_steps_per_second": 0.105}
  ],
  "logging_steps": 5,
  "max_steps": 1446,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6.011530730704732e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}