{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1446, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013831258644536654, "grad_norm": 133.08836364746094, "learning_rate": 0.0, "loss": 1.6747, "step": 1 }, { "epoch": 0.006915629322268326, "grad_norm": 10.979754447937012, "learning_rate": 5.517241379310345e-06, "loss": 1.5287, "step": 5 }, { "epoch": 0.013831258644536652, "grad_norm": 5.598673343658447, "learning_rate": 1.2413793103448277e-05, "loss": 1.1371, "step": 10 }, { "epoch": 0.02074688796680498, "grad_norm": 4.170475959777832, "learning_rate": 1.9310344827586207e-05, "loss": 1.0279, "step": 15 }, { "epoch": 0.027662517289073305, "grad_norm": 3.909233331680298, "learning_rate": 2.620689655172414e-05, "loss": 0.9947, "step": 20 }, { "epoch": 0.034578146611341634, "grad_norm": 3.94997501373291, "learning_rate": 3.310344827586207e-05, "loss": 0.9337, "step": 25 }, { "epoch": 0.04149377593360996, "grad_norm": 3.896550178527832, "learning_rate": 4e-05, "loss": 0.8761, "step": 30 }, { "epoch": 0.048409405255878286, "grad_norm": 3.581073522567749, "learning_rate": 4.689655172413793e-05, "loss": 0.8054, "step": 35 }, { "epoch": 0.05532503457814661, "grad_norm": 2.294735908508301, "learning_rate": 5.379310344827586e-05, "loss": 0.7511, "step": 40 }, { "epoch": 0.06224066390041494, "grad_norm": 1.9514353275299072, "learning_rate": 6.068965517241379e-05, "loss": 0.7493, "step": 45 }, { "epoch": 0.06915629322268327, "grad_norm": 2.1982648372650146, "learning_rate": 6.758620689655173e-05, "loss": 0.8087, "step": 50 }, { "epoch": 0.07607192254495158, "grad_norm": 2.2902512550354004, "learning_rate": 7.448275862068966e-05, "loss": 0.7906, "step": 55 }, { "epoch": 0.08298755186721991, "grad_norm": 1.9472178220748901, "learning_rate": 8.137931034482759e-05, "loss": 0.7872, "step": 60 }, { "epoch": 0.08990318118948824, "grad_norm": 1.7574316263198853, "learning_rate": 8.827586206896552e-05, "loss": 0.7898, "step": 65 }, { "epoch": 0.09681881051175657, "grad_norm": 1.8785020112991333, "learning_rate": 9.517241379310345e-05, "loss": 0.8166, "step": 70 }, { "epoch": 0.1037344398340249, "grad_norm": 1.5703641176223755, "learning_rate": 0.0001020689655172414, "loss": 0.7987, "step": 75 }, { "epoch": 0.11065006915629322, "grad_norm": 1.4895148277282715, "learning_rate": 0.00010896551724137931, "loss": 0.8054, "step": 80 }, { "epoch": 0.11756569847856155, "grad_norm": 1.7236794233322144, "learning_rate": 0.00011586206896551725, "loss": 0.8111, "step": 85 }, { "epoch": 0.12448132780082988, "grad_norm": 1.788547396659851, "learning_rate": 0.00012275862068965518, "loss": 0.8067, "step": 90 }, { "epoch": 0.1313969571230982, "grad_norm": 1.6218682527542114, "learning_rate": 0.0001296551724137931, "loss": 0.8175, "step": 95 }, { "epoch": 0.13831258644536654, "grad_norm": 1.438413381576538, "learning_rate": 0.00013655172413793104, "loss": 0.8485, "step": 100 }, { "epoch": 0.14522821576763487, "grad_norm": 1.6937086582183838, "learning_rate": 0.00014344827586206896, "loss": 0.8562, "step": 105 }, { "epoch": 0.15214384508990317, "grad_norm": 1.4097000360488892, "learning_rate": 0.0001503448275862069, "loss": 0.851, "step": 110 }, { "epoch": 0.1590594744121715, "grad_norm": 2.706479072570801, "learning_rate": 0.00015724137931034485, "loss": 0.8461, "step": 115 }, { "epoch": 0.16597510373443983, "grad_norm": 2.0034353733062744, "learning_rate": 0.00016413793103448276, "loss": 0.8548, "step": 120 }, { "epoch": 0.17289073305670816, "grad_norm": 1.3242019414901733, "learning_rate": 0.0001710344827586207, "loss": 0.8553, "step": 125 }, { "epoch": 0.1798063623789765, "grad_norm": 2.6091411113739014, "learning_rate": 0.00017793103448275862, "loss": 0.848, "step": 130 }, { "epoch": 0.18672199170124482, "grad_norm": 1.3679989576339722, "learning_rate": 0.00018482758620689654, "loss": 0.8716, "step": 135 }, { "epoch": 0.19363762102351315, "grad_norm": 1.323454737663269, "learning_rate": 0.0001917241379310345, "loss": 0.8488, "step": 140 }, { "epoch": 0.20055325034578148, "grad_norm": 2.7075045108795166, "learning_rate": 0.00019862068965517243, "loss": 0.9631, "step": 145 }, { "epoch": 0.2074688796680498, "grad_norm": 2.2683520317077637, "learning_rate": 0.0001999953352135947, "loss": 0.9498, "step": 150 }, { "epoch": 0.2143845089903181, "grad_norm": 8.38696002960205, "learning_rate": 0.0001999763852647035, "loss": 0.891, "step": 155 }, { "epoch": 0.22130013831258644, "grad_norm": 24.307083129882812, "learning_rate": 0.00019994286136445976, "loss": 0.9308, "step": 160 }, { "epoch": 0.22821576763485477, "grad_norm": 2.0212178230285645, "learning_rate": 0.0001998947683997744, "loss": 0.8901, "step": 165 }, { "epoch": 0.2351313969571231, "grad_norm": 1.2989600896835327, "learning_rate": 0.00019983211338134828, "loss": 0.8546, "step": 170 }, { "epoch": 0.24204702627939143, "grad_norm": 1.2097629308700562, "learning_rate": 0.00019975490544265012, "loss": 0.8543, "step": 175 }, { "epoch": 0.24896265560165975, "grad_norm": 30.946725845336914, "learning_rate": 0.00019966315583858516, "loss": 1.6915, "step": 180 }, { "epoch": 0.25587828492392806, "grad_norm": 2.118316173553467, "learning_rate": 0.0001995568779438545, "loss": 1.0665, "step": 185 }, { "epoch": 0.2627939142461964, "grad_norm": 3.086249589920044, "learning_rate": 0.00019943608725100532, "loss": 0.9988, "step": 190 }, { "epoch": 0.2697095435684647, "grad_norm": 1.1766245365142822, "learning_rate": 0.00019930080136817255, "loss": 0.9133, "step": 195 }, { "epoch": 0.2766251728907331, "grad_norm": 1.33064603805542, "learning_rate": 0.00019915104001651203, "loss": 0.8809, "step": 200 }, { "epoch": 0.2835408022130014, "grad_norm": 1.6439151763916016, "learning_rate": 0.00019898682502732568, "loss": 0.8967, "step": 205 }, { "epoch": 0.29045643153526973, "grad_norm": 1.3271840810775757, "learning_rate": 0.00019880818033887916, "loss": 0.8866, "step": 210 }, { "epoch": 0.29737206085753803, "grad_norm": 1.1745598316192627, "learning_rate": 0.0001986151319929121, "loss": 0.8598, "step": 215 }, { "epoch": 0.30428769017980634, "grad_norm": 1.0390416383743286, "learning_rate": 0.00019840770813084205, "loss": 0.8434, "step": 220 }, { "epoch": 0.3112033195020747, "grad_norm": 1.0289931297302246, "learning_rate": 0.00019818593898966212, "loss": 0.8765, "step": 225 }, { "epoch": 0.318118948824343, "grad_norm": 1.1883145570755005, "learning_rate": 0.00019794985689753337, "loss": 0.8859, "step": 230 }, { "epoch": 0.32503457814661135, "grad_norm": 1.2715511322021484, "learning_rate": 0.00019769949626907186, "loss": 0.8626, "step": 235 }, { "epoch": 0.33195020746887965, "grad_norm": 1.287985920906067, "learning_rate": 0.00019743489360033231, "loss": 0.8805, "step": 240 }, { "epoch": 0.338865836791148, "grad_norm": 1.1524231433868408, "learning_rate": 0.00019715608746348763, "loss": 0.8588, "step": 245 }, { "epoch": 0.3457814661134163, "grad_norm": 1.0479854345321655, "learning_rate": 0.00019686311850120625, "loss": 0.858, "step": 250 }, { "epoch": 0.35269709543568467, "grad_norm": 0.9751342535018921, "learning_rate": 0.0001965560294207274, "loss": 0.8558, "step": 255 }, { "epoch": 0.359612724757953, "grad_norm": 0.9835847020149231, "learning_rate": 0.00019623486498763555, "loss": 0.8755, "step": 260 }, { "epoch": 0.3665283540802213, "grad_norm": 3.7135465145111084, "learning_rate": 0.00019589967201933471, "loss": 0.8584, "step": 265 }, { "epoch": 0.37344398340248963, "grad_norm": 1.0380736589431763, "learning_rate": 0.00019555049937822384, "loss": 0.8544, "step": 270 }, { "epoch": 0.38035961272475793, "grad_norm": 1.0324746370315552, "learning_rate": 0.00019518739796457366, "loss": 0.8673, "step": 275 }, { "epoch": 0.3872752420470263, "grad_norm": 1.0108332633972168, "learning_rate": 0.00019481042070910705, "loss": 0.8443, "step": 280 }, { "epoch": 0.3941908713692946, "grad_norm": 0.8845415711402893, "learning_rate": 0.00019441962256528292, "loss": 0.857, "step": 285 }, { "epoch": 0.40110650069156295, "grad_norm": 0.8270080089569092, "learning_rate": 0.00019401506050128556, "loss": 0.8583, "step": 290 }, { "epoch": 0.40802213001383125, "grad_norm": 0.8553124070167542, "learning_rate": 0.00019359679349172004, "loss": 0.8291, "step": 295 }, { "epoch": 0.4149377593360996, "grad_norm": 0.9521012306213379, "learning_rate": 0.00019316488250901534, "loss": 0.8486, "step": 300 }, { "epoch": 0.4218533886583679, "grad_norm": 0.8840038776397705, "learning_rate": 0.00019271939051453612, "loss": 0.8258, "step": 305 }, { "epoch": 0.4287690179806362, "grad_norm": 1.223691463470459, "learning_rate": 0.00019226038244940464, "loss": 0.8142, "step": 310 }, { "epoch": 0.43568464730290457, "grad_norm": 0.8988921642303467, "learning_rate": 0.00019178792522503394, "loss": 0.8611, "step": 315 }, { "epoch": 0.4426002766251729, "grad_norm": 0.813004732131958, "learning_rate": 0.000191302087713374, "loss": 0.8208, "step": 320 }, { "epoch": 0.44951590594744123, "grad_norm": 0.8524804711341858, "learning_rate": 0.00019080294073687193, "loss": 0.8393, "step": 325 }, { "epoch": 0.45643153526970953, "grad_norm": 0.9394051432609558, "learning_rate": 0.000190290557058148, "loss": 0.8567, "step": 330 }, { "epoch": 0.4633471645919779, "grad_norm": 0.9042031168937683, "learning_rate": 0.00018976501136938864, "loss": 0.8387, "step": 335 }, { "epoch": 0.4702627939142462, "grad_norm": 0.7205588221549988, "learning_rate": 0.00018922638028145828, "loss": 0.8231, "step": 340 }, { "epoch": 0.47717842323651455, "grad_norm": 0.798876941204071, "learning_rate": 0.0001886747423127316, "loss": 0.8254, "step": 345 }, { "epoch": 0.48409405255878285, "grad_norm": 0.7704493403434753, "learning_rate": 0.00018811017787764747, "loss": 0.8244, "step": 350 }, { "epoch": 0.49100968188105115, "grad_norm": 0.8552532196044922, "learning_rate": 0.00018753276927498659, "loss": 0.8347, "step": 355 }, { "epoch": 0.4979253112033195, "grad_norm": 0.7718790173530579, "learning_rate": 0.00018694260067587463, "loss": 0.7962, "step": 360 }, { "epoch": 0.5048409405255878, "grad_norm": 0.888421356678009, "learning_rate": 0.00018633975811151223, "loss": 0.8284, "step": 365 }, { "epoch": 0.5117565698478561, "grad_norm": 0.8525059819221497, "learning_rate": 0.00018572432946063367, "loss": 0.8241, "step": 370 }, { "epoch": 0.5186721991701245, "grad_norm": 0.6882660388946533, "learning_rate": 0.00018509640443669682, "loss": 0.8001, "step": 375 }, { "epoch": 0.5255878284923928, "grad_norm": 0.7356276512145996, "learning_rate": 0.00018445607457480493, "loss": 0.8177, "step": 380 }, { "epoch": 0.5325034578146611, "grad_norm": 0.7074828147888184, "learning_rate": 0.0001838034332183634, "loss": 0.837, "step": 385 }, { "epoch": 0.5394190871369294, "grad_norm": 0.7650839686393738, "learning_rate": 0.0001831385755054726, "loss": 0.8219, "step": 390 }, { "epoch": 0.5463347164591977, "grad_norm": 0.6335296630859375, "learning_rate": 0.00018246159835505932, "loss": 0.8146, "step": 395 }, { "epoch": 0.5532503457814661, "grad_norm": 0.8082166314125061, "learning_rate": 0.0001817726004527485, "loss": 0.8083, "step": 400 }, { "epoch": 0.5601659751037344, "grad_norm": 0.6119678616523743, "learning_rate": 0.0001810716822364774, "loss": 0.7853, "step": 405 }, { "epoch": 0.5670816044260027, "grad_norm": 0.7896009683609009, "learning_rate": 0.00018035894588185438, "loss": 0.7868, "step": 410 }, { "epoch": 0.573997233748271, "grad_norm": 0.7086299657821655, "learning_rate": 0.0001796344952872643, "loss": 0.8234, "step": 415 }, { "epoch": 0.5809128630705395, "grad_norm": 0.689249575138092, "learning_rate": 0.00017889843605872305, "loss": 0.7917, "step": 420 }, { "epoch": 0.5878284923928078, "grad_norm": 0.6687580943107605, "learning_rate": 0.0001781508754944827, "loss": 0.7956, "step": 425 }, { "epoch": 0.5947441217150761, "grad_norm": 0.7054949998855591, "learning_rate": 0.0001773919225693903, "loss": 0.7979, "step": 430 }, { "epoch": 0.6016597510373444, "grad_norm": 0.8244301676750183, "learning_rate": 0.00017662168791900232, "loss": 0.7949, "step": 435 }, { "epoch": 0.6085753803596127, "grad_norm": 0.7373748421669006, "learning_rate": 0.00017584028382345654, "loss": 0.7742, "step": 440 }, { "epoch": 0.6154910096818811, "grad_norm": 0.716790497303009, "learning_rate": 0.00017504782419110497, "loss": 0.8082, "step": 445 }, { "epoch": 0.6224066390041494, "grad_norm": 0.7063632011413574, "learning_rate": 0.00017424442454190862, "loss": 0.7859, "step": 450 }, { "epoch": 0.6293222683264177, "grad_norm": 0.8756738901138306, "learning_rate": 0.00017343020199059783, "loss": 0.791, "step": 455 }, { "epoch": 0.636237897648686, "grad_norm": 0.667121946811676, "learning_rate": 0.0001726052752296001, "loss": 0.8044, "step": 460 }, { "epoch": 0.6431535269709544, "grad_norm": 0.7018240690231323, "learning_rate": 0.00017176976451173758, "loss": 0.7829, "step": 465 }, { "epoch": 0.6500691562932227, "grad_norm": 0.6935915350914001, "learning_rate": 0.00017092379163269764, "loss": 0.7975, "step": 470 }, { "epoch": 0.656984785615491, "grad_norm": 0.6710845232009888, "learning_rate": 0.00017006747991327796, "loss": 0.7777, "step": 475 }, { "epoch": 0.6639004149377593, "grad_norm": 0.7099134922027588, "learning_rate": 0.00016920095418140977, "loss": 0.7755, "step": 480 }, { "epoch": 0.6708160442600276, "grad_norm": 0.652553915977478, "learning_rate": 0.00016832434075396101, "loss": 0.7802, "step": 485 }, { "epoch": 0.677731673582296, "grad_norm": 0.6347463130950928, "learning_rate": 0.00016743776741832292, "loss": 0.7814, "step": 490 }, { "epoch": 0.6846473029045643, "grad_norm": 0.5948991179466248, "learning_rate": 0.00016654136341378157, "loss": 0.7704, "step": 495 }, { "epoch": 0.6915629322268326, "grad_norm": 0.6586953401565552, "learning_rate": 0.00016563525941267845, "loss": 0.7781, "step": 500 }, { "epoch": 0.6984785615491009, "grad_norm": 0.6398725509643555, "learning_rate": 0.00016471958750136176, "loss": 0.7707, "step": 505 }, { "epoch": 0.7053941908713693, "grad_norm": 0.6144907474517822, "learning_rate": 0.00016379448116093156, "loss": 0.7714, "step": 510 }, { "epoch": 0.7123098201936376, "grad_norm": 0.6330462694168091, "learning_rate": 0.00016286007524778185, "loss": 0.7653, "step": 515 }, { "epoch": 0.719225449515906, "grad_norm": 0.5762836933135986, "learning_rate": 0.00016191650597394198, "loss": 0.7715, "step": 520 }, { "epoch": 0.7261410788381742, "grad_norm": 0.578916609287262, "learning_rate": 0.00016096391088722047, "loss": 0.785, "step": 525 }, { "epoch": 0.7330567081604425, "grad_norm": 0.5828131437301636, "learning_rate": 0.0001600024288511541, "loss": 0.7424, "step": 530 }, { "epoch": 0.739972337482711, "grad_norm": 0.5606003999710083, "learning_rate": 0.00015903220002476515, "loss": 0.7782, "step": 535 }, { "epoch": 0.7468879668049793, "grad_norm": 0.5754717588424683, "learning_rate": 0.0001580533658421302, "loss": 0.7865, "step": 540 }, { "epoch": 0.7538035961272476, "grad_norm": 0.680016279220581, "learning_rate": 0.0001570660689917623, "loss": 0.7637, "step": 545 }, { "epoch": 0.7607192254495159, "grad_norm": 0.6306326985359192, "learning_rate": 0.00015607045339581096, "loss": 0.7528, "step": 550 }, { "epoch": 0.7676348547717843, "grad_norm": 0.5506445169448853, "learning_rate": 0.00015506666418908203, "loss": 0.767, "step": 555 }, { "epoch": 0.7745504840940526, "grad_norm": 0.6151379346847534, "learning_rate": 0.00015405484769788073, "loss": 0.7511, "step": 560 }, { "epoch": 0.7814661134163209, "grad_norm": 0.6520763635635376, "learning_rate": 0.00015303515141868116, "loss": 0.7529, "step": 565 }, { "epoch": 0.7883817427385892, "grad_norm": 0.5401438474655151, "learning_rate": 0.00015200772399662514, "loss": 0.7754, "step": 570 }, { "epoch": 0.7952973720608575, "grad_norm": 0.6025466322898865, "learning_rate": 0.00015097271520385366, "loss": 0.7577, "step": 575 }, { "epoch": 0.8022130013831259, "grad_norm": 0.6601846218109131, "learning_rate": 0.00014993027591767396, "loss": 0.7406, "step": 580 }, { "epoch": 0.8091286307053942, "grad_norm": 0.6068835854530334, "learning_rate": 0.0001488805580985655, "loss": 0.764, "step": 585 }, { "epoch": 0.8160442600276625, "grad_norm": 0.586649477481842, "learning_rate": 0.00014782371476802824, "loss": 0.7547, "step": 590 }, { "epoch": 0.8229598893499308, "grad_norm": 0.4942811131477356, "learning_rate": 0.00014675989998627598, "loss": 0.7539, "step": 595 }, { "epoch": 0.8298755186721992, "grad_norm": 0.5191232562065125, "learning_rate": 0.00014568926882977832, "loss": 0.75, "step": 600 }, { "epoch": 0.8367911479944675, "grad_norm": 0.5124319195747375, "learning_rate": 0.00014461197736865481, "loss": 0.7207, "step": 605 }, { "epoch": 0.8437067773167358, "grad_norm": 0.5561709403991699, "learning_rate": 0.00014352818264392364, "loss": 0.7504, "step": 610 }, { "epoch": 0.8506224066390041, "grad_norm": 0.5596902370452881, "learning_rate": 0.00014243804264460957, "loss": 0.7634, "step": 615 }, { "epoch": 0.8575380359612724, "grad_norm": 0.5537763237953186, "learning_rate": 0.00014134171628471276, "loss": 0.7293, "step": 620 }, { "epoch": 0.8644536652835408, "grad_norm": 0.6073982119560242, "learning_rate": 0.00014023936338004373, "loss": 0.7358, "step": 625 }, { "epoch": 0.8713692946058091, "grad_norm": 0.5265364050865173, "learning_rate": 0.00013913114462492601, "loss": 0.7415, "step": 630 }, { "epoch": 0.8782849239280774, "grad_norm": 0.5465463399887085, "learning_rate": 0.00013801722156877143, "loss": 0.7351, "step": 635 }, { "epoch": 0.8852005532503457, "grad_norm": 0.5381340384483337, "learning_rate": 0.00013689775659253006, "loss": 0.7403, "step": 640 }, { "epoch": 0.8921161825726142, "grad_norm": 0.5682520270347595, "learning_rate": 0.00013577291288501952, "loss": 0.7299, "step": 645 }, { "epoch": 0.8990318118948825, "grad_norm": 0.538758397102356, "learning_rate": 0.00013464285441913636, "loss": 0.7501, "step": 650 }, { "epoch": 0.9059474412171508, "grad_norm": 0.6069577932357788, "learning_rate": 0.00013350774592795292, "loss": 0.7373, "step": 655 }, { "epoch": 0.9128630705394191, "grad_norm": 0.5356409549713135, "learning_rate": 0.0001323677528807036, "loss": 0.7343, "step": 660 }, { "epoch": 0.9197786998616874, "grad_norm": 0.5176509618759155, "learning_rate": 0.00013122304145866381, "loss": 0.7298, "step": 665 }, { "epoch": 0.9266943291839558, "grad_norm": 0.6070849895477295, "learning_rate": 0.00013007377853092503, "loss": 0.7352, "step": 670 }, { "epoch": 0.9336099585062241, "grad_norm": 0.5752330422401428, "learning_rate": 0.00012892013163006962, "loss": 0.7323, "step": 675 }, { "epoch": 0.9405255878284924, "grad_norm": 0.550092339515686, "learning_rate": 0.00012776226892774903, "loss": 0.7437, "step": 680 }, { "epoch": 0.9474412171507607, "grad_norm": 0.5363165736198425, "learning_rate": 0.00012660035921016854, "loss": 0.7199, "step": 685 }, { "epoch": 0.9543568464730291, "grad_norm": 0.5486807823181152, "learning_rate": 0.00012543457185348298, "loss": 0.7159, "step": 690 }, { "epoch": 0.9612724757952974, "grad_norm": 0.6245793104171753, "learning_rate": 0.00012426507679910576, "loss": 0.7295, "step": 695 }, { "epoch": 0.9681881051175657, "grad_norm": 0.5306342244148254, "learning_rate": 0.00012309204452893606, "loss": 0.7239, "step": 700 }, { "epoch": 0.975103734439834, "grad_norm": 0.5488132238388062, "learning_rate": 0.00012191564604050683, "loss": 0.7027, "step": 705 }, { "epoch": 0.9820193637621023, "grad_norm": 0.5506658554077148, "learning_rate": 0.00012073605282205802, "loss": 0.7397, "step": 710 }, { "epoch": 0.9889349930843707, "grad_norm": 0.5438380241394043, "learning_rate": 0.00011955343682753794, "loss": 0.7241, "step": 715 }, { "epoch": 0.995850622406639, "grad_norm": 0.5309740900993347, "learning_rate": 0.0001183679704515368, "loss": 0.7209, "step": 720 }, { "epoch": 1.0027662517289073, "grad_norm": 0.6508172750473022, "learning_rate": 0.00011717982650415624, "loss": 0.6672, "step": 725 }, { "epoch": 1.0096818810511756, "grad_norm": 0.6566136479377747, "learning_rate": 0.00011598917818581791, "loss": 0.5846, "step": 730 }, { "epoch": 1.016597510373444, "grad_norm": 0.5107012987136841, "learning_rate": 0.00011479619906201557, "loss": 0.5685, "step": 735 }, { "epoch": 1.0235131396957122, "grad_norm": 0.48931655287742615, "learning_rate": 0.00011360106303801364, "loss": 0.5846, "step": 740 }, { "epoch": 1.0304287690179805, "grad_norm": 0.5219966769218445, "learning_rate": 0.00011240394433349637, "loss": 0.5588, "step": 745 }, { "epoch": 1.037344398340249, "grad_norm": 0.5277289152145386, "learning_rate": 0.00011120501745717112, "loss": 0.5727, "step": 750 }, { "epoch": 1.0442600276625174, "grad_norm": 0.5424395799636841, "learning_rate": 0.00011000445718132966, "loss": 0.566, "step": 755 }, { "epoch": 1.0511756569847857, "grad_norm": 0.5607298612594604, "learning_rate": 0.00010880243851637078, "loss": 0.57, "step": 760 }, { "epoch": 1.058091286307054, "grad_norm": 0.5789066553115845, "learning_rate": 0.00010759913668528841, "loss": 0.5659, "step": 765 }, { "epoch": 1.0650069156293223, "grad_norm": 0.5418556928634644, "learning_rate": 0.00010639472709812861, "loss": 0.573, "step": 770 }, { "epoch": 1.0719225449515906, "grad_norm": 0.49530017375946045, "learning_rate": 0.0001051893853264195, "loss": 0.5695, "step": 775 }, { "epoch": 1.0788381742738589, "grad_norm": 0.5037497878074646, "learning_rate": 0.00010398328707757738, "loss": 0.5651, "step": 780 }, { "epoch": 1.0857538035961272, "grad_norm": 0.5208268761634827, "learning_rate": 0.00010277660816929313, "loss": 0.5883, "step": 785 }, { "epoch": 1.0926694329183957, "grad_norm": 0.5543485283851624, "learning_rate": 0.00010156952450390269, "loss": 0.5537, "step": 790 }, { "epoch": 1.099585062240664, "grad_norm": 0.6337606310844421, "learning_rate": 0.00010036221204274512, "loss": 0.5649, "step": 795 }, { "epoch": 1.1065006915629323, "grad_norm": 0.4707985818386078, "learning_rate": 9.915484678051175e-05, "loss": 0.5471, "step": 800 }, { "epoch": 1.1134163208852006, "grad_norm": 0.5109753608703613, "learning_rate": 9.794760471959116e-05, "loss": 0.5663, "step": 805 }, { "epoch": 1.120331950207469, "grad_norm": 0.5100296139717102, "learning_rate": 9.674066184441221e-05, "loss": 0.5713, "step": 810 }, { "epoch": 1.1272475795297372, "grad_norm": 0.5414464473724365, "learning_rate": 9.553419409579035e-05, "loss": 0.5611, "step": 815 }, { "epoch": 1.1341632088520055, "grad_norm": 0.5717695355415344, "learning_rate": 9.432837734527995e-05, "loss": 0.5817, "step": 820 }, { "epoch": 1.1410788381742738, "grad_norm": 0.5524587035179138, "learning_rate": 9.312338736953683e-05, "loss": 0.5757, "step": 825 }, { "epoch": 1.147994467496542, "grad_norm": 0.5250436663627625, "learning_rate": 9.191939982469458e-05, "loss": 0.569, "step": 830 }, { "epoch": 1.1549100968188104, "grad_norm": 0.4710783064365387, "learning_rate": 9.071659022075849e-05, "loss": 0.565, "step": 835 }, { "epoch": 1.161825726141079, "grad_norm": 0.4436459541320801, "learning_rate": 8.951513389602076e-05, "loss": 0.5666, "step": 840 }, { "epoch": 1.1687413554633472, "grad_norm": 0.5180997252464294, "learning_rate": 8.831520599150083e-05, "loss": 0.5595, "step": 845 }, { "epoch": 1.1756569847856155, "grad_norm": 0.49515190720558167, "learning_rate": 8.71169814254142e-05, "loss": 0.584, "step": 850 }, { "epoch": 1.1825726141078838, "grad_norm": 0.5082345008850098, "learning_rate": 8.592063486767406e-05, "loss": 0.5687, "step": 855 }, { "epoch": 1.1894882434301521, "grad_norm": 0.5054699182510376, "learning_rate": 8.472634071442896e-05, "loss": 0.5796, "step": 860 }, { "epoch": 1.1964038727524204, "grad_norm": 0.4742473065853119, "learning_rate": 8.353427306264032e-05, "loss": 0.5575, "step": 865 }, { "epoch": 1.2033195020746887, "grad_norm": 0.46908038854599, "learning_rate": 8.23446056847037e-05, "loss": 0.5654, "step": 870 }, { "epoch": 1.210235131396957, "grad_norm": 0.4599679708480835, "learning_rate": 8.115751200311725e-05, "loss": 0.5452, "step": 875 }, { "epoch": 1.2171507607192256, "grad_norm": 0.48915913701057434, "learning_rate": 7.99731650652013e-05, "loss": 0.5667, "step": 880 }, { "epoch": 1.2240663900414939, "grad_norm": 0.5099295377731323, "learning_rate": 7.879173751787259e-05, "loss": 0.5673, "step": 885 }, { "epoch": 1.2309820193637622, "grad_norm": 0.49911564588546753, "learning_rate": 7.761340158247674e-05, "loss": 0.5695, "step": 890 }, { "epoch": 1.2378976486860305, "grad_norm": 0.4832319915294647, "learning_rate": 7.64383290296829e-05, "loss": 0.5477, "step": 895 }, { "epoch": 1.2448132780082988, "grad_norm": 0.4508809447288513, "learning_rate": 7.526669115444414e-05, "loss": 0.5738, "step": 900 }, { "epoch": 1.251728907330567, "grad_norm": 0.5219544768333435, "learning_rate": 7.409865875102704e-05, "loss": 0.5581, "step": 905 }, { "epoch": 1.2586445366528354, "grad_norm": 0.4636399745941162, "learning_rate": 7.293440208811435e-05, "loss": 0.5593, "step": 910 }, { "epoch": 1.2655601659751037, "grad_norm": 0.4862774908542633, "learning_rate": 7.177409088398425e-05, "loss": 0.5481, "step": 915 }, { "epoch": 1.272475795297372, "grad_norm": 0.5330405235290527, "learning_rate": 7.06178942817699e-05, "loss": 0.5595, "step": 920 }, { "epoch": 1.2793914246196403, "grad_norm": 0.49150362610816956, "learning_rate": 6.946598082480268e-05, "loss": 0.5429, "step": 925 }, { "epoch": 1.2863070539419086, "grad_norm": 0.5385686755180359, "learning_rate": 6.831851843204308e-05, "loss": 0.5692, "step": 930 }, { "epoch": 1.293222683264177, "grad_norm": 0.487090528011322, "learning_rate": 6.71756743736024e-05, "loss": 0.5484, "step": 935 }, { "epoch": 1.3001383125864454, "grad_norm": 0.5285480618476868, "learning_rate": 6.603761524635914e-05, "loss": 0.5563, "step": 940 }, { "epoch": 1.3070539419087137, "grad_norm": 0.48829564452171326, "learning_rate": 6.490450694967358e-05, "loss": 0.5606, "step": 945 }, { "epoch": 1.313969571230982, "grad_norm": 0.4818781912326813, "learning_rate": 6.377651466120391e-05, "loss": 0.5538, "step": 950 }, { "epoch": 1.3208852005532503, "grad_norm": 0.47652438282966614, "learning_rate": 6.265380281282762e-05, "loss": 0.5584, "step": 955 }, { "epoch": 1.3278008298755186, "grad_norm": 0.47311243414878845, "learning_rate": 6.15365350666718e-05, "loss": 0.5409, "step": 960 }, { "epoch": 1.334716459197787, "grad_norm": 0.46478375792503357, "learning_rate": 6.042487429125516e-05, "loss": 0.5554, "step": 965 }, { "epoch": 1.3416320885200554, "grad_norm": 0.4783475399017334, "learning_rate": 5.931898253774628e-05, "loss": 0.5465, "step": 970 }, { "epoch": 1.3485477178423237, "grad_norm": 0.48976966738700867, "learning_rate": 5.821902101634069e-05, "loss": 0.5563, "step": 975 }, { "epoch": 1.355463347164592, "grad_norm": 0.4838036298751831, "learning_rate": 5.7125150072760635e-05, "loss": 0.5628, "step": 980 }, { "epoch": 1.3623789764868603, "grad_norm": 0.46755191683769226, "learning_rate": 5.603752916488085e-05, "loss": 0.5472, "step": 985 }, { "epoch": 1.3692946058091287, "grad_norm": 0.50596022605896, "learning_rate": 5.4956316839483734e-05, "loss": 0.5632, "step": 990 }, { "epoch": 1.376210235131397, "grad_norm": 0.49392423033714294, "learning_rate": 5.388167070914738e-05, "loss": 0.5363, "step": 995 }, { "epoch": 1.3831258644536653, "grad_norm": 0.4692226052284241, "learning_rate": 5.281374742926987e-05, "loss": 0.536, "step": 1000 }, { "epoch": 1.3900414937759336, "grad_norm": 0.5283923745155334, "learning_rate": 5.175270267523278e-05, "loss": 0.5553, "step": 1005 }, { "epoch": 1.3969571230982019, "grad_norm": 0.47653043270111084, "learning_rate": 5.069869111970793e-05, "loss": 0.5492, "step": 1010 }, { "epoch": 1.4038727524204702, "grad_norm": 0.45839253067970276, "learning_rate": 4.965186641011013e-05, "loss": 0.5297, "step": 1015 }, { "epoch": 1.4107883817427385, "grad_norm": 0.4624306261539459, "learning_rate": 4.861238114619929e-05, "loss": 0.5384, "step": 1020 }, { "epoch": 1.417704011065007, "grad_norm": 0.48706522583961487, "learning_rate": 4.75803868578355e-05, "loss": 0.5369, "step": 1025 }, { "epoch": 1.4246196403872753, "grad_norm": 0.4311310350894928, "learning_rate": 4.655603398288979e-05, "loss": 0.5276, "step": 1030 }, { "epoch": 1.4315352697095436, "grad_norm": 0.47203701734542847, "learning_rate": 4.5539471845314304e-05, "loss": 0.5347, "step": 1035 }, { "epoch": 1.438450899031812, "grad_norm": 0.46674981713294983, "learning_rate": 4.453084863337471e-05, "loss": 0.5299, "step": 1040 }, { "epoch": 1.4453665283540802, "grad_norm": 0.42725497484207153, "learning_rate": 4.353031137804821e-05, "loss": 0.5369, "step": 1045 }, { "epoch": 1.4522821576763485, "grad_norm": 0.4412887990474701, "learning_rate": 4.253800593159029e-05, "loss": 0.5418, "step": 1050 }, { "epoch": 1.4591977869986168, "grad_norm": 0.45071831345558167, "learning_rate": 4.155407694627322e-05, "loss": 0.5551, "step": 1055 }, { "epoch": 1.4661134163208853, "grad_norm": 0.44687119126319885, "learning_rate": 4.057866785329959e-05, "loss": 0.5424, "step": 1060 }, { "epoch": 1.4730290456431536, "grad_norm": 0.45058682560920715, "learning_rate": 3.96119208418937e-05, "loss": 0.5524, "step": 1065 }, { "epoch": 1.479944674965422, "grad_norm": 0.5045320391654968, "learning_rate": 3.8653976838574104e-05, "loss": 0.5403, "step": 1070 }, { "epoch": 1.4868603042876902, "grad_norm": 0.4632035791873932, "learning_rate": 3.770497548661021e-05, "loss": 0.5346, "step": 1075 }, { "epoch": 1.4937759336099585, "grad_norm": 0.4648893177509308, "learning_rate": 3.676505512566597e-05, "loss": 0.5423, "step": 1080 }, { "epoch": 1.5006915629322268, "grad_norm": 0.4647049605846405, "learning_rate": 3.5834352771633475e-05, "loss": 0.5319, "step": 1085 }, { "epoch": 1.5076071922544951, "grad_norm": 0.45878130197525024, "learning_rate": 3.491300409665963e-05, "loss": 0.533, "step": 1090 }, { "epoch": 1.5145228215767634, "grad_norm": 0.4348316192626953, "learning_rate": 3.4001143409368773e-05, "loss": 0.5111, "step": 1095 }, { "epoch": 1.5214384508990317, "grad_norm": 0.47867557406425476, "learning_rate": 3.309890363528386e-05, "loss": 0.527, "step": 1100 }, { "epoch": 1.5283540802213, "grad_norm": 0.47667166590690613, "learning_rate": 3.220641629744947e-05, "loss": 0.5184, "step": 1105 }, { "epoch": 1.5352697095435683, "grad_norm": 0.4480649530887604, "learning_rate": 3.132381149725916e-05, "loss": 0.5218, "step": 1110 }, { "epoch": 1.5421853388658366, "grad_norm": 0.4330606460571289, "learning_rate": 3.0451217895489992e-05, "loss": 0.5317, "step": 1115 }, { "epoch": 1.5491009681881052, "grad_norm": 0.45356714725494385, "learning_rate": 2.9588762693547355e-05, "loss": 0.5307, "step": 1120 }, { "epoch": 1.5560165975103735, "grad_norm": 0.46869832277297974, "learning_rate": 2.8736571614922046e-05, "loss": 0.5231, "step": 1125 }, { "epoch": 1.5629322268326418, "grad_norm": 0.5013293623924255, "learning_rate": 2.7894768886863233e-05, "loss": 0.5272, "step": 1130 }, { "epoch": 1.56984785615491, "grad_norm": 0.44263824820518494, "learning_rate": 2.7063477222269306e-05, "loss": 0.53, "step": 1135 }, { "epoch": 1.5767634854771784, "grad_norm": 0.45062586665153503, "learning_rate": 2.6242817801799557e-05, "loss": 0.5116, "step": 1140 }, { "epoch": 1.583679114799447, "grad_norm": 0.4699437618255615, "learning_rate": 2.5432910256209187e-05, "loss": 0.5314, "step": 1145 }, { "epoch": 1.5905947441217152, "grad_norm": 0.45320945978164673, "learning_rate": 2.4633872648910252e-05, "loss": 0.5244, "step": 1150 }, { "epoch": 1.5975103734439835, "grad_norm": 0.45921820402145386, "learning_rate": 2.3845821458761063e-05, "loss": 0.5265, "step": 1155 }, { "epoch": 1.6044260027662518, "grad_norm": 0.4836861789226532, "learning_rate": 2.3068871563086757e-05, "loss": 0.5397, "step": 1160 }, { "epoch": 1.61134163208852, "grad_norm": 0.4829198122024536, "learning_rate": 2.230313622093295e-05, "loss": 0.5247, "step": 1165 }, { "epoch": 1.6182572614107884, "grad_norm": 0.42497992515563965, "learning_rate": 2.154872705655566e-05, "loss": 0.4998, "step": 1170 }, { "epoch": 1.6251728907330567, "grad_norm": 0.4427035450935364, "learning_rate": 2.0805754043149394e-05, "loss": 0.52, "step": 1175 }, { "epoch": 1.632088520055325, "grad_norm": 0.4457587003707886, "learning_rate": 2.0074325486815883e-05, "loss": 0.4931, "step": 1180 }, { "epoch": 1.6390041493775933, "grad_norm": 0.46790486574172974, "learning_rate": 1.9354548010775896e-05, "loss": 0.529, "step": 1185 }, { "epoch": 1.6459197786998616, "grad_norm": 0.4801090955734253, "learning_rate": 1.864652653982636e-05, "loss": 0.5188, "step": 1190 }, { "epoch": 1.65283540802213, "grad_norm": 0.4337711036205292, "learning_rate": 1.7950364285044996e-05, "loss": 0.5105, "step": 1195 }, { "epoch": 1.6597510373443982, "grad_norm": 0.45500195026397705, "learning_rate": 1.7266162728744993e-05, "loss": 0.5049, "step": 1200 }, { "epoch": 1.6666666666666665, "grad_norm": 0.4910541772842407, "learning_rate": 1.6594021609681344e-05, "loss": 0.5079, "step": 1205 }, { "epoch": 1.673582295988935, "grad_norm": 0.4468669295310974, "learning_rate": 1.5934038908511616e-05, "loss": 0.5135, "step": 1210 }, { "epoch": 1.6804979253112033, "grad_norm": 0.4427061975002289, "learning_rate": 1.5286310833512963e-05, "loss": 0.5019, "step": 1215 }, { "epoch": 1.6874135546334716, "grad_norm": 0.44807228446006775, "learning_rate": 1.4650931806557389e-05, "loss": 0.5121, "step": 1220 }, { "epoch": 1.69432918395574, "grad_norm": 0.45542779564857483, "learning_rate": 1.402799444934757e-05, "loss": 0.5183, "step": 1225 }, { "epoch": 1.7012448132780082, "grad_norm": 0.44287553429603577, "learning_rate": 1.3417589569914978e-05, "loss": 0.4962, "step": 1230 }, { "epoch": 1.7081604426002768, "grad_norm": 0.43979722261428833, "learning_rate": 1.2819806149382441e-05, "loss": 0.5065, "step": 1235 }, { "epoch": 1.715076071922545, "grad_norm": 0.4374731481075287, "learning_rate": 1.2234731328993055e-05, "loss": 0.5144, "step": 1240 }, { "epoch": 1.7219917012448134, "grad_norm": 0.4652646481990814, "learning_rate": 1.1662450397407188e-05, "loss": 0.5017, "step": 1245 }, { "epoch": 1.7289073305670817, "grad_norm": 0.4413568377494812, "learning_rate": 1.1103046778269687e-05, "loss": 0.5109, "step": 1250 }, { "epoch": 1.73582295988935, "grad_norm": 0.4495951235294342, "learning_rate": 1.0556602018048866e-05, "loss": 0.502, "step": 1255 }, { "epoch": 1.7427385892116183, "grad_norm": 0.4346306025981903, "learning_rate": 1.0023195774149119e-05, "loss": 0.4963, "step": 1260 }, { "epoch": 1.7496542185338866, "grad_norm": 0.4612555503845215, "learning_rate": 9.502905803299e-06, "loss": 0.5006, "step": 1265 }, { "epoch": 1.7565698478561549, "grad_norm": 0.46448782086372375, "learning_rate": 8.995807950216262e-06, "loss": 0.5195, "step": 1270 }, { "epoch": 1.7634854771784232, "grad_norm": 0.44382619857788086, "learning_rate": 8.501976136551749e-06, "loss": 0.5061, "step": 1275 }, { "epoch": 1.7704011065006915, "grad_norm": 0.45043641328811646, "learning_rate": 8.021482350113474e-06, "loss": 0.5254, "step": 1280 }, { "epoch": 1.7773167358229598, "grad_norm": 0.44958069920539856, "learning_rate": 7.554396634372707e-06, "loss": 0.5311, "step": 1285 }, { "epoch": 1.784232365145228, "grad_norm": 0.433776319026947, "learning_rate": 7.100787078253446e-06, "loss": 0.486, "step": 1290 }, { "epoch": 1.7911479944674964, "grad_norm": 0.41777148842811584, "learning_rate": 6.660719806206839e-06, "loss": 0.5108, "step": 1295 }, { "epoch": 1.798063623789765, "grad_norm": 0.4487934112548828, "learning_rate": 6.234258968571971e-06, "loss": 0.5006, "step": 1300 }, { "epoch": 1.8049792531120332, "grad_norm": 0.4544650614261627, "learning_rate": 5.821466732224412e-06, "loss": 0.5037, "step": 1305 }, { "epoch": 1.8118948824343015, "grad_norm": 0.41929152607917786, "learning_rate": 5.422403271513854e-06, "loss": 0.4978, "step": 1310 }, { "epoch": 1.8188105117565698, "grad_norm": 0.4613235294818878, "learning_rate": 5.0371267594923834e-06, "loss": 0.5209, "step": 1315 }, { "epoch": 1.8257261410788381, "grad_norm": 0.44448205828666687, "learning_rate": 4.66569335943422e-06, "loss": 0.5014, "step": 1320 }, { "epoch": 1.8326417704011067, "grad_norm": 0.43924444913864136, "learning_rate": 4.3081572166486675e-06, "loss": 0.5247, "step": 1325 }, { "epoch": 1.839557399723375, "grad_norm": 0.44769856333732605, "learning_rate": 3.964570450587113e-06, "loss": 0.5005, "step": 1330 }, { "epoch": 1.8464730290456433, "grad_norm": 0.4331699311733246, "learning_rate": 3.6349831472453743e-06, "loss": 0.5041, "step": 1335 }, { "epoch": 1.8533886583679116, "grad_norm": 0.43336230516433716, "learning_rate": 3.3194433518624614e-06, "loss": 0.5162, "step": 1340 }, { "epoch": 1.8603042876901799, "grad_norm": 0.4407755434513092, "learning_rate": 3.017997061916833e-06, "loss": 0.501, "step": 1345 }, { "epoch": 1.8672199170124482, "grad_norm": 0.43583956360816956, "learning_rate": 2.7306882204211626e-06, "loss": 0.5009, "step": 1350 }, { "epoch": 1.8741355463347165, "grad_norm": 0.42576801776885986, "learning_rate": 2.4575587095166054e-06, "loss": 0.5128, "step": 1355 }, { "epoch": 1.8810511756569848, "grad_norm": 0.42050647735595703, "learning_rate": 2.198648344367449e-06, "loss": 0.5069, "step": 1360 }, { "epoch": 1.887966804979253, "grad_norm": 0.4313651919364929, "learning_rate": 1.953994867357134e-06, "loss": 0.5055, "step": 1365 }, { "epoch": 1.8948824343015214, "grad_norm": 0.43365243077278137, "learning_rate": 1.7236339425863446e-06, "loss": 0.5145, "step": 1370 }, { "epoch": 1.9017980636237897, "grad_norm": 0.4598560035228729, "learning_rate": 1.507599150674177e-06, "loss": 0.5191, "step": 1375 }, { "epoch": 1.908713692946058, "grad_norm": 0.4304969310760498, "learning_rate": 1.3059219838629234e-06, "loss": 0.501, "step": 1380 }, { "epoch": 1.9156293222683263, "grad_norm": 0.44647660851478577, "learning_rate": 1.11863184142732e-06, "loss": 0.5063, "step": 1385 }, { "epoch": 1.9225449515905948, "grad_norm": 0.40728816390037537, "learning_rate": 9.457560253889219e-07, "loss": 0.5009, "step": 1390 }, { "epoch": 1.929460580912863, "grad_norm": 0.4395773410797119, "learning_rate": 7.873197365361407e-07, "loss": 0.4993, "step": 1395 }, { "epoch": 1.9363762102351314, "grad_norm": 0.44724011421203613, "learning_rate": 6.433460707506722e-07, "loss": 0.5065, "step": 1400 }, { "epoch": 1.9432918395573997, "grad_norm": 0.41976112127304077, "learning_rate": 5.138560156407124e-07, "loss": 0.5058, "step": 1405 }, { "epoch": 1.950207468879668, "grad_norm": 0.43243739008903503, "learning_rate": 3.988684474814819e-07, "loss": 0.4919, "step": 1410 }, { "epoch": 1.9571230982019365, "grad_norm": 0.4788164496421814, "learning_rate": 2.984001284635496e-07, "loss": 0.5128, "step": 1415 }, { "epoch": 1.9640387275242048, "grad_norm": 0.42695632576942444, "learning_rate": 2.1246570424940936e-07, "loss": 0.5116, "step": 1420 }, { "epoch": 1.9709543568464731, "grad_norm": 0.40510523319244385, "learning_rate": 1.4107770183845458e-07, "loss": 0.5058, "step": 1425 }, { "epoch": 1.9778699861687414, "grad_norm": 0.4062838554382324, "learning_rate": 8.424652774089436e-08, "loss": 0.5128, "step": 1430 }, { "epoch": 1.9847856154910097, "grad_norm": 0.4485253393650055, "learning_rate": 4.198046646075593e-08, "loss": 0.5062, "step": 1435 }, { "epoch": 1.991701244813278, "grad_norm": 0.43608593940734863, "learning_rate": 1.4285679288228437e-08, "loss": 0.4919, "step": 1440 }, { "epoch": 1.9986168741355463, "grad_norm": 0.4265099763870239, "learning_rate": 1.166203401481436e-09, "loss": 0.5001, "step": 1445 }, { "epoch": 2.0, "step": 1446, "total_flos": 6.011530730704732e+18, "train_loss": 0.6795949197244182, "train_runtime": 13742.7497, "train_samples_per_second": 6.734, "train_steps_per_second": 0.105 } ], "logging_steps": 5, "max_steps": 1446, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.011530730704732e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }