{ "best_metric": 1.877977728843689, "best_model_checkpoint": "./output/checkpoint-450", "epoch": 0.14511447920025797, "eval_steps": 150, "global_step": 1350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001074922068150059, "grad_norm": 43.13871765136719, "learning_rate": 4.125e-06, "loss": 2.2922, "step": 10 }, { "epoch": 0.002149844136300118, "grad_norm": 26.323883056640625, "learning_rate": 8.25e-06, "loss": 2.1328, "step": 20 }, { "epoch": 0.0032247662044501773, "grad_norm": 17.629505157470703, "learning_rate": 1.2375e-05, "loss": 2.036, "step": 30 }, { "epoch": 0.004299688272600236, "grad_norm": 10.648378372192383, "learning_rate": 1.65e-05, "loss": 1.851, "step": 40 }, { "epoch": 0.005374610340750295, "grad_norm": 17.73092269897461, "learning_rate": 2.0625e-05, "loss": 1.8746, "step": 50 }, { "epoch": 0.0064495324089003546, "grad_norm": 6.716779708862305, "learning_rate": 2.475e-05, "loss": 1.8512, "step": 60 }, { "epoch": 0.007524454477050414, "grad_norm": 15.877828598022461, "learning_rate": 2.8874999999999997e-05, "loss": 1.9416, "step": 70 }, { "epoch": 0.008599376545200472, "grad_norm": 14.675684928894043, "learning_rate": 3.3e-05, "loss": 1.8581, "step": 80 }, { "epoch": 0.009674298613350531, "grad_norm": 11.489137649536133, "learning_rate": 3.7125e-05, "loss": 1.9017, "step": 90 }, { "epoch": 0.01074922068150059, "grad_norm": 7.483497619628906, "learning_rate": 4.125e-05, "loss": 1.9335, "step": 100 }, { "epoch": 0.01182414274965065, "grad_norm": 9.6410551071167, "learning_rate": 4.12495760935163e-05, "loss": 1.7841, "step": 110 }, { "epoch": 0.012899064817800709, "grad_norm": 7.8748979568481445, "learning_rate": 4.1248304391490334e-05, "loss": 1.8529, "step": 120 }, { "epoch": 0.013973986885950768, "grad_norm": 9.455327987670898, "learning_rate": 4.1246184946196796e-05, "loss": 1.9366, "step": 130 }, { "epoch": 0.015048908954100828, "grad_norm": 8.64035701751709, "learning_rate": 4.124321784475777e-05, "loss": 1.8501, "step": 140 }, { "epoch": 0.016123831022250887, "grad_norm": 13.220332145690918, "learning_rate": 4.123940320913919e-05, "loss": 1.9095, "step": 150 }, { "epoch": 0.016123831022250887, "eval_loss": 1.8831839561462402, "eval_runtime": 61.5136, "eval_samples_per_second": 8.128, "eval_steps_per_second": 8.128, "step": 150 }, { "epoch": 0.017198753090400944, "grad_norm": 8.182404518127441, "learning_rate": 4.123474119614577e-05, "loss": 1.8163, "step": 160 }, { "epoch": 0.018273675158551005, "grad_norm": 12.389548301696777, "learning_rate": 4.1229231997414614e-05, "loss": 1.8781, "step": 170 }, { "epoch": 0.019348597226701063, "grad_norm": 8.600774765014648, "learning_rate": 4.1222875839407306e-05, "loss": 1.8206, "step": 180 }, { "epoch": 0.020423519294851124, "grad_norm": 7.161348819732666, "learning_rate": 4.121567298340059e-05, "loss": 1.8117, "step": 190 }, { "epoch": 0.02149844136300118, "grad_norm": 7.655135631561279, "learning_rate": 4.120762372547569e-05, "loss": 1.794, "step": 200 }, { "epoch": 0.022573363431151242, "grad_norm": 10.144853591918945, "learning_rate": 4.119872839650605e-05, "loss": 1.8959, "step": 210 }, { "epoch": 0.0236482854993013, "grad_norm": 8.557960510253906, "learning_rate": 4.118898736214381e-05, "loss": 1.8676, "step": 220 }, { "epoch": 0.02472320756745136, "grad_norm": 11.726237297058105, "learning_rate": 4.117840102280475e-05, "loss": 1.7744, "step": 230 }, { "epoch": 0.025798129635601418, "grad_norm": 8.739320755004883, "learning_rate": 4.116696981365181e-05, "loss": 1.8178, "step": 240 }, { "epoch": 0.02687305170375148, "grad_norm": 6.433278560638428, "learning_rate": 4.115469420457721e-05, "loss": 1.7578, "step": 250 }, { "epoch": 0.027947973771901537, "grad_norm": 6.021564960479736, "learning_rate": 4.1141574700183186e-05, "loss": 1.8376, "step": 260 }, { "epoch": 0.029022895840051598, "grad_norm": 8.607327461242676, "learning_rate": 4.1127611839761155e-05, "loss": 1.7681, "step": 270 }, { "epoch": 0.030097817908201655, "grad_norm": 9.417404174804688, "learning_rate": 4.111280619726964e-05, "loss": 1.8459, "step": 280 }, { "epoch": 0.031172739976351716, "grad_norm": 11.03150749206543, "learning_rate": 4.109715838131059e-05, "loss": 1.8467, "step": 290 }, { "epoch": 0.032247662044501774, "grad_norm": 7.549683570861816, "learning_rate": 4.108066903510445e-05, "loss": 1.8979, "step": 300 }, { "epoch": 0.032247662044501774, "eval_loss": 1.8808292150497437, "eval_runtime": 60.731, "eval_samples_per_second": 8.233, "eval_steps_per_second": 8.233, "step": 300 }, { "epoch": 0.033322584112651835, "grad_norm": 7.343403339385986, "learning_rate": 4.106333883646366e-05, "loss": 1.8544, "step": 310 }, { "epoch": 0.03439750618080189, "grad_norm": 6.203611850738525, "learning_rate": 4.104516849776479e-05, "loss": 1.8245, "step": 320 }, { "epoch": 0.03547242824895195, "grad_norm": 8.674558639526367, "learning_rate": 4.1026158765919306e-05, "loss": 1.8058, "step": 330 }, { "epoch": 0.03654735031710201, "grad_norm": 9.196893692016602, "learning_rate": 4.100631042234283e-05, "loss": 1.8813, "step": 340 }, { "epoch": 0.03762227238525207, "grad_norm": 10.656414985656738, "learning_rate": 4.098562428292304e-05, "loss": 1.7876, "step": 350 }, { "epoch": 0.038697194453402126, "grad_norm": 6.581936836242676, "learning_rate": 4.096410119798607e-05, "loss": 1.8134, "step": 360 }, { "epoch": 0.03977211652155219, "grad_norm": 9.650823593139648, "learning_rate": 4.094174205226167e-05, "loss": 1.7727, "step": 370 }, { "epoch": 0.04084703858970225, "grad_norm": 10.17803955078125, "learning_rate": 4.0918547764846736e-05, "loss": 1.7945, "step": 380 }, { "epoch": 0.04192196065785231, "grad_norm": 8.77602481842041, "learning_rate": 4.089451928916758e-05, "loss": 1.9062, "step": 390 }, { "epoch": 0.04299688272600236, "grad_norm": 6.488632678985596, "learning_rate": 4.0869657612940723e-05, "loss": 1.774, "step": 400 }, { "epoch": 0.044071804794152424, "grad_norm": 10.849347114562988, "learning_rate": 4.08439637581323e-05, "loss": 1.8497, "step": 410 }, { "epoch": 0.045146726862302484, "grad_norm": 6.79919958114624, "learning_rate": 4.081743878091604e-05, "loss": 1.9205, "step": 420 }, { "epoch": 0.046221648930452545, "grad_norm": 10.169675827026367, "learning_rate": 4.079008377162988e-05, "loss": 1.8459, "step": 430 }, { "epoch": 0.0472965709986026, "grad_norm": 6.641557216644287, "learning_rate": 4.0761899854731085e-05, "loss": 1.867, "step": 440 }, { "epoch": 0.04837149306675266, "grad_norm": 10.017315864562988, "learning_rate": 4.073288818875011e-05, "loss": 1.8095, "step": 450 }, { "epoch": 0.04837149306675266, "eval_loss": 1.877977728843689, "eval_runtime": 60.8543, "eval_samples_per_second": 8.216, "eval_steps_per_second": 8.216, "step": 450 }, { "epoch": 0.04944641513490272, "grad_norm": 7.070505619049072, "learning_rate": 4.070304996624291e-05, "loss": 1.8261, "step": 460 }, { "epoch": 0.050521337203052775, "grad_norm": 7.832033157348633, "learning_rate": 4.067238641374194e-05, "loss": 1.8129, "step": 470 }, { "epoch": 0.051596259271202836, "grad_norm": 8.640854835510254, "learning_rate": 4.0640898791705745e-05, "loss": 1.8887, "step": 480 }, { "epoch": 0.0526711813393529, "grad_norm": 8.050338745117188, "learning_rate": 4.060858839446713e-05, "loss": 1.8551, "step": 490 }, { "epoch": 0.05374610340750296, "grad_norm": 5.903382778167725, "learning_rate": 4.057545655017998e-05, "loss": 1.7708, "step": 500 }, { "epoch": 0.05482102547565301, "grad_norm": 6.558720111846924, "learning_rate": 4.054150462076465e-05, "loss": 1.8277, "step": 510 }, { "epoch": 0.05589594754380307, "grad_norm": 9.94021224975586, "learning_rate": 4.0506734001851976e-05, "loss": 1.8834, "step": 520 }, { "epoch": 0.056970869611953134, "grad_norm": 11.147041320800781, "learning_rate": 4.0471146122725904e-05, "loss": 1.8697, "step": 530 }, { "epoch": 0.058045791680103195, "grad_norm": 7.040550708770752, "learning_rate": 4.043474244626477e-05, "loss": 1.8605, "step": 540 }, { "epoch": 0.05912071374825325, "grad_norm": 8.166383743286133, "learning_rate": 4.0397524468881125e-05, "loss": 1.8772, "step": 550 }, { "epoch": 0.06019563581640331, "grad_norm": 6.262266159057617, "learning_rate": 4.0359493720460244e-05, "loss": 1.8334, "step": 560 }, { "epoch": 0.06127055788455337, "grad_norm": 6.04037618637085, "learning_rate": 4.032065176429724e-05, "loss": 1.8355, "step": 570 }, { "epoch": 0.06234547995270343, "grad_norm": 6.709385871887207, "learning_rate": 4.0281000197032795e-05, "loss": 1.8567, "step": 580 }, { "epoch": 0.06342040202085349, "grad_norm": 6.928658962249756, "learning_rate": 4.0240540648587546e-05, "loss": 1.8503, "step": 590 }, { "epoch": 0.06449532408900355, "grad_norm": 5.788703918457031, "learning_rate": 4.019927478209504e-05, "loss": 1.8522, "step": 600 }, { "epoch": 0.06449532408900355, "eval_loss": 1.8825602531433105, "eval_runtime": 60.165, "eval_samples_per_second": 8.31, "eval_steps_per_second": 8.31, "step": 600 }, { "epoch": 0.06557024615715361, "grad_norm": 9.176572799682617, "learning_rate": 4.015720429383344e-05, "loss": 1.8688, "step": 610 }, { "epoch": 0.06664516822530367, "grad_norm": 7.160205841064453, "learning_rate": 4.0114330913155726e-05, "loss": 1.8031, "step": 620 }, { "epoch": 0.06772009029345373, "grad_norm": 9.988216400146484, "learning_rate": 4.007065640241867e-05, "loss": 1.8262, "step": 630 }, { "epoch": 0.06879501236160378, "grad_norm": 9.315797805786133, "learning_rate": 4.002618255691033e-05, "loss": 1.8542, "step": 640 }, { "epoch": 0.06986993442975384, "grad_norm": 7.385815143585205, "learning_rate": 3.9980911204776306e-05, "loss": 1.8328, "step": 650 }, { "epoch": 0.0709448564979039, "grad_norm": 7.70832633972168, "learning_rate": 3.993484420694458e-05, "loss": 1.9542, "step": 660 }, { "epoch": 0.07201977856605396, "grad_norm": 10.131119728088379, "learning_rate": 3.988798345704899e-05, "loss": 1.8389, "step": 670 }, { "epoch": 0.07309470063420402, "grad_norm": 6.0609354972839355, "learning_rate": 3.984033088135143e-05, "loss": 1.8653, "step": 680 }, { "epoch": 0.07416962270235408, "grad_norm": 9.173294067382812, "learning_rate": 3.979188843866263e-05, "loss": 1.8558, "step": 690 }, { "epoch": 0.07524454477050414, "grad_norm": 5.944100856781006, "learning_rate": 3.97426581202617e-05, "loss": 1.922, "step": 700 }, { "epoch": 0.0763194668386542, "grad_norm": 7.453153133392334, "learning_rate": 3.969264194981418e-05, "loss": 1.8524, "step": 710 }, { "epoch": 0.07739438890680425, "grad_norm": 6.75320291519165, "learning_rate": 3.9641841983288953e-05, "loss": 1.8699, "step": 720 }, { "epoch": 0.07846931097495431, "grad_norm": 6.539112567901611, "learning_rate": 3.959026030887367e-05, "loss": 1.7784, "step": 730 }, { "epoch": 0.07954423304310437, "grad_norm": 7.794706344604492, "learning_rate": 3.953789904688893e-05, "loss": 1.8222, "step": 740 }, { "epoch": 0.08061915511125443, "grad_norm": 7.376310348510742, "learning_rate": 3.948476034970113e-05, "loss": 1.7985, "step": 750 }, { "epoch": 0.08061915511125443, "eval_loss": 1.8882893323898315, "eval_runtime": 60.264, "eval_samples_per_second": 8.297, "eval_steps_per_second": 8.297, "step": 750 }, { "epoch": 0.0816940771794045, "grad_norm": 8.657331466674805, "learning_rate": 3.943084640163398e-05, "loss": 1.7983, "step": 760 }, { "epoch": 0.08276899924755456, "grad_norm": 5.748507976531982, "learning_rate": 3.937615941887873e-05, "loss": 1.884, "step": 770 }, { "epoch": 0.08384392131570462, "grad_norm": 6.509608745574951, "learning_rate": 3.932070164940304e-05, "loss": 1.891, "step": 780 }, { "epoch": 0.08491884338385466, "grad_norm": 6.634593486785889, "learning_rate": 3.926447537285859e-05, "loss": 1.776, "step": 790 }, { "epoch": 0.08599376545200473, "grad_norm": 9.289311408996582, "learning_rate": 3.920748290048739e-05, "loss": 1.7806, "step": 800 }, { "epoch": 0.08706868752015479, "grad_norm": 7.693164348602295, "learning_rate": 3.914972657502677e-05, "loss": 1.8428, "step": 810 }, { "epoch": 0.08814360958830485, "grad_norm": 5.920309066772461, "learning_rate": 3.9091208770613036e-05, "loss": 1.8048, "step": 820 }, { "epoch": 0.08921853165645491, "grad_norm": 7.512606143951416, "learning_rate": 3.9031931892683937e-05, "loss": 1.848, "step": 830 }, { "epoch": 0.09029345372460497, "grad_norm": 8.341816902160645, "learning_rate": 3.897189837787975e-05, "loss": 1.7995, "step": 840 }, { "epoch": 0.09136837579275503, "grad_norm": 7.76444149017334, "learning_rate": 3.891111069394313e-05, "loss": 1.7587, "step": 850 }, { "epoch": 0.09244329786090509, "grad_norm": 8.347227096557617, "learning_rate": 3.884957133961768e-05, "loss": 1.8215, "step": 860 }, { "epoch": 0.09351821992905514, "grad_norm": 7.596982955932617, "learning_rate": 3.878728284454522e-05, "loss": 1.8831, "step": 870 }, { "epoch": 0.0945931419972052, "grad_norm": 7.217879295349121, "learning_rate": 3.872424776916183e-05, "loss": 1.9871, "step": 880 }, { "epoch": 0.09566806406535526, "grad_norm": 6.474759101867676, "learning_rate": 3.866046870459253e-05, "loss": 1.8137, "step": 890 }, { "epoch": 0.09674298613350532, "grad_norm": 7.693883419036865, "learning_rate": 3.8595948272544905e-05, "loss": 1.884, "step": 900 }, { "epoch": 0.09674298613350532, "eval_loss": 1.8837863206863403, "eval_runtime": 61.2639, "eval_samples_per_second": 8.161, "eval_steps_per_second": 8.161, "step": 900 }, { "epoch": 0.09781790820165538, "grad_norm": 7.503582954406738, "learning_rate": 3.8530689125201184e-05, "loss": 1.8213, "step": 910 }, { "epoch": 0.09889283026980544, "grad_norm": 7.769185543060303, "learning_rate": 3.8464693945109305e-05, "loss": 1.8505, "step": 920 }, { "epoch": 0.0999677523379555, "grad_norm": 7.26152229309082, "learning_rate": 3.839796544507265e-05, "loss": 1.8225, "step": 930 }, { "epoch": 0.10104267440610555, "grad_norm": 7.977392673492432, "learning_rate": 3.833050636803849e-05, "loss": 1.8483, "step": 940 }, { "epoch": 0.10211759647425561, "grad_norm": 8.005775451660156, "learning_rate": 3.826231948698527e-05, "loss": 1.9364, "step": 950 }, { "epoch": 0.10319251854240567, "grad_norm": 6.598343849182129, "learning_rate": 3.819340760480859e-05, "loss": 1.8967, "step": 960 }, { "epoch": 0.10426744061055573, "grad_norm": 8.264419555664062, "learning_rate": 3.812377355420602e-05, "loss": 1.791, "step": 970 }, { "epoch": 0.1053423626787058, "grad_norm": 7.987963676452637, "learning_rate": 3.805342019756065e-05, "loss": 1.8654, "step": 980 }, { "epoch": 0.10641728474685586, "grad_norm": 8.105964660644531, "learning_rate": 3.7982350426823406e-05, "loss": 1.854, "step": 990 }, { "epoch": 0.10749220681500592, "grad_norm": 8.088004112243652, "learning_rate": 3.791056716339421e-05, "loss": 1.8123, "step": 1000 }, { "epoch": 0.10856712888315598, "grad_norm": 9.775269508361816, "learning_rate": 3.783807335800187e-05, "loss": 1.8431, "step": 1010 }, { "epoch": 0.10964205095130602, "grad_norm": 6.646794319152832, "learning_rate": 3.776487199058277e-05, "loss": 1.8865, "step": 1020 }, { "epoch": 0.11071697301945609, "grad_norm": 7.902348041534424, "learning_rate": 3.769096607015843e-05, "loss": 1.8599, "step": 1030 }, { "epoch": 0.11179189508760615, "grad_norm": 6.932212829589844, "learning_rate": 3.761635863471175e-05, "loss": 1.8696, "step": 1040 }, { "epoch": 0.11286681715575621, "grad_norm": 8.541101455688477, "learning_rate": 3.754105275106222e-05, "loss": 1.7824, "step": 1050 }, { "epoch": 0.11286681715575621, "eval_loss": 1.8909525871276855, "eval_runtime": 60.951, "eval_samples_per_second": 8.203, "eval_steps_per_second": 8.203, "step": 1050 }, { "epoch": 0.11394173922390627, "grad_norm": 6.613965034484863, "learning_rate": 3.746505151473972e-05, "loss": 1.8719, "step": 1060 }, { "epoch": 0.11501666129205633, "grad_norm": 7.055614948272705, "learning_rate": 3.738835804985743e-05, "loss": 1.8533, "step": 1070 }, { "epoch": 0.11609158336020639, "grad_norm": 8.038810729980469, "learning_rate": 3.731097550898329e-05, "loss": 1.8067, "step": 1080 }, { "epoch": 0.11716650542835644, "grad_norm": 6.726877689361572, "learning_rate": 3.723290707301047e-05, "loss": 1.8871, "step": 1090 }, { "epoch": 0.1182414274965065, "grad_norm": 7.802327632904053, "learning_rate": 3.7154155951026605e-05, "loss": 2.0685, "step": 1100 }, { "epoch": 0.11931634956465656, "grad_norm": 8.144506454467773, "learning_rate": 3.707472538018187e-05, "loss": 1.9074, "step": 1110 }, { "epoch": 0.12039127163280662, "grad_norm": 11.114158630371094, "learning_rate": 3.6994618625555925e-05, "loss": 1.8036, "step": 1120 }, { "epoch": 0.12146619370095668, "grad_norm": 8.913481712341309, "learning_rate": 3.691383898002368e-05, "loss": 1.8411, "step": 1130 }, { "epoch": 0.12254111576910674, "grad_norm": 7.231012344360352, "learning_rate": 3.683238976412e-05, "loss": 1.838, "step": 1140 }, { "epoch": 0.1236160378372568, "grad_norm": 7.460425853729248, "learning_rate": 3.675027432590312e-05, "loss": 1.907, "step": 1150 }, { "epoch": 0.12469095990540686, "grad_norm": 6.360142230987549, "learning_rate": 3.666749604081707e-05, "loss": 1.8568, "step": 1160 }, { "epoch": 0.12576588197355693, "grad_norm": 8.225682258605957, "learning_rate": 3.6584058311552954e-05, "loss": 1.8473, "step": 1170 }, { "epoch": 0.12684080404170697, "grad_norm": 8.119696617126465, "learning_rate": 3.6499964567909e-05, "loss": 1.7953, "step": 1180 }, { "epoch": 0.12791572610985705, "grad_norm": 7.61316442489624, "learning_rate": 3.641521826664964e-05, "loss": 1.7599, "step": 1190 }, { "epoch": 0.1289906481780071, "grad_norm": 7.965550422668457, "learning_rate": 3.63298228913634e-05, "loss": 1.8461, "step": 1200 }, { "epoch": 0.1289906481780071, "eval_loss": 1.8840419054031372, "eval_runtime": 62.5093, "eval_samples_per_second": 7.999, "eval_steps_per_second": 7.999, "step": 1200 }, { "epoch": 0.13006557024615714, "grad_norm": 5.871341228485107, "learning_rate": 3.624378195231967e-05, "loss": 1.8397, "step": 1210 }, { "epoch": 0.13114049231430722, "grad_norm": 18.254127502441406, "learning_rate": 3.615709898632448e-05, "loss": 1.8295, "step": 1220 }, { "epoch": 0.13221541438245726, "grad_norm": 8.392979621887207, "learning_rate": 3.606977755657502e-05, "loss": 1.8058, "step": 1230 }, { "epoch": 0.13329033645060734, "grad_norm": 10.364768981933594, "learning_rate": 3.5981821252513274e-05, "loss": 1.8656, "step": 1240 }, { "epoch": 0.13436525851875739, "grad_norm": 6.204119682312012, "learning_rate": 3.5893233689678384e-05, "loss": 1.8201, "step": 1250 }, { "epoch": 0.13544018058690746, "grad_norm": 8.470161437988281, "learning_rate": 3.5804018509558095e-05, "loss": 1.8629, "step": 1260 }, { "epoch": 0.1365151026550575, "grad_norm": 7.885449409484863, "learning_rate": 3.571417937943903e-05, "loss": 1.9337, "step": 1270 }, { "epoch": 0.13759002472320755, "grad_norm": 6.3673996925354, "learning_rate": 3.562371999225594e-05, "loss": 1.8661, "step": 1280 }, { "epoch": 0.13866494679135763, "grad_norm": 9.29290771484375, "learning_rate": 3.553264406643995e-05, "loss": 1.7597, "step": 1290 }, { "epoch": 0.13973986885950768, "grad_norm": 6.828396320343018, "learning_rate": 3.544095534576563e-05, "loss": 1.8641, "step": 1300 }, { "epoch": 0.14081479092765775, "grad_norm": 9.610004425048828, "learning_rate": 3.534865759919718e-05, "loss": 1.8846, "step": 1310 }, { "epoch": 0.1418897129958078, "grad_norm": 7.615134239196777, "learning_rate": 3.525575462073344e-05, "loss": 1.885, "step": 1320 }, { "epoch": 0.14296463506395787, "grad_norm": 7.668911933898926, "learning_rate": 3.516225022925199e-05, "loss": 1.79, "step": 1330 }, { "epoch": 0.14403955713210792, "grad_norm": 7.5038042068481445, "learning_rate": 3.5068148268352135e-05, "loss": 1.8394, "step": 1340 }, { "epoch": 0.14511447920025797, "grad_norm": 7.994359970092773, "learning_rate": 3.497345260619691e-05, "loss": 1.8393, "step": 1350 }, { "epoch": 0.14511447920025797, "eval_loss": 1.8796833753585815, "eval_runtime": 57.9394, "eval_samples_per_second": 8.63, "eval_steps_per_second": 8.63, "step": 1350 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 150, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.751283405814497e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }