{
  "best_metric": 1.877977728843689,
  "best_model_checkpoint": "./output/checkpoint-450",
  "epoch": 0.14511447920025797,
  "eval_steps": 150,
  "global_step": 1350,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.001074922068150059,
      "grad_norm": 43.13871765136719,
      "learning_rate": 4.125e-06,
      "loss": 2.2922,
      "step": 10
    },
    {
      "epoch": 0.002149844136300118,
      "grad_norm": 26.323883056640625,
      "learning_rate": 8.25e-06,
      "loss": 2.1328,
      "step": 20
    },
    {
      "epoch": 0.0032247662044501773,
      "grad_norm": 17.629505157470703,
      "learning_rate": 1.2375e-05,
      "loss": 2.036,
      "step": 30
    },
    {
      "epoch": 0.004299688272600236,
      "grad_norm": 10.648378372192383,
      "learning_rate": 1.65e-05,
      "loss": 1.851,
      "step": 40
    },
    {
      "epoch": 0.005374610340750295,
      "grad_norm": 17.73092269897461,
      "learning_rate": 2.0625e-05,
      "loss": 1.8746,
      "step": 50
    },
    {
      "epoch": 0.0064495324089003546,
      "grad_norm": 6.716779708862305,
      "learning_rate": 2.475e-05,
      "loss": 1.8512,
      "step": 60
    },
    {
      "epoch": 0.007524454477050414,
      "grad_norm": 15.877828598022461,
      "learning_rate": 2.8874999999999997e-05,
      "loss": 1.9416,
      "step": 70
    },
    {
      "epoch": 0.008599376545200472,
      "grad_norm": 14.675684928894043,
      "learning_rate": 3.3e-05,
      "loss": 1.8581,
      "step": 80
    },
    {
      "epoch": 0.009674298613350531,
      "grad_norm": 11.489137649536133,
      "learning_rate": 3.7125e-05,
      "loss": 1.9017,
      "step": 90
    },
    {
      "epoch": 0.01074922068150059,
      "grad_norm": 7.483497619628906,
      "learning_rate": 4.125e-05,
      "loss": 1.9335,
      "step": 100
    },
    {
      "epoch": 0.01182414274965065,
      "grad_norm": 9.6410551071167,
      "learning_rate": 4.12495760935163e-05,
      "loss": 1.7841,
      "step": 110
    },
    {
      "epoch": 0.012899064817800709,
      "grad_norm": 7.8748979568481445,
      "learning_rate": 4.1248304391490334e-05,
      "loss": 1.8529,
      "step": 120
    },
    {
      "epoch": 0.013973986885950768,
      "grad_norm": 9.455327987670898,
      "learning_rate": 4.1246184946196796e-05,
      "loss": 1.9366,
      "step": 130
    },
    {
      "epoch": 0.015048908954100828,
      "grad_norm": 8.64035701751709,
      "learning_rate": 4.124321784475777e-05,
      "loss": 1.8501,
      "step": 140
    },
    {
      "epoch": 0.016123831022250887,
      "grad_norm": 13.220332145690918,
      "learning_rate": 4.123940320913919e-05,
      "loss": 1.9095,
      "step": 150
    },
    {
      "epoch": 0.016123831022250887,
      "eval_loss": 1.8831839561462402,
      "eval_runtime": 61.5136,
      "eval_samples_per_second": 8.128,
      "eval_steps_per_second": 8.128,
      "step": 150
    },
    {
      "epoch": 0.017198753090400944,
      "grad_norm": 8.182404518127441,
      "learning_rate": 4.123474119614577e-05,
      "loss": 1.8163,
      "step": 160
    },
    {
      "epoch": 0.018273675158551005,
      "grad_norm": 12.389548301696777,
      "learning_rate": 4.1229231997414614e-05,
      "loss": 1.8781,
      "step": 170
    },
    {
      "epoch": 0.019348597226701063,
      "grad_norm": 8.600774765014648,
      "learning_rate": 4.1222875839407306e-05,
      "loss": 1.8206,
      "step": 180
    },
    {
      "epoch": 0.020423519294851124,
      "grad_norm": 7.161348819732666,
      "learning_rate": 4.121567298340059e-05,
      "loss": 1.8117,
      "step": 190
    },
    {
      "epoch": 0.02149844136300118,
      "grad_norm": 7.655135631561279,
      "learning_rate": 4.120762372547569e-05,
      "loss": 1.794,
      "step": 200
    },
    {
      "epoch": 0.022573363431151242,
      "grad_norm": 10.144853591918945,
      "learning_rate": 4.119872839650605e-05,
      "loss": 1.8959,
      "step": 210
    },
    {
      "epoch": 0.0236482854993013,
      "grad_norm": 8.557960510253906,
      "learning_rate": 4.118898736214381e-05,
      "loss": 1.8676,
      "step": 220
    },
    {
      "epoch": 0.02472320756745136,
      "grad_norm": 11.726237297058105,
      "learning_rate": 4.117840102280475e-05,
      "loss": 1.7744,
      "step": 230
    },
    {
      "epoch": 0.025798129635601418,
      "grad_norm": 8.739320755004883,
      "learning_rate": 4.116696981365181e-05,
      "loss": 1.8178,
      "step": 240
    },
    {
      "epoch": 0.02687305170375148,
      "grad_norm": 6.433278560638428,
      "learning_rate": 4.115469420457721e-05,
      "loss": 1.7578,
      "step": 250
    },
    {
      "epoch": 0.027947973771901537,
      "grad_norm": 6.021564960479736,
      "learning_rate": 4.1141574700183186e-05,
      "loss": 1.8376,
      "step": 260
    },
    {
      "epoch": 0.029022895840051598,
      "grad_norm": 8.607327461242676,
      "learning_rate": 4.1127611839761155e-05,
      "loss": 1.7681,
      "step": 270
    },
    {
      "epoch": 0.030097817908201655,
      "grad_norm": 9.417404174804688,
      "learning_rate": 4.111280619726964e-05,
      "loss": 1.8459,
      "step": 280
    },
    {
      "epoch": 0.031172739976351716,
      "grad_norm": 11.03150749206543,
      "learning_rate": 4.109715838131059e-05,
      "loss": 1.8467,
      "step": 290
    },
    {
      "epoch": 0.032247662044501774,
      "grad_norm": 7.549683570861816,
      "learning_rate": 4.108066903510445e-05,
      "loss": 1.8979,
      "step": 300
    },
    {
      "epoch": 0.032247662044501774,
      "eval_loss": 1.8808292150497437,
      "eval_runtime": 60.731,
      "eval_samples_per_second": 8.233,
      "eval_steps_per_second": 8.233,
      "step": 300
    },
    {
      "epoch": 0.033322584112651835,
      "grad_norm": 7.343403339385986,
      "learning_rate": 4.106333883646366e-05,
      "loss": 1.8544,
      "step": 310
    },
    {
      "epoch": 0.03439750618080189,
      "grad_norm": 6.203611850738525,
      "learning_rate": 4.104516849776479e-05,
      "loss": 1.8245,
      "step": 320
    },
    {
      "epoch": 0.03547242824895195,
      "grad_norm": 8.674558639526367,
      "learning_rate": 4.1026158765919306e-05,
      "loss": 1.8058,
      "step": 330
    },
    {
      "epoch": 0.03654735031710201,
      "grad_norm": 9.196893692016602,
      "learning_rate": 4.100631042234283e-05,
      "loss": 1.8813,
      "step": 340
    },
    {
      "epoch": 0.03762227238525207,
      "grad_norm": 10.656414985656738,
      "learning_rate": 4.098562428292304e-05,
      "loss": 1.7876,
      "step": 350
    },
    {
      "epoch": 0.038697194453402126,
      "grad_norm": 6.581936836242676,
      "learning_rate": 4.096410119798607e-05,
      "loss": 1.8134,
      "step": 360
    },
    {
      "epoch": 0.03977211652155219,
      "grad_norm": 9.650823593139648,
      "learning_rate": 4.094174205226167e-05,
      "loss": 1.7727,
      "step": 370
    },
    {
      "epoch": 0.04084703858970225,
      "grad_norm": 10.17803955078125,
      "learning_rate": 4.0918547764846736e-05,
      "loss": 1.7945,
      "step": 380
    },
    {
      "epoch": 0.04192196065785231,
      "grad_norm": 8.77602481842041,
      "learning_rate": 4.089451928916758e-05,
      "loss": 1.9062,
      "step": 390
    },
    {
      "epoch": 0.04299688272600236,
      "grad_norm": 6.488632678985596,
      "learning_rate": 4.0869657612940723e-05,
      "loss": 1.774,
      "step": 400
    },
    {
      "epoch": 0.044071804794152424,
      "grad_norm": 10.849347114562988,
      "learning_rate": 4.08439637581323e-05,
      "loss": 1.8497,
      "step": 410
    },
    {
      "epoch": 0.045146726862302484,
      "grad_norm": 6.79919958114624,
      "learning_rate": 4.081743878091604e-05,
      "loss": 1.9205,
      "step": 420
    },
    {
      "epoch": 0.046221648930452545,
      "grad_norm": 10.169675827026367,
      "learning_rate": 4.079008377162988e-05,
      "loss": 1.8459,
      "step": 430
    },
    {
      "epoch": 0.0472965709986026,
      "grad_norm": 6.641557216644287,
      "learning_rate": 4.0761899854731085e-05,
      "loss": 1.867,
      "step": 440
    },
    {
      "epoch": 0.04837149306675266,
      "grad_norm": 10.017315864562988,
      "learning_rate": 4.073288818875011e-05,
      "loss": 1.8095,
      "step": 450
    },
    {
      "epoch": 0.04837149306675266,
      "eval_loss": 1.877977728843689,
      "eval_runtime": 60.8543,
      "eval_samples_per_second": 8.216,
      "eval_steps_per_second": 8.216,
      "step": 450
    },
    {
      "epoch": 0.04944641513490272,
      "grad_norm": 7.070505619049072,
      "learning_rate": 4.070304996624291e-05,
      "loss": 1.8261,
      "step": 460
    },
    {
      "epoch": 0.050521337203052775,
      "grad_norm": 7.832033157348633,
      "learning_rate": 4.067238641374194e-05,
      "loss": 1.8129,
      "step": 470
    },
    {
      "epoch": 0.051596259271202836,
      "grad_norm": 8.640854835510254,
      "learning_rate": 4.0640898791705745e-05,
      "loss": 1.8887,
      "step": 480
    },
    {
      "epoch": 0.0526711813393529,
      "grad_norm": 8.050338745117188,
      "learning_rate": 4.060858839446713e-05,
      "loss": 1.8551,
      "step": 490
    },
    {
      "epoch": 0.05374610340750296,
      "grad_norm": 5.903382778167725,
      "learning_rate": 4.057545655017998e-05,
      "loss": 1.7708,
      "step": 500
    },
    {
      "epoch": 0.05482102547565301,
      "grad_norm": 6.558720111846924,
      "learning_rate": 4.054150462076465e-05,
      "loss": 1.8277,
      "step": 510
    },
    {
      "epoch": 0.05589594754380307,
      "grad_norm": 9.94021224975586,
      "learning_rate": 4.0506734001851976e-05,
      "loss": 1.8834,
      "step": 520
    },
    {
      "epoch": 0.056970869611953134,
      "grad_norm": 11.147041320800781,
      "learning_rate": 4.0471146122725904e-05,
      "loss": 1.8697,
      "step": 530
    },
    {
      "epoch": 0.058045791680103195,
      "grad_norm": 7.040550708770752,
      "learning_rate": 4.043474244626477e-05,
      "loss": 1.8605,
      "step": 540
    },
    {
      "epoch": 0.05912071374825325,
      "grad_norm": 8.166383743286133,
      "learning_rate": 4.0397524468881125e-05,
      "loss": 1.8772,
      "step": 550
    },
    {
      "epoch": 0.06019563581640331,
      "grad_norm": 6.262266159057617,
      "learning_rate": 4.0359493720460244e-05,
      "loss": 1.8334,
      "step": 560
    },
    {
      "epoch": 0.06127055788455337,
      "grad_norm": 6.04037618637085,
      "learning_rate": 4.032065176429724e-05,
      "loss": 1.8355,
      "step": 570
    },
    {
      "epoch": 0.06234547995270343,
      "grad_norm": 6.709385871887207,
      "learning_rate": 4.0281000197032795e-05,
      "loss": 1.8567,
      "step": 580
    },
    {
      "epoch": 0.06342040202085349,
      "grad_norm": 6.928658962249756,
      "learning_rate": 4.0240540648587546e-05,
      "loss": 1.8503,
      "step": 590
    },
    {
      "epoch": 0.06449532408900355,
      "grad_norm": 5.788703918457031,
      "learning_rate": 4.019927478209504e-05,
      "loss": 1.8522,
      "step": 600
    },
    {
      "epoch": 0.06449532408900355,
      "eval_loss": 1.8825602531433105,
      "eval_runtime": 60.165,
      "eval_samples_per_second": 8.31,
      "eval_steps_per_second": 8.31,
      "step": 600
    },
    {
      "epoch": 0.06557024615715361,
      "grad_norm": 9.176572799682617,
      "learning_rate": 4.015720429383344e-05,
      "loss": 1.8688,
      "step": 610
    },
    {
      "epoch": 0.06664516822530367,
      "grad_norm": 7.160205841064453,
      "learning_rate": 4.0114330913155726e-05,
      "loss": 1.8031,
      "step": 620
    },
    {
      "epoch": 0.06772009029345373,
      "grad_norm": 9.988216400146484,
      "learning_rate": 4.007065640241867e-05,
      "loss": 1.8262,
      "step": 630
    },
    {
      "epoch": 0.06879501236160378,
      "grad_norm": 9.315797805786133,
      "learning_rate": 4.002618255691033e-05,
      "loss": 1.8542,
      "step": 640
    },
    {
      "epoch": 0.06986993442975384,
      "grad_norm": 7.385815143585205,
      "learning_rate": 3.9980911204776306e-05,
      "loss": 1.8328,
      "step": 650
    },
    {
      "epoch": 0.0709448564979039,
      "grad_norm": 7.70832633972168,
      "learning_rate": 3.993484420694458e-05,
      "loss": 1.9542,
      "step": 660
    },
    {
      "epoch": 0.07201977856605396,
      "grad_norm": 10.131119728088379,
      "learning_rate": 3.988798345704899e-05,
      "loss": 1.8389,
      "step": 670
    },
    {
      "epoch": 0.07309470063420402,
      "grad_norm": 6.0609354972839355,
      "learning_rate": 3.984033088135143e-05,
      "loss": 1.8653,
      "step": 680
    },
    {
      "epoch": 0.07416962270235408,
      "grad_norm": 9.173294067382812,
      "learning_rate": 3.979188843866263e-05,
      "loss": 1.8558,
      "step": 690
    },
    {
      "epoch": 0.07524454477050414,
      "grad_norm": 5.944100856781006,
      "learning_rate": 3.97426581202617e-05,
      "loss": 1.922,
      "step": 700
    },
    {
      "epoch": 0.0763194668386542,
      "grad_norm": 7.453153133392334,
      "learning_rate": 3.969264194981418e-05,
      "loss": 1.8524,
      "step": 710
    },
    {
      "epoch": 0.07739438890680425,
      "grad_norm": 6.75320291519165,
      "learning_rate": 3.9641841983288953e-05,
      "loss": 1.8699,
      "step": 720
    },
    {
      "epoch": 0.07846931097495431,
      "grad_norm": 6.539112567901611,
      "learning_rate": 3.959026030887367e-05,
      "loss": 1.7784,
      "step": 730
    },
    {
      "epoch": 0.07954423304310437,
      "grad_norm": 7.794706344604492,
      "learning_rate": 3.953789904688893e-05,
      "loss": 1.8222,
      "step": 740
    },
    {
      "epoch": 0.08061915511125443,
      "grad_norm": 7.376310348510742,
      "learning_rate": 3.948476034970113e-05,
      "loss": 1.7985,
      "step": 750
    },
    {
      "epoch": 0.08061915511125443,
      "eval_loss": 1.8882893323898315,
      "eval_runtime": 60.264,
      "eval_samples_per_second": 8.297,
      "eval_steps_per_second": 8.297,
      "step": 750
    },
    {
      "epoch": 0.0816940771794045,
      "grad_norm": 8.657331466674805,
      "learning_rate": 3.943084640163398e-05,
      "loss": 1.7983,
      "step": 760
    },
    {
      "epoch": 0.08276899924755456,
      "grad_norm": 5.748507976531982,
      "learning_rate": 3.937615941887873e-05,
      "loss": 1.884,
      "step": 770
    },
    {
      "epoch": 0.08384392131570462,
      "grad_norm": 6.509608745574951,
      "learning_rate": 3.932070164940304e-05,
      "loss": 1.891,
      "step": 780
    },
    {
      "epoch": 0.08491884338385466,
      "grad_norm": 6.634593486785889,
      "learning_rate": 3.926447537285859e-05,
      "loss": 1.776,
      "step": 790
    },
    {
      "epoch": 0.08599376545200473,
      "grad_norm": 9.289311408996582,
      "learning_rate": 3.920748290048739e-05,
      "loss": 1.7806,
      "step": 800
    },
    {
      "epoch": 0.08706868752015479,
      "grad_norm": 7.693164348602295,
      "learning_rate": 3.914972657502677e-05,
      "loss": 1.8428,
      "step": 810
    },
    {
      "epoch": 0.08814360958830485,
      "grad_norm": 5.920309066772461,
      "learning_rate": 3.9091208770613036e-05,
      "loss": 1.8048,
      "step": 820
    },
    {
      "epoch": 0.08921853165645491,
      "grad_norm": 7.512606143951416,
      "learning_rate": 3.9031931892683937e-05,
      "loss": 1.848,
      "step": 830
    },
    {
      "epoch": 0.09029345372460497,
      "grad_norm": 8.341816902160645,
      "learning_rate": 3.897189837787975e-05,
      "loss": 1.7995,
      "step": 840
    },
    {
      "epoch": 0.09136837579275503,
      "grad_norm": 7.76444149017334,
      "learning_rate": 3.891111069394313e-05,
      "loss": 1.7587,
      "step": 850
    },
    {
      "epoch": 0.09244329786090509,
      "grad_norm": 8.347227096557617,
      "learning_rate": 3.884957133961768e-05,
      "loss": 1.8215,
      "step": 860
    },
    {
      "epoch": 0.09351821992905514,
      "grad_norm": 7.596982955932617,
      "learning_rate": 3.878728284454522e-05,
      "loss": 1.8831,
      "step": 870
    },
    {
      "epoch": 0.0945931419972052,
      "grad_norm": 7.217879295349121,
      "learning_rate": 3.872424776916183e-05,
      "loss": 1.9871,
      "step": 880
    },
    {
      "epoch": 0.09566806406535526,
      "grad_norm": 6.474759101867676,
      "learning_rate": 3.866046870459253e-05,
      "loss": 1.8137,
      "step": 890
    },
    {
      "epoch": 0.09674298613350532,
      "grad_norm": 7.693883419036865,
      "learning_rate": 3.8595948272544905e-05,
      "loss": 1.884,
      "step": 900
    },
    {
      "epoch": 0.09674298613350532,
      "eval_loss": 1.8837863206863403,
      "eval_runtime": 61.2639,
      "eval_samples_per_second": 8.161,
      "eval_steps_per_second": 8.161,
      "step": 900
    },
    {
      "epoch": 0.09781790820165538,
      "grad_norm": 7.503582954406738,
      "learning_rate": 3.8530689125201184e-05,
      "loss": 1.8213,
      "step": 910
    },
    {
      "epoch": 0.09889283026980544,
      "grad_norm": 7.769185543060303,
      "learning_rate": 3.8464693945109305e-05,
      "loss": 1.8505,
      "step": 920
    },
    {
      "epoch": 0.0999677523379555,
      "grad_norm": 7.26152229309082,
      "learning_rate": 3.839796544507265e-05,
      "loss": 1.8225,
      "step": 930
    },
    {
      "epoch": 0.10104267440610555,
      "grad_norm": 7.977392673492432,
      "learning_rate": 3.833050636803849e-05,
      "loss": 1.8483,
      "step": 940
    },
    {
      "epoch": 0.10211759647425561,
      "grad_norm": 8.005775451660156,
      "learning_rate": 3.826231948698527e-05,
      "loss": 1.9364,
      "step": 950
    },
    {
      "epoch": 0.10319251854240567,
      "grad_norm": 6.598343849182129,
      "learning_rate": 3.819340760480859e-05,
      "loss": 1.8967,
      "step": 960
    },
    {
      "epoch": 0.10426744061055573,
      "grad_norm": 8.264419555664062,
      "learning_rate": 3.812377355420602e-05,
      "loss": 1.791,
      "step": 970
    },
    {
      "epoch": 0.1053423626787058,
      "grad_norm": 7.987963676452637,
      "learning_rate": 3.805342019756065e-05,
      "loss": 1.8654,
      "step": 980
    },
    {
      "epoch": 0.10641728474685586,
      "grad_norm": 8.105964660644531,
      "learning_rate": 3.7982350426823406e-05,
      "loss": 1.854,
      "step": 990
    },
    {
      "epoch": 0.10749220681500592,
      "grad_norm": 8.088004112243652,
      "learning_rate": 3.791056716339421e-05,
      "loss": 1.8123,
      "step": 1000
    },
    {
      "epoch": 0.10856712888315598,
      "grad_norm": 9.775269508361816,
      "learning_rate": 3.783807335800187e-05,
      "loss": 1.8431,
      "step": 1010
    },
    {
      "epoch": 0.10964205095130602,
      "grad_norm": 6.646794319152832,
      "learning_rate": 3.776487199058277e-05,
      "loss": 1.8865,
      "step": 1020
    },
    {
      "epoch": 0.11071697301945609,
      "grad_norm": 7.902348041534424,
      "learning_rate": 3.769096607015843e-05,
      "loss": 1.8599,
      "step": 1030
    },
    {
      "epoch": 0.11179189508760615,
      "grad_norm": 6.932212829589844,
      "learning_rate": 3.761635863471175e-05,
      "loss": 1.8696,
      "step": 1040
    },
    {
      "epoch": 0.11286681715575621,
      "grad_norm": 8.541101455688477,
      "learning_rate": 3.754105275106222e-05,
      "loss": 1.7824,
      "step": 1050
    },
    {
      "epoch": 0.11286681715575621,
      "eval_loss": 1.8909525871276855,
      "eval_runtime": 60.951,
      "eval_samples_per_second": 8.203,
      "eval_steps_per_second": 8.203,
      "step": 1050
    },
    {
      "epoch": 0.11394173922390627,
      "grad_norm": 6.613965034484863,
      "learning_rate": 3.746505151473972e-05,
      "loss": 1.8719,
      "step": 1060
    },
    {
      "epoch": 0.11501666129205633,
      "grad_norm": 7.055614948272705,
      "learning_rate": 3.738835804985743e-05,
      "loss": 1.8533,
      "step": 1070
    },
    {
      "epoch": 0.11609158336020639,
      "grad_norm": 8.038810729980469,
      "learning_rate": 3.731097550898329e-05,
      "loss": 1.8067,
      "step": 1080
    },
    {
      "epoch": 0.11716650542835644,
      "grad_norm": 6.726877689361572,
      "learning_rate": 3.723290707301047e-05,
      "loss": 1.8871,
      "step": 1090
    },
    {
      "epoch": 0.1182414274965065,
      "grad_norm": 7.802327632904053,
      "learning_rate": 3.7154155951026605e-05,
      "loss": 2.0685,
      "step": 1100
    },
    {
      "epoch": 0.11931634956465656,
      "grad_norm": 8.144506454467773,
      "learning_rate": 3.707472538018187e-05,
      "loss": 1.9074,
      "step": 1110
    },
    {
      "epoch": 0.12039127163280662,
      "grad_norm": 11.114158630371094,
      "learning_rate": 3.6994618625555925e-05,
      "loss": 1.8036,
      "step": 1120
    },
    {
      "epoch": 0.12146619370095668,
      "grad_norm": 8.913481712341309,
      "learning_rate": 3.691383898002368e-05,
      "loss": 1.8411,
      "step": 1130
    },
    {
      "epoch": 0.12254111576910674,
      "grad_norm": 7.231012344360352,
      "learning_rate": 3.683238976412e-05,
      "loss": 1.838,
      "step": 1140
    },
    {
      "epoch": 0.1236160378372568,
      "grad_norm": 7.460425853729248,
      "learning_rate": 3.675027432590312e-05,
      "loss": 1.907,
      "step": 1150
    },
    {
      "epoch": 0.12469095990540686,
      "grad_norm": 6.360142230987549,
      "learning_rate": 3.666749604081707e-05,
      "loss": 1.8568,
      "step": 1160
    },
    {
      "epoch": 0.12576588197355693,
      "grad_norm": 8.225682258605957,
      "learning_rate": 3.6584058311552954e-05,
      "loss": 1.8473,
      "step": 1170
    },
    {
      "epoch": 0.12684080404170697,
      "grad_norm": 8.119696617126465,
      "learning_rate": 3.6499964567909e-05,
      "loss": 1.7953,
      "step": 1180
    },
    {
      "epoch": 0.12791572610985705,
      "grad_norm": 7.61316442489624,
      "learning_rate": 3.641521826664964e-05,
      "loss": 1.7599,
      "step": 1190
    },
    {
      "epoch": 0.1289906481780071,
      "grad_norm": 7.965550422668457,
      "learning_rate": 3.63298228913634e-05,
      "loss": 1.8461,
      "step": 1200
    },
    {
      "epoch": 0.1289906481780071,
      "eval_loss": 1.8840419054031372,
      "eval_runtime": 62.5093,
      "eval_samples_per_second": 7.999,
      "eval_steps_per_second": 7.999,
      "step": 1200
    },
    {
      "epoch": 0.13006557024615714,
      "grad_norm": 5.871341228485107,
      "learning_rate": 3.624378195231967e-05,
      "loss": 1.8397,
      "step": 1210
    },
    {
      "epoch": 0.13114049231430722,
      "grad_norm": 18.254127502441406,
      "learning_rate": 3.615709898632448e-05,
      "loss": 1.8295,
      "step": 1220
    },
    {
      "epoch": 0.13221541438245726,
      "grad_norm": 8.392979621887207,
      "learning_rate": 3.606977755657502e-05,
      "loss": 1.8058,
      "step": 1230
    },
    {
      "epoch": 0.13329033645060734,
      "grad_norm": 10.364768981933594,
      "learning_rate": 3.5981821252513274e-05,
      "loss": 1.8656,
      "step": 1240
    },
    {
      "epoch": 0.13436525851875739,
      "grad_norm": 6.204119682312012,
      "learning_rate": 3.5893233689678384e-05,
      "loss": 1.8201,
      "step": 1250
    },
    {
      "epoch": 0.13544018058690746,
      "grad_norm": 8.470161437988281,
      "learning_rate": 3.5804018509558095e-05,
      "loss": 1.8629,
      "step": 1260
    },
    {
      "epoch": 0.1365151026550575,
      "grad_norm": 7.885449409484863,
      "learning_rate": 3.571417937943903e-05,
      "loss": 1.9337,
      "step": 1270
    },
    {
      "epoch": 0.13759002472320755,
      "grad_norm": 6.3673996925354,
      "learning_rate": 3.562371999225594e-05,
      "loss": 1.8661,
      "step": 1280
    },
    {
      "epoch": 0.13866494679135763,
      "grad_norm": 9.29290771484375,
      "learning_rate": 3.553264406643995e-05,
      "loss": 1.7597,
      "step": 1290
    },
    {
      "epoch": 0.13973986885950768,
      "grad_norm": 6.828396320343018,
      "learning_rate": 3.544095534576563e-05,
      "loss": 1.8641,
      "step": 1300
    },
    {
      "epoch": 0.14081479092765775,
      "grad_norm": 9.610004425048828,
      "learning_rate": 3.534865759919718e-05,
      "loss": 1.8846,
      "step": 1310
    },
    {
      "epoch": 0.1418897129958078,
      "grad_norm": 7.615134239196777,
      "learning_rate": 3.525575462073344e-05,
      "loss": 1.885,
      "step": 1320
    },
    {
      "epoch": 0.14296463506395787,
      "grad_norm": 7.668911933898926,
      "learning_rate": 3.516225022925199e-05,
      "loss": 1.79,
      "step": 1330
    },
    {
      "epoch": 0.14403955713210792,
      "grad_norm": 7.5038042068481445,
      "learning_rate": 3.5068148268352135e-05,
      "loss": 1.8394,
      "step": 1340
    },
    {
      "epoch": 0.14511447920025797,
      "grad_norm": 7.994359970092773,
      "learning_rate": 3.497345260619691e-05,
      "loss": 1.8393,
      "step": 1350
    },
    {
      "epoch": 0.14511447920025797,
      "eval_loss": 1.8796833753585815,
      "eval_runtime": 57.9394,
      "eval_samples_per_second": 8.63,
      "eval_steps_per_second": 8.63,
      "step": 1350
    }
  ],
  "logging_steps": 10,
  "max_steps": 5000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 150,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.751283405814497e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}