{
"best_metric": 1.877977728843689,
"best_model_checkpoint": "./output/checkpoint-450",
"epoch": 0.14511447920025797,
"eval_steps": 150,
"global_step": 1350,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001074922068150059,
"grad_norm": 43.13871765136719,
"learning_rate": 4.125e-06,
"loss": 2.2922,
"step": 10
},
{
"epoch": 0.002149844136300118,
"grad_norm": 26.323883056640625,
"learning_rate": 8.25e-06,
"loss": 2.1328,
"step": 20
},
{
"epoch": 0.0032247662044501773,
"grad_norm": 17.629505157470703,
"learning_rate": 1.2375e-05,
"loss": 2.036,
"step": 30
},
{
"epoch": 0.004299688272600236,
"grad_norm": 10.648378372192383,
"learning_rate": 1.65e-05,
"loss": 1.851,
"step": 40
},
{
"epoch": 0.005374610340750295,
"grad_norm": 17.73092269897461,
"learning_rate": 2.0625e-05,
"loss": 1.8746,
"step": 50
},
{
"epoch": 0.0064495324089003546,
"grad_norm": 6.716779708862305,
"learning_rate": 2.475e-05,
"loss": 1.8512,
"step": 60
},
{
"epoch": 0.007524454477050414,
"grad_norm": 15.877828598022461,
"learning_rate": 2.8874999999999997e-05,
"loss": 1.9416,
"step": 70
},
{
"epoch": 0.008599376545200472,
"grad_norm": 14.675684928894043,
"learning_rate": 3.3e-05,
"loss": 1.8581,
"step": 80
},
{
"epoch": 0.009674298613350531,
"grad_norm": 11.489137649536133,
"learning_rate": 3.7125e-05,
"loss": 1.9017,
"step": 90
},
{
"epoch": 0.01074922068150059,
"grad_norm": 7.483497619628906,
"learning_rate": 4.125e-05,
"loss": 1.9335,
"step": 100
},
{
"epoch": 0.01182414274965065,
"grad_norm": 9.6410551071167,
"learning_rate": 4.12495760935163e-05,
"loss": 1.7841,
"step": 110
},
{
"epoch": 0.012899064817800709,
"grad_norm": 7.8748979568481445,
"learning_rate": 4.1248304391490334e-05,
"loss": 1.8529,
"step": 120
},
{
"epoch": 0.013973986885950768,
"grad_norm": 9.455327987670898,
"learning_rate": 4.1246184946196796e-05,
"loss": 1.9366,
"step": 130
},
{
"epoch": 0.015048908954100828,
"grad_norm": 8.64035701751709,
"learning_rate": 4.124321784475777e-05,
"loss": 1.8501,
"step": 140
},
{
"epoch": 0.016123831022250887,
"grad_norm": 13.220332145690918,
"learning_rate": 4.123940320913919e-05,
"loss": 1.9095,
"step": 150
},
{
"epoch": 0.016123831022250887,
"eval_loss": 1.8831839561462402,
"eval_runtime": 61.5136,
"eval_samples_per_second": 8.128,
"eval_steps_per_second": 8.128,
"step": 150
},
{
"epoch": 0.017198753090400944,
"grad_norm": 8.182404518127441,
"learning_rate": 4.123474119614577e-05,
"loss": 1.8163,
"step": 160
},
{
"epoch": 0.018273675158551005,
"grad_norm": 12.389548301696777,
"learning_rate": 4.1229231997414614e-05,
"loss": 1.8781,
"step": 170
},
{
"epoch": 0.019348597226701063,
"grad_norm": 8.600774765014648,
"learning_rate": 4.1222875839407306e-05,
"loss": 1.8206,
"step": 180
},
{
"epoch": 0.020423519294851124,
"grad_norm": 7.161348819732666,
"learning_rate": 4.121567298340059e-05,
"loss": 1.8117,
"step": 190
},
{
"epoch": 0.02149844136300118,
"grad_norm": 7.655135631561279,
"learning_rate": 4.120762372547569e-05,
"loss": 1.794,
"step": 200
},
{
"epoch": 0.022573363431151242,
"grad_norm": 10.144853591918945,
"learning_rate": 4.119872839650605e-05,
"loss": 1.8959,
"step": 210
},
{
"epoch": 0.0236482854993013,
"grad_norm": 8.557960510253906,
"learning_rate": 4.118898736214381e-05,
"loss": 1.8676,
"step": 220
},
{
"epoch": 0.02472320756745136,
"grad_norm": 11.726237297058105,
"learning_rate": 4.117840102280475e-05,
"loss": 1.7744,
"step": 230
},
{
"epoch": 0.025798129635601418,
"grad_norm": 8.739320755004883,
"learning_rate": 4.116696981365181e-05,
"loss": 1.8178,
"step": 240
},
{
"epoch": 0.02687305170375148,
"grad_norm": 6.433278560638428,
"learning_rate": 4.115469420457721e-05,
"loss": 1.7578,
"step": 250
},
{
"epoch": 0.027947973771901537,
"grad_norm": 6.021564960479736,
"learning_rate": 4.1141574700183186e-05,
"loss": 1.8376,
"step": 260
},
{
"epoch": 0.029022895840051598,
"grad_norm": 8.607327461242676,
"learning_rate": 4.1127611839761155e-05,
"loss": 1.7681,
"step": 270
},
{
"epoch": 0.030097817908201655,
"grad_norm": 9.417404174804688,
"learning_rate": 4.111280619726964e-05,
"loss": 1.8459,
"step": 280
},
{
"epoch": 0.031172739976351716,
"grad_norm": 11.03150749206543,
"learning_rate": 4.109715838131059e-05,
"loss": 1.8467,
"step": 290
},
{
"epoch": 0.032247662044501774,
"grad_norm": 7.549683570861816,
"learning_rate": 4.108066903510445e-05,
"loss": 1.8979,
"step": 300
},
{
"epoch": 0.032247662044501774,
"eval_loss": 1.8808292150497437,
"eval_runtime": 60.731,
"eval_samples_per_second": 8.233,
"eval_steps_per_second": 8.233,
"step": 300
},
{
"epoch": 0.033322584112651835,
"grad_norm": 7.343403339385986,
"learning_rate": 4.106333883646366e-05,
"loss": 1.8544,
"step": 310
},
{
"epoch": 0.03439750618080189,
"grad_norm": 6.203611850738525,
"learning_rate": 4.104516849776479e-05,
"loss": 1.8245,
"step": 320
},
{
"epoch": 0.03547242824895195,
"grad_norm": 8.674558639526367,
"learning_rate": 4.1026158765919306e-05,
"loss": 1.8058,
"step": 330
},
{
"epoch": 0.03654735031710201,
"grad_norm": 9.196893692016602,
"learning_rate": 4.100631042234283e-05,
"loss": 1.8813,
"step": 340
},
{
"epoch": 0.03762227238525207,
"grad_norm": 10.656414985656738,
"learning_rate": 4.098562428292304e-05,
"loss": 1.7876,
"step": 350
},
{
"epoch": 0.038697194453402126,
"grad_norm": 6.581936836242676,
"learning_rate": 4.096410119798607e-05,
"loss": 1.8134,
"step": 360
},
{
"epoch": 0.03977211652155219,
"grad_norm": 9.650823593139648,
"learning_rate": 4.094174205226167e-05,
"loss": 1.7727,
"step": 370
},
{
"epoch": 0.04084703858970225,
"grad_norm": 10.17803955078125,
"learning_rate": 4.0918547764846736e-05,
"loss": 1.7945,
"step": 380
},
{
"epoch": 0.04192196065785231,
"grad_norm": 8.77602481842041,
"learning_rate": 4.089451928916758e-05,
"loss": 1.9062,
"step": 390
},
{
"epoch": 0.04299688272600236,
"grad_norm": 6.488632678985596,
"learning_rate": 4.0869657612940723e-05,
"loss": 1.774,
"step": 400
},
{
"epoch": 0.044071804794152424,
"grad_norm": 10.849347114562988,
"learning_rate": 4.08439637581323e-05,
"loss": 1.8497,
"step": 410
},
{
"epoch": 0.045146726862302484,
"grad_norm": 6.79919958114624,
"learning_rate": 4.081743878091604e-05,
"loss": 1.9205,
"step": 420
},
{
"epoch": 0.046221648930452545,
"grad_norm": 10.169675827026367,
"learning_rate": 4.079008377162988e-05,
"loss": 1.8459,
"step": 430
},
{
"epoch": 0.0472965709986026,
"grad_norm": 6.641557216644287,
"learning_rate": 4.0761899854731085e-05,
"loss": 1.867,
"step": 440
},
{
"epoch": 0.04837149306675266,
"grad_norm": 10.017315864562988,
"learning_rate": 4.073288818875011e-05,
"loss": 1.8095,
"step": 450
},
{
"epoch": 0.04837149306675266,
"eval_loss": 1.877977728843689,
"eval_runtime": 60.8543,
"eval_samples_per_second": 8.216,
"eval_steps_per_second": 8.216,
"step": 450
},
{
"epoch": 0.04944641513490272,
"grad_norm": 7.070505619049072,
"learning_rate": 4.070304996624291e-05,
"loss": 1.8261,
"step": 460
},
{
"epoch": 0.050521337203052775,
"grad_norm": 7.832033157348633,
"learning_rate": 4.067238641374194e-05,
"loss": 1.8129,
"step": 470
},
{
"epoch": 0.051596259271202836,
"grad_norm": 8.640854835510254,
"learning_rate": 4.0640898791705745e-05,
"loss": 1.8887,
"step": 480
},
{
"epoch": 0.0526711813393529,
"grad_norm": 8.050338745117188,
"learning_rate": 4.060858839446713e-05,
"loss": 1.8551,
"step": 490
},
{
"epoch": 0.05374610340750296,
"grad_norm": 5.903382778167725,
"learning_rate": 4.057545655017998e-05,
"loss": 1.7708,
"step": 500
},
{
"epoch": 0.05482102547565301,
"grad_norm": 6.558720111846924,
"learning_rate": 4.054150462076465e-05,
"loss": 1.8277,
"step": 510
},
{
"epoch": 0.05589594754380307,
"grad_norm": 9.94021224975586,
"learning_rate": 4.0506734001851976e-05,
"loss": 1.8834,
"step": 520
},
{
"epoch": 0.056970869611953134,
"grad_norm": 11.147041320800781,
"learning_rate": 4.0471146122725904e-05,
"loss": 1.8697,
"step": 530
},
{
"epoch": 0.058045791680103195,
"grad_norm": 7.040550708770752,
"learning_rate": 4.043474244626477e-05,
"loss": 1.8605,
"step": 540
},
{
"epoch": 0.05912071374825325,
"grad_norm": 8.166383743286133,
"learning_rate": 4.0397524468881125e-05,
"loss": 1.8772,
"step": 550
},
{
"epoch": 0.06019563581640331,
"grad_norm": 6.262266159057617,
"learning_rate": 4.0359493720460244e-05,
"loss": 1.8334,
"step": 560
},
{
"epoch": 0.06127055788455337,
"grad_norm": 6.04037618637085,
"learning_rate": 4.032065176429724e-05,
"loss": 1.8355,
"step": 570
},
{
"epoch": 0.06234547995270343,
"grad_norm": 6.709385871887207,
"learning_rate": 4.0281000197032795e-05,
"loss": 1.8567,
"step": 580
},
{
"epoch": 0.06342040202085349,
"grad_norm": 6.928658962249756,
"learning_rate": 4.0240540648587546e-05,
"loss": 1.8503,
"step": 590
},
{
"epoch": 0.06449532408900355,
"grad_norm": 5.788703918457031,
"learning_rate": 4.019927478209504e-05,
"loss": 1.8522,
"step": 600
},
{
"epoch": 0.06449532408900355,
"eval_loss": 1.8825602531433105,
"eval_runtime": 60.165,
"eval_samples_per_second": 8.31,
"eval_steps_per_second": 8.31,
"step": 600
},
{
"epoch": 0.06557024615715361,
"grad_norm": 9.176572799682617,
"learning_rate": 4.015720429383344e-05,
"loss": 1.8688,
"step": 610
},
{
"epoch": 0.06664516822530367,
"grad_norm": 7.160205841064453,
"learning_rate": 4.0114330913155726e-05,
"loss": 1.8031,
"step": 620
},
{
"epoch": 0.06772009029345373,
"grad_norm": 9.988216400146484,
"learning_rate": 4.007065640241867e-05,
"loss": 1.8262,
"step": 630
},
{
"epoch": 0.06879501236160378,
"grad_norm": 9.315797805786133,
"learning_rate": 4.002618255691033e-05,
"loss": 1.8542,
"step": 640
},
{
"epoch": 0.06986993442975384,
"grad_norm": 7.385815143585205,
"learning_rate": 3.9980911204776306e-05,
"loss": 1.8328,
"step": 650
},
{
"epoch": 0.0709448564979039,
"grad_norm": 7.70832633972168,
"learning_rate": 3.993484420694458e-05,
"loss": 1.9542,
"step": 660
},
{
"epoch": 0.07201977856605396,
"grad_norm": 10.131119728088379,
"learning_rate": 3.988798345704899e-05,
"loss": 1.8389,
"step": 670
},
{
"epoch": 0.07309470063420402,
"grad_norm": 6.0609354972839355,
"learning_rate": 3.984033088135143e-05,
"loss": 1.8653,
"step": 680
},
{
"epoch": 0.07416962270235408,
"grad_norm": 9.173294067382812,
"learning_rate": 3.979188843866263e-05,
"loss": 1.8558,
"step": 690
},
{
"epoch": 0.07524454477050414,
"grad_norm": 5.944100856781006,
"learning_rate": 3.97426581202617e-05,
"loss": 1.922,
"step": 700
},
{
"epoch": 0.0763194668386542,
"grad_norm": 7.453153133392334,
"learning_rate": 3.969264194981418e-05,
"loss": 1.8524,
"step": 710
},
{
"epoch": 0.07739438890680425,
"grad_norm": 6.75320291519165,
"learning_rate": 3.9641841983288953e-05,
"loss": 1.8699,
"step": 720
},
{
"epoch": 0.07846931097495431,
"grad_norm": 6.539112567901611,
"learning_rate": 3.959026030887367e-05,
"loss": 1.7784,
"step": 730
},
{
"epoch": 0.07954423304310437,
"grad_norm": 7.794706344604492,
"learning_rate": 3.953789904688893e-05,
"loss": 1.8222,
"step": 740
},
{
"epoch": 0.08061915511125443,
"grad_norm": 7.376310348510742,
"learning_rate": 3.948476034970113e-05,
"loss": 1.7985,
"step": 750
},
{
"epoch": 0.08061915511125443,
"eval_loss": 1.8882893323898315,
"eval_runtime": 60.264,
"eval_samples_per_second": 8.297,
"eval_steps_per_second": 8.297,
"step": 750
},
{
"epoch": 0.0816940771794045,
"grad_norm": 8.657331466674805,
"learning_rate": 3.943084640163398e-05,
"loss": 1.7983,
"step": 760
},
{
"epoch": 0.08276899924755456,
"grad_norm": 5.748507976531982,
"learning_rate": 3.937615941887873e-05,
"loss": 1.884,
"step": 770
},
{
"epoch": 0.08384392131570462,
"grad_norm": 6.509608745574951,
"learning_rate": 3.932070164940304e-05,
"loss": 1.891,
"step": 780
},
{
"epoch": 0.08491884338385466,
"grad_norm": 6.634593486785889,
"learning_rate": 3.926447537285859e-05,
"loss": 1.776,
"step": 790
},
{
"epoch": 0.08599376545200473,
"grad_norm": 9.289311408996582,
"learning_rate": 3.920748290048739e-05,
"loss": 1.7806,
"step": 800
},
{
"epoch": 0.08706868752015479,
"grad_norm": 7.693164348602295,
"learning_rate": 3.914972657502677e-05,
"loss": 1.8428,
"step": 810
},
{
"epoch": 0.08814360958830485,
"grad_norm": 5.920309066772461,
"learning_rate": 3.9091208770613036e-05,
"loss": 1.8048,
"step": 820
},
{
"epoch": 0.08921853165645491,
"grad_norm": 7.512606143951416,
"learning_rate": 3.9031931892683937e-05,
"loss": 1.848,
"step": 830
},
{
"epoch": 0.09029345372460497,
"grad_norm": 8.341816902160645,
"learning_rate": 3.897189837787975e-05,
"loss": 1.7995,
"step": 840
},
{
"epoch": 0.09136837579275503,
"grad_norm": 7.76444149017334,
"learning_rate": 3.891111069394313e-05,
"loss": 1.7587,
"step": 850
},
{
"epoch": 0.09244329786090509,
"grad_norm": 8.347227096557617,
"learning_rate": 3.884957133961768e-05,
"loss": 1.8215,
"step": 860
},
{
"epoch": 0.09351821992905514,
"grad_norm": 7.596982955932617,
"learning_rate": 3.878728284454522e-05,
"loss": 1.8831,
"step": 870
},
{
"epoch": 0.0945931419972052,
"grad_norm": 7.217879295349121,
"learning_rate": 3.872424776916183e-05,
"loss": 1.9871,
"step": 880
},
{
"epoch": 0.09566806406535526,
"grad_norm": 6.474759101867676,
"learning_rate": 3.866046870459253e-05,
"loss": 1.8137,
"step": 890
},
{
"epoch": 0.09674298613350532,
"grad_norm": 7.693883419036865,
"learning_rate": 3.8595948272544905e-05,
"loss": 1.884,
"step": 900
},
{
"epoch": 0.09674298613350532,
"eval_loss": 1.8837863206863403,
"eval_runtime": 61.2639,
"eval_samples_per_second": 8.161,
"eval_steps_per_second": 8.161,
"step": 900
},
{
"epoch": 0.09781790820165538,
"grad_norm": 7.503582954406738,
"learning_rate": 3.8530689125201184e-05,
"loss": 1.8213,
"step": 910
},
{
"epoch": 0.09889283026980544,
"grad_norm": 7.769185543060303,
"learning_rate": 3.8464693945109305e-05,
"loss": 1.8505,
"step": 920
},
{
"epoch": 0.0999677523379555,
"grad_norm": 7.26152229309082,
"learning_rate": 3.839796544507265e-05,
"loss": 1.8225,
"step": 930
},
{
"epoch": 0.10104267440610555,
"grad_norm": 7.977392673492432,
"learning_rate": 3.833050636803849e-05,
"loss": 1.8483,
"step": 940
},
{
"epoch": 0.10211759647425561,
"grad_norm": 8.005775451660156,
"learning_rate": 3.826231948698527e-05,
"loss": 1.9364,
"step": 950
},
{
"epoch": 0.10319251854240567,
"grad_norm": 6.598343849182129,
"learning_rate": 3.819340760480859e-05,
"loss": 1.8967,
"step": 960
},
{
"epoch": 0.10426744061055573,
"grad_norm": 8.264419555664062,
"learning_rate": 3.812377355420602e-05,
"loss": 1.791,
"step": 970
},
{
"epoch": 0.1053423626787058,
"grad_norm": 7.987963676452637,
"learning_rate": 3.805342019756065e-05,
"loss": 1.8654,
"step": 980
},
{
"epoch": 0.10641728474685586,
"grad_norm": 8.105964660644531,
"learning_rate": 3.7982350426823406e-05,
"loss": 1.854,
"step": 990
},
{
"epoch": 0.10749220681500592,
"grad_norm": 8.088004112243652,
"learning_rate": 3.791056716339421e-05,
"loss": 1.8123,
"step": 1000
},
{
"epoch": 0.10856712888315598,
"grad_norm": 9.775269508361816,
"learning_rate": 3.783807335800187e-05,
"loss": 1.8431,
"step": 1010
},
{
"epoch": 0.10964205095130602,
"grad_norm": 6.646794319152832,
"learning_rate": 3.776487199058277e-05,
"loss": 1.8865,
"step": 1020
},
{
"epoch": 0.11071697301945609,
"grad_norm": 7.902348041534424,
"learning_rate": 3.769096607015843e-05,
"loss": 1.8599,
"step": 1030
},
{
"epoch": 0.11179189508760615,
"grad_norm": 6.932212829589844,
"learning_rate": 3.761635863471175e-05,
"loss": 1.8696,
"step": 1040
},
{
"epoch": 0.11286681715575621,
"grad_norm": 8.541101455688477,
"learning_rate": 3.754105275106222e-05,
"loss": 1.7824,
"step": 1050
},
{
"epoch": 0.11286681715575621,
"eval_loss": 1.8909525871276855,
"eval_runtime": 60.951,
"eval_samples_per_second": 8.203,
"eval_steps_per_second": 8.203,
"step": 1050
},
{
"epoch": 0.11394173922390627,
"grad_norm": 6.613965034484863,
"learning_rate": 3.746505151473972e-05,
"loss": 1.8719,
"step": 1060
},
{
"epoch": 0.11501666129205633,
"grad_norm": 7.055614948272705,
"learning_rate": 3.738835804985743e-05,
"loss": 1.8533,
"step": 1070
},
{
"epoch": 0.11609158336020639,
"grad_norm": 8.038810729980469,
"learning_rate": 3.731097550898329e-05,
"loss": 1.8067,
"step": 1080
},
{
"epoch": 0.11716650542835644,
"grad_norm": 6.726877689361572,
"learning_rate": 3.723290707301047e-05,
"loss": 1.8871,
"step": 1090
},
{
"epoch": 0.1182414274965065,
"grad_norm": 7.802327632904053,
"learning_rate": 3.7154155951026605e-05,
"loss": 2.0685,
"step": 1100
},
{
"epoch": 0.11931634956465656,
"grad_norm": 8.144506454467773,
"learning_rate": 3.707472538018187e-05,
"loss": 1.9074,
"step": 1110
},
{
"epoch": 0.12039127163280662,
"grad_norm": 11.114158630371094,
"learning_rate": 3.6994618625555925e-05,
"loss": 1.8036,
"step": 1120
},
{
"epoch": 0.12146619370095668,
"grad_norm": 8.913481712341309,
"learning_rate": 3.691383898002368e-05,
"loss": 1.8411,
"step": 1130
},
{
"epoch": 0.12254111576910674,
"grad_norm": 7.231012344360352,
"learning_rate": 3.683238976412e-05,
"loss": 1.838,
"step": 1140
},
{
"epoch": 0.1236160378372568,
"grad_norm": 7.460425853729248,
"learning_rate": 3.675027432590312e-05,
"loss": 1.907,
"step": 1150
},
{
"epoch": 0.12469095990540686,
"grad_norm": 6.360142230987549,
"learning_rate": 3.666749604081707e-05,
"loss": 1.8568,
"step": 1160
},
{
"epoch": 0.12576588197355693,
"grad_norm": 8.225682258605957,
"learning_rate": 3.6584058311552954e-05,
"loss": 1.8473,
"step": 1170
},
{
"epoch": 0.12684080404170697,
"grad_norm": 8.119696617126465,
"learning_rate": 3.6499964567909e-05,
"loss": 1.7953,
"step": 1180
},
{
"epoch": 0.12791572610985705,
"grad_norm": 7.61316442489624,
"learning_rate": 3.641521826664964e-05,
"loss": 1.7599,
"step": 1190
},
{
"epoch": 0.1289906481780071,
"grad_norm": 7.965550422668457,
"learning_rate": 3.63298228913634e-05,
"loss": 1.8461,
"step": 1200
},
{
"epoch": 0.1289906481780071,
"eval_loss": 1.8840419054031372,
"eval_runtime": 62.5093,
"eval_samples_per_second": 7.999,
"eval_steps_per_second": 7.999,
"step": 1200
},
{
"epoch": 0.13006557024615714,
"grad_norm": 5.871341228485107,
"learning_rate": 3.624378195231967e-05,
"loss": 1.8397,
"step": 1210
},
{
"epoch": 0.13114049231430722,
"grad_norm": 18.254127502441406,
"learning_rate": 3.615709898632448e-05,
"loss": 1.8295,
"step": 1220
},
{
"epoch": 0.13221541438245726,
"grad_norm": 8.392979621887207,
"learning_rate": 3.606977755657502e-05,
"loss": 1.8058,
"step": 1230
},
{
"epoch": 0.13329033645060734,
"grad_norm": 10.364768981933594,
"learning_rate": 3.5981821252513274e-05,
"loss": 1.8656,
"step": 1240
},
{
"epoch": 0.13436525851875739,
"grad_norm": 6.204119682312012,
"learning_rate": 3.5893233689678384e-05,
"loss": 1.8201,
"step": 1250
},
{
"epoch": 0.13544018058690746,
"grad_norm": 8.470161437988281,
"learning_rate": 3.5804018509558095e-05,
"loss": 1.8629,
"step": 1260
},
{
"epoch": 0.1365151026550575,
"grad_norm": 7.885449409484863,
"learning_rate": 3.571417937943903e-05,
"loss": 1.9337,
"step": 1270
},
{
"epoch": 0.13759002472320755,
"grad_norm": 6.3673996925354,
"learning_rate": 3.562371999225594e-05,
"loss": 1.8661,
"step": 1280
},
{
"epoch": 0.13866494679135763,
"grad_norm": 9.29290771484375,
"learning_rate": 3.553264406643995e-05,
"loss": 1.7597,
"step": 1290
},
{
"epoch": 0.13973986885950768,
"grad_norm": 6.828396320343018,
"learning_rate": 3.544095534576563e-05,
"loss": 1.8641,
"step": 1300
},
{
"epoch": 0.14081479092765775,
"grad_norm": 9.610004425048828,
"learning_rate": 3.534865759919718e-05,
"loss": 1.8846,
"step": 1310
},
{
"epoch": 0.1418897129958078,
"grad_norm": 7.615134239196777,
"learning_rate": 3.525575462073344e-05,
"loss": 1.885,
"step": 1320
},
{
"epoch": 0.14296463506395787,
"grad_norm": 7.668911933898926,
"learning_rate": 3.516225022925199e-05,
"loss": 1.79,
"step": 1330
},
{
"epoch": 0.14403955713210792,
"grad_norm": 7.5038042068481445,
"learning_rate": 3.5068148268352135e-05,
"loss": 1.8394,
"step": 1340
},
{
"epoch": 0.14511447920025797,
"grad_norm": 7.994359970092773,
"learning_rate": 3.497345260619691e-05,
"loss": 1.8393,
"step": 1350
},
{
"epoch": 0.14511447920025797,
"eval_loss": 1.8796833753585815,
"eval_runtime": 57.9394,
"eval_samples_per_second": 8.63,
"eval_steps_per_second": 8.63,
"step": 1350
}
],
"logging_steps": 10,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 150,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.751283405814497e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}