{
"best_global_step": 11000,
"best_metric": 1.5041238069534302,
"best_model_checkpoint": "./results/hierarchical_music_t5_small_finetune/checkpoint-11000",
"epoch": 4.898065952414081,
"eval_steps": 500,
"global_step": 11000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.022262418255182968,
"grad_norm": 0.8790799975395203,
"learning_rate": 1.088888888888889e-05,
"loss": 1.7866,
"step": 50
},
{
"epoch": 0.044524836510365935,
"grad_norm": 0.8580410480499268,
"learning_rate": 2.2000000000000003e-05,
"loss": 1.7931,
"step": 100
},
{
"epoch": 0.0667872547655489,
"grad_norm": 1.150437355041504,
"learning_rate": 3.311111111111112e-05,
"loss": 1.7747,
"step": 150
},
{
"epoch": 0.08904967302073187,
"grad_norm": 1.1797043085098267,
"learning_rate": 4.422222222222222e-05,
"loss": 1.7777,
"step": 200
},
{
"epoch": 0.11131209127591485,
"grad_norm": 1.1714961528778076,
"learning_rate": 5e-05,
"loss": 1.7833,
"step": 250
},
{
"epoch": 0.1335745095310978,
"grad_norm": 1.29172682762146,
"learning_rate": 5e-05,
"loss": 1.7813,
"step": 300
},
{
"epoch": 0.15583692778628078,
"grad_norm": 1.0865519046783447,
"learning_rate": 5e-05,
"loss": 1.7737,
"step": 350
},
{
"epoch": 0.17809934604146374,
"grad_norm": 1.1579242944717407,
"learning_rate": 5e-05,
"loss": 1.7716,
"step": 400
},
{
"epoch": 0.20036176429664673,
"grad_norm": 1.1099216938018799,
"learning_rate": 5e-05,
"loss": 1.7877,
"step": 450
},
{
"epoch": 0.2226241825518297,
"grad_norm": 1.1747430562973022,
"learning_rate": 5e-05,
"loss": 1.7839,
"step": 500
},
{
"epoch": 0.2226241825518297,
"eval_loss": 1.6513175964355469,
"eval_runtime": 41.1785,
"eval_samples_per_second": 387.848,
"eval_steps_per_second": 48.496,
"step": 500
},
{
"epoch": 0.24488660080701266,
"grad_norm": 1.154510259628296,
"learning_rate": 5e-05,
"loss": 1.772,
"step": 550
},
{
"epoch": 0.2671490190621956,
"grad_norm": 0.99709153175354,
"learning_rate": 5e-05,
"loss": 1.7802,
"step": 600
},
{
"epoch": 0.2894114373173786,
"grad_norm": 1.3202115297317505,
"learning_rate": 5e-05,
"loss": 1.7565,
"step": 650
},
{
"epoch": 0.31167385557256155,
"grad_norm": 1.3452224731445312,
"learning_rate": 5e-05,
"loss": 1.7612,
"step": 700
},
{
"epoch": 0.33393627382774455,
"grad_norm": 1.1056386232376099,
"learning_rate": 5e-05,
"loss": 1.7696,
"step": 750
},
{
"epoch": 0.3561986920829275,
"grad_norm": 1.0761163234710693,
"learning_rate": 5e-05,
"loss": 1.6937,
"step": 800
},
{
"epoch": 0.3784611103381105,
"grad_norm": 0.9081959128379822,
"learning_rate": 5e-05,
"loss": 1.6766,
"step": 850
},
{
"epoch": 0.40072352859329347,
"grad_norm": 0.9897329211235046,
"learning_rate": 5e-05,
"loss": 1.6804,
"step": 900
},
{
"epoch": 0.4229859468484764,
"grad_norm": 0.992655873298645,
"learning_rate": 5e-05,
"loss": 1.6584,
"step": 950
},
{
"epoch": 0.4452483651036594,
"grad_norm": 1.0000704526901245,
"learning_rate": 5e-05,
"loss": 1.6576,
"step": 1000
},
{
"epoch": 0.4452483651036594,
"eval_loss": 1.6615262031555176,
"eval_runtime": 40.7337,
"eval_samples_per_second": 392.083,
"eval_steps_per_second": 49.026,
"step": 1000
},
{
"epoch": 0.46751078335884233,
"grad_norm": 0.9494450092315674,
"learning_rate": 5e-05,
"loss": 1.6604,
"step": 1050
},
{
"epoch": 0.4897732016140253,
"grad_norm": 0.9924134612083435,
"learning_rate": 5e-05,
"loss": 1.6539,
"step": 1100
},
{
"epoch": 0.5120356198692083,
"grad_norm": 1.0620170831680298,
"learning_rate": 5e-05,
"loss": 1.6552,
"step": 1150
},
{
"epoch": 0.5342980381243913,
"grad_norm": 1.1163603067398071,
"learning_rate": 5e-05,
"loss": 1.6452,
"step": 1200
},
{
"epoch": 0.5565604563795742,
"grad_norm": 1.025298833847046,
"learning_rate": 5e-05,
"loss": 1.6468,
"step": 1250
},
{
"epoch": 0.5788228746347572,
"grad_norm": 0.9661399722099304,
"learning_rate": 5e-05,
"loss": 1.6377,
"step": 1300
},
{
"epoch": 0.6010852928899402,
"grad_norm": 0.9570266008377075,
"learning_rate": 5e-05,
"loss": 1.6525,
"step": 1350
},
{
"epoch": 0.6233477111451231,
"grad_norm": 0.9325594902038574,
"learning_rate": 5e-05,
"loss": 1.6443,
"step": 1400
},
{
"epoch": 0.6456101294003062,
"grad_norm": 1.071475625038147,
"learning_rate": 5e-05,
"loss": 1.6418,
"step": 1450
},
{
"epoch": 0.6678725476554891,
"grad_norm": 0.9684040546417236,
"learning_rate": 5e-05,
"loss": 1.6396,
"step": 1500
},
{
"epoch": 0.6678725476554891,
"eval_loss": 1.6649832725524902,
"eval_runtime": 40.9434,
"eval_samples_per_second": 390.075,
"eval_steps_per_second": 48.775,
"step": 1500
},
{
"epoch": 0.690134965910672,
"grad_norm": 1.0452582836151123,
"learning_rate": 5e-05,
"loss": 1.6583,
"step": 1550
},
{
"epoch": 0.712397384165855,
"grad_norm": 0.8643882274627686,
"learning_rate": 5e-05,
"loss": 1.6538,
"step": 1600
},
{
"epoch": 0.734659802421038,
"grad_norm": 1.0304285287857056,
"learning_rate": 5e-05,
"loss": 1.6653,
"step": 1650
},
{
"epoch": 0.756922220676221,
"grad_norm": 1.1433496475219727,
"learning_rate": 5e-05,
"loss": 1.6605,
"step": 1700
},
{
"epoch": 0.7791846389314039,
"grad_norm": 0.9240351319313049,
"learning_rate": 5e-05,
"loss": 1.6696,
"step": 1750
},
{
"epoch": 0.8014470571865869,
"grad_norm": 1.0242925882339478,
"learning_rate": 5e-05,
"loss": 1.6697,
"step": 1800
},
{
"epoch": 0.8237094754417699,
"grad_norm": 0.9509591460227966,
"learning_rate": 5e-05,
"loss": 1.6859,
"step": 1850
},
{
"epoch": 0.8459718936969528,
"grad_norm": 1.2701749801635742,
"learning_rate": 5e-05,
"loss": 1.6885,
"step": 1900
},
{
"epoch": 0.8682343119521359,
"grad_norm": 1.4032883644104004,
"learning_rate": 5e-05,
"loss": 1.6935,
"step": 1950
},
{
"epoch": 0.8904967302073188,
"grad_norm": 1.2004971504211426,
"learning_rate": 5e-05,
"loss": 1.7168,
"step": 2000
},
{
"epoch": 0.8904967302073188,
"eval_loss": 1.631461262702942,
"eval_runtime": 40.9613,
"eval_samples_per_second": 389.905,
"eval_steps_per_second": 48.753,
"step": 2000
},
{
"epoch": 0.9127591484625017,
"grad_norm": 1.193617820739746,
"learning_rate": 5e-05,
"loss": 1.7404,
"step": 2050
},
{
"epoch": 0.9350215667176847,
"grad_norm": 1.424216866493225,
"learning_rate": 5e-05,
"loss": 1.7444,
"step": 2100
},
{
"epoch": 0.9572839849728677,
"grad_norm": 1.2657979726791382,
"learning_rate": 5e-05,
"loss": 1.7529,
"step": 2150
},
{
"epoch": 0.9795464032280506,
"grad_norm": 1.1823986768722534,
"learning_rate": 5e-05,
"loss": 1.7524,
"step": 2200
},
{
"epoch": 1.0017809934604147,
"grad_norm": 1.078079104423523,
"learning_rate": 5e-05,
"loss": 1.7432,
"step": 2250
},
{
"epoch": 1.0240434117155977,
"grad_norm": 1.279813528060913,
"learning_rate": 5e-05,
"loss": 1.7372,
"step": 2300
},
{
"epoch": 1.0463058299707806,
"grad_norm": 1.1668626070022583,
"learning_rate": 5e-05,
"loss": 1.7296,
"step": 2350
},
{
"epoch": 1.0685682482259635,
"grad_norm": 1.0546634197235107,
"learning_rate": 5e-05,
"loss": 1.7324,
"step": 2400
},
{
"epoch": 1.0908306664811465,
"grad_norm": 1.1601485013961792,
"learning_rate": 5e-05,
"loss": 1.7401,
"step": 2450
},
{
"epoch": 1.1130930847363294,
"grad_norm": 1.463930368423462,
"learning_rate": 5e-05,
"loss": 1.7366,
"step": 2500
},
{
"epoch": 1.1130930847363294,
"eval_loss": 1.6233899593353271,
"eval_runtime": 41.069,
"eval_samples_per_second": 388.882,
"eval_steps_per_second": 48.626,
"step": 2500
},
{
"epoch": 1.1353555029915126,
"grad_norm": 1.172264575958252,
"learning_rate": 5e-05,
"loss": 1.7348,
"step": 2550
},
{
"epoch": 1.1576179212466955,
"grad_norm": 1.076794981956482,
"learning_rate": 5e-05,
"loss": 1.7463,
"step": 2600
},
{
"epoch": 1.1798803395018784,
"grad_norm": 1.0754376649856567,
"learning_rate": 5e-05,
"loss": 1.7378,
"step": 2650
},
{
"epoch": 1.2021427577570614,
"grad_norm": 1.3081718683242798,
"learning_rate": 5e-05,
"loss": 1.7251,
"step": 2700
},
{
"epoch": 1.2244051760122443,
"grad_norm": 1.0483145713806152,
"learning_rate": 5e-05,
"loss": 1.7414,
"step": 2750
},
{
"epoch": 1.2466675942674272,
"grad_norm": 1.2890243530273438,
"learning_rate": 5e-05,
"loss": 1.7254,
"step": 2800
},
{
"epoch": 1.2689300125226102,
"grad_norm": 1.0999932289123535,
"learning_rate": 5e-05,
"loss": 1.7333,
"step": 2850
},
{
"epoch": 1.291192430777793,
"grad_norm": 1.0996226072311401,
"learning_rate": 5e-05,
"loss": 1.7151,
"step": 2900
},
{
"epoch": 1.3134548490329763,
"grad_norm": 1.3446428775787354,
"learning_rate": 5e-05,
"loss": 1.7088,
"step": 2950
},
{
"epoch": 1.3357172672881592,
"grad_norm": 0.9657168388366699,
"learning_rate": 5e-05,
"loss": 1.7171,
"step": 3000
},
{
"epoch": 1.3357172672881592,
"eval_loss": 1.6028199195861816,
"eval_runtime": 41.2736,
"eval_samples_per_second": 386.954,
"eval_steps_per_second": 48.384,
"step": 3000
},
{
"epoch": 1.3579796855433421,
"grad_norm": 0.904662549495697,
"learning_rate": 5e-05,
"loss": 1.6656,
"step": 3050
},
{
"epoch": 1.380242103798525,
"grad_norm": 1.2054646015167236,
"learning_rate": 5e-05,
"loss": 1.6409,
"step": 3100
},
{
"epoch": 1.402504522053708,
"grad_norm": 0.8623887300491333,
"learning_rate": 5e-05,
"loss": 1.6378,
"step": 3150
},
{
"epoch": 1.4247669403088912,
"grad_norm": 0.931481659412384,
"learning_rate": 5e-05,
"loss": 1.6395,
"step": 3200
},
{
"epoch": 1.447029358564074,
"grad_norm": 0.8971887826919556,
"learning_rate": 5e-05,
"loss": 1.6338,
"step": 3250
},
{
"epoch": 1.469291776819257,
"grad_norm": 0.9754030704498291,
"learning_rate": 5e-05,
"loss": 1.6341,
"step": 3300
},
{
"epoch": 1.49155419507444,
"grad_norm": 0.9373458027839661,
"learning_rate": 5e-05,
"loss": 1.6214,
"step": 3350
},
{
"epoch": 1.513816613329623,
"grad_norm": 1.1765072345733643,
"learning_rate": 5e-05,
"loss": 1.622,
"step": 3400
},
{
"epoch": 1.5360790315848059,
"grad_norm": 0.9341714382171631,
"learning_rate": 5e-05,
"loss": 1.6242,
"step": 3450
},
{
"epoch": 1.5583414498399888,
"grad_norm": 0.8690816164016724,
"learning_rate": 5e-05,
"loss": 1.6238,
"step": 3500
},
{
"epoch": 1.5583414498399888,
"eval_loss": 1.6130000352859497,
"eval_runtime": 40.5463,
"eval_samples_per_second": 393.896,
"eval_steps_per_second": 49.252,
"step": 3500
},
{
"epoch": 1.5806038680951717,
"grad_norm": 1.0579187870025635,
"learning_rate": 5e-05,
"loss": 1.611,
"step": 3550
},
{
"epoch": 1.6028662863503547,
"grad_norm": 0.8839408159255981,
"learning_rate": 5e-05,
"loss": 1.6106,
"step": 3600
},
{
"epoch": 1.6251287046055378,
"grad_norm": 1.048997402191162,
"learning_rate": 5e-05,
"loss": 1.6075,
"step": 3650
},
{
"epoch": 1.6473911228607208,
"grad_norm": 1.201557993888855,
"learning_rate": 5e-05,
"loss": 1.621,
"step": 3700
},
{
"epoch": 1.6696535411159037,
"grad_norm": 0.9804443717002869,
"learning_rate": 5e-05,
"loss": 1.6079,
"step": 3750
},
{
"epoch": 1.6919159593710866,
"grad_norm": 0.9969685077667236,
"learning_rate": 5e-05,
"loss": 1.6281,
"step": 3800
},
{
"epoch": 1.7141783776262698,
"grad_norm": 1.0730953216552734,
"learning_rate": 5e-05,
"loss": 1.6235,
"step": 3850
},
{
"epoch": 1.7364407958814527,
"grad_norm": 1.1014162302017212,
"learning_rate": 5e-05,
"loss": 1.6349,
"step": 3900
},
{
"epoch": 1.7587032141366357,
"grad_norm": 0.9518324732780457,
"learning_rate": 5e-05,
"loss": 1.6271,
"step": 3950
},
{
"epoch": 1.7809656323918186,
"grad_norm": 1.0745582580566406,
"learning_rate": 5e-05,
"loss": 1.6217,
"step": 4000
},
{
"epoch": 1.7809656323918186,
"eval_loss": 1.6218018531799316,
"eval_runtime": 41.2037,
"eval_samples_per_second": 387.611,
"eval_steps_per_second": 48.466,
"step": 4000
},
{
"epoch": 1.8032280506470015,
"grad_norm": 1.137293815612793,
"learning_rate": 5e-05,
"loss": 1.6288,
"step": 4050
},
{
"epoch": 1.8254904689021845,
"grad_norm": 1.1091963052749634,
"learning_rate": 5e-05,
"loss": 1.6421,
"step": 4100
},
{
"epoch": 1.8477528871573674,
"grad_norm": 1.0500215291976929,
"learning_rate": 5e-05,
"loss": 1.6594,
"step": 4150
},
{
"epoch": 1.8700153054125503,
"grad_norm": 1.2211509943008423,
"learning_rate": 5e-05,
"loss": 1.658,
"step": 4200
},
{
"epoch": 1.8922777236677333,
"grad_norm": 1.1174074411392212,
"learning_rate": 5e-05,
"loss": 1.6815,
"step": 4250
},
{
"epoch": 1.9145401419229162,
"grad_norm": 1.1086102724075317,
"learning_rate": 5e-05,
"loss": 1.7094,
"step": 4300
},
{
"epoch": 1.9368025601780994,
"grad_norm": 1.3630105257034302,
"learning_rate": 5e-05,
"loss": 1.7099,
"step": 4350
},
{
"epoch": 1.9590649784332823,
"grad_norm": 1.2096022367477417,
"learning_rate": 5e-05,
"loss": 1.7082,
"step": 4400
},
{
"epoch": 1.9813273966884652,
"grad_norm": 1.1671497821807861,
"learning_rate": 5e-05,
"loss": 1.7031,
"step": 4450
},
{
"epoch": 2.0035619869208294,
"grad_norm": 1.090248465538025,
"learning_rate": 5e-05,
"loss": 1.7077,
"step": 4500
},
{
"epoch": 2.0035619869208294,
"eval_loss": 1.5784235000610352,
"eval_runtime": 41.0244,
"eval_samples_per_second": 389.305,
"eval_steps_per_second": 48.678,
"step": 4500
},
{
"epoch": 2.0258244051760124,
"grad_norm": 1.096616506576538,
"learning_rate": 5e-05,
"loss": 1.7104,
"step": 4550
},
{
"epoch": 2.0480868234311953,
"grad_norm": 1.1066138744354248,
"learning_rate": 5e-05,
"loss": 1.7085,
"step": 4600
},
{
"epoch": 2.0703492416863782,
"grad_norm": 1.2357349395751953,
"learning_rate": 5e-05,
"loss": 1.7095,
"step": 4650
},
{
"epoch": 2.092611659941561,
"grad_norm": 1.0187031030654907,
"learning_rate": 5e-05,
"loss": 1.6925,
"step": 4700
},
{
"epoch": 2.114874078196744,
"grad_norm": 1.1060880422592163,
"learning_rate": 5e-05,
"loss": 1.6929,
"step": 4750
},
{
"epoch": 2.137136496451927,
"grad_norm": 1.3188073635101318,
"learning_rate": 5e-05,
"loss": 1.7067,
"step": 4800
},
{
"epoch": 2.15939891470711,
"grad_norm": 1.3043791055679321,
"learning_rate": 5e-05,
"loss": 1.7165,
"step": 4850
},
{
"epoch": 2.181661332962293,
"grad_norm": 1.3332817554473877,
"learning_rate": 5e-05,
"loss": 1.7005,
"step": 4900
},
{
"epoch": 2.203923751217476,
"grad_norm": 1.2902443408966064,
"learning_rate": 5e-05,
"loss": 1.6903,
"step": 4950
},
{
"epoch": 2.226186169472659,
"grad_norm": 0.9684903621673584,
"learning_rate": 5e-05,
"loss": 1.7034,
"step": 5000
},
{
"epoch": 2.226186169472659,
"eval_loss": 1.5792449712753296,
"eval_runtime": 40.876,
"eval_samples_per_second": 390.719,
"eval_steps_per_second": 48.855,
"step": 5000
},
{
"epoch": 2.2484485877278417,
"grad_norm": 1.1154942512512207,
"learning_rate": 5e-05,
"loss": 1.6964,
"step": 5050
},
{
"epoch": 2.270711005983025,
"grad_norm": 1.117543339729309,
"learning_rate": 5e-05,
"loss": 1.6862,
"step": 5100
},
{
"epoch": 2.292973424238208,
"grad_norm": 0.9821292161941528,
"learning_rate": 5e-05,
"loss": 1.6819,
"step": 5150
},
{
"epoch": 2.315235842493391,
"grad_norm": 1.1892586946487427,
"learning_rate": 5e-05,
"loss": 1.6964,
"step": 5200
},
{
"epoch": 2.337498260748574,
"grad_norm": 1.3049404621124268,
"learning_rate": 5e-05,
"loss": 1.682,
"step": 5250
},
{
"epoch": 2.359760679003757,
"grad_norm": 1.0873595476150513,
"learning_rate": 5e-05,
"loss": 1.6399,
"step": 5300
},
{
"epoch": 2.38202309725894,
"grad_norm": 1.0370205640792847,
"learning_rate": 5e-05,
"loss": 1.6153,
"step": 5350
},
{
"epoch": 2.4042855155141227,
"grad_norm": 0.8503725528717041,
"learning_rate": 5e-05,
"loss": 1.6022,
"step": 5400
},
{
"epoch": 2.4265479337693057,
"grad_norm": 0.9510111212730408,
"learning_rate": 5e-05,
"loss": 1.6106,
"step": 5450
},
{
"epoch": 2.4488103520244886,
"grad_norm": 0.9935341477394104,
"learning_rate": 5e-05,
"loss": 1.6049,
"step": 5500
},
{
"epoch": 2.4488103520244886,
"eval_loss": 1.586571455001831,
"eval_runtime": 40.7387,
"eval_samples_per_second": 392.036,
"eval_steps_per_second": 49.02,
"step": 5500
},
{
"epoch": 2.4710727702796715,
"grad_norm": 1.2289257049560547,
"learning_rate": 5e-05,
"loss": 1.5918,
"step": 5550
},
{
"epoch": 2.4933351885348545,
"grad_norm": 1.0900951623916626,
"learning_rate": 5e-05,
"loss": 1.6005,
"step": 5600
},
{
"epoch": 2.5155976067900374,
"grad_norm": 0.9930930137634277,
"learning_rate": 5e-05,
"loss": 1.6151,
"step": 5650
},
{
"epoch": 2.5378600250452203,
"grad_norm": 0.9901494979858398,
"learning_rate": 5e-05,
"loss": 1.59,
"step": 5700
},
{
"epoch": 2.5601224433004033,
"grad_norm": 0.9367809891700745,
"learning_rate": 5e-05,
"loss": 1.5844,
"step": 5750
},
{
"epoch": 2.582384861555586,
"grad_norm": 1.0291093587875366,
"learning_rate": 5e-05,
"loss": 1.5841,
"step": 5800
},
{
"epoch": 2.6046472798107696,
"grad_norm": 0.8904668688774109,
"learning_rate": 5e-05,
"loss": 1.5883,
"step": 5850
},
{
"epoch": 2.6269096980659525,
"grad_norm": 0.9640474915504456,
"learning_rate": 5e-05,
"loss": 1.5855,
"step": 5900
},
{
"epoch": 2.6491721163211355,
"grad_norm": 0.979326605796814,
"learning_rate": 5e-05,
"loss": 1.5798,
"step": 5950
},
{
"epoch": 2.6714345345763184,
"grad_norm": 1.2588844299316406,
"learning_rate": 5e-05,
"loss": 1.6018,
"step": 6000
},
{
"epoch": 2.6714345345763184,
"eval_loss": 1.5868676900863647,
"eval_runtime": 40.9701,
"eval_samples_per_second": 389.821,
"eval_steps_per_second": 48.743,
"step": 6000
},
{
"epoch": 2.6936969528315013,
"grad_norm": 1.070421814918518,
"learning_rate": 5e-05,
"loss": 1.6947,
"step": 6050
},
{
"epoch": 2.7159593710866843,
"grad_norm": 0.9952645301818848,
"learning_rate": 5e-05,
"loss": 1.6907,
"step": 6100
},
{
"epoch": 2.738221789341867,
"grad_norm": 1.2595455646514893,
"learning_rate": 5e-05,
"loss": 1.6954,
"step": 6150
},
{
"epoch": 2.76048420759705,
"grad_norm": 0.9722006916999817,
"learning_rate": 5e-05,
"loss": 1.6832,
"step": 6200
},
{
"epoch": 2.782746625852233,
"grad_norm": 1.2001519203186035,
"learning_rate": 5e-05,
"loss": 1.6832,
"step": 6250
},
{
"epoch": 2.805009044107416,
"grad_norm": 1.316867709159851,
"learning_rate": 5e-05,
"loss": 1.6873,
"step": 6300
},
{
"epoch": 2.8272714623625994,
"grad_norm": 1.2271651029586792,
"learning_rate": 5e-05,
"loss": 1.6865,
"step": 6350
},
{
"epoch": 2.8495338806177823,
"grad_norm": 1.2443265914916992,
"learning_rate": 5e-05,
"loss": 1.6779,
"step": 6400
},
{
"epoch": 2.8717962988729653,
"grad_norm": 1.1751494407653809,
"learning_rate": 5e-05,
"loss": 1.666,
"step": 6450
},
{
"epoch": 2.894058717128148,
"grad_norm": 0.9704211950302124,
"learning_rate": 5e-05,
"loss": 1.6628,
"step": 6500
},
{
"epoch": 2.894058717128148,
"eval_loss": 1.5628445148468018,
"eval_runtime": 41.0455,
"eval_samples_per_second": 389.105,
"eval_steps_per_second": 48.653,
"step": 6500
},
{
"epoch": 2.916321135383331,
"grad_norm": 1.0452390909194946,
"learning_rate": 5e-05,
"loss": 1.6794,
"step": 6550
},
{
"epoch": 2.938583553638514,
"grad_norm": 1.338881254196167,
"learning_rate": 5e-05,
"loss": 1.6678,
"step": 6600
},
{
"epoch": 2.960845971893697,
"grad_norm": 0.989860475063324,
"learning_rate": 5e-05,
"loss": 1.6753,
"step": 6650
},
{
"epoch": 2.98310839014888,
"grad_norm": 1.1380687952041626,
"learning_rate": 5e-05,
"loss": 1.6639,
"step": 6700
},
{
"epoch": 3.0057882287463475,
"grad_norm": 1.2292852401733398,
"learning_rate": 5e-05,
"loss": 1.697,
"step": 6750
},
{
"epoch": 3.0280506470015305,
"grad_norm": 1.1919242143630981,
"learning_rate": 5e-05,
"loss": 1.6714,
"step": 6800
},
{
"epoch": 3.0503130652567134,
"grad_norm": 1.1312869787216187,
"learning_rate": 5e-05,
"loss": 1.6641,
"step": 6850
},
{
"epoch": 3.0725754835118964,
"grad_norm": 1.3589369058609009,
"learning_rate": 5e-05,
"loss": 1.655,
"step": 6900
},
{
"epoch": 3.0948379017670793,
"grad_norm": 1.257063627243042,
"learning_rate": 5e-05,
"loss": 1.6661,
"step": 6950
},
{
"epoch": 3.1171003200222622,
"grad_norm": 1.3228737115859985,
"learning_rate": 5e-05,
"loss": 1.653,
"step": 7000
},
{
"epoch": 3.1171003200222622,
"eval_loss": 1.5605802536010742,
"eval_runtime": 42.9947,
"eval_samples_per_second": 371.464,
"eval_steps_per_second": 46.448,
"step": 7000
},
{
"epoch": 3.1393627382774456,
"grad_norm": 1.0204429626464844,
"learning_rate": 5e-05,
"loss": 1.6602,
"step": 7050
},
{
"epoch": 3.1616251565326285,
"grad_norm": 0.9796785712242126,
"learning_rate": 5e-05,
"loss": 1.6508,
"step": 7100
},
{
"epoch": 3.1838875747878115,
"grad_norm": 0.9721747040748596,
"learning_rate": 5e-05,
"loss": 1.6566,
"step": 7150
},
{
"epoch": 3.2061499930429944,
"grad_norm": 1.1874974966049194,
"learning_rate": 5e-05,
"loss": 1.6501,
"step": 7200
},
{
"epoch": 3.2284124112981774,
"grad_norm": 1.2861804962158203,
"learning_rate": 5e-05,
"loss": 1.6663,
"step": 7250
},
{
"epoch": 3.2506748295533603,
"grad_norm": 0.9947218894958496,
"learning_rate": 5e-05,
"loss": 1.6732,
"step": 7300
},
{
"epoch": 3.2729372478085432,
"grad_norm": 1.1224796772003174,
"learning_rate": 5e-05,
"loss": 1.6597,
"step": 7350
},
{
"epoch": 3.295199666063726,
"grad_norm": 1.2262948751449585,
"learning_rate": 5e-05,
"loss": 1.6544,
"step": 7400
},
{
"epoch": 3.317462084318909,
"grad_norm": 1.114092469215393,
"learning_rate": 5e-05,
"loss": 1.6532,
"step": 7450
},
{
"epoch": 3.339724502574092,
"grad_norm": 1.0086640119552612,
"learning_rate": 5e-05,
"loss": 1.6575,
"step": 7500
},
{
"epoch": 3.339724502574092,
"eval_loss": 1.5381077527999878,
"eval_runtime": 43.9183,
"eval_samples_per_second": 363.652,
"eval_steps_per_second": 45.471,
"step": 7500
},
{
"epoch": 3.3615416724641713,
"grad_norm": 1.4630149602890015,
"learning_rate": 5e-05,
"loss": 1.6543,
"step": 7550
},
{
"epoch": 3.3838040907193543,
"grad_norm": 0.9978652596473694,
"learning_rate": 5e-05,
"loss": 1.6469,
"step": 7600
},
{
"epoch": 3.406066508974537,
"grad_norm": 0.9942854046821594,
"learning_rate": 5e-05,
"loss": 1.6524,
"step": 7650
},
{
"epoch": 3.42832892722972,
"grad_norm": 1.6113872528076172,
"learning_rate": 5e-05,
"loss": 1.6392,
"step": 7700
},
{
"epoch": 3.450591345484903,
"grad_norm": 1.2430763244628906,
"learning_rate": 5e-05,
"loss": 1.6524,
"step": 7750
},
{
"epoch": 3.472853763740086,
"grad_norm": 0.9973090887069702,
"learning_rate": 5e-05,
"loss": 1.6396,
"step": 7800
},
{
"epoch": 3.4951161819952694,
"grad_norm": 1.3717776536941528,
"learning_rate": 5e-05,
"loss": 1.6463,
"step": 7850
},
{
"epoch": 3.5173786002504523,
"grad_norm": 1.3711599111557007,
"learning_rate": 5e-05,
"loss": 1.644,
"step": 7900
},
{
"epoch": 3.5396410185056353,
"grad_norm": 1.0126900672912598,
"learning_rate": 5e-05,
"loss": 1.6311,
"step": 7950
},
{
"epoch": 3.561903436760818,
"grad_norm": 1.0467159748077393,
"learning_rate": 5e-05,
"loss": 1.64,
"step": 8000
},
{
"epoch": 3.561903436760818,
"eval_loss": 1.539516806602478,
"eval_runtime": 42.3362,
"eval_samples_per_second": 377.242,
"eval_steps_per_second": 47.17,
"step": 8000
},
{
"epoch": 3.584165855016001,
"grad_norm": 1.1766951084136963,
"learning_rate": 5e-05,
"loss": 1.6552,
"step": 8050
},
{
"epoch": 3.606428273271184,
"grad_norm": 1.0943933725357056,
"learning_rate": 5e-05,
"loss": 1.6385,
"step": 8100
},
{
"epoch": 3.628690691526367,
"grad_norm": 1.2377898693084717,
"learning_rate": 5e-05,
"loss": 1.6288,
"step": 8150
},
{
"epoch": 3.65095310978155,
"grad_norm": 0.939339280128479,
"learning_rate": 5e-05,
"loss": 1.6357,
"step": 8200
},
{
"epoch": 3.673215528036733,
"grad_norm": 1.0802948474884033,
"learning_rate": 5e-05,
"loss": 1.6367,
"step": 8250
},
{
"epoch": 3.695477946291916,
"grad_norm": 1.089154601097107,
"learning_rate": 5e-05,
"loss": 1.6434,
"step": 8300
},
{
"epoch": 3.7177403645470988,
"grad_norm": 1.095510482788086,
"learning_rate": 5e-05,
"loss": 1.6445,
"step": 8350
},
{
"epoch": 3.740002782802282,
"grad_norm": 1.2433582544326782,
"learning_rate": 5e-05,
"loss": 1.6521,
"step": 8400
},
{
"epoch": 3.762265201057465,
"grad_norm": 1.3547347784042358,
"learning_rate": 5e-05,
"loss": 1.6363,
"step": 8450
},
{
"epoch": 3.784527619312648,
"grad_norm": 1.224070429801941,
"learning_rate": 5e-05,
"loss": 1.6455,
"step": 8500
},
{
"epoch": 3.784527619312648,
"eval_loss": 1.516330361366272,
"eval_runtime": 40.388,
"eval_samples_per_second": 395.439,
"eval_steps_per_second": 49.445,
"step": 8500
},
{
"epoch": 3.806790037567831,
"grad_norm": 1.3312312364578247,
"learning_rate": 5e-05,
"loss": 1.6493,
"step": 8550
},
{
"epoch": 3.829052455823014,
"grad_norm": 1.275539517402649,
"learning_rate": 5e-05,
"loss": 1.6385,
"step": 8600
},
{
"epoch": 3.851314874078197,
"grad_norm": 1.1481244564056396,
"learning_rate": 5e-05,
"loss": 1.6369,
"step": 8650
},
{
"epoch": 3.8735772923333798,
"grad_norm": 1.039244532585144,
"learning_rate": 5e-05,
"loss": 1.6277,
"step": 8700
},
{
"epoch": 3.8958397105885627,
"grad_norm": 1.0740258693695068,
"learning_rate": 5e-05,
"loss": 1.6294,
"step": 8750
},
{
"epoch": 3.9181021288437456,
"grad_norm": 1.0660001039505005,
"learning_rate": 5e-05,
"loss": 1.6123,
"step": 8800
},
{
"epoch": 3.9403645470989286,
"grad_norm": 1.036129117012024,
"learning_rate": 5e-05,
"loss": 1.5803,
"step": 8850
},
{
"epoch": 3.9626269653541115,
"grad_norm": 0.9285004734992981,
"learning_rate": 5e-05,
"loss": 1.5905,
"step": 8900
},
{
"epoch": 3.9848893836092945,
"grad_norm": 0.9074347019195557,
"learning_rate": 5e-05,
"loss": 1.5713,
"step": 8950
},
{
"epoch": 4.0075692222067625,
"grad_norm": 1.1706671714782715,
"learning_rate": 5e-05,
"loss": 1.6308,
"step": 9000
},
{
"epoch": 4.0075692222067625,
"eval_loss": 1.5311250686645508,
"eval_runtime": 40.3535,
"eval_samples_per_second": 395.778,
"eval_steps_per_second": 49.488,
"step": 9000
},
{
"epoch": 4.029831640461945,
"grad_norm": 1.2552838325500488,
"learning_rate": 5e-05,
"loss": 1.6371,
"step": 9050
},
{
"epoch": 4.052094058717128,
"grad_norm": 1.3415331840515137,
"learning_rate": 5e-05,
"loss": 1.6345,
"step": 9100
},
{
"epoch": 4.074356476972311,
"grad_norm": 1.029757022857666,
"learning_rate": 5e-05,
"loss": 1.629,
"step": 9150
},
{
"epoch": 4.096618895227494,
"grad_norm": 1.1435120105743408,
"learning_rate": 5e-05,
"loss": 1.6287,
"step": 9200
},
{
"epoch": 4.118881313482677,
"grad_norm": 1.385100245475769,
"learning_rate": 5e-05,
"loss": 1.6335,
"step": 9250
},
{
"epoch": 4.14114373173786,
"grad_norm": 1.062818169593811,
"learning_rate": 5e-05,
"loss": 1.6184,
"step": 9300
},
{
"epoch": 4.163406149993043,
"grad_norm": 1.3703244924545288,
"learning_rate": 5e-05,
"loss": 1.631,
"step": 9350
},
{
"epoch": 4.185668568248226,
"grad_norm": 1.1130529642105103,
"learning_rate": 5e-05,
"loss": 1.6284,
"step": 9400
},
{
"epoch": 4.207930986503409,
"grad_norm": 1.189207911491394,
"learning_rate": 5e-05,
"loss": 1.619,
"step": 9450
},
{
"epoch": 4.230193404758592,
"grad_norm": 1.0979055166244507,
"learning_rate": 5e-05,
"loss": 1.6324,
"step": 9500
},
{
"epoch": 4.230193404758592,
"eval_loss": 1.5118227005004883,
"eval_runtime": 40.2303,
"eval_samples_per_second": 396.989,
"eval_steps_per_second": 49.639,
"step": 9500
},
{
"epoch": 4.252455823013775,
"grad_norm": 1.1394270658493042,
"learning_rate": 5e-05,
"loss": 1.6229,
"step": 9550
},
{
"epoch": 4.274718241268958,
"grad_norm": 1.0398465394973755,
"learning_rate": 5e-05,
"loss": 1.625,
"step": 9600
},
{
"epoch": 4.296980659524141,
"grad_norm": 1.1344504356384277,
"learning_rate": 5e-05,
"loss": 1.6113,
"step": 9650
},
{
"epoch": 4.319243077779324,
"grad_norm": 0.9889805316925049,
"learning_rate": 5e-05,
"loss": 1.6195,
"step": 9700
},
{
"epoch": 4.3415054960345065,
"grad_norm": 1.2321630716323853,
"learning_rate": 5e-05,
"loss": 1.6133,
"step": 9750
},
{
"epoch": 4.3637679142896895,
"grad_norm": 1.0766791105270386,
"learning_rate": 5e-05,
"loss": 1.587,
"step": 9800
},
{
"epoch": 4.386030332544872,
"grad_norm": 0.9230866432189941,
"learning_rate": 5e-05,
"loss": 1.5747,
"step": 9850
},
{
"epoch": 4.408292750800055,
"grad_norm": 1.036097526550293,
"learning_rate": 5e-05,
"loss": 1.5673,
"step": 9900
},
{
"epoch": 4.430555169055238,
"grad_norm": 1.0321383476257324,
"learning_rate": 5e-05,
"loss": 1.5641,
"step": 9950
},
{
"epoch": 4.452817587310421,
"grad_norm": 0.9865553379058838,
"learning_rate": 5e-05,
"loss": 1.5481,
"step": 10000
},
{
"epoch": 4.452817587310421,
"eval_loss": 1.5091972351074219,
"eval_runtime": 40.5505,
"eval_samples_per_second": 393.855,
"eval_steps_per_second": 49.247,
"step": 10000
},
{
"epoch": 4.475080005565605,
"grad_norm": 1.0181940793991089,
"learning_rate": 5e-05,
"loss": 1.5594,
"step": 10050
},
{
"epoch": 4.497342423820788,
"grad_norm": 1.0538172721862793,
"learning_rate": 5e-05,
"loss": 1.5523,
"step": 10100
},
{
"epoch": 4.519604842075971,
"grad_norm": 0.936060905456543,
"learning_rate": 5e-05,
"loss": 1.547,
"step": 10150
},
{
"epoch": 4.541867260331154,
"grad_norm": 1.225715160369873,
"learning_rate": 5e-05,
"loss": 1.5491,
"step": 10200
},
{
"epoch": 4.564129678586337,
"grad_norm": 1.2574198246002197,
"learning_rate": 5e-05,
"loss": 1.5496,
"step": 10250
},
{
"epoch": 4.58639209684152,
"grad_norm": 1.2122540473937988,
"learning_rate": 5e-05,
"loss": 1.5327,
"step": 10300
},
{
"epoch": 4.608654515096703,
"grad_norm": 1.1094001531600952,
"learning_rate": 5e-05,
"loss": 1.5375,
"step": 10350
},
{
"epoch": 4.630916933351886,
"grad_norm": 1.0384974479675293,
"learning_rate": 5e-05,
"loss": 1.555,
"step": 10400
},
{
"epoch": 4.6531793516070685,
"grad_norm": 1.0797594785690308,
"learning_rate": 5e-05,
"loss": 1.5621,
"step": 10450
},
{
"epoch": 4.6754417698622515,
"grad_norm": 1.0724256038665771,
"learning_rate": 5e-05,
"loss": 1.547,
"step": 10500
},
{
"epoch": 4.6754417698622515,
"eval_loss": 1.5108764171600342,
"eval_runtime": 40.4882,
"eval_samples_per_second": 394.461,
"eval_steps_per_second": 49.323,
"step": 10500
},
{
"epoch": 4.697704188117434,
"grad_norm": 1.236370325088501,
"learning_rate": 5e-05,
"loss": 1.5426,
"step": 10550
},
{
"epoch": 4.719966606372617,
"grad_norm": 1.1259009838104248,
"learning_rate": 5e-05,
"loss": 1.5701,
"step": 10600
},
{
"epoch": 4.7422290246278,
"grad_norm": 1.0653769969940186,
"learning_rate": 5e-05,
"loss": 1.5543,
"step": 10650
},
{
"epoch": 4.764491442882983,
"grad_norm": 1.1116371154785156,
"learning_rate": 5e-05,
"loss": 1.557,
"step": 10700
},
{
"epoch": 4.786753861138166,
"grad_norm": 1.0332480669021606,
"learning_rate": 5e-05,
"loss": 1.5513,
"step": 10750
},
{
"epoch": 4.809016279393349,
"grad_norm": 1.1142674684524536,
"learning_rate": 5e-05,
"loss": 1.5448,
"step": 10800
},
{
"epoch": 4.831278697648532,
"grad_norm": 1.0316691398620605,
"learning_rate": 5e-05,
"loss": 1.5456,
"step": 10850
},
{
"epoch": 4.853541115903715,
"grad_norm": 0.987628161907196,
"learning_rate": 5e-05,
"loss": 1.5535,
"step": 10900
},
{
"epoch": 4.875803534158898,
"grad_norm": 1.125772476196289,
"learning_rate": 5e-05,
"loss": 1.5583,
"step": 10950
},
{
"epoch": 4.898065952414081,
"grad_norm": 0.9541718363761902,
"learning_rate": 5e-05,
"loss": 1.5584,
"step": 11000
},
{
"epoch": 4.898065952414081,
"eval_loss": 1.5041238069534302,
"eval_runtime": 40.2547,
"eval_samples_per_second": 396.749,
"eval_steps_per_second": 49.609,
"step": 11000
}
],
"logging_steps": 50,
"max_steps": 11230,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.530938070076723e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}