{ "best_global_step": 11000, "best_metric": 1.5041238069534302, "best_model_checkpoint": "./results/hierarchical_music_t5_small_finetune/checkpoint-11000", "epoch": 4.898065952414081, "eval_steps": 500, "global_step": 11000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022262418255182968, "grad_norm": 0.8790799975395203, "learning_rate": 1.088888888888889e-05, "loss": 1.7866, "step": 50 }, { "epoch": 0.044524836510365935, "grad_norm": 0.8580410480499268, "learning_rate": 2.2000000000000003e-05, "loss": 1.7931, "step": 100 }, { "epoch": 0.0667872547655489, "grad_norm": 1.150437355041504, "learning_rate": 3.311111111111112e-05, "loss": 1.7747, "step": 150 }, { "epoch": 0.08904967302073187, "grad_norm": 1.1797043085098267, "learning_rate": 4.422222222222222e-05, "loss": 1.7777, "step": 200 }, { "epoch": 0.11131209127591485, "grad_norm": 1.1714961528778076, "learning_rate": 5e-05, "loss": 1.7833, "step": 250 }, { "epoch": 0.1335745095310978, "grad_norm": 1.29172682762146, "learning_rate": 5e-05, "loss": 1.7813, "step": 300 }, { "epoch": 0.15583692778628078, "grad_norm": 1.0865519046783447, "learning_rate": 5e-05, "loss": 1.7737, "step": 350 }, { "epoch": 0.17809934604146374, "grad_norm": 1.1579242944717407, "learning_rate": 5e-05, "loss": 1.7716, "step": 400 }, { "epoch": 0.20036176429664673, "grad_norm": 1.1099216938018799, "learning_rate": 5e-05, "loss": 1.7877, "step": 450 }, { "epoch": 0.2226241825518297, "grad_norm": 1.1747430562973022, "learning_rate": 5e-05, "loss": 1.7839, "step": 500 }, { "epoch": 0.2226241825518297, "eval_loss": 1.6513175964355469, "eval_runtime": 41.1785, "eval_samples_per_second": 387.848, "eval_steps_per_second": 48.496, "step": 500 }, { "epoch": 0.24488660080701266, "grad_norm": 1.154510259628296, "learning_rate": 5e-05, "loss": 1.772, "step": 550 }, { "epoch": 0.2671490190621956, "grad_norm": 0.99709153175354, "learning_rate": 5e-05, "loss": 1.7802, "step": 600 }, { "epoch": 0.2894114373173786, "grad_norm": 1.3202115297317505, "learning_rate": 5e-05, "loss": 1.7565, "step": 650 }, { "epoch": 0.31167385557256155, "grad_norm": 1.3452224731445312, "learning_rate": 5e-05, "loss": 1.7612, "step": 700 }, { "epoch": 0.33393627382774455, "grad_norm": 1.1056386232376099, "learning_rate": 5e-05, "loss": 1.7696, "step": 750 }, { "epoch": 0.3561986920829275, "grad_norm": 1.0761163234710693, "learning_rate": 5e-05, "loss": 1.6937, "step": 800 }, { "epoch": 0.3784611103381105, "grad_norm": 0.9081959128379822, "learning_rate": 5e-05, "loss": 1.6766, "step": 850 }, { "epoch": 0.40072352859329347, "grad_norm": 0.9897329211235046, "learning_rate": 5e-05, "loss": 1.6804, "step": 900 }, { "epoch": 0.4229859468484764, "grad_norm": 0.992655873298645, "learning_rate": 5e-05, "loss": 1.6584, "step": 950 }, { "epoch": 0.4452483651036594, "grad_norm": 1.0000704526901245, "learning_rate": 5e-05, "loss": 1.6576, "step": 1000 }, { "epoch": 0.4452483651036594, "eval_loss": 1.6615262031555176, "eval_runtime": 40.7337, "eval_samples_per_second": 392.083, "eval_steps_per_second": 49.026, "step": 1000 }, { "epoch": 0.46751078335884233, "grad_norm": 0.9494450092315674, "learning_rate": 5e-05, "loss": 1.6604, "step": 1050 }, { "epoch": 0.4897732016140253, "grad_norm": 0.9924134612083435, "learning_rate": 5e-05, "loss": 1.6539, "step": 1100 }, { "epoch": 0.5120356198692083, "grad_norm": 1.0620170831680298, "learning_rate": 5e-05, "loss": 1.6552, "step": 1150 }, { "epoch": 0.5342980381243913, "grad_norm": 1.1163603067398071, "learning_rate": 5e-05, "loss": 1.6452, "step": 1200 }, { "epoch": 0.5565604563795742, "grad_norm": 1.025298833847046, "learning_rate": 5e-05, "loss": 1.6468, "step": 1250 }, { "epoch": 0.5788228746347572, "grad_norm": 0.9661399722099304, "learning_rate": 5e-05, "loss": 1.6377, "step": 1300 }, { "epoch": 0.6010852928899402, "grad_norm": 0.9570266008377075, "learning_rate": 5e-05, "loss": 1.6525, "step": 1350 }, { "epoch": 0.6233477111451231, "grad_norm": 0.9325594902038574, "learning_rate": 5e-05, "loss": 1.6443, "step": 1400 }, { "epoch": 0.6456101294003062, "grad_norm": 1.071475625038147, "learning_rate": 5e-05, "loss": 1.6418, "step": 1450 }, { "epoch": 0.6678725476554891, "grad_norm": 0.9684040546417236, "learning_rate": 5e-05, "loss": 1.6396, "step": 1500 }, { "epoch": 0.6678725476554891, "eval_loss": 1.6649832725524902, "eval_runtime": 40.9434, "eval_samples_per_second": 390.075, "eval_steps_per_second": 48.775, "step": 1500 }, { "epoch": 0.690134965910672, "grad_norm": 1.0452582836151123, "learning_rate": 5e-05, "loss": 1.6583, "step": 1550 }, { "epoch": 0.712397384165855, "grad_norm": 0.8643882274627686, "learning_rate": 5e-05, "loss": 1.6538, "step": 1600 }, { "epoch": 0.734659802421038, "grad_norm": 1.0304285287857056, "learning_rate": 5e-05, "loss": 1.6653, "step": 1650 }, { "epoch": 0.756922220676221, "grad_norm": 1.1433496475219727, "learning_rate": 5e-05, "loss": 1.6605, "step": 1700 }, { "epoch": 0.7791846389314039, "grad_norm": 0.9240351319313049, "learning_rate": 5e-05, "loss": 1.6696, "step": 1750 }, { "epoch": 0.8014470571865869, "grad_norm": 1.0242925882339478, "learning_rate": 5e-05, "loss": 1.6697, "step": 1800 }, { "epoch": 0.8237094754417699, "grad_norm": 0.9509591460227966, "learning_rate": 5e-05, "loss": 1.6859, "step": 1850 }, { "epoch": 0.8459718936969528, "grad_norm": 1.2701749801635742, "learning_rate": 5e-05, "loss": 1.6885, "step": 1900 }, { "epoch": 0.8682343119521359, "grad_norm": 1.4032883644104004, "learning_rate": 5e-05, "loss": 1.6935, "step": 1950 }, { "epoch": 0.8904967302073188, "grad_norm": 1.2004971504211426, "learning_rate": 5e-05, "loss": 1.7168, "step": 2000 }, { "epoch": 0.8904967302073188, "eval_loss": 1.631461262702942, "eval_runtime": 40.9613, "eval_samples_per_second": 389.905, "eval_steps_per_second": 48.753, "step": 2000 }, { "epoch": 0.9127591484625017, "grad_norm": 1.193617820739746, "learning_rate": 5e-05, "loss": 1.7404, "step": 2050 }, { "epoch": 0.9350215667176847, "grad_norm": 1.424216866493225, "learning_rate": 5e-05, "loss": 1.7444, "step": 2100 }, { "epoch": 0.9572839849728677, "grad_norm": 1.2657979726791382, "learning_rate": 5e-05, "loss": 1.7529, "step": 2150 }, { "epoch": 0.9795464032280506, "grad_norm": 1.1823986768722534, "learning_rate": 5e-05, "loss": 1.7524, "step": 2200 }, { "epoch": 1.0017809934604147, "grad_norm": 1.078079104423523, "learning_rate": 5e-05, "loss": 1.7432, "step": 2250 }, { "epoch": 1.0240434117155977, "grad_norm": 1.279813528060913, "learning_rate": 5e-05, "loss": 1.7372, "step": 2300 }, { "epoch": 1.0463058299707806, "grad_norm": 1.1668626070022583, "learning_rate": 5e-05, "loss": 1.7296, "step": 2350 }, { "epoch": 1.0685682482259635, "grad_norm": 1.0546634197235107, "learning_rate": 5e-05, "loss": 1.7324, "step": 2400 }, { "epoch": 1.0908306664811465, "grad_norm": 1.1601485013961792, "learning_rate": 5e-05, "loss": 1.7401, "step": 2450 }, { "epoch": 1.1130930847363294, "grad_norm": 1.463930368423462, "learning_rate": 5e-05, "loss": 1.7366, "step": 2500 }, { "epoch": 1.1130930847363294, "eval_loss": 1.6233899593353271, "eval_runtime": 41.069, "eval_samples_per_second": 388.882, "eval_steps_per_second": 48.626, "step": 2500 }, { "epoch": 1.1353555029915126, "grad_norm": 1.172264575958252, "learning_rate": 5e-05, "loss": 1.7348, "step": 2550 }, { "epoch": 1.1576179212466955, "grad_norm": 1.076794981956482, "learning_rate": 5e-05, "loss": 1.7463, "step": 2600 }, { "epoch": 1.1798803395018784, "grad_norm": 1.0754376649856567, "learning_rate": 5e-05, "loss": 1.7378, "step": 2650 }, { "epoch": 1.2021427577570614, "grad_norm": 1.3081718683242798, "learning_rate": 5e-05, "loss": 1.7251, "step": 2700 }, { "epoch": 1.2244051760122443, "grad_norm": 1.0483145713806152, "learning_rate": 5e-05, "loss": 1.7414, "step": 2750 }, { "epoch": 1.2466675942674272, "grad_norm": 1.2890243530273438, "learning_rate": 5e-05, "loss": 1.7254, "step": 2800 }, { "epoch": 1.2689300125226102, "grad_norm": 1.0999932289123535, "learning_rate": 5e-05, "loss": 1.7333, "step": 2850 }, { "epoch": 1.291192430777793, "grad_norm": 1.0996226072311401, "learning_rate": 5e-05, "loss": 1.7151, "step": 2900 }, { "epoch": 1.3134548490329763, "grad_norm": 1.3446428775787354, "learning_rate": 5e-05, "loss": 1.7088, "step": 2950 }, { "epoch": 1.3357172672881592, "grad_norm": 0.9657168388366699, "learning_rate": 5e-05, "loss": 1.7171, "step": 3000 }, { "epoch": 1.3357172672881592, "eval_loss": 1.6028199195861816, "eval_runtime": 41.2736, "eval_samples_per_second": 386.954, "eval_steps_per_second": 48.384, "step": 3000 }, { "epoch": 1.3579796855433421, "grad_norm": 0.904662549495697, "learning_rate": 5e-05, "loss": 1.6656, "step": 3050 }, { "epoch": 1.380242103798525, "grad_norm": 1.2054646015167236, "learning_rate": 5e-05, "loss": 1.6409, "step": 3100 }, { "epoch": 1.402504522053708, "grad_norm": 0.8623887300491333, "learning_rate": 5e-05, "loss": 1.6378, "step": 3150 }, { "epoch": 1.4247669403088912, "grad_norm": 0.931481659412384, "learning_rate": 5e-05, "loss": 1.6395, "step": 3200 }, { "epoch": 1.447029358564074, "grad_norm": 0.8971887826919556, "learning_rate": 5e-05, "loss": 1.6338, "step": 3250 }, { "epoch": 1.469291776819257, "grad_norm": 0.9754030704498291, "learning_rate": 5e-05, "loss": 1.6341, "step": 3300 }, { "epoch": 1.49155419507444, "grad_norm": 0.9373458027839661, "learning_rate": 5e-05, "loss": 1.6214, "step": 3350 }, { "epoch": 1.513816613329623, "grad_norm": 1.1765072345733643, "learning_rate": 5e-05, "loss": 1.622, "step": 3400 }, { "epoch": 1.5360790315848059, "grad_norm": 0.9341714382171631, "learning_rate": 5e-05, "loss": 1.6242, "step": 3450 }, { "epoch": 1.5583414498399888, "grad_norm": 0.8690816164016724, "learning_rate": 5e-05, "loss": 1.6238, "step": 3500 }, { "epoch": 1.5583414498399888, "eval_loss": 1.6130000352859497, "eval_runtime": 40.5463, "eval_samples_per_second": 393.896, "eval_steps_per_second": 49.252, "step": 3500 }, { "epoch": 1.5806038680951717, "grad_norm": 1.0579187870025635, "learning_rate": 5e-05, "loss": 1.611, "step": 3550 }, { "epoch": 1.6028662863503547, "grad_norm": 0.8839408159255981, "learning_rate": 5e-05, "loss": 1.6106, "step": 3600 }, { "epoch": 1.6251287046055378, "grad_norm": 1.048997402191162, "learning_rate": 5e-05, "loss": 1.6075, "step": 3650 }, { "epoch": 1.6473911228607208, "grad_norm": 1.201557993888855, "learning_rate": 5e-05, "loss": 1.621, "step": 3700 }, { "epoch": 1.6696535411159037, "grad_norm": 0.9804443717002869, "learning_rate": 5e-05, "loss": 1.6079, "step": 3750 }, { "epoch": 1.6919159593710866, "grad_norm": 0.9969685077667236, "learning_rate": 5e-05, "loss": 1.6281, "step": 3800 }, { "epoch": 1.7141783776262698, "grad_norm": 1.0730953216552734, "learning_rate": 5e-05, "loss": 1.6235, "step": 3850 }, { "epoch": 1.7364407958814527, "grad_norm": 1.1014162302017212, "learning_rate": 5e-05, "loss": 1.6349, "step": 3900 }, { "epoch": 1.7587032141366357, "grad_norm": 0.9518324732780457, "learning_rate": 5e-05, "loss": 1.6271, "step": 3950 }, { "epoch": 1.7809656323918186, "grad_norm": 1.0745582580566406, "learning_rate": 5e-05, "loss": 1.6217, "step": 4000 }, { "epoch": 1.7809656323918186, "eval_loss": 1.6218018531799316, "eval_runtime": 41.2037, "eval_samples_per_second": 387.611, "eval_steps_per_second": 48.466, "step": 4000 }, { "epoch": 1.8032280506470015, "grad_norm": 1.137293815612793, "learning_rate": 5e-05, "loss": 1.6288, "step": 4050 }, { "epoch": 1.8254904689021845, "grad_norm": 1.1091963052749634, "learning_rate": 5e-05, "loss": 1.6421, "step": 4100 }, { "epoch": 1.8477528871573674, "grad_norm": 1.0500215291976929, "learning_rate": 5e-05, "loss": 1.6594, "step": 4150 }, { "epoch": 1.8700153054125503, "grad_norm": 1.2211509943008423, "learning_rate": 5e-05, "loss": 1.658, "step": 4200 }, { "epoch": 1.8922777236677333, "grad_norm": 1.1174074411392212, "learning_rate": 5e-05, "loss": 1.6815, "step": 4250 }, { "epoch": 1.9145401419229162, "grad_norm": 1.1086102724075317, "learning_rate": 5e-05, "loss": 1.7094, "step": 4300 }, { "epoch": 1.9368025601780994, "grad_norm": 1.3630105257034302, "learning_rate": 5e-05, "loss": 1.7099, "step": 4350 }, { "epoch": 1.9590649784332823, "grad_norm": 1.2096022367477417, "learning_rate": 5e-05, "loss": 1.7082, "step": 4400 }, { "epoch": 1.9813273966884652, "grad_norm": 1.1671497821807861, "learning_rate": 5e-05, "loss": 1.7031, "step": 4450 }, { "epoch": 2.0035619869208294, "grad_norm": 1.090248465538025, "learning_rate": 5e-05, "loss": 1.7077, "step": 4500 }, { "epoch": 2.0035619869208294, "eval_loss": 1.5784235000610352, "eval_runtime": 41.0244, "eval_samples_per_second": 389.305, "eval_steps_per_second": 48.678, "step": 4500 }, { "epoch": 2.0258244051760124, "grad_norm": 1.096616506576538, "learning_rate": 5e-05, "loss": 1.7104, "step": 4550 }, { "epoch": 2.0480868234311953, "grad_norm": 1.1066138744354248, "learning_rate": 5e-05, "loss": 1.7085, "step": 4600 }, { "epoch": 2.0703492416863782, "grad_norm": 1.2357349395751953, "learning_rate": 5e-05, "loss": 1.7095, "step": 4650 }, { "epoch": 2.092611659941561, "grad_norm": 1.0187031030654907, "learning_rate": 5e-05, "loss": 1.6925, "step": 4700 }, { "epoch": 2.114874078196744, "grad_norm": 1.1060880422592163, "learning_rate": 5e-05, "loss": 1.6929, "step": 4750 }, { "epoch": 2.137136496451927, "grad_norm": 1.3188073635101318, "learning_rate": 5e-05, "loss": 1.7067, "step": 4800 }, { "epoch": 2.15939891470711, "grad_norm": 1.3043791055679321, "learning_rate": 5e-05, "loss": 1.7165, "step": 4850 }, { "epoch": 2.181661332962293, "grad_norm": 1.3332817554473877, "learning_rate": 5e-05, "loss": 1.7005, "step": 4900 }, { "epoch": 2.203923751217476, "grad_norm": 1.2902443408966064, "learning_rate": 5e-05, "loss": 1.6903, "step": 4950 }, { "epoch": 2.226186169472659, "grad_norm": 0.9684903621673584, "learning_rate": 5e-05, "loss": 1.7034, "step": 5000 }, { "epoch": 2.226186169472659, "eval_loss": 1.5792449712753296, "eval_runtime": 40.876, "eval_samples_per_second": 390.719, "eval_steps_per_second": 48.855, "step": 5000 }, { "epoch": 2.2484485877278417, "grad_norm": 1.1154942512512207, "learning_rate": 5e-05, "loss": 1.6964, "step": 5050 }, { "epoch": 2.270711005983025, "grad_norm": 1.117543339729309, "learning_rate": 5e-05, "loss": 1.6862, "step": 5100 }, { "epoch": 2.292973424238208, "grad_norm": 0.9821292161941528, "learning_rate": 5e-05, "loss": 1.6819, "step": 5150 }, { "epoch": 2.315235842493391, "grad_norm": 1.1892586946487427, "learning_rate": 5e-05, "loss": 1.6964, "step": 5200 }, { "epoch": 2.337498260748574, "grad_norm": 1.3049404621124268, "learning_rate": 5e-05, "loss": 1.682, "step": 5250 }, { "epoch": 2.359760679003757, "grad_norm": 1.0873595476150513, "learning_rate": 5e-05, "loss": 1.6399, "step": 5300 }, { "epoch": 2.38202309725894, "grad_norm": 1.0370205640792847, "learning_rate": 5e-05, "loss": 1.6153, "step": 5350 }, { "epoch": 2.4042855155141227, "grad_norm": 0.8503725528717041, "learning_rate": 5e-05, "loss": 1.6022, "step": 5400 }, { "epoch": 2.4265479337693057, "grad_norm": 0.9510111212730408, "learning_rate": 5e-05, "loss": 1.6106, "step": 5450 }, { "epoch": 2.4488103520244886, "grad_norm": 0.9935341477394104, "learning_rate": 5e-05, "loss": 1.6049, "step": 5500 }, { "epoch": 2.4488103520244886, "eval_loss": 1.586571455001831, "eval_runtime": 40.7387, "eval_samples_per_second": 392.036, "eval_steps_per_second": 49.02, "step": 5500 }, { "epoch": 2.4710727702796715, "grad_norm": 1.2289257049560547, "learning_rate": 5e-05, "loss": 1.5918, "step": 5550 }, { "epoch": 2.4933351885348545, "grad_norm": 1.0900951623916626, "learning_rate": 5e-05, "loss": 1.6005, "step": 5600 }, { "epoch": 2.5155976067900374, "grad_norm": 0.9930930137634277, "learning_rate": 5e-05, "loss": 1.6151, "step": 5650 }, { "epoch": 2.5378600250452203, "grad_norm": 0.9901494979858398, "learning_rate": 5e-05, "loss": 1.59, "step": 5700 }, { "epoch": 2.5601224433004033, "grad_norm": 0.9367809891700745, "learning_rate": 5e-05, "loss": 1.5844, "step": 5750 }, { "epoch": 2.582384861555586, "grad_norm": 1.0291093587875366, "learning_rate": 5e-05, "loss": 1.5841, "step": 5800 }, { "epoch": 2.6046472798107696, "grad_norm": 0.8904668688774109, "learning_rate": 5e-05, "loss": 1.5883, "step": 5850 }, { "epoch": 2.6269096980659525, "grad_norm": 0.9640474915504456, "learning_rate": 5e-05, "loss": 1.5855, "step": 5900 }, { "epoch": 2.6491721163211355, "grad_norm": 0.979326605796814, "learning_rate": 5e-05, "loss": 1.5798, "step": 5950 }, { "epoch": 2.6714345345763184, "grad_norm": 1.2588844299316406, "learning_rate": 5e-05, "loss": 1.6018, "step": 6000 }, { "epoch": 2.6714345345763184, "eval_loss": 1.5868676900863647, "eval_runtime": 40.9701, "eval_samples_per_second": 389.821, "eval_steps_per_second": 48.743, "step": 6000 }, { "epoch": 2.6936969528315013, "grad_norm": 1.070421814918518, "learning_rate": 5e-05, "loss": 1.6947, "step": 6050 }, { "epoch": 2.7159593710866843, "grad_norm": 0.9952645301818848, "learning_rate": 5e-05, "loss": 1.6907, "step": 6100 }, { "epoch": 2.738221789341867, "grad_norm": 1.2595455646514893, "learning_rate": 5e-05, "loss": 1.6954, "step": 6150 }, { "epoch": 2.76048420759705, "grad_norm": 0.9722006916999817, "learning_rate": 5e-05, "loss": 1.6832, "step": 6200 }, { "epoch": 2.782746625852233, "grad_norm": 1.2001519203186035, "learning_rate": 5e-05, "loss": 1.6832, "step": 6250 }, { "epoch": 2.805009044107416, "grad_norm": 1.316867709159851, "learning_rate": 5e-05, "loss": 1.6873, "step": 6300 }, { "epoch": 2.8272714623625994, "grad_norm": 1.2271651029586792, "learning_rate": 5e-05, "loss": 1.6865, "step": 6350 }, { "epoch": 2.8495338806177823, "grad_norm": 1.2443265914916992, "learning_rate": 5e-05, "loss": 1.6779, "step": 6400 }, { "epoch": 2.8717962988729653, "grad_norm": 1.1751494407653809, "learning_rate": 5e-05, "loss": 1.666, "step": 6450 }, { "epoch": 2.894058717128148, "grad_norm": 0.9704211950302124, "learning_rate": 5e-05, "loss": 1.6628, "step": 6500 }, { "epoch": 2.894058717128148, "eval_loss": 1.5628445148468018, "eval_runtime": 41.0455, "eval_samples_per_second": 389.105, "eval_steps_per_second": 48.653, "step": 6500 }, { "epoch": 2.916321135383331, "grad_norm": 1.0452390909194946, "learning_rate": 5e-05, "loss": 1.6794, "step": 6550 }, { "epoch": 2.938583553638514, "grad_norm": 1.338881254196167, "learning_rate": 5e-05, "loss": 1.6678, "step": 6600 }, { "epoch": 2.960845971893697, "grad_norm": 0.989860475063324, "learning_rate": 5e-05, "loss": 1.6753, "step": 6650 }, { "epoch": 2.98310839014888, "grad_norm": 1.1380687952041626, "learning_rate": 5e-05, "loss": 1.6639, "step": 6700 }, { "epoch": 3.0057882287463475, "grad_norm": 1.2292852401733398, "learning_rate": 5e-05, "loss": 1.697, "step": 6750 }, { "epoch": 3.0280506470015305, "grad_norm": 1.1919242143630981, "learning_rate": 5e-05, "loss": 1.6714, "step": 6800 }, { "epoch": 3.0503130652567134, "grad_norm": 1.1312869787216187, "learning_rate": 5e-05, "loss": 1.6641, "step": 6850 }, { "epoch": 3.0725754835118964, "grad_norm": 1.3589369058609009, "learning_rate": 5e-05, "loss": 1.655, "step": 6900 }, { "epoch": 3.0948379017670793, "grad_norm": 1.257063627243042, "learning_rate": 5e-05, "loss": 1.6661, "step": 6950 }, { "epoch": 3.1171003200222622, "grad_norm": 1.3228737115859985, "learning_rate": 5e-05, "loss": 1.653, "step": 7000 }, { "epoch": 3.1171003200222622, "eval_loss": 1.5605802536010742, "eval_runtime": 42.9947, "eval_samples_per_second": 371.464, "eval_steps_per_second": 46.448, "step": 7000 }, { "epoch": 3.1393627382774456, "grad_norm": 1.0204429626464844, "learning_rate": 5e-05, "loss": 1.6602, "step": 7050 }, { "epoch": 3.1616251565326285, "grad_norm": 0.9796785712242126, "learning_rate": 5e-05, "loss": 1.6508, "step": 7100 }, { "epoch": 3.1838875747878115, "grad_norm": 0.9721747040748596, "learning_rate": 5e-05, "loss": 1.6566, "step": 7150 }, { "epoch": 3.2061499930429944, "grad_norm": 1.1874974966049194, "learning_rate": 5e-05, "loss": 1.6501, "step": 7200 }, { "epoch": 3.2284124112981774, "grad_norm": 1.2861804962158203, "learning_rate": 5e-05, "loss": 1.6663, "step": 7250 }, { "epoch": 3.2506748295533603, "grad_norm": 0.9947218894958496, "learning_rate": 5e-05, "loss": 1.6732, "step": 7300 }, { "epoch": 3.2729372478085432, "grad_norm": 1.1224796772003174, "learning_rate": 5e-05, "loss": 1.6597, "step": 7350 }, { "epoch": 3.295199666063726, "grad_norm": 1.2262948751449585, "learning_rate": 5e-05, "loss": 1.6544, "step": 7400 }, { "epoch": 3.317462084318909, "grad_norm": 1.114092469215393, "learning_rate": 5e-05, "loss": 1.6532, "step": 7450 }, { "epoch": 3.339724502574092, "grad_norm": 1.0086640119552612, "learning_rate": 5e-05, "loss": 1.6575, "step": 7500 }, { "epoch": 3.339724502574092, "eval_loss": 1.5381077527999878, "eval_runtime": 43.9183, "eval_samples_per_second": 363.652, "eval_steps_per_second": 45.471, "step": 7500 }, { "epoch": 3.3615416724641713, "grad_norm": 1.4630149602890015, "learning_rate": 5e-05, "loss": 1.6543, "step": 7550 }, { "epoch": 3.3838040907193543, "grad_norm": 0.9978652596473694, "learning_rate": 5e-05, "loss": 1.6469, "step": 7600 }, { "epoch": 3.406066508974537, "grad_norm": 0.9942854046821594, "learning_rate": 5e-05, "loss": 1.6524, "step": 7650 }, { "epoch": 3.42832892722972, "grad_norm": 1.6113872528076172, "learning_rate": 5e-05, "loss": 1.6392, "step": 7700 }, { "epoch": 3.450591345484903, "grad_norm": 1.2430763244628906, "learning_rate": 5e-05, "loss": 1.6524, "step": 7750 }, { "epoch": 3.472853763740086, "grad_norm": 0.9973090887069702, "learning_rate": 5e-05, "loss": 1.6396, "step": 7800 }, { "epoch": 3.4951161819952694, "grad_norm": 1.3717776536941528, "learning_rate": 5e-05, "loss": 1.6463, "step": 7850 }, { "epoch": 3.5173786002504523, "grad_norm": 1.3711599111557007, "learning_rate": 5e-05, "loss": 1.644, "step": 7900 }, { "epoch": 3.5396410185056353, "grad_norm": 1.0126900672912598, "learning_rate": 5e-05, "loss": 1.6311, "step": 7950 }, { "epoch": 3.561903436760818, "grad_norm": 1.0467159748077393, "learning_rate": 5e-05, "loss": 1.64, "step": 8000 }, { "epoch": 3.561903436760818, "eval_loss": 1.539516806602478, "eval_runtime": 42.3362, "eval_samples_per_second": 377.242, "eval_steps_per_second": 47.17, "step": 8000 }, { "epoch": 3.584165855016001, "grad_norm": 1.1766951084136963, "learning_rate": 5e-05, "loss": 1.6552, "step": 8050 }, { "epoch": 3.606428273271184, "grad_norm": 1.0943933725357056, "learning_rate": 5e-05, "loss": 1.6385, "step": 8100 }, { "epoch": 3.628690691526367, "grad_norm": 1.2377898693084717, "learning_rate": 5e-05, "loss": 1.6288, "step": 8150 }, { "epoch": 3.65095310978155, "grad_norm": 0.939339280128479, "learning_rate": 5e-05, "loss": 1.6357, "step": 8200 }, { "epoch": 3.673215528036733, "grad_norm": 1.0802948474884033, "learning_rate": 5e-05, "loss": 1.6367, "step": 8250 }, { "epoch": 3.695477946291916, "grad_norm": 1.089154601097107, "learning_rate": 5e-05, "loss": 1.6434, "step": 8300 }, { "epoch": 3.7177403645470988, "grad_norm": 1.095510482788086, "learning_rate": 5e-05, "loss": 1.6445, "step": 8350 }, { "epoch": 3.740002782802282, "grad_norm": 1.2433582544326782, "learning_rate": 5e-05, "loss": 1.6521, "step": 8400 }, { "epoch": 3.762265201057465, "grad_norm": 1.3547347784042358, "learning_rate": 5e-05, "loss": 1.6363, "step": 8450 }, { "epoch": 3.784527619312648, "grad_norm": 1.224070429801941, "learning_rate": 5e-05, "loss": 1.6455, "step": 8500 }, { "epoch": 3.784527619312648, "eval_loss": 1.516330361366272, "eval_runtime": 40.388, "eval_samples_per_second": 395.439, "eval_steps_per_second": 49.445, "step": 8500 }, { "epoch": 3.806790037567831, "grad_norm": 1.3312312364578247, "learning_rate": 5e-05, "loss": 1.6493, "step": 8550 }, { "epoch": 3.829052455823014, "grad_norm": 1.275539517402649, "learning_rate": 5e-05, "loss": 1.6385, "step": 8600 }, { "epoch": 3.851314874078197, "grad_norm": 1.1481244564056396, "learning_rate": 5e-05, "loss": 1.6369, "step": 8650 }, { "epoch": 3.8735772923333798, "grad_norm": 1.039244532585144, "learning_rate": 5e-05, "loss": 1.6277, "step": 8700 }, { "epoch": 3.8958397105885627, "grad_norm": 1.0740258693695068, "learning_rate": 5e-05, "loss": 1.6294, "step": 8750 }, { "epoch": 3.9181021288437456, "grad_norm": 1.0660001039505005, "learning_rate": 5e-05, "loss": 1.6123, "step": 8800 }, { "epoch": 3.9403645470989286, "grad_norm": 1.036129117012024, "learning_rate": 5e-05, "loss": 1.5803, "step": 8850 }, { "epoch": 3.9626269653541115, "grad_norm": 0.9285004734992981, "learning_rate": 5e-05, "loss": 1.5905, "step": 8900 }, { "epoch": 3.9848893836092945, "grad_norm": 0.9074347019195557, "learning_rate": 5e-05, "loss": 1.5713, "step": 8950 }, { "epoch": 4.0075692222067625, "grad_norm": 1.1706671714782715, "learning_rate": 5e-05, "loss": 1.6308, "step": 9000 }, { "epoch": 4.0075692222067625, "eval_loss": 1.5311250686645508, "eval_runtime": 40.3535, "eval_samples_per_second": 395.778, "eval_steps_per_second": 49.488, "step": 9000 }, { "epoch": 4.029831640461945, "grad_norm": 1.2552838325500488, "learning_rate": 5e-05, "loss": 1.6371, "step": 9050 }, { "epoch": 4.052094058717128, "grad_norm": 1.3415331840515137, "learning_rate": 5e-05, "loss": 1.6345, "step": 9100 }, { "epoch": 4.074356476972311, "grad_norm": 1.029757022857666, "learning_rate": 5e-05, "loss": 1.629, "step": 9150 }, { "epoch": 4.096618895227494, "grad_norm": 1.1435120105743408, "learning_rate": 5e-05, "loss": 1.6287, "step": 9200 }, { "epoch": 4.118881313482677, "grad_norm": 1.385100245475769, "learning_rate": 5e-05, "loss": 1.6335, "step": 9250 }, { "epoch": 4.14114373173786, "grad_norm": 1.062818169593811, "learning_rate": 5e-05, "loss": 1.6184, "step": 9300 }, { "epoch": 4.163406149993043, "grad_norm": 1.3703244924545288, "learning_rate": 5e-05, "loss": 1.631, "step": 9350 }, { "epoch": 4.185668568248226, "grad_norm": 1.1130529642105103, "learning_rate": 5e-05, "loss": 1.6284, "step": 9400 }, { "epoch": 4.207930986503409, "grad_norm": 1.189207911491394, "learning_rate": 5e-05, "loss": 1.619, "step": 9450 }, { "epoch": 4.230193404758592, "grad_norm": 1.0979055166244507, "learning_rate": 5e-05, "loss": 1.6324, "step": 9500 }, { "epoch": 4.230193404758592, "eval_loss": 1.5118227005004883, "eval_runtime": 40.2303, "eval_samples_per_second": 396.989, "eval_steps_per_second": 49.639, "step": 9500 }, { "epoch": 4.252455823013775, "grad_norm": 1.1394270658493042, "learning_rate": 5e-05, "loss": 1.6229, "step": 9550 }, { "epoch": 4.274718241268958, "grad_norm": 1.0398465394973755, "learning_rate": 5e-05, "loss": 1.625, "step": 9600 }, { "epoch": 4.296980659524141, "grad_norm": 1.1344504356384277, "learning_rate": 5e-05, "loss": 1.6113, "step": 9650 }, { "epoch": 4.319243077779324, "grad_norm": 0.9889805316925049, "learning_rate": 5e-05, "loss": 1.6195, "step": 9700 }, { "epoch": 4.3415054960345065, "grad_norm": 1.2321630716323853, "learning_rate": 5e-05, "loss": 1.6133, "step": 9750 }, { "epoch": 4.3637679142896895, "grad_norm": 1.0766791105270386, "learning_rate": 5e-05, "loss": 1.587, "step": 9800 }, { "epoch": 4.386030332544872, "grad_norm": 0.9230866432189941, "learning_rate": 5e-05, "loss": 1.5747, "step": 9850 }, { "epoch": 4.408292750800055, "grad_norm": 1.036097526550293, "learning_rate": 5e-05, "loss": 1.5673, "step": 9900 }, { "epoch": 4.430555169055238, "grad_norm": 1.0321383476257324, "learning_rate": 5e-05, "loss": 1.5641, "step": 9950 }, { "epoch": 4.452817587310421, "grad_norm": 0.9865553379058838, "learning_rate": 5e-05, "loss": 1.5481, "step": 10000 }, { "epoch": 4.452817587310421, "eval_loss": 1.5091972351074219, "eval_runtime": 40.5505, "eval_samples_per_second": 393.855, "eval_steps_per_second": 49.247, "step": 10000 }, { "epoch": 4.475080005565605, "grad_norm": 1.0181940793991089, "learning_rate": 5e-05, "loss": 1.5594, "step": 10050 }, { "epoch": 4.497342423820788, "grad_norm": 1.0538172721862793, "learning_rate": 5e-05, "loss": 1.5523, "step": 10100 }, { "epoch": 4.519604842075971, "grad_norm": 0.936060905456543, "learning_rate": 5e-05, "loss": 1.547, "step": 10150 }, { "epoch": 4.541867260331154, "grad_norm": 1.225715160369873, "learning_rate": 5e-05, "loss": 1.5491, "step": 10200 }, { "epoch": 4.564129678586337, "grad_norm": 1.2574198246002197, "learning_rate": 5e-05, "loss": 1.5496, "step": 10250 }, { "epoch": 4.58639209684152, "grad_norm": 1.2122540473937988, "learning_rate": 5e-05, "loss": 1.5327, "step": 10300 }, { "epoch": 4.608654515096703, "grad_norm": 1.1094001531600952, "learning_rate": 5e-05, "loss": 1.5375, "step": 10350 }, { "epoch": 4.630916933351886, "grad_norm": 1.0384974479675293, "learning_rate": 5e-05, "loss": 1.555, "step": 10400 }, { "epoch": 4.6531793516070685, "grad_norm": 1.0797594785690308, "learning_rate": 5e-05, "loss": 1.5621, "step": 10450 }, { "epoch": 4.6754417698622515, "grad_norm": 1.0724256038665771, "learning_rate": 5e-05, "loss": 1.547, "step": 10500 }, { "epoch": 4.6754417698622515, "eval_loss": 1.5108764171600342, "eval_runtime": 40.4882, "eval_samples_per_second": 394.461, "eval_steps_per_second": 49.323, "step": 10500 }, { "epoch": 4.697704188117434, "grad_norm": 1.236370325088501, "learning_rate": 5e-05, "loss": 1.5426, "step": 10550 }, { "epoch": 4.719966606372617, "grad_norm": 1.1259009838104248, "learning_rate": 5e-05, "loss": 1.5701, "step": 10600 }, { "epoch": 4.7422290246278, "grad_norm": 1.0653769969940186, "learning_rate": 5e-05, "loss": 1.5543, "step": 10650 }, { "epoch": 4.764491442882983, "grad_norm": 1.1116371154785156, "learning_rate": 5e-05, "loss": 1.557, "step": 10700 }, { "epoch": 4.786753861138166, "grad_norm": 1.0332480669021606, "learning_rate": 5e-05, "loss": 1.5513, "step": 10750 }, { "epoch": 4.809016279393349, "grad_norm": 1.1142674684524536, "learning_rate": 5e-05, "loss": 1.5448, "step": 10800 }, { "epoch": 4.831278697648532, "grad_norm": 1.0316691398620605, "learning_rate": 5e-05, "loss": 1.5456, "step": 10850 }, { "epoch": 4.853541115903715, "grad_norm": 0.987628161907196, "learning_rate": 5e-05, "loss": 1.5535, "step": 10900 }, { "epoch": 4.875803534158898, "grad_norm": 1.125772476196289, "learning_rate": 5e-05, "loss": 1.5583, "step": 10950 }, { "epoch": 4.898065952414081, "grad_norm": 0.9541718363761902, "learning_rate": 5e-05, "loss": 1.5584, "step": 11000 }, { "epoch": 4.898065952414081, "eval_loss": 1.5041238069534302, "eval_runtime": 40.2547, "eval_samples_per_second": 396.749, "eval_steps_per_second": 49.609, "step": 11000 } ], "logging_steps": 50, "max_steps": 11230, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.530938070076723e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }