{
"best_global_step": 1000,
"best_metric": 9.911575317382812,
"best_model_checkpoint": "./models/v-001/checkpoint-1000",
"epoch": 64.55434782608695,
"eval_steps": 100,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.6521739130434783,
"grad_norm": 0.5281125903129578,
"learning_rate": 3.166666666666667e-06,
"loss": 10.3483,
"step": 20
},
{
"epoch": 1.2934782608695652,
"grad_norm": 0.6133605241775513,
"learning_rate": 6.5000000000000004e-06,
"loss": 10.3417,
"step": 40
},
{
"epoch": 1.9456521739130435,
"grad_norm": 0.6125457882881165,
"learning_rate": 9.833333333333333e-06,
"loss": 10.3299,
"step": 60
},
{
"epoch": 2.5869565217391304,
"grad_norm": 0.5962333679199219,
"learning_rate": 1.3166666666666665e-05,
"loss": 10.3064,
"step": 80
},
{
"epoch": 3.2282608695652173,
"grad_norm": 0.6132860779762268,
"learning_rate": 1.65e-05,
"loss": 10.2727,
"step": 100
},
{
"epoch": 3.2282608695652173,
"eval_accuracy": 4.39651795777744e-05,
"eval_loss": 10.328398704528809,
"eval_runtime": 17.8533,
"eval_samples_per_second": 16.916,
"eval_steps_per_second": 1.064,
"step": 100
},
{
"epoch": 3.880434782608696,
"grad_norm": 0.5671436190605164,
"learning_rate": 1.9833333333333335e-05,
"loss": 10.2083,
"step": 120
},
{
"epoch": 4.521739130434782,
"grad_norm": 0.4685352146625519,
"learning_rate": 2.3166666666666666e-05,
"loss": 10.1064,
"step": 140
},
{
"epoch": 5.163043478260869,
"grad_norm": 0.45952484011650085,
"learning_rate": 2.6500000000000004e-05,
"loss": 9.9909,
"step": 160
},
{
"epoch": 5.815217391304348,
"grad_norm": 0.41472023725509644,
"learning_rate": 2.9833333333333335e-05,
"loss": 9.8718,
"step": 180
},
{
"epoch": 6.456521739130435,
"grad_norm": 0.43127089738845825,
"learning_rate": 3.316666666666667e-05,
"loss": 9.7582,
"step": 200
},
{
"epoch": 6.456521739130435,
"eval_accuracy": 0.0026139297676240417,
"eval_loss": 10.096575736999512,
"eval_runtime": 35.2536,
"eval_samples_per_second": 8.566,
"eval_steps_per_second": 0.539,
"step": 200
},
{
"epoch": 7.0978260869565215,
"grad_norm": 0.4042549431324005,
"learning_rate": 3.65e-05,
"loss": 9.6266,
"step": 220
},
{
"epoch": 7.75,
"grad_norm": 0.3853429853916168,
"learning_rate": 3.983333333333333e-05,
"loss": 9.5189,
"step": 240
},
{
"epoch": 8.391304347826088,
"grad_norm": 0.38628196716308594,
"learning_rate": 4.316666666666667e-05,
"loss": 9.4044,
"step": 260
},
{
"epoch": 9.032608695652174,
"grad_norm": 0.43260782957077026,
"learning_rate": 4.6500000000000005e-05,
"loss": 9.3052,
"step": 280
},
{
"epoch": 9.684782608695652,
"grad_norm": 0.4363991916179657,
"learning_rate": 4.9833333333333336e-05,
"loss": 9.2052,
"step": 300
},
{
"epoch": 9.684782608695652,
"eval_accuracy": 0.003677087746504768,
"eval_loss": 9.951318740844727,
"eval_runtime": 47.0129,
"eval_samples_per_second": 6.424,
"eval_steps_per_second": 0.404,
"step": 300
},
{
"epoch": 10.326086956521738,
"grad_norm": 0.49219855666160583,
"learning_rate": 5.316666666666667e-05,
"loss": 9.1327,
"step": 320
},
{
"epoch": 10.978260869565217,
"grad_norm": 0.4471158981323242,
"learning_rate": 5.65e-05,
"loss": 9.0328,
"step": 340
},
{
"epoch": 11.619565217391305,
"grad_norm": 0.3913232684135437,
"learning_rate": 5.983333333333334e-05,
"loss": 8.9514,
"step": 360
},
{
"epoch": 12.26086956521739,
"grad_norm": 1.0882291793823242,
"learning_rate": 6.316666666666668e-05,
"loss": 8.8916,
"step": 380
},
{
"epoch": 12.91304347826087,
"grad_norm": 0.6010486483573914,
"learning_rate": 6.65e-05,
"loss": 8.8216,
"step": 400
},
{
"epoch": 12.91304347826087,
"eval_accuracy": 0.003413296669038122,
"eval_loss": 9.953831672668457,
"eval_runtime": 36.8026,
"eval_samples_per_second": 8.206,
"eval_steps_per_second": 0.516,
"step": 400
},
{
"epoch": 13.554347826086957,
"grad_norm": 0.49556687474250793,
"learning_rate": 6.983333333333334e-05,
"loss": 8.7406,
"step": 420
},
{
"epoch": 14.195652173913043,
"grad_norm": 0.495381623506546,
"learning_rate": 7.316666666666668e-05,
"loss": 8.679,
"step": 440
},
{
"epoch": 14.847826086956522,
"grad_norm": 0.6165482401847839,
"learning_rate": 7.65e-05,
"loss": 8.5904,
"step": 460
},
{
"epoch": 15.48913043478261,
"grad_norm": 0.5654007792472839,
"learning_rate": 7.983333333333334e-05,
"loss": 8.4896,
"step": 480
},
{
"epoch": 16.130434782608695,
"grad_norm": 0.6611935496330261,
"learning_rate": 8.316666666666666e-05,
"loss": 8.406,
"step": 500
},
{
"epoch": 16.130434782608695,
"eval_accuracy": 0.0029496638662179554,
"eval_loss": 9.952414512634277,
"eval_runtime": 39.649,
"eval_samples_per_second": 7.617,
"eval_steps_per_second": 0.479,
"step": 500
},
{
"epoch": 16.782608695652176,
"grad_norm": 0.7537912726402283,
"learning_rate": 8.65e-05,
"loss": 8.2948,
"step": 520
},
{
"epoch": 17.42391304347826,
"grad_norm": 0.9145230650901794,
"learning_rate": 8.983333333333334e-05,
"loss": 8.1992,
"step": 540
},
{
"epoch": 18.065217391304348,
"grad_norm": 0.810655415058136,
"learning_rate": 9.316666666666666e-05,
"loss": 8.08,
"step": 560
},
{
"epoch": 18.717391304347824,
"grad_norm": 0.9121057987213135,
"learning_rate": 9.65e-05,
"loss": 7.9438,
"step": 580
},
{
"epoch": 19.358695652173914,
"grad_norm": 0.8612993359565735,
"learning_rate": 9.983333333333334e-05,
"loss": 7.8326,
"step": 600
},
{
"epoch": 19.358695652173914,
"eval_accuracy": 0.0021423032957897346,
"eval_loss": 9.945837020874023,
"eval_runtime": 25.9715,
"eval_samples_per_second": 11.628,
"eval_steps_per_second": 0.732,
"step": 600
},
{
"epoch": 20.0,
"grad_norm": 0.8960981369018555,
"learning_rate": 9.995456138403733e-05,
"loss": 7.716,
"step": 620
},
{
"epoch": 20.652173913043477,
"grad_norm": 0.9986662268638611,
"learning_rate": 9.980864681729001e-05,
"loss": 7.5692,
"step": 640
},
{
"epoch": 21.293478260869566,
"grad_norm": 0.9232766628265381,
"learning_rate": 9.956242426451834e-05,
"loss": 7.4208,
"step": 660
},
{
"epoch": 21.945652173913043,
"grad_norm": 0.9031963348388672,
"learning_rate": 9.921638958517565e-05,
"loss": 7.3481,
"step": 680
},
{
"epoch": 22.58695652173913,
"grad_norm": 0.9567400813102722,
"learning_rate": 9.877123964705497e-05,
"loss": 7.1956,
"step": 700
},
{
"epoch": 22.58695652173913,
"eval_accuracy": 0.001666679989448357,
"eval_loss": 9.986405372619629,
"eval_runtime": 57.7614,
"eval_samples_per_second": 5.228,
"eval_steps_per_second": 0.329,
"step": 700
},
{
"epoch": 23.22826086956522,
"grad_norm": 1.135650873184204,
"learning_rate": 9.822787092288991e-05,
"loss": 7.0604,
"step": 720
},
{
"epoch": 23.880434782608695,
"grad_norm": 0.8771520853042603,
"learning_rate": 9.758737768497802e-05,
"loss": 6.9215,
"step": 740
},
{
"epoch": 24.52173913043478,
"grad_norm": 1.0156564712524414,
"learning_rate": 9.685104980146193e-05,
"loss": 6.8363,
"step": 760
},
{
"epoch": 25.16304347826087,
"grad_norm": 0.9963734149932861,
"learning_rate": 9.60203701387066e-05,
"loss": 6.6577,
"step": 780
},
{
"epoch": 25.815217391304348,
"grad_norm": 0.8723818063735962,
"learning_rate": 9.509701157500376e-05,
"loss": 6.5659,
"step": 800
},
{
"epoch": 25.815217391304348,
"eval_accuracy": 0.0014988129401514,
"eval_loss": 9.925810813903809,
"eval_runtime": 49.4048,
"eval_samples_per_second": 6.113,
"eval_steps_per_second": 0.385,
"step": 800
},
{
"epoch": 26.456521739130434,
"grad_norm": 1.1702412366867065,
"learning_rate": 9.408283363161774e-05,
"loss": 6.393,
"step": 820
},
{
"epoch": 27.097826086956523,
"grad_norm": 0.8747526407241821,
"learning_rate": 9.297987872795705e-05,
"loss": 6.3074,
"step": 840
},
{
"epoch": 27.75,
"grad_norm": 0.973866879940033,
"learning_rate": 9.179036806841353e-05,
"loss": 6.1801,
"step": 860
},
{
"epoch": 28.391304347826086,
"grad_norm": 1.585481882095337,
"learning_rate": 9.051669716915227e-05,
"loss": 6.1175,
"step": 880
},
{
"epoch": 29.032608695652176,
"grad_norm": 1.0919766426086426,
"learning_rate": 8.916143103386093e-05,
"loss": 5.9719,
"step": 900
},
{
"epoch": 29.032608695652176,
"eval_accuracy": 0.001458844595080696,
"eval_loss": 9.97097396850586,
"eval_runtime": 46.643,
"eval_samples_per_second": 6.475,
"eval_steps_per_second": 0.407,
"step": 900
},
{
"epoch": 29.684782608695652,
"grad_norm": 1.173614501953125,
"learning_rate": 8.77272989881736e-05,
"loss": 5.8702,
"step": 920
},
{
"epoch": 30.32608695652174,
"grad_norm": 1.0350476503372192,
"learning_rate": 8.621718918317225e-05,
"loss": 5.704,
"step": 940
},
{
"epoch": 30.97826086956522,
"grad_norm": 1.1128321886062622,
"learning_rate": 8.463414277903475e-05,
"loss": 5.6413,
"step": 960
},
{
"epoch": 31.619565217391305,
"grad_norm": 1.2460695505142212,
"learning_rate": 8.298134782054305e-05,
"loss": 5.4948,
"step": 980
},
{
"epoch": 32.26086956521739,
"grad_norm": 1.1606298685073853,
"learning_rate": 8.126213281678526e-05,
"loss": 5.4031,
"step": 1000
},
{
"epoch": 32.26086956521739,
"eval_accuracy": 0.001079145316909008,
"eval_loss": 9.911575317382812,
"eval_runtime": 48.65,
"eval_samples_per_second": 6.208,
"eval_steps_per_second": 0.391,
"step": 1000
},
{
"epoch": 32.91304347826087,
"grad_norm": 1.0351324081420898,
"learning_rate": 7.94799600379813e-05,
"loss": 5.3272,
"step": 1020
},
{
"epoch": 33.55434782608695,
"grad_norm": 1.089340329170227,
"learning_rate": 7.763841854293145e-05,
"loss": 5.2996,
"step": 1040
},
{
"epoch": 34.19565217391305,
"grad_norm": 1.3611856698989868,
"learning_rate": 7.574121695112954e-05,
"loss": 5.1266,
"step": 1060
},
{
"epoch": 34.84782608695652,
"grad_norm": 1.2501380443572998,
"learning_rate": 7.379217597409688e-05,
"loss": 5.0434,
"step": 1080
},
{
"epoch": 35.48913043478261,
"grad_norm": 1.057522177696228,
"learning_rate": 7.179522072097774e-05,
"loss": 4.9784,
"step": 1100
},
{
"epoch": 35.48913043478261,
"eval_accuracy": 0.0011830630140928385,
"eval_loss": 9.981914520263672,
"eval_runtime": 106.635,
"eval_samples_per_second": 2.832,
"eval_steps_per_second": 0.178,
"step": 1100
},
{
"epoch": 36.130434782608695,
"grad_norm": 1.1768474578857422,
"learning_rate": 6.975437279389181e-05,
"loss": 4.9012,
"step": 1120
},
{
"epoch": 36.78260869565217,
"grad_norm": 1.1783802509307861,
"learning_rate": 6.767374218896286e-05,
"loss": 4.823,
"step": 1140
},
{
"epoch": 37.42391304347826,
"grad_norm": 1.220082402229309,
"learning_rate": 6.555751901933342e-05,
"loss": 4.7149,
"step": 1160
},
{
"epoch": 38.06521739130435,
"grad_norm": 1.3078495264053345,
"learning_rate": 6.340996507683458e-05,
"loss": 4.6413,
"step": 1180
},
{
"epoch": 38.71739130434783,
"grad_norm": 1.2146966457366943,
"learning_rate": 6.123540524930442e-05,
"loss": 4.6684,
"step": 1200
},
{
"epoch": 38.71739130434783,
"eval_accuracy": 0.0008952909295837696,
"eval_loss": 10.014237403869629,
"eval_runtime": 31.4866,
"eval_samples_per_second": 9.591,
"eval_steps_per_second": 0.603,
"step": 1200
},
{
"epoch": 39.391304347826086,
"grad_norm": 1.090649962425232,
"learning_rate": 5.903821881083942e-05,
"loss": 4.7794,
"step": 1220
},
{
"epoch": 40.03260869565217,
"grad_norm": 1.2051453590393066,
"learning_rate": 5.682283060251932e-05,
"loss": 4.4631,
"step": 1240
},
{
"epoch": 40.68478260869565,
"grad_norm": 1.0512608289718628,
"learning_rate": 5.4593702121365955e-05,
"loss": 4.4119,
"step": 1260
},
{
"epoch": 41.32608695652174,
"grad_norm": 0.9912136793136597,
"learning_rate": 5.235532253548213e-05,
"loss": 4.3377,
"step": 1280
},
{
"epoch": 41.97826086956522,
"grad_norm": 1.0219991207122803,
"learning_rate": 5.0112199643464376e-05,
"loss": 4.3184,
"step": 1300
},
{
"epoch": 41.97826086956522,
"eval_accuracy": 0.001019192799302952,
"eval_loss": 10.048251152038574,
"eval_runtime": 18.0166,
"eval_samples_per_second": 16.762,
"eval_steps_per_second": 1.055,
"step": 1300
},
{
"epoch": 42.619565217391305,
"grad_norm": 0.9927310347557068,
"learning_rate": 4.7868850796296495e-05,
"loss": 4.331,
"step": 1320
},
{
"epoch": 43.26086956521739,
"grad_norm": 0.9491915106773376,
"learning_rate": 4.5629793800005945e-05,
"loss": 4.1006,
"step": 1340
},
{
"epoch": 43.91304347826087,
"grad_norm": 0.8775396347045898,
"learning_rate": 4.339953781740363e-05,
"loss": 4.177,
"step": 1360
},
{
"epoch": 44.55434782608695,
"grad_norm": 0.9140155911445618,
"learning_rate": 4.1182574287230224e-05,
"loss": 4.0757,
"step": 1380
},
{
"epoch": 45.19565217391305,
"grad_norm": 0.9226499199867249,
"learning_rate": 3.898336787899612e-05,
"loss": 4.1251,
"step": 1400
},
{
"epoch": 45.19565217391305,
"eval_accuracy": 0.0008153542394423617,
"eval_loss": 10.09643840789795,
"eval_runtime": 21.8568,
"eval_samples_per_second": 13.817,
"eval_steps_per_second": 0.869,
"step": 1400
},
{
"epoch": 45.84782608695652,
"grad_norm": 0.9727521538734436,
"learning_rate": 3.680634750173137e-05,
"loss": 4.0495,
"step": 1420
},
{
"epoch": 46.48913043478261,
"grad_norm": 0.8741424083709717,
"learning_rate": 3.4655897384752146e-05,
"loss": 4.0195,
"step": 1440
},
{
"epoch": 47.130434782608695,
"grad_norm": 0.9479995965957642,
"learning_rate": 3.2536348248406534e-05,
"loss": 3.9482,
"step": 1460
},
{
"epoch": 47.78260869565217,
"grad_norm": 0.9046297073364258,
"learning_rate": 3.0451968582579915e-05,
"loss": 3.9166,
"step": 1480
},
{
"epoch": 48.42391304347826,
"grad_norm": 0.861109733581543,
"learning_rate": 2.840695605052458e-05,
"loss": 3.909,
"step": 1500
},
{
"epoch": 48.42391304347826,
"eval_accuracy": 0.0008992877640908401,
"eval_loss": 10.132174491882324,
"eval_runtime": 28.1979,
"eval_samples_per_second": 10.71,
"eval_steps_per_second": 0.674,
"step": 1500
},
{
"epoch": 49.06521739130435,
"grad_norm": 0.8144867420196533,
"learning_rate": 2.6405429035324403e-05,
"loss": 3.9054,
"step": 1520
},
{
"epoch": 49.71739130434783,
"grad_norm": 0.8467565178871155,
"learning_rate": 2.4451418346019573e-05,
"loss": 3.8543,
"step": 1540
},
{
"epoch": 50.358695652173914,
"grad_norm": 0.8617934584617615,
"learning_rate": 2.2548859100093407e-05,
"loss": 3.8013,
"step": 1560
},
{
"epoch": 51.0,
"grad_norm": 1.1794334650039673,
"learning_rate": 2.0701582798669676e-05,
"loss": 3.803,
"step": 1580
},
{
"epoch": 51.65217391304348,
"grad_norm": 0.861342191696167,
"learning_rate": 1.8913309610379015e-05,
"loss": 3.7535,
"step": 1600
},
{
"epoch": 51.65217391304348,
"eval_accuracy": 0.0008753067570484176,
"eval_loss": 10.158663749694824,
"eval_runtime": 31.7351,
"eval_samples_per_second": 9.516,
"eval_steps_per_second": 0.599,
"step": 1600
},
{
"epoch": 52.29347826086956,
"grad_norm": 0.8501729369163513,
"learning_rate": 1.7187640879434553e-05,
"loss": 3.7589,
"step": 1620
},
{
"epoch": 52.94565217391305,
"grad_norm": 0.8114346861839294,
"learning_rate": 1.552805187300389e-05,
"loss": 3.8172,
"step": 1640
},
{
"epoch": 53.58695652173913,
"grad_norm": 0.8227590918540955,
"learning_rate": 1.3937884782483484e-05,
"loss": 3.6933,
"step": 1660
},
{
"epoch": 54.22826086956522,
"grad_norm": 0.7768607139587402,
"learning_rate": 1.242034199277008e-05,
"loss": 3.8079,
"step": 1680
},
{
"epoch": 54.880434782608695,
"grad_norm": 0.8110019564628601,
"learning_rate": 1.097847963308351e-05,
"loss": 3.681,
"step": 1700
},
{
"epoch": 54.880434782608695,
"eval_accuracy": 0.0008273447429635729,
"eval_loss": 10.178533554077148,
"eval_runtime": 31.6166,
"eval_samples_per_second": 9.552,
"eval_steps_per_second": 0.601,
"step": 1700
},
{
"epoch": 55.52173913043478,
"grad_norm": 0.8320772647857666,
"learning_rate": 9.615201422329406e-06,
"loss": 3.6494,
"step": 1720
},
{
"epoch": 56.16304347826087,
"grad_norm": 0.7713989019393921,
"learning_rate": 8.333252821395526e-06,
"loss": 3.7021,
"step": 1740
},
{
"epoch": 56.81521739130435,
"grad_norm": 0.7743974924087524,
"learning_rate": 7.135215504159115e-06,
"loss": 3.7404,
"step": 1760
},
{
"epoch": 57.45652173913044,
"grad_norm": 0.7438375353813171,
"learning_rate": 6.023502158339078e-06,
"loss": 3.6467,
"step": 1780
},
{
"epoch": 58.09782608695652,
"grad_norm": 0.8585782051086426,
"learning_rate": 5.000351626664207e-06,
"loss": 3.688,
"step": 1800
},
{
"epoch": 58.09782608695652,
"eval_accuracy": 0.0008273447429635729,
"eval_loss": 10.187094688415527,
"eval_runtime": 31.7031,
"eval_samples_per_second": 9.526,
"eval_steps_per_second": 0.599,
"step": 1800
},
{
"epoch": 58.75,
"grad_norm": 0.7883967161178589,
"learning_rate": 4.067824398141701e-06,
"loss": 3.6471,
"step": 1820
},
{
"epoch": 59.391304347826086,
"grad_norm": 0.7768418192863464,
"learning_rate": 3.2277984585066366e-06,
"loss": 3.6824,
"step": 1840
},
{
"epoch": 60.03260869565217,
"grad_norm": 0.7814875245094299,
"learning_rate": 2.4819655082085835e-06,
"loss": 3.6767,
"step": 1860
},
{
"epoch": 60.68478260869565,
"grad_norm": 0.8055542707443237,
"learning_rate": 1.8318275555520237e-06,
"loss": 3.5899,
"step": 1880
},
{
"epoch": 61.32608695652174,
"grad_norm": 0.8155117630958557,
"learning_rate": 1.2786938918515568e-06,
"loss": 3.6685,
"step": 1900
},
{
"epoch": 61.32608695652174,
"eval_accuracy": 0.0008233479084565024,
"eval_loss": 10.19116497039795,
"eval_runtime": 36.7079,
"eval_samples_per_second": 8.227,
"eval_steps_per_second": 0.518,
"step": 1900
},
{
"epoch": 61.97826086956522,
"grad_norm": 0.8085272312164307,
"learning_rate": 8.236784546933718e-07,
"loss": 3.697,
"step": 1920
},
{
"epoch": 62.619565217391305,
"grad_norm": 0.7857375741004944,
"learning_rate": 4.676975846132692e-07,
"loss": 3.6836,
"step": 1940
},
{
"epoch": 63.26086956521739,
"grad_norm": 0.7917608618736267,
"learning_rate": 2.1146817970871258e-07,
"loss": 3.6163,
"step": 1960
},
{
"epoch": 63.91304347826087,
"grad_norm": 0.7354781031608582,
"learning_rate": 5.550625190150483e-08,
"loss": 3.6652,
"step": 1980
},
{
"epoch": 64.55434782608695,
"grad_norm": 0.7902089357376099,
"learning_rate": 1.2588775841204658e-10,
"loss": 3.6326,
"step": 2000
},
{
"epoch": 64.55434782608695,
"eval_accuracy": 0.0008273447429635729,
"eval_loss": 10.191176414489746,
"eval_runtime": 37.3382,
"eval_samples_per_second": 8.088,
"eval_steps_per_second": 0.509,
"step": 2000
}
],
"logging_steps": 20,
"max_steps": 2000,
"num_input_tokens_seen": 0,
"num_train_epochs": 65,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2610102966336000.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}