|
{ |
|
"best_global_step": 1000, |
|
"best_metric": 9.911575317382812, |
|
"best_model_checkpoint": "./models/v-001/checkpoint-1000", |
|
"epoch": 64.55434782608695, |
|
"eval_steps": 100, |
|
"global_step": 2000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.6521739130434783, |
|
"grad_norm": 0.5281125903129578, |
|
"learning_rate": 3.166666666666667e-06, |
|
"loss": 10.3483, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.2934782608695652, |
|
"grad_norm": 0.6133605241775513, |
|
"learning_rate": 6.5000000000000004e-06, |
|
"loss": 10.3417, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.9456521739130435, |
|
"grad_norm": 0.6125457882881165, |
|
"learning_rate": 9.833333333333333e-06, |
|
"loss": 10.3299, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 2.5869565217391304, |
|
"grad_norm": 0.5962333679199219, |
|
"learning_rate": 1.3166666666666665e-05, |
|
"loss": 10.3064, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 3.2282608695652173, |
|
"grad_norm": 0.6132860779762268, |
|
"learning_rate": 1.65e-05, |
|
"loss": 10.2727, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.2282608695652173, |
|
"eval_accuracy": 4.39651795777744e-05, |
|
"eval_loss": 10.328398704528809, |
|
"eval_runtime": 17.8533, |
|
"eval_samples_per_second": 16.916, |
|
"eval_steps_per_second": 1.064, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.880434782608696, |
|
"grad_norm": 0.5671436190605164, |
|
"learning_rate": 1.9833333333333335e-05, |
|
"loss": 10.2083, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 4.521739130434782, |
|
"grad_norm": 0.4685352146625519, |
|
"learning_rate": 2.3166666666666666e-05, |
|
"loss": 10.1064, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 5.163043478260869, |
|
"grad_norm": 0.45952484011650085, |
|
"learning_rate": 2.6500000000000004e-05, |
|
"loss": 9.9909, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 5.815217391304348, |
|
"grad_norm": 0.41472023725509644, |
|
"learning_rate": 2.9833333333333335e-05, |
|
"loss": 9.8718, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 6.456521739130435, |
|
"grad_norm": 0.43127089738845825, |
|
"learning_rate": 3.316666666666667e-05, |
|
"loss": 9.7582, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 6.456521739130435, |
|
"eval_accuracy": 0.0026139297676240417, |
|
"eval_loss": 10.096575736999512, |
|
"eval_runtime": 35.2536, |
|
"eval_samples_per_second": 8.566, |
|
"eval_steps_per_second": 0.539, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 7.0978260869565215, |
|
"grad_norm": 0.4042549431324005, |
|
"learning_rate": 3.65e-05, |
|
"loss": 9.6266, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"grad_norm": 0.3853429853916168, |
|
"learning_rate": 3.983333333333333e-05, |
|
"loss": 9.5189, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 8.391304347826088, |
|
"grad_norm": 0.38628196716308594, |
|
"learning_rate": 4.316666666666667e-05, |
|
"loss": 9.4044, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 9.032608695652174, |
|
"grad_norm": 0.43260782957077026, |
|
"learning_rate": 4.6500000000000005e-05, |
|
"loss": 9.3052, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 9.684782608695652, |
|
"grad_norm": 0.4363991916179657, |
|
"learning_rate": 4.9833333333333336e-05, |
|
"loss": 9.2052, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 9.684782608695652, |
|
"eval_accuracy": 0.003677087746504768, |
|
"eval_loss": 9.951318740844727, |
|
"eval_runtime": 47.0129, |
|
"eval_samples_per_second": 6.424, |
|
"eval_steps_per_second": 0.404, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 10.326086956521738, |
|
"grad_norm": 0.49219855666160583, |
|
"learning_rate": 5.316666666666667e-05, |
|
"loss": 9.1327, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 10.978260869565217, |
|
"grad_norm": 0.4471158981323242, |
|
"learning_rate": 5.65e-05, |
|
"loss": 9.0328, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 11.619565217391305, |
|
"grad_norm": 0.3913232684135437, |
|
"learning_rate": 5.983333333333334e-05, |
|
"loss": 8.9514, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 12.26086956521739, |
|
"grad_norm": 1.0882291793823242, |
|
"learning_rate": 6.316666666666668e-05, |
|
"loss": 8.8916, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 12.91304347826087, |
|
"grad_norm": 0.6010486483573914, |
|
"learning_rate": 6.65e-05, |
|
"loss": 8.8216, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 12.91304347826087, |
|
"eval_accuracy": 0.003413296669038122, |
|
"eval_loss": 9.953831672668457, |
|
"eval_runtime": 36.8026, |
|
"eval_samples_per_second": 8.206, |
|
"eval_steps_per_second": 0.516, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 13.554347826086957, |
|
"grad_norm": 0.49556687474250793, |
|
"learning_rate": 6.983333333333334e-05, |
|
"loss": 8.7406, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 14.195652173913043, |
|
"grad_norm": 0.495381623506546, |
|
"learning_rate": 7.316666666666668e-05, |
|
"loss": 8.679, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 14.847826086956522, |
|
"grad_norm": 0.6165482401847839, |
|
"learning_rate": 7.65e-05, |
|
"loss": 8.5904, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 15.48913043478261, |
|
"grad_norm": 0.5654007792472839, |
|
"learning_rate": 7.983333333333334e-05, |
|
"loss": 8.4896, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 16.130434782608695, |
|
"grad_norm": 0.6611935496330261, |
|
"learning_rate": 8.316666666666666e-05, |
|
"loss": 8.406, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 16.130434782608695, |
|
"eval_accuracy": 0.0029496638662179554, |
|
"eval_loss": 9.952414512634277, |
|
"eval_runtime": 39.649, |
|
"eval_samples_per_second": 7.617, |
|
"eval_steps_per_second": 0.479, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 16.782608695652176, |
|
"grad_norm": 0.7537912726402283, |
|
"learning_rate": 8.65e-05, |
|
"loss": 8.2948, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 17.42391304347826, |
|
"grad_norm": 0.9145230650901794, |
|
"learning_rate": 8.983333333333334e-05, |
|
"loss": 8.1992, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 18.065217391304348, |
|
"grad_norm": 0.810655415058136, |
|
"learning_rate": 9.316666666666666e-05, |
|
"loss": 8.08, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 18.717391304347824, |
|
"grad_norm": 0.9121057987213135, |
|
"learning_rate": 9.65e-05, |
|
"loss": 7.9438, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 19.358695652173914, |
|
"grad_norm": 0.8612993359565735, |
|
"learning_rate": 9.983333333333334e-05, |
|
"loss": 7.8326, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 19.358695652173914, |
|
"eval_accuracy": 0.0021423032957897346, |
|
"eval_loss": 9.945837020874023, |
|
"eval_runtime": 25.9715, |
|
"eval_samples_per_second": 11.628, |
|
"eval_steps_per_second": 0.732, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 0.8960981369018555, |
|
"learning_rate": 9.995456138403733e-05, |
|
"loss": 7.716, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 20.652173913043477, |
|
"grad_norm": 0.9986662268638611, |
|
"learning_rate": 9.980864681729001e-05, |
|
"loss": 7.5692, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 21.293478260869566, |
|
"grad_norm": 0.9232766628265381, |
|
"learning_rate": 9.956242426451834e-05, |
|
"loss": 7.4208, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 21.945652173913043, |
|
"grad_norm": 0.9031963348388672, |
|
"learning_rate": 9.921638958517565e-05, |
|
"loss": 7.3481, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 22.58695652173913, |
|
"grad_norm": 0.9567400813102722, |
|
"learning_rate": 9.877123964705497e-05, |
|
"loss": 7.1956, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 22.58695652173913, |
|
"eval_accuracy": 0.001666679989448357, |
|
"eval_loss": 9.986405372619629, |
|
"eval_runtime": 57.7614, |
|
"eval_samples_per_second": 5.228, |
|
"eval_steps_per_second": 0.329, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 23.22826086956522, |
|
"grad_norm": 1.135650873184204, |
|
"learning_rate": 9.822787092288991e-05, |
|
"loss": 7.0604, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 23.880434782608695, |
|
"grad_norm": 0.8771520853042603, |
|
"learning_rate": 9.758737768497802e-05, |
|
"loss": 6.9215, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 24.52173913043478, |
|
"grad_norm": 1.0156564712524414, |
|
"learning_rate": 9.685104980146193e-05, |
|
"loss": 6.8363, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 25.16304347826087, |
|
"grad_norm": 0.9963734149932861, |
|
"learning_rate": 9.60203701387066e-05, |
|
"loss": 6.6577, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 25.815217391304348, |
|
"grad_norm": 0.8723818063735962, |
|
"learning_rate": 9.509701157500376e-05, |
|
"loss": 6.5659, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 25.815217391304348, |
|
"eval_accuracy": 0.0014988129401514, |
|
"eval_loss": 9.925810813903809, |
|
"eval_runtime": 49.4048, |
|
"eval_samples_per_second": 6.113, |
|
"eval_steps_per_second": 0.385, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 26.456521739130434, |
|
"grad_norm": 1.1702412366867065, |
|
"learning_rate": 9.408283363161774e-05, |
|
"loss": 6.393, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 27.097826086956523, |
|
"grad_norm": 0.8747526407241821, |
|
"learning_rate": 9.297987872795705e-05, |
|
"loss": 6.3074, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 27.75, |
|
"grad_norm": 0.973866879940033, |
|
"learning_rate": 9.179036806841353e-05, |
|
"loss": 6.1801, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 28.391304347826086, |
|
"grad_norm": 1.585481882095337, |
|
"learning_rate": 9.051669716915227e-05, |
|
"loss": 6.1175, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 29.032608695652176, |
|
"grad_norm": 1.0919766426086426, |
|
"learning_rate": 8.916143103386093e-05, |
|
"loss": 5.9719, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 29.032608695652176, |
|
"eval_accuracy": 0.001458844595080696, |
|
"eval_loss": 9.97097396850586, |
|
"eval_runtime": 46.643, |
|
"eval_samples_per_second": 6.475, |
|
"eval_steps_per_second": 0.407, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 29.684782608695652, |
|
"grad_norm": 1.173614501953125, |
|
"learning_rate": 8.77272989881736e-05, |
|
"loss": 5.8702, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 30.32608695652174, |
|
"grad_norm": 1.0350476503372192, |
|
"learning_rate": 8.621718918317225e-05, |
|
"loss": 5.704, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 30.97826086956522, |
|
"grad_norm": 1.1128321886062622, |
|
"learning_rate": 8.463414277903475e-05, |
|
"loss": 5.6413, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 31.619565217391305, |
|
"grad_norm": 1.2460695505142212, |
|
"learning_rate": 8.298134782054305e-05, |
|
"loss": 5.4948, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 32.26086956521739, |
|
"grad_norm": 1.1606298685073853, |
|
"learning_rate": 8.126213281678526e-05, |
|
"loss": 5.4031, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 32.26086956521739, |
|
"eval_accuracy": 0.001079145316909008, |
|
"eval_loss": 9.911575317382812, |
|
"eval_runtime": 48.65, |
|
"eval_samples_per_second": 6.208, |
|
"eval_steps_per_second": 0.391, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 32.91304347826087, |
|
"grad_norm": 1.0351324081420898, |
|
"learning_rate": 7.94799600379813e-05, |
|
"loss": 5.3272, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 33.55434782608695, |
|
"grad_norm": 1.089340329170227, |
|
"learning_rate": 7.763841854293145e-05, |
|
"loss": 5.2996, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 34.19565217391305, |
|
"grad_norm": 1.3611856698989868, |
|
"learning_rate": 7.574121695112954e-05, |
|
"loss": 5.1266, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 34.84782608695652, |
|
"grad_norm": 1.2501380443572998, |
|
"learning_rate": 7.379217597409688e-05, |
|
"loss": 5.0434, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 35.48913043478261, |
|
"grad_norm": 1.057522177696228, |
|
"learning_rate": 7.179522072097774e-05, |
|
"loss": 4.9784, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 35.48913043478261, |
|
"eval_accuracy": 0.0011830630140928385, |
|
"eval_loss": 9.981914520263672, |
|
"eval_runtime": 106.635, |
|
"eval_samples_per_second": 2.832, |
|
"eval_steps_per_second": 0.178, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 36.130434782608695, |
|
"grad_norm": 1.1768474578857422, |
|
"learning_rate": 6.975437279389181e-05, |
|
"loss": 4.9012, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 36.78260869565217, |
|
"grad_norm": 1.1783802509307861, |
|
"learning_rate": 6.767374218896286e-05, |
|
"loss": 4.823, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 37.42391304347826, |
|
"grad_norm": 1.220082402229309, |
|
"learning_rate": 6.555751901933342e-05, |
|
"loss": 4.7149, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 38.06521739130435, |
|
"grad_norm": 1.3078495264053345, |
|
"learning_rate": 6.340996507683458e-05, |
|
"loss": 4.6413, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 38.71739130434783, |
|
"grad_norm": 1.2146966457366943, |
|
"learning_rate": 6.123540524930442e-05, |
|
"loss": 4.6684, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 38.71739130434783, |
|
"eval_accuracy": 0.0008952909295837696, |
|
"eval_loss": 10.014237403869629, |
|
"eval_runtime": 31.4866, |
|
"eval_samples_per_second": 9.591, |
|
"eval_steps_per_second": 0.603, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 39.391304347826086, |
|
"grad_norm": 1.090649962425232, |
|
"learning_rate": 5.903821881083942e-05, |
|
"loss": 4.7794, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 40.03260869565217, |
|
"grad_norm": 1.2051453590393066, |
|
"learning_rate": 5.682283060251932e-05, |
|
"loss": 4.4631, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 40.68478260869565, |
|
"grad_norm": 1.0512608289718628, |
|
"learning_rate": 5.4593702121365955e-05, |
|
"loss": 4.4119, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 41.32608695652174, |
|
"grad_norm": 0.9912136793136597, |
|
"learning_rate": 5.235532253548213e-05, |
|
"loss": 4.3377, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 41.97826086956522, |
|
"grad_norm": 1.0219991207122803, |
|
"learning_rate": 5.0112199643464376e-05, |
|
"loss": 4.3184, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 41.97826086956522, |
|
"eval_accuracy": 0.001019192799302952, |
|
"eval_loss": 10.048251152038574, |
|
"eval_runtime": 18.0166, |
|
"eval_samples_per_second": 16.762, |
|
"eval_steps_per_second": 1.055, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 42.619565217391305, |
|
"grad_norm": 0.9927310347557068, |
|
"learning_rate": 4.7868850796296495e-05, |
|
"loss": 4.331, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 43.26086956521739, |
|
"grad_norm": 0.9491915106773376, |
|
"learning_rate": 4.5629793800005945e-05, |
|
"loss": 4.1006, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 43.91304347826087, |
|
"grad_norm": 0.8775396347045898, |
|
"learning_rate": 4.339953781740363e-05, |
|
"loss": 4.177, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 44.55434782608695, |
|
"grad_norm": 0.9140155911445618, |
|
"learning_rate": 4.1182574287230224e-05, |
|
"loss": 4.0757, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 45.19565217391305, |
|
"grad_norm": 0.9226499199867249, |
|
"learning_rate": 3.898336787899612e-05, |
|
"loss": 4.1251, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 45.19565217391305, |
|
"eval_accuracy": 0.0008153542394423617, |
|
"eval_loss": 10.09643840789795, |
|
"eval_runtime": 21.8568, |
|
"eval_samples_per_second": 13.817, |
|
"eval_steps_per_second": 0.869, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 45.84782608695652, |
|
"grad_norm": 0.9727521538734436, |
|
"learning_rate": 3.680634750173137e-05, |
|
"loss": 4.0495, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 46.48913043478261, |
|
"grad_norm": 0.8741424083709717, |
|
"learning_rate": 3.4655897384752146e-05, |
|
"loss": 4.0195, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 47.130434782608695, |
|
"grad_norm": 0.9479995965957642, |
|
"learning_rate": 3.2536348248406534e-05, |
|
"loss": 3.9482, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 47.78260869565217, |
|
"grad_norm": 0.9046297073364258, |
|
"learning_rate": 3.0451968582579915e-05, |
|
"loss": 3.9166, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 48.42391304347826, |
|
"grad_norm": 0.861109733581543, |
|
"learning_rate": 2.840695605052458e-05, |
|
"loss": 3.909, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 48.42391304347826, |
|
"eval_accuracy": 0.0008992877640908401, |
|
"eval_loss": 10.132174491882324, |
|
"eval_runtime": 28.1979, |
|
"eval_samples_per_second": 10.71, |
|
"eval_steps_per_second": 0.674, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 49.06521739130435, |
|
"grad_norm": 0.8144867420196533, |
|
"learning_rate": 2.6405429035324403e-05, |
|
"loss": 3.9054, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 49.71739130434783, |
|
"grad_norm": 0.8467565178871155, |
|
"learning_rate": 2.4451418346019573e-05, |
|
"loss": 3.8543, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 50.358695652173914, |
|
"grad_norm": 0.8617934584617615, |
|
"learning_rate": 2.2548859100093407e-05, |
|
"loss": 3.8013, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 51.0, |
|
"grad_norm": 1.1794334650039673, |
|
"learning_rate": 2.0701582798669676e-05, |
|
"loss": 3.803, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 51.65217391304348, |
|
"grad_norm": 0.861342191696167, |
|
"learning_rate": 1.8913309610379015e-05, |
|
"loss": 3.7535, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 51.65217391304348, |
|
"eval_accuracy": 0.0008753067570484176, |
|
"eval_loss": 10.158663749694824, |
|
"eval_runtime": 31.7351, |
|
"eval_samples_per_second": 9.516, |
|
"eval_steps_per_second": 0.599, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 52.29347826086956, |
|
"grad_norm": 0.8501729369163513, |
|
"learning_rate": 1.7187640879434553e-05, |
|
"loss": 3.7589, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 52.94565217391305, |
|
"grad_norm": 0.8114346861839294, |
|
"learning_rate": 1.552805187300389e-05, |
|
"loss": 3.8172, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 53.58695652173913, |
|
"grad_norm": 0.8227590918540955, |
|
"learning_rate": 1.3937884782483484e-05, |
|
"loss": 3.6933, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 54.22826086956522, |
|
"grad_norm": 0.7768607139587402, |
|
"learning_rate": 1.242034199277008e-05, |
|
"loss": 3.8079, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 54.880434782608695, |
|
"grad_norm": 0.8110019564628601, |
|
"learning_rate": 1.097847963308351e-05, |
|
"loss": 3.681, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 54.880434782608695, |
|
"eval_accuracy": 0.0008273447429635729, |
|
"eval_loss": 10.178533554077148, |
|
"eval_runtime": 31.6166, |
|
"eval_samples_per_second": 9.552, |
|
"eval_steps_per_second": 0.601, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 55.52173913043478, |
|
"grad_norm": 0.8320772647857666, |
|
"learning_rate": 9.615201422329406e-06, |
|
"loss": 3.6494, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 56.16304347826087, |
|
"grad_norm": 0.7713989019393921, |
|
"learning_rate": 8.333252821395526e-06, |
|
"loss": 3.7021, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 56.81521739130435, |
|
"grad_norm": 0.7743974924087524, |
|
"learning_rate": 7.135215504159115e-06, |
|
"loss": 3.7404, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 57.45652173913044, |
|
"grad_norm": 0.7438375353813171, |
|
"learning_rate": 6.023502158339078e-06, |
|
"loss": 3.6467, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 58.09782608695652, |
|
"grad_norm": 0.8585782051086426, |
|
"learning_rate": 5.000351626664207e-06, |
|
"loss": 3.688, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 58.09782608695652, |
|
"eval_accuracy": 0.0008273447429635729, |
|
"eval_loss": 10.187094688415527, |
|
"eval_runtime": 31.7031, |
|
"eval_samples_per_second": 9.526, |
|
"eval_steps_per_second": 0.599, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 58.75, |
|
"grad_norm": 0.7883967161178589, |
|
"learning_rate": 4.067824398141701e-06, |
|
"loss": 3.6471, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 59.391304347826086, |
|
"grad_norm": 0.7768418192863464, |
|
"learning_rate": 3.2277984585066366e-06, |
|
"loss": 3.6824, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 60.03260869565217, |
|
"grad_norm": 0.7814875245094299, |
|
"learning_rate": 2.4819655082085835e-06, |
|
"loss": 3.6767, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 60.68478260869565, |
|
"grad_norm": 0.8055542707443237, |
|
"learning_rate": 1.8318275555520237e-06, |
|
"loss": 3.5899, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 61.32608695652174, |
|
"grad_norm": 0.8155117630958557, |
|
"learning_rate": 1.2786938918515568e-06, |
|
"loss": 3.6685, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 61.32608695652174, |
|
"eval_accuracy": 0.0008233479084565024, |
|
"eval_loss": 10.19116497039795, |
|
"eval_runtime": 36.7079, |
|
"eval_samples_per_second": 8.227, |
|
"eval_steps_per_second": 0.518, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 61.97826086956522, |
|
"grad_norm": 0.8085272312164307, |
|
"learning_rate": 8.236784546933718e-07, |
|
"loss": 3.697, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 62.619565217391305, |
|
"grad_norm": 0.7857375741004944, |
|
"learning_rate": 4.676975846132692e-07, |
|
"loss": 3.6836, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 63.26086956521739, |
|
"grad_norm": 0.7917608618736267, |
|
"learning_rate": 2.1146817970871258e-07, |
|
"loss": 3.6163, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 63.91304347826087, |
|
"grad_norm": 0.7354781031608582, |
|
"learning_rate": 5.550625190150483e-08, |
|
"loss": 3.6652, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 64.55434782608695, |
|
"grad_norm": 0.7902089357376099, |
|
"learning_rate": 1.2588775841204658e-10, |
|
"loss": 3.6326, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 64.55434782608695, |
|
"eval_accuracy": 0.0008273447429635729, |
|
"eval_loss": 10.191176414489746, |
|
"eval_runtime": 37.3382, |
|
"eval_samples_per_second": 8.088, |
|
"eval_steps_per_second": 0.509, |
|
"step": 2000 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 2000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 65, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2610102966336000.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|