|
[
  { "loss": 13.2026, "grad_norm": 6.155358791351318, "learning_rate": 1.739130434782609e-05, "epoch": 0.034782608695652174, "step": 20 },
  { "loss": 13.1252, "grad_norm": 5.816741943359375, "learning_rate": 3.478260869565218e-05, "epoch": 0.06956521739130435, "step": 40 },
  { "loss": 13.0001, "grad_norm": 5.273156642913818, "learning_rate": 5.2173913043478256e-05, "epoch": 0.10434782608695652, "step": 60 },
  { "loss": 12.8639, "grad_norm": 4.86655330657959, "learning_rate": 6.956521739130436e-05, "epoch": 0.1391304347826087, "step": 80 },
  { "loss": 12.7376, "grad_norm": 4.438321113586426, "learning_rate": 8.695652173913044e-05, "epoch": 0.17391304347826086, "step": 100 },
  { "loss": 12.5722, "grad_norm": 4.164404392242432, "learning_rate": 0.00010434782608695651, "epoch": 0.20869565217391303, "step": 120 },
  { "loss": 12.4229, "grad_norm": 3.858990430831909, "learning_rate": 0.00012173913043478261, "epoch": 0.24347826086956523, "step": 140 },
  { "loss": 12.2581, "grad_norm": 3.6574394702911377, "learning_rate": 0.0001391304347826087, "epoch": 0.2782608695652174, "step": 160 },
  { "loss": 12.0753, "grad_norm": 3.3787951469421387, "learning_rate": 0.0001565217391304348, "epoch": 0.3130434782608696, "step": 180 },
  { "loss": 11.9261, "grad_norm": 3.323820114135742, "learning_rate": 0.00017391304347826088, "epoch": 0.34782608695652173, "step": 200 },
  { "loss": 11.7417, "grad_norm": 3.247619152069092, "learning_rate": 0.00019130434782608697, "epoch": 0.3826086956521739, "step": 220 },
  { "loss": 11.5771, "grad_norm": 3.2254152297973633, "learning_rate": 0.00020869565217391303, "epoch": 0.41739130434782606, "step": 240 },
  { "loss": 11.3969, "grad_norm": 3.1803464889526367, "learning_rate": 0.00022608695652173914, "epoch": 0.45217391304347826, "step": 260 },
  { "loss": 11.2684, "grad_norm": 3.41034197807312, "learning_rate": 0.00024347826086956522, "epoch": 0.48695652173913045, "step": 280 },
  { "loss": 11.0744, "grad_norm": 3.246403217315674, "learning_rate": 0.0002608695652173913, "epoch": 0.5217391304347826, "step": 300 },
  { "loss": 10.8929, "grad_norm": 3.202021360397339, "learning_rate": 0.0002782608695652174, "epoch": 0.5565217391304348, "step": 320 },
  { "loss": 10.7468, "grad_norm": 3.1231367588043213, "learning_rate": 0.0002956521739130435, "epoch": 0.591304347826087, "step": 340 },
  { "loss": 10.606, "grad_norm": 3.1820390224456787, "learning_rate": 0.0003130434782608696, "epoch": 0.6260869565217392, "step": 360 },
  { "loss": 10.4871, "grad_norm": 3.2470555305480957, "learning_rate": 0.0003304347826086956, "epoch": 0.6608695652173913, "step": 380 },
  { "loss": 10.2836, "grad_norm": 3.2452709674835205, "learning_rate": 0.00034782608695652176, "epoch": 0.6956521739130435, "step": 400 },
  { "loss": 10.1154, "grad_norm": 3.203894853591919, "learning_rate": 0.00036521739130434785, "epoch": 0.7304347826086957, "step": 420 },
  { "loss": 9.9283, "grad_norm": 3.269970178604126, "learning_rate": 0.00038260869565217393, "epoch": 0.7652173913043478, "step": 440 },
  { "loss": 9.8674, "grad_norm": 3.261357545852661, "learning_rate": 0.0004, "epoch": 0.8, "step": 460 },
  { "loss": 9.6224, "grad_norm": 3.393953323364258, "learning_rate": 0.00041739130434782605, "epoch": 0.8347826086956521, "step": 480 },
  { "loss": 9.524, "grad_norm": 3.321411609649658, "learning_rate": 0.0004347826086956522, "epoch": 0.8695652173913043, "step": 500 },
  { "loss": 9.384, "grad_norm": 3.3886823654174805, "learning_rate": 0.0004521739130434783, "epoch": 0.9043478260869565, "step": 520 },
  { "loss": 9.1767, "grad_norm": 3.4735491275787354, "learning_rate": 0.00046956521739130436, "epoch": 0.9391304347826087, "step": 540 },
  { "loss": 9.047, "grad_norm": 3.416966676712036, "learning_rate": 0.00048695652173913045, "epoch": 0.9739130434782609, "step": 560 },
  { "eval_loss": 8.366157531738281, "eval_accuracy": 0.43039677202420984, "eval_runtime": 42.3364, "eval_samples_per_second": 35.123, "eval_steps_per_second": 35.123, "epoch": 1.0, "step": 575 },
  { "loss": 8.8835, "grad_norm": 3.446899890899658, "learning_rate": 0.0004995169082125604, "epoch": 1.008695652173913, "step": 580 },
  { "loss": 8.6436, "grad_norm": 3.5842247009277344, "learning_rate": 0.0004975845410628019, "epoch": 1.0434782608695652, "step": 600 },
  { "loss": 8.4775, "grad_norm": 3.5029306411743164, "learning_rate": 0.0004956521739130435, "epoch": 1.0782608695652174, "step": 620 },
  { "loss": 8.322, "grad_norm": 3.5451033115386963, "learning_rate": 0.0004937198067632851, "epoch": 1.1130434782608696, "step": 640 },
  { "loss": 8.1264, "grad_norm": 3.5502634048461914, "learning_rate": 0.0004917874396135266, "epoch": 1.1478260869565218, "step": 660 },
  { "loss": 7.9905, "grad_norm": 3.607395648956299, "learning_rate": 0.0004898550724637681, "epoch": 1.182608695652174, "step": 680 },
  { "loss": 7.8252, "grad_norm": 3.6438565254211426, "learning_rate": 0.0004879227053140097, "epoch": 1.2173913043478262, "step": 700 },
  { "loss": 7.7737, "grad_norm": 3.656705141067505, "learning_rate": 0.0004859903381642512, "epoch": 1.2521739130434781, "step": 720 },
  { "loss": 7.5822, "grad_norm": 3.7424328327178955, "learning_rate": 0.0004840579710144928, "epoch": 1.2869565217391306, "step": 740 },
  { "loss": 7.4563, "grad_norm": 3.673156261444092, "learning_rate": 0.0004821256038647343, "epoch": 1.3217391304347825, "step": 760 },
  { "loss": 7.3379, "grad_norm": 3.6774067878723145, "learning_rate": 0.0004801932367149758, "epoch": 1.3565217391304347, "step": 780 },
  { "loss": 7.1559, "grad_norm": 3.811283826828003, "learning_rate": 0.0004782608695652174, "epoch": 1.391304347826087, "step": 800 },
  { "loss": 7.0834, "grad_norm": 3.7899839878082275, "learning_rate": 0.00047632850241545894, "epoch": 1.4260869565217391, "step": 820 },
  { "loss": 6.9172, "grad_norm": 3.583247423171997, "learning_rate": 0.00047439613526570047, "epoch": 1.4608695652173913, "step": 840 },
  { "loss": 6.7251, "grad_norm": 3.8192331790924072, "learning_rate": 0.00047246376811594206, "epoch": 1.4956521739130435, "step": 860 },
  { "loss": 6.7871, "grad_norm": 3.8098299503326416, "learning_rate": 0.0004705314009661836, "epoch": 1.5304347826086957, "step": 880 },
  { "loss": 6.6103, "grad_norm": 3.7341325283050537, "learning_rate": 0.0004685990338164252, "epoch": 1.5652173913043477, "step": 900 },
  { "loss": 6.4507, "grad_norm": 3.9190495014190674, "learning_rate": 0.00046666666666666666, "epoch": 1.6, "step": 920 },
  { "loss": 6.3619, "grad_norm": 3.9456422328948975, "learning_rate": 0.0004647342995169082, "epoch": 1.634782608695652, "step": 940 },
  { "loss": 6.2957, "grad_norm": 3.899134874343872, "learning_rate": 0.0004628019323671498, "epoch": 1.6695652173913045, "step": 960 },
  { "loss": 6.1362, "grad_norm": 3.878810167312622, "learning_rate": 0.0004608695652173913, "epoch": 1.7043478260869565, "step": 980 },
  { "loss": 5.9814, "grad_norm": 3.9270784854888916, "learning_rate": 0.00045893719806763285, "epoch": 1.7391304347826086, "step": 1000 },
  { "loss": 5.9095, "grad_norm": 3.8247644901275635, "learning_rate": 0.00045700483091787444, "epoch": 1.7739130434782608, "step": 1020 },
  { "loss": 5.7793, "grad_norm": 3.8870134353637695, "learning_rate": 0.000455072463768116, "epoch": 1.808695652173913, "step": 1040 },
  { "loss": 5.7754, "grad_norm": 3.9533441066741943, "learning_rate": 0.00045314009661835745, "epoch": 1.8434782608695652, "step": 1060 },
  { "loss": 5.5886, "grad_norm": 3.9928998947143555, "learning_rate": 0.00045120772946859904, "epoch": 1.8782608695652174, "step": 1080 },
  { "loss": 5.5482, "grad_norm": 4.030064582824707, "learning_rate": 0.0004492753623188406, "epoch": 1.9130434782608696, "step": 1100 },
  { "loss": 5.4807, "grad_norm": 3.961806297302246, "learning_rate": 0.0004473429951690821, "epoch": 1.9478260869565216, "step": 1120 },
  { "loss": 5.3508, "grad_norm": 4.003119945526123, "learning_rate": 0.0004454106280193237, "epoch": 1.982608695652174, "step": 1140 },
  { "eval_loss": 4.025164604187012, "eval_accuracy": 0.8190988567585743, "eval_runtime": 42.7144, "eval_samples_per_second": 34.813, "eval_steps_per_second": 34.813, "epoch": 2.0, "step": 1150 },
  { "loss": 5.1229, "grad_norm": 3.958116292953491, "learning_rate": 0.00044347826086956523, "epoch": 2.017391304347826, "step": 1160 },
  { "loss": 4.8146, "grad_norm": 3.864279270172119, "learning_rate": 0.00044154589371980677, "epoch": 2.0521739130434784, "step": 1180 },
  { "loss": 4.8843, "grad_norm": 4.045077323913574, "learning_rate": 0.0004396135265700483, "epoch": 2.0869565217391304, "step": 1200 },
  { "loss": 4.8078, "grad_norm": 4.061978816986084, "learning_rate": 0.00043768115942028983, "epoch": 2.121739130434783, "step": 1220 },
  { "loss": 4.6812, "grad_norm": 4.040159225463867, "learning_rate": 0.0004357487922705314, "epoch": 2.1565217391304348, "step": 1240 },
  { "loss": 4.6701, "grad_norm": 4.234623908996582, "learning_rate": 0.00043381642512077296, "epoch": 2.1913043478260867, "step": 1260 },
  { "loss": 4.6221, "grad_norm": 4.030038356781006, "learning_rate": 0.0004318840579710145, "epoch": 2.226086956521739, "step": 1280 },
  { "loss": 4.5647, "grad_norm": 3.9954497814178467, "learning_rate": 0.0004299516908212561, "epoch": 2.260869565217391, "step": 1300 },
  { "loss": 4.4502, "grad_norm": 4.188636779785156, "learning_rate": 0.0004280193236714976, "epoch": 2.2956521739130435, "step": 1320 },
  { "loss": 4.359, "grad_norm": 4.185456275939941, "learning_rate": 0.00042608695652173915, "epoch": 2.3304347826086955, "step": 1340 },
  { "loss": 4.2863, "grad_norm": 4.123263359069824, "learning_rate": 0.0004241545893719807, "epoch": 2.365217391304348, "step": 1360 },
  { "loss": 4.3354, "grad_norm": 4.194387435913086, "learning_rate": 0.0004222222222222222, "epoch": 2.4, "step": 1380 },
  { "loss": 4.2176, "grad_norm": 4.065763473510742, "learning_rate": 0.00042028985507246375, "epoch": 2.4347826086956523, "step": 1400 },
  { "loss": 4.0597, "grad_norm": 4.120363712310791, "learning_rate": 0.00041835748792270534, "epoch": 2.4695652173913043, "step": 1420 },
  { "loss": 4.028, "grad_norm": 4.3197174072265625, "learning_rate": 0.00041642512077294687, "epoch": 2.5043478260869563, "step": 1440 },
  { "loss": 3.9833, "grad_norm": 4.2683610916137695, "learning_rate": 0.0004144927536231884, "epoch": 2.5391304347826087, "step": 1460 },
  { "loss": 4.0065, "grad_norm": 4.15448522567749, "learning_rate": 0.00041256038647343, "epoch": 2.573913043478261, "step": 1480 },
  { "loss": 3.8134, "grad_norm": 4.348177433013916, "learning_rate": 0.0004106280193236715, "epoch": 2.608695652173913, "step": 1500 },
  { "loss": 3.8548, "grad_norm": 4.100021839141846, "learning_rate": 0.00040869565217391306, "epoch": 2.643478260869565, "step": 1520 },
  { "loss": 3.7814, "grad_norm": 4.344174385070801, "learning_rate": 0.0004067632850241546, "epoch": 2.6782608695652175, "step": 1540 },
  { "loss": 3.7578, "grad_norm": 4.240079402923584, "learning_rate": 0.00040483091787439613, "epoch": 2.7130434782608694, "step": 1560 },
  { "loss": 3.7331, "grad_norm": 4.468689918518066, "learning_rate": 0.0004028985507246377, "epoch": 2.747826086956522, "step": 1580 },
  { "loss": 3.6396, "grad_norm": 4.28464937210083, "learning_rate": 0.00040096618357487925, "epoch": 2.782608695652174, "step": 1600 },
  { "loss": 3.5799, "grad_norm": 4.166805744171143, "learning_rate": 0.0003990338164251208, "epoch": 2.8173913043478263, "step": 1620 },
  { "loss": 3.4734, "grad_norm": 4.237683296203613, "learning_rate": 0.0003971014492753624, "epoch": 2.8521739130434782, "step": 1640 },
  { "loss": 3.5183, "grad_norm": 4.153097152709961, "learning_rate": 0.00039516908212560385, "epoch": 2.8869565217391306, "step": 1660 },
  { "loss": 3.3963, "grad_norm": 4.2313947677612305, "learning_rate": 0.0003932367149758454, "epoch": 2.9217391304347826, "step": 1680 },
  { "loss": 3.3081, "grad_norm": 3.992475748062134, "learning_rate": 0.000391304347826087, "epoch": 2.9565217391304346, "step": 1700 },
  { "loss": 3.3124, "grad_norm": 4.4731059074401855, "learning_rate": 0.0003893719806763285, "epoch": 2.991304347826087, "step": 1720 },
  { "eval_loss": 2.1082653999328613, "eval_accuracy": 0.9260255548083389, "eval_runtime": 22.1676, "eval_samples_per_second": 67.08, "eval_steps_per_second": 67.08, "epoch": 3.0, "step": 1725 },
  { "loss": 3.1247, "grad_norm": 4.272000312805176, "learning_rate": 0.00038743961352657004, "epoch": 3.026086956521739, "step": 1740 },
  { "loss": 3.1064, "grad_norm": 4.102330207824707, "learning_rate": 0.00038550724637681163, "epoch": 3.0608695652173914, "step": 1760 },
  { "loss": 2.9371, "grad_norm": 4.381846904754639, "learning_rate": 0.00038357487922705317, "epoch": 3.0956521739130434, "step": 1780 },
  { "loss": 2.9355, "grad_norm": 4.1588921546936035, "learning_rate": 0.00038164251207729465, "epoch": 3.130434782608696, "step": 1800 },
  { "loss": 2.8545, "grad_norm": 4.279609203338623, "learning_rate": 0.00037971014492753623, "epoch": 3.1652173913043478, "step": 1820 },
  { "loss": 2.8096, "grad_norm": 4.240756988525391, "learning_rate": 0.00037777777777777777, "epoch": 3.2, "step": 1840 },
  { "loss": 2.8138, "grad_norm": 4.11091947555542, "learning_rate": 0.00037584541062801936, "epoch": 3.234782608695652, "step": 1860 },
  { "loss": 2.7417, "grad_norm": 4.078794479370117, "learning_rate": 0.0003739130434782609, "epoch": 3.269565217391304, "step": 1880 },
  { "loss": 2.7937, "grad_norm": 4.368116855621338, "learning_rate": 0.0003719806763285024, "epoch": 3.3043478260869565, "step": 1900 },
  { "loss": 2.7361, "grad_norm": 4.044319152832031, "learning_rate": 0.000370048309178744, "epoch": 3.3391304347826085, "step": 1920 },
  { "loss": 2.7054, "grad_norm": 4.314040184020996, "learning_rate": 0.0003681159420289855, "epoch": 3.373913043478261, "step": 1940 },
  { "loss": 2.6682, "grad_norm": 4.185855388641357, "learning_rate": 0.000366183574879227, "epoch": 3.408695652173913, "step": 1960 },
  { "loss": 2.6644, "grad_norm": 4.433622360229492, "learning_rate": 0.0003642512077294686, "epoch": 3.4434782608695653, "step": 1980 },
  { "loss": 2.618, "grad_norm": 4.048947811126709, "learning_rate": 0.00036231884057971015, "epoch": 3.4782608695652173, "step": 2000 },
  { "loss": 2.5982, "grad_norm": 4.145406246185303, "learning_rate": 0.0003603864734299517, "epoch": 3.5130434782608697, "step": 2020 },
  { "loss": 2.6138, "grad_norm": 4.2812910079956055, "learning_rate": 0.00035845410628019327, "epoch": 3.5478260869565217, "step": 2040 },
  { "loss": 2.5039, "grad_norm": 4.400162220001221, "learning_rate": 0.0003565217391304348, "epoch": 3.5826086956521737, "step": 2060 },
  { "loss": 2.5249, "grad_norm": 4.217800617218018, "learning_rate": 0.0003545893719806763, "epoch": 3.617391304347826, "step": 2080 },
  { "loss": 2.4547, "grad_norm": 4.076215744018555, "learning_rate": 0.0003526570048309179, "epoch": 3.6521739130434785, "step": 2100 },
  { "loss": 2.4315, "grad_norm": 4.139514446258545, "learning_rate": 0.0003507246376811594, "epoch": 3.6869565217391305, "step": 2120 },
  { "loss": 2.3836, "grad_norm": 4.118022918701172, "learning_rate": 0.00034879227053140094, "epoch": 3.7217391304347824, "step": 2140 },
  { "loss": 2.3284, "grad_norm": 4.137601852416992, "learning_rate": 0.00034685990338164253, "epoch": 3.756521739130435, "step": 2160 },
  { "loss": 2.3095, "grad_norm": 4.023979663848877, "learning_rate": 0.00034492753623188406, "epoch": 3.791304347826087, "step": 2180 },
  { "loss": 2.305, "grad_norm": 4.042725086212158, "learning_rate": 0.00034299516908212565, "epoch": 3.8260869565217392, "step": 2200 },
  { "loss": 2.3237, "grad_norm": 4.265875339508057, "learning_rate": 0.0003410628019323672, "epoch": 3.860869565217391, "step": 2220 },
  { "loss": 2.335, "grad_norm": 4.205041408538818, "learning_rate": 0.00033913043478260867, "epoch": 3.8956521739130436, "step": 2240 },
  { "loss": 2.2341, "grad_norm": 4.1344709396362305, "learning_rate": 0.00033719806763285025, "epoch": 3.9304347826086956, "step": 2260 },
  { "loss": 2.251, "grad_norm": 4.247790813446045, "learning_rate": 0.0003352657004830918, "epoch": 3.965217391304348, "step": 2280 },
  { "loss": 2.3212, "grad_norm": 4.859626770019531, "learning_rate": 0.0003333333333333333, "epoch": 4.0, "step": 2300 },
  { "eval_loss": 1.2223739624023438, "eval_accuracy": 0.9435104236718225, "eval_runtime": 14.8513, "eval_samples_per_second": 100.126, "eval_steps_per_second": 100.126, "epoch": 4.0, "step": 2300 },
  { "loss": 1.9133, "grad_norm": 4.098020553588867, "learning_rate": 0.0003314009661835749, "epoch": 4.034782608695652, "step": 2320 },
  { "loss": 1.9814, "grad_norm": 4.198029041290283, "learning_rate": 0.00032946859903381644, "epoch": 4.069565217391304, "step": 2340 },
  { "loss": 1.9505, "grad_norm": 3.960844039916992, "learning_rate": 0.000327536231884058, "epoch": 4.104347826086957, "step": 2360 },
  { "loss": 1.8815, "grad_norm": 4.0190300941467285, "learning_rate": 0.0003256038647342995, "epoch": 4.139130434782609, "step": 2380 },
  { "loss": 1.8365, "grad_norm": 4.040708541870117, "learning_rate": 0.00032367149758454105, "epoch": 4.173913043478261, "step": 2400 },
  { "loss": 1.84, "grad_norm": 4.077364444732666, "learning_rate": 0.0003217391304347826, "epoch": 4.208695652173913, "step": 2420 },
  { "loss": 1.8864, "grad_norm": 4.267309188842773, "learning_rate": 0.0003199033816425121, "epoch": 4.243478260869566, "step": 2440 },
  { "loss": 1.9015, "grad_norm": 3.978663921356201, "learning_rate": 0.00031797101449275363, "epoch": 4.278260869565218, "step": 2460 },
  { "loss": 1.8388, "grad_norm": 4.089256763458252, "learning_rate": 0.0003160386473429952, "epoch": 4.3130434782608695, "step": 2480 },
  { "loss": 1.7845, "grad_norm": 3.9317057132720947, "learning_rate": 0.0003141062801932367, "epoch": 4.3478260869565215, "step": 2500 },
  { "loss": 1.7725, "grad_norm": 3.9738080501556396, "learning_rate": 0.00031217391304347823, "epoch": 4.3826086956521735, "step": 2520 },
  { "loss": 1.852, "grad_norm": 4.232215881347656, "learning_rate": 0.0003102415458937198, "epoch": 4.417391304347826, "step": 2540 },
  { "loss": 1.8234, "grad_norm": 4.050131797790527, "learning_rate": 0.00030830917874396136, "epoch": 4.452173913043478, "step": 2560 },
  { "loss": 1.8148, "grad_norm": 4.217935085296631, "learning_rate": 0.0003063768115942029, "epoch": 4.48695652173913, "step": 2580 },
  { "loss": 1.7134, "grad_norm": 3.9807074069976807, "learning_rate": 0.0003044444444444445, "epoch": 4.521739130434782, "step": 2600 },
  { "loss": 1.6752, "grad_norm": 4.05940580368042, "learning_rate": 0.000302512077294686, "epoch": 4.556521739130435, "step": 2620 },
  { "loss": 1.8413, "grad_norm": 4.454566955566406, "learning_rate": 0.00030057971014492755, "epoch": 4.591304347826087, "step": 2640 },
  { "loss": 1.7948, "grad_norm": 4.144088268280029, "learning_rate": 0.0002986473429951691, "epoch": 4.626086956521739, "step": 2660 },
  { "loss": 1.7468, "grad_norm": 3.940176010131836, "learning_rate": 0.0002967149758454106, "epoch": 4.660869565217391, "step": 2680 },
  { "loss": 1.709, "grad_norm": 4.198675632476807, "learning_rate": 0.0002948792270531401, "epoch": 4.695652173913043, "step": 2700 },
  { "loss": 1.6506, "grad_norm": 3.976001501083374, "learning_rate": 0.00029294685990338167, "epoch": 4.730434782608696, "step": 2720 },
  { "loss": 1.7042, "grad_norm": 4.033059120178223, "learning_rate": 0.0002910144927536232, "epoch": 4.765217391304348, "step": 2740 },
  { "loss": 1.6795, "grad_norm": 4.062041759490967, "learning_rate": 0.0002890821256038648, "epoch": 4.8, "step": 2760 },
  { "loss": 1.7029, "grad_norm": 3.988589286804199, "learning_rate": 0.00028714975845410627, "epoch": 4.834782608695652, "step": 2780 },
  { "loss": 1.6641, "grad_norm": 4.16325044631958, "learning_rate": 0.0002852173913043478, "epoch": 4.869565217391305, "step": 2800 },
  { "loss": 1.6953, "grad_norm": 4.323537349700928, "learning_rate": 0.0002832850241545894, "epoch": 4.904347826086957, "step": 2820 },
  { "loss": 1.5863, "grad_norm": 3.8293144702911377, "learning_rate": 0.0002813526570048309, "epoch": 4.939130434782609, "step": 2840 },
  { "loss": 1.6276, "grad_norm": 3.8955535888671875, "learning_rate": 0.00027942028985507246, "epoch": 4.973913043478261, "step": 2860 },
  { "eval_loss": 0.8229038715362549, "eval_accuracy": 0.9677202420981843, "eval_runtime": 88.6744, "eval_samples_per_second": 16.769, "eval_steps_per_second": 16.769, "epoch": 5.0, "step": 2875 },
  { "loss": 1.5701, "grad_norm": 3.8480091094970703, "learning_rate": 0.00027748792270531405, "epoch": 5.008695652173913, "step": 2880 },
  { "loss": 1.3786, "grad_norm": 3.679872512817383, "learning_rate": 0.0002755555555555556, "epoch": 5.043478260869565, "step": 2900 },
  { "loss": 1.3563, "grad_norm": 4.13381290435791, "learning_rate": 0.00027362318840579706, "epoch": 5.078260869565217, "step": 2920 },
  { "loss": 1.3588, "grad_norm": 3.7467329502105713, "learning_rate": 0.00027169082125603865, "epoch": 5.113043478260869, "step": 2940 },
  { "loss": 1.3782, "grad_norm": 3.5837419033050537, "learning_rate": 0.0002698550724637681, "epoch": 5.147826086956521, "step": 2960 },
  { "loss": 1.3969, "grad_norm": 4.077097415924072, "learning_rate": 0.00026792270531400964, "epoch": 5.182608695652174, "step": 2980 },
  { "loss": 1.3346, "grad_norm": 3.5995211601257324, "learning_rate": 0.00026599033816425123, "epoch": 5.217391304347826, "step": 3000 },
  { "loss": 1.3772, "grad_norm": 3.714010000228882, "learning_rate": 0.00026405797101449277, "epoch": 5.252173913043478, "step": 3020 },
  { "loss": 1.3452, "grad_norm": 3.807094097137451, "learning_rate": 0.00026231884057971016, "epoch": 5.28695652173913, "step": 3040 },
  { "loss": 1.3161, "grad_norm": 4.012477397918701, "learning_rate": 0.0002603864734299517, "epoch": 5.321739130434783, "step": 3060 },
  { "loss": 1.3146, "grad_norm": 3.850520372390747, "learning_rate": 0.0002584541062801932, "epoch": 5.356521739130435, "step": 3080 },
  { "loss": 1.3057, "grad_norm": NaN, "learning_rate": 0.00025661835748792274, "epoch": 5.391304347826087, "step": 3100 },
  { "loss": 1.2619, "grad_norm": 3.697744607925415, "learning_rate": 0.0002546859903381643, "epoch": 5.426086956521739, "step": 3120 },
  { "loss": 1.3436, "grad_norm": 4.125018119812012, "learning_rate": 0.00025275362318840576, "epoch": 5.460869565217392, "step": 3140 },
  { "loss": 1.3289, "grad_norm": 4.1491899490356445, "learning_rate": 0.00025082125603864735, "epoch": 5.495652173913044, "step": 3160 },
  { "loss": 1.218, "grad_norm": 3.9294846057891846, "learning_rate": 0.0002488888888888889, "epoch": 5.530434782608696, "step": 3180 },
  { "loss": 1.3219, "grad_norm": 3.9030706882476807, "learning_rate": 0.00024695652173913047, "epoch": 5.565217391304348, "step": 3200 },
  { "loss": 1.2694, "grad_norm": 4.124849319458008, "learning_rate": 0.000245024154589372, "epoch": 5.6, "step": 3220 },
  { "loss": 1.2379, "grad_norm": 4.1668500900268555, "learning_rate": 0.0002432850241545894, "epoch": 5.6347826086956525, "step": 3240 },
  { "loss": 1.2892, "grad_norm": 4.098198890686035, "learning_rate": 0.00024135265700483093, "epoch": 5.6695652173913045, "step": 3260 },
  { "loss": 1.2742, "grad_norm": 3.690241813659668, "learning_rate": 0.00023942028985507246, "epoch": 5.7043478260869565, "step": 3280 },
  { "loss": 1.1755, "grad_norm": 3.978963613510132, "learning_rate": 0.00023748792270531402, "epoch": 5.739130434782608, "step": 3300 },
  { "loss": 1.2256, "grad_norm": 3.7397215366363525, "learning_rate": 0.00023574879227053139, "epoch": 5.773913043478261, "step": 3320 },
  { "loss": 1.238, "grad_norm": 3.9201064109802246, "learning_rate": 0.00023391304347826088, "epoch": 5.808695652173913, "step": 3340 },
  { "loss": 1.1706, "grad_norm": 3.725389242172241, "learning_rate": 0.0002319806763285024, "epoch": 5.843478260869565, "step": 3360 },
  { "loss": 1.1644, "grad_norm": 3.5844123363494873, "learning_rate": 0.00023004830917874397, "epoch": 5.878260869565217, "step": 3380 },
  { "loss": 1.2256, "grad_norm": 3.79936146736145, "learning_rate": 0.00022821256038647343, "epoch": 5.913043478260869, "step": 3400 },
  { "loss": 1.2488, "grad_norm": 3.5947725772857666, "learning_rate": 0.00022628019323671497, "epoch": 5.947826086956522, "step": 3420 },
  { "loss": 1.1418, "grad_norm": NaN, "learning_rate": 0.00022444444444444446, "epoch": 5.982608695652174, "step": 3440 },
  { "eval_loss": 0.5840117335319519, "eval_accuracy": 0.9757901815736382, "eval_runtime": 97.2696, "eval_samples_per_second": 15.287, "eval_steps_per_second": 15.287, "epoch": 6.0, "step": 3450 },
  { "loss": 1.1254, "grad_norm": 3.5959298610687256, "learning_rate": 0.00022260869565217392, "epoch": 6.017391304347826, "step": 3460 },
  { "loss": 1.0343, "grad_norm": 3.9623775482177734, "learning_rate": 0.00022067632850241545, "epoch": 6.052173913043478, "step": 3480 },
  { "loss": 1.0348, "grad_norm": 3.735102415084839, "learning_rate": 0.00021874396135265702, "epoch": 6.086956521739131, "step": 3500 },
  { "loss": 0.9796, "grad_norm": 3.4255013465881348, "learning_rate": 0.00021681159420289855, "epoch": 6.121739130434783, "step": 3520 },
  { "loss": 0.9865, "grad_norm": 3.981841564178467, "learning_rate": 0.00021497584541062804, "epoch": 6.156521739130435, "step": 3540 },
  { "loss": 1.0054, "grad_norm": 3.9057116508483887, "learning_rate": 0.00021314009661835748, "epoch": 6.191304347826087, "step": 3560 },
  { "loss": 1.0012, "grad_norm": 3.626560688018799, "learning_rate": 0.00021120772946859904, "epoch": 6.226086956521739, "step": 3580 },
  { "loss": 1.0129, "grad_norm": 3.687683582305908, "learning_rate": 0.0002093719806763285, "epoch": 6.260869565217392, "step": 3600 },
  { "loss": 0.9333, "grad_norm": 3.8632826805114746, "learning_rate": 0.00020763285024154592, "epoch": 6.2956521739130435, "step": 3620 },
  { "loss": 1.0259, "grad_norm": 4.089422702789307, "learning_rate": 0.0002058937198067633, "epoch": 6.3304347826086955, "step": 3640 },
  { "loss": 1.0184, "grad_norm": 4.261268615722656, "learning_rate": 0.00020415458937198067, "epoch": 6.3652173913043475, "step": 3660 },
  { "loss": 1.0293, "grad_norm": 2.3901586532592773, "learning_rate": 0.0002026086956521739, "epoch": 6.4, "step": 3680 },
  { "loss": 1.0026, "grad_norm": 2.233633518218994, "learning_rate": 0.00020067632850241546, "epoch": 6.434782608695652, "step": 3700 },
  { "loss": 1.0426, "grad_norm": 2.049773693084717, "learning_rate": 0.00019893719806763285, "epoch": 6.469565217391304, "step": 3720 },
  { "loss": 1.0324, "grad_norm": 2.21939754486084, "learning_rate": 0.0001970048309178744, "epoch": 6.504347826086956, "step": 3740 },
  { "loss": 1.0666, "grad_norm": 2.2138895988464355, "learning_rate": 0.00019516908212560387, "epoch": 6.539130434782608, "step": 3760 },
  { "loss": 1.0724, "grad_norm": 1.9186855554580688, "learning_rate": 0.0001932367149758454, "epoch": 6.573913043478261, "step": 3780 },
  { "loss": 1.0867, "grad_norm": 1.302451729774475, "learning_rate": 0.00019159420289855073, "epoch": 6.608695652173913, "step": 3800 },
  { "loss": 1.0659, "grad_norm": 1.1770459413528442, "learning_rate": 0.00018975845410628022, "epoch": 6.643478260869565, "step": 3820 },
  { "loss": 1.0494, "grad_norm": 0.2651650309562683, "learning_rate": 0.0001881159420289855, "epoch": 6.678260869565217, "step": 3840 },
  { "loss": 1.0464, "grad_norm": 0.0, "learning_rate": 0.0001867632850241546, "epoch": 6.71304347826087, "step": 3860 },
  { "loss": 1.0457, "grad_norm": 0.0, "learning_rate": 0.000185024154589372, "epoch": 6.747826086956522, "step": 3880 },
  { "loss": 0.9815, "grad_norm": 0.0, "learning_rate": 0.00018328502415458937, "epoch": 6.782608695652174, "step": 3900 },
  { "loss": 1.0094, "grad_norm": 0.0, "learning_rate": 0.0001816425120772947, "epoch": 6.817391304347826, "step": 3920 },
  { "loss": 1.0023, "grad_norm": NaN, "learning_rate": 0.00018028985507246377, "epoch": 6.852173913043478, "step": 3940 },
  { "loss": 1.0278, "grad_norm": 0.0, "learning_rate": 0.00017893719806763288, "epoch": 6.886956521739131, "step": 3960 },
  { "loss": 1.0123, "grad_norm": 0.0, "learning_rate": 0.0001771014492753623, "epoch": 6.921739130434783, "step": 3980 },
  { "loss": 1.0774, "grad_norm": 0.0, "learning_rate": 0.00017565217391304346, "epoch": 6.956521739130435, "step": 4000 },
  { "loss": 1.0484, "grad_norm": 0.0, "learning_rate": 0.00017391304347826088, "epoch": 6.9913043478260875, "step": 4020 },
  { "eval_loss": 0.5780686736106873, "eval_accuracy": 0.9737726967047747, "eval_runtime": 118.8154, "eval_samples_per_second": 12.515, "eval_steps_per_second": 12.515, "epoch": 7.0, "step": 4025 },
  { "loss": 0.9799, "grad_norm": 0.0, "learning_rate": 0.0001723671497584541, "epoch": 7.026086956521739, "step": 4040 },
  { "loss": 0.9588, "grad_norm": 0.0, "learning_rate": 0.00017091787439613525, "epoch": 7.060869565217391, "step": 4060 },
  { "loss": 0.9421, "grad_norm": NaN, "learning_rate": 0.00016966183574879226, "epoch": 7.095652173913043, "step": 4080 },
  { "loss": 0.9551, "grad_norm": 0.0, "learning_rate": 0.00016782608695652175, "epoch": 7.130434782608695, "step": 4100 },
  { "loss": 0.9622, "grad_norm": 0.0, "learning_rate": 0.00016618357487922704, "epoch": 7.165217391304348, "step": 4120 },
  { "loss": 0.9712, "grad_norm": 0.0, "learning_rate": 0.00016444444444444446, "epoch": 7.2, "step": 4140 },
  { "loss": 0.9834, "grad_norm": 0.0, "learning_rate": 0.00016299516908212561, "epoch": 7.234782608695652, "step": 4160 },
  { "loss": 0.9968, "grad_norm": NaN, "learning_rate": 0.00016135265700483093, "epoch": 7.269565217391304, "step": 4180 },
  { "loss": 0.956, "grad_norm": 0.0, "learning_rate": 0.00015961352657004833, "epoch": 7.304347826086957, "step": 4200 },
  { "loss": 0.8981, "grad_norm": 0.0, "learning_rate": 0.00015806763285024155, "epoch": 7.339130434782609, "step": 4220 },
  { "loss": 0.9515, "grad_norm": 0.0, "learning_rate": 0.00015642512077294684, "epoch": 7.373913043478261, "step": 4240 },
  { "loss": 0.9535, "grad_norm": 0.0, "learning_rate": 0.0001548792270531401, "epoch": 7.408695652173913, "step": 4260 },
  { "loss": 0.9646, "grad_norm": NaN, "learning_rate": 0.00015333333333333334, "epoch": 7.443478260869565, "step": 4280 },
  { "loss": 0.9821, "grad_norm": 0.0, "learning_rate": 0.00015140096618357487, "epoch": 7.478260869565218, "step": 4300 },
  { "loss": 0.9259, "grad_norm": 0.0, "learning_rate": 0.00015014492753623188, "epoch": 7.51304347826087, "step": 4320 },
  { "loss": 0.9494, "grad_norm": 0.0, "learning_rate": 0.00014869565217391303, "epoch": 7.547826086956522, "step": 4340 },
  { "loss": 0.9305, "grad_norm": 0.0, "learning_rate": 0.00014714975845410628, "epoch": 7.582608695652174, "step": 4360 },
  { "loss": 0.8889, "grad_norm": 0.0, "learning_rate": 0.0001455072463768116, "epoch": 7.6173913043478265, "step": 4380 },
  { "loss": 0.9524, "grad_norm": 0.0, "learning_rate": 0.00014396135265700482, "epoch": 7.6521739130434785, "step": 4400 },
  { "loss": 0.9065, "grad_norm": 0.0, "learning_rate": 0.00014231884057971014, "epoch": 7.6869565217391305, "step": 4420 },
  { "loss": 0.9153, "grad_norm": 0.0, "learning_rate": 0.00014048309178743963, "epoch": 7.721739130434782, "step": 4440 },
  { "loss": 0.6675, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 7.756521739130434, "step": 4460 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 7.791304347826087, "step": 4480 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 7.826086956521739, "step": 4500 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 7.860869565217391, "step": 4520 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 7.895652173913043, "step": 4540 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 7.930434782608696, "step": 4560 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 7.965217391304348, "step": 4580 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.0, "step": 4600 },
  { "eval_loss": NaN, "eval_accuracy": 0.0006724949562878278, "eval_runtime": 129.6238, "eval_samples_per_second": 11.472, "eval_steps_per_second": 11.472, "epoch": 8.0, "step": 4600 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.034782608695652, "step": 4620 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.069565217391304, "step": 4640 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.104347826086956, "step": 4660 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.139130434782608, "step": 4680 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.173913043478262, "step": 4700 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.208695652173914, "step": 4720 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.243478260869566, "step": 4740 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.278260869565218, "step": 4760 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.31304347826087, "step": 4780 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.347826086956522, "step": 4800 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.382608695652173, "step": 4820 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.417391304347825, "step": 4840 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.452173913043477, "step": 4860 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.486956521739131, "step": 4880 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.521739130434783, "step": 4900 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.556521739130435, "step": 4920 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.591304347826087, "step": 4940 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.626086956521739, "step": 4960 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.660869565217391, "step": 4980 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.695652173913043, "step": 5000 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.730434782608695, "step": 5020 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.765217391304347, "step": 5040 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.8, "step": 5060 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.834782608695653, "step": 5080 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.869565217391305, "step": 5100 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.904347826086957, "step": 5120 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.939130434782609, "step": 5140 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.97391304347826, "step": 5160 },
  { "eval_loss": NaN, "eval_accuracy": 0.0006724949562878278, "eval_runtime": 117.1288, "eval_samples_per_second": 12.695, "eval_steps_per_second": 12.695, "epoch": 9.0, "step": 5175 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.008695652173913, "step": 5180 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.043478260869565, "step": 5200 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.078260869565218, "step": 5220 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.11304347826087, "step": 5240 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.147826086956522, "step": 5260 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.182608695652174, "step": 5280 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.217391304347826, "step": 5300 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.252173913043478, "step": 5320 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.28695652173913, "step": 5340 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.321739130434782, "step": 5360 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.356521739130434, "step": 5380 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.391304347826088, "step": 5400 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.42608695652174, "step": 5420 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.460869565217392, "step": 5440 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.495652173913044, "step": 5460 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.530434782608696, "step": 5480 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.565217391304348, "step": 5500 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.6, "step": 5520 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.634782608695652, "step": 5540 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.669565217391304, "step": 5560 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.704347826086957, "step": 5580 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.73913043478261, "step": 5600 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.773913043478261, "step": 5620 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.808695652173913, "step": 5640 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.843478260869565, "step": 5660 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.878260869565217, "step": 5680 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.91304347826087, "step": 5700 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.947826086956521, "step": 5720 },
  { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.982608695652173, "step": 5740 },
  { "eval_loss": NaN, "eval_accuracy": 0.0006724949562878278, "eval_runtime": 103.3199, "eval_samples_per_second": 14.392, "eval_steps_per_second": 14.392, "epoch": 10.0, "step": 5750 },
  { "train_runtime": 59857.6179, "train_samples_per_second": 24.584, "train_steps_per_second": 0.096, "total_flos": 2.7398100529152e+18, "train_loss": 2.9414075751926587, "epoch": 10.0, "step": 5750 }
]