{ "best_metric": 0.9716312056737588, "best_model_checkpoint": "./results/checkpoint-3807", "epoch": 70.0, "eval_steps": 500, "global_step": 5670, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 6.570446014404297, "learning_rate": 1.9728395061728395e-05, "loss": 2.6389, "step": 81 }, { "epoch": 1.0, "eval_accuracy": 0.475177304964539, "eval_loss": 0.7098350524902344, "eval_runtime": 0.3441, "eval_samples_per_second": 819.478, "eval_steps_per_second": 52.307, "step": 81 }, { "epoch": 2.0, "grad_norm": 4.623419284820557, "learning_rate": 1.944268077601411e-05, "loss": 0.6477, "step": 162 }, { "epoch": 2.0, "eval_accuracy": 0.48226950354609927, "eval_loss": 0.7516428828239441, "eval_runtime": 0.3434, "eval_samples_per_second": 821.083, "eval_steps_per_second": 52.41, "step": 162 }, { "epoch": 3.0, "grad_norm": 10.926794052124023, "learning_rate": 1.9156966490299824e-05, "loss": 0.6227, "step": 243 }, { "epoch": 3.0, "eval_accuracy": 0.4929078014184397, "eval_loss": 0.8317187428474426, "eval_runtime": 0.3439, "eval_samples_per_second": 820.006, "eval_steps_per_second": 52.341, "step": 243 }, { "epoch": 4.0, "grad_norm": 12.648384094238281, "learning_rate": 1.887125220458554e-05, "loss": 0.5403, "step": 324 }, { "epoch": 4.0, "eval_accuracy": 0.4929078014184397, "eval_loss": 1.9380121231079102, "eval_runtime": 0.3424, "eval_samples_per_second": 823.561, "eval_steps_per_second": 52.568, "step": 324 }, { "epoch": 5.0, "grad_norm": 23.567258834838867, "learning_rate": 1.8585537918871256e-05, "loss": 0.5108, "step": 405 }, { "epoch": 5.0, "eval_accuracy": 0.49645390070921985, "eval_loss": 2.270359754562378, "eval_runtime": 0.3437, "eval_samples_per_second": 820.538, "eval_steps_per_second": 52.375, "step": 405 }, { "epoch": 6.0, "grad_norm": 3.5719075202941895, "learning_rate": 1.830335097001764e-05, "loss": 0.4677, "step": 486 }, { "epoch": 6.0, "eval_accuracy": 0.48936170212765956, "eval_loss": 1.6858181953430176, "eval_runtime": 0.3432, "eval_samples_per_second": 821.693, "eval_steps_per_second": 52.449, "step": 486 }, { "epoch": 7.0, "grad_norm": 7.08165168762207, "learning_rate": 1.8017636684303353e-05, "loss": 0.4798, "step": 567 }, { "epoch": 7.0, "eval_accuracy": 0.49645390070921985, "eval_loss": 1.623734712600708, "eval_runtime": 0.3436, "eval_samples_per_second": 820.682, "eval_steps_per_second": 52.384, "step": 567 }, { "epoch": 8.0, "grad_norm": 10.894269943237305, "learning_rate": 1.773192239858907e-05, "loss": 0.4817, "step": 648 }, { "epoch": 8.0, "eval_accuracy": 0.5141843971631206, "eval_loss": 1.3935478925704956, "eval_runtime": 0.3435, "eval_samples_per_second": 821.029, "eval_steps_per_second": 52.406, "step": 648 }, { "epoch": 9.0, "grad_norm": 7.739453315734863, "learning_rate": 1.744620811287478e-05, "loss": 0.4668, "step": 729 }, { "epoch": 9.0, "eval_accuracy": 0.5177304964539007, "eval_loss": 1.259345531463623, "eval_runtime": 0.343, "eval_samples_per_second": 822.053, "eval_steps_per_second": 52.471, "step": 729 }, { "epoch": 10.0, "grad_norm": 17.012800216674805, "learning_rate": 1.7160493827160498e-05, "loss": 0.4359, "step": 810 }, { "epoch": 10.0, "eval_accuracy": 0.5354609929078015, "eval_loss": 1.310729742050171, "eval_runtime": 0.3436, "eval_samples_per_second": 820.694, "eval_steps_per_second": 52.385, "step": 810 }, { "epoch": 11.0, "grad_norm": 1.6642764806747437, "learning_rate": 1.687477954144621e-05, "loss": 0.3956, "step": 891 }, { "epoch": 11.0, "eval_accuracy": 0.8226950354609929, "eval_loss": 0.43421775102615356, "eval_runtime": 0.3435, "eval_samples_per_second": 820.982, "eval_steps_per_second": 52.403, "step": 891 }, { "epoch": 12.0, "grad_norm": 0.3688388168811798, "learning_rate": 1.6589065255731923e-05, "loss": 0.2906, "step": 972 }, { "epoch": 12.0, "eval_accuracy": 0.9290780141843972, "eval_loss": 0.23947754502296448, "eval_runtime": 0.3442, "eval_samples_per_second": 819.25, "eval_steps_per_second": 52.293, "step": 972 }, { "epoch": 13.0, "grad_norm": 37.02349853515625, "learning_rate": 1.630335097001764e-05, "loss": 0.2146, "step": 1053 }, { "epoch": 13.0, "eval_accuracy": 0.9397163120567376, "eval_loss": 0.33284759521484375, "eval_runtime": 0.3437, "eval_samples_per_second": 820.462, "eval_steps_per_second": 52.37, "step": 1053 }, { "epoch": 14.0, "grad_norm": 0.11939908564090729, "learning_rate": 1.601763668430335e-05, "loss": 0.1462, "step": 1134 }, { "epoch": 14.0, "eval_accuracy": 0.950354609929078, "eval_loss": 0.3009294867515564, "eval_runtime": 0.3439, "eval_samples_per_second": 819.984, "eval_steps_per_second": 52.339, "step": 1134 }, { "epoch": 15.0, "grad_norm": 0.08733003586530685, "learning_rate": 1.5731922398589064e-05, "loss": 0.1062, "step": 1215 }, { "epoch": 15.0, "eval_accuracy": 0.9290780141843972, "eval_loss": 0.21407951414585114, "eval_runtime": 0.3436, "eval_samples_per_second": 820.805, "eval_steps_per_second": 52.392, "step": 1215 }, { "epoch": 16.0, "grad_norm": 0.21886540949344635, "learning_rate": 1.544620811287478e-05, "loss": 0.0813, "step": 1296 }, { "epoch": 16.0, "eval_accuracy": 0.9432624113475178, "eval_loss": 0.34917283058166504, "eval_runtime": 0.344, "eval_samples_per_second": 819.711, "eval_steps_per_second": 52.322, "step": 1296 }, { "epoch": 17.0, "grad_norm": 0.5847246646881104, "learning_rate": 1.5160493827160495e-05, "loss": 0.1027, "step": 1377 }, { "epoch": 17.0, "eval_accuracy": 0.9219858156028369, "eval_loss": 0.3432806432247162, "eval_runtime": 0.3446, "eval_samples_per_second": 818.425, "eval_steps_per_second": 52.24, "step": 1377 }, { "epoch": 18.0, "grad_norm": 0.6198065280914307, "learning_rate": 1.4874779541446209e-05, "loss": 0.0736, "step": 1458 }, { "epoch": 18.0, "eval_accuracy": 0.9539007092198581, "eval_loss": 0.27183273434638977, "eval_runtime": 0.3437, "eval_samples_per_second": 820.405, "eval_steps_per_second": 52.366, "step": 1458 }, { "epoch": 19.0, "grad_norm": 0.5257266163825989, "learning_rate": 1.4589065255731925e-05, "loss": 0.0684, "step": 1539 }, { "epoch": 19.0, "eval_accuracy": 0.9645390070921985, "eval_loss": 0.25684282183647156, "eval_runtime": 0.3434, "eval_samples_per_second": 821.157, "eval_steps_per_second": 52.414, "step": 1539 }, { "epoch": 20.0, "grad_norm": 0.0009818405378609896, "learning_rate": 1.4303350970017638e-05, "loss": 0.0779, "step": 1620 }, { "epoch": 20.0, "eval_accuracy": 0.9609929078014184, "eval_loss": 0.2152564525604248, "eval_runtime": 0.3431, "eval_samples_per_second": 821.93, "eval_steps_per_second": 52.464, "step": 1620 }, { "epoch": 21.0, "grad_norm": 0.4532203674316406, "learning_rate": 1.4021164021164022e-05, "loss": 0.0745, "step": 1701 }, { "epoch": 21.0, "eval_accuracy": 0.9645390070921985, "eval_loss": 0.1914406418800354, "eval_runtime": 0.344, "eval_samples_per_second": 819.813, "eval_steps_per_second": 52.329, "step": 1701 }, { "epoch": 22.0, "grad_norm": 17.428327560424805, "learning_rate": 1.3735449735449738e-05, "loss": 0.1106, "step": 1782 }, { "epoch": 22.0, "eval_accuracy": 0.9574468085106383, "eval_loss": 0.2807099223136902, "eval_runtime": 0.3441, "eval_samples_per_second": 819.457, "eval_steps_per_second": 52.306, "step": 1782 }, { "epoch": 23.0, "grad_norm": 0.00047796443686820567, "learning_rate": 1.344973544973545e-05, "loss": 0.0755, "step": 1863 }, { "epoch": 23.0, "eval_accuracy": 0.9539007092198581, "eval_loss": 0.331978976726532, "eval_runtime": 0.3453, "eval_samples_per_second": 816.672, "eval_steps_per_second": 52.128, "step": 1863 }, { "epoch": 24.0, "grad_norm": 1.006925106048584, "learning_rate": 1.3164021164021166e-05, "loss": 0.0833, "step": 1944 }, { "epoch": 24.0, "eval_accuracy": 0.9539007092198581, "eval_loss": 0.34625303745269775, "eval_runtime": 0.3436, "eval_samples_per_second": 820.661, "eval_steps_per_second": 52.383, "step": 1944 }, { "epoch": 25.0, "grad_norm": 0.506279706954956, "learning_rate": 1.288183421516755e-05, "loss": 0.0754, "step": 2025 }, { "epoch": 25.0, "eval_accuracy": 0.9432624113475178, "eval_loss": 0.34365448355674744, "eval_runtime": 0.3432, "eval_samples_per_second": 821.691, "eval_steps_per_second": 52.448, "step": 2025 }, { "epoch": 26.0, "grad_norm": 0.1998976171016693, "learning_rate": 1.2596119929453263e-05, "loss": 0.0772, "step": 2106 }, { "epoch": 26.0, "eval_accuracy": 0.950354609929078, "eval_loss": 0.3350883424282074, "eval_runtime": 0.3435, "eval_samples_per_second": 820.852, "eval_steps_per_second": 52.395, "step": 2106 }, { "epoch": 27.0, "grad_norm": 0.19478876888751984, "learning_rate": 1.2310405643738979e-05, "loss": 0.076, "step": 2187 }, { "epoch": 27.0, "eval_accuracy": 0.9468085106382979, "eval_loss": 0.4145265519618988, "eval_runtime": 0.3445, "eval_samples_per_second": 818.483, "eval_steps_per_second": 52.244, "step": 2187 }, { "epoch": 28.0, "grad_norm": 0.27469512820243835, "learning_rate": 1.2024691358024691e-05, "loss": 0.0625, "step": 2268 }, { "epoch": 28.0, "eval_accuracy": 0.950354609929078, "eval_loss": 0.44451093673706055, "eval_runtime": 0.3439, "eval_samples_per_second": 819.913, "eval_steps_per_second": 52.335, "step": 2268 }, { "epoch": 29.0, "grad_norm": 26.14291000366211, "learning_rate": 1.1738977072310408e-05, "loss": 0.0741, "step": 2349 }, { "epoch": 29.0, "eval_accuracy": 0.9468085106382979, "eval_loss": 0.29801085591316223, "eval_runtime": 0.3448, "eval_samples_per_second": 817.812, "eval_steps_per_second": 52.201, "step": 2349 }, { "epoch": 30.0, "grad_norm": 0.0004499799106270075, "learning_rate": 1.145326278659612e-05, "loss": 0.0649, "step": 2430 }, { "epoch": 30.0, "eval_accuracy": 0.9574468085106383, "eval_loss": 0.28359255194664, "eval_runtime": 0.3442, "eval_samples_per_second": 819.247, "eval_steps_per_second": 52.292, "step": 2430 }, { "epoch": 31.0, "grad_norm": 0.0018564946949481964, "learning_rate": 1.1167548500881835e-05, "loss": 0.0688, "step": 2511 }, { "epoch": 31.0, "eval_accuracy": 0.9574468085106383, "eval_loss": 0.21793903410434723, "eval_runtime": 0.3445, "eval_samples_per_second": 818.498, "eval_steps_per_second": 52.245, "step": 2511 }, { "epoch": 32.0, "grad_norm": 0.0009469461510889232, "learning_rate": 1.088183421516755e-05, "loss": 0.0735, "step": 2592 }, { "epoch": 32.0, "eval_accuracy": 0.9539007092198581, "eval_loss": 0.22946923971176147, "eval_runtime": 0.3449, "eval_samples_per_second": 817.666, "eval_steps_per_second": 52.191, "step": 2592 }, { "epoch": 33.0, "grad_norm": 0.4778638184070587, "learning_rate": 1.0596119929453263e-05, "loss": 0.0648, "step": 2673 }, { "epoch": 33.0, "eval_accuracy": 0.9468085106382979, "eval_loss": 0.42410480976104736, "eval_runtime": 0.3433, "eval_samples_per_second": 821.406, "eval_steps_per_second": 52.43, "step": 2673 }, { "epoch": 34.0, "grad_norm": 0.21737487614154816, "learning_rate": 1.031040564373898e-05, "loss": 0.0672, "step": 2754 }, { "epoch": 34.0, "eval_accuracy": 0.9539007092198581, "eval_loss": 0.2829430401325226, "eval_runtime": 0.3447, "eval_samples_per_second": 818.124, "eval_steps_per_second": 52.221, "step": 2754 }, { "epoch": 35.0, "grad_norm": 0.08269879966974258, "learning_rate": 1.0024691358024692e-05, "loss": 0.067, "step": 2835 }, { "epoch": 35.0, "eval_accuracy": 0.9468085106382979, "eval_loss": 0.3723122179508209, "eval_runtime": 0.3448, "eval_samples_per_second": 817.778, "eval_steps_per_second": 52.199, "step": 2835 }, { "epoch": 36.0, "grad_norm": 0.3665499687194824, "learning_rate": 9.738977072310406e-06, "loss": 0.0768, "step": 2916 }, { "epoch": 36.0, "eval_accuracy": 0.9574468085106383, "eval_loss": 0.25441667437553406, "eval_runtime": 0.3447, "eval_samples_per_second": 818.182, "eval_steps_per_second": 52.224, "step": 2916 }, { "epoch": 37.0, "grad_norm": 0.11919476091861725, "learning_rate": 9.45326278659612e-06, "loss": 0.0691, "step": 2997 }, { "epoch": 37.0, "eval_accuracy": 0.9609929078014184, "eval_loss": 0.20481815934181213, "eval_runtime": 0.3445, "eval_samples_per_second": 818.558, "eval_steps_per_second": 52.248, "step": 2997 }, { "epoch": 38.0, "grad_norm": 0.0036801116075366735, "learning_rate": 9.167548500881835e-06, "loss": 0.0661, "step": 3078 }, { "epoch": 38.0, "eval_accuracy": 0.9680851063829787, "eval_loss": 0.20478524267673492, "eval_runtime": 0.3445, "eval_samples_per_second": 818.468, "eval_steps_per_second": 52.243, "step": 3078 }, { "epoch": 39.0, "grad_norm": 0.12663815915584564, "learning_rate": 8.88183421516755e-06, "loss": 0.0409, "step": 3159 }, { "epoch": 39.0, "eval_accuracy": 0.9645390070921985, "eval_loss": 0.18502239882946014, "eval_runtime": 0.3434, "eval_samples_per_second": 821.144, "eval_steps_per_second": 52.413, "step": 3159 }, { "epoch": 40.0, "grad_norm": 0.06950168311595917, "learning_rate": 8.596119929453264e-06, "loss": 0.0424, "step": 3240 }, { "epoch": 40.0, "eval_accuracy": 0.9645390070921985, "eval_loss": 0.20747074484825134, "eval_runtime": 0.3445, "eval_samples_per_second": 818.693, "eval_steps_per_second": 52.257, "step": 3240 }, { "epoch": 41.0, "grad_norm": 0.09251494705677032, "learning_rate": 8.310405643738978e-06, "loss": 0.0381, "step": 3321 }, { "epoch": 41.0, "eval_accuracy": 0.9645390070921985, "eval_loss": 0.2633875906467438, "eval_runtime": 0.3468, "eval_samples_per_second": 813.14, "eval_steps_per_second": 51.903, "step": 3321 }, { "epoch": 42.0, "grad_norm": 0.06917154043912888, "learning_rate": 8.024691358024692e-06, "loss": 0.0383, "step": 3402 }, { "epoch": 42.0, "eval_accuracy": 0.9574468085106383, "eval_loss": 0.3520617187023163, "eval_runtime": 0.3447, "eval_samples_per_second": 818.036, "eval_steps_per_second": 52.215, "step": 3402 }, { "epoch": 43.0, "grad_norm": 0.0010325413895770907, "learning_rate": 7.738977072310407e-06, "loss": 0.0288, "step": 3483 }, { "epoch": 43.0, "eval_accuracy": 0.9680851063829787, "eval_loss": 0.2726523280143738, "eval_runtime": 0.3428, "eval_samples_per_second": 822.588, "eval_steps_per_second": 52.506, "step": 3483 }, { "epoch": 44.0, "grad_norm": 0.04726780578494072, "learning_rate": 7.45326278659612e-06, "loss": 0.035, "step": 3564 }, { "epoch": 44.0, "eval_accuracy": 0.9645390070921985, "eval_loss": 0.2995310127735138, "eval_runtime": 0.3442, "eval_samples_per_second": 819.308, "eval_steps_per_second": 52.296, "step": 3564 }, { "epoch": 45.0, "grad_norm": 0.09283600747585297, "learning_rate": 7.167548500881835e-06, "loss": 0.0265, "step": 3645 }, { "epoch": 45.0, "eval_accuracy": 0.9609929078014184, "eval_loss": 0.33694958686828613, "eval_runtime": 0.3443, "eval_samples_per_second": 818.994, "eval_steps_per_second": 52.276, "step": 3645 }, { "epoch": 46.0, "grad_norm": 0.03685113787651062, "learning_rate": 6.881834215167549e-06, "loss": 0.0217, "step": 3726 }, { "epoch": 46.0, "eval_accuracy": 0.9609929078014184, "eval_loss": 0.35722091794013977, "eval_runtime": 0.3438, "eval_samples_per_second": 820.281, "eval_steps_per_second": 52.358, "step": 3726 }, { "epoch": 47.0, "grad_norm": 0.04708189144730568, "learning_rate": 6.596119929453263e-06, "loss": 0.0259, "step": 3807 }, { "epoch": 47.0, "eval_accuracy": 0.9716312056737588, "eval_loss": 0.21833930909633636, "eval_runtime": 0.3427, "eval_samples_per_second": 822.913, "eval_steps_per_second": 52.526, "step": 3807 }, { "epoch": 48.0, "grad_norm": 0.06329997628927231, "learning_rate": 6.310405643738977e-06, "loss": 0.0264, "step": 3888 }, { "epoch": 48.0, "eval_accuracy": 0.9609929078014184, "eval_loss": 0.2745024561882019, "eval_runtime": 0.3436, "eval_samples_per_second": 820.777, "eval_steps_per_second": 52.39, "step": 3888 }, { "epoch": 49.0, "grad_norm": 0.13020673394203186, "learning_rate": 6.024691358024692e-06, "loss": 0.027, "step": 3969 }, { "epoch": 49.0, "eval_accuracy": 0.9539007092198581, "eval_loss": 0.3425739109516144, "eval_runtime": 0.3449, "eval_samples_per_second": 817.548, "eval_steps_per_second": 52.184, "step": 3969 }, { "epoch": 50.0, "grad_norm": 0.04181819409132004, "learning_rate": 5.7389770723104065e-06, "loss": 0.023, "step": 4050 }, { "epoch": 50.0, "eval_accuracy": 0.950354609929078, "eval_loss": 0.37068530917167664, "eval_runtime": 0.3441, "eval_samples_per_second": 819.471, "eval_steps_per_second": 52.307, "step": 4050 }, { "epoch": 51.0, "grad_norm": 0.03754027560353279, "learning_rate": 5.453262786596121e-06, "loss": 0.0241, "step": 4131 }, { "epoch": 51.0, "eval_accuracy": 0.9645390070921985, "eval_loss": 0.3041815459728241, "eval_runtime": 0.3443, "eval_samples_per_second": 819.127, "eval_steps_per_second": 52.285, "step": 4131 }, { "epoch": 52.0, "grad_norm": 0.06724414229393005, "learning_rate": 5.167548500881835e-06, "loss": 0.0248, "step": 4212 }, { "epoch": 52.0, "eval_accuracy": 0.9609929078014184, "eval_loss": 0.3282240927219391, "eval_runtime": 0.3433, "eval_samples_per_second": 821.512, "eval_steps_per_second": 52.437, "step": 4212 }, { "epoch": 53.0, "grad_norm": 0.044111430644989014, "learning_rate": 4.881834215167549e-06, "loss": 0.0267, "step": 4293 }, { "epoch": 53.0, "eval_accuracy": 0.9680851063829787, "eval_loss": 0.2480100840330124, "eval_runtime": 0.3438, "eval_samples_per_second": 820.176, "eval_steps_per_second": 52.352, "step": 4293 }, { "epoch": 54.0, "grad_norm": 0.09385800361633301, "learning_rate": 4.596119929453263e-06, "loss": 0.019, "step": 4374 }, { "epoch": 54.0, "eval_accuracy": 0.9680851063829787, "eval_loss": 0.2954387366771698, "eval_runtime": 0.3444, "eval_samples_per_second": 818.748, "eval_steps_per_second": 52.261, "step": 4374 }, { "epoch": 55.0, "grad_norm": 0.00036285247188061476, "learning_rate": 4.3104056437389775e-06, "loss": 0.0233, "step": 4455 }, { "epoch": 55.0, "eval_accuracy": 0.9645390070921985, "eval_loss": 0.26300373673439026, "eval_runtime": 0.3483, "eval_samples_per_second": 809.563, "eval_steps_per_second": 51.674, "step": 4455 }, { "epoch": 56.0, "grad_norm": 0.03549063578248024, "learning_rate": 4.024691358024692e-06, "loss": 0.0231, "step": 4536 }, { "epoch": 56.0, "eval_accuracy": 0.9645390070921985, "eval_loss": 0.26614007353782654, "eval_runtime": 0.3434, "eval_samples_per_second": 821.294, "eval_steps_per_second": 52.423, "step": 4536 }, { "epoch": 57.0, "grad_norm": 0.0008688592351973057, "learning_rate": 3.7389770723104058e-06, "loss": 0.0188, "step": 4617 }, { "epoch": 57.0, "eval_accuracy": 0.9574468085106383, "eval_loss": 0.3676702678203583, "eval_runtime": 0.3441, "eval_samples_per_second": 819.514, "eval_steps_per_second": 52.309, "step": 4617 }, { "epoch": 58.0, "grad_norm": 0.00031407736241817474, "learning_rate": 3.4532627865961205e-06, "loss": 0.0263, "step": 4698 }, { "epoch": 58.0, "eval_accuracy": 0.9539007092198581, "eval_loss": 0.36925771832466125, "eval_runtime": 0.348, "eval_samples_per_second": 810.368, "eval_steps_per_second": 51.726, "step": 4698 }, { "epoch": 59.0, "grad_norm": 0.040128860622644424, "learning_rate": 3.1675485008818345e-06, "loss": 0.019, "step": 4779 }, { "epoch": 59.0, "eval_accuracy": 0.9574468085106383, "eval_loss": 0.35094693303108215, "eval_runtime": 0.3436, "eval_samples_per_second": 820.815, "eval_steps_per_second": 52.392, "step": 4779 }, { "epoch": 60.0, "grad_norm": 0.0004439246258698404, "learning_rate": 2.881834215167549e-06, "loss": 0.0202, "step": 4860 }, { "epoch": 60.0, "eval_accuracy": 0.9609929078014184, "eval_loss": 0.3040333092212677, "eval_runtime": 0.3445, "eval_samples_per_second": 818.559, "eval_steps_per_second": 52.248, "step": 4860 }, { "epoch": 61.0, "grad_norm": 0.07529360055923462, "learning_rate": 2.5961199294532628e-06, "loss": 0.0208, "step": 4941 }, { "epoch": 61.0, "eval_accuracy": 0.9468085106382979, "eval_loss": 0.5039365887641907, "eval_runtime": 0.3439, "eval_samples_per_second": 819.902, "eval_steps_per_second": 52.334, "step": 4941 }, { "epoch": 62.0, "grad_norm": 0.00026053638430312276, "learning_rate": 2.310405643738977e-06, "loss": 0.0242, "step": 5022 }, { "epoch": 62.0, "eval_accuracy": 0.950354609929078, "eval_loss": 0.4803861677646637, "eval_runtime": 0.3445, "eval_samples_per_second": 818.64, "eval_steps_per_second": 52.254, "step": 5022 }, { "epoch": 63.0, "grad_norm": 0.06742388755083084, "learning_rate": 2.0246913580246915e-06, "loss": 0.023, "step": 5103 }, { "epoch": 63.0, "eval_accuracy": 0.9609929078014184, "eval_loss": 0.3538144826889038, "eval_runtime": 0.3445, "eval_samples_per_second": 818.51, "eval_steps_per_second": 52.245, "step": 5103 }, { "epoch": 64.0, "grad_norm": 0.00042550539365038276, "learning_rate": 1.7389770723104056e-06, "loss": 0.0189, "step": 5184 }, { "epoch": 64.0, "eval_accuracy": 0.9574468085106383, "eval_loss": 0.37617096304893494, "eval_runtime": 0.3442, "eval_samples_per_second": 819.198, "eval_steps_per_second": 52.289, "step": 5184 }, { "epoch": 65.0, "grad_norm": 0.02407378889620304, "learning_rate": 1.45326278659612e-06, "loss": 0.0209, "step": 5265 }, { "epoch": 65.0, "eval_accuracy": 0.950354609929078, "eval_loss": 0.43608424067497253, "eval_runtime": 0.3438, "eval_samples_per_second": 820.243, "eval_steps_per_second": 52.356, "step": 5265 }, { "epoch": 66.0, "grad_norm": 0.054311446845531464, "learning_rate": 1.1675485008818344e-06, "loss": 0.0209, "step": 5346 }, { "epoch": 66.0, "eval_accuracy": 0.950354609929078, "eval_loss": 0.41794532537460327, "eval_runtime": 0.3436, "eval_samples_per_second": 820.791, "eval_steps_per_second": 52.391, "step": 5346 }, { "epoch": 67.0, "grad_norm": 0.04109662398695946, "learning_rate": 8.818342151675485e-07, "loss": 0.0198, "step": 5427 }, { "epoch": 67.0, "eval_accuracy": 0.9539007092198581, "eval_loss": 0.3815895617008209, "eval_runtime": 0.3443, "eval_samples_per_second": 819.013, "eval_steps_per_second": 52.277, "step": 5427 }, { "epoch": 68.0, "grad_norm": 0.13629287481307983, "learning_rate": 5.961199294532629e-07, "loss": 0.0197, "step": 5508 }, { "epoch": 68.0, "eval_accuracy": 0.950354609929078, "eval_loss": 0.39786896109580994, "eval_runtime": 0.3445, "eval_samples_per_second": 818.46, "eval_steps_per_second": 52.242, "step": 5508 }, { "epoch": 69.0, "grad_norm": 0.039983708411455154, "learning_rate": 3.104056437389771e-07, "loss": 0.0192, "step": 5589 }, { "epoch": 69.0, "eval_accuracy": 0.950354609929078, "eval_loss": 0.411296546459198, "eval_runtime": 0.3435, "eval_samples_per_second": 820.901, "eval_steps_per_second": 52.398, "step": 5589 }, { "epoch": 70.0, "grad_norm": 0.00027353325276635587, "learning_rate": 2.469135802469136e-08, "loss": 0.0177, "step": 5670 }, { "epoch": 70.0, "eval_accuracy": 0.9539007092198581, "eval_loss": 0.40772485733032227, "eval_runtime": 0.3437, "eval_samples_per_second": 820.466, "eval_steps_per_second": 52.37, "step": 5670 } ], "logging_steps": 100, "max_steps": 5670, "num_input_tokens_seen": 0, "num_train_epochs": 70, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9735501528974304.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }