{ "best_metric": 0.4847618043422699, "best_model_checkpoint": "./model_fine-tune/glot/xlm-r/mlt-Latn/checkpoint-96000", "epoch": 43.656207366984994, "eval_steps": 500, "global_step": 96000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.22737608003638018, "grad_norm": 5.168328285217285, "learning_rate": 9.95e-05, "loss": 1.8057, "step": 500 }, { "epoch": 0.22737608003638018, "eval_accuracy": 0.7673858936928836, "eval_loss": 1.3882914781570435, "eval_runtime": 57.4714, "eval_samples_per_second": 265.141, "eval_steps_per_second": 8.3, "step": 500 }, { "epoch": 0.45475216007276037, "grad_norm": 6.386734485626221, "learning_rate": 9.900000000000001e-05, "loss": 1.2412, "step": 1000 }, { "epoch": 0.45475216007276037, "eval_accuracy": 0.8023059017621376, "eval_loss": 1.1490817070007324, "eval_runtime": 57.9662, "eval_samples_per_second": 262.877, "eval_steps_per_second": 8.229, "step": 1000 }, { "epoch": 0.6821282401091405, "grad_norm": 3.5194196701049805, "learning_rate": 9.850000000000001e-05, "loss": 1.0971, "step": 1500 }, { "epoch": 0.6821282401091405, "eval_accuracy": 0.8191701818204763, "eval_loss": 1.0539811849594116, "eval_runtime": 58.0944, "eval_samples_per_second": 262.297, "eval_steps_per_second": 8.211, "step": 1500 }, { "epoch": 0.9095043201455207, "grad_norm": 4.919663906097412, "learning_rate": 9.8e-05, "loss": 1.0083, "step": 2000 }, { "epoch": 0.9095043201455207, "eval_accuracy": 0.8290244187626605, "eval_loss": 0.9915244579315186, "eval_runtime": 59.0201, "eval_samples_per_second": 258.183, "eval_steps_per_second": 8.082, "step": 2000 }, { "epoch": 1.1368804001819008, "grad_norm": 3.4152417182922363, "learning_rate": 9.75e-05, "loss": 0.9343, "step": 2500 }, { "epoch": 1.1368804001819008, "eval_accuracy": 0.8386819891878221, "eval_loss": 0.9384229779243469, "eval_runtime": 58.1514, "eval_samples_per_second": 262.04, "eval_steps_per_second": 8.203, "step": 2500 }, { "epoch": 1.364256480218281, "grad_norm": 3.325963258743286, "learning_rate": 9.7e-05, "loss": 0.8966, "step": 3000 }, { "epoch": 1.364256480218281, "eval_accuracy": 0.8441958792162307, "eval_loss": 0.8955113887786865, "eval_runtime": 59.0626, "eval_samples_per_second": 257.997, "eval_steps_per_second": 8.076, "step": 3000 }, { "epoch": 1.591632560254661, "grad_norm": 4.558549880981445, "learning_rate": 9.65e-05, "loss": 0.8609, "step": 3500 }, { "epoch": 1.591632560254661, "eval_accuracy": 0.8482073460109512, "eval_loss": 0.8796423673629761, "eval_runtime": 58.1755, "eval_samples_per_second": 261.931, "eval_steps_per_second": 8.199, "step": 3500 }, { "epoch": 1.8190086402910413, "grad_norm": 4.228474140167236, "learning_rate": 9.6e-05, "loss": 0.8394, "step": 4000 }, { "epoch": 1.8190086402910413, "eval_accuracy": 0.8522584062680361, "eval_loss": 0.8565191030502319, "eval_runtime": 59.0465, "eval_samples_per_second": 258.068, "eval_steps_per_second": 8.078, "step": 4000 }, { "epoch": 2.0463847203274215, "grad_norm": 4.954216003417969, "learning_rate": 9.55e-05, "loss": 0.8082, "step": 4500 }, { "epoch": 2.0463847203274215, "eval_accuracy": 0.8560422496354393, "eval_loss": 0.8301263451576233, "eval_runtime": 59.074, "eval_samples_per_second": 257.947, "eval_steps_per_second": 8.075, "step": 4500 }, { "epoch": 2.2737608003638017, "grad_norm": 3.8996498584747314, "learning_rate": 9.5e-05, "loss": 0.7751, "step": 5000 }, { "epoch": 2.2737608003638017, "eval_accuracy": 0.8575601290381725, "eval_loss": 0.8123583793640137, "eval_runtime": 59.0078, "eval_samples_per_second": 258.237, "eval_steps_per_second": 8.084, "step": 5000 }, { "epoch": 2.501136880400182, "grad_norm": 3.683563709259033, "learning_rate": 9.449999999999999e-05, "loss": 0.7746, "step": 5500 }, { "epoch": 2.501136880400182, "eval_accuracy": 0.8608798596484555, "eval_loss": 0.8004751801490784, "eval_runtime": 57.9526, "eval_samples_per_second": 262.939, "eval_steps_per_second": 8.231, "step": 5500 }, { "epoch": 2.728512960436562, "grad_norm": 3.8639516830444336, "learning_rate": 9.4e-05, "loss": 0.7467, "step": 6000 }, { "epoch": 2.728512960436562, "eval_accuracy": 0.8630581086069647, "eval_loss": 0.7939795255661011, "eval_runtime": 59.0043, "eval_samples_per_second": 258.252, "eval_steps_per_second": 8.084, "step": 6000 }, { "epoch": 2.9558890404729423, "grad_norm": 3.31400990486145, "learning_rate": 9.350000000000001e-05, "loss": 0.7404, "step": 6500 }, { "epoch": 2.9558890404729423, "eval_accuracy": 0.8657109489879384, "eval_loss": 0.7746465802192688, "eval_runtime": 58.2316, "eval_samples_per_second": 261.679, "eval_steps_per_second": 8.191, "step": 6500 }, { "epoch": 3.1832651205093225, "grad_norm": 2.843773603439331, "learning_rate": 9.300000000000001e-05, "loss": 0.7193, "step": 7000 }, { "epoch": 3.1832651205093225, "eval_accuracy": 0.8669591175218982, "eval_loss": 0.7662197947502136, "eval_runtime": 58.8578, "eval_samples_per_second": 258.895, "eval_steps_per_second": 8.104, "step": 7000 }, { "epoch": 3.4106412005457027, "grad_norm": 3.61684513092041, "learning_rate": 9.250000000000001e-05, "loss": 0.7127, "step": 7500 }, { "epoch": 3.4106412005457027, "eval_accuracy": 0.8696189271143282, "eval_loss": 0.7391215562820435, "eval_runtime": 58.0068, "eval_samples_per_second": 262.693, "eval_steps_per_second": 8.223, "step": 7500 }, { "epoch": 3.6380172805820825, "grad_norm": 2.982285737991333, "learning_rate": 9.200000000000001e-05, "loss": 0.7077, "step": 8000 }, { "epoch": 3.6380172805820825, "eval_accuracy": 0.8704082502343816, "eval_loss": 0.7502180933952332, "eval_runtime": 58.7428, "eval_samples_per_second": 259.402, "eval_steps_per_second": 8.12, "step": 8000 }, { "epoch": 3.865393360618463, "grad_norm": 3.2284772396087646, "learning_rate": 9.15e-05, "loss": 0.6954, "step": 8500 }, { "epoch": 3.865393360618463, "eval_accuracy": 0.8723014320863037, "eval_loss": 0.7357130646705627, "eval_runtime": 57.9037, "eval_samples_per_second": 263.161, "eval_steps_per_second": 8.238, "step": 8500 }, { "epoch": 4.092769440654843, "grad_norm": 3.141983985900879, "learning_rate": 9.1e-05, "loss": 0.6831, "step": 9000 }, { "epoch": 4.092769440654843, "eval_accuracy": 0.87450734136313, "eval_loss": 0.7237355709075928, "eval_runtime": 58.8298, "eval_samples_per_second": 259.018, "eval_steps_per_second": 8.108, "step": 9000 }, { "epoch": 4.320145520691224, "grad_norm": 3.805156707763672, "learning_rate": 9.05e-05, "loss": 0.6653, "step": 9500 }, { "epoch": 4.320145520691224, "eval_accuracy": 0.875826148405435, "eval_loss": 0.7154198288917542, "eval_runtime": 58.8099, "eval_samples_per_second": 259.106, "eval_steps_per_second": 8.111, "step": 9500 }, { "epoch": 4.547521600727603, "grad_norm": 2.943392753601074, "learning_rate": 9e-05, "loss": 0.6587, "step": 10000 }, { "epoch": 4.547521600727603, "eval_accuracy": 0.8750290297659766, "eval_loss": 0.7158774733543396, "eval_runtime": 58.8052, "eval_samples_per_second": 259.127, "eval_steps_per_second": 8.112, "step": 10000 }, { "epoch": 4.774897680763983, "grad_norm": 3.1016767024993896, "learning_rate": 8.950000000000001e-05, "loss": 0.657, "step": 10500 }, { "epoch": 4.774897680763983, "eval_accuracy": 0.876631092564109, "eval_loss": 0.7100118398666382, "eval_runtime": 57.9757, "eval_samples_per_second": 262.834, "eval_steps_per_second": 8.228, "step": 10500 }, { "epoch": 5.002273760800364, "grad_norm": 2.9359586238861084, "learning_rate": 8.900000000000001e-05, "loss": 0.6544, "step": 11000 }, { "epoch": 5.002273760800364, "eval_accuracy": 0.8772724264740639, "eval_loss": 0.6987695097923279, "eval_runtime": 58.0324, "eval_samples_per_second": 262.578, "eval_steps_per_second": 8.22, "step": 11000 }, { "epoch": 5.229649840836744, "grad_norm": 3.289794445037842, "learning_rate": 8.850000000000001e-05, "loss": 0.6315, "step": 11500 }, { "epoch": 5.229649840836744, "eval_accuracy": 0.8785936092935085, "eval_loss": 0.7098827362060547, "eval_runtime": 58.2042, "eval_samples_per_second": 261.802, "eval_steps_per_second": 8.195, "step": 11500 }, { "epoch": 5.457025920873124, "grad_norm": 3.1603569984436035, "learning_rate": 8.800000000000001e-05, "loss": 0.6406, "step": 12000 }, { "epoch": 5.457025920873124, "eval_accuracy": 0.8785365360431241, "eval_loss": 0.697100043296814, "eval_runtime": 58.956, "eval_samples_per_second": 258.464, "eval_steps_per_second": 8.091, "step": 12000 }, { "epoch": 5.684402000909504, "grad_norm": 3.9137370586395264, "learning_rate": 8.75e-05, "loss": 0.6296, "step": 12500 }, { "epoch": 5.684402000909504, "eval_accuracy": 0.8807072007268872, "eval_loss": 0.6913357377052307, "eval_runtime": 58.1403, "eval_samples_per_second": 262.09, "eval_steps_per_second": 8.204, "step": 12500 }, { "epoch": 5.911778080945885, "grad_norm": 3.703839063644409, "learning_rate": 8.7e-05, "loss": 0.626, "step": 13000 }, { "epoch": 5.911778080945885, "eval_accuracy": 0.8809903070027411, "eval_loss": 0.6919081807136536, "eval_runtime": 58.1489, "eval_samples_per_second": 262.051, "eval_steps_per_second": 8.203, "step": 13000 }, { "epoch": 6.139154160982264, "grad_norm": 3.5155723094940186, "learning_rate": 8.65e-05, "loss": 0.6198, "step": 13500 }, { "epoch": 6.139154160982264, "eval_accuracy": 0.8827777558822371, "eval_loss": 0.6714188456535339, "eval_runtime": 58.9629, "eval_samples_per_second": 258.434, "eval_steps_per_second": 8.09, "step": 13500 }, { "epoch": 6.366530241018645, "grad_norm": 3.117462635040283, "learning_rate": 8.6e-05, "loss": 0.6076, "step": 14000 }, { "epoch": 6.366530241018645, "eval_accuracy": 0.8834275999297155, "eval_loss": 0.6733196973800659, "eval_runtime": 59.0324, "eval_samples_per_second": 258.13, "eval_steps_per_second": 8.08, "step": 14000 }, { "epoch": 6.593906321055025, "grad_norm": 2.6322078704833984, "learning_rate": 8.55e-05, "loss": 0.6087, "step": 14500 }, { "epoch": 6.593906321055025, "eval_accuracy": 0.8831569539581932, "eval_loss": 0.6722173690795898, "eval_runtime": 58.2146, "eval_samples_per_second": 261.755, "eval_steps_per_second": 8.194, "step": 14500 }, { "epoch": 6.8212824010914055, "grad_norm": 2.875756025314331, "learning_rate": 8.5e-05, "loss": 0.6016, "step": 15000 }, { "epoch": 6.8212824010914055, "eval_accuracy": 0.8827436768449041, "eval_loss": 0.6779712438583374, "eval_runtime": 57.9717, "eval_samples_per_second": 262.852, "eval_steps_per_second": 8.228, "step": 15000 }, { "epoch": 7.048658481127785, "grad_norm": 2.797990560531616, "learning_rate": 8.450000000000001e-05, "loss": 0.5932, "step": 15500 }, { "epoch": 7.048658481127785, "eval_accuracy": 0.8849467695494039, "eval_loss": 0.6560626029968262, "eval_runtime": 59.2909, "eval_samples_per_second": 257.004, "eval_steps_per_second": 8.045, "step": 15500 }, { "epoch": 7.276034561164166, "grad_norm": 3.173975706100464, "learning_rate": 8.4e-05, "loss": 0.5877, "step": 16000 }, { "epoch": 7.276034561164166, "eval_accuracy": 0.8849921630094044, "eval_loss": 0.6600627303123474, "eval_runtime": 57.9674, "eval_samples_per_second": 262.872, "eval_steps_per_second": 8.229, "step": 16000 }, { "epoch": 7.503410641200546, "grad_norm": 2.7644402980804443, "learning_rate": 8.35e-05, "loss": 0.5909, "step": 16500 }, { "epoch": 7.503410641200546, "eval_accuracy": 0.885632879834746, "eval_loss": 0.6634506583213806, "eval_runtime": 58.1288, "eval_samples_per_second": 262.142, "eval_steps_per_second": 8.206, "step": 16500 }, { "epoch": 7.730786721236926, "grad_norm": 3.6716108322143555, "learning_rate": 8.3e-05, "loss": 0.5848, "step": 17000 }, { "epoch": 7.730786721236926, "eval_accuracy": 0.8870372975442135, "eval_loss": 0.6555737257003784, "eval_runtime": 58.0593, "eval_samples_per_second": 262.456, "eval_steps_per_second": 8.216, "step": 17000 }, { "epoch": 7.958162801273306, "grad_norm": 2.635899782180786, "learning_rate": 8.25e-05, "loss": 0.5806, "step": 17500 }, { "epoch": 7.958162801273306, "eval_accuracy": 0.8872147874120453, "eval_loss": 0.6475590467453003, "eval_runtime": 58.9688, "eval_samples_per_second": 258.408, "eval_steps_per_second": 8.089, "step": 17500 }, { "epoch": 8.185538881309686, "grad_norm": 3.155376434326172, "learning_rate": 8.2e-05, "loss": 0.5717, "step": 18000 }, { "epoch": 8.185538881309686, "eval_accuracy": 0.887904005764964, "eval_loss": 0.6570438146591187, "eval_runtime": 58.2291, "eval_samples_per_second": 261.69, "eval_steps_per_second": 8.192, "step": 18000 }, { "epoch": 8.412914961346067, "grad_norm": 2.7952346801757812, "learning_rate": 8.15e-05, "loss": 0.564, "step": 18500 }, { "epoch": 8.412914961346067, "eval_accuracy": 0.8873371327146311, "eval_loss": 0.6420606374740601, "eval_runtime": 58.022, "eval_samples_per_second": 262.624, "eval_steps_per_second": 8.221, "step": 18500 }, { "epoch": 8.640291041382447, "grad_norm": 2.901456832885742, "learning_rate": 8.1e-05, "loss": 0.5668, "step": 19000 }, { "epoch": 8.640291041382447, "eval_accuracy": 0.8888152172025493, "eval_loss": 0.6471173763275146, "eval_runtime": 58.0374, "eval_samples_per_second": 262.555, "eval_steps_per_second": 8.219, "step": 19000 }, { "epoch": 8.867667121418826, "grad_norm": 3.783108949661255, "learning_rate": 8.05e-05, "loss": 0.5661, "step": 19500 }, { "epoch": 8.867667121418826, "eval_accuracy": 0.8896724765068369, "eval_loss": 0.6358206272125244, "eval_runtime": 57.9826, "eval_samples_per_second": 262.803, "eval_steps_per_second": 8.227, "step": 19500 }, { "epoch": 9.095043201455207, "grad_norm": 3.0208489894866943, "learning_rate": 8e-05, "loss": 0.5575, "step": 20000 }, { "epoch": 9.095043201455207, "eval_accuracy": 0.8890150798157612, "eval_loss": 0.6520903706550598, "eval_runtime": 58.9559, "eval_samples_per_second": 258.464, "eval_steps_per_second": 8.091, "step": 20000 }, { "epoch": 9.322419281491587, "grad_norm": 2.4719395637512207, "learning_rate": 7.950000000000001e-05, "loss": 0.5563, "step": 20500 }, { "epoch": 9.322419281491587, "eval_accuracy": 0.8897703404100196, "eval_loss": 0.6332861185073853, "eval_runtime": 58.0586, "eval_samples_per_second": 262.459, "eval_steps_per_second": 8.216, "step": 20500 }, { "epoch": 9.549795361527968, "grad_norm": 3.9511001110076904, "learning_rate": 7.900000000000001e-05, "loss": 0.5462, "step": 21000 }, { "epoch": 9.549795361527968, "eval_accuracy": 0.8907973350119666, "eval_loss": 0.6307789087295532, "eval_runtime": 58.9193, "eval_samples_per_second": 258.625, "eval_steps_per_second": 8.096, "step": 21000 }, { "epoch": 9.777171441564347, "grad_norm": 3.5722222328186035, "learning_rate": 7.850000000000001e-05, "loss": 0.554, "step": 21500 }, { "epoch": 9.777171441564347, "eval_accuracy": 0.892567522812507, "eval_loss": 0.6097805500030518, "eval_runtime": 58.065, "eval_samples_per_second": 262.43, "eval_steps_per_second": 8.215, "step": 21500 }, { "epoch": 10.004547521600728, "grad_norm": 2.951775074005127, "learning_rate": 7.800000000000001e-05, "loss": 0.5484, "step": 22000 }, { "epoch": 10.004547521600728, "eval_accuracy": 0.8916424877002649, "eval_loss": 0.6175746917724609, "eval_runtime": 58.4944, "eval_samples_per_second": 260.504, "eval_steps_per_second": 8.155, "step": 22000 }, { "epoch": 10.231923601637108, "grad_norm": 3.9985241889953613, "learning_rate": 7.75e-05, "loss": 0.5443, "step": 22500 }, { "epoch": 10.231923601637108, "eval_accuracy": 0.8924841760921375, "eval_loss": 0.6330751180648804, "eval_runtime": 58.5296, "eval_samples_per_second": 260.347, "eval_steps_per_second": 8.15, "step": 22500 }, { "epoch": 10.459299681673487, "grad_norm": 3.2177023887634277, "learning_rate": 7.7e-05, "loss": 0.5329, "step": 23000 }, { "epoch": 10.459299681673487, "eval_accuracy": 0.8919113818187556, "eval_loss": 0.6282722353935242, "eval_runtime": 58.5087, "eval_samples_per_second": 260.44, "eval_steps_per_second": 8.153, "step": 23000 }, { "epoch": 10.686675761709868, "grad_norm": 2.4907150268554688, "learning_rate": 7.65e-05, "loss": 0.5332, "step": 23500 }, { "epoch": 10.686675761709868, "eval_accuracy": 0.8929225353778606, "eval_loss": 0.6218137145042419, "eval_runtime": 58.6581, "eval_samples_per_second": 259.777, "eval_steps_per_second": 8.132, "step": 23500 }, { "epoch": 10.914051841746248, "grad_norm": 2.3024277687072754, "learning_rate": 7.6e-05, "loss": 0.5323, "step": 24000 }, { "epoch": 10.914051841746248, "eval_accuracy": 0.8931125650111673, "eval_loss": 0.6083813309669495, "eval_runtime": 57.6897, "eval_samples_per_second": 264.137, "eval_steps_per_second": 8.268, "step": 24000 }, { "epoch": 11.141427921782629, "grad_norm": 3.06503963470459, "learning_rate": 7.55e-05, "loss": 0.5239, "step": 24500 }, { "epoch": 11.141427921782629, "eval_accuracy": 0.8939010799940649, "eval_loss": 0.637162446975708, "eval_runtime": 57.7617, "eval_samples_per_second": 263.808, "eval_steps_per_second": 8.258, "step": 24500 }, { "epoch": 11.368804001819008, "grad_norm": 2.7696726322174072, "learning_rate": 7.500000000000001e-05, "loss": 0.5261, "step": 25000 }, { "epoch": 11.368804001819008, "eval_accuracy": 0.8937561222232755, "eval_loss": 0.6276402473449707, "eval_runtime": 57.6676, "eval_samples_per_second": 264.239, "eval_steps_per_second": 8.272, "step": 25000 }, { "epoch": 11.596180081855389, "grad_norm": 2.751497268676758, "learning_rate": 7.450000000000001e-05, "loss": 0.5175, "step": 25500 }, { "epoch": 11.596180081855389, "eval_accuracy": 0.8951661397774281, "eval_loss": 0.6050118207931519, "eval_runtime": 57.5452, "eval_samples_per_second": 264.8, "eval_steps_per_second": 8.289, "step": 25500 }, { "epoch": 11.82355616189177, "grad_norm": 2.554222822189331, "learning_rate": 7.4e-05, "loss": 0.5262, "step": 26000 }, { "epoch": 11.82355616189177, "eval_accuracy": 0.8952365458244355, "eval_loss": 0.6047869324684143, "eval_runtime": 58.525, "eval_samples_per_second": 260.367, "eval_steps_per_second": 8.15, "step": 26000 }, { "epoch": 12.05093224192815, "grad_norm": 2.832977294921875, "learning_rate": 7.35e-05, "loss": 0.5189, "step": 26500 }, { "epoch": 12.05093224192815, "eval_accuracy": 0.8951059547768438, "eval_loss": 0.6053177714347839, "eval_runtime": 57.7314, "eval_samples_per_second": 263.946, "eval_steps_per_second": 8.262, "step": 26500 }, { "epoch": 12.278308321964529, "grad_norm": 2.9270408153533936, "learning_rate": 7.3e-05, "loss": 0.5162, "step": 27000 }, { "epoch": 12.278308321964529, "eval_accuracy": 0.8956970707551482, "eval_loss": 0.600378692150116, "eval_runtime": 68.0954, "eval_samples_per_second": 223.774, "eval_steps_per_second": 7.005, "step": 27000 }, { "epoch": 12.50568440200091, "grad_norm": 4.0453782081604, "learning_rate": 7.25e-05, "loss": 0.5111, "step": 27500 }, { "epoch": 12.50568440200091, "eval_accuracy": 0.8950593575694928, "eval_loss": 0.601889431476593, "eval_runtime": 69.1002, "eval_samples_per_second": 220.52, "eval_steps_per_second": 6.903, "step": 27500 }, { "epoch": 12.73306048203729, "grad_norm": 1.8677308559417725, "learning_rate": 7.2e-05, "loss": 0.51, "step": 28000 }, { "epoch": 12.73306048203729, "eval_accuracy": 0.8962058115277431, "eval_loss": 0.6174491047859192, "eval_runtime": 69.7483, "eval_samples_per_second": 218.471, "eval_steps_per_second": 6.839, "step": 28000 }, { "epoch": 12.96043656207367, "grad_norm": 2.551996946334839, "learning_rate": 7.15e-05, "loss": 0.5109, "step": 28500 }, { "epoch": 12.96043656207367, "eval_accuracy": 0.8969674492010259, "eval_loss": 0.6083965301513672, "eval_runtime": 66.6755, "eval_samples_per_second": 228.54, "eval_steps_per_second": 7.154, "step": 28500 }, { "epoch": 13.18781264211005, "grad_norm": 2.474005699157715, "learning_rate": 7.1e-05, "loss": 0.502, "step": 29000 }, { "epoch": 13.18781264211005, "eval_accuracy": 0.8963457500841246, "eval_loss": 0.6135991215705872, "eval_runtime": 67.7791, "eval_samples_per_second": 224.818, "eval_steps_per_second": 7.038, "step": 29000 }, { "epoch": 13.41518872214643, "grad_norm": 3.0140531063079834, "learning_rate": 7.05e-05, "loss": 0.5, "step": 29500 }, { "epoch": 13.41518872214643, "eval_accuracy": 0.8979126349984058, "eval_loss": 0.6121774911880493, "eval_runtime": 69.2043, "eval_samples_per_second": 220.189, "eval_steps_per_second": 6.893, "step": 29500 }, { "epoch": 13.642564802182811, "grad_norm": 2.9221041202545166, "learning_rate": 7e-05, "loss": 0.5024, "step": 30000 }, { "epoch": 13.642564802182811, "eval_accuracy": 0.8981451519977776, "eval_loss": 0.59588623046875, "eval_runtime": 68.802, "eval_samples_per_second": 221.476, "eval_steps_per_second": 6.933, "step": 30000 }, { "epoch": 13.86994088221919, "grad_norm": 3.138761043548584, "learning_rate": 6.95e-05, "loss": 0.4998, "step": 30500 }, { "epoch": 13.86994088221919, "eval_accuracy": 0.8978226704469172, "eval_loss": 0.5944364666938782, "eval_runtime": 66.5107, "eval_samples_per_second": 229.106, "eval_steps_per_second": 7.172, "step": 30500 }, { "epoch": 14.09731696225557, "grad_norm": 2.7592108249664307, "learning_rate": 6.9e-05, "loss": 0.4854, "step": 31000 }, { "epoch": 14.09731696225557, "eval_accuracy": 0.898367703578009, "eval_loss": 0.5859193205833435, "eval_runtime": 66.7713, "eval_samples_per_second": 228.212, "eval_steps_per_second": 7.144, "step": 31000 }, { "epoch": 14.324693042291951, "grad_norm": 2.9050989151000977, "learning_rate": 6.850000000000001e-05, "loss": 0.4882, "step": 31500 }, { "epoch": 14.324693042291951, "eval_accuracy": 0.8975941653774508, "eval_loss": 0.60106360912323, "eval_runtime": 68.9335, "eval_samples_per_second": 221.054, "eval_steps_per_second": 6.92, "step": 31500 }, { "epoch": 14.552069122328332, "grad_norm": 2.3342740535736084, "learning_rate": 6.800000000000001e-05, "loss": 0.4841, "step": 32000 }, { "epoch": 14.552069122328332, "eval_accuracy": 0.8990730423650459, "eval_loss": 0.584600567817688, "eval_runtime": 67.2181, "eval_samples_per_second": 226.695, "eval_steps_per_second": 7.096, "step": 32000 }, { "epoch": 14.77944520236471, "grad_norm": 2.1228647232055664, "learning_rate": 6.750000000000001e-05, "loss": 0.4844, "step": 32500 }, { "epoch": 14.77944520236471, "eval_accuracy": 0.8990018801499072, "eval_loss": 0.5907680988311768, "eval_runtime": 59.205, "eval_samples_per_second": 257.377, "eval_steps_per_second": 8.057, "step": 32500 }, { "epoch": 15.006821282401091, "grad_norm": 3.1188831329345703, "learning_rate": 6.7e-05, "loss": 0.4831, "step": 33000 }, { "epoch": 15.006821282401091, "eval_accuracy": 0.8995549513632177, "eval_loss": 0.5859436392784119, "eval_runtime": 58.4781, "eval_samples_per_second": 260.576, "eval_steps_per_second": 8.157, "step": 33000 }, { "epoch": 15.234197362437472, "grad_norm": 3.4289638996124268, "learning_rate": 6.65e-05, "loss": 0.4783, "step": 33500 }, { "epoch": 15.234197362437472, "eval_accuracy": 0.8997874968645139, "eval_loss": 0.586015522480011, "eval_runtime": 58.2205, "eval_samples_per_second": 261.729, "eval_steps_per_second": 8.193, "step": 33500 }, { "epoch": 15.461573442473851, "grad_norm": 2.7651004791259766, "learning_rate": 6.6e-05, "loss": 0.4752, "step": 34000 }, { "epoch": 15.461573442473851, "eval_accuracy": 0.9008641609570693, "eval_loss": 0.5896708369255066, "eval_runtime": 59.224, "eval_samples_per_second": 257.294, "eval_steps_per_second": 8.054, "step": 34000 }, { "epoch": 15.688949522510232, "grad_norm": 2.334015130996704, "learning_rate": 6.55e-05, "loss": 0.4751, "step": 34500 }, { "epoch": 15.688949522510232, "eval_accuracy": 0.8998034774235811, "eval_loss": 0.5798426270484924, "eval_runtime": 58.5323, "eval_samples_per_second": 260.335, "eval_steps_per_second": 8.149, "step": 34500 }, { "epoch": 15.916325602546612, "grad_norm": 1.9268873929977417, "learning_rate": 6.500000000000001e-05, "loss": 0.4781, "step": 35000 }, { "epoch": 15.916325602546612, "eval_accuracy": 0.900376427009191, "eval_loss": 0.574535071849823, "eval_runtime": 59.3062, "eval_samples_per_second": 256.938, "eval_steps_per_second": 8.043, "step": 35000 }, { "epoch": 16.143701682582993, "grad_norm": 3.1282005310058594, "learning_rate": 6.450000000000001e-05, "loss": 0.4741, "step": 35500 }, { "epoch": 16.143701682582993, "eval_accuracy": 0.9008166774124221, "eval_loss": 0.5920200347900391, "eval_runtime": 67.8707, "eval_samples_per_second": 224.515, "eval_steps_per_second": 7.028, "step": 35500 }, { "epoch": 16.37107776261937, "grad_norm": 2.4809906482696533, "learning_rate": 6.400000000000001e-05, "loss": 0.4764, "step": 36000 }, { "epoch": 16.37107776261937, "eval_accuracy": 0.9007614317314762, "eval_loss": 0.5653102397918701, "eval_runtime": 67.6973, "eval_samples_per_second": 225.09, "eval_steps_per_second": 7.046, "step": 36000 }, { "epoch": 16.598453842655754, "grad_norm": 2.460899591445923, "learning_rate": 6.35e-05, "loss": 0.4693, "step": 36500 }, { "epoch": 16.598453842655754, "eval_accuracy": 0.9008501214346764, "eval_loss": 0.5879611968994141, "eval_runtime": 66.4301, "eval_samples_per_second": 229.384, "eval_steps_per_second": 7.18, "step": 36500 }, { "epoch": 16.825829922692133, "grad_norm": 2.389681816101074, "learning_rate": 6.3e-05, "loss": 0.4638, "step": 37000 }, { "epoch": 16.825829922692133, "eval_accuracy": 0.9024308529635798, "eval_loss": 0.5856647491455078, "eval_runtime": 66.0584, "eval_samples_per_second": 230.675, "eval_steps_per_second": 7.221, "step": 37000 }, { "epoch": 17.053206002728512, "grad_norm": 2.5536763668060303, "learning_rate": 6.25e-05, "loss": 0.4643, "step": 37500 }, { "epoch": 17.053206002728512, "eval_accuracy": 0.9017984039775945, "eval_loss": 0.5838255882263184, "eval_runtime": 66.4217, "eval_samples_per_second": 229.413, "eval_steps_per_second": 7.181, "step": 37500 }, { "epoch": 17.280582082764894, "grad_norm": 2.8841841220855713, "learning_rate": 6.2e-05, "loss": 0.4559, "step": 38000 }, { "epoch": 17.280582082764894, "eval_accuracy": 0.9016667691546006, "eval_loss": 0.5722949504852295, "eval_runtime": 67.5295, "eval_samples_per_second": 225.65, "eval_steps_per_second": 7.064, "step": 38000 }, { "epoch": 17.507958162801273, "grad_norm": 2.0763895511627197, "learning_rate": 6.15e-05, "loss": 0.4649, "step": 38500 }, { "epoch": 17.507958162801273, "eval_accuracy": 0.9028016315170558, "eval_loss": 0.5597889423370361, "eval_runtime": 68.1673, "eval_samples_per_second": 223.538, "eval_steps_per_second": 6.997, "step": 38500 }, { "epoch": 17.735334242837652, "grad_norm": 2.7098090648651123, "learning_rate": 6.1e-05, "loss": 0.4619, "step": 39000 }, { "epoch": 17.735334242837652, "eval_accuracy": 0.9032880942757541, "eval_loss": 0.5635744333267212, "eval_runtime": 65.7624, "eval_samples_per_second": 231.713, "eval_steps_per_second": 7.253, "step": 39000 }, { "epoch": 17.962710322874035, "grad_norm": 2.361464023590088, "learning_rate": 6.05e-05, "loss": 0.4554, "step": 39500 }, { "epoch": 17.962710322874035, "eval_accuracy": 0.9026038090276955, "eval_loss": 0.5572078227996826, "eval_runtime": 65.3403, "eval_samples_per_second": 233.21, "eval_steps_per_second": 7.3, "step": 39500 }, { "epoch": 18.190086402910413, "grad_norm": 2.2354624271392822, "learning_rate": 6e-05, "loss": 0.4516, "step": 40000 }, { "epoch": 18.190086402910413, "eval_accuracy": 0.9029999052162649, "eval_loss": 0.5721431970596313, "eval_runtime": 66.8398, "eval_samples_per_second": 227.978, "eval_steps_per_second": 7.136, "step": 40000 }, { "epoch": 18.417462482946792, "grad_norm": 2.5873236656188965, "learning_rate": 5.95e-05, "loss": 0.4488, "step": 40500 }, { "epoch": 18.417462482946792, "eval_accuracy": 0.9031028868580921, "eval_loss": 0.577458381652832, "eval_runtime": 65.9075, "eval_samples_per_second": 231.203, "eval_steps_per_second": 7.237, "step": 40500 }, { "epoch": 18.644838562983175, "grad_norm": 2.14237642288208, "learning_rate": 5.9e-05, "loss": 0.4529, "step": 41000 }, { "epoch": 18.644838562983175, "eval_accuracy": 0.9034652567356332, "eval_loss": 0.5635027289390564, "eval_runtime": 57.1062, "eval_samples_per_second": 266.836, "eval_steps_per_second": 8.353, "step": 41000 }, { "epoch": 18.872214643019554, "grad_norm": 2.687331199645996, "learning_rate": 5.85e-05, "loss": 0.4457, "step": 41500 }, { "epoch": 18.872214643019554, "eval_accuracy": 0.9038571769197852, "eval_loss": 0.5578777194023132, "eval_runtime": 57.0961, "eval_samples_per_second": 266.883, "eval_steps_per_second": 8.354, "step": 41500 }, { "epoch": 19.099590723055936, "grad_norm": 2.4610774517059326, "learning_rate": 5.8e-05, "loss": 0.4456, "step": 42000 }, { "epoch": 19.099590723055936, "eval_accuracy": 0.9040196805071515, "eval_loss": 0.570646345615387, "eval_runtime": 57.0535, "eval_samples_per_second": 267.083, "eval_steps_per_second": 8.361, "step": 42000 }, { "epoch": 19.326966803092315, "grad_norm": 2.7358736991882324, "learning_rate": 5.7499999999999995e-05, "loss": 0.4439, "step": 42500 }, { "epoch": 19.326966803092315, "eval_accuracy": 0.9047236917555364, "eval_loss": 0.5603917837142944, "eval_runtime": 57.9311, "eval_samples_per_second": 263.037, "eval_steps_per_second": 8.234, "step": 42500 }, { "epoch": 19.554342883128694, "grad_norm": 2.6887753009796143, "learning_rate": 5.6999999999999996e-05, "loss": 0.4371, "step": 43000 }, { "epoch": 19.554342883128694, "eval_accuracy": 0.9050488743420761, "eval_loss": 0.5606555938720703, "eval_runtime": 57.2573, "eval_samples_per_second": 266.132, "eval_steps_per_second": 8.331, "step": 43000 }, { "epoch": 19.781718963165076, "grad_norm": 4.247918128967285, "learning_rate": 5.65e-05, "loss": 0.4419, "step": 43500 }, { "epoch": 19.781718963165076, "eval_accuracy": 0.9042590959223571, "eval_loss": 0.5585463643074036, "eval_runtime": 66.8598, "eval_samples_per_second": 227.91, "eval_steps_per_second": 7.134, "step": 43500 }, { "epoch": 20.009095043201455, "grad_norm": 2.268353223800659, "learning_rate": 5.6000000000000006e-05, "loss": 0.4415, "step": 44000 }, { "epoch": 20.009095043201455, "eval_accuracy": 0.9043696805910753, "eval_loss": 0.5533380508422852, "eval_runtime": 75.8782, "eval_samples_per_second": 200.822, "eval_steps_per_second": 6.286, "step": 44000 }, { "epoch": 20.236471123237834, "grad_norm": 2.3621208667755127, "learning_rate": 5.550000000000001e-05, "loss": 0.4363, "step": 44500 }, { "epoch": 20.236471123237834, "eval_accuracy": 0.9052580054316933, "eval_loss": 0.561882734298706, "eval_runtime": 76.0871, "eval_samples_per_second": 200.27, "eval_steps_per_second": 6.269, "step": 44500 }, { "epoch": 20.463847203274216, "grad_norm": 2.7799289226531982, "learning_rate": 5.500000000000001e-05, "loss": 0.4327, "step": 45000 }, { "epoch": 20.463847203274216, "eval_accuracy": 0.9050759625981704, "eval_loss": 0.5621650815010071, "eval_runtime": 75.1467, "eval_samples_per_second": 202.777, "eval_steps_per_second": 6.348, "step": 45000 }, { "epoch": 20.691223283310595, "grad_norm": 2.6320040225982666, "learning_rate": 5.45e-05, "loss": 0.4341, "step": 45500 }, { "epoch": 20.691223283310595, "eval_accuracy": 0.90537806159975, "eval_loss": 0.5658455491065979, "eval_runtime": 75.0197, "eval_samples_per_second": 203.12, "eval_steps_per_second": 6.358, "step": 45500 }, { "epoch": 20.918599363346974, "grad_norm": 2.626370668411255, "learning_rate": 5.4000000000000005e-05, "loss": 0.436, "step": 46000 }, { "epoch": 20.918599363346974, "eval_accuracy": 0.9057144525510582, "eval_loss": 0.5474947094917297, "eval_runtime": 74.8407, "eval_samples_per_second": 203.606, "eval_steps_per_second": 6.374, "step": 46000 }, { "epoch": 21.145975443383357, "grad_norm": 2.2103912830352783, "learning_rate": 5.3500000000000006e-05, "loss": 0.43, "step": 46500 }, { "epoch": 21.145975443383357, "eval_accuracy": 0.9056834598326479, "eval_loss": 0.5598079562187195, "eval_runtime": 67.154, "eval_samples_per_second": 226.911, "eval_steps_per_second": 7.103, "step": 46500 }, { "epoch": 21.373351523419736, "grad_norm": 2.3939335346221924, "learning_rate": 5.300000000000001e-05, "loss": 0.4253, "step": 47000 }, { "epoch": 21.373351523419736, "eval_accuracy": 0.9061305526247623, "eval_loss": 0.5633291602134705, "eval_runtime": 66.5353, "eval_samples_per_second": 229.021, "eval_steps_per_second": 7.169, "step": 47000 }, { "epoch": 21.600727603456118, "grad_norm": 2.4314322471618652, "learning_rate": 5.25e-05, "loss": 0.4276, "step": 47500 }, { "epoch": 21.600727603456118, "eval_accuracy": 0.9062991778333528, "eval_loss": 0.5566443800926208, "eval_runtime": 68.0296, "eval_samples_per_second": 223.991, "eval_steps_per_second": 7.012, "step": 47500 }, { "epoch": 21.828103683492497, "grad_norm": 2.792711019515991, "learning_rate": 5.2000000000000004e-05, "loss": 0.4235, "step": 48000 }, { "epoch": 21.828103683492497, "eval_accuracy": 0.9063092114708347, "eval_loss": 0.5603668093681335, "eval_runtime": 67.1839, "eval_samples_per_second": 226.81, "eval_steps_per_second": 7.1, "step": 48000 }, { "epoch": 22.055479763528876, "grad_norm": 3.2149298191070557, "learning_rate": 5.1500000000000005e-05, "loss": 0.4293, "step": 48500 }, { "epoch": 22.055479763528876, "eval_accuracy": 0.9061439320028164, "eval_loss": 0.5632808804512024, "eval_runtime": 55.9195, "eval_samples_per_second": 272.499, "eval_steps_per_second": 8.53, "step": 48500 }, { "epoch": 22.282855843565258, "grad_norm": 2.1803324222564697, "learning_rate": 5.1000000000000006e-05, "loss": 0.4226, "step": 49000 }, { "epoch": 22.282855843565258, "eval_accuracy": 0.9075216851954307, "eval_loss": 0.5528887510299683, "eval_runtime": 56.7606, "eval_samples_per_second": 268.461, "eval_steps_per_second": 8.404, "step": 49000 }, { "epoch": 22.510231923601637, "grad_norm": 2.293896436691284, "learning_rate": 5.05e-05, "loss": 0.4193, "step": 49500 }, { "epoch": 22.510231923601637, "eval_accuracy": 0.907387491792463, "eval_loss": 0.5482432246208191, "eval_runtime": 57.104, "eval_samples_per_second": 266.846, "eval_steps_per_second": 8.353, "step": 49500 }, { "epoch": 22.737608003638016, "grad_norm": 2.2139928340911865, "learning_rate": 5e-05, "loss": 0.4158, "step": 50000 }, { "epoch": 22.737608003638016, "eval_accuracy": 0.9077673279064253, "eval_loss": 0.5403118133544922, "eval_runtime": 55.9516, "eval_samples_per_second": 272.342, "eval_steps_per_second": 8.525, "step": 50000 }, { "epoch": 22.9649840836744, "grad_norm": 2.241081476211548, "learning_rate": 4.9500000000000004e-05, "loss": 0.4174, "step": 50500 }, { "epoch": 22.9649840836744, "eval_accuracy": 0.9083347677575133, "eval_loss": 0.5309577584266663, "eval_runtime": 56.8789, "eval_samples_per_second": 267.903, "eval_steps_per_second": 8.386, "step": 50500 }, { "epoch": 23.192360163710777, "grad_norm": 3.3256096839904785, "learning_rate": 4.9e-05, "loss": 0.4191, "step": 51000 }, { "epoch": 23.192360163710777, "eval_accuracy": 0.9088304562502367, "eval_loss": 0.5443009734153748, "eval_runtime": 56.8135, "eval_samples_per_second": 268.211, "eval_steps_per_second": 8.396, "step": 51000 }, { "epoch": 23.419736243747156, "grad_norm": 2.716857433319092, "learning_rate": 4.85e-05, "loss": 0.4071, "step": 51500 }, { "epoch": 23.419736243747156, "eval_accuracy": 0.9077237472285563, "eval_loss": 0.5535444617271423, "eval_runtime": 56.7929, "eval_samples_per_second": 268.308, "eval_steps_per_second": 8.399, "step": 51500 }, { "epoch": 23.64711232378354, "grad_norm": 2.466326951980591, "learning_rate": 4.8e-05, "loss": 0.4159, "step": 52000 }, { "epoch": 23.64711232378354, "eval_accuracy": 0.9093382373851798, "eval_loss": 0.5515927076339722, "eval_runtime": 56.8114, "eval_samples_per_second": 268.221, "eval_steps_per_second": 8.396, "step": 52000 }, { "epoch": 23.874488403819917, "grad_norm": 2.6443376541137695, "learning_rate": 4.75e-05, "loss": 0.4159, "step": 52500 }, { "epoch": 23.874488403819917, "eval_accuracy": 0.9080766240994744, "eval_loss": 0.5417291522026062, "eval_runtime": 56.0043, "eval_samples_per_second": 272.086, "eval_steps_per_second": 8.517, "step": 52500 }, { "epoch": 24.1018644838563, "grad_norm": 2.359405994415283, "learning_rate": 4.7e-05, "loss": 0.4128, "step": 53000 }, { "epoch": 24.1018644838563, "eval_accuracy": 0.9082854247369369, "eval_loss": 0.546323299407959, "eval_runtime": 55.971, "eval_samples_per_second": 272.248, "eval_steps_per_second": 8.522, "step": 53000 }, { "epoch": 24.32924056389268, "grad_norm": 2.0880720615386963, "learning_rate": 4.6500000000000005e-05, "loss": 0.4034, "step": 53500 }, { "epoch": 24.32924056389268, "eval_accuracy": 0.9087918433807937, "eval_loss": 0.5541105270385742, "eval_runtime": 58.891, "eval_samples_per_second": 258.749, "eval_steps_per_second": 8.1, "step": 53500 }, { "epoch": 24.556616643929058, "grad_norm": 2.977452039718628, "learning_rate": 4.600000000000001e-05, "loss": 0.4051, "step": 54000 }, { "epoch": 24.556616643929058, "eval_accuracy": 0.908003587059478, "eval_loss": 0.5499656200408936, "eval_runtime": 57.0241, "eval_samples_per_second": 267.22, "eval_steps_per_second": 8.365, "step": 54000 }, { "epoch": 24.78399272396544, "grad_norm": 2.6565568447113037, "learning_rate": 4.55e-05, "loss": 0.3973, "step": 54500 }, { "epoch": 24.78399272396544, "eval_accuracy": 0.9084627466876093, "eval_loss": 0.5408248901367188, "eval_runtime": 56.052, "eval_samples_per_second": 271.855, "eval_steps_per_second": 8.51, "step": 54500 }, { "epoch": 25.01136880400182, "grad_norm": 2.017199993133545, "learning_rate": 4.5e-05, "loss": 0.4038, "step": 55000 }, { "epoch": 25.01136880400182, "eval_accuracy": 0.9091495750162909, "eval_loss": 0.531546413898468, "eval_runtime": 56.6501, "eval_samples_per_second": 268.984, "eval_steps_per_second": 8.42, "step": 55000 }, { "epoch": 25.238744884038198, "grad_norm": 2.50418758392334, "learning_rate": 4.4500000000000004e-05, "loss": 0.3993, "step": 55500 }, { "epoch": 25.238744884038198, "eval_accuracy": 0.9091283316273341, "eval_loss": 0.5372242331504822, "eval_runtime": 56.2979, "eval_samples_per_second": 270.667, "eval_steps_per_second": 8.473, "step": 55500 }, { "epoch": 25.46612096407458, "grad_norm": 2.4235289096832275, "learning_rate": 4.4000000000000006e-05, "loss": 0.3981, "step": 56000 }, { "epoch": 25.46612096407458, "eval_accuracy": 0.9090792662970008, "eval_loss": 0.5424542427062988, "eval_runtime": 56.0065, "eval_samples_per_second": 272.075, "eval_steps_per_second": 8.517, "step": 56000 }, { "epoch": 25.69349704411096, "grad_norm": 2.049229621887207, "learning_rate": 4.35e-05, "loss": 0.3965, "step": 56500 }, { "epoch": 25.69349704411096, "eval_accuracy": 0.9091030899558308, "eval_loss": 0.5371273159980774, "eval_runtime": 56.9032, "eval_samples_per_second": 267.788, "eval_steps_per_second": 8.383, "step": 56500 }, { "epoch": 25.920873124147338, "grad_norm": 3.2039127349853516, "learning_rate": 4.3e-05, "loss": 0.3956, "step": 57000 }, { "epoch": 25.920873124147338, "eval_accuracy": 0.9102188871200084, "eval_loss": 0.5293972492218018, "eval_runtime": 56.0727, "eval_samples_per_second": 271.754, "eval_steps_per_second": 8.507, "step": 57000 }, { "epoch": 26.14824920418372, "grad_norm": 2.5481934547424316, "learning_rate": 4.25e-05, "loss": 0.3968, "step": 57500 }, { "epoch": 26.14824920418372, "eval_accuracy": 0.9097068103213896, "eval_loss": 0.5351966619491577, "eval_runtime": 55.9717, "eval_samples_per_second": 272.245, "eval_steps_per_second": 8.522, "step": 57500 }, { "epoch": 26.3756252842201, "grad_norm": 2.174415111541748, "learning_rate": 4.2e-05, "loss": 0.3902, "step": 58000 }, { "epoch": 26.3756252842201, "eval_accuracy": 0.9100280555809778, "eval_loss": 0.5361006855964661, "eval_runtime": 56.0896, "eval_samples_per_second": 271.672, "eval_steps_per_second": 8.504, "step": 58000 }, { "epoch": 26.60300136425648, "grad_norm": 2.4938621520996094, "learning_rate": 4.15e-05, "loss": 0.3919, "step": 58500 }, { "epoch": 26.60300136425648, "eval_accuracy": 0.9107032517948321, "eval_loss": 0.5426139831542969, "eval_runtime": 56.0504, "eval_samples_per_second": 271.863, "eval_steps_per_second": 8.51, "step": 58500 }, { "epoch": 26.83037744429286, "grad_norm": 2.729896306991577, "learning_rate": 4.1e-05, "loss": 0.3932, "step": 59000 }, { "epoch": 26.83037744429286, "eval_accuracy": 0.910556622486772, "eval_loss": 0.5372660756111145, "eval_runtime": 56.0573, "eval_samples_per_second": 271.829, "eval_steps_per_second": 8.509, "step": 59000 }, { "epoch": 27.05775352432924, "grad_norm": 2.150261163711548, "learning_rate": 4.05e-05, "loss": 0.3902, "step": 59500 }, { "epoch": 27.05775352432924, "eval_accuracy": 0.9114104771079771, "eval_loss": 0.5252653360366821, "eval_runtime": 56.1245, "eval_samples_per_second": 271.503, "eval_steps_per_second": 8.499, "step": 59500 }, { "epoch": 27.285129604365622, "grad_norm": 2.6134257316589355, "learning_rate": 4e-05, "loss": 0.393, "step": 60000 }, { "epoch": 27.285129604365622, "eval_accuracy": 0.910748714906454, "eval_loss": 0.5393661260604858, "eval_runtime": 56.1182, "eval_samples_per_second": 271.534, "eval_steps_per_second": 8.5, "step": 60000 }, { "epoch": 27.512505684402, "grad_norm": 3.204314947128296, "learning_rate": 3.9500000000000005e-05, "loss": 0.3869, "step": 60500 }, { "epoch": 27.512505684402, "eval_accuracy": 0.9109071564105236, "eval_loss": 0.5372085571289062, "eval_runtime": 55.938, "eval_samples_per_second": 272.409, "eval_steps_per_second": 8.527, "step": 60500 }, { "epoch": 27.73988176443838, "grad_norm": 2.1017961502075195, "learning_rate": 3.9000000000000006e-05, "loss": 0.3869, "step": 61000 }, { "epoch": 27.73988176443838, "eval_accuracy": 0.911014457937958, "eval_loss": 0.5260709524154663, "eval_runtime": 56.0593, "eval_samples_per_second": 271.819, "eval_steps_per_second": 8.509, "step": 61000 }, { "epoch": 27.967257844474762, "grad_norm": 1.7981553077697754, "learning_rate": 3.85e-05, "loss": 0.3817, "step": 61500 }, { "epoch": 27.967257844474762, "eval_accuracy": 0.9113206772639144, "eval_loss": 0.532031238079071, "eval_runtime": 56.0692, "eval_samples_per_second": 271.771, "eval_steps_per_second": 8.507, "step": 61500 }, { "epoch": 28.19463392451114, "grad_norm": 2.4235992431640625, "learning_rate": 3.8e-05, "loss": 0.3781, "step": 62000 }, { "epoch": 28.19463392451114, "eval_accuracy": 0.9117986222084103, "eval_loss": 0.5406020283699036, "eval_runtime": 56.0456, "eval_samples_per_second": 271.886, "eval_steps_per_second": 8.511, "step": 62000 }, { "epoch": 28.42201000454752, "grad_norm": 2.256941795349121, "learning_rate": 3.7500000000000003e-05, "loss": 0.3793, "step": 62500 }, { "epoch": 28.42201000454752, "eval_accuracy": 0.9126116944372022, "eval_loss": 0.5184915661811829, "eval_runtime": 56.0344, "eval_samples_per_second": 271.94, "eval_steps_per_second": 8.513, "step": 62500 }, { "epoch": 28.649386084583902, "grad_norm": 2.4860892295837402, "learning_rate": 3.7e-05, "loss": 0.3758, "step": 63000 }, { "epoch": 28.649386084583902, "eval_accuracy": 0.911845377066606, "eval_loss": 0.5190649628639221, "eval_runtime": 55.9764, "eval_samples_per_second": 272.222, "eval_steps_per_second": 8.521, "step": 63000 }, { "epoch": 28.87676216462028, "grad_norm": 2.1798510551452637, "learning_rate": 3.65e-05, "loss": 0.382, "step": 63500 }, { "epoch": 28.87676216462028, "eval_accuracy": 0.9118922815248525, "eval_loss": 0.5215730667114258, "eval_runtime": 56.0709, "eval_samples_per_second": 271.763, "eval_steps_per_second": 8.507, "step": 63500 }, { "epoch": 29.104138244656664, "grad_norm": 2.404370069503784, "learning_rate": 3.6e-05, "loss": 0.3763, "step": 64000 }, { "epoch": 29.104138244656664, "eval_accuracy": 0.912145037637229, "eval_loss": 0.5312708020210266, "eval_runtime": 56.2338, "eval_samples_per_second": 270.976, "eval_steps_per_second": 8.482, "step": 64000 }, { "epoch": 29.331514324693043, "grad_norm": 2.4613826274871826, "learning_rate": 3.55e-05, "loss": 0.3788, "step": 64500 }, { "epoch": 29.331514324693043, "eval_accuracy": 0.9128715433606646, "eval_loss": 0.5222127437591553, "eval_runtime": 56.8998, "eval_samples_per_second": 267.804, "eval_steps_per_second": 8.383, "step": 64500 }, { "epoch": 29.55889040472942, "grad_norm": 3.3356547355651855, "learning_rate": 3.5e-05, "loss": 0.3755, "step": 65000 }, { "epoch": 29.55889040472942, "eval_accuracy": 0.913125950801356, "eval_loss": 0.5145973563194275, "eval_runtime": 56.8962, "eval_samples_per_second": 267.821, "eval_steps_per_second": 8.384, "step": 65000 }, { "epoch": 29.786266484765804, "grad_norm": 2.2212953567504883, "learning_rate": 3.45e-05, "loss": 0.3788, "step": 65500 }, { "epoch": 29.786266484765804, "eval_accuracy": 0.9125449385052034, "eval_loss": 0.531129002571106, "eval_runtime": 56.1683, "eval_samples_per_second": 271.292, "eval_steps_per_second": 8.492, "step": 65500 }, { "epoch": 30.013642564802183, "grad_norm": 2.8541479110717773, "learning_rate": 3.4000000000000007e-05, "loss": 0.3737, "step": 66000 }, { "epoch": 30.013642564802183, "eval_accuracy": 0.9125566886698588, "eval_loss": 0.5407569408416748, "eval_runtime": 56.0905, "eval_samples_per_second": 271.668, "eval_steps_per_second": 8.504, "step": 66000 }, { "epoch": 30.24101864483856, "grad_norm": 2.438603162765503, "learning_rate": 3.35e-05, "loss": 0.3702, "step": 66500 }, { "epoch": 30.24101864483856, "eval_accuracy": 0.9129054862362733, "eval_loss": 0.5133882761001587, "eval_runtime": 57.1245, "eval_samples_per_second": 266.751, "eval_steps_per_second": 8.35, "step": 66500 }, { "epoch": 30.468394724874944, "grad_norm": 2.1143336296081543, "learning_rate": 3.3e-05, "loss": 0.3729, "step": 67000 }, { "epoch": 30.468394724874944, "eval_accuracy": 0.9124566054408887, "eval_loss": 0.5418105125427246, "eval_runtime": 56.0088, "eval_samples_per_second": 272.065, "eval_steps_per_second": 8.517, "step": 67000 }, { "epoch": 30.695770804911323, "grad_norm": 2.317859649658203, "learning_rate": 3.2500000000000004e-05, "loss": 0.3662, "step": 67500 }, { "epoch": 30.695770804911323, "eval_accuracy": 0.9135011845911662, "eval_loss": 0.5446010828018188, "eval_runtime": 56.0915, "eval_samples_per_second": 271.663, "eval_steps_per_second": 8.504, "step": 67500 }, { "epoch": 30.923146884947702, "grad_norm": 2.983668088912964, "learning_rate": 3.2000000000000005e-05, "loss": 0.3647, "step": 68000 }, { "epoch": 30.923146884947702, "eval_accuracy": 0.9138354207606285, "eval_loss": 0.527925968170166, "eval_runtime": 56.0461, "eval_samples_per_second": 271.883, "eval_steps_per_second": 8.511, "step": 68000 }, { "epoch": 31.150522964984084, "grad_norm": 2.1927847862243652, "learning_rate": 3.15e-05, "loss": 0.3683, "step": 68500 }, { "epoch": 31.150522964984084, "eval_accuracy": 0.9147327240782173, "eval_loss": 0.5225592255592346, "eval_runtime": 57.0493, "eval_samples_per_second": 267.102, "eval_steps_per_second": 8.361, "step": 68500 }, { "epoch": 31.377899045020463, "grad_norm": 2.345428228378296, "learning_rate": 3.1e-05, "loss": 0.3628, "step": 69000 }, { "epoch": 31.377899045020463, "eval_accuracy": 0.913656255193207, "eval_loss": 0.5174685716629028, "eval_runtime": 55.9136, "eval_samples_per_second": 272.528, "eval_steps_per_second": 8.531, "step": 69000 }, { "epoch": 31.605275125056846, "grad_norm": 2.5066728591918945, "learning_rate": 3.05e-05, "loss": 0.3651, "step": 69500 }, { "epoch": 31.605275125056846, "eval_accuracy": 0.9143142595628372, "eval_loss": 0.5336447358131409, "eval_runtime": 56.0707, "eval_samples_per_second": 271.764, "eval_steps_per_second": 8.507, "step": 69500 }, { "epoch": 31.832651205093224, "grad_norm": 2.5718226432800293, "learning_rate": 3e-05, "loss": 0.3621, "step": 70000 }, { "epoch": 31.832651205093224, "eval_accuracy": 0.914124758029471, "eval_loss": 0.5136735439300537, "eval_runtime": 56.0417, "eval_samples_per_second": 271.905, "eval_steps_per_second": 8.512, "step": 70000 }, { "epoch": 32.06002728512961, "grad_norm": 3.0197207927703857, "learning_rate": 2.95e-05, "loss": 0.3598, "step": 70500 }, { "epoch": 32.06002728512961, "eval_accuracy": 0.9143737680741543, "eval_loss": 0.5096654295921326, "eval_runtime": 56.0653, "eval_samples_per_second": 271.79, "eval_steps_per_second": 8.508, "step": 70500 }, { "epoch": 32.287403365165986, "grad_norm": 2.932882785797119, "learning_rate": 2.9e-05, "loss": 0.3583, "step": 71000 }, { "epoch": 32.287403365165986, "eval_accuracy": 0.9142125358692816, "eval_loss": 0.5173851251602173, "eval_runtime": 56.0814, "eval_samples_per_second": 271.712, "eval_steps_per_second": 8.505, "step": 71000 }, { "epoch": 32.514779445202365, "grad_norm": 1.9991718530654907, "learning_rate": 2.8499999999999998e-05, "loss": 0.3542, "step": 71500 }, { "epoch": 32.514779445202365, "eval_accuracy": 0.9147026168203635, "eval_loss": 0.522916316986084, "eval_runtime": 55.8503, "eval_samples_per_second": 272.837, "eval_steps_per_second": 8.541, "step": 71500 }, { "epoch": 32.74215552523874, "grad_norm": 1.8940651416778564, "learning_rate": 2.8000000000000003e-05, "loss": 0.356, "step": 72000 }, { "epoch": 32.74215552523874, "eval_accuracy": 0.9140601083755969, "eval_loss": 0.5267335176467896, "eval_runtime": 56.0286, "eval_samples_per_second": 271.968, "eval_steps_per_second": 8.514, "step": 72000 }, { "epoch": 32.96953160527512, "grad_norm": 2.228545665740967, "learning_rate": 2.7500000000000004e-05, "loss": 0.3554, "step": 72500 }, { "epoch": 32.96953160527512, "eval_accuracy": 0.9148385388009793, "eval_loss": 0.518826425075531, "eval_runtime": 57.0798, "eval_samples_per_second": 266.96, "eval_steps_per_second": 8.357, "step": 72500 }, { "epoch": 33.19690768531151, "grad_norm": 1.9578146934509277, "learning_rate": 2.7000000000000002e-05, "loss": 0.3503, "step": 73000 }, { "epoch": 33.19690768531151, "eval_accuracy": 0.9152538892188704, "eval_loss": 0.5155122876167297, "eval_runtime": 56.9103, "eval_samples_per_second": 267.755, "eval_steps_per_second": 8.382, "step": 73000 }, { "epoch": 33.42428376534789, "grad_norm": 2.4094908237457275, "learning_rate": 2.6500000000000004e-05, "loss": 0.3557, "step": 73500 }, { "epoch": 33.42428376534789, "eval_accuracy": 0.9150505399718056, "eval_loss": 0.5098891854286194, "eval_runtime": 55.8856, "eval_samples_per_second": 272.664, "eval_steps_per_second": 8.535, "step": 73500 }, { "epoch": 33.651659845384266, "grad_norm": 3.974923610687256, "learning_rate": 2.6000000000000002e-05, "loss": 0.3504, "step": 74000 }, { "epoch": 33.651659845384266, "eval_accuracy": 0.9151609378276823, "eval_loss": 0.5117126703262329, "eval_runtime": 56.0457, "eval_samples_per_second": 271.885, "eval_steps_per_second": 8.511, "step": 74000 }, { "epoch": 33.879035925420645, "grad_norm": 2.490255355834961, "learning_rate": 2.5500000000000003e-05, "loss": 0.3543, "step": 74500 }, { "epoch": 33.879035925420645, "eval_accuracy": 0.9146644769193912, "eval_loss": 0.5268692970275879, "eval_runtime": 55.9308, "eval_samples_per_second": 272.444, "eval_steps_per_second": 8.528, "step": 74500 }, { "epoch": 34.106412005457024, "grad_norm": 2.831305742263794, "learning_rate": 2.5e-05, "loss": 0.352, "step": 75000 }, { "epoch": 34.106412005457024, "eval_accuracy": 0.9151447226136769, "eval_loss": 0.5093286037445068, "eval_runtime": 56.9838, "eval_samples_per_second": 267.409, "eval_steps_per_second": 8.371, "step": 75000 }, { "epoch": 34.3337880854934, "grad_norm": 2.085205554962158, "learning_rate": 2.45e-05, "loss": 0.3511, "step": 75500 }, { "epoch": 34.3337880854934, "eval_accuracy": 0.9152598801148045, "eval_loss": 0.5099524259567261, "eval_runtime": 56.0735, "eval_samples_per_second": 271.75, "eval_steps_per_second": 8.507, "step": 75500 }, { "epoch": 34.56116416552979, "grad_norm": 2.5950026512145996, "learning_rate": 2.4e-05, "loss": 0.3477, "step": 76000 }, { "epoch": 34.56116416552979, "eval_accuracy": 0.9159398865939414, "eval_loss": 0.5132637023925781, "eval_runtime": 56.0447, "eval_samples_per_second": 271.89, "eval_steps_per_second": 8.511, "step": 76000 }, { "epoch": 34.78854024556617, "grad_norm": 3.0509707927703857, "learning_rate": 2.35e-05, "loss": 0.3487, "step": 76500 }, { "epoch": 34.78854024556617, "eval_accuracy": 0.9158777972417345, "eval_loss": 0.5157153010368347, "eval_runtime": 56.0477, "eval_samples_per_second": 271.876, "eval_steps_per_second": 8.511, "step": 76500 }, { "epoch": 35.01591632560255, "grad_norm": 3.29341983795166, "learning_rate": 2.3000000000000003e-05, "loss": 0.3432, "step": 77000 }, { "epoch": 35.01591632560255, "eval_accuracy": 0.9159272981938116, "eval_loss": 0.503484308719635, "eval_runtime": 56.0514, "eval_samples_per_second": 271.857, "eval_steps_per_second": 8.51, "step": 77000 }, { "epoch": 35.243292405638925, "grad_norm": 2.7035512924194336, "learning_rate": 2.25e-05, "loss": 0.3468, "step": 77500 }, { "epoch": 35.243292405638925, "eval_accuracy": 0.9168826617753377, "eval_loss": 0.504048228263855, "eval_runtime": 56.0964, "eval_samples_per_second": 271.639, "eval_steps_per_second": 8.503, "step": 77500 }, { "epoch": 35.470668485675304, "grad_norm": 4.089804649353027, "learning_rate": 2.2000000000000003e-05, "loss": 0.3444, "step": 78000 }, { "epoch": 35.470668485675304, "eval_accuracy": 0.9165130415874907, "eval_loss": 0.5067149996757507, "eval_runtime": 56.0967, "eval_samples_per_second": 271.638, "eval_steps_per_second": 8.503, "step": 78000 }, { "epoch": 35.69804456571169, "grad_norm": 2.281663656234741, "learning_rate": 2.15e-05, "loss": 0.3391, "step": 78500 }, { "epoch": 35.69804456571169, "eval_accuracy": 0.9166254129979897, "eval_loss": 0.4899181127548218, "eval_runtime": 56.9423, "eval_samples_per_second": 267.604, "eval_steps_per_second": 8.377, "step": 78500 }, { "epoch": 35.92542064574807, "grad_norm": 2.6281392574310303, "learning_rate": 2.1e-05, "loss": 0.3395, "step": 79000 }, { "epoch": 35.92542064574807, "eval_accuracy": 0.9163551690154882, "eval_loss": 0.5145460367202759, "eval_runtime": 56.0529, "eval_samples_per_second": 271.85, "eval_steps_per_second": 8.51, "step": 79000 }, { "epoch": 36.15279672578445, "grad_norm": 2.2223994731903076, "learning_rate": 2.05e-05, "loss": 0.3405, "step": 79500 }, { "epoch": 36.15279672578445, "eval_accuracy": 0.9158962396841802, "eval_loss": 0.51621013879776, "eval_runtime": 56.078, "eval_samples_per_second": 271.729, "eval_steps_per_second": 8.506, "step": 79500 }, { "epoch": 36.38017280582083, "grad_norm": 2.688448190689087, "learning_rate": 2e-05, "loss": 0.3415, "step": 80000 }, { "epoch": 36.38017280582083, "eval_accuracy": 0.9164056168752696, "eval_loss": 0.5185515284538269, "eval_runtime": 56.0615, "eval_samples_per_second": 271.809, "eval_steps_per_second": 8.509, "step": 80000 }, { "epoch": 36.607548885857206, "grad_norm": 1.9620684385299683, "learning_rate": 1.9500000000000003e-05, "loss": 0.3375, "step": 80500 }, { "epoch": 36.607548885857206, "eval_accuracy": 0.9163537188934124, "eval_loss": 0.517730176448822, "eval_runtime": 56.9493, "eval_samples_per_second": 267.571, "eval_steps_per_second": 8.376, "step": 80500 }, { "epoch": 36.834924965893585, "grad_norm": 2.495645761489868, "learning_rate": 1.9e-05, "loss": 0.3398, "step": 81000 }, { "epoch": 36.834924965893585, "eval_accuracy": 0.9171486235060389, "eval_loss": 0.5006797313690186, "eval_runtime": 56.9369, "eval_samples_per_second": 267.63, "eval_steps_per_second": 8.378, "step": 81000 }, { "epoch": 37.06230104592997, "grad_norm": 2.4040660858154297, "learning_rate": 1.85e-05, "loss": 0.3393, "step": 81500 }, { "epoch": 37.06230104592997, "eval_accuracy": 0.9175943590821654, "eval_loss": 0.5150498151779175, "eval_runtime": 56.1005, "eval_samples_per_second": 271.62, "eval_steps_per_second": 8.503, "step": 81500 }, { "epoch": 37.28967712596635, "grad_norm": 2.7558135986328125, "learning_rate": 1.8e-05, "loss": 0.3344, "step": 82000 }, { "epoch": 37.28967712596635, "eval_accuracy": 0.9171837999528608, "eval_loss": 0.5036485195159912, "eval_runtime": 56.078, "eval_samples_per_second": 271.729, "eval_steps_per_second": 8.506, "step": 82000 }, { "epoch": 37.51705320600273, "grad_norm": 2.468700647354126, "learning_rate": 1.75e-05, "loss": 0.3352, "step": 82500 }, { "epoch": 37.51705320600273, "eval_accuracy": 0.9175697679179382, "eval_loss": 0.5030218958854675, "eval_runtime": 56.0873, "eval_samples_per_second": 271.684, "eval_steps_per_second": 8.505, "step": 82500 }, { "epoch": 37.74442928603911, "grad_norm": 2.5978448390960693, "learning_rate": 1.7000000000000003e-05, "loss": 0.3329, "step": 83000 }, { "epoch": 37.74442928603911, "eval_accuracy": 0.9171495853383843, "eval_loss": 0.5056445598602295, "eval_runtime": 56.0025, "eval_samples_per_second": 272.095, "eval_steps_per_second": 8.517, "step": 83000 }, { "epoch": 37.971805366075486, "grad_norm": 2.1514439582824707, "learning_rate": 1.65e-05, "loss": 0.3298, "step": 83500 }, { "epoch": 37.971805366075486, "eval_accuracy": 0.917162721884158, "eval_loss": 0.5005716681480408, "eval_runtime": 57.1445, "eval_samples_per_second": 266.658, "eval_steps_per_second": 8.347, "step": 83500 }, { "epoch": 38.19918144611187, "grad_norm": 2.6213526725769043, "learning_rate": 1.6000000000000003e-05, "loss": 0.3283, "step": 84000 }, { "epoch": 38.19918144611187, "eval_accuracy": 0.9176722555185908, "eval_loss": 0.5062026977539062, "eval_runtime": 56.1037, "eval_samples_per_second": 271.604, "eval_steps_per_second": 8.502, "step": 84000 }, { "epoch": 38.42655752614825, "grad_norm": 2.2421905994415283, "learning_rate": 1.55e-05, "loss": 0.327, "step": 84500 }, { "epoch": 38.42655752614825, "eval_accuracy": 0.918665843034967, "eval_loss": 0.4939974844455719, "eval_runtime": 56.9571, "eval_samples_per_second": 267.535, "eval_steps_per_second": 8.375, "step": 84500 }, { "epoch": 38.65393360618463, "grad_norm": 2.6543004512786865, "learning_rate": 1.5e-05, "loss": 0.3335, "step": 85000 }, { "epoch": 38.65393360618463, "eval_accuracy": 0.9179265011087174, "eval_loss": 0.5135884284973145, "eval_runtime": 56.0886, "eval_samples_per_second": 271.677, "eval_steps_per_second": 8.504, "step": 85000 }, { "epoch": 38.88130968622101, "grad_norm": 2.2833940982818604, "learning_rate": 1.45e-05, "loss": 0.3323, "step": 85500 }, { "epoch": 38.88130968622101, "eval_accuracy": 0.9180402792823424, "eval_loss": 0.5044585466384888, "eval_runtime": 56.0736, "eval_samples_per_second": 271.75, "eval_steps_per_second": 8.507, "step": 85500 }, { "epoch": 39.10868576625739, "grad_norm": 2.157496213912964, "learning_rate": 1.4000000000000001e-05, "loss": 0.3323, "step": 86000 }, { "epoch": 39.10868576625739, "eval_accuracy": 0.9178364524991991, "eval_loss": 0.5072239637374878, "eval_runtime": 56.0862, "eval_samples_per_second": 271.689, "eval_steps_per_second": 8.505, "step": 86000 }, { "epoch": 39.33606184629377, "grad_norm": 2.467801809310913, "learning_rate": 1.3500000000000001e-05, "loss": 0.3209, "step": 86500 }, { "epoch": 39.33606184629377, "eval_accuracy": 0.9178805584191491, "eval_loss": 0.5000079870223999, "eval_runtime": 56.0688, "eval_samples_per_second": 271.773, "eval_steps_per_second": 8.507, "step": 86500 }, { "epoch": 39.56343792633015, "grad_norm": 1.764186143875122, "learning_rate": 1.3000000000000001e-05, "loss": 0.318, "step": 87000 }, { "epoch": 39.56343792633015, "eval_accuracy": 0.9177274550237315, "eval_loss": 0.5174301862716675, "eval_runtime": 55.9254, "eval_samples_per_second": 272.47, "eval_steps_per_second": 8.529, "step": 87000 }, { "epoch": 39.79081400636653, "grad_norm": 1.9439367055892944, "learning_rate": 1.25e-05, "loss": 0.3304, "step": 87500 }, { "epoch": 39.79081400636653, "eval_accuracy": 0.9182047509597898, "eval_loss": 0.5170900821685791, "eval_runtime": 56.0765, "eval_samples_per_second": 271.736, "eval_steps_per_second": 8.506, "step": 87500 }, { "epoch": 40.01819008640291, "grad_norm": 2.7483394145965576, "learning_rate": 1.2e-05, "loss": 0.3269, "step": 88000 }, { "epoch": 40.01819008640291, "eval_accuracy": 0.9192084760524332, "eval_loss": 0.5051037669181824, "eval_runtime": 56.982, "eval_samples_per_second": 267.418, "eval_steps_per_second": 8.371, "step": 88000 }, { "epoch": 40.24556616643929, "grad_norm": 1.9884289503097534, "learning_rate": 1.1500000000000002e-05, "loss": 0.3226, "step": 88500 }, { "epoch": 40.24556616643929, "eval_accuracy": 0.9179925787853008, "eval_loss": 0.4963218867778778, "eval_runtime": 56.9742, "eval_samples_per_second": 267.454, "eval_steps_per_second": 8.372, "step": 88500 }, { "epoch": 40.47294224647567, "grad_norm": 2.4133260250091553, "learning_rate": 1.1000000000000001e-05, "loss": 0.3193, "step": 89000 }, { "epoch": 40.47294224647567, "eval_accuracy": 0.9186944890989619, "eval_loss": 0.5157626867294312, "eval_runtime": 56.072, "eval_samples_per_second": 271.758, "eval_steps_per_second": 8.507, "step": 89000 }, { "epoch": 40.700318326512054, "grad_norm": 2.426737070083618, "learning_rate": 1.05e-05, "loss": 0.3204, "step": 89500 }, { "epoch": 40.700318326512054, "eval_accuracy": 0.918837528271839, "eval_loss": 0.49108728766441345, "eval_runtime": 56.0958, "eval_samples_per_second": 271.643, "eval_steps_per_second": 8.503, "step": 89500 }, { "epoch": 40.92769440654843, "grad_norm": 2.0523056983947754, "learning_rate": 1e-05, "loss": 0.3234, "step": 90000 }, { "epoch": 40.92769440654843, "eval_accuracy": 0.9187695504636845, "eval_loss": 0.497799277305603, "eval_runtime": 56.0896, "eval_samples_per_second": 271.673, "eval_steps_per_second": 8.504, "step": 90000 }, { "epoch": 41.15507048658481, "grad_norm": 2.7144970893859863, "learning_rate": 9.5e-06, "loss": 0.3211, "step": 90500 }, { "epoch": 41.15507048658481, "eval_accuracy": 0.9191388218103149, "eval_loss": 0.4986066222190857, "eval_runtime": 56.0597, "eval_samples_per_second": 271.818, "eval_steps_per_second": 8.509, "step": 90500 }, { "epoch": 41.38244656662119, "grad_norm": 2.1701807975769043, "learning_rate": 9e-06, "loss": 0.3202, "step": 91000 }, { "epoch": 41.38244656662119, "eval_accuracy": 0.9192327856840563, "eval_loss": 0.5045046210289001, "eval_runtime": 56.0657, "eval_samples_per_second": 271.788, "eval_steps_per_second": 8.508, "step": 91000 }, { "epoch": 41.60982264665757, "grad_norm": 1.8716968297958374, "learning_rate": 8.500000000000002e-06, "loss": 0.3178, "step": 91500 }, { "epoch": 41.60982264665757, "eval_accuracy": 0.918940321981522, "eval_loss": 0.5036594867706299, "eval_runtime": 57.7478, "eval_samples_per_second": 263.872, "eval_steps_per_second": 8.26, "step": 91500 }, { "epoch": 41.83719872669395, "grad_norm": 2.7744719982147217, "learning_rate": 8.000000000000001e-06, "loss": 0.3181, "step": 92000 }, { "epoch": 41.83719872669395, "eval_accuracy": 0.9190958139064731, "eval_loss": 0.49405789375305176, "eval_runtime": 55.9547, "eval_samples_per_second": 272.327, "eval_steps_per_second": 8.525, "step": 92000 }, { "epoch": 42.064574806730334, "grad_norm": 1.9463552236557007, "learning_rate": 7.5e-06, "loss": 0.3139, "step": 92500 }, { "epoch": 42.064574806730334, "eval_accuracy": 0.9185921615457032, "eval_loss": 0.5153664946556091, "eval_runtime": 55.9616, "eval_samples_per_second": 272.294, "eval_steps_per_second": 8.524, "step": 92500 }, { "epoch": 42.29195088676671, "grad_norm": 1.7864753007888794, "learning_rate": 7.000000000000001e-06, "loss": 0.3155, "step": 93000 }, { "epoch": 42.29195088676671, "eval_accuracy": 0.9193213422174352, "eval_loss": 0.4968840777873993, "eval_runtime": 56.9973, "eval_samples_per_second": 267.346, "eval_steps_per_second": 8.369, "step": 93000 }, { "epoch": 42.51932696680309, "grad_norm": 2.8307926654815674, "learning_rate": 6.5000000000000004e-06, "loss": 0.3225, "step": 93500 }, { "epoch": 42.51932696680309, "eval_accuracy": 0.9194418625202981, "eval_loss": 0.49802207946777344, "eval_runtime": 57.0104, "eval_samples_per_second": 267.284, "eval_steps_per_second": 8.367, "step": 93500 }, { "epoch": 42.74670304683947, "grad_norm": 2.3244693279266357, "learning_rate": 6e-06, "loss": 0.3166, "step": 94000 }, { "epoch": 42.74670304683947, "eval_accuracy": 0.9192593340761245, "eval_loss": 0.5010645985603333, "eval_runtime": 55.9693, "eval_samples_per_second": 272.256, "eval_steps_per_second": 8.523, "step": 94000 }, { "epoch": 42.97407912687585, "grad_norm": 1.967890977859497, "learning_rate": 5.500000000000001e-06, "loss": 0.3146, "step": 94500 }, { "epoch": 42.97407912687585, "eval_accuracy": 0.9196343334604102, "eval_loss": 0.48918718099594116, "eval_runtime": 56.094, "eval_samples_per_second": 271.651, "eval_steps_per_second": 8.504, "step": 94500 }, { "epoch": 43.201455206912236, "grad_norm": 2.064098596572876, "learning_rate": 5e-06, "loss": 0.3152, "step": 95000 }, { "epoch": 43.201455206912236, "eval_accuracy": 0.9197978271795374, "eval_loss": 0.4905773103237152, "eval_runtime": 56.0802, "eval_samples_per_second": 271.718, "eval_steps_per_second": 8.506, "step": 95000 }, { "epoch": 43.428831286948615, "grad_norm": 2.1124424934387207, "learning_rate": 4.5e-06, "loss": 0.3077, "step": 95500 }, { "epoch": 43.428831286948615, "eval_accuracy": 0.9199429121130404, "eval_loss": 0.4860183894634247, "eval_runtime": 55.9425, "eval_samples_per_second": 272.387, "eval_steps_per_second": 8.527, "step": 95500 }, { "epoch": 43.656207366984994, "grad_norm": 1.9392900466918945, "learning_rate": 4.000000000000001e-06, "loss": 0.3185, "step": 96000 }, { "epoch": 43.656207366984994, "eval_accuracy": 0.919319351233956, "eval_loss": 0.4847618043422699, "eval_runtime": 56.9968, "eval_samples_per_second": 267.348, "eval_steps_per_second": 8.369, "step": 96000 } ], "logging_steps": 500, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 46, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.105977491866255e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }