|
{ |
|
"best_metric": 0.4847618043422699, |
|
"best_model_checkpoint": "./model_fine-tune/glot/xlm-r/mlt-Latn/checkpoint-96000", |
|
"epoch": 43.656207366984994, |
|
"eval_steps": 500, |
|
"global_step": 96000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.22737608003638018, |
|
"grad_norm": 5.168328285217285, |
|
"learning_rate": 9.95e-05, |
|
"loss": 1.8057, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.22737608003638018, |
|
"eval_accuracy": 0.7673858936928836, |
|
"eval_loss": 1.3882914781570435, |
|
"eval_runtime": 57.4714, |
|
"eval_samples_per_second": 265.141, |
|
"eval_steps_per_second": 8.3, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.45475216007276037, |
|
"grad_norm": 6.386734485626221, |
|
"learning_rate": 9.900000000000001e-05, |
|
"loss": 1.2412, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.45475216007276037, |
|
"eval_accuracy": 0.8023059017621376, |
|
"eval_loss": 1.1490817070007324, |
|
"eval_runtime": 57.9662, |
|
"eval_samples_per_second": 262.877, |
|
"eval_steps_per_second": 8.229, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.6821282401091405, |
|
"grad_norm": 3.5194196701049805, |
|
"learning_rate": 9.850000000000001e-05, |
|
"loss": 1.0971, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.6821282401091405, |
|
"eval_accuracy": 0.8191701818204763, |
|
"eval_loss": 1.0539811849594116, |
|
"eval_runtime": 58.0944, |
|
"eval_samples_per_second": 262.297, |
|
"eval_steps_per_second": 8.211, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.9095043201455207, |
|
"grad_norm": 4.919663906097412, |
|
"learning_rate": 9.8e-05, |
|
"loss": 1.0083, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.9095043201455207, |
|
"eval_accuracy": 0.8290244187626605, |
|
"eval_loss": 0.9915244579315186, |
|
"eval_runtime": 59.0201, |
|
"eval_samples_per_second": 258.183, |
|
"eval_steps_per_second": 8.082, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.1368804001819008, |
|
"grad_norm": 3.4152417182922363, |
|
"learning_rate": 9.75e-05, |
|
"loss": 0.9343, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.1368804001819008, |
|
"eval_accuracy": 0.8386819891878221, |
|
"eval_loss": 0.9384229779243469, |
|
"eval_runtime": 58.1514, |
|
"eval_samples_per_second": 262.04, |
|
"eval_steps_per_second": 8.203, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.364256480218281, |
|
"grad_norm": 3.325963258743286, |
|
"learning_rate": 9.7e-05, |
|
"loss": 0.8966, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.364256480218281, |
|
"eval_accuracy": 0.8441958792162307, |
|
"eval_loss": 0.8955113887786865, |
|
"eval_runtime": 59.0626, |
|
"eval_samples_per_second": 257.997, |
|
"eval_steps_per_second": 8.076, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.591632560254661, |
|
"grad_norm": 4.558549880981445, |
|
"learning_rate": 9.65e-05, |
|
"loss": 0.8609, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.591632560254661, |
|
"eval_accuracy": 0.8482073460109512, |
|
"eval_loss": 0.8796423673629761, |
|
"eval_runtime": 58.1755, |
|
"eval_samples_per_second": 261.931, |
|
"eval_steps_per_second": 8.199, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.8190086402910413, |
|
"grad_norm": 4.228474140167236, |
|
"learning_rate": 9.6e-05, |
|
"loss": 0.8394, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.8190086402910413, |
|
"eval_accuracy": 0.8522584062680361, |
|
"eval_loss": 0.8565191030502319, |
|
"eval_runtime": 59.0465, |
|
"eval_samples_per_second": 258.068, |
|
"eval_steps_per_second": 8.078, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.0463847203274215, |
|
"grad_norm": 4.954216003417969, |
|
"learning_rate": 9.55e-05, |
|
"loss": 0.8082, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.0463847203274215, |
|
"eval_accuracy": 0.8560422496354393, |
|
"eval_loss": 0.8301263451576233, |
|
"eval_runtime": 59.074, |
|
"eval_samples_per_second": 257.947, |
|
"eval_steps_per_second": 8.075, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.2737608003638017, |
|
"grad_norm": 3.8996498584747314, |
|
"learning_rate": 9.5e-05, |
|
"loss": 0.7751, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.2737608003638017, |
|
"eval_accuracy": 0.8575601290381725, |
|
"eval_loss": 0.8123583793640137, |
|
"eval_runtime": 59.0078, |
|
"eval_samples_per_second": 258.237, |
|
"eval_steps_per_second": 8.084, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.501136880400182, |
|
"grad_norm": 3.683563709259033, |
|
"learning_rate": 9.449999999999999e-05, |
|
"loss": 0.7746, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.501136880400182, |
|
"eval_accuracy": 0.8608798596484555, |
|
"eval_loss": 0.8004751801490784, |
|
"eval_runtime": 57.9526, |
|
"eval_samples_per_second": 262.939, |
|
"eval_steps_per_second": 8.231, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.728512960436562, |
|
"grad_norm": 3.8639516830444336, |
|
"learning_rate": 9.4e-05, |
|
"loss": 0.7467, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.728512960436562, |
|
"eval_accuracy": 0.8630581086069647, |
|
"eval_loss": 0.7939795255661011, |
|
"eval_runtime": 59.0043, |
|
"eval_samples_per_second": 258.252, |
|
"eval_steps_per_second": 8.084, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.9558890404729423, |
|
"grad_norm": 3.31400990486145, |
|
"learning_rate": 9.350000000000001e-05, |
|
"loss": 0.7404, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 2.9558890404729423, |
|
"eval_accuracy": 0.8657109489879384, |
|
"eval_loss": 0.7746465802192688, |
|
"eval_runtime": 58.2316, |
|
"eval_samples_per_second": 261.679, |
|
"eval_steps_per_second": 8.191, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 3.1832651205093225, |
|
"grad_norm": 2.843773603439331, |
|
"learning_rate": 9.300000000000001e-05, |
|
"loss": 0.7193, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 3.1832651205093225, |
|
"eval_accuracy": 0.8669591175218982, |
|
"eval_loss": 0.7662197947502136, |
|
"eval_runtime": 58.8578, |
|
"eval_samples_per_second": 258.895, |
|
"eval_steps_per_second": 8.104, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 3.4106412005457027, |
|
"grad_norm": 3.61684513092041, |
|
"learning_rate": 9.250000000000001e-05, |
|
"loss": 0.7127, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 3.4106412005457027, |
|
"eval_accuracy": 0.8696189271143282, |
|
"eval_loss": 0.7391215562820435, |
|
"eval_runtime": 58.0068, |
|
"eval_samples_per_second": 262.693, |
|
"eval_steps_per_second": 8.223, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 3.6380172805820825, |
|
"grad_norm": 2.982285737991333, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 0.7077, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 3.6380172805820825, |
|
"eval_accuracy": 0.8704082502343816, |
|
"eval_loss": 0.7502180933952332, |
|
"eval_runtime": 58.7428, |
|
"eval_samples_per_second": 259.402, |
|
"eval_steps_per_second": 8.12, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 3.865393360618463, |
|
"grad_norm": 3.2284772396087646, |
|
"learning_rate": 9.15e-05, |
|
"loss": 0.6954, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 3.865393360618463, |
|
"eval_accuracy": 0.8723014320863037, |
|
"eval_loss": 0.7357130646705627, |
|
"eval_runtime": 57.9037, |
|
"eval_samples_per_second": 263.161, |
|
"eval_steps_per_second": 8.238, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 4.092769440654843, |
|
"grad_norm": 3.141983985900879, |
|
"learning_rate": 9.1e-05, |
|
"loss": 0.6831, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 4.092769440654843, |
|
"eval_accuracy": 0.87450734136313, |
|
"eval_loss": 0.7237355709075928, |
|
"eval_runtime": 58.8298, |
|
"eval_samples_per_second": 259.018, |
|
"eval_steps_per_second": 8.108, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 4.320145520691224, |
|
"grad_norm": 3.805156707763672, |
|
"learning_rate": 9.05e-05, |
|
"loss": 0.6653, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 4.320145520691224, |
|
"eval_accuracy": 0.875826148405435, |
|
"eval_loss": 0.7154198288917542, |
|
"eval_runtime": 58.8099, |
|
"eval_samples_per_second": 259.106, |
|
"eval_steps_per_second": 8.111, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 4.547521600727603, |
|
"grad_norm": 2.943392753601074, |
|
"learning_rate": 9e-05, |
|
"loss": 0.6587, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 4.547521600727603, |
|
"eval_accuracy": 0.8750290297659766, |
|
"eval_loss": 0.7158774733543396, |
|
"eval_runtime": 58.8052, |
|
"eval_samples_per_second": 259.127, |
|
"eval_steps_per_second": 8.112, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 4.774897680763983, |
|
"grad_norm": 3.1016767024993896, |
|
"learning_rate": 8.950000000000001e-05, |
|
"loss": 0.657, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 4.774897680763983, |
|
"eval_accuracy": 0.876631092564109, |
|
"eval_loss": 0.7100118398666382, |
|
"eval_runtime": 57.9757, |
|
"eval_samples_per_second": 262.834, |
|
"eval_steps_per_second": 8.228, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 5.002273760800364, |
|
"grad_norm": 2.9359586238861084, |
|
"learning_rate": 8.900000000000001e-05, |
|
"loss": 0.6544, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 5.002273760800364, |
|
"eval_accuracy": 0.8772724264740639, |
|
"eval_loss": 0.6987695097923279, |
|
"eval_runtime": 58.0324, |
|
"eval_samples_per_second": 262.578, |
|
"eval_steps_per_second": 8.22, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 5.229649840836744, |
|
"grad_norm": 3.289794445037842, |
|
"learning_rate": 8.850000000000001e-05, |
|
"loss": 0.6315, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 5.229649840836744, |
|
"eval_accuracy": 0.8785936092935085, |
|
"eval_loss": 0.7098827362060547, |
|
"eval_runtime": 58.2042, |
|
"eval_samples_per_second": 261.802, |
|
"eval_steps_per_second": 8.195, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 5.457025920873124, |
|
"grad_norm": 3.1603569984436035, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 0.6406, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 5.457025920873124, |
|
"eval_accuracy": 0.8785365360431241, |
|
"eval_loss": 0.697100043296814, |
|
"eval_runtime": 58.956, |
|
"eval_samples_per_second": 258.464, |
|
"eval_steps_per_second": 8.091, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 5.684402000909504, |
|
"grad_norm": 3.9137370586395264, |
|
"learning_rate": 8.75e-05, |
|
"loss": 0.6296, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 5.684402000909504, |
|
"eval_accuracy": 0.8807072007268872, |
|
"eval_loss": 0.6913357377052307, |
|
"eval_runtime": 58.1403, |
|
"eval_samples_per_second": 262.09, |
|
"eval_steps_per_second": 8.204, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 5.911778080945885, |
|
"grad_norm": 3.703839063644409, |
|
"learning_rate": 8.7e-05, |
|
"loss": 0.626, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 5.911778080945885, |
|
"eval_accuracy": 0.8809903070027411, |
|
"eval_loss": 0.6919081807136536, |
|
"eval_runtime": 58.1489, |
|
"eval_samples_per_second": 262.051, |
|
"eval_steps_per_second": 8.203, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 6.139154160982264, |
|
"grad_norm": 3.5155723094940186, |
|
"learning_rate": 8.65e-05, |
|
"loss": 0.6198, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 6.139154160982264, |
|
"eval_accuracy": 0.8827777558822371, |
|
"eval_loss": 0.6714188456535339, |
|
"eval_runtime": 58.9629, |
|
"eval_samples_per_second": 258.434, |
|
"eval_steps_per_second": 8.09, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 6.366530241018645, |
|
"grad_norm": 3.117462635040283, |
|
"learning_rate": 8.6e-05, |
|
"loss": 0.6076, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 6.366530241018645, |
|
"eval_accuracy": 0.8834275999297155, |
|
"eval_loss": 0.6733196973800659, |
|
"eval_runtime": 59.0324, |
|
"eval_samples_per_second": 258.13, |
|
"eval_steps_per_second": 8.08, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 6.593906321055025, |
|
"grad_norm": 2.6322078704833984, |
|
"learning_rate": 8.55e-05, |
|
"loss": 0.6087, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 6.593906321055025, |
|
"eval_accuracy": 0.8831569539581932, |
|
"eval_loss": 0.6722173690795898, |
|
"eval_runtime": 58.2146, |
|
"eval_samples_per_second": 261.755, |
|
"eval_steps_per_second": 8.194, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 6.8212824010914055, |
|
"grad_norm": 2.875756025314331, |
|
"learning_rate": 8.5e-05, |
|
"loss": 0.6016, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 6.8212824010914055, |
|
"eval_accuracy": 0.8827436768449041, |
|
"eval_loss": 0.6779712438583374, |
|
"eval_runtime": 57.9717, |
|
"eval_samples_per_second": 262.852, |
|
"eval_steps_per_second": 8.228, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 7.048658481127785, |
|
"grad_norm": 2.797990560531616, |
|
"learning_rate": 8.450000000000001e-05, |
|
"loss": 0.5932, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 7.048658481127785, |
|
"eval_accuracy": 0.8849467695494039, |
|
"eval_loss": 0.6560626029968262, |
|
"eval_runtime": 59.2909, |
|
"eval_samples_per_second": 257.004, |
|
"eval_steps_per_second": 8.045, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 7.276034561164166, |
|
"grad_norm": 3.173975706100464, |
|
"learning_rate": 8.4e-05, |
|
"loss": 0.5877, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 7.276034561164166, |
|
"eval_accuracy": 0.8849921630094044, |
|
"eval_loss": 0.6600627303123474, |
|
"eval_runtime": 57.9674, |
|
"eval_samples_per_second": 262.872, |
|
"eval_steps_per_second": 8.229, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 7.503410641200546, |
|
"grad_norm": 2.7644402980804443, |
|
"learning_rate": 8.35e-05, |
|
"loss": 0.5909, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 7.503410641200546, |
|
"eval_accuracy": 0.885632879834746, |
|
"eval_loss": 0.6634506583213806, |
|
"eval_runtime": 58.1288, |
|
"eval_samples_per_second": 262.142, |
|
"eval_steps_per_second": 8.206, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 7.730786721236926, |
|
"grad_norm": 3.6716108322143555, |
|
"learning_rate": 8.3e-05, |
|
"loss": 0.5848, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 7.730786721236926, |
|
"eval_accuracy": 0.8870372975442135, |
|
"eval_loss": 0.6555737257003784, |
|
"eval_runtime": 58.0593, |
|
"eval_samples_per_second": 262.456, |
|
"eval_steps_per_second": 8.216, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 7.958162801273306, |
|
"grad_norm": 2.635899782180786, |
|
"learning_rate": 8.25e-05, |
|
"loss": 0.5806, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 7.958162801273306, |
|
"eval_accuracy": 0.8872147874120453, |
|
"eval_loss": 0.6475590467453003, |
|
"eval_runtime": 58.9688, |
|
"eval_samples_per_second": 258.408, |
|
"eval_steps_per_second": 8.089, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 8.185538881309686, |
|
"grad_norm": 3.155376434326172, |
|
"learning_rate": 8.2e-05, |
|
"loss": 0.5717, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 8.185538881309686, |
|
"eval_accuracy": 0.887904005764964, |
|
"eval_loss": 0.6570438146591187, |
|
"eval_runtime": 58.2291, |
|
"eval_samples_per_second": 261.69, |
|
"eval_steps_per_second": 8.192, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 8.412914961346067, |
|
"grad_norm": 2.7952346801757812, |
|
"learning_rate": 8.15e-05, |
|
"loss": 0.564, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 8.412914961346067, |
|
"eval_accuracy": 0.8873371327146311, |
|
"eval_loss": 0.6420606374740601, |
|
"eval_runtime": 58.022, |
|
"eval_samples_per_second": 262.624, |
|
"eval_steps_per_second": 8.221, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 8.640291041382447, |
|
"grad_norm": 2.901456832885742, |
|
"learning_rate": 8.1e-05, |
|
"loss": 0.5668, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 8.640291041382447, |
|
"eval_accuracy": 0.8888152172025493, |
|
"eval_loss": 0.6471173763275146, |
|
"eval_runtime": 58.0374, |
|
"eval_samples_per_second": 262.555, |
|
"eval_steps_per_second": 8.219, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 8.867667121418826, |
|
"grad_norm": 3.783108949661255, |
|
"learning_rate": 8.05e-05, |
|
"loss": 0.5661, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 8.867667121418826, |
|
"eval_accuracy": 0.8896724765068369, |
|
"eval_loss": 0.6358206272125244, |
|
"eval_runtime": 57.9826, |
|
"eval_samples_per_second": 262.803, |
|
"eval_steps_per_second": 8.227, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 9.095043201455207, |
|
"grad_norm": 3.0208489894866943, |
|
"learning_rate": 8e-05, |
|
"loss": 0.5575, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 9.095043201455207, |
|
"eval_accuracy": 0.8890150798157612, |
|
"eval_loss": 0.6520903706550598, |
|
"eval_runtime": 58.9559, |
|
"eval_samples_per_second": 258.464, |
|
"eval_steps_per_second": 8.091, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 9.322419281491587, |
|
"grad_norm": 2.4719395637512207, |
|
"learning_rate": 7.950000000000001e-05, |
|
"loss": 0.5563, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 9.322419281491587, |
|
"eval_accuracy": 0.8897703404100196, |
|
"eval_loss": 0.6332861185073853, |
|
"eval_runtime": 58.0586, |
|
"eval_samples_per_second": 262.459, |
|
"eval_steps_per_second": 8.216, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 9.549795361527968, |
|
"grad_norm": 3.9511001110076904, |
|
"learning_rate": 7.900000000000001e-05, |
|
"loss": 0.5462, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 9.549795361527968, |
|
"eval_accuracy": 0.8907973350119666, |
|
"eval_loss": 0.6307789087295532, |
|
"eval_runtime": 58.9193, |
|
"eval_samples_per_second": 258.625, |
|
"eval_steps_per_second": 8.096, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 9.777171441564347, |
|
"grad_norm": 3.5722222328186035, |
|
"learning_rate": 7.850000000000001e-05, |
|
"loss": 0.554, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 9.777171441564347, |
|
"eval_accuracy": 0.892567522812507, |
|
"eval_loss": 0.6097805500030518, |
|
"eval_runtime": 58.065, |
|
"eval_samples_per_second": 262.43, |
|
"eval_steps_per_second": 8.215, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 10.004547521600728, |
|
"grad_norm": 2.951775074005127, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 0.5484, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 10.004547521600728, |
|
"eval_accuracy": 0.8916424877002649, |
|
"eval_loss": 0.6175746917724609, |
|
"eval_runtime": 58.4944, |
|
"eval_samples_per_second": 260.504, |
|
"eval_steps_per_second": 8.155, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 10.231923601637108, |
|
"grad_norm": 3.9985241889953613, |
|
"learning_rate": 7.75e-05, |
|
"loss": 0.5443, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 10.231923601637108, |
|
"eval_accuracy": 0.8924841760921375, |
|
"eval_loss": 0.6330751180648804, |
|
"eval_runtime": 58.5296, |
|
"eval_samples_per_second": 260.347, |
|
"eval_steps_per_second": 8.15, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 10.459299681673487, |
|
"grad_norm": 3.2177023887634277, |
|
"learning_rate": 7.7e-05, |
|
"loss": 0.5329, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 10.459299681673487, |
|
"eval_accuracy": 0.8919113818187556, |
|
"eval_loss": 0.6282722353935242, |
|
"eval_runtime": 58.5087, |
|
"eval_samples_per_second": 260.44, |
|
"eval_steps_per_second": 8.153, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 10.686675761709868, |
|
"grad_norm": 2.4907150268554688, |
|
"learning_rate": 7.65e-05, |
|
"loss": 0.5332, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 10.686675761709868, |
|
"eval_accuracy": 0.8929225353778606, |
|
"eval_loss": 0.6218137145042419, |
|
"eval_runtime": 58.6581, |
|
"eval_samples_per_second": 259.777, |
|
"eval_steps_per_second": 8.132, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 10.914051841746248, |
|
"grad_norm": 2.3024277687072754, |
|
"learning_rate": 7.6e-05, |
|
"loss": 0.5323, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 10.914051841746248, |
|
"eval_accuracy": 0.8931125650111673, |
|
"eval_loss": 0.6083813309669495, |
|
"eval_runtime": 57.6897, |
|
"eval_samples_per_second": 264.137, |
|
"eval_steps_per_second": 8.268, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 11.141427921782629, |
|
"grad_norm": 3.06503963470459, |
|
"learning_rate": 7.55e-05, |
|
"loss": 0.5239, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 11.141427921782629, |
|
"eval_accuracy": 0.8939010799940649, |
|
"eval_loss": 0.637162446975708, |
|
"eval_runtime": 57.7617, |
|
"eval_samples_per_second": 263.808, |
|
"eval_steps_per_second": 8.258, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 11.368804001819008, |
|
"grad_norm": 2.7696726322174072, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.5261, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 11.368804001819008, |
|
"eval_accuracy": 0.8937561222232755, |
|
"eval_loss": 0.6276402473449707, |
|
"eval_runtime": 57.6676, |
|
"eval_samples_per_second": 264.239, |
|
"eval_steps_per_second": 8.272, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 11.596180081855389, |
|
"grad_norm": 2.751497268676758, |
|
"learning_rate": 7.450000000000001e-05, |
|
"loss": 0.5175, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 11.596180081855389, |
|
"eval_accuracy": 0.8951661397774281, |
|
"eval_loss": 0.6050118207931519, |
|
"eval_runtime": 57.5452, |
|
"eval_samples_per_second": 264.8, |
|
"eval_steps_per_second": 8.289, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 11.82355616189177, |
|
"grad_norm": 2.554222822189331, |
|
"learning_rate": 7.4e-05, |
|
"loss": 0.5262, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 11.82355616189177, |
|
"eval_accuracy": 0.8952365458244355, |
|
"eval_loss": 0.6047869324684143, |
|
"eval_runtime": 58.525, |
|
"eval_samples_per_second": 260.367, |
|
"eval_steps_per_second": 8.15, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 12.05093224192815, |
|
"grad_norm": 2.832977294921875, |
|
"learning_rate": 7.35e-05, |
|
"loss": 0.5189, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 12.05093224192815, |
|
"eval_accuracy": 0.8951059547768438, |
|
"eval_loss": 0.6053177714347839, |
|
"eval_runtime": 57.7314, |
|
"eval_samples_per_second": 263.946, |
|
"eval_steps_per_second": 8.262, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 12.278308321964529, |
|
"grad_norm": 2.9270408153533936, |
|
"learning_rate": 7.3e-05, |
|
"loss": 0.5162, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 12.278308321964529, |
|
"eval_accuracy": 0.8956970707551482, |
|
"eval_loss": 0.600378692150116, |
|
"eval_runtime": 68.0954, |
|
"eval_samples_per_second": 223.774, |
|
"eval_steps_per_second": 7.005, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 12.50568440200091, |
|
"grad_norm": 4.0453782081604, |
|
"learning_rate": 7.25e-05, |
|
"loss": 0.5111, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 12.50568440200091, |
|
"eval_accuracy": 0.8950593575694928, |
|
"eval_loss": 0.601889431476593, |
|
"eval_runtime": 69.1002, |
|
"eval_samples_per_second": 220.52, |
|
"eval_steps_per_second": 6.903, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 12.73306048203729, |
|
"grad_norm": 1.8677308559417725, |
|
"learning_rate": 7.2e-05, |
|
"loss": 0.51, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 12.73306048203729, |
|
"eval_accuracy": 0.8962058115277431, |
|
"eval_loss": 0.6174491047859192, |
|
"eval_runtime": 69.7483, |
|
"eval_samples_per_second": 218.471, |
|
"eval_steps_per_second": 6.839, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 12.96043656207367, |
|
"grad_norm": 2.551996946334839, |
|
"learning_rate": 7.15e-05, |
|
"loss": 0.5109, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 12.96043656207367, |
|
"eval_accuracy": 0.8969674492010259, |
|
"eval_loss": 0.6083965301513672, |
|
"eval_runtime": 66.6755, |
|
"eval_samples_per_second": 228.54, |
|
"eval_steps_per_second": 7.154, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 13.18781264211005, |
|
"grad_norm": 2.474005699157715, |
|
"learning_rate": 7.1e-05, |
|
"loss": 0.502, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 13.18781264211005, |
|
"eval_accuracy": 0.8963457500841246, |
|
"eval_loss": 0.6135991215705872, |
|
"eval_runtime": 67.7791, |
|
"eval_samples_per_second": 224.818, |
|
"eval_steps_per_second": 7.038, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 13.41518872214643, |
|
"grad_norm": 3.0140531063079834, |
|
"learning_rate": 7.05e-05, |
|
"loss": 0.5, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 13.41518872214643, |
|
"eval_accuracy": 0.8979126349984058, |
|
"eval_loss": 0.6121774911880493, |
|
"eval_runtime": 69.2043, |
|
"eval_samples_per_second": 220.189, |
|
"eval_steps_per_second": 6.893, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 13.642564802182811, |
|
"grad_norm": 2.9221041202545166, |
|
"learning_rate": 7e-05, |
|
"loss": 0.5024, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 13.642564802182811, |
|
"eval_accuracy": 0.8981451519977776, |
|
"eval_loss": 0.59588623046875, |
|
"eval_runtime": 68.802, |
|
"eval_samples_per_second": 221.476, |
|
"eval_steps_per_second": 6.933, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 13.86994088221919, |
|
"grad_norm": 3.138761043548584, |
|
"learning_rate": 6.95e-05, |
|
"loss": 0.4998, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 13.86994088221919, |
|
"eval_accuracy": 0.8978226704469172, |
|
"eval_loss": 0.5944364666938782, |
|
"eval_runtime": 66.5107, |
|
"eval_samples_per_second": 229.106, |
|
"eval_steps_per_second": 7.172, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 14.09731696225557, |
|
"grad_norm": 2.7592108249664307, |
|
"learning_rate": 6.9e-05, |
|
"loss": 0.4854, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 14.09731696225557, |
|
"eval_accuracy": 0.898367703578009, |
|
"eval_loss": 0.5859193205833435, |
|
"eval_runtime": 66.7713, |
|
"eval_samples_per_second": 228.212, |
|
"eval_steps_per_second": 7.144, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 14.324693042291951, |
|
"grad_norm": 2.9050989151000977, |
|
"learning_rate": 6.850000000000001e-05, |
|
"loss": 0.4882, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 14.324693042291951, |
|
"eval_accuracy": 0.8975941653774508, |
|
"eval_loss": 0.60106360912323, |
|
"eval_runtime": 68.9335, |
|
"eval_samples_per_second": 221.054, |
|
"eval_steps_per_second": 6.92, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 14.552069122328332, |
|
"grad_norm": 2.3342740535736084, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 0.4841, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 14.552069122328332, |
|
"eval_accuracy": 0.8990730423650459, |
|
"eval_loss": 0.584600567817688, |
|
"eval_runtime": 67.2181, |
|
"eval_samples_per_second": 226.695, |
|
"eval_steps_per_second": 7.096, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 14.77944520236471, |
|
"grad_norm": 2.1228647232055664, |
|
"learning_rate": 6.750000000000001e-05, |
|
"loss": 0.4844, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 14.77944520236471, |
|
"eval_accuracy": 0.8990018801499072, |
|
"eval_loss": 0.5907680988311768, |
|
"eval_runtime": 59.205, |
|
"eval_samples_per_second": 257.377, |
|
"eval_steps_per_second": 8.057, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 15.006821282401091, |
|
"grad_norm": 3.1188831329345703, |
|
"learning_rate": 6.7e-05, |
|
"loss": 0.4831, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 15.006821282401091, |
|
"eval_accuracy": 0.8995549513632177, |
|
"eval_loss": 0.5859436392784119, |
|
"eval_runtime": 58.4781, |
|
"eval_samples_per_second": 260.576, |
|
"eval_steps_per_second": 8.157, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 15.234197362437472, |
|
"grad_norm": 3.4289638996124268, |
|
"learning_rate": 6.65e-05, |
|
"loss": 0.4783, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 15.234197362437472, |
|
"eval_accuracy": 0.8997874968645139, |
|
"eval_loss": 0.586015522480011, |
|
"eval_runtime": 58.2205, |
|
"eval_samples_per_second": 261.729, |
|
"eval_steps_per_second": 8.193, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 15.461573442473851, |
|
"grad_norm": 2.7651004791259766, |
|
"learning_rate": 6.6e-05, |
|
"loss": 0.4752, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 15.461573442473851, |
|
"eval_accuracy": 0.9008641609570693, |
|
"eval_loss": 0.5896708369255066, |
|
"eval_runtime": 59.224, |
|
"eval_samples_per_second": 257.294, |
|
"eval_steps_per_second": 8.054, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 15.688949522510232, |
|
"grad_norm": 2.334015130996704, |
|
"learning_rate": 6.55e-05, |
|
"loss": 0.4751, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 15.688949522510232, |
|
"eval_accuracy": 0.8998034774235811, |
|
"eval_loss": 0.5798426270484924, |
|
"eval_runtime": 58.5323, |
|
"eval_samples_per_second": 260.335, |
|
"eval_steps_per_second": 8.149, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 15.916325602546612, |
|
"grad_norm": 1.9268873929977417, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 0.4781, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 15.916325602546612, |
|
"eval_accuracy": 0.900376427009191, |
|
"eval_loss": 0.574535071849823, |
|
"eval_runtime": 59.3062, |
|
"eval_samples_per_second": 256.938, |
|
"eval_steps_per_second": 8.043, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 16.143701682582993, |
|
"grad_norm": 3.1282005310058594, |
|
"learning_rate": 6.450000000000001e-05, |
|
"loss": 0.4741, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 16.143701682582993, |
|
"eval_accuracy": 0.9008166774124221, |
|
"eval_loss": 0.5920200347900391, |
|
"eval_runtime": 67.8707, |
|
"eval_samples_per_second": 224.515, |
|
"eval_steps_per_second": 7.028, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 16.37107776261937, |
|
"grad_norm": 2.4809906482696533, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 0.4764, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 16.37107776261937, |
|
"eval_accuracy": 0.9007614317314762, |
|
"eval_loss": 0.5653102397918701, |
|
"eval_runtime": 67.6973, |
|
"eval_samples_per_second": 225.09, |
|
"eval_steps_per_second": 7.046, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 16.598453842655754, |
|
"grad_norm": 2.460899591445923, |
|
"learning_rate": 6.35e-05, |
|
"loss": 0.4693, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 16.598453842655754, |
|
"eval_accuracy": 0.9008501214346764, |
|
"eval_loss": 0.5879611968994141, |
|
"eval_runtime": 66.4301, |
|
"eval_samples_per_second": 229.384, |
|
"eval_steps_per_second": 7.18, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 16.825829922692133, |
|
"grad_norm": 2.389681816101074, |
|
"learning_rate": 6.3e-05, |
|
"loss": 0.4638, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 16.825829922692133, |
|
"eval_accuracy": 0.9024308529635798, |
|
"eval_loss": 0.5856647491455078, |
|
"eval_runtime": 66.0584, |
|
"eval_samples_per_second": 230.675, |
|
"eval_steps_per_second": 7.221, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 17.053206002728512, |
|
"grad_norm": 2.5536763668060303, |
|
"learning_rate": 6.25e-05, |
|
"loss": 0.4643, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 17.053206002728512, |
|
"eval_accuracy": 0.9017984039775945, |
|
"eval_loss": 0.5838255882263184, |
|
"eval_runtime": 66.4217, |
|
"eval_samples_per_second": 229.413, |
|
"eval_steps_per_second": 7.181, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 17.280582082764894, |
|
"grad_norm": 2.8841841220855713, |
|
"learning_rate": 6.2e-05, |
|
"loss": 0.4559, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 17.280582082764894, |
|
"eval_accuracy": 0.9016667691546006, |
|
"eval_loss": 0.5722949504852295, |
|
"eval_runtime": 67.5295, |
|
"eval_samples_per_second": 225.65, |
|
"eval_steps_per_second": 7.064, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 17.507958162801273, |
|
"grad_norm": 2.0763895511627197, |
|
"learning_rate": 6.15e-05, |
|
"loss": 0.4649, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 17.507958162801273, |
|
"eval_accuracy": 0.9028016315170558, |
|
"eval_loss": 0.5597889423370361, |
|
"eval_runtime": 68.1673, |
|
"eval_samples_per_second": 223.538, |
|
"eval_steps_per_second": 6.997, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 17.735334242837652, |
|
"grad_norm": 2.7098090648651123, |
|
"learning_rate": 6.1e-05, |
|
"loss": 0.4619, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 17.735334242837652, |
|
"eval_accuracy": 0.9032880942757541, |
|
"eval_loss": 0.5635744333267212, |
|
"eval_runtime": 65.7624, |
|
"eval_samples_per_second": 231.713, |
|
"eval_steps_per_second": 7.253, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 17.962710322874035, |
|
"grad_norm": 2.361464023590088, |
|
"learning_rate": 6.05e-05, |
|
"loss": 0.4554, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 17.962710322874035, |
|
"eval_accuracy": 0.9026038090276955, |
|
"eval_loss": 0.5572078227996826, |
|
"eval_runtime": 65.3403, |
|
"eval_samples_per_second": 233.21, |
|
"eval_steps_per_second": 7.3, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 18.190086402910413, |
|
"grad_norm": 2.2354624271392822, |
|
"learning_rate": 6e-05, |
|
"loss": 0.4516, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 18.190086402910413, |
|
"eval_accuracy": 0.9029999052162649, |
|
"eval_loss": 0.5721431970596313, |
|
"eval_runtime": 66.8398, |
|
"eval_samples_per_second": 227.978, |
|
"eval_steps_per_second": 7.136, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 18.417462482946792, |
|
"grad_norm": 2.5873236656188965, |
|
"learning_rate": 5.95e-05, |
|
"loss": 0.4488, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 18.417462482946792, |
|
"eval_accuracy": 0.9031028868580921, |
|
"eval_loss": 0.577458381652832, |
|
"eval_runtime": 65.9075, |
|
"eval_samples_per_second": 231.203, |
|
"eval_steps_per_second": 7.237, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 18.644838562983175, |
|
"grad_norm": 2.14237642288208, |
|
"learning_rate": 5.9e-05, |
|
"loss": 0.4529, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 18.644838562983175, |
|
"eval_accuracy": 0.9034652567356332, |
|
"eval_loss": 0.5635027289390564, |
|
"eval_runtime": 57.1062, |
|
"eval_samples_per_second": 266.836, |
|
"eval_steps_per_second": 8.353, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 18.872214643019554, |
|
"grad_norm": 2.687331199645996, |
|
"learning_rate": 5.85e-05, |
|
"loss": 0.4457, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 18.872214643019554, |
|
"eval_accuracy": 0.9038571769197852, |
|
"eval_loss": 0.5578777194023132, |
|
"eval_runtime": 57.0961, |
|
"eval_samples_per_second": 266.883, |
|
"eval_steps_per_second": 8.354, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 19.099590723055936, |
|
"grad_norm": 2.4610774517059326, |
|
"learning_rate": 5.8e-05, |
|
"loss": 0.4456, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 19.099590723055936, |
|
"eval_accuracy": 0.9040196805071515, |
|
"eval_loss": 0.570646345615387, |
|
"eval_runtime": 57.0535, |
|
"eval_samples_per_second": 267.083, |
|
"eval_steps_per_second": 8.361, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 19.326966803092315, |
|
"grad_norm": 2.7358736991882324, |
|
"learning_rate": 5.7499999999999995e-05, |
|
"loss": 0.4439, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 19.326966803092315, |
|
"eval_accuracy": 0.9047236917555364, |
|
"eval_loss": 0.5603917837142944, |
|
"eval_runtime": 57.9311, |
|
"eval_samples_per_second": 263.037, |
|
"eval_steps_per_second": 8.234, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 19.554342883128694, |
|
"grad_norm": 2.6887753009796143, |
|
"learning_rate": 5.6999999999999996e-05, |
|
"loss": 0.4371, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 19.554342883128694, |
|
"eval_accuracy": 0.9050488743420761, |
|
"eval_loss": 0.5606555938720703, |
|
"eval_runtime": 57.2573, |
|
"eval_samples_per_second": 266.132, |
|
"eval_steps_per_second": 8.331, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 19.781718963165076, |
|
"grad_norm": 4.247918128967285, |
|
"learning_rate": 5.65e-05, |
|
"loss": 0.4419, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 19.781718963165076, |
|
"eval_accuracy": 0.9042590959223571, |
|
"eval_loss": 0.5585463643074036, |
|
"eval_runtime": 66.8598, |
|
"eval_samples_per_second": 227.91, |
|
"eval_steps_per_second": 7.134, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 20.009095043201455, |
|
"grad_norm": 2.268353223800659, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 0.4415, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 20.009095043201455, |
|
"eval_accuracy": 0.9043696805910753, |
|
"eval_loss": 0.5533380508422852, |
|
"eval_runtime": 75.8782, |
|
"eval_samples_per_second": 200.822, |
|
"eval_steps_per_second": 6.286, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 20.236471123237834, |
|
"grad_norm": 2.3621208667755127, |
|
"learning_rate": 5.550000000000001e-05, |
|
"loss": 0.4363, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 20.236471123237834, |
|
"eval_accuracy": 0.9052580054316933, |
|
"eval_loss": 0.561882734298706, |
|
"eval_runtime": 76.0871, |
|
"eval_samples_per_second": 200.27, |
|
"eval_steps_per_second": 6.269, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 20.463847203274216, |
|
"grad_norm": 2.7799289226531982, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 0.4327, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 20.463847203274216, |
|
"eval_accuracy": 0.9050759625981704, |
|
"eval_loss": 0.5621650815010071, |
|
"eval_runtime": 75.1467, |
|
"eval_samples_per_second": 202.777, |
|
"eval_steps_per_second": 6.348, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 20.691223283310595, |
|
"grad_norm": 2.6320040225982666, |
|
"learning_rate": 5.45e-05, |
|
"loss": 0.4341, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 20.691223283310595, |
|
"eval_accuracy": 0.90537806159975, |
|
"eval_loss": 0.5658455491065979, |
|
"eval_runtime": 75.0197, |
|
"eval_samples_per_second": 203.12, |
|
"eval_steps_per_second": 6.358, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 20.918599363346974, |
|
"grad_norm": 2.626370668411255, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 0.436, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 20.918599363346974, |
|
"eval_accuracy": 0.9057144525510582, |
|
"eval_loss": 0.5474947094917297, |
|
"eval_runtime": 74.8407, |
|
"eval_samples_per_second": 203.606, |
|
"eval_steps_per_second": 6.374, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 21.145975443383357, |
|
"grad_norm": 2.2103912830352783, |
|
"learning_rate": 5.3500000000000006e-05, |
|
"loss": 0.43, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 21.145975443383357, |
|
"eval_accuracy": 0.9056834598326479, |
|
"eval_loss": 0.5598079562187195, |
|
"eval_runtime": 67.154, |
|
"eval_samples_per_second": 226.911, |
|
"eval_steps_per_second": 7.103, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 21.373351523419736, |
|
"grad_norm": 2.3939335346221924, |
|
"learning_rate": 5.300000000000001e-05, |
|
"loss": 0.4253, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 21.373351523419736, |
|
"eval_accuracy": 0.9061305526247623, |
|
"eval_loss": 0.5633291602134705, |
|
"eval_runtime": 66.5353, |
|
"eval_samples_per_second": 229.021, |
|
"eval_steps_per_second": 7.169, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 21.600727603456118, |
|
"grad_norm": 2.4314322471618652, |
|
"learning_rate": 5.25e-05, |
|
"loss": 0.4276, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 21.600727603456118, |
|
"eval_accuracy": 0.9062991778333528, |
|
"eval_loss": 0.5566443800926208, |
|
"eval_runtime": 68.0296, |
|
"eval_samples_per_second": 223.991, |
|
"eval_steps_per_second": 7.012, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 21.828103683492497, |
|
"grad_norm": 2.792711019515991, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 0.4235, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 21.828103683492497, |
|
"eval_accuracy": 0.9063092114708347, |
|
"eval_loss": 0.5603668093681335, |
|
"eval_runtime": 67.1839, |
|
"eval_samples_per_second": 226.81, |
|
"eval_steps_per_second": 7.1, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 22.055479763528876, |
|
"grad_norm": 3.2149298191070557, |
|
"learning_rate": 5.1500000000000005e-05, |
|
"loss": 0.4293, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 22.055479763528876, |
|
"eval_accuracy": 0.9061439320028164, |
|
"eval_loss": 0.5632808804512024, |
|
"eval_runtime": 55.9195, |
|
"eval_samples_per_second": 272.499, |
|
"eval_steps_per_second": 8.53, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 22.282855843565258, |
|
"grad_norm": 2.1803324222564697, |
|
"learning_rate": 5.1000000000000006e-05, |
|
"loss": 0.4226, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 22.282855843565258, |
|
"eval_accuracy": 0.9075216851954307, |
|
"eval_loss": 0.5528887510299683, |
|
"eval_runtime": 56.7606, |
|
"eval_samples_per_second": 268.461, |
|
"eval_steps_per_second": 8.404, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 22.510231923601637, |
|
"grad_norm": 2.293896436691284, |
|
"learning_rate": 5.05e-05, |
|
"loss": 0.4193, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 22.510231923601637, |
|
"eval_accuracy": 0.907387491792463, |
|
"eval_loss": 0.5482432246208191, |
|
"eval_runtime": 57.104, |
|
"eval_samples_per_second": 266.846, |
|
"eval_steps_per_second": 8.353, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 22.737608003638016, |
|
"grad_norm": 2.2139928340911865, |
|
"learning_rate": 5e-05, |
|
"loss": 0.4158, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 22.737608003638016, |
|
"eval_accuracy": 0.9077673279064253, |
|
"eval_loss": 0.5403118133544922, |
|
"eval_runtime": 55.9516, |
|
"eval_samples_per_second": 272.342, |
|
"eval_steps_per_second": 8.525, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 22.9649840836744, |
|
"grad_norm": 2.241081476211548, |
|
"learning_rate": 4.9500000000000004e-05, |
|
"loss": 0.4174, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 22.9649840836744, |
|
"eval_accuracy": 0.9083347677575133, |
|
"eval_loss": 0.5309577584266663, |
|
"eval_runtime": 56.8789, |
|
"eval_samples_per_second": 267.903, |
|
"eval_steps_per_second": 8.386, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 23.192360163710777, |
|
"grad_norm": 3.3256096839904785, |
|
"learning_rate": 4.9e-05, |
|
"loss": 0.4191, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 23.192360163710777, |
|
"eval_accuracy": 0.9088304562502367, |
|
"eval_loss": 0.5443009734153748, |
|
"eval_runtime": 56.8135, |
|
"eval_samples_per_second": 268.211, |
|
"eval_steps_per_second": 8.396, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 23.419736243747156, |
|
"grad_norm": 2.716857433319092, |
|
"learning_rate": 4.85e-05, |
|
"loss": 0.4071, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 23.419736243747156, |
|
"eval_accuracy": 0.9077237472285563, |
|
"eval_loss": 0.5535444617271423, |
|
"eval_runtime": 56.7929, |
|
"eval_samples_per_second": 268.308, |
|
"eval_steps_per_second": 8.399, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 23.64711232378354, |
|
"grad_norm": 2.466326951980591, |
|
"learning_rate": 4.8e-05, |
|
"loss": 0.4159, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 23.64711232378354, |
|
"eval_accuracy": 0.9093382373851798, |
|
"eval_loss": 0.5515927076339722, |
|
"eval_runtime": 56.8114, |
|
"eval_samples_per_second": 268.221, |
|
"eval_steps_per_second": 8.396, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 23.874488403819917, |
|
"grad_norm": 2.6443376541137695, |
|
"learning_rate": 4.75e-05, |
|
"loss": 0.4159, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 23.874488403819917, |
|
"eval_accuracy": 0.9080766240994744, |
|
"eval_loss": 0.5417291522026062, |
|
"eval_runtime": 56.0043, |
|
"eval_samples_per_second": 272.086, |
|
"eval_steps_per_second": 8.517, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 24.1018644838563, |
|
"grad_norm": 2.359405994415283, |
|
"learning_rate": 4.7e-05, |
|
"loss": 0.4128, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 24.1018644838563, |
|
"eval_accuracy": 0.9082854247369369, |
|
"eval_loss": 0.546323299407959, |
|
"eval_runtime": 55.971, |
|
"eval_samples_per_second": 272.248, |
|
"eval_steps_per_second": 8.522, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 24.32924056389268, |
|
"grad_norm": 2.0880720615386963, |
|
"learning_rate": 4.6500000000000005e-05, |
|
"loss": 0.4034, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 24.32924056389268, |
|
"eval_accuracy": 0.9087918433807937, |
|
"eval_loss": 0.5541105270385742, |
|
"eval_runtime": 58.891, |
|
"eval_samples_per_second": 258.749, |
|
"eval_steps_per_second": 8.1, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 24.556616643929058, |
|
"grad_norm": 2.977452039718628, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 0.4051, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 24.556616643929058, |
|
"eval_accuracy": 0.908003587059478, |
|
"eval_loss": 0.5499656200408936, |
|
"eval_runtime": 57.0241, |
|
"eval_samples_per_second": 267.22, |
|
"eval_steps_per_second": 8.365, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 24.78399272396544, |
|
"grad_norm": 2.6565568447113037, |
|
"learning_rate": 4.55e-05, |
|
"loss": 0.3973, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 24.78399272396544, |
|
"eval_accuracy": 0.9084627466876093, |
|
"eval_loss": 0.5408248901367188, |
|
"eval_runtime": 56.052, |
|
"eval_samples_per_second": 271.855, |
|
"eval_steps_per_second": 8.51, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 25.01136880400182, |
|
"grad_norm": 2.017199993133545, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.4038, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 25.01136880400182, |
|
"eval_accuracy": 0.9091495750162909, |
|
"eval_loss": 0.531546413898468, |
|
"eval_runtime": 56.6501, |
|
"eval_samples_per_second": 268.984, |
|
"eval_steps_per_second": 8.42, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 25.238744884038198, |
|
"grad_norm": 2.50418758392334, |
|
"learning_rate": 4.4500000000000004e-05, |
|
"loss": 0.3993, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 25.238744884038198, |
|
"eval_accuracy": 0.9091283316273341, |
|
"eval_loss": 0.5372242331504822, |
|
"eval_runtime": 56.2979, |
|
"eval_samples_per_second": 270.667, |
|
"eval_steps_per_second": 8.473, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 25.46612096407458, |
|
"grad_norm": 2.4235289096832275, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 0.3981, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 25.46612096407458, |
|
"eval_accuracy": 0.9090792662970008, |
|
"eval_loss": 0.5424542427062988, |
|
"eval_runtime": 56.0065, |
|
"eval_samples_per_second": 272.075, |
|
"eval_steps_per_second": 8.517, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 25.69349704411096, |
|
"grad_norm": 2.049229621887207, |
|
"learning_rate": 4.35e-05, |
|
"loss": 0.3965, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 25.69349704411096, |
|
"eval_accuracy": 0.9091030899558308, |
|
"eval_loss": 0.5371273159980774, |
|
"eval_runtime": 56.9032, |
|
"eval_samples_per_second": 267.788, |
|
"eval_steps_per_second": 8.383, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 25.920873124147338, |
|
"grad_norm": 3.2039127349853516, |
|
"learning_rate": 4.3e-05, |
|
"loss": 0.3956, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 25.920873124147338, |
|
"eval_accuracy": 0.9102188871200084, |
|
"eval_loss": 0.5293972492218018, |
|
"eval_runtime": 56.0727, |
|
"eval_samples_per_second": 271.754, |
|
"eval_steps_per_second": 8.507, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 26.14824920418372, |
|
"grad_norm": 2.5481934547424316, |
|
"learning_rate": 4.25e-05, |
|
"loss": 0.3968, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 26.14824920418372, |
|
"eval_accuracy": 0.9097068103213896, |
|
"eval_loss": 0.5351966619491577, |
|
"eval_runtime": 55.9717, |
|
"eval_samples_per_second": 272.245, |
|
"eval_steps_per_second": 8.522, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 26.3756252842201, |
|
"grad_norm": 2.174415111541748, |
|
"learning_rate": 4.2e-05, |
|
"loss": 0.3902, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 26.3756252842201, |
|
"eval_accuracy": 0.9100280555809778, |
|
"eval_loss": 0.5361006855964661, |
|
"eval_runtime": 56.0896, |
|
"eval_samples_per_second": 271.672, |
|
"eval_steps_per_second": 8.504, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 26.60300136425648, |
|
"grad_norm": 2.4938621520996094, |
|
"learning_rate": 4.15e-05, |
|
"loss": 0.3919, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 26.60300136425648, |
|
"eval_accuracy": 0.9107032517948321, |
|
"eval_loss": 0.5426139831542969, |
|
"eval_runtime": 56.0504, |
|
"eval_samples_per_second": 271.863, |
|
"eval_steps_per_second": 8.51, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 26.83037744429286, |
|
"grad_norm": 2.729896306991577, |
|
"learning_rate": 4.1e-05, |
|
"loss": 0.3932, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 26.83037744429286, |
|
"eval_accuracy": 0.910556622486772, |
|
"eval_loss": 0.5372660756111145, |
|
"eval_runtime": 56.0573, |
|
"eval_samples_per_second": 271.829, |
|
"eval_steps_per_second": 8.509, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 27.05775352432924, |
|
"grad_norm": 2.150261163711548, |
|
"learning_rate": 4.05e-05, |
|
"loss": 0.3902, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 27.05775352432924, |
|
"eval_accuracy": 0.9114104771079771, |
|
"eval_loss": 0.5252653360366821, |
|
"eval_runtime": 56.1245, |
|
"eval_samples_per_second": 271.503, |
|
"eval_steps_per_second": 8.499, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 27.285129604365622, |
|
"grad_norm": 2.6134257316589355, |
|
"learning_rate": 4e-05, |
|
"loss": 0.393, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 27.285129604365622, |
|
"eval_accuracy": 0.910748714906454, |
|
"eval_loss": 0.5393661260604858, |
|
"eval_runtime": 56.1182, |
|
"eval_samples_per_second": 271.534, |
|
"eval_steps_per_second": 8.5, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 27.512505684402, |
|
"grad_norm": 3.204314947128296, |
|
"learning_rate": 3.9500000000000005e-05, |
|
"loss": 0.3869, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 27.512505684402, |
|
"eval_accuracy": 0.9109071564105236, |
|
"eval_loss": 0.5372085571289062, |
|
"eval_runtime": 55.938, |
|
"eval_samples_per_second": 272.409, |
|
"eval_steps_per_second": 8.527, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 27.73988176443838, |
|
"grad_norm": 2.1017961502075195, |
|
"learning_rate": 3.9000000000000006e-05, |
|
"loss": 0.3869, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 27.73988176443838, |
|
"eval_accuracy": 0.911014457937958, |
|
"eval_loss": 0.5260709524154663, |
|
"eval_runtime": 56.0593, |
|
"eval_samples_per_second": 271.819, |
|
"eval_steps_per_second": 8.509, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 27.967257844474762, |
|
"grad_norm": 1.7981553077697754, |
|
"learning_rate": 3.85e-05, |
|
"loss": 0.3817, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 27.967257844474762, |
|
"eval_accuracy": 0.9113206772639144, |
|
"eval_loss": 0.532031238079071, |
|
"eval_runtime": 56.0692, |
|
"eval_samples_per_second": 271.771, |
|
"eval_steps_per_second": 8.507, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 28.19463392451114, |
|
"grad_norm": 2.4235992431640625, |
|
"learning_rate": 3.8e-05, |
|
"loss": 0.3781, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 28.19463392451114, |
|
"eval_accuracy": 0.9117986222084103, |
|
"eval_loss": 0.5406020283699036, |
|
"eval_runtime": 56.0456, |
|
"eval_samples_per_second": 271.886, |
|
"eval_steps_per_second": 8.511, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 28.42201000454752, |
|
"grad_norm": 2.256941795349121, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 0.3793, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 28.42201000454752, |
|
"eval_accuracy": 0.9126116944372022, |
|
"eval_loss": 0.5184915661811829, |
|
"eval_runtime": 56.0344, |
|
"eval_samples_per_second": 271.94, |
|
"eval_steps_per_second": 8.513, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 28.649386084583902, |
|
"grad_norm": 2.4860892295837402, |
|
"learning_rate": 3.7e-05, |
|
"loss": 0.3758, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 28.649386084583902, |
|
"eval_accuracy": 0.911845377066606, |
|
"eval_loss": 0.5190649628639221, |
|
"eval_runtime": 55.9764, |
|
"eval_samples_per_second": 272.222, |
|
"eval_steps_per_second": 8.521, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 28.87676216462028, |
|
"grad_norm": 2.1798510551452637, |
|
"learning_rate": 3.65e-05, |
|
"loss": 0.382, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 28.87676216462028, |
|
"eval_accuracy": 0.9118922815248525, |
|
"eval_loss": 0.5215730667114258, |
|
"eval_runtime": 56.0709, |
|
"eval_samples_per_second": 271.763, |
|
"eval_steps_per_second": 8.507, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 29.104138244656664, |
|
"grad_norm": 2.404370069503784, |
|
"learning_rate": 3.6e-05, |
|
"loss": 0.3763, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 29.104138244656664, |
|
"eval_accuracy": 0.912145037637229, |
|
"eval_loss": 0.5312708020210266, |
|
"eval_runtime": 56.2338, |
|
"eval_samples_per_second": 270.976, |
|
"eval_steps_per_second": 8.482, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 29.331514324693043, |
|
"grad_norm": 2.4613826274871826, |
|
"learning_rate": 3.55e-05, |
|
"loss": 0.3788, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 29.331514324693043, |
|
"eval_accuracy": 0.9128715433606646, |
|
"eval_loss": 0.5222127437591553, |
|
"eval_runtime": 56.8998, |
|
"eval_samples_per_second": 267.804, |
|
"eval_steps_per_second": 8.383, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 29.55889040472942, |
|
"grad_norm": 3.3356547355651855, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.3755, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 29.55889040472942, |
|
"eval_accuracy": 0.913125950801356, |
|
"eval_loss": 0.5145973563194275, |
|
"eval_runtime": 56.8962, |
|
"eval_samples_per_second": 267.821, |
|
"eval_steps_per_second": 8.384, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 29.786266484765804, |
|
"grad_norm": 2.2212953567504883, |
|
"learning_rate": 3.45e-05, |
|
"loss": 0.3788, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 29.786266484765804, |
|
"eval_accuracy": 0.9125449385052034, |
|
"eval_loss": 0.531129002571106, |
|
"eval_runtime": 56.1683, |
|
"eval_samples_per_second": 271.292, |
|
"eval_steps_per_second": 8.492, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 30.013642564802183, |
|
"grad_norm": 2.8541479110717773, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 0.3737, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 30.013642564802183, |
|
"eval_accuracy": 0.9125566886698588, |
|
"eval_loss": 0.5407569408416748, |
|
"eval_runtime": 56.0905, |
|
"eval_samples_per_second": 271.668, |
|
"eval_steps_per_second": 8.504, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 30.24101864483856, |
|
"grad_norm": 2.438603162765503, |
|
"learning_rate": 3.35e-05, |
|
"loss": 0.3702, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 30.24101864483856, |
|
"eval_accuracy": 0.9129054862362733, |
|
"eval_loss": 0.5133882761001587, |
|
"eval_runtime": 57.1245, |
|
"eval_samples_per_second": 266.751, |
|
"eval_steps_per_second": 8.35, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 30.468394724874944, |
|
"grad_norm": 2.1143336296081543, |
|
"learning_rate": 3.3e-05, |
|
"loss": 0.3729, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 30.468394724874944, |
|
"eval_accuracy": 0.9124566054408887, |
|
"eval_loss": 0.5418105125427246, |
|
"eval_runtime": 56.0088, |
|
"eval_samples_per_second": 272.065, |
|
"eval_steps_per_second": 8.517, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 30.695770804911323, |
|
"grad_norm": 2.317859649658203, |
|
"learning_rate": 3.2500000000000004e-05, |
|
"loss": 0.3662, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 30.695770804911323, |
|
"eval_accuracy": 0.9135011845911662, |
|
"eval_loss": 0.5446010828018188, |
|
"eval_runtime": 56.0915, |
|
"eval_samples_per_second": 271.663, |
|
"eval_steps_per_second": 8.504, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 30.923146884947702, |
|
"grad_norm": 2.983668088912964, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 0.3647, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 30.923146884947702, |
|
"eval_accuracy": 0.9138354207606285, |
|
"eval_loss": 0.527925968170166, |
|
"eval_runtime": 56.0461, |
|
"eval_samples_per_second": 271.883, |
|
"eval_steps_per_second": 8.511, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 31.150522964984084, |
|
"grad_norm": 2.1927847862243652, |
|
"learning_rate": 3.15e-05, |
|
"loss": 0.3683, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 31.150522964984084, |
|
"eval_accuracy": 0.9147327240782173, |
|
"eval_loss": 0.5225592255592346, |
|
"eval_runtime": 57.0493, |
|
"eval_samples_per_second": 267.102, |
|
"eval_steps_per_second": 8.361, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 31.377899045020463, |
|
"grad_norm": 2.345428228378296, |
|
"learning_rate": 3.1e-05, |
|
"loss": 0.3628, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 31.377899045020463, |
|
"eval_accuracy": 0.913656255193207, |
|
"eval_loss": 0.5174685716629028, |
|
"eval_runtime": 55.9136, |
|
"eval_samples_per_second": 272.528, |
|
"eval_steps_per_second": 8.531, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 31.605275125056846, |
|
"grad_norm": 2.5066728591918945, |
|
"learning_rate": 3.05e-05, |
|
"loss": 0.3651, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 31.605275125056846, |
|
"eval_accuracy": 0.9143142595628372, |
|
"eval_loss": 0.5336447358131409, |
|
"eval_runtime": 56.0707, |
|
"eval_samples_per_second": 271.764, |
|
"eval_steps_per_second": 8.507, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 31.832651205093224, |
|
"grad_norm": 2.5718226432800293, |
|
"learning_rate": 3e-05, |
|
"loss": 0.3621, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 31.832651205093224, |
|
"eval_accuracy": 0.914124758029471, |
|
"eval_loss": 0.5136735439300537, |
|
"eval_runtime": 56.0417, |
|
"eval_samples_per_second": 271.905, |
|
"eval_steps_per_second": 8.512, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 32.06002728512961, |
|
"grad_norm": 3.0197207927703857, |
|
"learning_rate": 2.95e-05, |
|
"loss": 0.3598, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 32.06002728512961, |
|
"eval_accuracy": 0.9143737680741543, |
|
"eval_loss": 0.5096654295921326, |
|
"eval_runtime": 56.0653, |
|
"eval_samples_per_second": 271.79, |
|
"eval_steps_per_second": 8.508, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 32.287403365165986, |
|
"grad_norm": 2.932882785797119, |
|
"learning_rate": 2.9e-05, |
|
"loss": 0.3583, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 32.287403365165986, |
|
"eval_accuracy": 0.9142125358692816, |
|
"eval_loss": 0.5173851251602173, |
|
"eval_runtime": 56.0814, |
|
"eval_samples_per_second": 271.712, |
|
"eval_steps_per_second": 8.505, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 32.514779445202365, |
|
"grad_norm": 1.9991718530654907, |
|
"learning_rate": 2.8499999999999998e-05, |
|
"loss": 0.3542, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 32.514779445202365, |
|
"eval_accuracy": 0.9147026168203635, |
|
"eval_loss": 0.522916316986084, |
|
"eval_runtime": 55.8503, |
|
"eval_samples_per_second": 272.837, |
|
"eval_steps_per_second": 8.541, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 32.74215552523874, |
|
"grad_norm": 1.8940651416778564, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 0.356, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 32.74215552523874, |
|
"eval_accuracy": 0.9140601083755969, |
|
"eval_loss": 0.5267335176467896, |
|
"eval_runtime": 56.0286, |
|
"eval_samples_per_second": 271.968, |
|
"eval_steps_per_second": 8.514, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 32.96953160527512, |
|
"grad_norm": 2.228545665740967, |
|
"learning_rate": 2.7500000000000004e-05, |
|
"loss": 0.3554, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 32.96953160527512, |
|
"eval_accuracy": 0.9148385388009793, |
|
"eval_loss": 0.518826425075531, |
|
"eval_runtime": 57.0798, |
|
"eval_samples_per_second": 266.96, |
|
"eval_steps_per_second": 8.357, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 33.19690768531151, |
|
"grad_norm": 1.9578146934509277, |
|
"learning_rate": 2.7000000000000002e-05, |
|
"loss": 0.3503, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 33.19690768531151, |
|
"eval_accuracy": 0.9152538892188704, |
|
"eval_loss": 0.5155122876167297, |
|
"eval_runtime": 56.9103, |
|
"eval_samples_per_second": 267.755, |
|
"eval_steps_per_second": 8.382, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 33.42428376534789, |
|
"grad_norm": 2.4094908237457275, |
|
"learning_rate": 2.6500000000000004e-05, |
|
"loss": 0.3557, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 33.42428376534789, |
|
"eval_accuracy": 0.9150505399718056, |
|
"eval_loss": 0.5098891854286194, |
|
"eval_runtime": 55.8856, |
|
"eval_samples_per_second": 272.664, |
|
"eval_steps_per_second": 8.535, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 33.651659845384266, |
|
"grad_norm": 3.974923610687256, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 0.3504, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 33.651659845384266, |
|
"eval_accuracy": 0.9151609378276823, |
|
"eval_loss": 0.5117126703262329, |
|
"eval_runtime": 56.0457, |
|
"eval_samples_per_second": 271.885, |
|
"eval_steps_per_second": 8.511, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 33.879035925420645, |
|
"grad_norm": 2.490255355834961, |
|
"learning_rate": 2.5500000000000003e-05, |
|
"loss": 0.3543, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 33.879035925420645, |
|
"eval_accuracy": 0.9146644769193912, |
|
"eval_loss": 0.5268692970275879, |
|
"eval_runtime": 55.9308, |
|
"eval_samples_per_second": 272.444, |
|
"eval_steps_per_second": 8.528, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 34.106412005457024, |
|
"grad_norm": 2.831305742263794, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.352, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 34.106412005457024, |
|
"eval_accuracy": 0.9151447226136769, |
|
"eval_loss": 0.5093286037445068, |
|
"eval_runtime": 56.9838, |
|
"eval_samples_per_second": 267.409, |
|
"eval_steps_per_second": 8.371, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 34.3337880854934, |
|
"grad_norm": 2.085205554962158, |
|
"learning_rate": 2.45e-05, |
|
"loss": 0.3511, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 34.3337880854934, |
|
"eval_accuracy": 0.9152598801148045, |
|
"eval_loss": 0.5099524259567261, |
|
"eval_runtime": 56.0735, |
|
"eval_samples_per_second": 271.75, |
|
"eval_steps_per_second": 8.507, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 34.56116416552979, |
|
"grad_norm": 2.5950026512145996, |
|
"learning_rate": 2.4e-05, |
|
"loss": 0.3477, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 34.56116416552979, |
|
"eval_accuracy": 0.9159398865939414, |
|
"eval_loss": 0.5132637023925781, |
|
"eval_runtime": 56.0447, |
|
"eval_samples_per_second": 271.89, |
|
"eval_steps_per_second": 8.511, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 34.78854024556617, |
|
"grad_norm": 3.0509707927703857, |
|
"learning_rate": 2.35e-05, |
|
"loss": 0.3487, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 34.78854024556617, |
|
"eval_accuracy": 0.9158777972417345, |
|
"eval_loss": 0.5157153010368347, |
|
"eval_runtime": 56.0477, |
|
"eval_samples_per_second": 271.876, |
|
"eval_steps_per_second": 8.511, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 35.01591632560255, |
|
"grad_norm": 3.29341983795166, |
|
"learning_rate": 2.3000000000000003e-05, |
|
"loss": 0.3432, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 35.01591632560255, |
|
"eval_accuracy": 0.9159272981938116, |
|
"eval_loss": 0.503484308719635, |
|
"eval_runtime": 56.0514, |
|
"eval_samples_per_second": 271.857, |
|
"eval_steps_per_second": 8.51, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 35.243292405638925, |
|
"grad_norm": 2.7035512924194336, |
|
"learning_rate": 2.25e-05, |
|
"loss": 0.3468, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 35.243292405638925, |
|
"eval_accuracy": 0.9168826617753377, |
|
"eval_loss": 0.504048228263855, |
|
"eval_runtime": 56.0964, |
|
"eval_samples_per_second": 271.639, |
|
"eval_steps_per_second": 8.503, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 35.470668485675304, |
|
"grad_norm": 4.089804649353027, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 0.3444, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 35.470668485675304, |
|
"eval_accuracy": 0.9165130415874907, |
|
"eval_loss": 0.5067149996757507, |
|
"eval_runtime": 56.0967, |
|
"eval_samples_per_second": 271.638, |
|
"eval_steps_per_second": 8.503, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 35.69804456571169, |
|
"grad_norm": 2.281663656234741, |
|
"learning_rate": 2.15e-05, |
|
"loss": 0.3391, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 35.69804456571169, |
|
"eval_accuracy": 0.9166254129979897, |
|
"eval_loss": 0.4899181127548218, |
|
"eval_runtime": 56.9423, |
|
"eval_samples_per_second": 267.604, |
|
"eval_steps_per_second": 8.377, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 35.92542064574807, |
|
"grad_norm": 2.6281392574310303, |
|
"learning_rate": 2.1e-05, |
|
"loss": 0.3395, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 35.92542064574807, |
|
"eval_accuracy": 0.9163551690154882, |
|
"eval_loss": 0.5145460367202759, |
|
"eval_runtime": 56.0529, |
|
"eval_samples_per_second": 271.85, |
|
"eval_steps_per_second": 8.51, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 36.15279672578445, |
|
"grad_norm": 2.2223994731903076, |
|
"learning_rate": 2.05e-05, |
|
"loss": 0.3405, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 36.15279672578445, |
|
"eval_accuracy": 0.9158962396841802, |
|
"eval_loss": 0.51621013879776, |
|
"eval_runtime": 56.078, |
|
"eval_samples_per_second": 271.729, |
|
"eval_steps_per_second": 8.506, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 36.38017280582083, |
|
"grad_norm": 2.688448190689087, |
|
"learning_rate": 2e-05, |
|
"loss": 0.3415, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 36.38017280582083, |
|
"eval_accuracy": 0.9164056168752696, |
|
"eval_loss": 0.5185515284538269, |
|
"eval_runtime": 56.0615, |
|
"eval_samples_per_second": 271.809, |
|
"eval_steps_per_second": 8.509, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 36.607548885857206, |
|
"grad_norm": 1.9620684385299683, |
|
"learning_rate": 1.9500000000000003e-05, |
|
"loss": 0.3375, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 36.607548885857206, |
|
"eval_accuracy": 0.9163537188934124, |
|
"eval_loss": 0.517730176448822, |
|
"eval_runtime": 56.9493, |
|
"eval_samples_per_second": 267.571, |
|
"eval_steps_per_second": 8.376, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 36.834924965893585, |
|
"grad_norm": 2.495645761489868, |
|
"learning_rate": 1.9e-05, |
|
"loss": 0.3398, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 36.834924965893585, |
|
"eval_accuracy": 0.9171486235060389, |
|
"eval_loss": 0.5006797313690186, |
|
"eval_runtime": 56.9369, |
|
"eval_samples_per_second": 267.63, |
|
"eval_steps_per_second": 8.378, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 37.06230104592997, |
|
"grad_norm": 2.4040660858154297, |
|
"learning_rate": 1.85e-05, |
|
"loss": 0.3393, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 37.06230104592997, |
|
"eval_accuracy": 0.9175943590821654, |
|
"eval_loss": 0.5150498151779175, |
|
"eval_runtime": 56.1005, |
|
"eval_samples_per_second": 271.62, |
|
"eval_steps_per_second": 8.503, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 37.28967712596635, |
|
"grad_norm": 2.7558135986328125, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.3344, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 37.28967712596635, |
|
"eval_accuracy": 0.9171837999528608, |
|
"eval_loss": 0.5036485195159912, |
|
"eval_runtime": 56.078, |
|
"eval_samples_per_second": 271.729, |
|
"eval_steps_per_second": 8.506, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 37.51705320600273, |
|
"grad_norm": 2.468700647354126, |
|
"learning_rate": 1.75e-05, |
|
"loss": 0.3352, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 37.51705320600273, |
|
"eval_accuracy": 0.9175697679179382, |
|
"eval_loss": 0.5030218958854675, |
|
"eval_runtime": 56.0873, |
|
"eval_samples_per_second": 271.684, |
|
"eval_steps_per_second": 8.505, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 37.74442928603911, |
|
"grad_norm": 2.5978448390960693, |
|
"learning_rate": 1.7000000000000003e-05, |
|
"loss": 0.3329, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 37.74442928603911, |
|
"eval_accuracy": 0.9171495853383843, |
|
"eval_loss": 0.5056445598602295, |
|
"eval_runtime": 56.0025, |
|
"eval_samples_per_second": 272.095, |
|
"eval_steps_per_second": 8.517, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 37.971805366075486, |
|
"grad_norm": 2.1514439582824707, |
|
"learning_rate": 1.65e-05, |
|
"loss": 0.3298, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 37.971805366075486, |
|
"eval_accuracy": 0.917162721884158, |
|
"eval_loss": 0.5005716681480408, |
|
"eval_runtime": 57.1445, |
|
"eval_samples_per_second": 266.658, |
|
"eval_steps_per_second": 8.347, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 38.19918144611187, |
|
"grad_norm": 2.6213526725769043, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.3283, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 38.19918144611187, |
|
"eval_accuracy": 0.9176722555185908, |
|
"eval_loss": 0.5062026977539062, |
|
"eval_runtime": 56.1037, |
|
"eval_samples_per_second": 271.604, |
|
"eval_steps_per_second": 8.502, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 38.42655752614825, |
|
"grad_norm": 2.2421905994415283, |
|
"learning_rate": 1.55e-05, |
|
"loss": 0.327, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 38.42655752614825, |
|
"eval_accuracy": 0.918665843034967, |
|
"eval_loss": 0.4939974844455719, |
|
"eval_runtime": 56.9571, |
|
"eval_samples_per_second": 267.535, |
|
"eval_steps_per_second": 8.375, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 38.65393360618463, |
|
"grad_norm": 2.6543004512786865, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.3335, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 38.65393360618463, |
|
"eval_accuracy": 0.9179265011087174, |
|
"eval_loss": 0.5135884284973145, |
|
"eval_runtime": 56.0886, |
|
"eval_samples_per_second": 271.677, |
|
"eval_steps_per_second": 8.504, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 38.88130968622101, |
|
"grad_norm": 2.2833940982818604, |
|
"learning_rate": 1.45e-05, |
|
"loss": 0.3323, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 38.88130968622101, |
|
"eval_accuracy": 0.9180402792823424, |
|
"eval_loss": 0.5044585466384888, |
|
"eval_runtime": 56.0736, |
|
"eval_samples_per_second": 271.75, |
|
"eval_steps_per_second": 8.507, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 39.10868576625739, |
|
"grad_norm": 2.157496213912964, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 0.3323, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 39.10868576625739, |
|
"eval_accuracy": 0.9178364524991991, |
|
"eval_loss": 0.5072239637374878, |
|
"eval_runtime": 56.0862, |
|
"eval_samples_per_second": 271.689, |
|
"eval_steps_per_second": 8.505, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 39.33606184629377, |
|
"grad_norm": 2.467801809310913, |
|
"learning_rate": 1.3500000000000001e-05, |
|
"loss": 0.3209, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 39.33606184629377, |
|
"eval_accuracy": 0.9178805584191491, |
|
"eval_loss": 0.5000079870223999, |
|
"eval_runtime": 56.0688, |
|
"eval_samples_per_second": 271.773, |
|
"eval_steps_per_second": 8.507, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 39.56343792633015, |
|
"grad_norm": 1.764186143875122, |
|
"learning_rate": 1.3000000000000001e-05, |
|
"loss": 0.318, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 39.56343792633015, |
|
"eval_accuracy": 0.9177274550237315, |
|
"eval_loss": 0.5174301862716675, |
|
"eval_runtime": 55.9254, |
|
"eval_samples_per_second": 272.47, |
|
"eval_steps_per_second": 8.529, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 39.79081400636653, |
|
"grad_norm": 1.9439367055892944, |
|
"learning_rate": 1.25e-05, |
|
"loss": 0.3304, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 39.79081400636653, |
|
"eval_accuracy": 0.9182047509597898, |
|
"eval_loss": 0.5170900821685791, |
|
"eval_runtime": 56.0765, |
|
"eval_samples_per_second": 271.736, |
|
"eval_steps_per_second": 8.506, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 40.01819008640291, |
|
"grad_norm": 2.7483394145965576, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.3269, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 40.01819008640291, |
|
"eval_accuracy": 0.9192084760524332, |
|
"eval_loss": 0.5051037669181824, |
|
"eval_runtime": 56.982, |
|
"eval_samples_per_second": 267.418, |
|
"eval_steps_per_second": 8.371, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 40.24556616643929, |
|
"grad_norm": 1.9884289503097534, |
|
"learning_rate": 1.1500000000000002e-05, |
|
"loss": 0.3226, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 40.24556616643929, |
|
"eval_accuracy": 0.9179925787853008, |
|
"eval_loss": 0.4963218867778778, |
|
"eval_runtime": 56.9742, |
|
"eval_samples_per_second": 267.454, |
|
"eval_steps_per_second": 8.372, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 40.47294224647567, |
|
"grad_norm": 2.4133260250091553, |
|
"learning_rate": 1.1000000000000001e-05, |
|
"loss": 0.3193, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 40.47294224647567, |
|
"eval_accuracy": 0.9186944890989619, |
|
"eval_loss": 0.5157626867294312, |
|
"eval_runtime": 56.072, |
|
"eval_samples_per_second": 271.758, |
|
"eval_steps_per_second": 8.507, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 40.700318326512054, |
|
"grad_norm": 2.426737070083618, |
|
"learning_rate": 1.05e-05, |
|
"loss": 0.3204, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 40.700318326512054, |
|
"eval_accuracy": 0.918837528271839, |
|
"eval_loss": 0.49108728766441345, |
|
"eval_runtime": 56.0958, |
|
"eval_samples_per_second": 271.643, |
|
"eval_steps_per_second": 8.503, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 40.92769440654843, |
|
"grad_norm": 2.0523056983947754, |
|
"learning_rate": 1e-05, |
|
"loss": 0.3234, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 40.92769440654843, |
|
"eval_accuracy": 0.9187695504636845, |
|
"eval_loss": 0.497799277305603, |
|
"eval_runtime": 56.0896, |
|
"eval_samples_per_second": 271.673, |
|
"eval_steps_per_second": 8.504, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 41.15507048658481, |
|
"grad_norm": 2.7144970893859863, |
|
"learning_rate": 9.5e-06, |
|
"loss": 0.3211, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 41.15507048658481, |
|
"eval_accuracy": 0.9191388218103149, |
|
"eval_loss": 0.4986066222190857, |
|
"eval_runtime": 56.0597, |
|
"eval_samples_per_second": 271.818, |
|
"eval_steps_per_second": 8.509, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 41.38244656662119, |
|
"grad_norm": 2.1701807975769043, |
|
"learning_rate": 9e-06, |
|
"loss": 0.3202, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 41.38244656662119, |
|
"eval_accuracy": 0.9192327856840563, |
|
"eval_loss": 0.5045046210289001, |
|
"eval_runtime": 56.0657, |
|
"eval_samples_per_second": 271.788, |
|
"eval_steps_per_second": 8.508, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 41.60982264665757, |
|
"grad_norm": 1.8716968297958374, |
|
"learning_rate": 8.500000000000002e-06, |
|
"loss": 0.3178, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 41.60982264665757, |
|
"eval_accuracy": 0.918940321981522, |
|
"eval_loss": 0.5036594867706299, |
|
"eval_runtime": 57.7478, |
|
"eval_samples_per_second": 263.872, |
|
"eval_steps_per_second": 8.26, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 41.83719872669395, |
|
"grad_norm": 2.7744719982147217, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.3181, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 41.83719872669395, |
|
"eval_accuracy": 0.9190958139064731, |
|
"eval_loss": 0.49405789375305176, |
|
"eval_runtime": 55.9547, |
|
"eval_samples_per_second": 272.327, |
|
"eval_steps_per_second": 8.525, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 42.064574806730334, |
|
"grad_norm": 1.9463552236557007, |
|
"learning_rate": 7.5e-06, |
|
"loss": 0.3139, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 42.064574806730334, |
|
"eval_accuracy": 0.9185921615457032, |
|
"eval_loss": 0.5153664946556091, |
|
"eval_runtime": 55.9616, |
|
"eval_samples_per_second": 272.294, |
|
"eval_steps_per_second": 8.524, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 42.29195088676671, |
|
"grad_norm": 1.7864753007888794, |
|
"learning_rate": 7.000000000000001e-06, |
|
"loss": 0.3155, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 42.29195088676671, |
|
"eval_accuracy": 0.9193213422174352, |
|
"eval_loss": 0.4968840777873993, |
|
"eval_runtime": 56.9973, |
|
"eval_samples_per_second": 267.346, |
|
"eval_steps_per_second": 8.369, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 42.51932696680309, |
|
"grad_norm": 2.8307926654815674, |
|
"learning_rate": 6.5000000000000004e-06, |
|
"loss": 0.3225, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 42.51932696680309, |
|
"eval_accuracy": 0.9194418625202981, |
|
"eval_loss": 0.49802207946777344, |
|
"eval_runtime": 57.0104, |
|
"eval_samples_per_second": 267.284, |
|
"eval_steps_per_second": 8.367, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 42.74670304683947, |
|
"grad_norm": 2.3244693279266357, |
|
"learning_rate": 6e-06, |
|
"loss": 0.3166, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 42.74670304683947, |
|
"eval_accuracy": 0.9192593340761245, |
|
"eval_loss": 0.5010645985603333, |
|
"eval_runtime": 55.9693, |
|
"eval_samples_per_second": 272.256, |
|
"eval_steps_per_second": 8.523, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 42.97407912687585, |
|
"grad_norm": 1.967890977859497, |
|
"learning_rate": 5.500000000000001e-06, |
|
"loss": 0.3146, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 42.97407912687585, |
|
"eval_accuracy": 0.9196343334604102, |
|
"eval_loss": 0.48918718099594116, |
|
"eval_runtime": 56.094, |
|
"eval_samples_per_second": 271.651, |
|
"eval_steps_per_second": 8.504, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 43.201455206912236, |
|
"grad_norm": 2.064098596572876, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3152, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 43.201455206912236, |
|
"eval_accuracy": 0.9197978271795374, |
|
"eval_loss": 0.4905773103237152, |
|
"eval_runtime": 56.0802, |
|
"eval_samples_per_second": 271.718, |
|
"eval_steps_per_second": 8.506, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 43.428831286948615, |
|
"grad_norm": 2.1124424934387207, |
|
"learning_rate": 4.5e-06, |
|
"loss": 0.3077, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 43.428831286948615, |
|
"eval_accuracy": 0.9199429121130404, |
|
"eval_loss": 0.4860183894634247, |
|
"eval_runtime": 55.9425, |
|
"eval_samples_per_second": 272.387, |
|
"eval_steps_per_second": 8.527, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 43.656207366984994, |
|
"grad_norm": 1.9392900466918945, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.3185, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 43.656207366984994, |
|
"eval_accuracy": 0.919319351233956, |
|
"eval_loss": 0.4847618043422699, |
|
"eval_runtime": 56.9968, |
|
"eval_samples_per_second": 267.348, |
|
"eval_steps_per_second": 8.369, |
|
"step": 96000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 46, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.105977491866255e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|