|
{ |
|
"best_metric": 1.022267460823059, |
|
"best_model_checkpoint": "./model_fine-tune/glot/xlm-r/ben-Beng/checkpoint-98500", |
|
"epoch": 29.60625187856928, |
|
"eval_steps": 500, |
|
"global_step": 98500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.15028554253080853, |
|
"grad_norm": 4.279157638549805, |
|
"learning_rate": 9.95e-05, |
|
"loss": 1.5206, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.15028554253080853, |
|
"eval_accuracy": 0.7123080245508447, |
|
"eval_loss": 1.6415441036224365, |
|
"eval_runtime": 108.0973, |
|
"eval_samples_per_second": 220.635, |
|
"eval_steps_per_second": 6.901, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.30057108506161706, |
|
"grad_norm": 3.948293685913086, |
|
"learning_rate": 9.900000000000001e-05, |
|
"loss": 1.4505, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.30057108506161706, |
|
"eval_accuracy": 0.7195765918792126, |
|
"eval_loss": 1.6040955781936646, |
|
"eval_runtime": 108.1918, |
|
"eval_samples_per_second": 220.442, |
|
"eval_steps_per_second": 6.895, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.4508566275924256, |
|
"grad_norm": 3.838204860687256, |
|
"learning_rate": 9.850000000000001e-05, |
|
"loss": 1.4092, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.4508566275924256, |
|
"eval_accuracy": 0.7255862618332372, |
|
"eval_loss": 1.5711835622787476, |
|
"eval_runtime": 105.314, |
|
"eval_samples_per_second": 226.466, |
|
"eval_steps_per_second": 7.084, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.6011421701232341, |
|
"grad_norm": 3.7270383834838867, |
|
"learning_rate": 9.8e-05, |
|
"loss": 1.3738, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.6011421701232341, |
|
"eval_accuracy": 0.7298732586939131, |
|
"eval_loss": 1.562820315361023, |
|
"eval_runtime": 104.3524, |
|
"eval_samples_per_second": 228.552, |
|
"eval_steps_per_second": 7.149, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.7514277126540427, |
|
"grad_norm": 3.720855474472046, |
|
"learning_rate": 9.75e-05, |
|
"loss": 1.3447, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.7514277126540427, |
|
"eval_accuracy": 0.7334076213509759, |
|
"eval_loss": 1.533949613571167, |
|
"eval_runtime": 105.6055, |
|
"eval_samples_per_second": 225.84, |
|
"eval_steps_per_second": 7.064, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.9017132551848512, |
|
"grad_norm": 3.9192264080047607, |
|
"learning_rate": 9.7e-05, |
|
"loss": 1.3215, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.9017132551848512, |
|
"eval_accuracy": 0.737443643353561, |
|
"eval_loss": 1.5036447048187256, |
|
"eval_runtime": 103.4988, |
|
"eval_samples_per_second": 230.437, |
|
"eval_steps_per_second": 7.208, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.0519987977156597, |
|
"grad_norm": 3.8544235229492188, |
|
"learning_rate": 9.65e-05, |
|
"loss": 1.3074, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.0519987977156597, |
|
"eval_accuracy": 0.7401312504576956, |
|
"eval_loss": 1.4803341627120972, |
|
"eval_runtime": 104.2256, |
|
"eval_samples_per_second": 228.831, |
|
"eval_steps_per_second": 7.158, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.2022843402464682, |
|
"grad_norm": 3.2664737701416016, |
|
"learning_rate": 9.6e-05, |
|
"loss": 1.2826, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.2022843402464682, |
|
"eval_accuracy": 0.7433592201310665, |
|
"eval_loss": 1.475807785987854, |
|
"eval_runtime": 104.5255, |
|
"eval_samples_per_second": 228.174, |
|
"eval_steps_per_second": 7.137, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.3525698827772767, |
|
"grad_norm": 3.448021650314331, |
|
"learning_rate": 9.55e-05, |
|
"loss": 1.2676, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.3525698827772767, |
|
"eval_accuracy": 0.7454010087654052, |
|
"eval_loss": 1.467736005783081, |
|
"eval_runtime": 106.1685, |
|
"eval_samples_per_second": 224.643, |
|
"eval_steps_per_second": 7.027, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.5028554253080855, |
|
"grad_norm": 3.6344072818756104, |
|
"learning_rate": 9.5e-05, |
|
"loss": 1.2462, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.5028554253080855, |
|
"eval_accuracy": 0.7480509601003421, |
|
"eval_loss": 1.454475998878479, |
|
"eval_runtime": 107.2463, |
|
"eval_samples_per_second": 222.385, |
|
"eval_steps_per_second": 6.956, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.653140967838894, |
|
"grad_norm": 3.191260576248169, |
|
"learning_rate": 9.449999999999999e-05, |
|
"loss": 1.2353, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.653140967838894, |
|
"eval_accuracy": 0.7502682619699697, |
|
"eval_loss": 1.4226497411727905, |
|
"eval_runtime": 107.1587, |
|
"eval_samples_per_second": 222.567, |
|
"eval_steps_per_second": 6.962, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.8034265103697025, |
|
"grad_norm": 3.074147939682007, |
|
"learning_rate": 9.4e-05, |
|
"loss": 1.2227, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.8034265103697025, |
|
"eval_accuracy": 0.7523797476360646, |
|
"eval_loss": 1.4185426235198975, |
|
"eval_runtime": 107.8236, |
|
"eval_samples_per_second": 221.195, |
|
"eval_steps_per_second": 6.919, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.953712052900511, |
|
"grad_norm": 3.0745174884796143, |
|
"learning_rate": 9.350000000000001e-05, |
|
"loss": 1.2179, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.953712052900511, |
|
"eval_accuracy": 0.7525899025066154, |
|
"eval_loss": 1.4250199794769287, |
|
"eval_runtime": 103.869, |
|
"eval_samples_per_second": 229.616, |
|
"eval_steps_per_second": 7.182, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 2.1039975954313195, |
|
"grad_norm": 3.809814453125, |
|
"learning_rate": 9.300000000000001e-05, |
|
"loss": 1.1973, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.1039975954313195, |
|
"eval_accuracy": 0.7544079430376059, |
|
"eval_loss": 1.4045050144195557, |
|
"eval_runtime": 103.9514, |
|
"eval_samples_per_second": 229.434, |
|
"eval_steps_per_second": 7.176, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.254283137962128, |
|
"grad_norm": 3.262021541595459, |
|
"learning_rate": 9.250000000000001e-05, |
|
"loss": 1.1854, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.254283137962128, |
|
"eval_accuracy": 0.7566596229680038, |
|
"eval_loss": 1.4005974531173706, |
|
"eval_runtime": 103.4306, |
|
"eval_samples_per_second": 230.59, |
|
"eval_steps_per_second": 7.213, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.4045686804929365, |
|
"grad_norm": 3.562636613845825, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 1.1894, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.4045686804929365, |
|
"eval_accuracy": 0.7518658362592376, |
|
"eval_loss": 1.4261118173599243, |
|
"eval_runtime": 107.7368, |
|
"eval_samples_per_second": 221.373, |
|
"eval_steps_per_second": 6.924, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.554854223023745, |
|
"grad_norm": 3.0819168090820312, |
|
"learning_rate": 9.15e-05, |
|
"loss": 1.1821, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 2.554854223023745, |
|
"eval_accuracy": 0.759439375239073, |
|
"eval_loss": 1.3829759359359741, |
|
"eval_runtime": 108.9119, |
|
"eval_samples_per_second": 218.984, |
|
"eval_steps_per_second": 6.85, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 2.7051397655545535, |
|
"grad_norm": 3.441218852996826, |
|
"learning_rate": 9.1e-05, |
|
"loss": 1.1617, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.7051397655545535, |
|
"eval_accuracy": 0.7595643512285075, |
|
"eval_loss": 1.38172447681427, |
|
"eval_runtime": 107.6971, |
|
"eval_samples_per_second": 221.454, |
|
"eval_steps_per_second": 6.927, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.855425308085362, |
|
"grad_norm": 2.9258110523223877, |
|
"learning_rate": 9.05e-05, |
|
"loss": 1.1647, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 2.855425308085362, |
|
"eval_accuracy": 0.7622641760596893, |
|
"eval_loss": 1.3564621210098267, |
|
"eval_runtime": 108.8704, |
|
"eval_samples_per_second": 219.068, |
|
"eval_steps_per_second": 6.852, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 3.005710850616171, |
|
"grad_norm": 3.097913980484009, |
|
"learning_rate": 9e-05, |
|
"loss": 1.1543, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 3.005710850616171, |
|
"eval_accuracy": 0.7637979723391941, |
|
"eval_loss": 1.3542959690093994, |
|
"eval_runtime": 108.5802, |
|
"eval_samples_per_second": 219.653, |
|
"eval_steps_per_second": 6.87, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 3.1559963931469794, |
|
"grad_norm": 3.168121099472046, |
|
"learning_rate": 8.950000000000001e-05, |
|
"loss": 1.1379, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 3.1559963931469794, |
|
"eval_accuracy": 0.76408780640708, |
|
"eval_loss": 1.3504241704940796, |
|
"eval_runtime": 108.2586, |
|
"eval_samples_per_second": 220.306, |
|
"eval_steps_per_second": 6.891, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 3.306281935677788, |
|
"grad_norm": 3.410958766937256, |
|
"learning_rate": 8.900000000000001e-05, |
|
"loss": 1.1254, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 3.306281935677788, |
|
"eval_accuracy": 0.7652592815735387, |
|
"eval_loss": 1.346474528312683, |
|
"eval_runtime": 107.3669, |
|
"eval_samples_per_second": 222.136, |
|
"eval_steps_per_second": 6.948, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 3.4565674782085964, |
|
"grad_norm": 2.775892734527588, |
|
"learning_rate": 8.850000000000001e-05, |
|
"loss": 1.1269, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 3.4565674782085964, |
|
"eval_accuracy": 0.7657911530106338, |
|
"eval_loss": 1.3591572046279907, |
|
"eval_runtime": 106.844, |
|
"eval_samples_per_second": 223.223, |
|
"eval_steps_per_second": 6.982, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 3.606853020739405, |
|
"grad_norm": 3.029595375061035, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 1.1247, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 3.606853020739405, |
|
"eval_accuracy": 0.767305872889515, |
|
"eval_loss": 1.3377037048339844, |
|
"eval_runtime": 108.9561, |
|
"eval_samples_per_second": 218.896, |
|
"eval_steps_per_second": 6.847, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 3.7571385632702134, |
|
"grad_norm": 3.1967430114746094, |
|
"learning_rate": 8.75e-05, |
|
"loss": 1.1123, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 3.7571385632702134, |
|
"eval_accuracy": 0.7687083850392965, |
|
"eval_loss": 1.340959906578064, |
|
"eval_runtime": 107.8508, |
|
"eval_samples_per_second": 221.139, |
|
"eval_steps_per_second": 6.917, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 3.907424105801022, |
|
"grad_norm": 2.859966993331909, |
|
"learning_rate": 8.7e-05, |
|
"loss": 1.112, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 3.907424105801022, |
|
"eval_accuracy": 0.7696794726604136, |
|
"eval_loss": 1.3237755298614502, |
|
"eval_runtime": 107.6845, |
|
"eval_samples_per_second": 221.48, |
|
"eval_steps_per_second": 6.928, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 4.057709648331831, |
|
"grad_norm": 2.8421764373779297, |
|
"learning_rate": 8.65e-05, |
|
"loss": 1.0983, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 4.057709648331831, |
|
"eval_accuracy": 0.7714031663702743, |
|
"eval_loss": 1.3144177198410034, |
|
"eval_runtime": 108.7428, |
|
"eval_samples_per_second": 219.325, |
|
"eval_steps_per_second": 6.86, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 4.207995190862639, |
|
"grad_norm": 3.1582884788513184, |
|
"learning_rate": 8.6e-05, |
|
"loss": 1.0918, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 4.207995190862639, |
|
"eval_accuracy": 0.7718766206792639, |
|
"eval_loss": 1.3221428394317627, |
|
"eval_runtime": 108.1645, |
|
"eval_samples_per_second": 220.498, |
|
"eval_steps_per_second": 6.897, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 4.358280733393448, |
|
"grad_norm": 2.631855010986328, |
|
"learning_rate": 8.55e-05, |
|
"loss": 1.0884, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 4.358280733393448, |
|
"eval_accuracy": 0.772324790492508, |
|
"eval_loss": 1.3152235746383667, |
|
"eval_runtime": 107.5113, |
|
"eval_samples_per_second": 221.837, |
|
"eval_steps_per_second": 6.939, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 4.508566275924256, |
|
"grad_norm": 3.242208480834961, |
|
"learning_rate": 8.5e-05, |
|
"loss": 1.0837, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 4.508566275924256, |
|
"eval_accuracy": 0.7737920955341676, |
|
"eval_loss": 1.2969176769256592, |
|
"eval_runtime": 107.6972, |
|
"eval_samples_per_second": 221.454, |
|
"eval_steps_per_second": 6.927, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 4.658851818455065, |
|
"grad_norm": 3.0691699981689453, |
|
"learning_rate": 8.450000000000001e-05, |
|
"loss": 1.0796, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 4.658851818455065, |
|
"eval_accuracy": 0.7738074322270785, |
|
"eval_loss": 1.3085857629776, |
|
"eval_runtime": 106.2258, |
|
"eval_samples_per_second": 224.522, |
|
"eval_steps_per_second": 7.023, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 4.809137360985873, |
|
"grad_norm": 3.258615732192993, |
|
"learning_rate": 8.4e-05, |
|
"loss": 1.0714, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 4.809137360985873, |
|
"eval_accuracy": 0.775101051869418, |
|
"eval_loss": 1.2939372062683105, |
|
"eval_runtime": 106.8715, |
|
"eval_samples_per_second": 223.165, |
|
"eval_steps_per_second": 6.98, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 4.959422903516682, |
|
"grad_norm": 3.1920101642608643, |
|
"learning_rate": 8.35e-05, |
|
"loss": 1.071, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 4.959422903516682, |
|
"eval_accuracy": 0.7753841079350378, |
|
"eval_loss": 1.2854809761047363, |
|
"eval_runtime": 106.6266, |
|
"eval_samples_per_second": 223.678, |
|
"eval_steps_per_second": 6.996, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 5.10970844604749, |
|
"grad_norm": 3.2814955711364746, |
|
"learning_rate": 8.3e-05, |
|
"loss": 1.0568, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 5.10970844604749, |
|
"eval_accuracy": 0.7761810007493132, |
|
"eval_loss": 1.2916713953018188, |
|
"eval_runtime": 107.4768, |
|
"eval_samples_per_second": 221.908, |
|
"eval_steps_per_second": 6.941, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 5.259993988578299, |
|
"grad_norm": 3.2327558994293213, |
|
"learning_rate": 8.25e-05, |
|
"loss": 1.0549, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 5.259993988578299, |
|
"eval_accuracy": 0.7780586564617779, |
|
"eval_loss": 1.2857751846313477, |
|
"eval_runtime": 107.761, |
|
"eval_samples_per_second": 221.323, |
|
"eval_steps_per_second": 6.923, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 5.410279531109107, |
|
"grad_norm": 3.0951426029205322, |
|
"learning_rate": 8.2e-05, |
|
"loss": 1.0511, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 5.410279531109107, |
|
"eval_accuracy": 0.7783227014884583, |
|
"eval_loss": 1.2948368787765503, |
|
"eval_runtime": 108.2092, |
|
"eval_samples_per_second": 220.406, |
|
"eval_steps_per_second": 6.894, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 5.560565073639916, |
|
"grad_norm": 2.639129638671875, |
|
"learning_rate": 8.15e-05, |
|
"loss": 1.0492, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 5.560565073639916, |
|
"eval_accuracy": 0.7788534810933934, |
|
"eval_loss": 1.2669661045074463, |
|
"eval_runtime": 108.4058, |
|
"eval_samples_per_second": 220.007, |
|
"eval_steps_per_second": 6.882, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 5.710850616170724, |
|
"grad_norm": 2.4974186420440674, |
|
"learning_rate": 8.1e-05, |
|
"loss": 1.0476, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 5.710850616170724, |
|
"eval_accuracy": 0.7792994699668719, |
|
"eval_loss": 1.2653881311416626, |
|
"eval_runtime": 108.006, |
|
"eval_samples_per_second": 220.821, |
|
"eval_steps_per_second": 6.907, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 5.861136158701533, |
|
"grad_norm": 2.6549036502838135, |
|
"learning_rate": 8.05e-05, |
|
"loss": 1.0537, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 5.861136158701533, |
|
"eval_accuracy": 0.781079231841748, |
|
"eval_loss": 1.2602005004882812, |
|
"eval_runtime": 107.8534, |
|
"eval_samples_per_second": 221.133, |
|
"eval_steps_per_second": 6.917, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 6.011421701232342, |
|
"grad_norm": 3.047539234161377, |
|
"learning_rate": 8e-05, |
|
"loss": 1.038, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 6.011421701232342, |
|
"eval_accuracy": 0.7802534672325914, |
|
"eval_loss": 1.2705827951431274, |
|
"eval_runtime": 106.7758, |
|
"eval_samples_per_second": 223.365, |
|
"eval_steps_per_second": 6.987, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 6.16170724376315, |
|
"grad_norm": 2.7509360313415527, |
|
"learning_rate": 7.950000000000001e-05, |
|
"loss": 1.0268, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 6.16170724376315, |
|
"eval_accuracy": 0.7803480613075008, |
|
"eval_loss": 1.2649264335632324, |
|
"eval_runtime": 107.1559, |
|
"eval_samples_per_second": 222.573, |
|
"eval_steps_per_second": 6.962, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 6.311992786293959, |
|
"grad_norm": 2.5355842113494873, |
|
"learning_rate": 7.900000000000001e-05, |
|
"loss": 1.0294, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 6.311992786293959, |
|
"eval_accuracy": 0.7818066507198538, |
|
"eval_loss": 1.250462532043457, |
|
"eval_runtime": 115.6436, |
|
"eval_samples_per_second": 206.237, |
|
"eval_steps_per_second": 6.451, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 6.462278328824767, |
|
"grad_norm": 2.9398176670074463, |
|
"learning_rate": 7.850000000000001e-05, |
|
"loss": 1.0209, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 6.462278328824767, |
|
"eval_accuracy": 0.7832408080027191, |
|
"eval_loss": 1.2534795999526978, |
|
"eval_runtime": 107.8449, |
|
"eval_samples_per_second": 221.151, |
|
"eval_steps_per_second": 6.917, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 6.612563871355576, |
|
"grad_norm": 2.7950327396392822, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 1.0268, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 6.612563871355576, |
|
"eval_accuracy": 0.7836778557957977, |
|
"eval_loss": 1.2419434785842896, |
|
"eval_runtime": 107.3738, |
|
"eval_samples_per_second": 222.121, |
|
"eval_steps_per_second": 6.948, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 6.762849413886384, |
|
"grad_norm": 3.0478243827819824, |
|
"learning_rate": 7.75e-05, |
|
"loss": 1.0233, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 6.762849413886384, |
|
"eval_accuracy": 0.7839343784481503, |
|
"eval_loss": 1.244408369064331, |
|
"eval_runtime": 107.5196, |
|
"eval_samples_per_second": 221.82, |
|
"eval_steps_per_second": 6.938, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 6.913134956417193, |
|
"grad_norm": 3.049609661102295, |
|
"learning_rate": 7.7e-05, |
|
"loss": 1.016, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 6.913134956417193, |
|
"eval_accuracy": 0.7844505307770736, |
|
"eval_loss": 1.2420412302017212, |
|
"eval_runtime": 108.256, |
|
"eval_samples_per_second": 220.311, |
|
"eval_steps_per_second": 6.891, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 7.063420498948001, |
|
"grad_norm": 3.2929279804229736, |
|
"learning_rate": 7.65e-05, |
|
"loss": 1.0113, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 7.063420498948001, |
|
"eval_accuracy": 0.7841336670175184, |
|
"eval_loss": 1.2468925714492798, |
|
"eval_runtime": 108.0777, |
|
"eval_samples_per_second": 220.675, |
|
"eval_steps_per_second": 6.902, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 7.21370604147881, |
|
"grad_norm": 2.5201022624969482, |
|
"learning_rate": 7.6e-05, |
|
"loss": 1.01, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 7.21370604147881, |
|
"eval_accuracy": 0.7857022835486672, |
|
"eval_loss": 1.2313475608825684, |
|
"eval_runtime": 108.2646, |
|
"eval_samples_per_second": 220.294, |
|
"eval_steps_per_second": 6.891, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 7.363991584009618, |
|
"grad_norm": 2.926717519760132, |
|
"learning_rate": 7.55e-05, |
|
"loss": 1.0017, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 7.363991584009618, |
|
"eval_accuracy": 0.7857564674613335, |
|
"eval_loss": 1.2349497079849243, |
|
"eval_runtime": 107.7822, |
|
"eval_samples_per_second": 221.28, |
|
"eval_steps_per_second": 6.921, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 7.514277126540427, |
|
"grad_norm": 2.6643712520599365, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.9939, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 7.514277126540427, |
|
"eval_accuracy": 0.7868022880070933, |
|
"eval_loss": 1.2337555885314941, |
|
"eval_runtime": 106.7322, |
|
"eval_samples_per_second": 223.456, |
|
"eval_steps_per_second": 6.989, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 7.664562669071236, |
|
"grad_norm": 2.679358720779419, |
|
"learning_rate": 7.450000000000001e-05, |
|
"loss": 0.9921, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 7.664562669071236, |
|
"eval_accuracy": 0.7861627827490367, |
|
"eval_loss": 1.2326431274414062, |
|
"eval_runtime": 107.5896, |
|
"eval_samples_per_second": 221.676, |
|
"eval_steps_per_second": 6.934, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 7.814848211602044, |
|
"grad_norm": 2.5837836265563965, |
|
"learning_rate": 7.4e-05, |
|
"loss": 0.9973, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 7.814848211602044, |
|
"eval_accuracy": 0.7871569919088683, |
|
"eval_loss": 1.2331918478012085, |
|
"eval_runtime": 107.9543, |
|
"eval_samples_per_second": 220.927, |
|
"eval_steps_per_second": 6.91, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 7.965133754132852, |
|
"grad_norm": 2.536043882369995, |
|
"learning_rate": 7.35e-05, |
|
"loss": 0.9953, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 7.965133754132852, |
|
"eval_accuracy": 0.7891570577004067, |
|
"eval_loss": 1.2163290977478027, |
|
"eval_runtime": 113.9576, |
|
"eval_samples_per_second": 209.288, |
|
"eval_steps_per_second": 6.546, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 8.115419296663662, |
|
"grad_norm": 2.690735340118408, |
|
"learning_rate": 7.3e-05, |
|
"loss": 0.9847, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 8.115419296663662, |
|
"eval_accuracy": 0.7888144688528846, |
|
"eval_loss": 1.209494709968567, |
|
"eval_runtime": 116.0276, |
|
"eval_samples_per_second": 205.555, |
|
"eval_steps_per_second": 6.43, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 8.26570483919447, |
|
"grad_norm": 2.4185330867767334, |
|
"learning_rate": 7.25e-05, |
|
"loss": 0.9797, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 8.26570483919447, |
|
"eval_accuracy": 0.7891209106681478, |
|
"eval_loss": 1.218719482421875, |
|
"eval_runtime": 114.5016, |
|
"eval_samples_per_second": 208.294, |
|
"eval_steps_per_second": 6.515, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 8.415990381725278, |
|
"grad_norm": 2.7199230194091797, |
|
"learning_rate": 7.2e-05, |
|
"loss": 0.9755, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 8.415990381725278, |
|
"eval_accuracy": 0.7887393254933777, |
|
"eval_loss": 1.2271952629089355, |
|
"eval_runtime": 111.4173, |
|
"eval_samples_per_second": 214.06, |
|
"eval_steps_per_second": 6.696, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 8.566275924256086, |
|
"grad_norm": 2.6658599376678467, |
|
"learning_rate": 7.15e-05, |
|
"loss": 0.9749, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 8.566275924256086, |
|
"eval_accuracy": 0.7904619110106559, |
|
"eval_loss": 1.217193603515625, |
|
"eval_runtime": 107.3348, |
|
"eval_samples_per_second": 222.202, |
|
"eval_steps_per_second": 6.95, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 8.716561466786896, |
|
"grad_norm": 2.9954679012298584, |
|
"learning_rate": 7.1e-05, |
|
"loss": 0.9747, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 8.716561466786896, |
|
"eval_accuracy": 0.7904568069124281, |
|
"eval_loss": 1.2054858207702637, |
|
"eval_runtime": 111.9259, |
|
"eval_samples_per_second": 213.087, |
|
"eval_steps_per_second": 6.665, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 8.866847009317704, |
|
"grad_norm": 2.506471872329712, |
|
"learning_rate": 7.05e-05, |
|
"loss": 0.9715, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 8.866847009317704, |
|
"eval_accuracy": 0.790933514493206, |
|
"eval_loss": 1.203903317451477, |
|
"eval_runtime": 106.5362, |
|
"eval_samples_per_second": 223.868, |
|
"eval_steps_per_second": 7.002, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 9.017132551848512, |
|
"grad_norm": 2.67307710647583, |
|
"learning_rate": 7e-05, |
|
"loss": 0.9716, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 9.017132551848512, |
|
"eval_accuracy": 0.7914492771721259, |
|
"eval_loss": 1.2048789262771606, |
|
"eval_runtime": 121.4256, |
|
"eval_samples_per_second": 196.417, |
|
"eval_steps_per_second": 6.144, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 9.16741809437932, |
|
"grad_norm": 2.9317779541015625, |
|
"learning_rate": 6.95e-05, |
|
"loss": 0.962, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 9.16741809437932, |
|
"eval_accuracy": 0.7917621280752731, |
|
"eval_loss": 1.199426293373108, |
|
"eval_runtime": 121.1706, |
|
"eval_samples_per_second": 196.83, |
|
"eval_steps_per_second": 6.157, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 9.31770363691013, |
|
"grad_norm": 2.679703712463379, |
|
"learning_rate": 6.9e-05, |
|
"loss": 0.9642, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 9.31770363691013, |
|
"eval_accuracy": 0.7925397227321643, |
|
"eval_loss": 1.193407654762268, |
|
"eval_runtime": 123.2276, |
|
"eval_samples_per_second": 193.544, |
|
"eval_steps_per_second": 6.054, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 9.467989179440938, |
|
"grad_norm": 2.4788730144500732, |
|
"learning_rate": 6.850000000000001e-05, |
|
"loss": 0.9601, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 9.467989179440938, |
|
"eval_accuracy": 0.792334985338317, |
|
"eval_loss": 1.1934560537338257, |
|
"eval_runtime": 119.6088, |
|
"eval_samples_per_second": 199.4, |
|
"eval_steps_per_second": 6.237, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 9.618274721971746, |
|
"grad_norm": 2.8737170696258545, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 0.9541, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 9.618274721971746, |
|
"eval_accuracy": 0.7933121889138387, |
|
"eval_loss": 1.1921508312225342, |
|
"eval_runtime": 124.8462, |
|
"eval_samples_per_second": 191.035, |
|
"eval_steps_per_second": 5.975, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 9.768560264502554, |
|
"grad_norm": 2.7593533992767334, |
|
"learning_rate": 6.750000000000001e-05, |
|
"loss": 0.9599, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 9.768560264502554, |
|
"eval_accuracy": 0.7930929148774838, |
|
"eval_loss": 1.2042547464370728, |
|
"eval_runtime": 124.9577, |
|
"eval_samples_per_second": 190.865, |
|
"eval_steps_per_second": 5.97, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 9.918845807033364, |
|
"grad_norm": 2.389718532562256, |
|
"learning_rate": 6.7e-05, |
|
"loss": 0.9649, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 9.918845807033364, |
|
"eval_accuracy": 0.7939532145922783, |
|
"eval_loss": 1.1914194822311401, |
|
"eval_runtime": 120.967, |
|
"eval_samples_per_second": 197.161, |
|
"eval_steps_per_second": 6.167, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 10.069131349564172, |
|
"grad_norm": 2.846874475479126, |
|
"learning_rate": 6.65e-05, |
|
"loss": 0.9467, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 10.069131349564172, |
|
"eval_accuracy": 0.7947739980138828, |
|
"eval_loss": 1.1852329969406128, |
|
"eval_runtime": 120.8513, |
|
"eval_samples_per_second": 197.35, |
|
"eval_steps_per_second": 6.173, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 10.21941689209498, |
|
"grad_norm": 2.58475399017334, |
|
"learning_rate": 6.6e-05, |
|
"loss": 0.9406, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 10.21941689209498, |
|
"eval_accuracy": 0.7943515220267853, |
|
"eval_loss": 1.1843186616897583, |
|
"eval_runtime": 121.8305, |
|
"eval_samples_per_second": 195.764, |
|
"eval_steps_per_second": 6.123, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 10.36970243462579, |
|
"grad_norm": 3.003615140914917, |
|
"learning_rate": 6.55e-05, |
|
"loss": 0.9395, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 10.36970243462579, |
|
"eval_accuracy": 0.7948140182397172, |
|
"eval_loss": 1.1831063032150269, |
|
"eval_runtime": 120.2856, |
|
"eval_samples_per_second": 198.278, |
|
"eval_steps_per_second": 6.202, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 10.519987977156598, |
|
"grad_norm": 2.6997032165527344, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 0.9338, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 10.519987977156598, |
|
"eval_accuracy": 0.7955270740039722, |
|
"eval_loss": 1.1888868808746338, |
|
"eval_runtime": 121.3291, |
|
"eval_samples_per_second": 196.573, |
|
"eval_steps_per_second": 6.149, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 10.670273519687406, |
|
"grad_norm": 2.5620720386505127, |
|
"learning_rate": 6.450000000000001e-05, |
|
"loss": 0.9413, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 10.670273519687406, |
|
"eval_accuracy": 0.7960009228127132, |
|
"eval_loss": 1.1771271228790283, |
|
"eval_runtime": 116.3789, |
|
"eval_samples_per_second": 204.934, |
|
"eval_steps_per_second": 6.41, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 10.820559062218214, |
|
"grad_norm": 2.4902310371398926, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 0.941, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 10.820559062218214, |
|
"eval_accuracy": 0.7952424860773146, |
|
"eval_loss": 1.176774263381958, |
|
"eval_runtime": 116.6512, |
|
"eval_samples_per_second": 204.456, |
|
"eval_steps_per_second": 6.395, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 10.970844604749024, |
|
"grad_norm": 2.6512043476104736, |
|
"learning_rate": 6.35e-05, |
|
"loss": 0.9371, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 10.970844604749024, |
|
"eval_accuracy": 0.796440942176153, |
|
"eval_loss": 1.1723679304122925, |
|
"eval_runtime": 119.8979, |
|
"eval_samples_per_second": 198.919, |
|
"eval_steps_per_second": 6.222, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 11.121130147279832, |
|
"grad_norm": 2.6038336753845215, |
|
"learning_rate": 6.3e-05, |
|
"loss": 0.9293, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 11.121130147279832, |
|
"eval_accuracy": 0.7967239525153044, |
|
"eval_loss": 1.1750506162643433, |
|
"eval_runtime": 120.4764, |
|
"eval_samples_per_second": 197.964, |
|
"eval_steps_per_second": 6.192, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 11.27141568981064, |
|
"grad_norm": 2.5120317935943604, |
|
"learning_rate": 6.25e-05, |
|
"loss": 0.9279, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 11.27141568981064, |
|
"eval_accuracy": 0.7974463986990397, |
|
"eval_loss": 1.1721601486206055, |
|
"eval_runtime": 115.5123, |
|
"eval_samples_per_second": 206.471, |
|
"eval_steps_per_second": 6.458, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 11.421701232341448, |
|
"grad_norm": 2.6776065826416016, |
|
"learning_rate": 6.2e-05, |
|
"loss": 0.9273, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 11.421701232341448, |
|
"eval_accuracy": 0.7977723054491466, |
|
"eval_loss": 1.1790024042129517, |
|
"eval_runtime": 118.0413, |
|
"eval_samples_per_second": 202.048, |
|
"eval_steps_per_second": 6.32, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 11.571986774872258, |
|
"grad_norm": 2.473292589187622, |
|
"learning_rate": 6.15e-05, |
|
"loss": 0.9176, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 11.571986774872258, |
|
"eval_accuracy": 0.7978240654330204, |
|
"eval_loss": 1.1769903898239136, |
|
"eval_runtime": 119.5414, |
|
"eval_samples_per_second": 199.512, |
|
"eval_steps_per_second": 6.241, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 11.722272317403066, |
|
"grad_norm": 2.573493242263794, |
|
"learning_rate": 6.1e-05, |
|
"loss": 0.918, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 11.722272317403066, |
|
"eval_accuracy": 0.7978464067774307, |
|
"eval_loss": 1.1745543479919434, |
|
"eval_runtime": 118.8031, |
|
"eval_samples_per_second": 200.752, |
|
"eval_steps_per_second": 6.279, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 11.872557859933874, |
|
"grad_norm": 2.496293067932129, |
|
"learning_rate": 6.05e-05, |
|
"loss": 0.9209, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 11.872557859933874, |
|
"eval_accuracy": 0.7988222222591843, |
|
"eval_loss": 1.1572139263153076, |
|
"eval_runtime": 119.3596, |
|
"eval_samples_per_second": 199.816, |
|
"eval_steps_per_second": 6.25, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 12.022843402464684, |
|
"grad_norm": 2.8805453777313232, |
|
"learning_rate": 6e-05, |
|
"loss": 0.9204, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 12.022843402464684, |
|
"eval_accuracy": 0.799307208811464, |
|
"eval_loss": 1.1740858554840088, |
|
"eval_runtime": 118.6109, |
|
"eval_samples_per_second": 201.078, |
|
"eval_steps_per_second": 6.289, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 12.173128944995492, |
|
"grad_norm": 2.6088273525238037, |
|
"learning_rate": 5.95e-05, |
|
"loss": 0.9149, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 12.173128944995492, |
|
"eval_accuracy": 0.7991013549531508, |
|
"eval_loss": 1.1727755069732666, |
|
"eval_runtime": 117.7954, |
|
"eval_samples_per_second": 202.47, |
|
"eval_steps_per_second": 6.333, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 12.3234144875263, |
|
"grad_norm": 2.426567316055298, |
|
"learning_rate": 5.9e-05, |
|
"loss": 0.9009, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 12.3234144875263, |
|
"eval_accuracy": 0.798481914968918, |
|
"eval_loss": 1.1672251224517822, |
|
"eval_runtime": 119.0757, |
|
"eval_samples_per_second": 200.293, |
|
"eval_steps_per_second": 6.265, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 12.473700030057108, |
|
"grad_norm": 2.687640428543091, |
|
"learning_rate": 5.85e-05, |
|
"loss": 0.9094, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 12.473700030057108, |
|
"eval_accuracy": 0.7998230800562746, |
|
"eval_loss": 1.1598495244979858, |
|
"eval_runtime": 118.9102, |
|
"eval_samples_per_second": 200.572, |
|
"eval_steps_per_second": 6.274, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 12.623985572587918, |
|
"grad_norm": 3.0333776473999023, |
|
"learning_rate": 5.8e-05, |
|
"loss": 0.9149, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 12.623985572587918, |
|
"eval_accuracy": 0.7999625316919439, |
|
"eval_loss": 1.171325445175171, |
|
"eval_runtime": 115.3742, |
|
"eval_samples_per_second": 206.719, |
|
"eval_steps_per_second": 6.466, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 12.774271115118726, |
|
"grad_norm": 2.299436569213867, |
|
"learning_rate": 5.7499999999999995e-05, |
|
"loss": 0.9068, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 12.774271115118726, |
|
"eval_accuracy": 0.8010648350504735, |
|
"eval_loss": 1.1537537574768066, |
|
"eval_runtime": 118.7443, |
|
"eval_samples_per_second": 200.852, |
|
"eval_steps_per_second": 6.282, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 12.924556657649534, |
|
"grad_norm": 2.7340447902679443, |
|
"learning_rate": 5.6999999999999996e-05, |
|
"loss": 0.9099, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 12.924556657649534, |
|
"eval_accuracy": 0.8012578176921267, |
|
"eval_loss": 1.1426852941513062, |
|
"eval_runtime": 120.2664, |
|
"eval_samples_per_second": 198.31, |
|
"eval_steps_per_second": 6.203, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 13.074842200180342, |
|
"grad_norm": 2.6585094928741455, |
|
"learning_rate": 5.65e-05, |
|
"loss": 0.8951, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 13.074842200180342, |
|
"eval_accuracy": 0.8005615361847466, |
|
"eval_loss": 1.1578710079193115, |
|
"eval_runtime": 115.0048, |
|
"eval_samples_per_second": 207.383, |
|
"eval_steps_per_second": 6.487, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 13.225127742711152, |
|
"grad_norm": 2.6981582641601562, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 0.8886, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 13.225127742711152, |
|
"eval_accuracy": 0.8010340860790154, |
|
"eval_loss": 1.1602416038513184, |
|
"eval_runtime": 113.2435, |
|
"eval_samples_per_second": 210.608, |
|
"eval_steps_per_second": 6.588, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 13.37541328524196, |
|
"grad_norm": 2.6016407012939453, |
|
"learning_rate": 5.550000000000001e-05, |
|
"loss": 0.9057, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 13.37541328524196, |
|
"eval_accuracy": 0.8021396084503188, |
|
"eval_loss": 1.1406522989273071, |
|
"eval_runtime": 115.7578, |
|
"eval_samples_per_second": 206.034, |
|
"eval_steps_per_second": 6.444, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 13.525698827772768, |
|
"grad_norm": 2.693239688873291, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 0.8921, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 13.525698827772768, |
|
"eval_accuracy": 0.801952397809383, |
|
"eval_loss": 1.1361913681030273, |
|
"eval_runtime": 112.7804, |
|
"eval_samples_per_second": 211.473, |
|
"eval_steps_per_second": 6.615, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 13.675984370303578, |
|
"grad_norm": 2.386279582977295, |
|
"learning_rate": 5.45e-05, |
|
"loss": 0.8942, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 13.675984370303578, |
|
"eval_accuracy": 0.8024716769653207, |
|
"eval_loss": 1.1441001892089844, |
|
"eval_runtime": 114.7396, |
|
"eval_samples_per_second": 207.862, |
|
"eval_steps_per_second": 6.502, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 13.826269912834386, |
|
"grad_norm": 2.3237531185150146, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 0.8939, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 13.826269912834386, |
|
"eval_accuracy": 0.8025017531571386, |
|
"eval_loss": 1.141733169555664, |
|
"eval_runtime": 112.3505, |
|
"eval_samples_per_second": 212.282, |
|
"eval_steps_per_second": 6.64, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 13.976555455365194, |
|
"grad_norm": 2.463498592376709, |
|
"learning_rate": 5.3500000000000006e-05, |
|
"loss": 0.891, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 13.976555455365194, |
|
"eval_accuracy": 0.803545043091428, |
|
"eval_loss": 1.144685983657837, |
|
"eval_runtime": 114.632, |
|
"eval_samples_per_second": 208.057, |
|
"eval_steps_per_second": 6.508, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 14.126840997896002, |
|
"grad_norm": 2.7393481731414795, |
|
"learning_rate": 5.300000000000001e-05, |
|
"loss": 0.8846, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 14.126840997896002, |
|
"eval_accuracy": 0.8038034340555349, |
|
"eval_loss": 1.1320561170578003, |
|
"eval_runtime": 112.5915, |
|
"eval_samples_per_second": 211.828, |
|
"eval_steps_per_second": 6.626, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 14.277126540426812, |
|
"grad_norm": 2.776505708694458, |
|
"learning_rate": 5.25e-05, |
|
"loss": 0.8841, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 14.277126540426812, |
|
"eval_accuracy": 0.8034705833910766, |
|
"eval_loss": 1.1436700820922852, |
|
"eval_runtime": 114.9721, |
|
"eval_samples_per_second": 207.442, |
|
"eval_steps_per_second": 6.489, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 14.42741208295762, |
|
"grad_norm": 2.536817789077759, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 0.8811, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 14.42741208295762, |
|
"eval_accuracy": 0.8044537392561116, |
|
"eval_loss": 1.133318305015564, |
|
"eval_runtime": 113.5045, |
|
"eval_samples_per_second": 210.124, |
|
"eval_steps_per_second": 6.572, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 14.577697625488428, |
|
"grad_norm": 2.524203062057495, |
|
"learning_rate": 5.1500000000000005e-05, |
|
"loss": 0.8778, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 14.577697625488428, |
|
"eval_accuracy": 0.8039248049976617, |
|
"eval_loss": 1.1258032321929932, |
|
"eval_runtime": 114.9378, |
|
"eval_samples_per_second": 207.504, |
|
"eval_steps_per_second": 6.49, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 14.727983168019236, |
|
"grad_norm": 2.6129727363586426, |
|
"learning_rate": 5.1000000000000006e-05, |
|
"loss": 0.8765, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 14.727983168019236, |
|
"eval_accuracy": 0.8044657041630938, |
|
"eval_loss": 1.1352417469024658, |
|
"eval_runtime": 114.1357, |
|
"eval_samples_per_second": 208.962, |
|
"eval_steps_per_second": 6.536, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 14.878268710550046, |
|
"grad_norm": 2.6218881607055664, |
|
"learning_rate": 5.05e-05, |
|
"loss": 0.876, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 14.878268710550046, |
|
"eval_accuracy": 0.8049223412984352, |
|
"eval_loss": 1.131935954093933, |
|
"eval_runtime": 114.4278, |
|
"eval_samples_per_second": 208.428, |
|
"eval_steps_per_second": 6.519, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 15.028554253080854, |
|
"grad_norm": 2.654724597930908, |
|
"learning_rate": 5e-05, |
|
"loss": 0.879, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 15.028554253080854, |
|
"eval_accuracy": 0.8056327032697777, |
|
"eval_loss": 1.1161094903945923, |
|
"eval_runtime": 112.2273, |
|
"eval_samples_per_second": 212.515, |
|
"eval_steps_per_second": 6.647, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 15.178839795611662, |
|
"grad_norm": 2.569153308868408, |
|
"learning_rate": 4.9500000000000004e-05, |
|
"loss": 0.8674, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 15.178839795611662, |
|
"eval_accuracy": 0.8048139780991767, |
|
"eval_loss": 1.1278635263442993, |
|
"eval_runtime": 114.7277, |
|
"eval_samples_per_second": 207.884, |
|
"eval_steps_per_second": 6.502, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 15.32912533814247, |
|
"grad_norm": 2.5382237434387207, |
|
"learning_rate": 4.9e-05, |
|
"loss": 0.8676, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 15.32912533814247, |
|
"eval_accuracy": 0.8064227363767525, |
|
"eval_loss": 1.132450819015503, |
|
"eval_runtime": 107.2478, |
|
"eval_samples_per_second": 222.382, |
|
"eval_steps_per_second": 6.956, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 15.47941088067328, |
|
"grad_norm": 2.480746030807495, |
|
"learning_rate": 4.85e-05, |
|
"loss": 0.867, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 15.47941088067328, |
|
"eval_accuracy": 0.806509341887431, |
|
"eval_loss": 1.1208624839782715, |
|
"eval_runtime": 107.5739, |
|
"eval_samples_per_second": 221.708, |
|
"eval_steps_per_second": 6.935, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 15.629696423204088, |
|
"grad_norm": 2.4417829513549805, |
|
"learning_rate": 4.8e-05, |
|
"loss": 0.8648, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 15.629696423204088, |
|
"eval_accuracy": 0.80664691560334, |
|
"eval_loss": 1.123421549797058, |
|
"eval_runtime": 107.0763, |
|
"eval_samples_per_second": 222.738, |
|
"eval_steps_per_second": 6.967, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 15.779981965734896, |
|
"grad_norm": 2.3048479557037354, |
|
"learning_rate": 4.75e-05, |
|
"loss": 0.87, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 15.779981965734896, |
|
"eval_accuracy": 0.8067313801594327, |
|
"eval_loss": 1.1155991554260254, |
|
"eval_runtime": 108.1545, |
|
"eval_samples_per_second": 220.518, |
|
"eval_steps_per_second": 6.898, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 15.930267508265704, |
|
"grad_norm": 2.6421682834625244, |
|
"learning_rate": 4.7e-05, |
|
"loss": 0.8694, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 15.930267508265704, |
|
"eval_accuracy": 0.8066291950596075, |
|
"eval_loss": 1.1215757131576538, |
|
"eval_runtime": 109.1629, |
|
"eval_samples_per_second": 218.481, |
|
"eval_steps_per_second": 6.834, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 16.080553050796514, |
|
"grad_norm": 2.4313910007476807, |
|
"learning_rate": 4.6500000000000005e-05, |
|
"loss": 0.8668, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 16.080553050796514, |
|
"eval_accuracy": 0.8079721270128156, |
|
"eval_loss": 1.1246066093444824, |
|
"eval_runtime": 108.472, |
|
"eval_samples_per_second": 219.872, |
|
"eval_steps_per_second": 6.877, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 16.230838593327324, |
|
"grad_norm": 2.2622108459472656, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 0.8603, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 16.230838593327324, |
|
"eval_accuracy": 0.8070964068446762, |
|
"eval_loss": 1.1118344068527222, |
|
"eval_runtime": 107.6522, |
|
"eval_samples_per_second": 221.547, |
|
"eval_steps_per_second": 6.93, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 16.38112413585813, |
|
"grad_norm": 2.355970621109009, |
|
"learning_rate": 4.55e-05, |
|
"loss": 0.8557, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 16.38112413585813, |
|
"eval_accuracy": 0.8079940925101587, |
|
"eval_loss": 1.1261143684387207, |
|
"eval_runtime": 107.1531, |
|
"eval_samples_per_second": 222.579, |
|
"eval_steps_per_second": 6.962, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 16.53140967838894, |
|
"grad_norm": 2.4477593898773193, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.8521, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 16.53140967838894, |
|
"eval_accuracy": 0.8084604086659483, |
|
"eval_loss": 1.1086950302124023, |
|
"eval_runtime": 106.6462, |
|
"eval_samples_per_second": 223.637, |
|
"eval_steps_per_second": 6.995, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 16.681695220919746, |
|
"grad_norm": 3.1719090938568115, |
|
"learning_rate": 4.4500000000000004e-05, |
|
"loss": 0.8583, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 16.681695220919746, |
|
"eval_accuracy": 0.8085530062732024, |
|
"eval_loss": 1.1111265420913696, |
|
"eval_runtime": 107.453, |
|
"eval_samples_per_second": 221.958, |
|
"eval_steps_per_second": 6.943, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 16.831980763450556, |
|
"grad_norm": 2.867218494415283, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 0.8509, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 16.831980763450556, |
|
"eval_accuracy": 0.8083466491211581, |
|
"eval_loss": 1.1217560768127441, |
|
"eval_runtime": 107.5379, |
|
"eval_samples_per_second": 221.782, |
|
"eval_steps_per_second": 6.937, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 16.982266305981366, |
|
"grad_norm": 2.421694040298462, |
|
"learning_rate": 4.35e-05, |
|
"loss": 0.8591, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 16.982266305981366, |
|
"eval_accuracy": 0.808431427546503, |
|
"eval_loss": 1.1109532117843628, |
|
"eval_runtime": 108.5465, |
|
"eval_samples_per_second": 219.721, |
|
"eval_steps_per_second": 6.873, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 17.132551848512172, |
|
"grad_norm": 2.6324167251586914, |
|
"learning_rate": 4.3e-05, |
|
"loss": 0.8417, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 17.132551848512172, |
|
"eval_accuracy": 0.8085933337773042, |
|
"eval_loss": 1.1125506162643433, |
|
"eval_runtime": 108.0877, |
|
"eval_samples_per_second": 220.654, |
|
"eval_steps_per_second": 6.902, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 17.282837391042982, |
|
"grad_norm": 2.448883056640625, |
|
"learning_rate": 4.25e-05, |
|
"loss": 0.8511, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 17.282837391042982, |
|
"eval_accuracy": 0.8093454793356915, |
|
"eval_loss": 1.1108651161193848, |
|
"eval_runtime": 108.0036, |
|
"eval_samples_per_second": 220.826, |
|
"eval_steps_per_second": 6.907, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 17.43312293357379, |
|
"grad_norm": 2.647814989089966, |
|
"learning_rate": 4.2e-05, |
|
"loss": 0.8472, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 17.43312293357379, |
|
"eval_accuracy": 0.8097202877059791, |
|
"eval_loss": 1.1124111413955688, |
|
"eval_runtime": 107.9075, |
|
"eval_samples_per_second": 221.023, |
|
"eval_steps_per_second": 6.913, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 17.583408476104598, |
|
"grad_norm": 2.3744354248046875, |
|
"learning_rate": 4.15e-05, |
|
"loss": 0.8381, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 17.583408476104598, |
|
"eval_accuracy": 0.8095948479389424, |
|
"eval_loss": 1.1056084632873535, |
|
"eval_runtime": 107.6903, |
|
"eval_samples_per_second": 221.468, |
|
"eval_steps_per_second": 6.927, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 17.733694018635408, |
|
"grad_norm": 2.3030834197998047, |
|
"learning_rate": 4.1e-05, |
|
"loss": 0.8474, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 17.733694018635408, |
|
"eval_accuracy": 0.8102527131048097, |
|
"eval_loss": 1.1011704206466675, |
|
"eval_runtime": 107.5969, |
|
"eval_samples_per_second": 221.661, |
|
"eval_steps_per_second": 6.933, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 17.883979561166214, |
|
"grad_norm": 2.610208749771118, |
|
"learning_rate": 4.05e-05, |
|
"loss": 0.8456, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 17.883979561166214, |
|
"eval_accuracy": 0.8108135572608609, |
|
"eval_loss": 1.097122073173523, |
|
"eval_runtime": 106.6046, |
|
"eval_samples_per_second": 223.724, |
|
"eval_steps_per_second": 6.998, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 18.034265103697024, |
|
"grad_norm": 2.491633176803589, |
|
"learning_rate": 4e-05, |
|
"loss": 0.8367, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 18.034265103697024, |
|
"eval_accuracy": 0.8112650064622723, |
|
"eval_loss": 1.0913532972335815, |
|
"eval_runtime": 107.2029, |
|
"eval_samples_per_second": 222.475, |
|
"eval_steps_per_second": 6.959, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 18.184550646227834, |
|
"grad_norm": 2.270582437515259, |
|
"learning_rate": 3.9500000000000005e-05, |
|
"loss": 0.8336, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 18.184550646227834, |
|
"eval_accuracy": 0.8104727212172935, |
|
"eval_loss": 1.110021710395813, |
|
"eval_runtime": 106.0993, |
|
"eval_samples_per_second": 224.789, |
|
"eval_steps_per_second": 7.031, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 18.33483618875864, |
|
"grad_norm": 2.4716830253601074, |
|
"learning_rate": 3.9000000000000006e-05, |
|
"loss": 0.8404, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 18.33483618875864, |
|
"eval_accuracy": 0.8114022782578146, |
|
"eval_loss": 1.0959724187850952, |
|
"eval_runtime": 107.1337, |
|
"eval_samples_per_second": 222.619, |
|
"eval_steps_per_second": 6.963, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 18.48512173128945, |
|
"grad_norm": 2.679825782775879, |
|
"learning_rate": 3.85e-05, |
|
"loss": 0.8365, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 18.48512173128945, |
|
"eval_accuracy": 0.8112698414020383, |
|
"eval_loss": 1.0895620584487915, |
|
"eval_runtime": 107.9425, |
|
"eval_samples_per_second": 220.951, |
|
"eval_steps_per_second": 6.911, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 18.63540727382026, |
|
"grad_norm": 2.5028414726257324, |
|
"learning_rate": 3.8e-05, |
|
"loss": 0.8308, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 18.63540727382026, |
|
"eval_accuracy": 0.8112445219475855, |
|
"eval_loss": 1.098541498184204, |
|
"eval_runtime": 108.1696, |
|
"eval_samples_per_second": 220.487, |
|
"eval_steps_per_second": 6.897, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 18.785692816351066, |
|
"grad_norm": 2.4303503036499023, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 0.8308, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 18.785692816351066, |
|
"eval_accuracy": 0.8120511999201301, |
|
"eval_loss": 1.0968284606933594, |
|
"eval_runtime": 107.5288, |
|
"eval_samples_per_second": 221.801, |
|
"eval_steps_per_second": 6.938, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 18.935978358881876, |
|
"grad_norm": 2.9847395420074463, |
|
"learning_rate": 3.7e-05, |
|
"loss": 0.8312, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 18.935978358881876, |
|
"eval_accuracy": 0.8121808312724796, |
|
"eval_loss": 1.0884159803390503, |
|
"eval_runtime": 107.9223, |
|
"eval_samples_per_second": 220.992, |
|
"eval_steps_per_second": 6.912, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 19.086263901412686, |
|
"grad_norm": 2.302724838256836, |
|
"learning_rate": 3.65e-05, |
|
"loss": 0.8279, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 19.086263901412686, |
|
"eval_accuracy": 0.8127240285256392, |
|
"eval_loss": 1.0900928974151611, |
|
"eval_runtime": 106.9119, |
|
"eval_samples_per_second": 223.081, |
|
"eval_steps_per_second": 6.978, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 19.236549443943492, |
|
"grad_norm": 2.4187684059143066, |
|
"learning_rate": 3.6e-05, |
|
"loss": 0.8219, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 19.236549443943492, |
|
"eval_accuracy": 0.8129403972187366, |
|
"eval_loss": 1.0872172117233276, |
|
"eval_runtime": 109.8336, |
|
"eval_samples_per_second": 217.147, |
|
"eval_steps_per_second": 6.792, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 19.3868349864743, |
|
"grad_norm": 2.740501880645752, |
|
"learning_rate": 3.55e-05, |
|
"loss": 0.8266, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 19.3868349864743, |
|
"eval_accuracy": 0.8114303723800741, |
|
"eval_loss": 1.0987831354141235, |
|
"eval_runtime": 109.7168, |
|
"eval_samples_per_second": 217.378, |
|
"eval_steps_per_second": 6.799, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 19.537120529005108, |
|
"grad_norm": 2.326366424560547, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.8255, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 19.537120529005108, |
|
"eval_accuracy": 0.8124286882301512, |
|
"eval_loss": 1.0885217189788818, |
|
"eval_runtime": 114.778, |
|
"eval_samples_per_second": 207.792, |
|
"eval_steps_per_second": 6.5, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 19.687406071535918, |
|
"grad_norm": 3.1694321632385254, |
|
"learning_rate": 3.45e-05, |
|
"loss": 0.8185, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 19.687406071535918, |
|
"eval_accuracy": 0.8129749253547676, |
|
"eval_loss": 1.0836535692214966, |
|
"eval_runtime": 114.5951, |
|
"eval_samples_per_second": 208.124, |
|
"eval_steps_per_second": 6.51, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 19.837691614066728, |
|
"grad_norm": 2.6415817737579346, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 0.8219, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 19.837691614066728, |
|
"eval_accuracy": 0.8122496169754481, |
|
"eval_loss": 1.0868114233016968, |
|
"eval_runtime": 113.7304, |
|
"eval_samples_per_second": 209.707, |
|
"eval_steps_per_second": 6.559, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 19.987977156597534, |
|
"grad_norm": 2.567044496536255, |
|
"learning_rate": 3.35e-05, |
|
"loss": 0.8214, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 19.987977156597534, |
|
"eval_accuracy": 0.8135328500351103, |
|
"eval_loss": 1.0849970579147339, |
|
"eval_runtime": 114.2096, |
|
"eval_samples_per_second": 208.827, |
|
"eval_steps_per_second": 6.532, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 20.138262699128344, |
|
"grad_norm": 2.475660562515259, |
|
"learning_rate": 3.3e-05, |
|
"loss": 0.8123, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 20.138262699128344, |
|
"eval_accuracy": 0.8127171094527156, |
|
"eval_loss": 1.0919703245162964, |
|
"eval_runtime": 110.4581, |
|
"eval_samples_per_second": 215.919, |
|
"eval_steps_per_second": 6.754, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 20.288548241659154, |
|
"grad_norm": 2.9205057621002197, |
|
"learning_rate": 3.2500000000000004e-05, |
|
"loss": 0.8146, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 20.288548241659154, |
|
"eval_accuracy": 0.8139352319239987, |
|
"eval_loss": 1.0828359127044678, |
|
"eval_runtime": 110.5729, |
|
"eval_samples_per_second": 215.695, |
|
"eval_steps_per_second": 6.747, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 20.43883378418996, |
|
"grad_norm": 2.775470018386841, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 0.8117, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 20.43883378418996, |
|
"eval_accuracy": 0.8144743916913453, |
|
"eval_loss": 1.0843496322631836, |
|
"eval_runtime": 110.9504, |
|
"eval_samples_per_second": 214.961, |
|
"eval_steps_per_second": 6.724, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 20.58911932672077, |
|
"grad_norm": 2.8596460819244385, |
|
"learning_rate": 3.15e-05, |
|
"loss": 0.8142, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 20.58911932672077, |
|
"eval_accuracy": 0.8145867898001746, |
|
"eval_loss": 1.0775424242019653, |
|
"eval_runtime": 110.2042, |
|
"eval_samples_per_second": 216.416, |
|
"eval_steps_per_second": 6.769, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 20.73940486925158, |
|
"grad_norm": 2.5671284198760986, |
|
"learning_rate": 3.1e-05, |
|
"loss": 0.8176, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 20.73940486925158, |
|
"eval_accuracy": 0.8140961234786387, |
|
"eval_loss": 1.0756142139434814, |
|
"eval_runtime": 110.7907, |
|
"eval_samples_per_second": 215.271, |
|
"eval_steps_per_second": 6.733, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 20.889690411782386, |
|
"grad_norm": 2.549713373184204, |
|
"learning_rate": 3.05e-05, |
|
"loss": 0.813, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 20.889690411782386, |
|
"eval_accuracy": 0.8145543791131244, |
|
"eval_loss": 1.0741270780563354, |
|
"eval_runtime": 113.2044, |
|
"eval_samples_per_second": 210.681, |
|
"eval_steps_per_second": 6.59, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 21.039975954313196, |
|
"grad_norm": 2.433366060256958, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8044, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 21.039975954313196, |
|
"eval_accuracy": 0.8138238331972251, |
|
"eval_loss": 1.0890129804611206, |
|
"eval_runtime": 113.5064, |
|
"eval_samples_per_second": 210.12, |
|
"eval_steps_per_second": 6.572, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 21.190261496844002, |
|
"grad_norm": 2.672717571258545, |
|
"learning_rate": 2.95e-05, |
|
"loss": 0.8034, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 21.190261496844002, |
|
"eval_accuracy": 0.8146000723609511, |
|
"eval_loss": 1.0757637023925781, |
|
"eval_runtime": 109.4526, |
|
"eval_samples_per_second": 217.902, |
|
"eval_steps_per_second": 6.816, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 21.340547039374812, |
|
"grad_norm": 2.7107882499694824, |
|
"learning_rate": 2.9e-05, |
|
"loss": 0.8007, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 21.340547039374812, |
|
"eval_accuracy": 0.815981095746543, |
|
"eval_loss": 1.074793815612793, |
|
"eval_runtime": 113.0561, |
|
"eval_samples_per_second": 210.957, |
|
"eval_steps_per_second": 6.598, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 21.49083258190562, |
|
"grad_norm": 2.419224262237549, |
|
"learning_rate": 2.8499999999999998e-05, |
|
"loss": 0.8009, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 21.49083258190562, |
|
"eval_accuracy": 0.8157551774698867, |
|
"eval_loss": 1.0713990926742554, |
|
"eval_runtime": 114.1603, |
|
"eval_samples_per_second": 208.917, |
|
"eval_steps_per_second": 6.535, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 21.641118124436428, |
|
"grad_norm": 2.68849778175354, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 0.8042, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 21.641118124436428, |
|
"eval_accuracy": 0.816240857712766, |
|
"eval_loss": 1.0804829597473145, |
|
"eval_runtime": 113.3941, |
|
"eval_samples_per_second": 210.328, |
|
"eval_steps_per_second": 6.579, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 21.791403666967238, |
|
"grad_norm": 2.3715298175811768, |
|
"learning_rate": 2.7500000000000004e-05, |
|
"loss": 0.7971, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 21.791403666967238, |
|
"eval_accuracy": 0.8154816358162141, |
|
"eval_loss": 1.0725679397583008, |
|
"eval_runtime": 114.0456, |
|
"eval_samples_per_second": 209.127, |
|
"eval_steps_per_second": 6.541, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 21.941689209498048, |
|
"grad_norm": 2.6499125957489014, |
|
"learning_rate": 2.7000000000000002e-05, |
|
"loss": 0.807, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 21.941689209498048, |
|
"eval_accuracy": 0.8168061183380335, |
|
"eval_loss": 1.0588874816894531, |
|
"eval_runtime": 112.5964, |
|
"eval_samples_per_second": 211.819, |
|
"eval_steps_per_second": 6.625, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 22.091974752028854, |
|
"grad_norm": 2.766451358795166, |
|
"learning_rate": 2.6500000000000004e-05, |
|
"loss": 0.7973, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 22.091974752028854, |
|
"eval_accuracy": 0.8168862140868284, |
|
"eval_loss": 1.063796877861023, |
|
"eval_runtime": 113.9911, |
|
"eval_samples_per_second": 209.227, |
|
"eval_steps_per_second": 6.544, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 22.242260294559664, |
|
"grad_norm": 2.5453474521636963, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 0.7915, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 22.242260294559664, |
|
"eval_accuracy": 0.8162707303528417, |
|
"eval_loss": 1.068402886390686, |
|
"eval_runtime": 113.1917, |
|
"eval_samples_per_second": 210.704, |
|
"eval_steps_per_second": 6.591, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 22.392545837090474, |
|
"grad_norm": 2.449525833129883, |
|
"learning_rate": 2.5500000000000003e-05, |
|
"loss": 0.8019, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 22.392545837090474, |
|
"eval_accuracy": 0.8168465033060288, |
|
"eval_loss": 1.071266531944275, |
|
"eval_runtime": 114.1644, |
|
"eval_samples_per_second": 208.909, |
|
"eval_steps_per_second": 6.534, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 22.54283137962128, |
|
"grad_norm": 2.782717704772949, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.7959, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 22.54283137962128, |
|
"eval_accuracy": 0.8176195224595183, |
|
"eval_loss": 1.0642682313919067, |
|
"eval_runtime": 112.938, |
|
"eval_samples_per_second": 211.178, |
|
"eval_steps_per_second": 6.605, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 22.69311692215209, |
|
"grad_norm": 2.754309892654419, |
|
"learning_rate": 2.45e-05, |
|
"loss": 0.7905, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 22.69311692215209, |
|
"eval_accuracy": 0.8177573368082611, |
|
"eval_loss": 1.0715863704681396, |
|
"eval_runtime": 113.2687, |
|
"eval_samples_per_second": 210.561, |
|
"eval_steps_per_second": 6.586, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 22.843402464682896, |
|
"grad_norm": 2.665132999420166, |
|
"learning_rate": 2.4e-05, |
|
"loss": 0.7894, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 22.843402464682896, |
|
"eval_accuracy": 0.8184662117931626, |
|
"eval_loss": 1.0566191673278809, |
|
"eval_runtime": 114.3215, |
|
"eval_samples_per_second": 208.622, |
|
"eval_steps_per_second": 6.525, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 22.993688007213706, |
|
"grad_norm": 2.2912895679473877, |
|
"learning_rate": 2.35e-05, |
|
"loss": 0.789, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 22.993688007213706, |
|
"eval_accuracy": 0.8173126447012466, |
|
"eval_loss": 1.0590641498565674, |
|
"eval_runtime": 114.0949, |
|
"eval_samples_per_second": 209.037, |
|
"eval_steps_per_second": 6.538, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 23.143973549744516, |
|
"grad_norm": 2.7320480346679688, |
|
"learning_rate": 2.3000000000000003e-05, |
|
"loss": 0.7859, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 23.143973549744516, |
|
"eval_accuracy": 0.8181692377590023, |
|
"eval_loss": 1.0568209886550903, |
|
"eval_runtime": 114.0712, |
|
"eval_samples_per_second": 209.08, |
|
"eval_steps_per_second": 6.54, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 23.294259092275322, |
|
"grad_norm": 2.5960936546325684, |
|
"learning_rate": 2.25e-05, |
|
"loss": 0.7894, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 23.294259092275322, |
|
"eval_accuracy": 0.8178769275591534, |
|
"eval_loss": 1.061540961265564, |
|
"eval_runtime": 113.9113, |
|
"eval_samples_per_second": 209.373, |
|
"eval_steps_per_second": 6.549, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 23.44454463480613, |
|
"grad_norm": 2.435558557510376, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 0.7887, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 23.44454463480613, |
|
"eval_accuracy": 0.8177420807085761, |
|
"eval_loss": 1.0617201328277588, |
|
"eval_runtime": 113.4776, |
|
"eval_samples_per_second": 210.174, |
|
"eval_steps_per_second": 6.574, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 23.59483017733694, |
|
"grad_norm": 2.3692948818206787, |
|
"learning_rate": 2.15e-05, |
|
"loss": 0.7826, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 23.59483017733694, |
|
"eval_accuracy": 0.8180053577064287, |
|
"eval_loss": 1.0567620992660522, |
|
"eval_runtime": 113.7057, |
|
"eval_samples_per_second": 209.752, |
|
"eval_steps_per_second": 6.561, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 23.745115719867748, |
|
"grad_norm": 2.5609283447265625, |
|
"learning_rate": 2.1e-05, |
|
"loss": 0.7885, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 23.745115719867748, |
|
"eval_accuracy": 0.8188002648627605, |
|
"eval_loss": 1.0632256269454956, |
|
"eval_runtime": 113.6739, |
|
"eval_samples_per_second": 209.811, |
|
"eval_steps_per_second": 6.563, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 23.895401262398558, |
|
"grad_norm": 2.3372645378112793, |
|
"learning_rate": 2.05e-05, |
|
"loss": 0.7883, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 23.895401262398558, |
|
"eval_accuracy": 0.8187176321623402, |
|
"eval_loss": 1.066706657409668, |
|
"eval_runtime": 114.0886, |
|
"eval_samples_per_second": 209.048, |
|
"eval_steps_per_second": 6.539, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 24.045686804929367, |
|
"grad_norm": 2.3112826347351074, |
|
"learning_rate": 2e-05, |
|
"loss": 0.783, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 24.045686804929367, |
|
"eval_accuracy": 0.818846919349616, |
|
"eval_loss": 1.0611591339111328, |
|
"eval_runtime": 113.1339, |
|
"eval_samples_per_second": 210.812, |
|
"eval_steps_per_second": 6.594, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 24.195972347460174, |
|
"grad_norm": 2.514390230178833, |
|
"learning_rate": 1.9500000000000003e-05, |
|
"loss": 0.7804, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 24.195972347460174, |
|
"eval_accuracy": 0.818812757678464, |
|
"eval_loss": 1.0573077201843262, |
|
"eval_runtime": 114.2005, |
|
"eval_samples_per_second": 208.843, |
|
"eval_steps_per_second": 6.532, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 24.346257889990984, |
|
"grad_norm": 2.4737348556518555, |
|
"learning_rate": 1.9e-05, |
|
"loss": 0.7811, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 24.346257889990984, |
|
"eval_accuracy": 0.8192913294066332, |
|
"eval_loss": 1.0586949586868286, |
|
"eval_runtime": 113.7459, |
|
"eval_samples_per_second": 209.678, |
|
"eval_steps_per_second": 6.558, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 24.49654343252179, |
|
"grad_norm": 2.288757562637329, |
|
"learning_rate": 1.85e-05, |
|
"loss": 0.7767, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 24.49654343252179, |
|
"eval_accuracy": 0.8197264062916556, |
|
"eval_loss": 1.0525128841400146, |
|
"eval_runtime": 113.6132, |
|
"eval_samples_per_second": 209.923, |
|
"eval_steps_per_second": 6.566, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 24.6468289750526, |
|
"grad_norm": 2.4246654510498047, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.7803, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 24.6468289750526, |
|
"eval_accuracy": 0.8195798531764643, |
|
"eval_loss": 1.0466234683990479, |
|
"eval_runtime": 114.7779, |
|
"eval_samples_per_second": 207.793, |
|
"eval_steps_per_second": 6.5, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 24.79711451758341, |
|
"grad_norm": 3.0004007816314697, |
|
"learning_rate": 1.75e-05, |
|
"loss": 0.7688, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 24.79711451758341, |
|
"eval_accuracy": 0.8197989075740256, |
|
"eval_loss": 1.0529950857162476, |
|
"eval_runtime": 113.2072, |
|
"eval_samples_per_second": 210.676, |
|
"eval_steps_per_second": 6.59, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 24.947400060114216, |
|
"grad_norm": 2.476900577545166, |
|
"learning_rate": 1.7000000000000003e-05, |
|
"loss": 0.7734, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 24.947400060114216, |
|
"eval_accuracy": 0.8198411078292097, |
|
"eval_loss": 1.0492240190505981, |
|
"eval_runtime": 114.2362, |
|
"eval_samples_per_second": 208.778, |
|
"eval_steps_per_second": 6.53, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 25.097685602645026, |
|
"grad_norm": 2.6050217151641846, |
|
"learning_rate": 1.65e-05, |
|
"loss": 0.7741, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 25.097685602645026, |
|
"eval_accuracy": 0.8199209388676126, |
|
"eval_loss": 1.0443811416625977, |
|
"eval_runtime": 115.0076, |
|
"eval_samples_per_second": 207.378, |
|
"eval_steps_per_second": 6.487, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 25.247971145175836, |
|
"grad_norm": 2.845093011856079, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.768, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 25.247971145175836, |
|
"eval_accuracy": 0.8203943019525041, |
|
"eval_loss": 1.0556350946426392, |
|
"eval_runtime": 115.581, |
|
"eval_samples_per_second": 206.349, |
|
"eval_steps_per_second": 6.454, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 25.398256687706642, |
|
"grad_norm": 2.8715054988861084, |
|
"learning_rate": 1.55e-05, |
|
"loss": 0.7731, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 25.398256687706642, |
|
"eval_accuracy": 0.8204570484351956, |
|
"eval_loss": 1.0443503856658936, |
|
"eval_runtime": 114.2662, |
|
"eval_samples_per_second": 208.723, |
|
"eval_steps_per_second": 6.529, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 25.54854223023745, |
|
"grad_norm": 2.2550415992736816, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.7675, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 25.54854223023745, |
|
"eval_accuracy": 0.8200469636032075, |
|
"eval_loss": 1.0414886474609375, |
|
"eval_runtime": 114.9948, |
|
"eval_samples_per_second": 207.401, |
|
"eval_steps_per_second": 6.487, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 25.69882777276826, |
|
"grad_norm": 2.238607168197632, |
|
"learning_rate": 1.45e-05, |
|
"loss": 0.7675, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 25.69882777276826, |
|
"eval_accuracy": 0.8207652275282068, |
|
"eval_loss": 1.0444408655166626, |
|
"eval_runtime": 115.1961, |
|
"eval_samples_per_second": 207.038, |
|
"eval_steps_per_second": 6.476, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 25.849113315299068, |
|
"grad_norm": 2.242630958557129, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 0.7655, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 25.849113315299068, |
|
"eval_accuracy": 0.8202594142538011, |
|
"eval_loss": 1.0470616817474365, |
|
"eval_runtime": 116.2229, |
|
"eval_samples_per_second": 205.209, |
|
"eval_steps_per_second": 6.419, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 25.999398857829878, |
|
"grad_norm": 2.5453295707702637, |
|
"learning_rate": 1.3500000000000001e-05, |
|
"loss": 0.7678, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 25.999398857829878, |
|
"eval_accuracy": 0.8200539868874701, |
|
"eval_loss": 1.0508933067321777, |
|
"eval_runtime": 114.7052, |
|
"eval_samples_per_second": 207.924, |
|
"eval_steps_per_second": 6.504, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 26.149684400360684, |
|
"grad_norm": 2.391352415084839, |
|
"learning_rate": 1.3000000000000001e-05, |
|
"loss": 0.7653, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 26.149684400360684, |
|
"eval_accuracy": 0.8205590231388606, |
|
"eval_loss": 1.0484099388122559, |
|
"eval_runtime": 115.0871, |
|
"eval_samples_per_second": 207.234, |
|
"eval_steps_per_second": 6.482, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 26.299969942891494, |
|
"grad_norm": 2.5071208477020264, |
|
"learning_rate": 1.25e-05, |
|
"loss": 0.7679, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 26.299969942891494, |
|
"eval_accuracy": 0.8217573625905127, |
|
"eval_loss": 1.0400173664093018, |
|
"eval_runtime": 113.1376, |
|
"eval_samples_per_second": 210.805, |
|
"eval_steps_per_second": 6.594, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 26.450255485422304, |
|
"grad_norm": 2.5453133583068848, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.7656, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 26.450255485422304, |
|
"eval_accuracy": 0.8213452575807276, |
|
"eval_loss": 1.0516611337661743, |
|
"eval_runtime": 113.3943, |
|
"eval_samples_per_second": 210.328, |
|
"eval_steps_per_second": 6.579, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 26.60054102795311, |
|
"grad_norm": 2.509098529815674, |
|
"learning_rate": 1.1500000000000002e-05, |
|
"loss": 0.7593, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 26.60054102795311, |
|
"eval_accuracy": 0.8213436048487595, |
|
"eval_loss": 1.0389631986618042, |
|
"eval_runtime": 116.4627, |
|
"eval_samples_per_second": 204.787, |
|
"eval_steps_per_second": 6.405, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 26.75082657048392, |
|
"grad_norm": 2.835665464401245, |
|
"learning_rate": 1.1000000000000001e-05, |
|
"loss": 0.7606, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 26.75082657048392, |
|
"eval_accuracy": 0.8212840552795833, |
|
"eval_loss": 1.0403192043304443, |
|
"eval_runtime": 114.9351, |
|
"eval_samples_per_second": 207.508, |
|
"eval_steps_per_second": 6.491, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 26.90111211301473, |
|
"grad_norm": 2.5455708503723145, |
|
"learning_rate": 1.05e-05, |
|
"loss": 0.7578, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 26.90111211301473, |
|
"eval_accuracy": 0.821311120726958, |
|
"eval_loss": 1.036601185798645, |
|
"eval_runtime": 114.3227, |
|
"eval_samples_per_second": 208.62, |
|
"eval_steps_per_second": 6.525, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 27.051397655545536, |
|
"grad_norm": 2.821378231048584, |
|
"learning_rate": 1e-05, |
|
"loss": 0.765, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 27.051397655545536, |
|
"eval_accuracy": 0.8221304808698725, |
|
"eval_loss": 1.0333795547485352, |
|
"eval_runtime": 115.4024, |
|
"eval_samples_per_second": 206.668, |
|
"eval_steps_per_second": 6.464, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 27.201683198076346, |
|
"grad_norm": 2.59919810295105, |
|
"learning_rate": 9.5e-06, |
|
"loss": 0.7589, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 27.201683198076346, |
|
"eval_accuracy": 0.8222628454875403, |
|
"eval_loss": 1.033319354057312, |
|
"eval_runtime": 115.8055, |
|
"eval_samples_per_second": 205.949, |
|
"eval_steps_per_second": 6.442, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 27.351968740607152, |
|
"grad_norm": 2.601203203201294, |
|
"learning_rate": 9e-06, |
|
"loss": 0.7563, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 27.351968740607152, |
|
"eval_accuracy": 0.8218045068983223, |
|
"eval_loss": 1.046338677406311, |
|
"eval_runtime": 116.2837, |
|
"eval_samples_per_second": 205.102, |
|
"eval_steps_per_second": 6.415, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 27.50225428313796, |
|
"grad_norm": 2.8450610637664795, |
|
"learning_rate": 8.500000000000002e-06, |
|
"loss": 0.7559, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 27.50225428313796, |
|
"eval_accuracy": 0.8218716317162797, |
|
"eval_loss": 1.044384241104126, |
|
"eval_runtime": 115.6484, |
|
"eval_samples_per_second": 206.229, |
|
"eval_steps_per_second": 6.451, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 27.65253982566877, |
|
"grad_norm": 2.7283740043640137, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.7586, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 27.65253982566877, |
|
"eval_accuracy": 0.8221257250457026, |
|
"eval_loss": 1.0296134948730469, |
|
"eval_runtime": 115.214, |
|
"eval_samples_per_second": 207.006, |
|
"eval_steps_per_second": 6.475, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 27.802825368199578, |
|
"grad_norm": 2.795022964477539, |
|
"learning_rate": 7.5e-06, |
|
"loss": 0.7585, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 27.802825368199578, |
|
"eval_accuracy": 0.82162242799983, |
|
"eval_loss": 1.045117735862732, |
|
"eval_runtime": 113.2778, |
|
"eval_samples_per_second": 210.544, |
|
"eval_steps_per_second": 6.586, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 27.953110910730388, |
|
"grad_norm": 2.52536678314209, |
|
"learning_rate": 7.000000000000001e-06, |
|
"loss": 0.7548, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 27.953110910730388, |
|
"eval_accuracy": 0.8225299354468154, |
|
"eval_loss": 1.0380265712738037, |
|
"eval_runtime": 113.5711, |
|
"eval_samples_per_second": 210.001, |
|
"eval_steps_per_second": 6.569, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 28.103396453261198, |
|
"grad_norm": 2.3631057739257812, |
|
"learning_rate": 6.5000000000000004e-06, |
|
"loss": 0.7542, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 28.103396453261198, |
|
"eval_accuracy": 0.8229969642015583, |
|
"eval_loss": 1.0490919351577759, |
|
"eval_runtime": 112.4838, |
|
"eval_samples_per_second": 212.031, |
|
"eval_steps_per_second": 6.632, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 28.253681995792004, |
|
"grad_norm": 2.2699172496795654, |
|
"learning_rate": 6e-06, |
|
"loss": 0.7522, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 28.253681995792004, |
|
"eval_accuracy": 0.8230864296098176, |
|
"eval_loss": 1.0274651050567627, |
|
"eval_runtime": 111.8716, |
|
"eval_samples_per_second": 213.191, |
|
"eval_steps_per_second": 6.668, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 28.403967538322814, |
|
"grad_norm": 2.44769287109375, |
|
"learning_rate": 5.500000000000001e-06, |
|
"loss": 0.7569, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 28.403967538322814, |
|
"eval_accuracy": 0.8234413637256175, |
|
"eval_loss": 1.0298686027526855, |
|
"eval_runtime": 113.5377, |
|
"eval_samples_per_second": 210.062, |
|
"eval_steps_per_second": 6.571, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 28.554253080853623, |
|
"grad_norm": 2.7103466987609863, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7536, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 28.554253080853623, |
|
"eval_accuracy": 0.8224712064035926, |
|
"eval_loss": 1.034175992012024, |
|
"eval_runtime": 113.1535, |
|
"eval_samples_per_second": 210.776, |
|
"eval_steps_per_second": 6.593, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 28.70453862338443, |
|
"grad_norm": 2.594381809234619, |
|
"learning_rate": 4.5e-06, |
|
"loss": 0.7547, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 28.70453862338443, |
|
"eval_accuracy": 0.8231576514254546, |
|
"eval_loss": 1.0324558019638062, |
|
"eval_runtime": 111.2443, |
|
"eval_samples_per_second": 214.393, |
|
"eval_steps_per_second": 6.706, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 28.85482416591524, |
|
"grad_norm": 2.623845100402832, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.7529, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 28.85482416591524, |
|
"eval_accuracy": 0.8241724336625226, |
|
"eval_loss": 1.0312702655792236, |
|
"eval_runtime": 115.266, |
|
"eval_samples_per_second": 206.913, |
|
"eval_steps_per_second": 6.472, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 29.005109708446046, |
|
"grad_norm": 2.706256628036499, |
|
"learning_rate": 3.5000000000000004e-06, |
|
"loss": 0.7461, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 29.005109708446046, |
|
"eval_accuracy": 0.8230462104005479, |
|
"eval_loss": 1.0389125347137451, |
|
"eval_runtime": 115.814, |
|
"eval_samples_per_second": 205.934, |
|
"eval_steps_per_second": 6.441, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 29.155395250976856, |
|
"grad_norm": 2.697993278503418, |
|
"learning_rate": 3e-06, |
|
"loss": 0.7513, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 29.155395250976856, |
|
"eval_accuracy": 0.8236184494498082, |
|
"eval_loss": 1.036423921585083, |
|
"eval_runtime": 115.3283, |
|
"eval_samples_per_second": 206.801, |
|
"eval_steps_per_second": 6.468, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 29.305680793507666, |
|
"grad_norm": 2.68867826461792, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.7494, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 29.305680793507666, |
|
"eval_accuracy": 0.82393844898435, |
|
"eval_loss": 1.0242578983306885, |
|
"eval_runtime": 116.3698, |
|
"eval_samples_per_second": 204.95, |
|
"eval_steps_per_second": 6.411, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 29.455966336038472, |
|
"grad_norm": 2.259347915649414, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.7438, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 29.455966336038472, |
|
"eval_accuracy": 0.8236859969210806, |
|
"eval_loss": 1.0227982997894287, |
|
"eval_runtime": 115.3714, |
|
"eval_samples_per_second": 206.724, |
|
"eval_steps_per_second": 6.466, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 29.60625187856928, |
|
"grad_norm": 2.8278329372406006, |
|
"learning_rate": 1.5e-06, |
|
"loss": 0.7499, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 29.60625187856928, |
|
"eval_accuracy": 0.8236034478162941, |
|
"eval_loss": 1.022267460823059, |
|
"eval_runtime": 114.5023, |
|
"eval_samples_per_second": 208.293, |
|
"eval_steps_per_second": 6.515, |
|
"step": 98500 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 31, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.31746763541971e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|