{ "best_metric": 1.022267460823059, "best_model_checkpoint": "./model_fine-tune/glot/xlm-r/ben-Beng/checkpoint-98500", "epoch": 29.60625187856928, "eval_steps": 500, "global_step": 98500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15028554253080853, "grad_norm": 4.279157638549805, "learning_rate": 9.95e-05, "loss": 1.5206, "step": 500 }, { "epoch": 0.15028554253080853, "eval_accuracy": 0.7123080245508447, "eval_loss": 1.6415441036224365, "eval_runtime": 108.0973, "eval_samples_per_second": 220.635, "eval_steps_per_second": 6.901, "step": 500 }, { "epoch": 0.30057108506161706, "grad_norm": 3.948293685913086, "learning_rate": 9.900000000000001e-05, "loss": 1.4505, "step": 1000 }, { "epoch": 0.30057108506161706, "eval_accuracy": 0.7195765918792126, "eval_loss": 1.6040955781936646, "eval_runtime": 108.1918, "eval_samples_per_second": 220.442, "eval_steps_per_second": 6.895, "step": 1000 }, { "epoch": 0.4508566275924256, "grad_norm": 3.838204860687256, "learning_rate": 9.850000000000001e-05, "loss": 1.4092, "step": 1500 }, { "epoch": 0.4508566275924256, "eval_accuracy": 0.7255862618332372, "eval_loss": 1.5711835622787476, "eval_runtime": 105.314, "eval_samples_per_second": 226.466, "eval_steps_per_second": 7.084, "step": 1500 }, { "epoch": 0.6011421701232341, "grad_norm": 3.7270383834838867, "learning_rate": 9.8e-05, "loss": 1.3738, "step": 2000 }, { "epoch": 0.6011421701232341, "eval_accuracy": 0.7298732586939131, "eval_loss": 1.562820315361023, "eval_runtime": 104.3524, "eval_samples_per_second": 228.552, "eval_steps_per_second": 7.149, "step": 2000 }, { "epoch": 0.7514277126540427, "grad_norm": 3.720855474472046, "learning_rate": 9.75e-05, "loss": 1.3447, "step": 2500 }, { "epoch": 0.7514277126540427, "eval_accuracy": 0.7334076213509759, "eval_loss": 1.533949613571167, "eval_runtime": 105.6055, "eval_samples_per_second": 225.84, "eval_steps_per_second": 7.064, "step": 2500 }, { "epoch": 0.9017132551848512, "grad_norm": 3.9192264080047607, "learning_rate": 9.7e-05, "loss": 1.3215, "step": 3000 }, { "epoch": 0.9017132551848512, "eval_accuracy": 0.737443643353561, "eval_loss": 1.5036447048187256, "eval_runtime": 103.4988, "eval_samples_per_second": 230.437, "eval_steps_per_second": 7.208, "step": 3000 }, { "epoch": 1.0519987977156597, "grad_norm": 3.8544235229492188, "learning_rate": 9.65e-05, "loss": 1.3074, "step": 3500 }, { "epoch": 1.0519987977156597, "eval_accuracy": 0.7401312504576956, "eval_loss": 1.4803341627120972, "eval_runtime": 104.2256, "eval_samples_per_second": 228.831, "eval_steps_per_second": 7.158, "step": 3500 }, { "epoch": 1.2022843402464682, "grad_norm": 3.2664737701416016, "learning_rate": 9.6e-05, "loss": 1.2826, "step": 4000 }, { "epoch": 1.2022843402464682, "eval_accuracy": 0.7433592201310665, "eval_loss": 1.475807785987854, "eval_runtime": 104.5255, "eval_samples_per_second": 228.174, "eval_steps_per_second": 7.137, "step": 4000 }, { "epoch": 1.3525698827772767, "grad_norm": 3.448021650314331, "learning_rate": 9.55e-05, "loss": 1.2676, "step": 4500 }, { "epoch": 1.3525698827772767, "eval_accuracy": 0.7454010087654052, "eval_loss": 1.467736005783081, "eval_runtime": 106.1685, "eval_samples_per_second": 224.643, "eval_steps_per_second": 7.027, "step": 4500 }, { "epoch": 1.5028554253080855, "grad_norm": 3.6344072818756104, "learning_rate": 9.5e-05, "loss": 1.2462, "step": 5000 }, { "epoch": 1.5028554253080855, "eval_accuracy": 0.7480509601003421, "eval_loss": 1.454475998878479, "eval_runtime": 107.2463, "eval_samples_per_second": 222.385, "eval_steps_per_second": 6.956, "step": 5000 }, { "epoch": 1.653140967838894, "grad_norm": 3.191260576248169, "learning_rate": 9.449999999999999e-05, "loss": 1.2353, "step": 5500 }, { "epoch": 1.653140967838894, "eval_accuracy": 0.7502682619699697, "eval_loss": 1.4226497411727905, "eval_runtime": 107.1587, "eval_samples_per_second": 222.567, "eval_steps_per_second": 6.962, "step": 5500 }, { "epoch": 1.8034265103697025, "grad_norm": 3.074147939682007, "learning_rate": 9.4e-05, "loss": 1.2227, "step": 6000 }, { "epoch": 1.8034265103697025, "eval_accuracy": 0.7523797476360646, "eval_loss": 1.4185426235198975, "eval_runtime": 107.8236, "eval_samples_per_second": 221.195, "eval_steps_per_second": 6.919, "step": 6000 }, { "epoch": 1.953712052900511, "grad_norm": 3.0745174884796143, "learning_rate": 9.350000000000001e-05, "loss": 1.2179, "step": 6500 }, { "epoch": 1.953712052900511, "eval_accuracy": 0.7525899025066154, "eval_loss": 1.4250199794769287, "eval_runtime": 103.869, "eval_samples_per_second": 229.616, "eval_steps_per_second": 7.182, "step": 6500 }, { "epoch": 2.1039975954313195, "grad_norm": 3.809814453125, "learning_rate": 9.300000000000001e-05, "loss": 1.1973, "step": 7000 }, { "epoch": 2.1039975954313195, "eval_accuracy": 0.7544079430376059, "eval_loss": 1.4045050144195557, "eval_runtime": 103.9514, "eval_samples_per_second": 229.434, "eval_steps_per_second": 7.176, "step": 7000 }, { "epoch": 2.254283137962128, "grad_norm": 3.262021541595459, "learning_rate": 9.250000000000001e-05, "loss": 1.1854, "step": 7500 }, { "epoch": 2.254283137962128, "eval_accuracy": 0.7566596229680038, "eval_loss": 1.4005974531173706, "eval_runtime": 103.4306, "eval_samples_per_second": 230.59, "eval_steps_per_second": 7.213, "step": 7500 }, { "epoch": 2.4045686804929365, "grad_norm": 3.562636613845825, "learning_rate": 9.200000000000001e-05, "loss": 1.1894, "step": 8000 }, { "epoch": 2.4045686804929365, "eval_accuracy": 0.7518658362592376, "eval_loss": 1.4261118173599243, "eval_runtime": 107.7368, "eval_samples_per_second": 221.373, "eval_steps_per_second": 6.924, "step": 8000 }, { "epoch": 2.554854223023745, "grad_norm": 3.0819168090820312, "learning_rate": 9.15e-05, "loss": 1.1821, "step": 8500 }, { "epoch": 2.554854223023745, "eval_accuracy": 0.759439375239073, "eval_loss": 1.3829759359359741, "eval_runtime": 108.9119, "eval_samples_per_second": 218.984, "eval_steps_per_second": 6.85, "step": 8500 }, { "epoch": 2.7051397655545535, "grad_norm": 3.441218852996826, "learning_rate": 9.1e-05, "loss": 1.1617, "step": 9000 }, { "epoch": 2.7051397655545535, "eval_accuracy": 0.7595643512285075, "eval_loss": 1.38172447681427, "eval_runtime": 107.6971, "eval_samples_per_second": 221.454, "eval_steps_per_second": 6.927, "step": 9000 }, { "epoch": 2.855425308085362, "grad_norm": 2.9258110523223877, "learning_rate": 9.05e-05, "loss": 1.1647, "step": 9500 }, { "epoch": 2.855425308085362, "eval_accuracy": 0.7622641760596893, "eval_loss": 1.3564621210098267, "eval_runtime": 108.8704, "eval_samples_per_second": 219.068, "eval_steps_per_second": 6.852, "step": 9500 }, { "epoch": 3.005710850616171, "grad_norm": 3.097913980484009, "learning_rate": 9e-05, "loss": 1.1543, "step": 10000 }, { "epoch": 3.005710850616171, "eval_accuracy": 0.7637979723391941, "eval_loss": 1.3542959690093994, "eval_runtime": 108.5802, "eval_samples_per_second": 219.653, "eval_steps_per_second": 6.87, "step": 10000 }, { "epoch": 3.1559963931469794, "grad_norm": 3.168121099472046, "learning_rate": 8.950000000000001e-05, "loss": 1.1379, "step": 10500 }, { "epoch": 3.1559963931469794, "eval_accuracy": 0.76408780640708, "eval_loss": 1.3504241704940796, "eval_runtime": 108.2586, "eval_samples_per_second": 220.306, "eval_steps_per_second": 6.891, "step": 10500 }, { "epoch": 3.306281935677788, "grad_norm": 3.410958766937256, "learning_rate": 8.900000000000001e-05, "loss": 1.1254, "step": 11000 }, { "epoch": 3.306281935677788, "eval_accuracy": 0.7652592815735387, "eval_loss": 1.346474528312683, "eval_runtime": 107.3669, "eval_samples_per_second": 222.136, "eval_steps_per_second": 6.948, "step": 11000 }, { "epoch": 3.4565674782085964, "grad_norm": 2.775892734527588, "learning_rate": 8.850000000000001e-05, "loss": 1.1269, "step": 11500 }, { "epoch": 3.4565674782085964, "eval_accuracy": 0.7657911530106338, "eval_loss": 1.3591572046279907, "eval_runtime": 106.844, "eval_samples_per_second": 223.223, "eval_steps_per_second": 6.982, "step": 11500 }, { "epoch": 3.606853020739405, "grad_norm": 3.029595375061035, "learning_rate": 8.800000000000001e-05, "loss": 1.1247, "step": 12000 }, { "epoch": 3.606853020739405, "eval_accuracy": 0.767305872889515, "eval_loss": 1.3377037048339844, "eval_runtime": 108.9561, "eval_samples_per_second": 218.896, "eval_steps_per_second": 6.847, "step": 12000 }, { "epoch": 3.7571385632702134, "grad_norm": 3.1967430114746094, "learning_rate": 8.75e-05, "loss": 1.1123, "step": 12500 }, { "epoch": 3.7571385632702134, "eval_accuracy": 0.7687083850392965, "eval_loss": 1.340959906578064, "eval_runtime": 107.8508, "eval_samples_per_second": 221.139, "eval_steps_per_second": 6.917, "step": 12500 }, { "epoch": 3.907424105801022, "grad_norm": 2.859966993331909, "learning_rate": 8.7e-05, "loss": 1.112, "step": 13000 }, { "epoch": 3.907424105801022, "eval_accuracy": 0.7696794726604136, "eval_loss": 1.3237755298614502, "eval_runtime": 107.6845, "eval_samples_per_second": 221.48, "eval_steps_per_second": 6.928, "step": 13000 }, { "epoch": 4.057709648331831, "grad_norm": 2.8421764373779297, "learning_rate": 8.65e-05, "loss": 1.0983, "step": 13500 }, { "epoch": 4.057709648331831, "eval_accuracy": 0.7714031663702743, "eval_loss": 1.3144177198410034, "eval_runtime": 108.7428, "eval_samples_per_second": 219.325, "eval_steps_per_second": 6.86, "step": 13500 }, { "epoch": 4.207995190862639, "grad_norm": 3.1582884788513184, "learning_rate": 8.6e-05, "loss": 1.0918, "step": 14000 }, { "epoch": 4.207995190862639, "eval_accuracy": 0.7718766206792639, "eval_loss": 1.3221428394317627, "eval_runtime": 108.1645, "eval_samples_per_second": 220.498, "eval_steps_per_second": 6.897, "step": 14000 }, { "epoch": 4.358280733393448, "grad_norm": 2.631855010986328, "learning_rate": 8.55e-05, "loss": 1.0884, "step": 14500 }, { "epoch": 4.358280733393448, "eval_accuracy": 0.772324790492508, "eval_loss": 1.3152235746383667, "eval_runtime": 107.5113, "eval_samples_per_second": 221.837, "eval_steps_per_second": 6.939, "step": 14500 }, { "epoch": 4.508566275924256, "grad_norm": 3.242208480834961, "learning_rate": 8.5e-05, "loss": 1.0837, "step": 15000 }, { "epoch": 4.508566275924256, "eval_accuracy": 0.7737920955341676, "eval_loss": 1.2969176769256592, "eval_runtime": 107.6972, "eval_samples_per_second": 221.454, "eval_steps_per_second": 6.927, "step": 15000 }, { "epoch": 4.658851818455065, "grad_norm": 3.0691699981689453, "learning_rate": 8.450000000000001e-05, "loss": 1.0796, "step": 15500 }, { "epoch": 4.658851818455065, "eval_accuracy": 0.7738074322270785, "eval_loss": 1.3085857629776, "eval_runtime": 106.2258, "eval_samples_per_second": 224.522, "eval_steps_per_second": 7.023, "step": 15500 }, { "epoch": 4.809137360985873, "grad_norm": 3.258615732192993, "learning_rate": 8.4e-05, "loss": 1.0714, "step": 16000 }, { "epoch": 4.809137360985873, "eval_accuracy": 0.775101051869418, "eval_loss": 1.2939372062683105, "eval_runtime": 106.8715, "eval_samples_per_second": 223.165, "eval_steps_per_second": 6.98, "step": 16000 }, { "epoch": 4.959422903516682, "grad_norm": 3.1920101642608643, "learning_rate": 8.35e-05, "loss": 1.071, "step": 16500 }, { "epoch": 4.959422903516682, "eval_accuracy": 0.7753841079350378, "eval_loss": 1.2854809761047363, "eval_runtime": 106.6266, "eval_samples_per_second": 223.678, "eval_steps_per_second": 6.996, "step": 16500 }, { "epoch": 5.10970844604749, "grad_norm": 3.2814955711364746, "learning_rate": 8.3e-05, "loss": 1.0568, "step": 17000 }, { "epoch": 5.10970844604749, "eval_accuracy": 0.7761810007493132, "eval_loss": 1.2916713953018188, "eval_runtime": 107.4768, "eval_samples_per_second": 221.908, "eval_steps_per_second": 6.941, "step": 17000 }, { "epoch": 5.259993988578299, "grad_norm": 3.2327558994293213, "learning_rate": 8.25e-05, "loss": 1.0549, "step": 17500 }, { "epoch": 5.259993988578299, "eval_accuracy": 0.7780586564617779, "eval_loss": 1.2857751846313477, "eval_runtime": 107.761, "eval_samples_per_second": 221.323, "eval_steps_per_second": 6.923, "step": 17500 }, { "epoch": 5.410279531109107, "grad_norm": 3.0951426029205322, "learning_rate": 8.2e-05, "loss": 1.0511, "step": 18000 }, { "epoch": 5.410279531109107, "eval_accuracy": 0.7783227014884583, "eval_loss": 1.2948368787765503, "eval_runtime": 108.2092, "eval_samples_per_second": 220.406, "eval_steps_per_second": 6.894, "step": 18000 }, { "epoch": 5.560565073639916, "grad_norm": 2.639129638671875, "learning_rate": 8.15e-05, "loss": 1.0492, "step": 18500 }, { "epoch": 5.560565073639916, "eval_accuracy": 0.7788534810933934, "eval_loss": 1.2669661045074463, "eval_runtime": 108.4058, "eval_samples_per_second": 220.007, "eval_steps_per_second": 6.882, "step": 18500 }, { "epoch": 5.710850616170724, "grad_norm": 2.4974186420440674, "learning_rate": 8.1e-05, "loss": 1.0476, "step": 19000 }, { "epoch": 5.710850616170724, "eval_accuracy": 0.7792994699668719, "eval_loss": 1.2653881311416626, "eval_runtime": 108.006, "eval_samples_per_second": 220.821, "eval_steps_per_second": 6.907, "step": 19000 }, { "epoch": 5.861136158701533, "grad_norm": 2.6549036502838135, "learning_rate": 8.05e-05, "loss": 1.0537, "step": 19500 }, { "epoch": 5.861136158701533, "eval_accuracy": 0.781079231841748, "eval_loss": 1.2602005004882812, "eval_runtime": 107.8534, "eval_samples_per_second": 221.133, "eval_steps_per_second": 6.917, "step": 19500 }, { "epoch": 6.011421701232342, "grad_norm": 3.047539234161377, "learning_rate": 8e-05, "loss": 1.038, "step": 20000 }, { "epoch": 6.011421701232342, "eval_accuracy": 0.7802534672325914, "eval_loss": 1.2705827951431274, "eval_runtime": 106.7758, "eval_samples_per_second": 223.365, "eval_steps_per_second": 6.987, "step": 20000 }, { "epoch": 6.16170724376315, "grad_norm": 2.7509360313415527, "learning_rate": 7.950000000000001e-05, "loss": 1.0268, "step": 20500 }, { "epoch": 6.16170724376315, "eval_accuracy": 0.7803480613075008, "eval_loss": 1.2649264335632324, "eval_runtime": 107.1559, "eval_samples_per_second": 222.573, "eval_steps_per_second": 6.962, "step": 20500 }, { "epoch": 6.311992786293959, "grad_norm": 2.5355842113494873, "learning_rate": 7.900000000000001e-05, "loss": 1.0294, "step": 21000 }, { "epoch": 6.311992786293959, "eval_accuracy": 0.7818066507198538, "eval_loss": 1.250462532043457, "eval_runtime": 115.6436, "eval_samples_per_second": 206.237, "eval_steps_per_second": 6.451, "step": 21000 }, { "epoch": 6.462278328824767, "grad_norm": 2.9398176670074463, "learning_rate": 7.850000000000001e-05, "loss": 1.0209, "step": 21500 }, { "epoch": 6.462278328824767, "eval_accuracy": 0.7832408080027191, "eval_loss": 1.2534795999526978, "eval_runtime": 107.8449, "eval_samples_per_second": 221.151, "eval_steps_per_second": 6.917, "step": 21500 }, { "epoch": 6.612563871355576, "grad_norm": 2.7950327396392822, "learning_rate": 7.800000000000001e-05, "loss": 1.0268, "step": 22000 }, { "epoch": 6.612563871355576, "eval_accuracy": 0.7836778557957977, "eval_loss": 1.2419434785842896, "eval_runtime": 107.3738, "eval_samples_per_second": 222.121, "eval_steps_per_second": 6.948, "step": 22000 }, { "epoch": 6.762849413886384, "grad_norm": 3.0478243827819824, "learning_rate": 7.75e-05, "loss": 1.0233, "step": 22500 }, { "epoch": 6.762849413886384, "eval_accuracy": 0.7839343784481503, "eval_loss": 1.244408369064331, "eval_runtime": 107.5196, "eval_samples_per_second": 221.82, "eval_steps_per_second": 6.938, "step": 22500 }, { "epoch": 6.913134956417193, "grad_norm": 3.049609661102295, "learning_rate": 7.7e-05, "loss": 1.016, "step": 23000 }, { "epoch": 6.913134956417193, "eval_accuracy": 0.7844505307770736, "eval_loss": 1.2420412302017212, "eval_runtime": 108.256, "eval_samples_per_second": 220.311, "eval_steps_per_second": 6.891, "step": 23000 }, { "epoch": 7.063420498948001, "grad_norm": 3.2929279804229736, "learning_rate": 7.65e-05, "loss": 1.0113, "step": 23500 }, { "epoch": 7.063420498948001, "eval_accuracy": 0.7841336670175184, "eval_loss": 1.2468925714492798, "eval_runtime": 108.0777, "eval_samples_per_second": 220.675, "eval_steps_per_second": 6.902, "step": 23500 }, { "epoch": 7.21370604147881, "grad_norm": 2.5201022624969482, "learning_rate": 7.6e-05, "loss": 1.01, "step": 24000 }, { "epoch": 7.21370604147881, "eval_accuracy": 0.7857022835486672, "eval_loss": 1.2313475608825684, "eval_runtime": 108.2646, "eval_samples_per_second": 220.294, "eval_steps_per_second": 6.891, "step": 24000 }, { "epoch": 7.363991584009618, "grad_norm": 2.926717519760132, "learning_rate": 7.55e-05, "loss": 1.0017, "step": 24500 }, { "epoch": 7.363991584009618, "eval_accuracy": 0.7857564674613335, "eval_loss": 1.2349497079849243, "eval_runtime": 107.7822, "eval_samples_per_second": 221.28, "eval_steps_per_second": 6.921, "step": 24500 }, { "epoch": 7.514277126540427, "grad_norm": 2.6643712520599365, "learning_rate": 7.500000000000001e-05, "loss": 0.9939, "step": 25000 }, { "epoch": 7.514277126540427, "eval_accuracy": 0.7868022880070933, "eval_loss": 1.2337555885314941, "eval_runtime": 106.7322, "eval_samples_per_second": 223.456, "eval_steps_per_second": 6.989, "step": 25000 }, { "epoch": 7.664562669071236, "grad_norm": 2.679358720779419, "learning_rate": 7.450000000000001e-05, "loss": 0.9921, "step": 25500 }, { "epoch": 7.664562669071236, "eval_accuracy": 0.7861627827490367, "eval_loss": 1.2326431274414062, "eval_runtime": 107.5896, "eval_samples_per_second": 221.676, "eval_steps_per_second": 6.934, "step": 25500 }, { "epoch": 7.814848211602044, "grad_norm": 2.5837836265563965, "learning_rate": 7.4e-05, "loss": 0.9973, "step": 26000 }, { "epoch": 7.814848211602044, "eval_accuracy": 0.7871569919088683, "eval_loss": 1.2331918478012085, "eval_runtime": 107.9543, "eval_samples_per_second": 220.927, "eval_steps_per_second": 6.91, "step": 26000 }, { "epoch": 7.965133754132852, "grad_norm": 2.536043882369995, "learning_rate": 7.35e-05, "loss": 0.9953, "step": 26500 }, { "epoch": 7.965133754132852, "eval_accuracy": 0.7891570577004067, "eval_loss": 1.2163290977478027, "eval_runtime": 113.9576, "eval_samples_per_second": 209.288, "eval_steps_per_second": 6.546, "step": 26500 }, { "epoch": 8.115419296663662, "grad_norm": 2.690735340118408, "learning_rate": 7.3e-05, "loss": 0.9847, "step": 27000 }, { "epoch": 8.115419296663662, "eval_accuracy": 0.7888144688528846, "eval_loss": 1.209494709968567, "eval_runtime": 116.0276, "eval_samples_per_second": 205.555, "eval_steps_per_second": 6.43, "step": 27000 }, { "epoch": 8.26570483919447, "grad_norm": 2.4185330867767334, "learning_rate": 7.25e-05, "loss": 0.9797, "step": 27500 }, { "epoch": 8.26570483919447, "eval_accuracy": 0.7891209106681478, "eval_loss": 1.218719482421875, "eval_runtime": 114.5016, "eval_samples_per_second": 208.294, "eval_steps_per_second": 6.515, "step": 27500 }, { "epoch": 8.415990381725278, "grad_norm": 2.7199230194091797, "learning_rate": 7.2e-05, "loss": 0.9755, "step": 28000 }, { "epoch": 8.415990381725278, "eval_accuracy": 0.7887393254933777, "eval_loss": 1.2271952629089355, "eval_runtime": 111.4173, "eval_samples_per_second": 214.06, "eval_steps_per_second": 6.696, "step": 28000 }, { "epoch": 8.566275924256086, "grad_norm": 2.6658599376678467, "learning_rate": 7.15e-05, "loss": 0.9749, "step": 28500 }, { "epoch": 8.566275924256086, "eval_accuracy": 0.7904619110106559, "eval_loss": 1.217193603515625, "eval_runtime": 107.3348, "eval_samples_per_second": 222.202, "eval_steps_per_second": 6.95, "step": 28500 }, { "epoch": 8.716561466786896, "grad_norm": 2.9954679012298584, "learning_rate": 7.1e-05, "loss": 0.9747, "step": 29000 }, { "epoch": 8.716561466786896, "eval_accuracy": 0.7904568069124281, "eval_loss": 1.2054858207702637, "eval_runtime": 111.9259, "eval_samples_per_second": 213.087, "eval_steps_per_second": 6.665, "step": 29000 }, { "epoch": 8.866847009317704, "grad_norm": 2.506471872329712, "learning_rate": 7.05e-05, "loss": 0.9715, "step": 29500 }, { "epoch": 8.866847009317704, "eval_accuracy": 0.790933514493206, "eval_loss": 1.203903317451477, "eval_runtime": 106.5362, "eval_samples_per_second": 223.868, "eval_steps_per_second": 7.002, "step": 29500 }, { "epoch": 9.017132551848512, "grad_norm": 2.67307710647583, "learning_rate": 7e-05, "loss": 0.9716, "step": 30000 }, { "epoch": 9.017132551848512, "eval_accuracy": 0.7914492771721259, "eval_loss": 1.2048789262771606, "eval_runtime": 121.4256, "eval_samples_per_second": 196.417, "eval_steps_per_second": 6.144, "step": 30000 }, { "epoch": 9.16741809437932, "grad_norm": 2.9317779541015625, "learning_rate": 6.95e-05, "loss": 0.962, "step": 30500 }, { "epoch": 9.16741809437932, "eval_accuracy": 0.7917621280752731, "eval_loss": 1.199426293373108, "eval_runtime": 121.1706, "eval_samples_per_second": 196.83, "eval_steps_per_second": 6.157, "step": 30500 }, { "epoch": 9.31770363691013, "grad_norm": 2.679703712463379, "learning_rate": 6.9e-05, "loss": 0.9642, "step": 31000 }, { "epoch": 9.31770363691013, "eval_accuracy": 0.7925397227321643, "eval_loss": 1.193407654762268, "eval_runtime": 123.2276, "eval_samples_per_second": 193.544, "eval_steps_per_second": 6.054, "step": 31000 }, { "epoch": 9.467989179440938, "grad_norm": 2.4788730144500732, "learning_rate": 6.850000000000001e-05, "loss": 0.9601, "step": 31500 }, { "epoch": 9.467989179440938, "eval_accuracy": 0.792334985338317, "eval_loss": 1.1934560537338257, "eval_runtime": 119.6088, "eval_samples_per_second": 199.4, "eval_steps_per_second": 6.237, "step": 31500 }, { "epoch": 9.618274721971746, "grad_norm": 2.8737170696258545, "learning_rate": 6.800000000000001e-05, "loss": 0.9541, "step": 32000 }, { "epoch": 9.618274721971746, "eval_accuracy": 0.7933121889138387, "eval_loss": 1.1921508312225342, "eval_runtime": 124.8462, "eval_samples_per_second": 191.035, "eval_steps_per_second": 5.975, "step": 32000 }, { "epoch": 9.768560264502554, "grad_norm": 2.7593533992767334, "learning_rate": 6.750000000000001e-05, "loss": 0.9599, "step": 32500 }, { "epoch": 9.768560264502554, "eval_accuracy": 0.7930929148774838, "eval_loss": 1.2042547464370728, "eval_runtime": 124.9577, "eval_samples_per_second": 190.865, "eval_steps_per_second": 5.97, "step": 32500 }, { "epoch": 9.918845807033364, "grad_norm": 2.389718532562256, "learning_rate": 6.7e-05, "loss": 0.9649, "step": 33000 }, { "epoch": 9.918845807033364, "eval_accuracy": 0.7939532145922783, "eval_loss": 1.1914194822311401, "eval_runtime": 120.967, "eval_samples_per_second": 197.161, "eval_steps_per_second": 6.167, "step": 33000 }, { "epoch": 10.069131349564172, "grad_norm": 2.846874475479126, "learning_rate": 6.65e-05, "loss": 0.9467, "step": 33500 }, { "epoch": 10.069131349564172, "eval_accuracy": 0.7947739980138828, "eval_loss": 1.1852329969406128, "eval_runtime": 120.8513, "eval_samples_per_second": 197.35, "eval_steps_per_second": 6.173, "step": 33500 }, { "epoch": 10.21941689209498, "grad_norm": 2.58475399017334, "learning_rate": 6.6e-05, "loss": 0.9406, "step": 34000 }, { "epoch": 10.21941689209498, "eval_accuracy": 0.7943515220267853, "eval_loss": 1.1843186616897583, "eval_runtime": 121.8305, "eval_samples_per_second": 195.764, "eval_steps_per_second": 6.123, "step": 34000 }, { "epoch": 10.36970243462579, "grad_norm": 3.003615140914917, "learning_rate": 6.55e-05, "loss": 0.9395, "step": 34500 }, { "epoch": 10.36970243462579, "eval_accuracy": 0.7948140182397172, "eval_loss": 1.1831063032150269, "eval_runtime": 120.2856, "eval_samples_per_second": 198.278, "eval_steps_per_second": 6.202, "step": 34500 }, { "epoch": 10.519987977156598, "grad_norm": 2.6997032165527344, "learning_rate": 6.500000000000001e-05, "loss": 0.9338, "step": 35000 }, { "epoch": 10.519987977156598, "eval_accuracy": 0.7955270740039722, "eval_loss": 1.1888868808746338, "eval_runtime": 121.3291, "eval_samples_per_second": 196.573, "eval_steps_per_second": 6.149, "step": 35000 }, { "epoch": 10.670273519687406, "grad_norm": 2.5620720386505127, "learning_rate": 6.450000000000001e-05, "loss": 0.9413, "step": 35500 }, { "epoch": 10.670273519687406, "eval_accuracy": 0.7960009228127132, "eval_loss": 1.1771271228790283, "eval_runtime": 116.3789, "eval_samples_per_second": 204.934, "eval_steps_per_second": 6.41, "step": 35500 }, { "epoch": 10.820559062218214, "grad_norm": 2.4902310371398926, "learning_rate": 6.400000000000001e-05, "loss": 0.941, "step": 36000 }, { "epoch": 10.820559062218214, "eval_accuracy": 0.7952424860773146, "eval_loss": 1.176774263381958, "eval_runtime": 116.6512, "eval_samples_per_second": 204.456, "eval_steps_per_second": 6.395, "step": 36000 }, { "epoch": 10.970844604749024, "grad_norm": 2.6512043476104736, "learning_rate": 6.35e-05, "loss": 0.9371, "step": 36500 }, { "epoch": 10.970844604749024, "eval_accuracy": 0.796440942176153, "eval_loss": 1.1723679304122925, "eval_runtime": 119.8979, "eval_samples_per_second": 198.919, "eval_steps_per_second": 6.222, "step": 36500 }, { "epoch": 11.121130147279832, "grad_norm": 2.6038336753845215, "learning_rate": 6.3e-05, "loss": 0.9293, "step": 37000 }, { "epoch": 11.121130147279832, "eval_accuracy": 0.7967239525153044, "eval_loss": 1.1750506162643433, "eval_runtime": 120.4764, "eval_samples_per_second": 197.964, "eval_steps_per_second": 6.192, "step": 37000 }, { "epoch": 11.27141568981064, "grad_norm": 2.5120317935943604, "learning_rate": 6.25e-05, "loss": 0.9279, "step": 37500 }, { "epoch": 11.27141568981064, "eval_accuracy": 0.7974463986990397, "eval_loss": 1.1721601486206055, "eval_runtime": 115.5123, "eval_samples_per_second": 206.471, "eval_steps_per_second": 6.458, "step": 37500 }, { "epoch": 11.421701232341448, "grad_norm": 2.6776065826416016, "learning_rate": 6.2e-05, "loss": 0.9273, "step": 38000 }, { "epoch": 11.421701232341448, "eval_accuracy": 0.7977723054491466, "eval_loss": 1.1790024042129517, "eval_runtime": 118.0413, "eval_samples_per_second": 202.048, "eval_steps_per_second": 6.32, "step": 38000 }, { "epoch": 11.571986774872258, "grad_norm": 2.473292589187622, "learning_rate": 6.15e-05, "loss": 0.9176, "step": 38500 }, { "epoch": 11.571986774872258, "eval_accuracy": 0.7978240654330204, "eval_loss": 1.1769903898239136, "eval_runtime": 119.5414, "eval_samples_per_second": 199.512, "eval_steps_per_second": 6.241, "step": 38500 }, { "epoch": 11.722272317403066, "grad_norm": 2.573493242263794, "learning_rate": 6.1e-05, "loss": 0.918, "step": 39000 }, { "epoch": 11.722272317403066, "eval_accuracy": 0.7978464067774307, "eval_loss": 1.1745543479919434, "eval_runtime": 118.8031, "eval_samples_per_second": 200.752, "eval_steps_per_second": 6.279, "step": 39000 }, { "epoch": 11.872557859933874, "grad_norm": 2.496293067932129, "learning_rate": 6.05e-05, "loss": 0.9209, "step": 39500 }, { "epoch": 11.872557859933874, "eval_accuracy": 0.7988222222591843, "eval_loss": 1.1572139263153076, "eval_runtime": 119.3596, "eval_samples_per_second": 199.816, "eval_steps_per_second": 6.25, "step": 39500 }, { "epoch": 12.022843402464684, "grad_norm": 2.8805453777313232, "learning_rate": 6e-05, "loss": 0.9204, "step": 40000 }, { "epoch": 12.022843402464684, "eval_accuracy": 0.799307208811464, "eval_loss": 1.1740858554840088, "eval_runtime": 118.6109, "eval_samples_per_second": 201.078, "eval_steps_per_second": 6.289, "step": 40000 }, { "epoch": 12.173128944995492, "grad_norm": 2.6088273525238037, "learning_rate": 5.95e-05, "loss": 0.9149, "step": 40500 }, { "epoch": 12.173128944995492, "eval_accuracy": 0.7991013549531508, "eval_loss": 1.1727755069732666, "eval_runtime": 117.7954, "eval_samples_per_second": 202.47, "eval_steps_per_second": 6.333, "step": 40500 }, { "epoch": 12.3234144875263, "grad_norm": 2.426567316055298, "learning_rate": 5.9e-05, "loss": 0.9009, "step": 41000 }, { "epoch": 12.3234144875263, "eval_accuracy": 0.798481914968918, "eval_loss": 1.1672251224517822, "eval_runtime": 119.0757, "eval_samples_per_second": 200.293, "eval_steps_per_second": 6.265, "step": 41000 }, { "epoch": 12.473700030057108, "grad_norm": 2.687640428543091, "learning_rate": 5.85e-05, "loss": 0.9094, "step": 41500 }, { "epoch": 12.473700030057108, "eval_accuracy": 0.7998230800562746, "eval_loss": 1.1598495244979858, "eval_runtime": 118.9102, "eval_samples_per_second": 200.572, "eval_steps_per_second": 6.274, "step": 41500 }, { "epoch": 12.623985572587918, "grad_norm": 3.0333776473999023, "learning_rate": 5.8e-05, "loss": 0.9149, "step": 42000 }, { "epoch": 12.623985572587918, "eval_accuracy": 0.7999625316919439, "eval_loss": 1.171325445175171, "eval_runtime": 115.3742, "eval_samples_per_second": 206.719, "eval_steps_per_second": 6.466, "step": 42000 }, { "epoch": 12.774271115118726, "grad_norm": 2.299436569213867, "learning_rate": 5.7499999999999995e-05, "loss": 0.9068, "step": 42500 }, { "epoch": 12.774271115118726, "eval_accuracy": 0.8010648350504735, "eval_loss": 1.1537537574768066, "eval_runtime": 118.7443, "eval_samples_per_second": 200.852, "eval_steps_per_second": 6.282, "step": 42500 }, { "epoch": 12.924556657649534, "grad_norm": 2.7340447902679443, "learning_rate": 5.6999999999999996e-05, "loss": 0.9099, "step": 43000 }, { "epoch": 12.924556657649534, "eval_accuracy": 0.8012578176921267, "eval_loss": 1.1426852941513062, "eval_runtime": 120.2664, "eval_samples_per_second": 198.31, "eval_steps_per_second": 6.203, "step": 43000 }, { "epoch": 13.074842200180342, "grad_norm": 2.6585094928741455, "learning_rate": 5.65e-05, "loss": 0.8951, "step": 43500 }, { "epoch": 13.074842200180342, "eval_accuracy": 0.8005615361847466, "eval_loss": 1.1578710079193115, "eval_runtime": 115.0048, "eval_samples_per_second": 207.383, "eval_steps_per_second": 6.487, "step": 43500 }, { "epoch": 13.225127742711152, "grad_norm": 2.6981582641601562, "learning_rate": 5.6000000000000006e-05, "loss": 0.8886, "step": 44000 }, { "epoch": 13.225127742711152, "eval_accuracy": 0.8010340860790154, "eval_loss": 1.1602416038513184, "eval_runtime": 113.2435, "eval_samples_per_second": 210.608, "eval_steps_per_second": 6.588, "step": 44000 }, { "epoch": 13.37541328524196, "grad_norm": 2.6016407012939453, "learning_rate": 5.550000000000001e-05, "loss": 0.9057, "step": 44500 }, { "epoch": 13.37541328524196, "eval_accuracy": 0.8021396084503188, "eval_loss": 1.1406522989273071, "eval_runtime": 115.7578, "eval_samples_per_second": 206.034, "eval_steps_per_second": 6.444, "step": 44500 }, { "epoch": 13.525698827772768, "grad_norm": 2.693239688873291, "learning_rate": 5.500000000000001e-05, "loss": 0.8921, "step": 45000 }, { "epoch": 13.525698827772768, "eval_accuracy": 0.801952397809383, "eval_loss": 1.1361913681030273, "eval_runtime": 112.7804, "eval_samples_per_second": 211.473, "eval_steps_per_second": 6.615, "step": 45000 }, { "epoch": 13.675984370303578, "grad_norm": 2.386279582977295, "learning_rate": 5.45e-05, "loss": 0.8942, "step": 45500 }, { "epoch": 13.675984370303578, "eval_accuracy": 0.8024716769653207, "eval_loss": 1.1441001892089844, "eval_runtime": 114.7396, "eval_samples_per_second": 207.862, "eval_steps_per_second": 6.502, "step": 45500 }, { "epoch": 13.826269912834386, "grad_norm": 2.3237531185150146, "learning_rate": 5.4000000000000005e-05, "loss": 0.8939, "step": 46000 }, { "epoch": 13.826269912834386, "eval_accuracy": 0.8025017531571386, "eval_loss": 1.141733169555664, "eval_runtime": 112.3505, "eval_samples_per_second": 212.282, "eval_steps_per_second": 6.64, "step": 46000 }, { "epoch": 13.976555455365194, "grad_norm": 2.463498592376709, "learning_rate": 5.3500000000000006e-05, "loss": 0.891, "step": 46500 }, { "epoch": 13.976555455365194, "eval_accuracy": 0.803545043091428, "eval_loss": 1.144685983657837, "eval_runtime": 114.632, "eval_samples_per_second": 208.057, "eval_steps_per_second": 6.508, "step": 46500 }, { "epoch": 14.126840997896002, "grad_norm": 2.7393481731414795, "learning_rate": 5.300000000000001e-05, "loss": 0.8846, "step": 47000 }, { "epoch": 14.126840997896002, "eval_accuracy": 0.8038034340555349, "eval_loss": 1.1320561170578003, "eval_runtime": 112.5915, "eval_samples_per_second": 211.828, "eval_steps_per_second": 6.626, "step": 47000 }, { "epoch": 14.277126540426812, "grad_norm": 2.776505708694458, "learning_rate": 5.25e-05, "loss": 0.8841, "step": 47500 }, { "epoch": 14.277126540426812, "eval_accuracy": 0.8034705833910766, "eval_loss": 1.1436700820922852, "eval_runtime": 114.9721, "eval_samples_per_second": 207.442, "eval_steps_per_second": 6.489, "step": 47500 }, { "epoch": 14.42741208295762, "grad_norm": 2.536817789077759, "learning_rate": 5.2000000000000004e-05, "loss": 0.8811, "step": 48000 }, { "epoch": 14.42741208295762, "eval_accuracy": 0.8044537392561116, "eval_loss": 1.133318305015564, "eval_runtime": 113.5045, "eval_samples_per_second": 210.124, "eval_steps_per_second": 6.572, "step": 48000 }, { "epoch": 14.577697625488428, "grad_norm": 2.524203062057495, "learning_rate": 5.1500000000000005e-05, "loss": 0.8778, "step": 48500 }, { "epoch": 14.577697625488428, "eval_accuracy": 0.8039248049976617, "eval_loss": 1.1258032321929932, "eval_runtime": 114.9378, "eval_samples_per_second": 207.504, "eval_steps_per_second": 6.49, "step": 48500 }, { "epoch": 14.727983168019236, "grad_norm": 2.6129727363586426, "learning_rate": 5.1000000000000006e-05, "loss": 0.8765, "step": 49000 }, { "epoch": 14.727983168019236, "eval_accuracy": 0.8044657041630938, "eval_loss": 1.1352417469024658, "eval_runtime": 114.1357, "eval_samples_per_second": 208.962, "eval_steps_per_second": 6.536, "step": 49000 }, { "epoch": 14.878268710550046, "grad_norm": 2.6218881607055664, "learning_rate": 5.05e-05, "loss": 0.876, "step": 49500 }, { "epoch": 14.878268710550046, "eval_accuracy": 0.8049223412984352, "eval_loss": 1.131935954093933, "eval_runtime": 114.4278, "eval_samples_per_second": 208.428, "eval_steps_per_second": 6.519, "step": 49500 }, { "epoch": 15.028554253080854, "grad_norm": 2.654724597930908, "learning_rate": 5e-05, "loss": 0.879, "step": 50000 }, { "epoch": 15.028554253080854, "eval_accuracy": 0.8056327032697777, "eval_loss": 1.1161094903945923, "eval_runtime": 112.2273, "eval_samples_per_second": 212.515, "eval_steps_per_second": 6.647, "step": 50000 }, { "epoch": 15.178839795611662, "grad_norm": 2.569153308868408, "learning_rate": 4.9500000000000004e-05, "loss": 0.8674, "step": 50500 }, { "epoch": 15.178839795611662, "eval_accuracy": 0.8048139780991767, "eval_loss": 1.1278635263442993, "eval_runtime": 114.7277, "eval_samples_per_second": 207.884, "eval_steps_per_second": 6.502, "step": 50500 }, { "epoch": 15.32912533814247, "grad_norm": 2.5382237434387207, "learning_rate": 4.9e-05, "loss": 0.8676, "step": 51000 }, { "epoch": 15.32912533814247, "eval_accuracy": 0.8064227363767525, "eval_loss": 1.132450819015503, "eval_runtime": 107.2478, "eval_samples_per_second": 222.382, "eval_steps_per_second": 6.956, "step": 51000 }, { "epoch": 15.47941088067328, "grad_norm": 2.480746030807495, "learning_rate": 4.85e-05, "loss": 0.867, "step": 51500 }, { "epoch": 15.47941088067328, "eval_accuracy": 0.806509341887431, "eval_loss": 1.1208624839782715, "eval_runtime": 107.5739, "eval_samples_per_second": 221.708, "eval_steps_per_second": 6.935, "step": 51500 }, { "epoch": 15.629696423204088, "grad_norm": 2.4417829513549805, "learning_rate": 4.8e-05, "loss": 0.8648, "step": 52000 }, { "epoch": 15.629696423204088, "eval_accuracy": 0.80664691560334, "eval_loss": 1.123421549797058, "eval_runtime": 107.0763, "eval_samples_per_second": 222.738, "eval_steps_per_second": 6.967, "step": 52000 }, { "epoch": 15.779981965734896, "grad_norm": 2.3048479557037354, "learning_rate": 4.75e-05, "loss": 0.87, "step": 52500 }, { "epoch": 15.779981965734896, "eval_accuracy": 0.8067313801594327, "eval_loss": 1.1155991554260254, "eval_runtime": 108.1545, "eval_samples_per_second": 220.518, "eval_steps_per_second": 6.898, "step": 52500 }, { "epoch": 15.930267508265704, "grad_norm": 2.6421682834625244, "learning_rate": 4.7e-05, "loss": 0.8694, "step": 53000 }, { "epoch": 15.930267508265704, "eval_accuracy": 0.8066291950596075, "eval_loss": 1.1215757131576538, "eval_runtime": 109.1629, "eval_samples_per_second": 218.481, "eval_steps_per_second": 6.834, "step": 53000 }, { "epoch": 16.080553050796514, "grad_norm": 2.4313910007476807, "learning_rate": 4.6500000000000005e-05, "loss": 0.8668, "step": 53500 }, { "epoch": 16.080553050796514, "eval_accuracy": 0.8079721270128156, "eval_loss": 1.1246066093444824, "eval_runtime": 108.472, "eval_samples_per_second": 219.872, "eval_steps_per_second": 6.877, "step": 53500 }, { "epoch": 16.230838593327324, "grad_norm": 2.2622108459472656, "learning_rate": 4.600000000000001e-05, "loss": 0.8603, "step": 54000 }, { "epoch": 16.230838593327324, "eval_accuracy": 0.8070964068446762, "eval_loss": 1.1118344068527222, "eval_runtime": 107.6522, "eval_samples_per_second": 221.547, "eval_steps_per_second": 6.93, "step": 54000 }, { "epoch": 16.38112413585813, "grad_norm": 2.355970621109009, "learning_rate": 4.55e-05, "loss": 0.8557, "step": 54500 }, { "epoch": 16.38112413585813, "eval_accuracy": 0.8079940925101587, "eval_loss": 1.1261143684387207, "eval_runtime": 107.1531, "eval_samples_per_second": 222.579, "eval_steps_per_second": 6.962, "step": 54500 }, { "epoch": 16.53140967838894, "grad_norm": 2.4477593898773193, "learning_rate": 4.5e-05, "loss": 0.8521, "step": 55000 }, { "epoch": 16.53140967838894, "eval_accuracy": 0.8084604086659483, "eval_loss": 1.1086950302124023, "eval_runtime": 106.6462, "eval_samples_per_second": 223.637, "eval_steps_per_second": 6.995, "step": 55000 }, { "epoch": 16.681695220919746, "grad_norm": 3.1719090938568115, "learning_rate": 4.4500000000000004e-05, "loss": 0.8583, "step": 55500 }, { "epoch": 16.681695220919746, "eval_accuracy": 0.8085530062732024, "eval_loss": 1.1111265420913696, "eval_runtime": 107.453, "eval_samples_per_second": 221.958, "eval_steps_per_second": 6.943, "step": 55500 }, { "epoch": 16.831980763450556, "grad_norm": 2.867218494415283, "learning_rate": 4.4000000000000006e-05, "loss": 0.8509, "step": 56000 }, { "epoch": 16.831980763450556, "eval_accuracy": 0.8083466491211581, "eval_loss": 1.1217560768127441, "eval_runtime": 107.5379, "eval_samples_per_second": 221.782, "eval_steps_per_second": 6.937, "step": 56000 }, { "epoch": 16.982266305981366, "grad_norm": 2.421694040298462, "learning_rate": 4.35e-05, "loss": 0.8591, "step": 56500 }, { "epoch": 16.982266305981366, "eval_accuracy": 0.808431427546503, "eval_loss": 1.1109532117843628, "eval_runtime": 108.5465, "eval_samples_per_second": 219.721, "eval_steps_per_second": 6.873, "step": 56500 }, { "epoch": 17.132551848512172, "grad_norm": 2.6324167251586914, "learning_rate": 4.3e-05, "loss": 0.8417, "step": 57000 }, { "epoch": 17.132551848512172, "eval_accuracy": 0.8085933337773042, "eval_loss": 1.1125506162643433, "eval_runtime": 108.0877, "eval_samples_per_second": 220.654, "eval_steps_per_second": 6.902, "step": 57000 }, { "epoch": 17.282837391042982, "grad_norm": 2.448883056640625, "learning_rate": 4.25e-05, "loss": 0.8511, "step": 57500 }, { "epoch": 17.282837391042982, "eval_accuracy": 0.8093454793356915, "eval_loss": 1.1108651161193848, "eval_runtime": 108.0036, "eval_samples_per_second": 220.826, "eval_steps_per_second": 6.907, "step": 57500 }, { "epoch": 17.43312293357379, "grad_norm": 2.647814989089966, "learning_rate": 4.2e-05, "loss": 0.8472, "step": 58000 }, { "epoch": 17.43312293357379, "eval_accuracy": 0.8097202877059791, "eval_loss": 1.1124111413955688, "eval_runtime": 107.9075, "eval_samples_per_second": 221.023, "eval_steps_per_second": 6.913, "step": 58000 }, { "epoch": 17.583408476104598, "grad_norm": 2.3744354248046875, "learning_rate": 4.15e-05, "loss": 0.8381, "step": 58500 }, { "epoch": 17.583408476104598, "eval_accuracy": 0.8095948479389424, "eval_loss": 1.1056084632873535, "eval_runtime": 107.6903, "eval_samples_per_second": 221.468, "eval_steps_per_second": 6.927, "step": 58500 }, { "epoch": 17.733694018635408, "grad_norm": 2.3030834197998047, "learning_rate": 4.1e-05, "loss": 0.8474, "step": 59000 }, { "epoch": 17.733694018635408, "eval_accuracy": 0.8102527131048097, "eval_loss": 1.1011704206466675, "eval_runtime": 107.5969, "eval_samples_per_second": 221.661, "eval_steps_per_second": 6.933, "step": 59000 }, { "epoch": 17.883979561166214, "grad_norm": 2.610208749771118, "learning_rate": 4.05e-05, "loss": 0.8456, "step": 59500 }, { "epoch": 17.883979561166214, "eval_accuracy": 0.8108135572608609, "eval_loss": 1.097122073173523, "eval_runtime": 106.6046, "eval_samples_per_second": 223.724, "eval_steps_per_second": 6.998, "step": 59500 }, { "epoch": 18.034265103697024, "grad_norm": 2.491633176803589, "learning_rate": 4e-05, "loss": 0.8367, "step": 60000 }, { "epoch": 18.034265103697024, "eval_accuracy": 0.8112650064622723, "eval_loss": 1.0913532972335815, "eval_runtime": 107.2029, "eval_samples_per_second": 222.475, "eval_steps_per_second": 6.959, "step": 60000 }, { "epoch": 18.184550646227834, "grad_norm": 2.270582437515259, "learning_rate": 3.9500000000000005e-05, "loss": 0.8336, "step": 60500 }, { "epoch": 18.184550646227834, "eval_accuracy": 0.8104727212172935, "eval_loss": 1.110021710395813, "eval_runtime": 106.0993, "eval_samples_per_second": 224.789, "eval_steps_per_second": 7.031, "step": 60500 }, { "epoch": 18.33483618875864, "grad_norm": 2.4716830253601074, "learning_rate": 3.9000000000000006e-05, "loss": 0.8404, "step": 61000 }, { "epoch": 18.33483618875864, "eval_accuracy": 0.8114022782578146, "eval_loss": 1.0959724187850952, "eval_runtime": 107.1337, "eval_samples_per_second": 222.619, "eval_steps_per_second": 6.963, "step": 61000 }, { "epoch": 18.48512173128945, "grad_norm": 2.679825782775879, "learning_rate": 3.85e-05, "loss": 0.8365, "step": 61500 }, { "epoch": 18.48512173128945, "eval_accuracy": 0.8112698414020383, "eval_loss": 1.0895620584487915, "eval_runtime": 107.9425, "eval_samples_per_second": 220.951, "eval_steps_per_second": 6.911, "step": 61500 }, { "epoch": 18.63540727382026, "grad_norm": 2.5028414726257324, "learning_rate": 3.8e-05, "loss": 0.8308, "step": 62000 }, { "epoch": 18.63540727382026, "eval_accuracy": 0.8112445219475855, "eval_loss": 1.098541498184204, "eval_runtime": 108.1696, "eval_samples_per_second": 220.487, "eval_steps_per_second": 6.897, "step": 62000 }, { "epoch": 18.785692816351066, "grad_norm": 2.4303503036499023, "learning_rate": 3.7500000000000003e-05, "loss": 0.8308, "step": 62500 }, { "epoch": 18.785692816351066, "eval_accuracy": 0.8120511999201301, "eval_loss": 1.0968284606933594, "eval_runtime": 107.5288, "eval_samples_per_second": 221.801, "eval_steps_per_second": 6.938, "step": 62500 }, { "epoch": 18.935978358881876, "grad_norm": 2.9847395420074463, "learning_rate": 3.7e-05, "loss": 0.8312, "step": 63000 }, { "epoch": 18.935978358881876, "eval_accuracy": 0.8121808312724796, "eval_loss": 1.0884159803390503, "eval_runtime": 107.9223, "eval_samples_per_second": 220.992, "eval_steps_per_second": 6.912, "step": 63000 }, { "epoch": 19.086263901412686, "grad_norm": 2.302724838256836, "learning_rate": 3.65e-05, "loss": 0.8279, "step": 63500 }, { "epoch": 19.086263901412686, "eval_accuracy": 0.8127240285256392, "eval_loss": 1.0900928974151611, "eval_runtime": 106.9119, "eval_samples_per_second": 223.081, "eval_steps_per_second": 6.978, "step": 63500 }, { "epoch": 19.236549443943492, "grad_norm": 2.4187684059143066, "learning_rate": 3.6e-05, "loss": 0.8219, "step": 64000 }, { "epoch": 19.236549443943492, "eval_accuracy": 0.8129403972187366, "eval_loss": 1.0872172117233276, "eval_runtime": 109.8336, "eval_samples_per_second": 217.147, "eval_steps_per_second": 6.792, "step": 64000 }, { "epoch": 19.3868349864743, "grad_norm": 2.740501880645752, "learning_rate": 3.55e-05, "loss": 0.8266, "step": 64500 }, { "epoch": 19.3868349864743, "eval_accuracy": 0.8114303723800741, "eval_loss": 1.0987831354141235, "eval_runtime": 109.7168, "eval_samples_per_second": 217.378, "eval_steps_per_second": 6.799, "step": 64500 }, { "epoch": 19.537120529005108, "grad_norm": 2.326366424560547, "learning_rate": 3.5e-05, "loss": 0.8255, "step": 65000 }, { "epoch": 19.537120529005108, "eval_accuracy": 0.8124286882301512, "eval_loss": 1.0885217189788818, "eval_runtime": 114.778, "eval_samples_per_second": 207.792, "eval_steps_per_second": 6.5, "step": 65000 }, { "epoch": 19.687406071535918, "grad_norm": 3.1694321632385254, "learning_rate": 3.45e-05, "loss": 0.8185, "step": 65500 }, { "epoch": 19.687406071535918, "eval_accuracy": 0.8129749253547676, "eval_loss": 1.0836535692214966, "eval_runtime": 114.5951, "eval_samples_per_second": 208.124, "eval_steps_per_second": 6.51, "step": 65500 }, { "epoch": 19.837691614066728, "grad_norm": 2.6415817737579346, "learning_rate": 3.4000000000000007e-05, "loss": 0.8219, "step": 66000 }, { "epoch": 19.837691614066728, "eval_accuracy": 0.8122496169754481, "eval_loss": 1.0868114233016968, "eval_runtime": 113.7304, "eval_samples_per_second": 209.707, "eval_steps_per_second": 6.559, "step": 66000 }, { "epoch": 19.987977156597534, "grad_norm": 2.567044496536255, "learning_rate": 3.35e-05, "loss": 0.8214, "step": 66500 }, { "epoch": 19.987977156597534, "eval_accuracy": 0.8135328500351103, "eval_loss": 1.0849970579147339, "eval_runtime": 114.2096, "eval_samples_per_second": 208.827, "eval_steps_per_second": 6.532, "step": 66500 }, { "epoch": 20.138262699128344, "grad_norm": 2.475660562515259, "learning_rate": 3.3e-05, "loss": 0.8123, "step": 67000 }, { "epoch": 20.138262699128344, "eval_accuracy": 0.8127171094527156, "eval_loss": 1.0919703245162964, "eval_runtime": 110.4581, "eval_samples_per_second": 215.919, "eval_steps_per_second": 6.754, "step": 67000 }, { "epoch": 20.288548241659154, "grad_norm": 2.9205057621002197, "learning_rate": 3.2500000000000004e-05, "loss": 0.8146, "step": 67500 }, { "epoch": 20.288548241659154, "eval_accuracy": 0.8139352319239987, "eval_loss": 1.0828359127044678, "eval_runtime": 110.5729, "eval_samples_per_second": 215.695, "eval_steps_per_second": 6.747, "step": 67500 }, { "epoch": 20.43883378418996, "grad_norm": 2.775470018386841, "learning_rate": 3.2000000000000005e-05, "loss": 0.8117, "step": 68000 }, { "epoch": 20.43883378418996, "eval_accuracy": 0.8144743916913453, "eval_loss": 1.0843496322631836, "eval_runtime": 110.9504, "eval_samples_per_second": 214.961, "eval_steps_per_second": 6.724, "step": 68000 }, { "epoch": 20.58911932672077, "grad_norm": 2.8596460819244385, "learning_rate": 3.15e-05, "loss": 0.8142, "step": 68500 }, { "epoch": 20.58911932672077, "eval_accuracy": 0.8145867898001746, "eval_loss": 1.0775424242019653, "eval_runtime": 110.2042, "eval_samples_per_second": 216.416, "eval_steps_per_second": 6.769, "step": 68500 }, { "epoch": 20.73940486925158, "grad_norm": 2.5671284198760986, "learning_rate": 3.1e-05, "loss": 0.8176, "step": 69000 }, { "epoch": 20.73940486925158, "eval_accuracy": 0.8140961234786387, "eval_loss": 1.0756142139434814, "eval_runtime": 110.7907, "eval_samples_per_second": 215.271, "eval_steps_per_second": 6.733, "step": 69000 }, { "epoch": 20.889690411782386, "grad_norm": 2.549713373184204, "learning_rate": 3.05e-05, "loss": 0.813, "step": 69500 }, { "epoch": 20.889690411782386, "eval_accuracy": 0.8145543791131244, "eval_loss": 1.0741270780563354, "eval_runtime": 113.2044, "eval_samples_per_second": 210.681, "eval_steps_per_second": 6.59, "step": 69500 }, { "epoch": 21.039975954313196, "grad_norm": 2.433366060256958, "learning_rate": 3e-05, "loss": 0.8044, "step": 70000 }, { "epoch": 21.039975954313196, "eval_accuracy": 0.8138238331972251, "eval_loss": 1.0890129804611206, "eval_runtime": 113.5064, "eval_samples_per_second": 210.12, "eval_steps_per_second": 6.572, "step": 70000 }, { "epoch": 21.190261496844002, "grad_norm": 2.672717571258545, "learning_rate": 2.95e-05, "loss": 0.8034, "step": 70500 }, { "epoch": 21.190261496844002, "eval_accuracy": 0.8146000723609511, "eval_loss": 1.0757637023925781, "eval_runtime": 109.4526, "eval_samples_per_second": 217.902, "eval_steps_per_second": 6.816, "step": 70500 }, { "epoch": 21.340547039374812, "grad_norm": 2.7107882499694824, "learning_rate": 2.9e-05, "loss": 0.8007, "step": 71000 }, { "epoch": 21.340547039374812, "eval_accuracy": 0.815981095746543, "eval_loss": 1.074793815612793, "eval_runtime": 113.0561, "eval_samples_per_second": 210.957, "eval_steps_per_second": 6.598, "step": 71000 }, { "epoch": 21.49083258190562, "grad_norm": 2.419224262237549, "learning_rate": 2.8499999999999998e-05, "loss": 0.8009, "step": 71500 }, { "epoch": 21.49083258190562, "eval_accuracy": 0.8157551774698867, "eval_loss": 1.0713990926742554, "eval_runtime": 114.1603, "eval_samples_per_second": 208.917, "eval_steps_per_second": 6.535, "step": 71500 }, { "epoch": 21.641118124436428, "grad_norm": 2.68849778175354, "learning_rate": 2.8000000000000003e-05, "loss": 0.8042, "step": 72000 }, { "epoch": 21.641118124436428, "eval_accuracy": 0.816240857712766, "eval_loss": 1.0804829597473145, "eval_runtime": 113.3941, "eval_samples_per_second": 210.328, "eval_steps_per_second": 6.579, "step": 72000 }, { "epoch": 21.791403666967238, "grad_norm": 2.3715298175811768, "learning_rate": 2.7500000000000004e-05, "loss": 0.7971, "step": 72500 }, { "epoch": 21.791403666967238, "eval_accuracy": 0.8154816358162141, "eval_loss": 1.0725679397583008, "eval_runtime": 114.0456, "eval_samples_per_second": 209.127, "eval_steps_per_second": 6.541, "step": 72500 }, { "epoch": 21.941689209498048, "grad_norm": 2.6499125957489014, "learning_rate": 2.7000000000000002e-05, "loss": 0.807, "step": 73000 }, { "epoch": 21.941689209498048, "eval_accuracy": 0.8168061183380335, "eval_loss": 1.0588874816894531, "eval_runtime": 112.5964, "eval_samples_per_second": 211.819, "eval_steps_per_second": 6.625, "step": 73000 }, { "epoch": 22.091974752028854, "grad_norm": 2.766451358795166, "learning_rate": 2.6500000000000004e-05, "loss": 0.7973, "step": 73500 }, { "epoch": 22.091974752028854, "eval_accuracy": 0.8168862140868284, "eval_loss": 1.063796877861023, "eval_runtime": 113.9911, "eval_samples_per_second": 209.227, "eval_steps_per_second": 6.544, "step": 73500 }, { "epoch": 22.242260294559664, "grad_norm": 2.5453474521636963, "learning_rate": 2.6000000000000002e-05, "loss": 0.7915, "step": 74000 }, { "epoch": 22.242260294559664, "eval_accuracy": 0.8162707303528417, "eval_loss": 1.068402886390686, "eval_runtime": 113.1917, "eval_samples_per_second": 210.704, "eval_steps_per_second": 6.591, "step": 74000 }, { "epoch": 22.392545837090474, "grad_norm": 2.449525833129883, "learning_rate": 2.5500000000000003e-05, "loss": 0.8019, "step": 74500 }, { "epoch": 22.392545837090474, "eval_accuracy": 0.8168465033060288, "eval_loss": 1.071266531944275, "eval_runtime": 114.1644, "eval_samples_per_second": 208.909, "eval_steps_per_second": 6.534, "step": 74500 }, { "epoch": 22.54283137962128, "grad_norm": 2.782717704772949, "learning_rate": 2.5e-05, "loss": 0.7959, "step": 75000 }, { "epoch": 22.54283137962128, "eval_accuracy": 0.8176195224595183, "eval_loss": 1.0642682313919067, "eval_runtime": 112.938, "eval_samples_per_second": 211.178, "eval_steps_per_second": 6.605, "step": 75000 }, { "epoch": 22.69311692215209, "grad_norm": 2.754309892654419, "learning_rate": 2.45e-05, "loss": 0.7905, "step": 75500 }, { "epoch": 22.69311692215209, "eval_accuracy": 0.8177573368082611, "eval_loss": 1.0715863704681396, "eval_runtime": 113.2687, "eval_samples_per_second": 210.561, "eval_steps_per_second": 6.586, "step": 75500 }, { "epoch": 22.843402464682896, "grad_norm": 2.665132999420166, "learning_rate": 2.4e-05, "loss": 0.7894, "step": 76000 }, { "epoch": 22.843402464682896, "eval_accuracy": 0.8184662117931626, "eval_loss": 1.0566191673278809, "eval_runtime": 114.3215, "eval_samples_per_second": 208.622, "eval_steps_per_second": 6.525, "step": 76000 }, { "epoch": 22.993688007213706, "grad_norm": 2.2912895679473877, "learning_rate": 2.35e-05, "loss": 0.789, "step": 76500 }, { "epoch": 22.993688007213706, "eval_accuracy": 0.8173126447012466, "eval_loss": 1.0590641498565674, "eval_runtime": 114.0949, "eval_samples_per_second": 209.037, "eval_steps_per_second": 6.538, "step": 76500 }, { "epoch": 23.143973549744516, "grad_norm": 2.7320480346679688, "learning_rate": 2.3000000000000003e-05, "loss": 0.7859, "step": 77000 }, { "epoch": 23.143973549744516, "eval_accuracy": 0.8181692377590023, "eval_loss": 1.0568209886550903, "eval_runtime": 114.0712, "eval_samples_per_second": 209.08, "eval_steps_per_second": 6.54, "step": 77000 }, { "epoch": 23.294259092275322, "grad_norm": 2.5960936546325684, "learning_rate": 2.25e-05, "loss": 0.7894, "step": 77500 }, { "epoch": 23.294259092275322, "eval_accuracy": 0.8178769275591534, "eval_loss": 1.061540961265564, "eval_runtime": 113.9113, "eval_samples_per_second": 209.373, "eval_steps_per_second": 6.549, "step": 77500 }, { "epoch": 23.44454463480613, "grad_norm": 2.435558557510376, "learning_rate": 2.2000000000000003e-05, "loss": 0.7887, "step": 78000 }, { "epoch": 23.44454463480613, "eval_accuracy": 0.8177420807085761, "eval_loss": 1.0617201328277588, "eval_runtime": 113.4776, "eval_samples_per_second": 210.174, "eval_steps_per_second": 6.574, "step": 78000 }, { "epoch": 23.59483017733694, "grad_norm": 2.3692948818206787, "learning_rate": 2.15e-05, "loss": 0.7826, "step": 78500 }, { "epoch": 23.59483017733694, "eval_accuracy": 0.8180053577064287, "eval_loss": 1.0567620992660522, "eval_runtime": 113.7057, "eval_samples_per_second": 209.752, "eval_steps_per_second": 6.561, "step": 78500 }, { "epoch": 23.745115719867748, "grad_norm": 2.5609283447265625, "learning_rate": 2.1e-05, "loss": 0.7885, "step": 79000 }, { "epoch": 23.745115719867748, "eval_accuracy": 0.8188002648627605, "eval_loss": 1.0632256269454956, "eval_runtime": 113.6739, "eval_samples_per_second": 209.811, "eval_steps_per_second": 6.563, "step": 79000 }, { "epoch": 23.895401262398558, "grad_norm": 2.3372645378112793, "learning_rate": 2.05e-05, "loss": 0.7883, "step": 79500 }, { "epoch": 23.895401262398558, "eval_accuracy": 0.8187176321623402, "eval_loss": 1.066706657409668, "eval_runtime": 114.0886, "eval_samples_per_second": 209.048, "eval_steps_per_second": 6.539, "step": 79500 }, { "epoch": 24.045686804929367, "grad_norm": 2.3112826347351074, "learning_rate": 2e-05, "loss": 0.783, "step": 80000 }, { "epoch": 24.045686804929367, "eval_accuracy": 0.818846919349616, "eval_loss": 1.0611591339111328, "eval_runtime": 113.1339, "eval_samples_per_second": 210.812, "eval_steps_per_second": 6.594, "step": 80000 }, { "epoch": 24.195972347460174, "grad_norm": 2.514390230178833, "learning_rate": 1.9500000000000003e-05, "loss": 0.7804, "step": 80500 }, { "epoch": 24.195972347460174, "eval_accuracy": 0.818812757678464, "eval_loss": 1.0573077201843262, "eval_runtime": 114.2005, "eval_samples_per_second": 208.843, "eval_steps_per_second": 6.532, "step": 80500 }, { "epoch": 24.346257889990984, "grad_norm": 2.4737348556518555, "learning_rate": 1.9e-05, "loss": 0.7811, "step": 81000 }, { "epoch": 24.346257889990984, "eval_accuracy": 0.8192913294066332, "eval_loss": 1.0586949586868286, "eval_runtime": 113.7459, "eval_samples_per_second": 209.678, "eval_steps_per_second": 6.558, "step": 81000 }, { "epoch": 24.49654343252179, "grad_norm": 2.288757562637329, "learning_rate": 1.85e-05, "loss": 0.7767, "step": 81500 }, { "epoch": 24.49654343252179, "eval_accuracy": 0.8197264062916556, "eval_loss": 1.0525128841400146, "eval_runtime": 113.6132, "eval_samples_per_second": 209.923, "eval_steps_per_second": 6.566, "step": 81500 }, { "epoch": 24.6468289750526, "grad_norm": 2.4246654510498047, "learning_rate": 1.8e-05, "loss": 0.7803, "step": 82000 }, { "epoch": 24.6468289750526, "eval_accuracy": 0.8195798531764643, "eval_loss": 1.0466234683990479, "eval_runtime": 114.7779, "eval_samples_per_second": 207.793, "eval_steps_per_second": 6.5, "step": 82000 }, { "epoch": 24.79711451758341, "grad_norm": 3.0004007816314697, "learning_rate": 1.75e-05, "loss": 0.7688, "step": 82500 }, { "epoch": 24.79711451758341, "eval_accuracy": 0.8197989075740256, "eval_loss": 1.0529950857162476, "eval_runtime": 113.2072, "eval_samples_per_second": 210.676, "eval_steps_per_second": 6.59, "step": 82500 }, { "epoch": 24.947400060114216, "grad_norm": 2.476900577545166, "learning_rate": 1.7000000000000003e-05, "loss": 0.7734, "step": 83000 }, { "epoch": 24.947400060114216, "eval_accuracy": 0.8198411078292097, "eval_loss": 1.0492240190505981, "eval_runtime": 114.2362, "eval_samples_per_second": 208.778, "eval_steps_per_second": 6.53, "step": 83000 }, { "epoch": 25.097685602645026, "grad_norm": 2.6050217151641846, "learning_rate": 1.65e-05, "loss": 0.7741, "step": 83500 }, { "epoch": 25.097685602645026, "eval_accuracy": 0.8199209388676126, "eval_loss": 1.0443811416625977, "eval_runtime": 115.0076, "eval_samples_per_second": 207.378, "eval_steps_per_second": 6.487, "step": 83500 }, { "epoch": 25.247971145175836, "grad_norm": 2.845093011856079, "learning_rate": 1.6000000000000003e-05, "loss": 0.768, "step": 84000 }, { "epoch": 25.247971145175836, "eval_accuracy": 0.8203943019525041, "eval_loss": 1.0556350946426392, "eval_runtime": 115.581, "eval_samples_per_second": 206.349, "eval_steps_per_second": 6.454, "step": 84000 }, { "epoch": 25.398256687706642, "grad_norm": 2.8715054988861084, "learning_rate": 1.55e-05, "loss": 0.7731, "step": 84500 }, { "epoch": 25.398256687706642, "eval_accuracy": 0.8204570484351956, "eval_loss": 1.0443503856658936, "eval_runtime": 114.2662, "eval_samples_per_second": 208.723, "eval_steps_per_second": 6.529, "step": 84500 }, { "epoch": 25.54854223023745, "grad_norm": 2.2550415992736816, "learning_rate": 1.5e-05, "loss": 0.7675, "step": 85000 }, { "epoch": 25.54854223023745, "eval_accuracy": 0.8200469636032075, "eval_loss": 1.0414886474609375, "eval_runtime": 114.9948, "eval_samples_per_second": 207.401, "eval_steps_per_second": 6.487, "step": 85000 }, { "epoch": 25.69882777276826, "grad_norm": 2.238607168197632, "learning_rate": 1.45e-05, "loss": 0.7675, "step": 85500 }, { "epoch": 25.69882777276826, "eval_accuracy": 0.8207652275282068, "eval_loss": 1.0444408655166626, "eval_runtime": 115.1961, "eval_samples_per_second": 207.038, "eval_steps_per_second": 6.476, "step": 85500 }, { "epoch": 25.849113315299068, "grad_norm": 2.242630958557129, "learning_rate": 1.4000000000000001e-05, "loss": 0.7655, "step": 86000 }, { "epoch": 25.849113315299068, "eval_accuracy": 0.8202594142538011, "eval_loss": 1.0470616817474365, "eval_runtime": 116.2229, "eval_samples_per_second": 205.209, "eval_steps_per_second": 6.419, "step": 86000 }, { "epoch": 25.999398857829878, "grad_norm": 2.5453295707702637, "learning_rate": 1.3500000000000001e-05, "loss": 0.7678, "step": 86500 }, { "epoch": 25.999398857829878, "eval_accuracy": 0.8200539868874701, "eval_loss": 1.0508933067321777, "eval_runtime": 114.7052, "eval_samples_per_second": 207.924, "eval_steps_per_second": 6.504, "step": 86500 }, { "epoch": 26.149684400360684, "grad_norm": 2.391352415084839, "learning_rate": 1.3000000000000001e-05, "loss": 0.7653, "step": 87000 }, { "epoch": 26.149684400360684, "eval_accuracy": 0.8205590231388606, "eval_loss": 1.0484099388122559, "eval_runtime": 115.0871, "eval_samples_per_second": 207.234, "eval_steps_per_second": 6.482, "step": 87000 }, { "epoch": 26.299969942891494, "grad_norm": 2.5071208477020264, "learning_rate": 1.25e-05, "loss": 0.7679, "step": 87500 }, { "epoch": 26.299969942891494, "eval_accuracy": 0.8217573625905127, "eval_loss": 1.0400173664093018, "eval_runtime": 113.1376, "eval_samples_per_second": 210.805, "eval_steps_per_second": 6.594, "step": 87500 }, { "epoch": 26.450255485422304, "grad_norm": 2.5453133583068848, "learning_rate": 1.2e-05, "loss": 0.7656, "step": 88000 }, { "epoch": 26.450255485422304, "eval_accuracy": 0.8213452575807276, "eval_loss": 1.0516611337661743, "eval_runtime": 113.3943, "eval_samples_per_second": 210.328, "eval_steps_per_second": 6.579, "step": 88000 }, { "epoch": 26.60054102795311, "grad_norm": 2.509098529815674, "learning_rate": 1.1500000000000002e-05, "loss": 0.7593, "step": 88500 }, { "epoch": 26.60054102795311, "eval_accuracy": 0.8213436048487595, "eval_loss": 1.0389631986618042, "eval_runtime": 116.4627, "eval_samples_per_second": 204.787, "eval_steps_per_second": 6.405, "step": 88500 }, { "epoch": 26.75082657048392, "grad_norm": 2.835665464401245, "learning_rate": 1.1000000000000001e-05, "loss": 0.7606, "step": 89000 }, { "epoch": 26.75082657048392, "eval_accuracy": 0.8212840552795833, "eval_loss": 1.0403192043304443, "eval_runtime": 114.9351, "eval_samples_per_second": 207.508, "eval_steps_per_second": 6.491, "step": 89000 }, { "epoch": 26.90111211301473, "grad_norm": 2.5455708503723145, "learning_rate": 1.05e-05, "loss": 0.7578, "step": 89500 }, { "epoch": 26.90111211301473, "eval_accuracy": 0.821311120726958, "eval_loss": 1.036601185798645, "eval_runtime": 114.3227, "eval_samples_per_second": 208.62, "eval_steps_per_second": 6.525, "step": 89500 }, { "epoch": 27.051397655545536, "grad_norm": 2.821378231048584, "learning_rate": 1e-05, "loss": 0.765, "step": 90000 }, { "epoch": 27.051397655545536, "eval_accuracy": 0.8221304808698725, "eval_loss": 1.0333795547485352, "eval_runtime": 115.4024, "eval_samples_per_second": 206.668, "eval_steps_per_second": 6.464, "step": 90000 }, { "epoch": 27.201683198076346, "grad_norm": 2.59919810295105, "learning_rate": 9.5e-06, "loss": 0.7589, "step": 90500 }, { "epoch": 27.201683198076346, "eval_accuracy": 0.8222628454875403, "eval_loss": 1.033319354057312, "eval_runtime": 115.8055, "eval_samples_per_second": 205.949, "eval_steps_per_second": 6.442, "step": 90500 }, { "epoch": 27.351968740607152, "grad_norm": 2.601203203201294, "learning_rate": 9e-06, "loss": 0.7563, "step": 91000 }, { "epoch": 27.351968740607152, "eval_accuracy": 0.8218045068983223, "eval_loss": 1.046338677406311, "eval_runtime": 116.2837, "eval_samples_per_second": 205.102, "eval_steps_per_second": 6.415, "step": 91000 }, { "epoch": 27.50225428313796, "grad_norm": 2.8450610637664795, "learning_rate": 8.500000000000002e-06, "loss": 0.7559, "step": 91500 }, { "epoch": 27.50225428313796, "eval_accuracy": 0.8218716317162797, "eval_loss": 1.044384241104126, "eval_runtime": 115.6484, "eval_samples_per_second": 206.229, "eval_steps_per_second": 6.451, "step": 91500 }, { "epoch": 27.65253982566877, "grad_norm": 2.7283740043640137, "learning_rate": 8.000000000000001e-06, "loss": 0.7586, "step": 92000 }, { "epoch": 27.65253982566877, "eval_accuracy": 0.8221257250457026, "eval_loss": 1.0296134948730469, "eval_runtime": 115.214, "eval_samples_per_second": 207.006, "eval_steps_per_second": 6.475, "step": 92000 }, { "epoch": 27.802825368199578, "grad_norm": 2.795022964477539, "learning_rate": 7.5e-06, "loss": 0.7585, "step": 92500 }, { "epoch": 27.802825368199578, "eval_accuracy": 0.82162242799983, "eval_loss": 1.045117735862732, "eval_runtime": 113.2778, "eval_samples_per_second": 210.544, "eval_steps_per_second": 6.586, "step": 92500 }, { "epoch": 27.953110910730388, "grad_norm": 2.52536678314209, "learning_rate": 7.000000000000001e-06, "loss": 0.7548, "step": 93000 }, { "epoch": 27.953110910730388, "eval_accuracy": 0.8225299354468154, "eval_loss": 1.0380265712738037, "eval_runtime": 113.5711, "eval_samples_per_second": 210.001, "eval_steps_per_second": 6.569, "step": 93000 }, { "epoch": 28.103396453261198, "grad_norm": 2.3631057739257812, "learning_rate": 6.5000000000000004e-06, "loss": 0.7542, "step": 93500 }, { "epoch": 28.103396453261198, "eval_accuracy": 0.8229969642015583, "eval_loss": 1.0490919351577759, "eval_runtime": 112.4838, "eval_samples_per_second": 212.031, "eval_steps_per_second": 6.632, "step": 93500 }, { "epoch": 28.253681995792004, "grad_norm": 2.2699172496795654, "learning_rate": 6e-06, "loss": 0.7522, "step": 94000 }, { "epoch": 28.253681995792004, "eval_accuracy": 0.8230864296098176, "eval_loss": 1.0274651050567627, "eval_runtime": 111.8716, "eval_samples_per_second": 213.191, "eval_steps_per_second": 6.668, "step": 94000 }, { "epoch": 28.403967538322814, "grad_norm": 2.44769287109375, "learning_rate": 5.500000000000001e-06, "loss": 0.7569, "step": 94500 }, { "epoch": 28.403967538322814, "eval_accuracy": 0.8234413637256175, "eval_loss": 1.0298686027526855, "eval_runtime": 113.5377, "eval_samples_per_second": 210.062, "eval_steps_per_second": 6.571, "step": 94500 }, { "epoch": 28.554253080853623, "grad_norm": 2.7103466987609863, "learning_rate": 5e-06, "loss": 0.7536, "step": 95000 }, { "epoch": 28.554253080853623, "eval_accuracy": 0.8224712064035926, "eval_loss": 1.034175992012024, "eval_runtime": 113.1535, "eval_samples_per_second": 210.776, "eval_steps_per_second": 6.593, "step": 95000 }, { "epoch": 28.70453862338443, "grad_norm": 2.594381809234619, "learning_rate": 4.5e-06, "loss": 0.7547, "step": 95500 }, { "epoch": 28.70453862338443, "eval_accuracy": 0.8231576514254546, "eval_loss": 1.0324558019638062, "eval_runtime": 111.2443, "eval_samples_per_second": 214.393, "eval_steps_per_second": 6.706, "step": 95500 }, { "epoch": 28.85482416591524, "grad_norm": 2.623845100402832, "learning_rate": 4.000000000000001e-06, "loss": 0.7529, "step": 96000 }, { "epoch": 28.85482416591524, "eval_accuracy": 0.8241724336625226, "eval_loss": 1.0312702655792236, "eval_runtime": 115.266, "eval_samples_per_second": 206.913, "eval_steps_per_second": 6.472, "step": 96000 }, { "epoch": 29.005109708446046, "grad_norm": 2.706256628036499, "learning_rate": 3.5000000000000004e-06, "loss": 0.7461, "step": 96500 }, { "epoch": 29.005109708446046, "eval_accuracy": 0.8230462104005479, "eval_loss": 1.0389125347137451, "eval_runtime": 115.814, "eval_samples_per_second": 205.934, "eval_steps_per_second": 6.441, "step": 96500 }, { "epoch": 29.155395250976856, "grad_norm": 2.697993278503418, "learning_rate": 3e-06, "loss": 0.7513, "step": 97000 }, { "epoch": 29.155395250976856, "eval_accuracy": 0.8236184494498082, "eval_loss": 1.036423921585083, "eval_runtime": 115.3283, "eval_samples_per_second": 206.801, "eval_steps_per_second": 6.468, "step": 97000 }, { "epoch": 29.305680793507666, "grad_norm": 2.68867826461792, "learning_rate": 2.5e-06, "loss": 0.7494, "step": 97500 }, { "epoch": 29.305680793507666, "eval_accuracy": 0.82393844898435, "eval_loss": 1.0242578983306885, "eval_runtime": 116.3698, "eval_samples_per_second": 204.95, "eval_steps_per_second": 6.411, "step": 97500 }, { "epoch": 29.455966336038472, "grad_norm": 2.259347915649414, "learning_rate": 2.0000000000000003e-06, "loss": 0.7438, "step": 98000 }, { "epoch": 29.455966336038472, "eval_accuracy": 0.8236859969210806, "eval_loss": 1.0227982997894287, "eval_runtime": 115.3714, "eval_samples_per_second": 206.724, "eval_steps_per_second": 6.466, "step": 98000 }, { "epoch": 29.60625187856928, "grad_norm": 2.8278329372406006, "learning_rate": 1.5e-06, "loss": 0.7499, "step": 98500 }, { "epoch": 29.60625187856928, "eval_accuracy": 0.8236034478162941, "eval_loss": 1.022267460823059, "eval_runtime": 114.5023, "eval_samples_per_second": 208.293, "eval_steps_per_second": 6.515, "step": 98500 } ], "logging_steps": 500, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 31, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.31746763541971e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }