{ "best_metric": 0.7950243949890137, "best_model_checkpoint": "./model_fine-tune/glot/xlm-r/kat-Geor/checkpoint-99500", "epoch": 32.34720416124838, "eval_steps": 500, "global_step": 99500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1625487646293888, "grad_norm": 4.931751251220703, "learning_rate": 9.95e-05, "loss": 1.3006, "step": 500 }, { "epoch": 0.1625487646293888, "eval_accuracy": 0.7623556146386671, "eval_loss": 1.3419054746627808, "eval_runtime": 224.5592, "eval_samples_per_second": 90.996, "eval_steps_per_second": 2.846, "step": 500 }, { "epoch": 0.3250975292587776, "grad_norm": 4.145026683807373, "learning_rate": 9.900000000000001e-05, "loss": 1.2154, "step": 1000 }, { "epoch": 0.3250975292587776, "eval_accuracy": 0.7721005611086652, "eval_loss": 1.2983847856521606, "eval_runtime": 225.9654, "eval_samples_per_second": 90.43, "eval_steps_per_second": 2.828, "step": 1000 }, { "epoch": 0.48764629388816644, "grad_norm": 4.589369297027588, "learning_rate": 9.850000000000001e-05, "loss": 1.1779, "step": 1500 }, { "epoch": 0.48764629388816644, "eval_accuracy": 0.7790585887983502, "eval_loss": 1.2703238725662231, "eval_runtime": 226.7078, "eval_samples_per_second": 90.134, "eval_steps_per_second": 2.819, "step": 1500 }, { "epoch": 0.6501950585175552, "grad_norm": 4.552799701690674, "learning_rate": 9.8e-05, "loss": 1.1546, "step": 2000 }, { "epoch": 0.6501950585175552, "eval_accuracy": 0.7819611016961427, "eval_loss": 1.2609333992004395, "eval_runtime": 226.665, "eval_samples_per_second": 90.151, "eval_steps_per_second": 2.819, "step": 2000 }, { "epoch": 0.812743823146944, "grad_norm": 3.709968328475952, "learning_rate": 9.75e-05, "loss": 1.1259, "step": 2500 }, { "epoch": 0.812743823146944, "eval_accuracy": 0.7877356734428504, "eval_loss": 1.2131903171539307, "eval_runtime": 226.8657, "eval_samples_per_second": 90.071, "eval_steps_per_second": 2.817, "step": 2500 }, { "epoch": 0.9752925877763329, "grad_norm": 3.6233718395233154, "learning_rate": 9.7e-05, "loss": 1.0938, "step": 3000 }, { "epoch": 0.9752925877763329, "eval_accuracy": 0.789409001949858, "eval_loss": 1.2194353342056274, "eval_runtime": 226.5843, "eval_samples_per_second": 90.183, "eval_steps_per_second": 2.82, "step": 3000 }, { "epoch": 1.1378413524057218, "grad_norm": 3.7706315517425537, "learning_rate": 9.65e-05, "loss": 1.0728, "step": 3500 }, { "epoch": 1.1378413524057218, "eval_accuracy": 0.7930909730559011, "eval_loss": 1.2004605531692505, "eval_runtime": 225.3386, "eval_samples_per_second": 90.681, "eval_steps_per_second": 2.836, "step": 3500 }, { "epoch": 1.3003901170351106, "grad_norm": 3.654762029647827, "learning_rate": 9.6e-05, "loss": 1.067, "step": 4000 }, { "epoch": 1.3003901170351106, "eval_accuracy": 0.7911667822625772, "eval_loss": 1.206479549407959, "eval_runtime": 229.9442, "eval_samples_per_second": 88.865, "eval_steps_per_second": 2.779, "step": 4000 }, { "epoch": 1.4629388816644995, "grad_norm": 3.6047844886779785, "learning_rate": 9.55e-05, "loss": 1.0623, "step": 4500 }, { "epoch": 1.4629388816644995, "eval_accuracy": 0.7950484617863337, "eval_loss": 1.186224341392517, "eval_runtime": 229.5346, "eval_samples_per_second": 89.024, "eval_steps_per_second": 2.784, "step": 4500 }, { "epoch": 1.6254876462938883, "grad_norm": 3.174414873123169, "learning_rate": 9.5e-05, "loss": 1.0452, "step": 5000 }, { "epoch": 1.6254876462938883, "eval_accuracy": 0.7986192006290228, "eval_loss": 1.1624418497085571, "eval_runtime": 227.8114, "eval_samples_per_second": 89.697, "eval_steps_per_second": 2.805, "step": 5000 }, { "epoch": 1.7880364109232771, "grad_norm": 3.455960988998413, "learning_rate": 9.449999999999999e-05, "loss": 1.0203, "step": 5500 }, { "epoch": 1.7880364109232771, "eval_accuracy": 0.7996825782405662, "eval_loss": 1.160246729850769, "eval_runtime": 227.8469, "eval_samples_per_second": 89.683, "eval_steps_per_second": 2.805, "step": 5500 }, { "epoch": 1.9505851755526658, "grad_norm": 3.4478368759155273, "learning_rate": 9.4e-05, "loss": 1.0102, "step": 6000 }, { "epoch": 1.9505851755526658, "eval_accuracy": 0.8028851426121806, "eval_loss": 1.1382070779800415, "eval_runtime": 225.8399, "eval_samples_per_second": 90.48, "eval_steps_per_second": 2.829, "step": 6000 }, { "epoch": 2.113133940182055, "grad_norm": 3.5076828002929688, "learning_rate": 9.350000000000001e-05, "loss": 1.0007, "step": 6500 }, { "epoch": 2.113133940182055, "eval_accuracy": 0.8036015201468353, "eval_loss": 1.1308486461639404, "eval_runtime": 227.3542, "eval_samples_per_second": 89.877, "eval_steps_per_second": 2.811, "step": 6500 }, { "epoch": 2.2756827048114436, "grad_norm": 2.954378366470337, "learning_rate": 9.300000000000001e-05, "loss": 0.9828, "step": 7000 }, { "epoch": 2.2756827048114436, "eval_accuracy": 0.80609597689472, "eval_loss": 1.1300016641616821, "eval_runtime": 226.3258, "eval_samples_per_second": 90.286, "eval_steps_per_second": 2.823, "step": 7000 }, { "epoch": 2.438231469440832, "grad_norm": 3.6220898628234863, "learning_rate": 9.250000000000001e-05, "loss": 0.9738, "step": 7500 }, { "epoch": 2.438231469440832, "eval_accuracy": 0.807549206838832, "eval_loss": 1.1173433065414429, "eval_runtime": 230.0766, "eval_samples_per_second": 88.814, "eval_steps_per_second": 2.777, "step": 7500 }, { "epoch": 2.6007802340702213, "grad_norm": 3.0354886054992676, "learning_rate": 9.200000000000001e-05, "loss": 0.9726, "step": 8000 }, { "epoch": 2.6007802340702213, "eval_accuracy": 0.8092077277721563, "eval_loss": 1.1192331314086914, "eval_runtime": 228.747, "eval_samples_per_second": 89.33, "eval_steps_per_second": 2.793, "step": 8000 }, { "epoch": 2.7633289986996097, "grad_norm": 2.905066967010498, "learning_rate": 9.15e-05, "loss": 0.9639, "step": 8500 }, { "epoch": 2.7633289986996097, "eval_accuracy": 0.8105359765383991, "eval_loss": 1.0964252948760986, "eval_runtime": 239.0977, "eval_samples_per_second": 85.463, "eval_steps_per_second": 2.673, "step": 8500 }, { "epoch": 2.925877763328999, "grad_norm": 2.5866756439208984, "learning_rate": 9.1e-05, "loss": 0.9562, "step": 9000 }, { "epoch": 2.925877763328999, "eval_accuracy": 0.8124049786000702, "eval_loss": 1.0979543924331665, "eval_runtime": 228.0298, "eval_samples_per_second": 89.611, "eval_steps_per_second": 2.802, "step": 9000 }, { "epoch": 3.0884265279583873, "grad_norm": 3.0912091732025146, "learning_rate": 9.05e-05, "loss": 0.9412, "step": 9500 }, { "epoch": 3.0884265279583873, "eval_accuracy": 0.8128664613154557, "eval_loss": 1.098233699798584, "eval_runtime": 226.3594, "eval_samples_per_second": 90.272, "eval_steps_per_second": 2.823, "step": 9500 }, { "epoch": 3.250975292587776, "grad_norm": 3.310755968093872, "learning_rate": 9e-05, "loss": 0.9404, "step": 10000 }, { "epoch": 3.250975292587776, "eval_accuracy": 0.8141734154155298, "eval_loss": 1.0834821462631226, "eval_runtime": 229.2889, "eval_samples_per_second": 89.119, "eval_steps_per_second": 2.787, "step": 10000 }, { "epoch": 3.413524057217165, "grad_norm": 3.2035183906555176, "learning_rate": 8.950000000000001e-05, "loss": 0.9345, "step": 10500 }, { "epoch": 3.413524057217165, "eval_accuracy": 0.8148430925920552, "eval_loss": 1.071053385734558, "eval_runtime": 228.6307, "eval_samples_per_second": 89.376, "eval_steps_per_second": 2.795, "step": 10500 }, { "epoch": 3.576072821846554, "grad_norm": 3.3350119590759277, "learning_rate": 8.900000000000001e-05, "loss": 0.9287, "step": 11000 }, { "epoch": 3.576072821846554, "eval_accuracy": 0.8148937367897325, "eval_loss": 1.0869466066360474, "eval_runtime": 229.0653, "eval_samples_per_second": 89.206, "eval_steps_per_second": 2.79, "step": 11000 }, { "epoch": 3.7386215864759427, "grad_norm": 3.0126075744628906, "learning_rate": 8.850000000000001e-05, "loss": 0.916, "step": 11500 }, { "epoch": 3.7386215864759427, "eval_accuracy": 0.8164226040931482, "eval_loss": 1.0795150995254517, "eval_runtime": 227.3062, "eval_samples_per_second": 89.896, "eval_steps_per_second": 2.811, "step": 11500 }, { "epoch": 3.9011703511053315, "grad_norm": 2.9025819301605225, "learning_rate": 8.800000000000001e-05, "loss": 0.9114, "step": 12000 }, { "epoch": 3.9011703511053315, "eval_accuracy": 0.8172531893844662, "eval_loss": 1.0739566087722778, "eval_runtime": 227.5695, "eval_samples_per_second": 89.792, "eval_steps_per_second": 2.808, "step": 12000 }, { "epoch": 4.06371911573472, "grad_norm": 3.1317036151885986, "learning_rate": 8.75e-05, "loss": 0.9104, "step": 12500 }, { "epoch": 4.06371911573472, "eval_accuracy": 0.8181712807844302, "eval_loss": 1.057742953300476, "eval_runtime": 226.8271, "eval_samples_per_second": 90.086, "eval_steps_per_second": 2.817, "step": 12500 }, { "epoch": 4.22626788036411, "grad_norm": 2.9879837036132812, "learning_rate": 8.7e-05, "loss": 0.8932, "step": 13000 }, { "epoch": 4.22626788036411, "eval_accuracy": 0.819528608544276, "eval_loss": 1.0524249076843262, "eval_runtime": 227.5673, "eval_samples_per_second": 89.793, "eval_steps_per_second": 2.808, "step": 13000 }, { "epoch": 4.388816644993498, "grad_norm": 3.1617486476898193, "learning_rate": 8.65e-05, "loss": 0.8884, "step": 13500 }, { "epoch": 4.388816644993498, "eval_accuracy": 0.8202382159332025, "eval_loss": 1.0618681907653809, "eval_runtime": 228.7279, "eval_samples_per_second": 89.338, "eval_steps_per_second": 2.794, "step": 13500 }, { "epoch": 4.551365409622887, "grad_norm": 3.287351131439209, "learning_rate": 8.6e-05, "loss": 0.8932, "step": 14000 }, { "epoch": 4.551365409622887, "eval_accuracy": 0.821493227612126, "eval_loss": 1.0429145097732544, "eval_runtime": 229.2177, "eval_samples_per_second": 89.147, "eval_steps_per_second": 2.788, "step": 14000 }, { "epoch": 4.713914174252276, "grad_norm": 3.1148979663848877, "learning_rate": 8.55e-05, "loss": 0.8871, "step": 14500 }, { "epoch": 4.713914174252276, "eval_accuracy": 0.821055764186474, "eval_loss": 1.0499247312545776, "eval_runtime": 227.9461, "eval_samples_per_second": 89.644, "eval_steps_per_second": 2.803, "step": 14500 }, { "epoch": 4.876462938881664, "grad_norm": 3.0099728107452393, "learning_rate": 8.5e-05, "loss": 0.8848, "step": 15000 }, { "epoch": 4.876462938881664, "eval_accuracy": 0.8221091492282949, "eval_loss": 1.0302263498306274, "eval_runtime": 229.0246, "eval_samples_per_second": 89.222, "eval_steps_per_second": 2.79, "step": 15000 }, { "epoch": 5.039011703511053, "grad_norm": 2.8249874114990234, "learning_rate": 8.450000000000001e-05, "loss": 0.8772, "step": 15500 }, { "epoch": 5.039011703511053, "eval_accuracy": 0.8225020975532324, "eval_loss": 1.0407395362854004, "eval_runtime": 230.4007, "eval_samples_per_second": 88.689, "eval_steps_per_second": 2.773, "step": 15500 }, { "epoch": 5.201560468140442, "grad_norm": 2.273552179336548, "learning_rate": 8.4e-05, "loss": 0.8715, "step": 16000 }, { "epoch": 5.201560468140442, "eval_accuracy": 0.8238052823384368, "eval_loss": 1.0325181484222412, "eval_runtime": 230.5347, "eval_samples_per_second": 88.637, "eval_steps_per_second": 2.772, "step": 16000 }, { "epoch": 5.364109232769831, "grad_norm": 2.884051561355591, "learning_rate": 8.35e-05, "loss": 0.8687, "step": 16500 }, { "epoch": 5.364109232769831, "eval_accuracy": 0.8243404153748926, "eval_loss": 1.0298871994018555, "eval_runtime": 231.4091, "eval_samples_per_second": 88.302, "eval_steps_per_second": 2.761, "step": 16500 }, { "epoch": 5.526657997399219, "grad_norm": 3.12709379196167, "learning_rate": 8.3e-05, "loss": 0.8674, "step": 17000 }, { "epoch": 5.526657997399219, "eval_accuracy": 0.8245251637171149, "eval_loss": 1.0233651399612427, "eval_runtime": 225.4766, "eval_samples_per_second": 90.626, "eval_steps_per_second": 2.834, "step": 17000 }, { "epoch": 5.689206762028609, "grad_norm": 2.7715749740600586, "learning_rate": 8.25e-05, "loss": 0.8597, "step": 17500 }, { "epoch": 5.689206762028609, "eval_accuracy": 0.8253348922363006, "eval_loss": 1.0265984535217285, "eval_runtime": 226.4158, "eval_samples_per_second": 90.25, "eval_steps_per_second": 2.822, "step": 17500 }, { "epoch": 5.851755526657997, "grad_norm": 2.630910873413086, "learning_rate": 8.2e-05, "loss": 0.8621, "step": 18000 }, { "epoch": 5.851755526657997, "eval_accuracy": 0.8264286938861036, "eval_loss": 1.025406002998352, "eval_runtime": 225.265, "eval_samples_per_second": 90.711, "eval_steps_per_second": 2.837, "step": 18000 }, { "epoch": 6.014304291287386, "grad_norm": 2.960498571395874, "learning_rate": 8.15e-05, "loss": 0.8505, "step": 18500 }, { "epoch": 6.014304291287386, "eval_accuracy": 0.8266802737227764, "eval_loss": 1.0281902551651, "eval_runtime": 225.5813, "eval_samples_per_second": 90.584, "eval_steps_per_second": 2.833, "step": 18500 }, { "epoch": 6.176853055916775, "grad_norm": 2.7250943183898926, "learning_rate": 8.1e-05, "loss": 0.8434, "step": 19000 }, { "epoch": 6.176853055916775, "eval_accuracy": 0.8273285977403005, "eval_loss": 1.019544005393982, "eval_runtime": 225.1972, "eval_samples_per_second": 90.738, "eval_steps_per_second": 2.838, "step": 19000 }, { "epoch": 6.339401820546164, "grad_norm": 2.45147967338562, "learning_rate": 8.05e-05, "loss": 0.8396, "step": 19500 }, { "epoch": 6.339401820546164, "eval_accuracy": 0.8278276606616609, "eval_loss": 1.016022801399231, "eval_runtime": 225.1071, "eval_samples_per_second": 90.775, "eval_steps_per_second": 2.839, "step": 19500 }, { "epoch": 6.501950585175552, "grad_norm": 3.0252089500427246, "learning_rate": 8e-05, "loss": 0.849, "step": 20000 }, { "epoch": 6.501950585175552, "eval_accuracy": 0.8286753807254127, "eval_loss": 0.9984493851661682, "eval_runtime": 225.6293, "eval_samples_per_second": 90.564, "eval_steps_per_second": 2.832, "step": 20000 }, { "epoch": 6.664499349804942, "grad_norm": 2.79874849319458, "learning_rate": 7.950000000000001e-05, "loss": 0.8344, "step": 20500 }, { "epoch": 6.664499349804942, "eval_accuracy": 0.8295993429023929, "eval_loss": 1.008083462715149, "eval_runtime": 226.4394, "eval_samples_per_second": 90.24, "eval_steps_per_second": 2.822, "step": 20500 }, { "epoch": 6.82704811443433, "grad_norm": 2.726980447769165, "learning_rate": 7.900000000000001e-05, "loss": 0.832, "step": 21000 }, { "epoch": 6.82704811443433, "eval_accuracy": 0.8289772346442087, "eval_loss": 1.0009534358978271, "eval_runtime": 225.2727, "eval_samples_per_second": 90.708, "eval_steps_per_second": 2.837, "step": 21000 }, { "epoch": 6.989596879063719, "grad_norm": 2.6177260875701904, "learning_rate": 7.850000000000001e-05, "loss": 0.8371, "step": 21500 }, { "epoch": 6.989596879063719, "eval_accuracy": 0.828972862081629, "eval_loss": 1.007731556892395, "eval_runtime": 225.1985, "eval_samples_per_second": 90.738, "eval_steps_per_second": 2.837, "step": 21500 }, { "epoch": 7.152145643693108, "grad_norm": 2.511810541152954, "learning_rate": 7.800000000000001e-05, "loss": 0.8221, "step": 22000 }, { "epoch": 7.152145643693108, "eval_accuracy": 0.8296968824079836, "eval_loss": 1.001420259475708, "eval_runtime": 226.3788, "eval_samples_per_second": 90.265, "eval_steps_per_second": 2.823, "step": 22000 }, { "epoch": 7.314694408322497, "grad_norm": 2.490196466445923, "learning_rate": 7.75e-05, "loss": 0.819, "step": 22500 }, { "epoch": 7.314694408322497, "eval_accuracy": 0.8315370661967273, "eval_loss": 1.0057367086410522, "eval_runtime": 226.9835, "eval_samples_per_second": 90.024, "eval_steps_per_second": 2.815, "step": 22500 }, { "epoch": 7.477243172951885, "grad_norm": 2.7740466594696045, "learning_rate": 7.7e-05, "loss": 0.8211, "step": 23000 }, { "epoch": 7.477243172951885, "eval_accuracy": 0.8325044106908798, "eval_loss": 0.9906172752380371, "eval_runtime": 226.1902, "eval_samples_per_second": 90.34, "eval_steps_per_second": 2.825, "step": 23000 }, { "epoch": 7.639791937581275, "grad_norm": 3.0789926052093506, "learning_rate": 7.65e-05, "loss": 0.818, "step": 23500 }, { "epoch": 7.639791937581275, "eval_accuracy": 0.832777941957673, "eval_loss": 0.9794951677322388, "eval_runtime": 226.5187, "eval_samples_per_second": 90.209, "eval_steps_per_second": 2.821, "step": 23500 }, { "epoch": 7.802340702210663, "grad_norm": 2.452514171600342, "learning_rate": 7.6e-05, "loss": 0.8128, "step": 24000 }, { "epoch": 7.802340702210663, "eval_accuracy": 0.8325327815535476, "eval_loss": 0.9824967384338379, "eval_runtime": 225.9154, "eval_samples_per_second": 90.45, "eval_steps_per_second": 2.828, "step": 24000 }, { "epoch": 7.964889466840052, "grad_norm": 2.486287832260132, "learning_rate": 7.55e-05, "loss": 0.8133, "step": 24500 }, { "epoch": 7.964889466840052, "eval_accuracy": 0.8329719850198115, "eval_loss": 0.9840763211250305, "eval_runtime": 225.7676, "eval_samples_per_second": 90.509, "eval_steps_per_second": 2.83, "step": 24500 }, { "epoch": 8.12743823146944, "grad_norm": 2.6238162517547607, "learning_rate": 7.500000000000001e-05, "loss": 0.7972, "step": 25000 }, { "epoch": 8.12743823146944, "eval_accuracy": 0.8329068936503207, "eval_loss": 0.9786662459373474, "eval_runtime": 227.0649, "eval_samples_per_second": 89.992, "eval_steps_per_second": 2.814, "step": 25000 }, { "epoch": 8.289986996098829, "grad_norm": 2.622385025024414, "learning_rate": 7.450000000000001e-05, "loss": 0.8012, "step": 25500 }, { "epoch": 8.289986996098829, "eval_accuracy": 0.83353766380846, "eval_loss": 0.9908860325813293, "eval_runtime": 225.4511, "eval_samples_per_second": 90.636, "eval_steps_per_second": 2.834, "step": 25500 }, { "epoch": 8.45253576072822, "grad_norm": 2.3659889698028564, "learning_rate": 7.4e-05, "loss": 0.8002, "step": 26000 }, { "epoch": 8.45253576072822, "eval_accuracy": 0.834269520268432, "eval_loss": 0.9886682629585266, "eval_runtime": 226.2835, "eval_samples_per_second": 90.303, "eval_steps_per_second": 2.824, "step": 26000 }, { "epoch": 8.615084525357608, "grad_norm": 2.6340997219085693, "learning_rate": 7.35e-05, "loss": 0.796, "step": 26500 }, { "epoch": 8.615084525357608, "eval_accuracy": 0.83431812146113, "eval_loss": 1.1027251482009888, "eval_runtime": 224.7718, "eval_samples_per_second": 90.91, "eval_steps_per_second": 2.843, "step": 26500 }, { "epoch": 8.777633289986996, "grad_norm": 2.351562976837158, "learning_rate": 7.3e-05, "loss": 0.8032, "step": 27000 }, { "epoch": 8.777633289986996, "eval_accuracy": 0.83473517326524, "eval_loss": 0.9701533317565918, "eval_runtime": 224.7228, "eval_samples_per_second": 90.93, "eval_steps_per_second": 2.844, "step": 27000 }, { "epoch": 8.940182054616384, "grad_norm": 3.179107189178467, "learning_rate": 7.25e-05, "loss": 0.7956, "step": 27500 }, { "epoch": 8.940182054616384, "eval_accuracy": 0.8351845716261748, "eval_loss": 0.978928804397583, "eval_runtime": 226.3807, "eval_samples_per_second": 90.264, "eval_steps_per_second": 2.823, "step": 27500 }, { "epoch": 9.102730819245775, "grad_norm": 2.7947020530700684, "learning_rate": 7.2e-05, "loss": 0.7897, "step": 28000 }, { "epoch": 9.102730819245775, "eval_accuracy": 0.8353527515006416, "eval_loss": 0.9703938961029053, "eval_runtime": 227.4785, "eval_samples_per_second": 89.828, "eval_steps_per_second": 2.809, "step": 28000 }, { "epoch": 9.265279583875163, "grad_norm": 2.4941184520721436, "learning_rate": 7.15e-05, "loss": 0.7822, "step": 28500 }, { "epoch": 9.265279583875163, "eval_accuracy": 0.8354081248952335, "eval_loss": 0.985792875289917, "eval_runtime": 225.177, "eval_samples_per_second": 90.746, "eval_steps_per_second": 2.838, "step": 28500 }, { "epoch": 9.427828348504551, "grad_norm": 2.623500108718872, "learning_rate": 7.1e-05, "loss": 0.7885, "step": 29000 }, { "epoch": 9.427828348504551, "eval_accuracy": 0.8367873538892325, "eval_loss": 0.9710731506347656, "eval_runtime": 224.2999, "eval_samples_per_second": 91.101, "eval_steps_per_second": 2.849, "step": 29000 }, { "epoch": 9.59037711313394, "grad_norm": 2.687804698944092, "learning_rate": 7.05e-05, "loss": 0.7853, "step": 29500 }, { "epoch": 9.59037711313394, "eval_accuracy": 0.8370918540482961, "eval_loss": 0.9488577246665955, "eval_runtime": 225.5849, "eval_samples_per_second": 90.582, "eval_steps_per_second": 2.833, "step": 29500 }, { "epoch": 9.752925877763328, "grad_norm": 2.415875196456909, "learning_rate": 7e-05, "loss": 0.7834, "step": 30000 }, { "epoch": 9.752925877763328, "eval_accuracy": 0.8376393865857097, "eval_loss": 0.9489056468009949, "eval_runtime": 224.5743, "eval_samples_per_second": 90.99, "eval_steps_per_second": 2.845, "step": 30000 }, { "epoch": 9.915474642392718, "grad_norm": 2.635620355606079, "learning_rate": 6.95e-05, "loss": 0.7795, "step": 30500 }, { "epoch": 9.915474642392718, "eval_accuracy": 0.8379052346488646, "eval_loss": 0.9600683450698853, "eval_runtime": 227.5569, "eval_samples_per_second": 89.797, "eval_steps_per_second": 2.808, "step": 30500 }, { "epoch": 10.078023407022107, "grad_norm": 2.2729740142822266, "learning_rate": 6.9e-05, "loss": 0.7784, "step": 31000 }, { "epoch": 10.078023407022107, "eval_accuracy": 0.8387711597381718, "eval_loss": 0.9676991701126099, "eval_runtime": 227.0441, "eval_samples_per_second": 90.0, "eval_steps_per_second": 2.814, "step": 31000 }, { "epoch": 10.240572171651495, "grad_norm": 2.665194511413574, "learning_rate": 6.850000000000001e-05, "loss": 0.7731, "step": 31500 }, { "epoch": 10.240572171651495, "eval_accuracy": 0.8384039380286287, "eval_loss": 0.9660561084747314, "eval_runtime": 225.7122, "eval_samples_per_second": 90.531, "eval_steps_per_second": 2.831, "step": 31500 }, { "epoch": 10.403120936280883, "grad_norm": 2.378326892852783, "learning_rate": 6.800000000000001e-05, "loss": 0.7732, "step": 32000 }, { "epoch": 10.403120936280883, "eval_accuracy": 0.8384703620413777, "eval_loss": 0.9522095322608948, "eval_runtime": 226.2237, "eval_samples_per_second": 90.327, "eval_steps_per_second": 2.825, "step": 32000 }, { "epoch": 10.565669700910274, "grad_norm": 2.3873043060302734, "learning_rate": 6.750000000000001e-05, "loss": 0.7675, "step": 32500 }, { "epoch": 10.565669700910274, "eval_accuracy": 0.8387303829039866, "eval_loss": 0.9506338238716125, "eval_runtime": 225.3705, "eval_samples_per_second": 90.668, "eval_steps_per_second": 2.835, "step": 32500 }, { "epoch": 10.728218465539662, "grad_norm": 2.908627986907959, "learning_rate": 6.7e-05, "loss": 0.7632, "step": 33000 }, { "epoch": 10.728218465539662, "eval_accuracy": 0.8393342612931991, "eval_loss": 0.9406708478927612, "eval_runtime": 226.5163, "eval_samples_per_second": 90.21, "eval_steps_per_second": 2.821, "step": 33000 }, { "epoch": 10.89076723016905, "grad_norm": 2.353163242340088, "learning_rate": 6.65e-05, "loss": 0.7652, "step": 33500 }, { "epoch": 10.89076723016905, "eval_accuracy": 0.8397924279467366, "eval_loss": 0.9450008869171143, "eval_runtime": 228.8699, "eval_samples_per_second": 89.282, "eval_steps_per_second": 2.792, "step": 33500 }, { "epoch": 11.053315994798439, "grad_norm": 2.5943007469177246, "learning_rate": 6.6e-05, "loss": 0.7569, "step": 34000 }, { "epoch": 11.053315994798439, "eval_accuracy": 0.8403287583785509, "eval_loss": 0.9399783611297607, "eval_runtime": 226.1121, "eval_samples_per_second": 90.371, "eval_steps_per_second": 2.826, "step": 34000 }, { "epoch": 11.215864759427829, "grad_norm": 2.693864107131958, "learning_rate": 6.55e-05, "loss": 0.7594, "step": 34500 }, { "epoch": 11.215864759427829, "eval_accuracy": 0.8401560936743129, "eval_loss": 0.9480236172676086, "eval_runtime": 225.4517, "eval_samples_per_second": 90.636, "eval_steps_per_second": 2.834, "step": 34500 }, { "epoch": 11.378413524057217, "grad_norm": 2.644212484359741, "learning_rate": 6.500000000000001e-05, "loss": 0.7549, "step": 35000 }, { "epoch": 11.378413524057217, "eval_accuracy": 0.8397807491674708, "eval_loss": 0.947754442691803, "eval_runtime": 224.7666, "eval_samples_per_second": 90.912, "eval_steps_per_second": 2.843, "step": 35000 }, { "epoch": 11.540962288686606, "grad_norm": 2.548055410385132, "learning_rate": 6.450000000000001e-05, "loss": 0.7508, "step": 35500 }, { "epoch": 11.540962288686606, "eval_accuracy": 0.8408923535971057, "eval_loss": 0.9363004565238953, "eval_runtime": 226.2948, "eval_samples_per_second": 90.298, "eval_steps_per_second": 2.824, "step": 35500 }, { "epoch": 11.703511053315994, "grad_norm": 2.2983460426330566, "learning_rate": 6.400000000000001e-05, "loss": 0.7498, "step": 36000 }, { "epoch": 11.703511053315994, "eval_accuracy": 0.8415923695179955, "eval_loss": 0.9559971690177917, "eval_runtime": 225.5228, "eval_samples_per_second": 90.607, "eval_steps_per_second": 2.833, "step": 36000 }, { "epoch": 11.866059817945384, "grad_norm": 2.585132122039795, "learning_rate": 6.35e-05, "loss": 0.7529, "step": 36500 }, { "epoch": 11.866059817945384, "eval_accuracy": 0.8414500365516611, "eval_loss": 0.9397149085998535, "eval_runtime": 225.5184, "eval_samples_per_second": 90.609, "eval_steps_per_second": 2.833, "step": 36500 }, { "epoch": 12.028608582574773, "grad_norm": 2.7008893489837646, "learning_rate": 6.3e-05, "loss": 0.7499, "step": 37000 }, { "epoch": 12.028608582574773, "eval_accuracy": 0.8429048722887895, "eval_loss": 0.9322752952575684, "eval_runtime": 226.0814, "eval_samples_per_second": 90.383, "eval_steps_per_second": 2.826, "step": 37000 }, { "epoch": 12.191157347204161, "grad_norm": 2.234344959259033, "learning_rate": 6.25e-05, "loss": 0.7516, "step": 37500 }, { "epoch": 12.191157347204161, "eval_accuracy": 0.8423553938504648, "eval_loss": 0.9274143576622009, "eval_runtime": 224.6544, "eval_samples_per_second": 90.957, "eval_steps_per_second": 2.844, "step": 37500 }, { "epoch": 12.35370611183355, "grad_norm": 2.38130259513855, "learning_rate": 6.2e-05, "loss": 0.742, "step": 38000 }, { "epoch": 12.35370611183355, "eval_accuracy": 0.8434239606449652, "eval_loss": 0.9298020005226135, "eval_runtime": 224.8881, "eval_samples_per_second": 90.863, "eval_steps_per_second": 2.841, "step": 38000 }, { "epoch": 12.51625487646294, "grad_norm": 2.4753472805023193, "learning_rate": 6.15e-05, "loss": 0.7349, "step": 38500 }, { "epoch": 12.51625487646294, "eval_accuracy": 0.8435835943839733, "eval_loss": 0.9265833497047424, "eval_runtime": 225.4856, "eval_samples_per_second": 90.622, "eval_steps_per_second": 2.834, "step": 38500 }, { "epoch": 12.678803641092328, "grad_norm": 2.6576738357543945, "learning_rate": 6.1e-05, "loss": 0.7408, "step": 39000 }, { "epoch": 12.678803641092328, "eval_accuracy": 0.8433502485604528, "eval_loss": 0.9311428070068359, "eval_runtime": 226.2881, "eval_samples_per_second": 90.301, "eval_steps_per_second": 2.824, "step": 39000 }, { "epoch": 12.841352405721716, "grad_norm": 2.318998336791992, "learning_rate": 6.05e-05, "loss": 0.742, "step": 39500 }, { "epoch": 12.841352405721716, "eval_accuracy": 0.8439347116351804, "eval_loss": 0.9278241991996765, "eval_runtime": 226.17, "eval_samples_per_second": 90.348, "eval_steps_per_second": 2.825, "step": 39500 }, { "epoch": 13.003901170351105, "grad_norm": 2.5888776779174805, "learning_rate": 6e-05, "loss": 0.735, "step": 40000 }, { "epoch": 13.003901170351105, "eval_accuracy": 0.8435309371055906, "eval_loss": 0.9364249110221863, "eval_runtime": 226.5769, "eval_samples_per_second": 90.186, "eval_steps_per_second": 2.82, "step": 40000 }, { "epoch": 13.166449934980495, "grad_norm": 2.5382964611053467, "learning_rate": 5.95e-05, "loss": 0.7317, "step": 40500 }, { "epoch": 13.166449934980495, "eval_accuracy": 0.8435685143232774, "eval_loss": 0.9289753437042236, "eval_runtime": 226.0444, "eval_samples_per_second": 90.398, "eval_steps_per_second": 2.827, "step": 40500 }, { "epoch": 13.328998699609883, "grad_norm": 2.3561949729919434, "learning_rate": 5.9e-05, "loss": 0.7287, "step": 41000 }, { "epoch": 13.328998699609883, "eval_accuracy": 0.8441468378237561, "eval_loss": 0.9176314473152161, "eval_runtime": 225.4845, "eval_samples_per_second": 90.623, "eval_steps_per_second": 2.834, "step": 41000 }, { "epoch": 13.491547464239272, "grad_norm": 2.4950144290924072, "learning_rate": 5.85e-05, "loss": 0.7236, "step": 41500 }, { "epoch": 13.491547464239272, "eval_accuracy": 0.8440490876509515, "eval_loss": 0.9126904010772705, "eval_runtime": 225.1525, "eval_samples_per_second": 90.756, "eval_steps_per_second": 2.838, "step": 41500 }, { "epoch": 13.65409622886866, "grad_norm": 2.5094358921051025, "learning_rate": 5.8e-05, "loss": 0.7271, "step": 42000 }, { "epoch": 13.65409622886866, "eval_accuracy": 0.844420508518643, "eval_loss": 0.9222070574760437, "eval_runtime": 226.8881, "eval_samples_per_second": 90.062, "eval_steps_per_second": 2.816, "step": 42000 }, { "epoch": 13.81664499349805, "grad_norm": 2.5358660221099854, "learning_rate": 5.7499999999999995e-05, "loss": 0.7247, "step": 42500 }, { "epoch": 13.81664499349805, "eval_accuracy": 0.8447406385701589, "eval_loss": 0.911141037940979, "eval_runtime": 226.4876, "eval_samples_per_second": 90.221, "eval_steps_per_second": 2.821, "step": 42500 }, { "epoch": 13.979193758127439, "grad_norm": 2.449117422103882, "learning_rate": 5.6999999999999996e-05, "loss": 0.724, "step": 43000 }, { "epoch": 13.979193758127439, "eval_accuracy": 0.8455180692523965, "eval_loss": 0.9233959317207336, "eval_runtime": 226.6454, "eval_samples_per_second": 90.158, "eval_steps_per_second": 2.819, "step": 43000 }, { "epoch": 14.141742522756827, "grad_norm": 2.9299075603485107, "learning_rate": 5.65e-05, "loss": 0.717, "step": 43500 }, { "epoch": 14.141742522756827, "eval_accuracy": 0.8457589583161474, "eval_loss": 0.9212149381637573, "eval_runtime": 226.3542, "eval_samples_per_second": 90.274, "eval_steps_per_second": 2.823, "step": 43500 }, { "epoch": 14.304291287386215, "grad_norm": 2.705927610397339, "learning_rate": 5.6000000000000006e-05, "loss": 0.7209, "step": 44000 }, { "epoch": 14.304291287386215, "eval_accuracy": 0.8461540554265815, "eval_loss": 0.9096006155014038, "eval_runtime": 225.3071, "eval_samples_per_second": 90.694, "eval_steps_per_second": 2.836, "step": 44000 }, { "epoch": 14.466840052015606, "grad_norm": 2.0237460136413574, "learning_rate": 5.550000000000001e-05, "loss": 0.7143, "step": 44500 }, { "epoch": 14.466840052015606, "eval_accuracy": 0.8467122032637585, "eval_loss": 0.9058763980865479, "eval_runtime": 224.7066, "eval_samples_per_second": 90.936, "eval_steps_per_second": 2.844, "step": 44500 }, { "epoch": 14.629388816644994, "grad_norm": 2.6412603855133057, "learning_rate": 5.500000000000001e-05, "loss": 0.7097, "step": 45000 }, { "epoch": 14.629388816644994, "eval_accuracy": 0.8469896352526538, "eval_loss": 0.9109433889389038, "eval_runtime": 226.4374, "eval_samples_per_second": 90.241, "eval_steps_per_second": 2.822, "step": 45000 }, { "epoch": 14.791937581274382, "grad_norm": 2.0750892162323, "learning_rate": 5.45e-05, "loss": 0.7167, "step": 45500 }, { "epoch": 14.791937581274382, "eval_accuracy": 0.8459233510762237, "eval_loss": 0.9634975790977478, "eval_runtime": 224.623, "eval_samples_per_second": 90.97, "eval_steps_per_second": 2.845, "step": 45500 }, { "epoch": 14.95448634590377, "grad_norm": 3.2299835681915283, "learning_rate": 5.4000000000000005e-05, "loss": 0.7175, "step": 46000 }, { "epoch": 14.95448634590377, "eval_accuracy": 0.8472973119877086, "eval_loss": 0.9039083123207092, "eval_runtime": 226.4661, "eval_samples_per_second": 90.23, "eval_steps_per_second": 2.822, "step": 46000 }, { "epoch": 15.117035110533159, "grad_norm": 2.562220573425293, "learning_rate": 5.3500000000000006e-05, "loss": 0.7032, "step": 46500 }, { "epoch": 15.117035110533159, "eval_accuracy": 0.8473520786144324, "eval_loss": 0.911507785320282, "eval_runtime": 224.8892, "eval_samples_per_second": 90.863, "eval_steps_per_second": 2.841, "step": 46500 }, { "epoch": 15.27958387516255, "grad_norm": 2.8594844341278076, "learning_rate": 5.300000000000001e-05, "loss": 0.7082, "step": 47000 }, { "epoch": 15.27958387516255, "eval_accuracy": 0.8476350142048673, "eval_loss": 0.909827470779419, "eval_runtime": 225.7006, "eval_samples_per_second": 90.536, "eval_steps_per_second": 2.831, "step": 47000 }, { "epoch": 15.442132639791938, "grad_norm": 2.468425989151001, "learning_rate": 5.25e-05, "loss": 0.7041, "step": 47500 }, { "epoch": 15.442132639791938, "eval_accuracy": 0.8478791054352419, "eval_loss": 0.9049926996231079, "eval_runtime": 224.8771, "eval_samples_per_second": 90.867, "eval_steps_per_second": 2.842, "step": 47500 }, { "epoch": 15.604681404421326, "grad_norm": 2.389726400375366, "learning_rate": 5.2000000000000004e-05, "loss": 0.7031, "step": 48000 }, { "epoch": 15.604681404421326, "eval_accuracy": 0.8481061886894397, "eval_loss": 0.890941858291626, "eval_runtime": 226.989, "eval_samples_per_second": 90.022, "eval_steps_per_second": 2.815, "step": 48000 }, { "epoch": 15.767230169050714, "grad_norm": 2.2124507427215576, "learning_rate": 5.1500000000000005e-05, "loss": 0.6986, "step": 48500 }, { "epoch": 15.767230169050714, "eval_accuracy": 0.8491361761433549, "eval_loss": 0.9069699048995972, "eval_runtime": 224.9486, "eval_samples_per_second": 90.839, "eval_steps_per_second": 2.841, "step": 48500 }, { "epoch": 15.929778933680105, "grad_norm": 2.2906134128570557, "learning_rate": 5.1000000000000006e-05, "loss": 0.707, "step": 49000 }, { "epoch": 15.929778933680105, "eval_accuracy": 0.8479968642310016, "eval_loss": 0.9036478996276855, "eval_runtime": 225.9102, "eval_samples_per_second": 90.452, "eval_steps_per_second": 2.829, "step": 49000 }, { "epoch": 16.092327698309493, "grad_norm": 2.6677193641662598, "learning_rate": 5.05e-05, "loss": 0.6948, "step": 49500 }, { "epoch": 16.092327698309493, "eval_accuracy": 0.8493272111938716, "eval_loss": 0.9056188464164734, "eval_runtime": 225.0003, "eval_samples_per_second": 90.818, "eval_steps_per_second": 2.84, "step": 49500 }, { "epoch": 16.25487646293888, "grad_norm": 2.4262144565582275, "learning_rate": 5e-05, "loss": 0.6967, "step": 50000 }, { "epoch": 16.25487646293888, "eval_accuracy": 0.8485872336104957, "eval_loss": 0.896799623966217, "eval_runtime": 224.1595, "eval_samples_per_second": 91.158, "eval_steps_per_second": 2.851, "step": 50000 }, { "epoch": 16.41742522756827, "grad_norm": 2.316122055053711, "learning_rate": 4.9500000000000004e-05, "loss": 0.6977, "step": 50500 }, { "epoch": 16.41742522756827, "eval_accuracy": 0.8495556837644054, "eval_loss": 0.9065157175064087, "eval_runtime": 225.8926, "eval_samples_per_second": 90.459, "eval_steps_per_second": 2.829, "step": 50500 }, { "epoch": 16.579973992197658, "grad_norm": 2.6691653728485107, "learning_rate": 4.9e-05, "loss": 0.6961, "step": 51000 }, { "epoch": 16.579973992197658, "eval_accuracy": 0.8500652800696321, "eval_loss": 0.896096408367157, "eval_runtime": 232.2659, "eval_samples_per_second": 87.977, "eval_steps_per_second": 2.751, "step": 51000 }, { "epoch": 16.742522756827046, "grad_norm": 2.475905656814575, "learning_rate": 4.85e-05, "loss": 0.6916, "step": 51500 }, { "epoch": 16.742522756827046, "eval_accuracy": 0.8497042599251818, "eval_loss": 0.8976150155067444, "eval_runtime": 226.0766, "eval_samples_per_second": 90.385, "eval_steps_per_second": 2.826, "step": 51500 }, { "epoch": 16.90507152145644, "grad_norm": 2.786370277404785, "learning_rate": 4.8e-05, "loss": 0.6914, "step": 52000 }, { "epoch": 16.90507152145644, "eval_accuracy": 0.8496545695743757, "eval_loss": 0.8938505053520203, "eval_runtime": 224.6085, "eval_samples_per_second": 90.976, "eval_steps_per_second": 2.845, "step": 52000 }, { "epoch": 17.067620286085827, "grad_norm": 2.8185195922851562, "learning_rate": 4.75e-05, "loss": 0.6843, "step": 52500 }, { "epoch": 17.067620286085827, "eval_accuracy": 0.8506024205488569, "eval_loss": 0.902123749256134, "eval_runtime": 225.1853, "eval_samples_per_second": 90.743, "eval_steps_per_second": 2.838, "step": 52500 }, { "epoch": 17.230169050715215, "grad_norm": 2.287614107131958, "learning_rate": 4.7e-05, "loss": 0.6886, "step": 53000 }, { "epoch": 17.230169050715215, "eval_accuracy": 0.8509516099496137, "eval_loss": 0.8869491815567017, "eval_runtime": 225.8223, "eval_samples_per_second": 90.487, "eval_steps_per_second": 2.83, "step": 53000 }, { "epoch": 17.392717815344604, "grad_norm": 2.545166492462158, "learning_rate": 4.6500000000000005e-05, "loss": 0.6895, "step": 53500 }, { "epoch": 17.392717815344604, "eval_accuracy": 0.8509741315809264, "eval_loss": 0.88730388879776, "eval_runtime": 225.2327, "eval_samples_per_second": 90.724, "eval_steps_per_second": 2.837, "step": 53500 }, { "epoch": 17.555266579973992, "grad_norm": 2.133690357208252, "learning_rate": 4.600000000000001e-05, "loss": 0.684, "step": 54000 }, { "epoch": 17.555266579973992, "eval_accuracy": 0.8506065339944305, "eval_loss": 0.8796634674072266, "eval_runtime": 225.9145, "eval_samples_per_second": 90.45, "eval_steps_per_second": 2.829, "step": 54000 }, { "epoch": 17.71781534460338, "grad_norm": 2.368135690689087, "learning_rate": 4.55e-05, "loss": 0.6814, "step": 54500 }, { "epoch": 17.71781534460338, "eval_accuracy": 0.8518810256708096, "eval_loss": 0.8883038759231567, "eval_runtime": 226.0516, "eval_samples_per_second": 90.395, "eval_steps_per_second": 2.827, "step": 54500 }, { "epoch": 17.88036410923277, "grad_norm": 2.93131685256958, "learning_rate": 4.5e-05, "loss": 0.6814, "step": 55000 }, { "epoch": 17.88036410923277, "eval_accuracy": 0.8515069138621577, "eval_loss": 0.8804932832717896, "eval_runtime": 224.3083, "eval_samples_per_second": 91.098, "eval_steps_per_second": 2.849, "step": 55000 }, { "epoch": 18.042912873862157, "grad_norm": 2.6115188598632812, "learning_rate": 4.4500000000000004e-05, "loss": 0.6825, "step": 55500 }, { "epoch": 18.042912873862157, "eval_accuracy": 0.8524721846400168, "eval_loss": 0.8792157173156738, "eval_runtime": 225.8719, "eval_samples_per_second": 90.467, "eval_steps_per_second": 2.829, "step": 55500 }, { "epoch": 18.20546163849155, "grad_norm": 3.650696039199829, "learning_rate": 4.4000000000000006e-05, "loss": 0.6754, "step": 56000 }, { "epoch": 18.20546163849155, "eval_accuracy": 0.8515890870808295, "eval_loss": 0.8762602210044861, "eval_runtime": 225.4458, "eval_samples_per_second": 90.638, "eval_steps_per_second": 2.834, "step": 56000 }, { "epoch": 18.368010403120937, "grad_norm": 2.48811936378479, "learning_rate": 4.35e-05, "loss": 0.6694, "step": 56500 }, { "epoch": 18.368010403120937, "eval_accuracy": 0.852083188035357, "eval_loss": 0.8903321623802185, "eval_runtime": 231.1517, "eval_samples_per_second": 88.401, "eval_steps_per_second": 2.764, "step": 56500 }, { "epoch": 18.530559167750326, "grad_norm": 2.664206027984619, "learning_rate": 4.3e-05, "loss": 0.6711, "step": 57000 }, { "epoch": 18.530559167750326, "eval_accuracy": 0.8524635038150364, "eval_loss": 0.8883496522903442, "eval_runtime": 224.7773, "eval_samples_per_second": 90.908, "eval_steps_per_second": 2.843, "step": 57000 }, { "epoch": 18.693107932379714, "grad_norm": 2.5020461082458496, "learning_rate": 4.25e-05, "loss": 0.6701, "step": 57500 }, { "epoch": 18.693107932379714, "eval_accuracy": 0.8527774358008777, "eval_loss": 0.8737440705299377, "eval_runtime": 224.3031, "eval_samples_per_second": 91.1, "eval_steps_per_second": 2.849, "step": 57500 }, { "epoch": 18.855656697009103, "grad_norm": 2.341423273086548, "learning_rate": 4.2e-05, "loss": 0.6722, "step": 58000 }, { "epoch": 18.855656697009103, "eval_accuracy": 0.8529963243635703, "eval_loss": 0.8735175132751465, "eval_runtime": 225.4167, "eval_samples_per_second": 90.65, "eval_steps_per_second": 2.835, "step": 58000 }, { "epoch": 19.01820546163849, "grad_norm": 2.4827890396118164, "learning_rate": 4.15e-05, "loss": 0.6723, "step": 58500 }, { "epoch": 19.01820546163849, "eval_accuracy": 0.853864474597493, "eval_loss": 0.8714447021484375, "eval_runtime": 225.7934, "eval_samples_per_second": 90.499, "eval_steps_per_second": 2.83, "step": 58500 }, { "epoch": 19.18075422626788, "grad_norm": 2.6257293224334717, "learning_rate": 4.1e-05, "loss": 0.6623, "step": 59000 }, { "epoch": 19.18075422626788, "eval_accuracy": 0.8535918441752076, "eval_loss": 0.8707379698753357, "eval_runtime": 233.0012, "eval_samples_per_second": 87.699, "eval_steps_per_second": 2.742, "step": 59000 }, { "epoch": 19.343302990897268, "grad_norm": 2.2881245613098145, "learning_rate": 4.05e-05, "loss": 0.6629, "step": 59500 }, { "epoch": 19.343302990897268, "eval_accuracy": 0.8536284370228246, "eval_loss": 0.8575323820114136, "eval_runtime": 225.6251, "eval_samples_per_second": 90.566, "eval_steps_per_second": 2.832, "step": 59500 }, { "epoch": 19.505851755526656, "grad_norm": 2.485198497772217, "learning_rate": 4e-05, "loss": 0.6629, "step": 60000 }, { "epoch": 19.505851755526656, "eval_accuracy": 0.8541063694677179, "eval_loss": 0.8660362958908081, "eval_runtime": 229.1829, "eval_samples_per_second": 89.16, "eval_steps_per_second": 2.788, "step": 60000 }, { "epoch": 19.668400520156048, "grad_norm": 2.365445375442505, "learning_rate": 3.9500000000000005e-05, "loss": 0.6636, "step": 60500 }, { "epoch": 19.668400520156048, "eval_accuracy": 0.8544101305781451, "eval_loss": 0.8751095533370972, "eval_runtime": 226.4875, "eval_samples_per_second": 90.221, "eval_steps_per_second": 2.821, "step": 60500 }, { "epoch": 19.830949284785437, "grad_norm": 2.7341248989105225, "learning_rate": 3.9000000000000006e-05, "loss": 0.6668, "step": 61000 }, { "epoch": 19.830949284785437, "eval_accuracy": 0.8538102982406334, "eval_loss": 0.8749545812606812, "eval_runtime": 225.3871, "eval_samples_per_second": 90.662, "eval_steps_per_second": 2.835, "step": 61000 }, { "epoch": 19.993498049414825, "grad_norm": 2.200584888458252, "learning_rate": 3.85e-05, "loss": 0.6658, "step": 61500 }, { "epoch": 19.993498049414825, "eval_accuracy": 0.8542889960446646, "eval_loss": 0.8671930432319641, "eval_runtime": 225.4174, "eval_samples_per_second": 90.65, "eval_steps_per_second": 2.835, "step": 61500 }, { "epoch": 20.156046814044213, "grad_norm": 2.7894246578216553, "learning_rate": 3.8e-05, "loss": 0.6616, "step": 62000 }, { "epoch": 20.156046814044213, "eval_accuracy": 0.8548723750905142, "eval_loss": 0.8662538528442383, "eval_runtime": 224.781, "eval_samples_per_second": 90.906, "eval_steps_per_second": 2.843, "step": 62000 }, { "epoch": 20.3185955786736, "grad_norm": 2.3690404891967773, "learning_rate": 3.7500000000000003e-05, "loss": 0.6541, "step": 62500 }, { "epoch": 20.3185955786736, "eval_accuracy": 0.8550845489091575, "eval_loss": 0.865816593170166, "eval_runtime": 228.0119, "eval_samples_per_second": 89.618, "eval_steps_per_second": 2.802, "step": 62500 }, { "epoch": 20.48114434330299, "grad_norm": 2.6345200538635254, "learning_rate": 3.7e-05, "loss": 0.6561, "step": 63000 }, { "epoch": 20.48114434330299, "eval_accuracy": 0.8558111308868751, "eval_loss": 0.8506866097450256, "eval_runtime": 225.0923, "eval_samples_per_second": 90.781, "eval_steps_per_second": 2.839, "step": 63000 }, { "epoch": 20.64369310793238, "grad_norm": 2.1237897872924805, "learning_rate": 3.65e-05, "loss": 0.6578, "step": 63500 }, { "epoch": 20.64369310793238, "eval_accuracy": 0.8554877275609093, "eval_loss": 0.8685211539268494, "eval_runtime": 225.3889, "eval_samples_per_second": 90.661, "eval_steps_per_second": 2.835, "step": 63500 }, { "epoch": 20.806241872561767, "grad_norm": 2.37259578704834, "learning_rate": 3.6e-05, "loss": 0.657, "step": 64000 }, { "epoch": 20.806241872561767, "eval_accuracy": 0.8553885784782709, "eval_loss": 0.8665293455123901, "eval_runtime": 225.5872, "eval_samples_per_second": 90.581, "eval_steps_per_second": 2.833, "step": 64000 }, { "epoch": 20.96879063719116, "grad_norm": 2.595446825027466, "learning_rate": 3.55e-05, "loss": 0.6495, "step": 64500 }, { "epoch": 20.96879063719116, "eval_accuracy": 0.8558350742146078, "eval_loss": 0.8600612878799438, "eval_runtime": 225.3383, "eval_samples_per_second": 90.681, "eval_steps_per_second": 2.836, "step": 64500 }, { "epoch": 21.131339401820547, "grad_norm": 2.689633846282959, "learning_rate": 3.5e-05, "loss": 0.6484, "step": 65000 }, { "epoch": 21.131339401820547, "eval_accuracy": 0.8563225017454439, "eval_loss": 0.8686124086380005, "eval_runtime": 225.5265, "eval_samples_per_second": 90.606, "eval_steps_per_second": 2.833, "step": 65000 }, { "epoch": 21.293888166449936, "grad_norm": 2.4909982681274414, "learning_rate": 3.45e-05, "loss": 0.6429, "step": 65500 }, { "epoch": 21.293888166449936, "eval_accuracy": 0.8557131214639457, "eval_loss": 0.86305832862854, "eval_runtime": 225.2213, "eval_samples_per_second": 90.729, "eval_steps_per_second": 2.837, "step": 65500 }, { "epoch": 21.456436931079324, "grad_norm": 2.7555160522460938, "learning_rate": 3.4000000000000007e-05, "loss": 0.6523, "step": 66000 }, { "epoch": 21.456436931079324, "eval_accuracy": 0.8565990504258199, "eval_loss": 0.8672294616699219, "eval_runtime": 225.3745, "eval_samples_per_second": 90.667, "eval_steps_per_second": 2.835, "step": 66000 }, { "epoch": 21.618985695708712, "grad_norm": 2.4315414428710938, "learning_rate": 3.35e-05, "loss": 0.6384, "step": 66500 }, { "epoch": 21.618985695708712, "eval_accuracy": 0.8567521705315332, "eval_loss": 0.8545786738395691, "eval_runtime": 226.5184, "eval_samples_per_second": 90.209, "eval_steps_per_second": 2.821, "step": 66500 }, { "epoch": 21.7815344603381, "grad_norm": 2.373359441757202, "learning_rate": 3.3e-05, "loss": 0.6512, "step": 67000 }, { "epoch": 21.7815344603381, "eval_accuracy": 0.8568411511443462, "eval_loss": 0.8615005612373352, "eval_runtime": 226.7483, "eval_samples_per_second": 90.118, "eval_steps_per_second": 2.818, "step": 67000 }, { "epoch": 21.94408322496749, "grad_norm": 2.5687010288238525, "learning_rate": 3.2500000000000004e-05, "loss": 0.6452, "step": 67500 }, { "epoch": 21.94408322496749, "eval_accuracy": 0.856976936965993, "eval_loss": 0.8514290452003479, "eval_runtime": 224.8277, "eval_samples_per_second": 90.887, "eval_steps_per_second": 2.842, "step": 67500 }, { "epoch": 22.106631989596877, "grad_norm": 2.3000574111938477, "learning_rate": 3.2000000000000005e-05, "loss": 0.6388, "step": 68000 }, { "epoch": 22.106631989596877, "eval_accuracy": 0.8576112543945463, "eval_loss": 0.8523911833763123, "eval_runtime": 225.5646, "eval_samples_per_second": 90.59, "eval_steps_per_second": 2.833, "step": 68000 }, { "epoch": 22.26918075422627, "grad_norm": 2.5039753913879395, "learning_rate": 3.15e-05, "loss": 0.6382, "step": 68500 }, { "epoch": 22.26918075422627, "eval_accuracy": 0.8567441321838508, "eval_loss": 0.8560938835144043, "eval_runtime": 226.9717, "eval_samples_per_second": 90.029, "eval_steps_per_second": 2.815, "step": 68500 }, { "epoch": 22.431729518855658, "grad_norm": 2.6623408794403076, "learning_rate": 3.1e-05, "loss": 0.6408, "step": 69000 }, { "epoch": 22.431729518855658, "eval_accuracy": 0.8576559655770156, "eval_loss": 0.854763925075531, "eval_runtime": 226.0318, "eval_samples_per_second": 90.403, "eval_steps_per_second": 2.827, "step": 69000 }, { "epoch": 22.594278283485046, "grad_norm": 2.3223702907562256, "learning_rate": 3.05e-05, "loss": 0.6433, "step": 69500 }, { "epoch": 22.594278283485046, "eval_accuracy": 0.8581642158372993, "eval_loss": 0.8585615754127502, "eval_runtime": 225.6463, "eval_samples_per_second": 90.558, "eval_steps_per_second": 2.832, "step": 69500 }, { "epoch": 22.756827048114435, "grad_norm": 2.545360803604126, "learning_rate": 3e-05, "loss": 0.6371, "step": 70000 }, { "epoch": 22.756827048114435, "eval_accuracy": 0.8583454058165224, "eval_loss": 0.8482581973075867, "eval_runtime": 225.5921, "eval_samples_per_second": 90.579, "eval_steps_per_second": 2.833, "step": 70000 }, { "epoch": 22.919375812743823, "grad_norm": 2.18627667427063, "learning_rate": 2.95e-05, "loss": 0.6331, "step": 70500 }, { "epoch": 22.919375812743823, "eval_accuracy": 0.857949745807387, "eval_loss": 0.8507857918739319, "eval_runtime": 226.8073, "eval_samples_per_second": 90.094, "eval_steps_per_second": 2.817, "step": 70500 }, { "epoch": 23.08192457737321, "grad_norm": 2.8513870239257812, "learning_rate": 2.9e-05, "loss": 0.6393, "step": 71000 }, { "epoch": 23.08192457737321, "eval_accuracy": 0.8573411368207433, "eval_loss": 0.8502649664878845, "eval_runtime": 227.8606, "eval_samples_per_second": 89.678, "eval_steps_per_second": 2.804, "step": 71000 }, { "epoch": 23.2444733420026, "grad_norm": 2.961089611053467, "learning_rate": 2.8499999999999998e-05, "loss": 0.6269, "step": 71500 }, { "epoch": 23.2444733420026, "eval_accuracy": 0.8582647966293633, "eval_loss": 0.8488523364067078, "eval_runtime": 226.4431, "eval_samples_per_second": 90.239, "eval_steps_per_second": 2.822, "step": 71500 }, { "epoch": 23.407022106631988, "grad_norm": 2.448005437850952, "learning_rate": 2.8000000000000003e-05, "loss": 0.6284, "step": 72000 }, { "epoch": 23.407022106631988, "eval_accuracy": 0.8588356517259145, "eval_loss": 0.8428735136985779, "eval_runtime": 226.3587, "eval_samples_per_second": 90.273, "eval_steps_per_second": 2.823, "step": 72000 }, { "epoch": 23.56957087126138, "grad_norm": 2.4801974296569824, "learning_rate": 2.7500000000000004e-05, "loss": 0.6311, "step": 72500 }, { "epoch": 23.56957087126138, "eval_accuracy": 0.8585083391102231, "eval_loss": 0.8391257524490356, "eval_runtime": 225.3955, "eval_samples_per_second": 90.658, "eval_steps_per_second": 2.835, "step": 72500 }, { "epoch": 23.73211963589077, "grad_norm": 2.4415576457977295, "learning_rate": 2.7000000000000002e-05, "loss": 0.6333, "step": 73000 }, { "epoch": 23.73211963589077, "eval_accuracy": 0.8584229812472617, "eval_loss": 0.8474059104919434, "eval_runtime": 226.7984, "eval_samples_per_second": 90.098, "eval_steps_per_second": 2.817, "step": 73000 }, { "epoch": 23.894668400520157, "grad_norm": 2.5334153175354004, "learning_rate": 2.6500000000000004e-05, "loss": 0.6291, "step": 73500 }, { "epoch": 23.894668400520157, "eval_accuracy": 0.8590801539747378, "eval_loss": 0.837401270866394, "eval_runtime": 225.1842, "eval_samples_per_second": 90.743, "eval_steps_per_second": 2.838, "step": 73500 }, { "epoch": 24.057217165149545, "grad_norm": 2.569241762161255, "learning_rate": 2.6000000000000002e-05, "loss": 0.6255, "step": 74000 }, { "epoch": 24.057217165149545, "eval_accuracy": 0.8589290048653603, "eval_loss": 0.8332562446594238, "eval_runtime": 229.3684, "eval_samples_per_second": 89.088, "eval_steps_per_second": 2.786, "step": 74000 }, { "epoch": 24.219765929778934, "grad_norm": 2.526848316192627, "learning_rate": 2.5500000000000003e-05, "loss": 0.6236, "step": 74500 }, { "epoch": 24.219765929778934, "eval_accuracy": 0.8597087528316705, "eval_loss": 0.848160445690155, "eval_runtime": 226.4635, "eval_samples_per_second": 90.231, "eval_steps_per_second": 2.822, "step": 74500 }, { "epoch": 24.382314694408322, "grad_norm": 2.655872106552124, "learning_rate": 2.5e-05, "loss": 0.624, "step": 75000 }, { "epoch": 24.382314694408322, "eval_accuracy": 0.8598462984123302, "eval_loss": 0.8461793065071106, "eval_runtime": 225.3124, "eval_samples_per_second": 90.692, "eval_steps_per_second": 2.836, "step": 75000 }, { "epoch": 24.54486345903771, "grad_norm": 2.109790325164795, "learning_rate": 2.45e-05, "loss": 0.6178, "step": 75500 }, { "epoch": 24.54486345903771, "eval_accuracy": 0.8595766101602678, "eval_loss": 0.8289109468460083, "eval_runtime": 226.5489, "eval_samples_per_second": 90.197, "eval_steps_per_second": 2.821, "step": 75500 }, { "epoch": 24.7074122236671, "grad_norm": 2.4060940742492676, "learning_rate": 2.4e-05, "loss": 0.6166, "step": 76000 }, { "epoch": 24.7074122236671, "eval_accuracy": 0.8606532868210031, "eval_loss": 0.8234532475471497, "eval_runtime": 225.3276, "eval_samples_per_second": 90.686, "eval_steps_per_second": 2.836, "step": 76000 }, { "epoch": 24.86996098829649, "grad_norm": 2.24013352394104, "learning_rate": 2.35e-05, "loss": 0.6247, "step": 76500 }, { "epoch": 24.86996098829649, "eval_accuracy": 0.8600803414966184, "eval_loss": 0.8327089548110962, "eval_runtime": 227.0917, "eval_samples_per_second": 89.981, "eval_steps_per_second": 2.814, "step": 76500 }, { "epoch": 25.03250975292588, "grad_norm": 2.370615005493164, "learning_rate": 2.3000000000000003e-05, "loss": 0.624, "step": 77000 }, { "epoch": 25.03250975292588, "eval_accuracy": 0.8600759552685691, "eval_loss": 0.8379160761833191, "eval_runtime": 224.9721, "eval_samples_per_second": 90.829, "eval_steps_per_second": 2.84, "step": 77000 }, { "epoch": 25.195058517555267, "grad_norm": 2.192373514175415, "learning_rate": 2.25e-05, "loss": 0.6161, "step": 77500 }, { "epoch": 25.195058517555267, "eval_accuracy": 0.8597079760637308, "eval_loss": 0.8407602906227112, "eval_runtime": 225.6861, "eval_samples_per_second": 90.542, "eval_steps_per_second": 2.831, "step": 77500 }, { "epoch": 25.357607282184656, "grad_norm": 2.4751832485198975, "learning_rate": 2.2000000000000003e-05, "loss": 0.6142, "step": 78000 }, { "epoch": 25.357607282184656, "eval_accuracy": 0.8608490074683309, "eval_loss": 0.8316988348960876, "eval_runtime": 225.4697, "eval_samples_per_second": 90.629, "eval_steps_per_second": 2.834, "step": 78000 }, { "epoch": 25.520156046814044, "grad_norm": 2.737602472305298, "learning_rate": 2.15e-05, "loss": 0.6193, "step": 78500 }, { "epoch": 25.520156046814044, "eval_accuracy": 0.860743624097801, "eval_loss": 0.8356801867485046, "eval_runtime": 226.5781, "eval_samples_per_second": 90.185, "eval_steps_per_second": 2.82, "step": 78500 }, { "epoch": 25.682704811443433, "grad_norm": 2.1464016437530518, "learning_rate": 2.1e-05, "loss": 0.6117, "step": 79000 }, { "epoch": 25.682704811443433, "eval_accuracy": 0.8607643217464414, "eval_loss": 0.8443505167961121, "eval_runtime": 225.3999, "eval_samples_per_second": 90.657, "eval_steps_per_second": 2.835, "step": 79000 }, { "epoch": 25.84525357607282, "grad_norm": 2.244690418243408, "learning_rate": 2.05e-05, "loss": 0.6155, "step": 79500 }, { "epoch": 25.84525357607282, "eval_accuracy": 0.8608062892739589, "eval_loss": 0.8309040665626526, "eval_runtime": 227.6535, "eval_samples_per_second": 89.759, "eval_steps_per_second": 2.807, "step": 79500 }, { "epoch": 26.00780234070221, "grad_norm": 2.2778093814849854, "learning_rate": 2e-05, "loss": 0.6159, "step": 80000 }, { "epoch": 26.00780234070221, "eval_accuracy": 0.860849372733563, "eval_loss": 0.8468282222747803, "eval_runtime": 226.4415, "eval_samples_per_second": 90.24, "eval_steps_per_second": 2.822, "step": 80000 }, { "epoch": 26.170351105331598, "grad_norm": 2.04994535446167, "learning_rate": 1.9500000000000003e-05, "loss": 0.6095, "step": 80500 }, { "epoch": 26.170351105331598, "eval_accuracy": 0.8617138336966451, "eval_loss": 0.8285297155380249, "eval_runtime": 224.8524, "eval_samples_per_second": 90.877, "eval_steps_per_second": 2.842, "step": 80500 }, { "epoch": 26.33289986996099, "grad_norm": 2.437809944152832, "learning_rate": 1.9e-05, "loss": 0.6105, "step": 81000 }, { "epoch": 26.33289986996099, "eval_accuracy": 0.8610518577000663, "eval_loss": 0.8272110223770142, "eval_runtime": 225.6868, "eval_samples_per_second": 90.541, "eval_steps_per_second": 2.831, "step": 81000 }, { "epoch": 26.495448634590378, "grad_norm": 2.2250375747680664, "learning_rate": 1.85e-05, "loss": 0.6087, "step": 81500 }, { "epoch": 26.495448634590378, "eval_accuracy": 0.8626399458651031, "eval_loss": 0.812351644039154, "eval_runtime": 226.8612, "eval_samples_per_second": 90.073, "eval_steps_per_second": 2.817, "step": 81500 }, { "epoch": 26.657997399219767, "grad_norm": 2.060518980026245, "learning_rate": 1.8e-05, "loss": 0.6151, "step": 82000 }, { "epoch": 26.657997399219767, "eval_accuracy": 0.8616146515518304, "eval_loss": 0.8262282609939575, "eval_runtime": 226.3417, "eval_samples_per_second": 90.279, "eval_steps_per_second": 2.823, "step": 82000 }, { "epoch": 26.820546163849155, "grad_norm": 2.0575308799743652, "learning_rate": 1.75e-05, "loss": 0.6095, "step": 82500 }, { "epoch": 26.820546163849155, "eval_accuracy": 0.8622129077230878, "eval_loss": 0.8227624893188477, "eval_runtime": 228.246, "eval_samples_per_second": 89.526, "eval_steps_per_second": 2.8, "step": 82500 }, { "epoch": 26.983094928478543, "grad_norm": 2.253310203552246, "learning_rate": 1.7000000000000003e-05, "loss": 0.6, "step": 83000 }, { "epoch": 26.983094928478543, "eval_accuracy": 0.8627496487013281, "eval_loss": 0.8195393085479736, "eval_runtime": 226.7315, "eval_samples_per_second": 90.124, "eval_steps_per_second": 2.818, "step": 83000 }, { "epoch": 27.14564369310793, "grad_norm": 2.459129810333252, "learning_rate": 1.65e-05, "loss": 0.6013, "step": 83500 }, { "epoch": 27.14564369310793, "eval_accuracy": 0.8624795555933904, "eval_loss": 0.8182792663574219, "eval_runtime": 225.0241, "eval_samples_per_second": 90.808, "eval_steps_per_second": 2.84, "step": 83500 }, { "epoch": 27.30819245773732, "grad_norm": 2.1593105792999268, "learning_rate": 1.6000000000000003e-05, "loss": 0.6001, "step": 84000 }, { "epoch": 27.30819245773732, "eval_accuracy": 0.8627880443457693, "eval_loss": 0.8216105103492737, "eval_runtime": 224.8609, "eval_samples_per_second": 90.874, "eval_steps_per_second": 2.842, "step": 84000 }, { "epoch": 27.47074122236671, "grad_norm": 2.086055040359497, "learning_rate": 1.55e-05, "loss": 0.6013, "step": 84500 }, { "epoch": 27.47074122236671, "eval_accuracy": 0.8621898394825607, "eval_loss": 0.8210575580596924, "eval_runtime": 226.231, "eval_samples_per_second": 90.324, "eval_steps_per_second": 2.825, "step": 84500 }, { "epoch": 27.6332899869961, "grad_norm": 2.5327186584472656, "learning_rate": 1.5e-05, "loss": 0.6058, "step": 85000 }, { "epoch": 27.6332899869961, "eval_accuracy": 0.8627229258499225, "eval_loss": 0.8185027241706848, "eval_runtime": 226.1064, "eval_samples_per_second": 90.373, "eval_steps_per_second": 2.826, "step": 85000 }, { "epoch": 27.79583875162549, "grad_norm": 2.3839502334594727, "learning_rate": 1.45e-05, "loss": 0.6042, "step": 85500 }, { "epoch": 27.79583875162549, "eval_accuracy": 0.8627582312704439, "eval_loss": 0.8231362104415894, "eval_runtime": 228.8409, "eval_samples_per_second": 89.293, "eval_steps_per_second": 2.792, "step": 85500 }, { "epoch": 27.958387516254877, "grad_norm": 2.195699691772461, "learning_rate": 1.4000000000000001e-05, "loss": 0.5997, "step": 86000 }, { "epoch": 27.958387516254877, "eval_accuracy": 0.8631414086547167, "eval_loss": 0.8138222694396973, "eval_runtime": 225.1659, "eval_samples_per_second": 90.751, "eval_steps_per_second": 2.838, "step": 86000 }, { "epoch": 28.120936280884266, "grad_norm": 2.942133665084839, "learning_rate": 1.3500000000000001e-05, "loss": 0.5976, "step": 86500 }, { "epoch": 28.120936280884266, "eval_accuracy": 0.8629241152120203, "eval_loss": 0.8277115225791931, "eval_runtime": 226.384, "eval_samples_per_second": 90.263, "eval_steps_per_second": 2.823, "step": 86500 }, { "epoch": 28.283485045513654, "grad_norm": 2.2104339599609375, "learning_rate": 1.3000000000000001e-05, "loss": 0.6005, "step": 87000 }, { "epoch": 28.283485045513654, "eval_accuracy": 0.8635328035684335, "eval_loss": 0.8249954581260681, "eval_runtime": 225.8607, "eval_samples_per_second": 90.472, "eval_steps_per_second": 2.829, "step": 87000 }, { "epoch": 28.446033810143042, "grad_norm": 2.4792349338531494, "learning_rate": 1.25e-05, "loss": 0.5964, "step": 87500 }, { "epoch": 28.446033810143042, "eval_accuracy": 0.8631331114250733, "eval_loss": 0.8169623613357544, "eval_runtime": 224.4723, "eval_samples_per_second": 91.031, "eval_steps_per_second": 2.847, "step": 87500 }, { "epoch": 28.60858257477243, "grad_norm": 2.2584621906280518, "learning_rate": 1.2e-05, "loss": 0.5978, "step": 88000 }, { "epoch": 28.60858257477243, "eval_accuracy": 0.8637502162788443, "eval_loss": 0.8240325450897217, "eval_runtime": 226.0844, "eval_samples_per_second": 90.382, "eval_steps_per_second": 2.826, "step": 88000 }, { "epoch": 28.77113133940182, "grad_norm": 2.4756360054016113, "learning_rate": 1.1500000000000002e-05, "loss": 0.5933, "step": 88500 }, { "epoch": 28.77113133940182, "eval_accuracy": 0.863286343484686, "eval_loss": 0.821010947227478, "eval_runtime": 233.6708, "eval_samples_per_second": 87.448, "eval_steps_per_second": 2.735, "step": 88500 }, { "epoch": 28.93368010403121, "grad_norm": 2.4466359615325928, "learning_rate": 1.1000000000000001e-05, "loss": 0.595, "step": 89000 }, { "epoch": 28.93368010403121, "eval_accuracy": 0.8634251119288006, "eval_loss": 0.821042001247406, "eval_runtime": 225.7222, "eval_samples_per_second": 90.527, "eval_steps_per_second": 2.831, "step": 89000 }, { "epoch": 29.0962288686606, "grad_norm": 2.5361807346343994, "learning_rate": 1.05e-05, "loss": 0.5941, "step": 89500 }, { "epoch": 29.0962288686606, "eval_accuracy": 0.8634033694143568, "eval_loss": 0.8251069188117981, "eval_runtime": 226.448, "eval_samples_per_second": 90.237, "eval_steps_per_second": 2.822, "step": 89500 }, { "epoch": 29.258777633289988, "grad_norm": 2.5238239765167236, "learning_rate": 1e-05, "loss": 0.5988, "step": 90000 }, { "epoch": 29.258777633289988, "eval_accuracy": 0.8638910896653752, "eval_loss": 0.8146407604217529, "eval_runtime": 222.0422, "eval_samples_per_second": 92.028, "eval_steps_per_second": 2.878, "step": 90000 }, { "epoch": 29.421326397919376, "grad_norm": 2.291041851043701, "learning_rate": 9.5e-06, "loss": 0.5923, "step": 90500 }, { "epoch": 29.421326397919376, "eval_accuracy": 0.8637874627536761, "eval_loss": 0.8178155422210693, "eval_runtime": 221.0933, "eval_samples_per_second": 92.423, "eval_steps_per_second": 2.89, "step": 90500 }, { "epoch": 29.583875162548765, "grad_norm": 2.601836681365967, "learning_rate": 9e-06, "loss": 0.5887, "step": 91000 }, { "epoch": 29.583875162548765, "eval_accuracy": 0.864438274609619, "eval_loss": 0.8162264823913574, "eval_runtime": 228.6953, "eval_samples_per_second": 89.35, "eval_steps_per_second": 2.794, "step": 91000 }, { "epoch": 29.746423927178153, "grad_norm": 2.56785249710083, "learning_rate": 8.500000000000002e-06, "loss": 0.5833, "step": 91500 }, { "epoch": 29.746423927178153, "eval_accuracy": 0.8640183304537222, "eval_loss": 0.8157439827919006, "eval_runtime": 224.1585, "eval_samples_per_second": 91.159, "eval_steps_per_second": 2.851, "step": 91500 }, { "epoch": 29.90897269180754, "grad_norm": 2.409670114517212, "learning_rate": 8.000000000000001e-06, "loss": 0.5951, "step": 92000 }, { "epoch": 29.90897269180754, "eval_accuracy": 0.8643441985332161, "eval_loss": 0.812107264995575, "eval_runtime": 221.533, "eval_samples_per_second": 92.239, "eval_steps_per_second": 2.884, "step": 92000 }, { "epoch": 30.07152145643693, "grad_norm": 2.19612717628479, "learning_rate": 7.5e-06, "loss": 0.5928, "step": 92500 }, { "epoch": 30.07152145643693, "eval_accuracy": 0.8640605097561019, "eval_loss": 0.8203216791152954, "eval_runtime": 221.1524, "eval_samples_per_second": 92.398, "eval_steps_per_second": 2.889, "step": 92500 }, { "epoch": 30.234070221066318, "grad_norm": 2.249537944793701, "learning_rate": 7.000000000000001e-06, "loss": 0.5878, "step": 93000 }, { "epoch": 30.234070221066318, "eval_accuracy": 0.8644494113060075, "eval_loss": 0.8103818893432617, "eval_runtime": 221.6978, "eval_samples_per_second": 92.17, "eval_steps_per_second": 2.882, "step": 93000 }, { "epoch": 30.39661898569571, "grad_norm": 2.63948917388916, "learning_rate": 6.5000000000000004e-06, "loss": 0.5831, "step": 93500 }, { "epoch": 30.39661898569571, "eval_accuracy": 0.8646731645828304, "eval_loss": 0.8149096369743347, "eval_runtime": 231.6244, "eval_samples_per_second": 88.22, "eval_steps_per_second": 2.759, "step": 93500 }, { "epoch": 30.5591677503251, "grad_norm": 2.257843494415283, "learning_rate": 6e-06, "loss": 0.5922, "step": 94000 }, { "epoch": 30.5591677503251, "eval_accuracy": 0.8642099290677292, "eval_loss": 0.8136395215988159, "eval_runtime": 227.8537, "eval_samples_per_second": 89.68, "eval_steps_per_second": 2.804, "step": 94000 }, { "epoch": 30.721716514954487, "grad_norm": 2.54514479637146, "learning_rate": 5.500000000000001e-06, "loss": 0.5834, "step": 94500 }, { "epoch": 30.721716514954487, "eval_accuracy": 0.8647608401845014, "eval_loss": 0.8055408596992493, "eval_runtime": 222.2677, "eval_samples_per_second": 91.934, "eval_steps_per_second": 2.875, "step": 94500 }, { "epoch": 30.884265279583875, "grad_norm": 2.5504415035247803, "learning_rate": 5e-06, "loss": 0.5851, "step": 95000 }, { "epoch": 30.884265279583875, "eval_accuracy": 0.8655900321339209, "eval_loss": 0.8102414011955261, "eval_runtime": 222.3833, "eval_samples_per_second": 91.886, "eval_steps_per_second": 2.873, "step": 95000 }, { "epoch": 31.046814044213264, "grad_norm": 2.5223162174224854, "learning_rate": 4.5e-06, "loss": 0.5858, "step": 95500 }, { "epoch": 31.046814044213264, "eval_accuracy": 0.8648693759071118, "eval_loss": 0.8096651434898376, "eval_runtime": 222.4149, "eval_samples_per_second": 91.873, "eval_steps_per_second": 2.873, "step": 95500 }, { "epoch": 31.209362808842652, "grad_norm": 2.704538583755493, "learning_rate": 4.000000000000001e-06, "loss": 0.5854, "step": 96000 }, { "epoch": 31.209362808842652, "eval_accuracy": 0.8643289659912363, "eval_loss": 0.8139033317565918, "eval_runtime": 222.3309, "eval_samples_per_second": 91.908, "eval_steps_per_second": 2.874, "step": 96000 }, { "epoch": 31.37191157347204, "grad_norm": 2.3286654949188232, "learning_rate": 3.5000000000000004e-06, "loss": 0.5809, "step": 96500 }, { "epoch": 31.37191157347204, "eval_accuracy": 0.8655915682628923, "eval_loss": 0.8057557940483093, "eval_runtime": 222.7618, "eval_samples_per_second": 91.73, "eval_steps_per_second": 2.869, "step": 96500 }, { "epoch": 31.53446033810143, "grad_norm": 2.572178363800049, "learning_rate": 3e-06, "loss": 0.5846, "step": 97000 }, { "epoch": 31.53446033810143, "eval_accuracy": 0.8653023190042686, "eval_loss": 0.8064507842063904, "eval_runtime": 223.3613, "eval_samples_per_second": 91.484, "eval_steps_per_second": 2.861, "step": 97000 }, { "epoch": 31.69700910273082, "grad_norm": 2.4184162616729736, "learning_rate": 2.5e-06, "loss": 0.585, "step": 97500 }, { "epoch": 31.69700910273082, "eval_accuracy": 0.8657269498564959, "eval_loss": 0.7961297631263733, "eval_runtime": 225.1117, "eval_samples_per_second": 90.773, "eval_steps_per_second": 2.839, "step": 97500 }, { "epoch": 31.85955786736021, "grad_norm": 2.121875047683716, "learning_rate": 2.0000000000000003e-06, "loss": 0.588, "step": 98000 }, { "epoch": 31.85955786736021, "eval_accuracy": 0.8654759221390312, "eval_loss": 0.7994445562362671, "eval_runtime": 222.5184, "eval_samples_per_second": 91.831, "eval_steps_per_second": 2.872, "step": 98000 }, { "epoch": 32.022106631989594, "grad_norm": 2.2691831588745117, "learning_rate": 1.5e-06, "loss": 0.58, "step": 98500 }, { "epoch": 32.022106631989594, "eval_accuracy": 0.8651082523885062, "eval_loss": 0.8115944266319275, "eval_runtime": 222.6443, "eval_samples_per_second": 91.779, "eval_steps_per_second": 2.87, "step": 98500 }, { "epoch": 32.184655396618986, "grad_norm": 2.047391891479492, "learning_rate": 1.0000000000000002e-06, "loss": 0.5776, "step": 99000 }, { "epoch": 32.184655396618986, "eval_accuracy": 0.8659600453633429, "eval_loss": 0.8002920150756836, "eval_runtime": 223.1406, "eval_samples_per_second": 91.575, "eval_steps_per_second": 2.864, "step": 99000 }, { "epoch": 32.34720416124838, "grad_norm": 2.4214348793029785, "learning_rate": 5.000000000000001e-07, "loss": 0.581, "step": 99500 }, { "epoch": 32.34720416124838, "eval_accuracy": 0.8655375271936185, "eval_loss": 0.7950243949890137, "eval_runtime": 240.505, "eval_samples_per_second": 84.963, "eval_steps_per_second": 2.657, "step": 99500 } ], "logging_steps": 500, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 33, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.401908931210772e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }